Diffstat (limited to 'contrib/llvm/lib/Target/PowerPC')
74 files changed, 48142 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp new file mode 100644 index 000000000000..d7066d58709a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -0,0 +1,1658 @@ +//===-- PPCAsmParser.cpp - Parse PowerPC asm to MCInst instructions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "PPCTargetStreamer.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +static unsigned RRegs[32] = { + PPC::R0, PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; +static unsigned RRegsNoR0[32] = { + PPC::ZERO, + PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; +static unsigned XRegs[32] = { + PPC::X0, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; +static unsigned XRegsNoX0[32] = { + PPC::ZERO8, + PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; +static unsigned FRegs[32] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31 +}; +static unsigned VRegs[32] = { + PPC::V0, PPC::V1, PPC::V2, PPC::V3, + PPC::V4, PPC::V5, PPC::V6, PPC::V7, + PPC::V8, PPC::V9, PPC::V10, PPC::V11, + PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, 
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; +static unsigned VSRegs[64] = { + PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3, + PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7, + PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11, + PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15, + PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19, + PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23, + PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27, + PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31, + + PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3, + PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, + PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, + PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15, + PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19, + PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23, + PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27, + PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31 +}; +static unsigned VSFRegs[64] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31, + + PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3, + PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7, + PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11, + PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15, + PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19, + PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23, + PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27, + PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31 +}; +static unsigned CRBITRegs[32] = { + PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN, + PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN, + PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, + PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, + PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, + PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN, + PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN, + PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN +}; +static unsigned CRRegs[8] = { + PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3, + PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7 +}; + +// Evaluate an expression containing condition register +// or condition register field symbols. Returns positive +// value on success, or -1 on error. +static int64_t +EvaluateCRExpr(const MCExpr *E) { + switch (E->getKind()) { + case MCExpr::Target: + return -1; + + case MCExpr::Constant: { + int64_t Res = cast<MCConstantExpr>(E)->getValue(); + return Res < 0 ? 
-1 : Res; + } + + case MCExpr::SymbolRef: { + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E); + StringRef Name = SRE->getSymbol().getName(); + + if (Name == "lt") return 0; + if (Name == "gt") return 1; + if (Name == "eq") return 2; + if (Name == "so") return 3; + if (Name == "un") return 3; + + if (Name == "cr0") return 0; + if (Name == "cr1") return 1; + if (Name == "cr2") return 2; + if (Name == "cr3") return 3; + if (Name == "cr4") return 4; + if (Name == "cr5") return 5; + if (Name == "cr6") return 6; + if (Name == "cr7") return 7; + + return -1; + } + + case MCExpr::Unary: + return -1; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); + int64_t LHSVal = EvaluateCRExpr(BE->getLHS()); + int64_t RHSVal = EvaluateCRExpr(BE->getRHS()); + int64_t Res; + + if (LHSVal < 0 || RHSVal < 0) + return -1; + + switch (BE->getOpcode()) { + default: return -1; + case MCBinaryExpr::Add: Res = LHSVal + RHSVal; break; + case MCBinaryExpr::Mul: Res = LHSVal * RHSVal; break; + } + + return Res < 0 ? -1 : Res; + } + } + + llvm_unreachable("Invalid expression kind!"); +} + +struct PPCOperand; + +class PPCAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + const MCInstrInfo &MII; + bool IsPPC64; + bool IsDarwin; + + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + + bool isPPC64() const { return IsPPC64; } + bool isDarwin() const { return IsDarwin; } + + bool MatchRegisterName(const AsmToken &Tok, + unsigned &RegNo, int64_t &IntVal); + + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + + const MCExpr *ExtractModifierFromExpr(const MCExpr *E, + PPCMCExpr::VariantKind &Variant); + const MCExpr *FixupVariantKind(const MCExpr *E); + bool ParseExpression(const MCExpr *&EVal); + bool ParseDarwinExpression(const MCExpr *&EVal); + + bool ParseOperand(OperandVector &Operands); + + bool ParseDirectiveWord(unsigned Size, SMLoc L); + bool ParseDirectiveTC(unsigned Size, SMLoc L); + bool ParseDirectiveMachine(SMLoc L); + bool ParseDarwinDirectiveMachine(SMLoc L); + bool ParseDirectiveAbiVersion(SMLoc L); + bool ParseDirectiveLocalEntry(SMLoc L); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) override; + + void ProcessInstruction(MCInst &Inst, const OperandVector &Ops); + + /// @name Auto-generated Match Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "PPCGenAsmMatcher.inc" + + /// } + + +public: + PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &_MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) { + // Check for 64-bit vs. 32-bit pointer mode. + Triple TheTriple(STI.getTargetTriple()); + IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || + TheTriple.getArch() == Triple::ppc64le); + IsDarwin = TheTriple.isMacOSX(); + // Initialize the set of available features. 
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + + bool ParseDirective(AsmToken DirectiveID) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; + + const MCExpr *applyModifierToExpr(const MCExpr *E, + MCSymbolRefExpr::VariantKind, + MCContext &Ctx) override; +}; + +/// PPCOperand - Instances of this class represent a parsed PowerPC machine +/// instruction. +struct PPCOperand : public MCParsedAsmOperand { + enum KindTy { + Token, + Immediate, + Expression, + TLSRegister + } Kind; + + SMLoc StartLoc, EndLoc; + bool IsPPC64; + + struct TokOp { + const char *Data; + unsigned Length; + }; + + struct ImmOp { + int64_t Val; + }; + + struct ExprOp { + const MCExpr *Val; + int64_t CRVal; // Cached result of EvaluateCRExpr(Val) + }; + + struct TLSRegOp { + const MCSymbolRefExpr *Sym; + }; + + union { + struct TokOp Tok; + struct ImmOp Imm; + struct ExprOp Expr; + struct TLSRegOp TLSReg; + }; + + PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} +public: + PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + IsPPC64 = o.IsPPC64; + switch (Kind) { + case Token: + Tok = o.Tok; + break; + case Immediate: + Imm = o.Imm; + break; + case Expression: + Expr = o.Expr; + break; + case TLSRegister: + TLSReg = o.TLSReg; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + + /// isPPC64 - True if this operand is for an instruction in 64-bit mode. + bool isPPC64() const { return IsPPC64; } + + int64_t getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getExpr() const { + assert(Kind == Expression && "Invalid access!"); + return Expr.Val; + } + + int64_t getExprCRVal() const { + assert(Kind == Expression && "Invalid access!"); + return Expr.CRVal; + } + + const MCExpr *getTLSReg() const { + assert(Kind == TLSRegister && "Invalid access!"); + return TLSReg.Sym; + } + + unsigned getReg() const override { + assert(isRegNumber() && "Invalid access!"); + return (unsigned) Imm.Val; + } + + unsigned getVSReg() const { + assert(isVSRegNumber() && "Invalid access!"); + return (unsigned) Imm.Val; + } + + unsigned getCCReg() const { + assert(isCCRegNumber() && "Invalid access!"); + return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal); + } + + unsigned getCRBit() const { + assert(isCRBitNumber() && "Invalid access!"); + return (unsigned) (Kind == Immediate ? 
Imm.Val : Expr.CRVal); + } + + unsigned getCRBitMask() const { + assert(isCRBitMask() && "Invalid access!"); + return 7 - countTrailingZeros<uint64_t>(Imm.Val); + } + + bool isToken() const override { return Kind == Token; } + bool isImm() const override { return Kind == Immediate || Kind == Expression; } + bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); } + bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); } + bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); } + bool isU6Imm() const { return Kind == Immediate && isUInt<6>(getImm()); } + bool isU16Imm() const { return Kind == Expression || + (Kind == Immediate && isUInt<16>(getImm())); } + bool isS16Imm() const { return Kind == Expression || + (Kind == Immediate && isInt<16>(getImm())); } + bool isS16ImmX4() const { return Kind == Expression || + (Kind == Immediate && isInt<16>(getImm()) && + (getImm() & 3) == 0); } + bool isS17Imm() const { return Kind == Expression || + (Kind == Immediate && isInt<17>(getImm())); } + bool isTLSReg() const { return Kind == TLSRegister; } + bool isDirectBr() const { return Kind == Expression || + (Kind == Immediate && isInt<26>(getImm()) && + (getImm() & 3) == 0); } + bool isCondBr() const { return Kind == Expression || + (Kind == Immediate && isInt<16>(getImm()) && + (getImm() & 3) == 0); } + bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); } + bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); } + bool isCCRegNumber() const { return (Kind == Expression + && isUInt<3>(getExprCRVal())) || + (Kind == Immediate + && isUInt<3>(getImm())); } + bool isCRBitNumber() const { return (Kind == Expression + && isUInt<5>(getExprCRVal())) || + (Kind == Immediate + && isUInt<5>(getImm())); } + bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) && + isPowerOf2_32(getImm()); } + bool isMem() const override { return false; } + bool isReg() const override { return false; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + llvm_unreachable("addRegOperands"); + } + + void addRegGPRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(RRegs[getReg()])); + } + + void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(RRegsNoR0[getReg()])); + } + + void addRegG8RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(XRegs[getReg()])); + } + + void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(XRegsNoX0[getReg()])); + } + + void addRegGxRCOperands(MCInst &Inst, unsigned N) const { + if (isPPC64()) + addRegG8RCOperands(Inst, N); + else + addRegGPRCOperands(Inst, N); + } + + void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const { + if (isPPC64()) + addRegG8RCNoX0Operands(Inst, N); + else + addRegGPRCNoR0Operands(Inst, N); + } + + void addRegF4RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()])); + } + + void addRegF8RCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(FRegs[getReg()])); + } + + void addRegVRRCOperands(MCInst &Inst, unsigned N) 
const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VRegs[getReg()])); + } + + void addRegVSRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VSRegs[getVSReg()])); + } + + void addRegVSFRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(VSFRegs[getVSReg()])); + } + + void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getCRBit()])); + } + + void addRegCRRCOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(CRRegs[getCCReg()])); + } + + void addCRBitMaskOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(CRRegs[getCRBitMask()])); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + if (Kind == Immediate) + Inst.addOperand(MCOperand::CreateImm(getImm())); + else + Inst.addOperand(MCOperand::CreateExpr(getExpr())); + } + + void addBranchTargetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + if (Kind == Immediate) + Inst.addOperand(MCOperand::CreateImm(getImm() / 4)); + else + Inst.addOperand(MCOperand::CreateExpr(getExpr())); + } + + void addTLSRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateExpr(getTLSReg())); + } + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + void print(raw_ostream &OS) const override; + + static std::unique_ptr<PPCOperand> CreateToken(StringRef Str, SMLoc S, + bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + Op->IsPPC64 = IsPPC64; + return Op; + } + + static std::unique_ptr<PPCOperand> + CreateTokenWithStringCopy(StringRef Str, SMLoc S, bool IsPPC64) { + // Allocate extra memory for the string and copy it. + // FIXME: This is incorrect, Operands are owned by unique_ptr with a default + // deleter which will destroy them by simply using "delete", not correctly + // calling operator delete on this extra memory after calling the dtor + // explicitly. 
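// Layout of the allocation below: [ PPCOperand | copied string bytes ].
// Tok.Data points just past the object, so the copied token text lives
// exactly as long as the operand itself.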
+ void *Mem = ::operator new(sizeof(PPCOperand) + Str.size()); + std::unique_ptr<PPCOperand> Op(new (Mem) PPCOperand(Token)); + Op->Tok.Data = (const char *)(Op.get() + 1); + Op->Tok.Length = Str.size(); + std::memcpy((void *)Op->Tok.Data, Str.data(), Str.size()); + Op->StartLoc = S; + Op->EndLoc = S; + Op->IsPPC64 = IsPPC64; + return Op; + } + + static std::unique_ptr<PPCOperand> CreateImm(int64_t Val, SMLoc S, SMLoc E, + bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Immediate); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + Op->IsPPC64 = IsPPC64; + return Op; + } + + static std::unique_ptr<PPCOperand> CreateExpr(const MCExpr *Val, SMLoc S, + SMLoc E, bool IsPPC64) { + auto Op = make_unique<PPCOperand>(Expression); + Op->Expr.Val = Val; + Op->Expr.CRVal = EvaluateCRExpr(Val); + Op->StartLoc = S; + Op->EndLoc = E; + Op->IsPPC64 = IsPPC64; + return Op; + } + + static std::unique_ptr<PPCOperand> + CreateTLSReg(const MCSymbolRefExpr *Sym, SMLoc S, SMLoc E, bool IsPPC64) { + auto Op = make_unique<PPCOperand>(TLSRegister); + Op->TLSReg.Sym = Sym; + Op->StartLoc = S; + Op->EndLoc = E; + Op->IsPPC64 = IsPPC64; + return Op; + } + + static std::unique_ptr<PPCOperand> + CreateFromMCExpr(const MCExpr *Val, SMLoc S, SMLoc E, bool IsPPC64) { + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val)) + return CreateImm(CE->getValue(), S, E, IsPPC64); + + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val)) + if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS) + return CreateTLSReg(SRE, S, E, IsPPC64); + + return CreateExpr(Val, S, E, IsPPC64); + } +}; + +} // end anonymous namespace. + +void PPCOperand::print(raw_ostream &OS) const { + switch (Kind) { + case Token: + OS << "'" << getToken() << "'"; + break; + case Immediate: + OS << getImm(); + break; + case Expression: + getExpr()->print(OS); + break; + case TLSRegister: + getTLSReg()->print(OS); + break; + } +} + +void PPCAsmParser::ProcessInstruction(MCInst &Inst, + const OperandVector &Operands) { + int Opcode = Inst.getOpcode(); + switch (Opcode) { + case PPC::LAx: { + MCInst TmpInst; + TmpInst.setOpcode(PPC::LA); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(2)); + TmpInst.addOperand(Inst.getOperand(1)); + Inst = TmpInst; + break; + } + case PPC::SUBI: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(PPC::ADDI); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(-N)); + Inst = TmpInst; + break; + } + case PPC::SUBIS: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(PPC::ADDIS); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(-N)); + Inst = TmpInst; + break; + } + case PPC::SUBIC: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(PPC::ADDIC); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(-N)); + Inst = TmpInst; + break; + } + case PPC::SUBICo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(PPC::ADDICo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(-N)); + Inst = TmpInst; + break; + } + case PPC::EXTLWI: + case PPC::EXTLWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + 
TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(B)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(N - 1)); + Inst = TmpInst; + break; + } + case PPC::EXTRWI: + case PPC::EXTRWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(B + N)); + TmpInst.addOperand(MCOperand::CreateImm(32 - N)); + TmpInst.addOperand(MCOperand::CreateImm(31)); + Inst = TmpInst; + break; + } + case PPC::INSLWI: + case PPC::INSLWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(32 - B)); + TmpInst.addOperand(MCOperand::CreateImm(B)); + TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1)); + Inst = TmpInst; + break; + } + case PPC::INSRWI: + case PPC::INSRWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(32 - (B + N))); + TmpInst.addOperand(MCOperand::CreateImm(B)); + TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1)); + Inst = TmpInst; + break; + } + case PPC::ROTRWI: + case PPC::ROTRWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(32 - N)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(31)); + Inst = TmpInst; + break; + } + case PPC::SLWI: + case PPC::SLWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(31 - N)); + Inst = TmpInst; + break; + } + case PPC::SRWI: + case PPC::SRWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(32 - N)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + TmpInst.addOperand(MCOperand::CreateImm(31)); + Inst = TmpInst; + break; + } + case PPC::CLRRWI: + case PPC::CLRRWIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::CLRRWI? 
PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(31 - N)); + Inst = TmpInst; + break; + } + case PPC::CLRLSLWI: + case PPC::CLRLSLWIo: { + MCInst TmpInst; + int64_t B = Inst.getOperand(2).getImm(); + int64_t N = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + TmpInst.addOperand(MCOperand::CreateImm(B - N)); + TmpInst.addOperand(MCOperand::CreateImm(31 - N)); + Inst = TmpInst; + break; + } + case PPC::EXTLDI: + case PPC::EXTLDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(B)); + TmpInst.addOperand(MCOperand::CreateImm(N - 1)); + Inst = TmpInst; + break; + } + case PPC::EXTRDI: + case PPC::EXTRDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(B + N)); + TmpInst.addOperand(MCOperand::CreateImm(64 - N)); + Inst = TmpInst; + break; + } + case PPC::INSRDI: + case PPC::INSRDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + int64_t B = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(64 - (B + N))); + TmpInst.addOperand(MCOperand::CreateImm(B)); + Inst = TmpInst; + break; + } + case PPC::ROTRDI: + case PPC::ROTRDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(64 - N)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + Inst = TmpInst; + break; + } + case PPC::SLDI: + case PPC::SLDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + TmpInst.addOperand(MCOperand::CreateImm(63 - N)); + Inst = TmpInst; + break; + } + case PPC::SRDI: + case PPC::SRDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(64 - N)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + Inst = TmpInst; + break; + } + case PPC::CLRRDI: + case PPC::CLRRDIo: { + MCInst TmpInst; + int64_t N = Inst.getOperand(2).getImm(); + TmpInst.setOpcode(Opcode == PPC::CLRRDI? 
PPC::RLDICR : PPC::RLDICRo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(0)); + TmpInst.addOperand(MCOperand::CreateImm(63 - N)); + Inst = TmpInst; + break; + } + case PPC::CLRLSLDI: + case PPC::CLRLSLDIo: { + MCInst TmpInst; + int64_t B = Inst.getOperand(2).getImm(); + int64_t N = Inst.getOperand(3).getImm(); + TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::CreateImm(N)); + TmpInst.addOperand(MCOperand::CreateImm(B - N)); + Inst = TmpInst; + break; + } + } +} + +bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + + switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) { + default: break; + case Match_Success: + // Post-process instructions (typically extended mnemonics) + ProcessInstruction(Inst, Operands); + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + case Match_MissingFeature: + return Error(IDLoc, "instruction use requires an option to be enabled"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((PPCOperand &)*Operands[ErrorInfo]).getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + } + + llvm_unreachable("Implement any new match types added!"); +} + +bool PPCAsmParser:: +MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) { + if (Tok.is(AsmToken::Identifier)) { + StringRef Name = Tok.getString(); + + if (Name.equals_lower("lr")) { + RegNo = isPPC64()? PPC::LR8 : PPC::LR; + IntVal = 8; + return false; + } else if (Name.equals_lower("ctr")) { + RegNo = isPPC64()? PPC::CTR8 : PPC::CTR; + IntVal = 9; + return false; + } else if (Name.equals_lower("vrsave")) { + RegNo = PPC::VRSAVE; + IntVal = 256; + return false; + } else if (Name.startswith_lower("r") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal]; + return false; + } else if (Name.startswith_lower("f") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = FRegs[IntVal]; + return false; + } else if (Name.startswith_lower("v") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = VRegs[IntVal]; + return false; + } else if (Name.startswith_lower("cr") && + !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { + RegNo = CRRegs[IntVal]; + return false; + } + } + + return true; +} + +bool PPCAsmParser:: +ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + const AsmToken &Tok = Parser.getTok(); + StartLoc = Tok.getLoc(); + EndLoc = Tok.getEndLoc(); + RegNo = 0; + int64_t IntVal; + + if (!MatchRegisterName(Tok, RegNo, IntVal)) { + Parser.Lex(); // Eat identifier token. + return false; + } + + return Error(StartLoc, "invalid register name"); +} + +/// Extract \code @l/@ha \endcode modifier from expression. Recursively scan +/// the expression and check for VK_PPC_LO/HI/HA +/// symbol variants. 
If all symbols with modifier use the same +/// variant, return the corresponding PPCMCExpr::VariantKind, +/// and a modified expression using the default symbol variant. +/// Otherwise, return NULL. +const MCExpr *PPCAsmParser:: +ExtractModifierFromExpr(const MCExpr *E, + PPCMCExpr::VariantKind &Variant) { + MCContext &Context = getParser().getContext(); + Variant = PPCMCExpr::VK_PPC_None; + + switch (E->getKind()) { + case MCExpr::Target: + case MCExpr::Constant: + return nullptr; + + case MCExpr::SymbolRef: { + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E); + + switch (SRE->getKind()) { + case MCSymbolRefExpr::VK_PPC_LO: + Variant = PPCMCExpr::VK_PPC_LO; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Variant = PPCMCExpr::VK_PPC_HI; + break; + case MCSymbolRefExpr::VK_PPC_HA: + Variant = PPCMCExpr::VK_PPC_HA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHER: + Variant = PPCMCExpr::VK_PPC_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_HIGHERA: + Variant = PPCMCExpr::VK_PPC_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHEST: + Variant = PPCMCExpr::VK_PPC_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_HIGHESTA: + Variant = PPCMCExpr::VK_PPC_HIGHESTA; + break; + default: + return nullptr; + } + + return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context); + } + + case MCExpr::Unary: { + const MCUnaryExpr *UE = cast<MCUnaryExpr>(E); + const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant); + if (!Sub) + return nullptr; + return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context); + } + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); + PPCMCExpr::VariantKind LHSVariant, RHSVariant; + const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant); + const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant); + + if (!LHS && !RHS) + return nullptr; + + if (!LHS) LHS = BE->getLHS(); + if (!RHS) RHS = BE->getRHS(); + + if (LHSVariant == PPCMCExpr::VK_PPC_None) + Variant = RHSVariant; + else if (RHSVariant == PPCMCExpr::VK_PPC_None) + Variant = LHSVariant; + else if (LHSVariant == RHSVariant) + Variant = LHSVariant; + else + return nullptr; + + return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context); + } + } + + llvm_unreachable("Invalid expression kind!"); +} + +/// Find all VK_TLSGD/VK_TLSLD symbol references in expression and replace +/// them by VK_PPC_TLSGD/VK_PPC_TLSLD. This is necessary to avoid having +/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT. +/// FIXME: This is a hack. 
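/// For example, a call operand written as __tls_get_addr(x@tlsgd) carries a
/// generic VK_TLSGD reference to 'x'; it is rewritten here to the
/// target-specific VK_PPC_TLSGD so the generic ELF writer never sees it.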
+const MCExpr *PPCAsmParser:: +FixupVariantKind(const MCExpr *E) { + MCContext &Context = getParser().getContext(); + + switch (E->getKind()) { + case MCExpr::Target: + case MCExpr::Constant: + return E; + + case MCExpr::SymbolRef: { + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E); + MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; + + switch (SRE->getKind()) { + case MCSymbolRefExpr::VK_TLSGD: + Variant = MCSymbolRefExpr::VK_PPC_TLSGD; + break; + case MCSymbolRefExpr::VK_TLSLD: + Variant = MCSymbolRefExpr::VK_PPC_TLSLD; + break; + default: + return E; + } + return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, Context); + } + + case MCExpr::Unary: { + const MCUnaryExpr *UE = cast<MCUnaryExpr>(E); + const MCExpr *Sub = FixupVariantKind(UE->getSubExpr()); + if (Sub == UE->getSubExpr()) + return E; + return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context); + } + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(E); + const MCExpr *LHS = FixupVariantKind(BE->getLHS()); + const MCExpr *RHS = FixupVariantKind(BE->getRHS()); + if (LHS == BE->getLHS() && RHS == BE->getRHS()) + return E; + return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context); + } + } + + llvm_unreachable("Invalid expression kind!"); +} + +/// ParseExpression. This differs from the default "parseExpression" in that +/// it handles modifiers. +bool PPCAsmParser:: +ParseExpression(const MCExpr *&EVal) { + + if (isDarwin()) + return ParseDarwinExpression(EVal); + + // (ELF Platforms) + // Handle \code @l/@ha \endcode + if (getParser().parseExpression(EVal)) + return true; + + EVal = FixupVariantKind(EVal); + + PPCMCExpr::VariantKind Variant; + const MCExpr *E = ExtractModifierFromExpr(EVal, Variant); + if (E) + EVal = PPCMCExpr::Create(Variant, E, false, getParser().getContext()); + + return false; +} + +/// ParseDarwinExpression. (MachO Platforms) +/// This differs from the default "parseExpression" in that it handles detection +/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present, +/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO +/// syntax form so it is done here. TODO: Determine if there is merit in arranging +/// for this to be done at a higher level. +bool PPCAsmParser:: +ParseDarwinExpression(const MCExpr *&EVal) { + PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None; + switch (getLexer().getKind()) { + default: + break; + case AsmToken::Identifier: + // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus + // something starting with any other char should be part of the + // asm syntax. If handwritten asm includes an identifier like lo16, + // then all bets are off - but no-one would do that, right? 
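      // For instance, Darwin asm materializes an address in two halves:
      //   lis r3, ha16(_foo)
      //   lwz r4, lo16(_foo)(r3)
      // so ha16/hi16/lo16 arrive from the lexer as plain identifiers.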
+ StringRef poss = Parser.getTok().getString(); + if (poss.equals_lower("lo16")) { + Variant = PPCMCExpr::VK_PPC_LO; + } else if (poss.equals_lower("hi16")) { + Variant = PPCMCExpr::VK_PPC_HI; + } else if (poss.equals_lower("ha16")) { + Variant = PPCMCExpr::VK_PPC_HA; + } + if (Variant != PPCMCExpr::VK_PPC_None) { + Parser.Lex(); // Eat the xx16 + if (getLexer().isNot(AsmToken::LParen)) + return Error(Parser.getTok().getLoc(), "expected '('"); + Parser.Lex(); // Eat the '(' + } + break; + } + + if (getParser().parseExpression(EVal)) + return true; + + if (Variant != PPCMCExpr::VK_PPC_None) { + if (getLexer().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "expected ')'"); + Parser.Lex(); // Eat the ')' + EVal = PPCMCExpr::Create(Variant, EVal, false, getParser().getContext()); + } + return false; +} + +/// ParseOperand +/// This handles registers in the form 'NN', '%rNN' for ELF platforms and +/// rNN for MachO. +bool PPCAsmParser::ParseOperand(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + const MCExpr *EVal; + + // Attempt to parse the next token as an immediate + switch (getLexer().getKind()) { + // Special handling for register names. These are interpreted + // as immediates corresponding to the register number. + case AsmToken::Percent: + Parser.Lex(); // Eat the '%'. + unsigned RegNo; + int64_t IntVal; + if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { + Parser.Lex(); // Eat the identifier token. + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); + return false; + } + return Error(S, "invalid register name"); + + case AsmToken::Identifier: + // Note that non-register-name identifiers from the compiler will begin + // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include + // identifiers like r31foo - so we fall through in the event that parsing + // a register name fails. + if (isDarwin()) { + unsigned RegNo; + int64_t IntVal; + if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { + Parser.Lex(); // Eat the identifier token. + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); + return false; + } + } + // Fall-through to process non-register-name identifiers as expression. + // All other expressions + case AsmToken::LParen: + case AsmToken::Plus: + case AsmToken::Minus: + case AsmToken::Integer: + case AsmToken::Dot: + case AsmToken::Dollar: + case AsmToken::Exclaim: + case AsmToken::Tilde: + if (!ParseExpression(EVal)) + break; + /* fall through */ + default: + return Error(S, "unknown operand"); + } + + // Push the parsed operand into the list of operands + Operands.push_back(PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64())); + + // Check whether this is a TLS call expression + bool TLSCall = false; + if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal)) + TLSCall = Ref->getSymbol().getName() == "__tls_get_addr"; + + if (TLSCall && getLexer().is(AsmToken::LParen)) { + const MCExpr *TLSSym; + + Parser.Lex(); // Eat the '('. + S = Parser.getTok().getLoc(); + if (ParseExpression(TLSSym)) + return Error(S, "invalid TLS call expression"); + if (getLexer().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "missing ')'"); + E = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the ')'. 
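    // At this point we have parsed the parenthesized TLS symbol of a call
    // such as:
    //   bl __tls_get_addr(x@tlsgd)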
+ + Operands.push_back(PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64())); + } + + // Otherwise, check for D-form memory operands + if (!TLSCall && getLexer().is(AsmToken::LParen)) { + Parser.Lex(); // Eat the '('. + S = Parser.getTok().getLoc(); + + int64_t IntVal; + switch (getLexer().getKind()) { + case AsmToken::Percent: + Parser.Lex(); // Eat the '%'. + unsigned RegNo; + if (MatchRegisterName(Parser.getTok(), RegNo, IntVal)) + return Error(S, "invalid register name"); + Parser.Lex(); // Eat the identifier token. + break; + + case AsmToken::Integer: + if (!isDarwin()) { + if (getParser().parseAbsoluteExpression(IntVal) || + IntVal < 0 || IntVal > 31) + return Error(S, "invalid register number"); + } else { + return Error(S, "unexpected integer value"); + } + break; + + case AsmToken::Identifier: + if (isDarwin()) { + unsigned RegNo; + if (!MatchRegisterName(Parser.getTok(), RegNo, IntVal)) { + Parser.Lex(); // Eat the identifier token. + break; + } + } + // Fall-through.. + + default: + return Error(S, "invalid memory operand"); + } + + if (getLexer().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "missing ')'"); + E = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the ')'. + + Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); + } + + return false; +} + +/// Parse an instruction mnemonic followed by its operands. +bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { + // The first operand is the token for the instruction name. + // If the next character is a '+' or '-', we need to add it to the + // instruction name, to match what TableGen is doing. + std::string NewOpcode; + if (getLexer().is(AsmToken::Plus)) { + getLexer().Lex(); + NewOpcode = Name; + NewOpcode += '+'; + Name = NewOpcode; + } + if (getLexer().is(AsmToken::Minus)) { + getLexer().Lex(); + NewOpcode = Name; + NewOpcode += '-'; + Name = NewOpcode; + } + // If the instruction ends in a '.', we need to create a separate + // token for it, to match what TableGen is doing. + size_t Dot = Name.find('.'); + StringRef Mnemonic = Name.slice(0, Dot); + if (!NewOpcode.empty()) // Underlying memory for Name is volatile. + Operands.push_back( + PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64())); + else + Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64())); + if (Dot != StringRef::npos) { + SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot); + StringRef DotStr = Name.slice(Dot, StringRef::npos); + if (!NewOpcode.empty()) // Underlying memory for Name is volatile. 
+ Operands.push_back( + PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64())); + else + Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64())); + } + + // If there are no more operands then finish + if (getLexer().is(AsmToken::EndOfStatement)) + return false; + + // Parse the first operand + if (ParseOperand(Operands)) + return true; + + while (getLexer().isNot(AsmToken::EndOfStatement) && + getLexer().is(AsmToken::Comma)) { + // Consume the comma token + getLexer().Lex(); + + // Parse the next operand + if (ParseOperand(Operands)) + return true; + } + + return false; +} + +/// ParseDirective parses the PPC specific directives +bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (!isDarwin()) { + if (IDVal == ".word") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + if (IDVal == ".llong") + return ParseDirectiveWord(8, DirectiveID.getLoc()); + if (IDVal == ".tc") + return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc()); + if (IDVal == ".machine") + return ParseDirectiveMachine(DirectiveID.getLoc()); + if (IDVal == ".abiversion") + return ParseDirectiveAbiVersion(DirectiveID.getLoc()); + if (IDVal == ".localentry") + return ParseDirectiveLocalEntry(DirectiveID.getLoc()); + } else { + if (IDVal == ".machine") + return ParseDarwinDirectiveMachine(DirectiveID.getLoc()); + } + return true; +} + +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return false; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveTC +/// ::= .tc [ symbol (, expression)* ] +bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) { + // Skip TC symbol, which is only used with XCOFF. + while (getLexer().isNot(AsmToken::EndOfStatement) + && getLexer().isNot(AsmToken::Comma)) + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) { + Error(L, "unexpected token in directive"); + return false; + } + Parser.Lex(); + + // Align to word size. + getParser().getStreamer().EmitValueToAlignment(Size); + + // Emit expressions. + return ParseDirectiveWord(Size, L); +} + +/// ParseDirectiveMachine (ELF platforms) +/// ::= .machine [ cpu | "push" | "pop" ] +bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) { + if (getLexer().isNot(AsmToken::Identifier) && + getLexer().isNot(AsmToken::String)) { + Error(L, "unexpected token in directive"); + return false; + } + + StringRef CPU = Parser.getTok().getIdentifier(); + Parser.Lex(); + + // FIXME: Right now, the parser always allows any available + // instruction, so the .machine directive is not useful. + // Implement ".machine any" (by doing nothing) for the benefit + // of existing assembler code. Likewise, we can then implement + // ".machine push" and ".machine pop" as no-op. 
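  // Accepted operands, each simply passed through to emitMachine below:
  //   .machine any
  //   .machine push
  //   .machine pop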
+ if (CPU != "any" && CPU != "push" && CPU != "pop") { + Error(L, "unrecognized machine type"); + return false; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>( + getParser().getStreamer().getTargetStreamer()); + TStreamer.emitMachine(CPU); + + return false; +} + +/// ParseDarwinDirectiveMachine (Mach-o platforms) +/// ::= .machine cpu-identifier +bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) { + if (getLexer().isNot(AsmToken::Identifier) && + getLexer().isNot(AsmToken::String)) { + Error(L, "unexpected token in directive"); + return false; + } + + StringRef CPU = Parser.getTok().getIdentifier(); + Parser.Lex(); + + // FIXME: this is only the 'default' set of cpu variants. + // However we don't act on this information at present, this is simply + // allowing parsing to proceed with minimal sanity checking. + if (CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64") { + Error(L, "unrecognized cpu type"); + return false; + } + + if (isPPC64() && (CPU == "ppc7400" || CPU == "ppc")) { + Error(L, "wrong cpu type specified for 64bit"); + return false; + } + if (!isPPC64() && CPU == "ppc64") { + Error(L, "wrong cpu type specified for 32bit"); + return false; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + + return false; +} + +/// ParseDirectiveAbiVersion +/// ::= .abiversion constant-expression +bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) { + int64_t AbiVersion; + if (getParser().parseAbsoluteExpression(AbiVersion)){ + Error(L, "expected constant expression"); + return false; + } + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>( + getParser().getStreamer().getTargetStreamer()); + TStreamer.emitAbiVersion(AbiVersion); + + return false; +} + +/// ParseDirectiveLocalEntry +/// ::= .localentry symbol, expression +bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { + StringRef Name; + if (getParser().parseIdentifier(Name)) { + Error(L, "expected identifier in directive"); + return false; + } + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) { + Error(L, "unexpected token in directive"); + return false; + } + Lex(); + + const MCExpr *Expr; + if (getParser().parseExpression(Expr)) { + Error(L, "expected expression"); + return false; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Error(L, "unexpected token in directive"); + return false; + } + + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>( + getParser().getStreamer().getTargetStreamer()); + TStreamer.emitLocalEntry(Sym, Expr); + + return false; +} + + + +/// Force static initialization. +extern "C" void LLVMInitializePowerPCAsmParser() { + RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target); + RegisterMCAsmParser<PPCAsmParser> B(ThePPC64Target); + RegisterMCAsmParser<PPCAsmParser> C(ThePPC64LETarget); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "PPCGenAsmMatcher.inc" + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. 
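// A hypothetical alias illustrating why this is needed: a TableGen InstAlias
// whose asm string hard-codes an immediate, e.g.
//   def : InstAlias<"foo $rA, 0", (FOO gprc:$rA)>;
// makes the generated matcher ask for a literal token class such as MCK_0.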
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. + int64_t ImmVal; + switch (Kind) { + case MCK_0: ImmVal = 0; break; + case MCK_1: ImmVal = 1; break; + case MCK_2: ImmVal = 2; break; + case MCK_3: ImmVal = 3; break; + default: return Match_InvalidOperand; + } + + PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); + if (Op.isImm() && Op.getImm() == ImmVal) + return Match_Success; + + return Match_InvalidOperand; +} + +const MCExpr * +PPCAsmParser::applyModifierToExpr(const MCExpr *E, + MCSymbolRefExpr::VariantKind Variant, + MCContext &Ctx) { + switch (Variant) { + case MCSymbolRefExpr::VK_PPC_LO: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HI: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HA: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHER: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHERA: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHEST: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx); + case MCSymbolRefExpr::VK_PPC_HIGHESTA: + return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx); + default: + return nullptr; + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt new file mode 100644 index 000000000000..ca457df88d3e --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMPowerPCDisassembler + PPCDisassembler.cpp + ) diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt new file mode 100644 index 000000000000..b0978c227ae9 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===-- ./lib/Target/PowerPC/Disassembler/LLVMBuild.txt ---------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = PowerPCDisassembler +parent = PowerPC +required_libraries = MC PowerPCInfo Support +add_to_library_groups = PowerPC diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile b/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile new file mode 100644 index 000000000000..86e3b4752207 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===-- lib/Target/PowerPC/Disassembler/Makefile -----------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMPowerPCDisassembler + +# Hack: we need to include 'main' PPC target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp new file mode 100644 index 000000000000..a2305a9efc71 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -0,0 +1,348 @@ +//===------ PPCDisassembler.cpp - Disassembler for PowerPC ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-disassembler" + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { +class PPCDisassembler : public MCDisassembler { +public: + PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + virtual ~PPCDisassembler() {} + + // Override MCDisassembler. + virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const override; +}; +} // end anonymous namespace + +static MCDisassembler *createPPCDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new PPCDisassembler(STI, Ctx); +} + +extern "C" void LLVMInitializePowerPCDisassembler() { + // Register the disassembler for each target. + TargetRegistry::RegisterMCDisassembler(ThePPC32Target, + createPPCDisassembler); + TargetRegistry::RegisterMCDisassembler(ThePPC64Target, + createPPCDisassembler); + TargetRegistry::RegisterMCDisassembler(ThePPC64LETarget, + createPPCDisassembler); +} + +// FIXME: These can be generated by TableGen from the existing register +// encoding values! 
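// Note on the numbering used below: CR bit N names field N/4 and condition
// N%4 (0 = lt, 1 = gt, 2 = eq, 3 = so/un), which is why CRBITRegs is laid
// out in groups of four per CR field.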
+ +static const unsigned CRRegs[] = { + PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3, + PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7 +}; + +static const unsigned CRBITRegs[] = { + PPC::CR0LT, PPC::CR0GT, PPC::CR0EQ, PPC::CR0UN, + PPC::CR1LT, PPC::CR1GT, PPC::CR1EQ, PPC::CR1UN, + PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN, + PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN, + PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN, + PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN, + PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN, + PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN +}; + +static const unsigned FRegs[] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31 +}; + +static const unsigned VRegs[] = { + PPC::V0, PPC::V1, PPC::V2, PPC::V3, + PPC::V4, PPC::V5, PPC::V6, PPC::V7, + PPC::V8, PPC::V9, PPC::V10, PPC::V11, + PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, + PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, + PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; + +static const unsigned VSRegs[] = { + PPC::VSL0, PPC::VSL1, PPC::VSL2, PPC::VSL3, + PPC::VSL4, PPC::VSL5, PPC::VSL6, PPC::VSL7, + PPC::VSL8, PPC::VSL9, PPC::VSL10, PPC::VSL11, + PPC::VSL12, PPC::VSL13, PPC::VSL14, PPC::VSL15, + PPC::VSL16, PPC::VSL17, PPC::VSL18, PPC::VSL19, + PPC::VSL20, PPC::VSL21, PPC::VSL22, PPC::VSL23, + PPC::VSL24, PPC::VSL25, PPC::VSL26, PPC::VSL27, + PPC::VSL28, PPC::VSL29, PPC::VSL30, PPC::VSL31, + + PPC::VSH0, PPC::VSH1, PPC::VSH2, PPC::VSH3, + PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, + PPC::VSH8, PPC::VSH9, PPC::VSH10, PPC::VSH11, + PPC::VSH12, PPC::VSH13, PPC::VSH14, PPC::VSH15, + PPC::VSH16, PPC::VSH17, PPC::VSH18, PPC::VSH19, + PPC::VSH20, PPC::VSH21, PPC::VSH22, PPC::VSH23, + PPC::VSH24, PPC::VSH25, PPC::VSH26, PPC::VSH27, + PPC::VSH28, PPC::VSH29, PPC::VSH30, PPC::VSH31 +}; + +static const unsigned VSFRegs[] = { + PPC::F0, PPC::F1, PPC::F2, PPC::F3, + PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8, PPC::F9, PPC::F10, PPC::F11, + PPC::F12, PPC::F13, PPC::F14, PPC::F15, + PPC::F16, PPC::F17, PPC::F18, PPC::F19, + PPC::F20, PPC::F21, PPC::F22, PPC::F23, + PPC::F24, PPC::F25, PPC::F26, PPC::F27, + PPC::F28, PPC::F29, PPC::F30, PPC::F31, + + PPC::VF0, PPC::VF1, PPC::VF2, PPC::VF3, + PPC::VF4, PPC::VF5, PPC::VF6, PPC::VF7, + PPC::VF8, PPC::VF9, PPC::VF10, PPC::VF11, + PPC::VF12, PPC::VF13, PPC::VF14, PPC::VF15, + PPC::VF16, PPC::VF17, PPC::VF18, PPC::VF19, + PPC::VF20, PPC::VF21, PPC::VF22, PPC::VF23, + PPC::VF24, PPC::VF25, PPC::VF26, PPC::VF27, + PPC::VF28, PPC::VF29, PPC::VF30, PPC::VF31 +}; + +static const unsigned GPRegs[] = { + PPC::R0, PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; + +static const unsigned GP0Regs[] = { + PPC::ZERO, PPC::R1, PPC::R2, PPC::R3, + PPC::R4, PPC::R5, PPC::R6, PPC::R7, + PPC::R8, PPC::R9, PPC::R10, PPC::R11, + PPC::R12, PPC::R13, PPC::R14, PPC::R15, + PPC::R16, PPC::R17, PPC::R18, PPC::R19, + PPC::R20, PPC::R21, PPC::R22, PPC::R23, + PPC::R24, PPC::R25, PPC::R26, PPC::R27, + 
PPC::R28, PPC::R29, PPC::R30, PPC::R31 +}; + +static const unsigned G8Regs[] = { + PPC::X0, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + +template <std::size_t N> +static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo, + const unsigned (&Regs)[N]) { + assert(RegNo < N && "Invalid register number"); + Inst.addOperand(MCOperand::CreateReg(Regs[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCRRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRRegs); +} + +static DecodeStatus DecodeCRBITRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, CRBITRegs); +} + +static DecodeStatus DecodeF4RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, FRegs); +} + +static DecodeStatus DecodeF8RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, FRegs); +} + +static DecodeStatus DecodeVRRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VRegs); +} + +static DecodeStatus DecodeVSRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSRegs); +} + +static DecodeStatus DecodeVSFRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSFRegs); +} + +static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, GPRegs); +} + +static DecodeStatus DecodeGPRC_NOR0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, GP0Regs); +} + +static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, G8Regs); +} + +#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass +#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass + +template<unsigned N> +static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + assert(isUInt<N>(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::CreateImm(Imm)); + return MCDisassembler::Success; +} + +template<unsigned N> +static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + assert(isUInt<N>(Imm) && "Invalid immediate"); + Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm))); + return MCDisassembler::Success; +} + +static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + // Decode the memri field (imm, reg), which has the low 16-bits as the + // displacement and the next 5 bits as the register #. 
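+  // For example, Imm = 0x0005FFF8 gives Base = 5 and Disp = 0xFFF8; the
+  // displacement sign-extends to -8, i.e. an access of the form -8(r5).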
+ + uint64_t Base = Imm >> 16; + uint64_t Disp = Imm & 0xFFFF; + + assert(Base < 32 && "Invalid base register"); + + switch (Inst.getOpcode()) { + default: break; + case PPC::LBZU: + case PPC::LHAU: + case PPC::LHZU: + case PPC::LWZU: + case PPC::LFSU: + case PPC::LFDU: + // Add the tied output operand. + Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base])); + break; + case PPC::STBU: + case PPC::STHU: + case PPC::STWU: + case PPC::STFSU: + case PPC::STFDU: + Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base])); + break; + } + + Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp))); + Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base])); + return MCDisassembler::Success; +} + +static DecodeStatus decodeMemRIXOperands(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + // Decode the memrix field (imm, reg), which has the low 14-bits as the + // displacement and the next 5 bits as the register #. + + uint64_t Base = Imm >> 14; + uint64_t Disp = Imm & 0x3FFF; + + assert(Base < 32 && "Invalid base register"); + + if (Inst.getOpcode() == PPC::LDU) + // Add the tied output operand. + Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base])); + else if (Inst.getOpcode() == PPC::STDU) + Inst.insert(Inst.begin(), MCOperand::CreateReg(GP0Regs[Base])); + + Inst.addOperand(MCOperand::CreateImm(SignExtend64<16>(Disp << 2))); + Inst.addOperand(MCOperand::CreateReg(GP0Regs[Base])); + return MCDisassembler::Success; +} + +static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm, + int64_t Address, const void *Decoder) { + // The cr bit encoding is 0x80 >> cr_reg_num. + + unsigned Zeros = countTrailingZeros(Imm); + assert(Zeros < 8 && "Invalid CR bit value"); + + Inst.addOperand(MCOperand::CreateReg(CRRegs[7 - Zeros])); + return MCDisassembler::Success; +} + +#include "PPCGenDisassemblerTables.inc" + +DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + // Get the four bytes of the instruction. + uint8_t Bytes[4]; + Size = 4; + if (Region.readBytes(Address, Size, Bytes) == -1) { + Size = 0; + return MCDisassembler::Fail; + } + + // The instruction is big-endian encoded. + uint32_t Inst = (Bytes[0] << 24) | + (Bytes[1] << 16) | + (Bytes[2] << 8) | + (Bytes[3] << 0); + + return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp new file mode 100644 index 000000000000..771b6f5ec487 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -0,0 +1,365 @@ +//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an PPC MCInst to a .s file. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCInstPrinter.h" +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +// FIXME: Once the integrated assembler supports full register names, tie this +// to the verbose-asm setting. +static cl::opt<bool> +FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false), + cl::desc("Use full register names when printing assembly")); + +#include "PPCGenAsmWriter.inc" + +void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + // Check for slwi/srwi mnemonics. + if (MI->getOpcode() == PPC::RLWINM) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char MB = MI->getOperand(3).getImm(); + unsigned char ME = MI->getOperand(4).getImm(); + bool useSubstituteMnemonic = false; + if (SH <= 31 && MB == 0 && ME == (31-SH)) { + O << "\tslwi "; useSubstituteMnemonic = true; + } + if (SH <= 31 && MB == (32-SH) && ME == 31) { + O << "\tsrwi "; useSubstituteMnemonic = true; + SH = 32-SH; + } + if (useSubstituteMnemonic) { + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + + printAnnotation(O, Annot); + return; + } + } + + if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) && + MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + O << "\tmr "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + printAnnotation(O, Annot); + return; + } + + if (MI->getOpcode() == PPC::RLDICR) { + unsigned char SH = MI->getOperand(2).getImm(); + unsigned char ME = MI->getOperand(3).getImm(); + // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH + if (63-SH == ME) { + O << "\tsldi "; + printOperand(MI, 0, O); + O << ", "; + printOperand(MI, 1, O); + O << ", " << (unsigned int)SH; + printAnnotation(O, Annot); + return; + } + } + + // For fast-isel, a COPY_TO_REGCLASS may survive this long. This is + // used when converting a 32-bit float to a 64-bit float as part of + // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()), + // as otherwise we have problems with incorrect register classes + // in machine instruction verification. For now, just avoid trying + // to print it as such an instruction has no effect (a 32-bit float + // in a register is already in 64-bit form, just with lower + // precision). FIXME: Is there a better solution? 
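+  // (A COPY_TO_REGCLASS carries no machine encoding of its own; the MC code
+  // emitter skips it the same way, so printing nothing loses no information.)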
+ if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS) + return; + + printInstruction(MI, O); + printAnnotation(O, Annot); +} + + +void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, + const char *Modifier) { + unsigned Code = MI->getOperand(OpNo).getImm(); + + if (StringRef(Modifier) == "cc") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + case PPC::PRED_LT: + O << "lt"; + return; + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_LE: + O << "le"; + return; + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_EQ: + O << "eq"; + return; + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GE: + O << "ge"; + return; + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_GT: + O << "gt"; + return; + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_NE: + O << "ne"; + return; + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + case PPC::PRED_UN: + O << "un"; + return; + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + case PPC::PRED_NU: + O << "nu"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + if (StringRef(Modifier) == "pm") { + switch ((PPC::Predicate)Code) { + case PPC::PRED_LT: + case PPC::PRED_LE: + case PPC::PRED_EQ: + case PPC::PRED_GE: + case PPC::PRED_GT: + case PPC::PRED_NE: + case PPC::PRED_UN: + case PPC::PRED_NU: + return; + case PPC::PRED_LT_MINUS: + case PPC::PRED_LE_MINUS: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GT_MINUS: + case PPC::PRED_NE_MINUS: + case PPC::PRED_UN_MINUS: + case PPC::PRED_NU_MINUS: + O << "-"; + return; + case PPC::PRED_LT_PLUS: + case PPC::PRED_LE_PLUS: + case PPC::PRED_EQ_PLUS: + case PPC::PRED_GE_PLUS: + case PPC::PRED_GT_PLUS: + case PPC::PRED_NE_PLUS: + case PPC::PRED_UN_PLUS: + case PPC::PRED_NU_PLUS: + O << "+"; + return; + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Invalid predicate code"); + } + + assert(StringRef(Modifier) == "reg" && + "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); + printOperand(MI, OpNo+1, O); +} + +void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 3 && "Invalid u2imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int Value = MI->getOperand(OpNo).getImm(); + Value = SignExtend32<5>(Value); + O << (int)Value; +} + +void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 31 && "Invalid u5imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned int Value = MI->getOperand(OpNo).getImm(); + assert(Value <= 63 && "Invalid u6imm argument!"); + O << (unsigned int)Value; +} + +void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).isImm()) + O << (short)MI->getOperand(OpNo).getImm(); + else + printOperand(MI, OpNo, O); +} + +void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if 
(MI->getOperand(OpNo).isImm())
+    O << (unsigned short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  // Branches can take an immediate operand. This is used by the branch
+  // selection pass to print .+8, an eight-byte displacement from the PC.
+  O << ".+";
+  printAbsBranchOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  O << (int)MI->getOperand(OpNo).getImm()*4;
+}
+
+
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  unsigned CCReg = MI->getOperand(OpNo).getReg();
+  unsigned RegNo;
+  switch (CCReg) {
+  default: llvm_unreachable("Unknown CR register");
+  case PPC::CR0: RegNo = 0; break;
+  case PPC::CR1: RegNo = 1; break;
+  case PPC::CR2: RegNo = 2; break;
+  case PPC::CR3: RegNo = 3; break;
+  case PPC::CR4: RegNo = 4; break;
+  case PPC::CR5: RegNo = 5; break;
+  case PPC::CR6: RegNo = 6; break;
+  case PPC::CR7: RegNo = 7; break;
+  }
+  O << (0x80 >> RegNo);
+}
+
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printS16ImmOperand(MI, OpNo, O);
+  O << '(';
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register. For this reason, the Darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  if (MI->getOperand(OpNo).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must
+  // come at the _end_ of the expression.
+  const MCOperand &Op = MI->getOperand(OpNo);
+  const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*Op.getExpr());
+  O << refExp.getSymbol().getName();
+  O << '(';
+  printOperand(MI, OpNo+1, O);
+  O << ')';
+  if (refExp.getKind() != MCSymbolRefExpr::VK_None)
+    O << '@' << MCSymbolRefExpr::getVariantKindName(refExp.getKind());
+}
+
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left. Used for Linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames)
+    return RegName;
+
+  switch (RegName[0]) {
+  case 'r':
+  case 'f':
+  case 'v':
+    if (RegName[1] == 's')
+      return RegName + 2;
+    return RegName + 1;
+  case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+
+  return RegName;
+}
+
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    const char *RegName = getRegisterName(Op.getReg());
+    // The Linux and AIX assemblers do not accept register prefixes.
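+    // For example, "r3" is printed as "3" and "cr7" as "7"; Darwin syntax
+    // keeps the full "r3" form.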
+ if (!isDarwinSyntax()) + RegName = stripRegisterPrefix(RegName); + + O << RegName; + return; + } + + if (Op.isImm()) { + O << Op.getImm(); + return; + } + + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h new file mode 100644 index 000000000000..211a62813e7a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -0,0 +1,63 @@ +//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an PPC MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef PPCINSTPRINTER_H +#define PPCINSTPRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class MCOperand; + +class PPCInstPrinter : public MCInstPrinter { + bool IsDarwin; +public: + PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, bool isDarwin) + : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {} + + bool isDarwinSyntax() const { + return IsDarwin; + } + + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPredicateOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier = nullptr); + + void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp new file mode 100644 index 000000000000..c54d5e75bdfd --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -0,0 +1,245 @@ +//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + case PPC::fixup_ppc_nofixup: + return Value; + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + return Value & 0xfffc; + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + return Value & 0x3fffffc; + case PPC::fixup_ppc_half16: + return Value & 0xffff; + case PPC::fixup_ppc_half16ds: + return Value & 0xfffc; + } +} + +static unsigned getFixupKindNumBytes(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_1: + return 1; + case FK_Data_2: + case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_half16ds: + return 2; + case FK_Data_4: + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + return 4; + case FK_Data_8: + return 8; + case PPC::fixup_ppc_nofixup: + return 0; + } +} + +namespace { + +class PPCAsmBackend : public MCAsmBackend { + const Target &TheTarget; + bool IsLittleEndian; +public: + PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T), + IsLittleEndian(isLittle) {} + + unsigned getNumFixupKinds() const override { + return PPC::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_br24abs", 6, 24, 0 }, + { "fixup_ppc_brcond14abs", 16, 14, 0 }, + { "fixup_ppc_half16", 0, 16, 0 }, + { "fixup_ppc_half16ds", 0, 14, 0 }, + { "fixup_ppc_nofixup", 0, 0, 0 } + }; + const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = { + // name offset bits flags + { "fixup_ppc_br24", 2, 24, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_brcond14", 2, 14, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_br24abs", 2, 24, 0 }, + { "fixup_ppc_brcond14abs", 2, 14, 0 }, + { "fixup_ppc_half16", 0, 16, 0 }, + { "fixup_ppc_half16ds", 2, 14, 0 }, + { "fixup_ppc_nofixup", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override { + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. 
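+
+    // Worked example (big-endian): for a fixup_ppc_br24 resolving to a
+    // displacement of 0x2000, adjustFixupValue keeps 0x2000 (& 0x3fffffc)
+    // and the loop below ORs the bytes 00 00 20 00 into the 4-byte word at
+    // Offset, i.e. straight into the LI field of the branch.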
+
+    unsigned Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value. The Value has been "split up" into the appropriate
+    // bitfields above.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittleEndian ? i : (NumBytes - 1 - i);
+      Data[Offset + i] |= uint8_t((Value >> (Idx * 8)) & 0xff);
+    }
+  }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override {
+    switch ((PPC::Fixups)Fixup.getKind()) {
+    default: break;
+    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
+      // If the target symbol has a local entry point we must not attempt
+      // to resolve the fixup directly. Emit a relocation and leave
+      // resolution of the final target address to the linker.
+      if (const MCSymbolRefExpr *A = Target.getSymA()) {
+        const MCSymbolData &Data = Asm.getSymbolData(A->getSymbol());
+        // The "other" values are stored in the last 6 bits of the second byte.
+        // The traditional defines for STO values assume the full byte and thus
+        // the shift to pack it.
+        unsigned Other = MCELF::getOther(Data) << 2;
+        if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
+          IsResolved = false;
+      }
+      break;
+    }
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
+    // FIXME.
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    // FIXME.
+    llvm_unreachable("fixupNeedsRelaxation() unimplemented");
+  }
+
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write32(0x60000000);
+
+    switch (Count % 4) {
+    default: break; // No leftover bytes to write
+    case 1: OW->Write8(0); break;
+    case 2: OW->Write16(0); break;
+    case 3: OW->Write16(0); OW->Write8(0); break;
+    }
+
+    return true;
+  }
+
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64" || Name == "ppc64le") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+
+  bool isLittleEndian() const {
+    return IsLittleEndian;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+      bool is64 = getPointerSize() == 8;
+      return createPPCMachObjectWriter(
+          OS,
+          /*Is64Bit=*/is64,
+          (is64 ?
MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC), + MachO::CPU_SUBTYPE_POWERPC_ALL); + } + }; + + class ELFPPCAsmBackend : public PPCAsmBackend { + uint8_t OSABI; + public: + ELFPPCAsmBackend(const Target &T, bool IsLittleEndian, uint8_t OSABI) : + PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { } + + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + bool is64 = getPointerSize() == 8; + return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI); + } + }; + +} // end anonymous namespace + +MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { + if (Triple(TT).isOSDarwin()) + return new DarwinPPCAsmBackend(T); + + uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); + bool IsLittleEndian = Triple(TT).getArch() == Triple::ppc64le; + return new ELFPPCAsmBackend(T, IsLittleEndian, OSABI); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp new file mode 100644 index 000000000000..ca813176bd5a --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -0,0 +1,432 @@ +//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { + class PPCELFObjectWriter : public MCELFObjectTargetWriter { + public: + PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI); + + virtual ~PPCELFObjectWriter(); + protected: + virtual unsigned getRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const; + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + + bool needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const override; + }; +} + +PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, + Is64Bit ? 
ELF::EM_PPC64 : ELF::EM_PPC, + /*HasRelocationAddend*/ true) {} + +PPCELFObjectWriter::~PPCELFObjectWriter() { +} + +static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target, + const MCFixup &Fixup) { + const MCExpr *Expr = Fixup.getValue(); + + if (Expr->getKind() != MCExpr::Target) + return Target.getAccessVariant(); + + switch (cast<PPCMCExpr>(Expr)->getKind()) { + case PPCMCExpr::VK_PPC_None: + return MCSymbolRefExpr::VK_None; + case PPCMCExpr::VK_PPC_LO: + return MCSymbolRefExpr::VK_PPC_LO; + case PPCMCExpr::VK_PPC_HI: + return MCSymbolRefExpr::VK_PPC_HI; + case PPCMCExpr::VK_PPC_HA: + return MCSymbolRefExpr::VK_PPC_HA; + case PPCMCExpr::VK_PPC_HIGHERA: + return MCSymbolRefExpr::VK_PPC_HIGHERA; + case PPCMCExpr::VK_PPC_HIGHER: + return MCSymbolRefExpr::VK_PPC_HIGHER; + case PPCMCExpr::VK_PPC_HIGHEST: + return MCSymbolRefExpr::VK_PPC_HIGHEST; + case PPCMCExpr::VK_PPC_HIGHESTA: + return MCSymbolRefExpr::VK_PPC_HIGHESTA; + } + llvm_unreachable("unknown PPCMCExpr kind"); +} + +unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const +{ + MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup); + + // determine the type of the relocation + unsigned Type; + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: + llvm_unreachable("Unimplemented"); + case PPC::fixup_ppc_br24: + case PPC::fixup_ppc_br24abs: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_REL24; + break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_PPC_PLTREL24; + break; + } + break; + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_brcond14abs: + Type = ELF::R_PPC_REL14; + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_REL16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC_REL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = ELF::R_PPC_REL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_HA: + Type = ELF::R_PPC_REL16_HA; + break; + } + break; + case FK_Data_4: + case FK_PCRel_4: + Type = ELF::R_PPC_REL32; + break; + case FK_Data_8: + case FK_PCRel_8: + Type = ELF::R_PPC64_REL64; + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: llvm_unreachable("invalid fixup kind!"); + case PPC::fixup_ppc_br24abs: + Type = ELF::R_PPC_ADDR24; + break; + case PPC::fixup_ppc_brcond14abs: + Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_ + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC_ADDR16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC_ADDR16_LO; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = ELF::R_PPC_ADDR16_HI; + break; + case MCSymbolRefExpr::VK_PPC_HA: + Type = ELF::R_PPC_ADDR16_HA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHER: + Type = ELF::R_PPC64_ADDR16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_HIGHERA: + Type = ELF::R_PPC64_ADDR16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_HIGHEST: + Type = ELF::R_PPC64_ADDR16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_HIGHESTA: + Type = ELF::R_PPC64_ADDR16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_PPC_GOT16; + break; + case MCSymbolRefExpr::VK_PPC_GOT_LO: + Type = ELF::R_PPC_GOT16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_HI: + Type = ELF::R_PPC_GOT16_HI; + 
break; + case MCSymbolRefExpr::VK_PPC_GOT_HA: + Type = ELF::R_PPC_GOT16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TOC: + Type = ELF::R_PPC64_TOC16; + break; + case MCSymbolRefExpr::VK_PPC_TOC_LO: + Type = ELF::R_PPC64_TOC16_LO; + break; + case MCSymbolRefExpr::VK_PPC_TOC_HI: + Type = ELF::R_PPC64_TOC16_HI; + break; + case MCSymbolRefExpr::VK_PPC_TOC_HA: + Type = ELF::R_PPC64_TOC16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC_TPREL16; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_LO: + Type = ELF::R_PPC_TPREL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HI: + Type = ELF::R_PPC_TPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HA: + Type = ELF::R_PPC_TPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER: + Type = ELF::R_PPC64_TPREL16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA: + Type = ELF::R_PPC64_TPREL16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST: + Type = ELF::R_PPC64_TPREL16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA: + Type = ELF::R_PPC64_TPREL16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL16; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_LO: + Type = ELF::R_PPC64_DTPREL16_LO; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HI: + Type = ELF::R_PPC64_DTPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HA: + Type = ELF::R_PPC64_DTPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER: + Type = ELF::R_PPC64_DTPREL16_HIGHER; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA: + Type = ELF::R_PPC64_DTPREL16_HIGHERA; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST: + Type = ELF::R_PPC64_DTPREL16_HIGHEST; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA: + Type = ELF::R_PPC64_DTPREL16_HIGHESTA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD: + if (is64Bit()) + Type = ELF::R_PPC64_GOT_TLSGD16; + else + Type = ELF::R_PPC_GOT_TLSGD16; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO: + Type = ELF::R_PPC64_GOT_TLSGD16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI: + Type = ELF::R_PPC64_GOT_TLSGD16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA: + Type = ELF::R_PPC64_GOT_TLSGD16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD: + if (is64Bit()) + Type = ELF::R_PPC64_GOT_TLSLD16; + else + Type = ELF::R_PPC_GOT_TLSLD16; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO: + Type = ELF::R_PPC64_GOT_TLSLD16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI: + Type = ELF::R_PPC64_GOT_TLSLD16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA: + Type = ELF::R_PPC64_GOT_TLSLD16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL: + /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS. */ + Type = ELF::R_PPC64_GOT_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO: + /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS. */ + Type = ELF::R_PPC64_GOT_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI: + Type = ELF::R_PPC64_GOT_TPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL: + /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS. 
*/ + Type = ELF::R_PPC64_GOT_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO: + /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets + are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS. */ + Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA: + Type = ELF::R_PPC64_GOT_TPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI: + Type = ELF::R_PPC64_GOT_DTPREL16_HI; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA: + Type = ELF::R_PPC64_GOT_DTPREL16_HA; + break; + } + break; + case PPC::fixup_ppc_half16ds: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC64_ADDR16_DS; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = ELF::R_PPC64_ADDR16_LO_DS; + break; + case MCSymbolRefExpr::VK_GOT: + Type = ELF::R_PPC64_GOT16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_LO: + Type = ELF::R_PPC64_GOT16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_TOC: + Type = ELF::R_PPC64_TOC16_DS; + break; + case MCSymbolRefExpr::VK_PPC_TOC_LO: + Type = ELF::R_PPC64_TOC16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC64_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_TPREL_LO: + Type = ELF::R_PPC64_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL_LO: + Type = ELF::R_PPC64_DTPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL: + Type = ELF::R_PPC64_GOT_TPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO: + Type = ELF::R_PPC64_GOT_TPREL16_LO_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL: + Type = ELF::R_PPC64_GOT_DTPREL16_DS; + break; + case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO: + Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS; + break; + } + break; + case PPC::fixup_ppc_nofixup: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TLSGD: + if (is64Bit()) + Type = ELF::R_PPC64_TLSGD; + else + Type = ELF::R_PPC_TLSGD; + break; + case MCSymbolRefExpr::VK_PPC_TLSLD: + if (is64Bit()) + Type = ELF::R_PPC64_TLSLD; + else + Type = ELF::R_PPC_TLSLD; + break; + case MCSymbolRefExpr::VK_PPC_TLS: + if (is64Bit()) + Type = ELF::R_PPC64_TLS; + else + Type = ELF::R_PPC_TLS; + break; + } + break; + case FK_Data_8: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TOCBASE: + Type = ELF::R_PPC64_TOC; + break; + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC64_ADDR64; + break; + case MCSymbolRefExpr::VK_PPC_DTPMOD: + Type = ELF::R_PPC64_DTPMOD64; + break; + case MCSymbolRefExpr::VK_PPC_TPREL: + Type = ELF::R_PPC64_TPREL64; + break; + case MCSymbolRefExpr::VK_PPC_DTPREL: + Type = ELF::R_PPC64_DTPREL64; + break; + } + break; + case FK_Data_4: + Type = ELF::R_PPC_ADDR32; + break; + case FK_Data_2: + Type = ELF::R_PPC_ADDR16; + break; + } + } + return Type; +} + +unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + return getRelocTypeInner(Target, Fixup, IsPCRel); +} + +bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbolData &SD, + unsigned Type) const { + switch (Type) { + default: + return false; + + case ELF::R_PPC_REL24: + // If the target symbol has a local entry point, we must keep the + // target symbol to preserve that information for the linker. + // The "other" values are stored in the last 6 bits of the second byte. 
+ // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. + unsigned Other = MCELF::getOther(SD) << 2; + return (Other & ELF::STO_PPC64_LOCAL_MASK) != 0; + } +} + +MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS, + bool Is64Bit, + bool IsLittleEndian, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h new file mode 100644 index 000000000000..68de8c1f27a5 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -0,0 +1,56 @@ +//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PPC_PPCFIXUPKINDS_H +#define LLVM_PPC_PPCFIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +#undef PPC + +namespace llvm { +namespace PPC { +enum Fixups { + // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b' + // and 'bl'. + fixup_ppc_br24 = FirstTargetFixupKind, + + /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional + /// branches. + fixup_ppc_brcond14, + + /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches + /// like 'ba' and 'bla'. + fixup_ppc_br24abs, + + /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional + /// branches. + fixup_ppc_brcond14abs, + + /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo) + /// or ha16(_foo) for instrs like 'li' or 'addis'. + fixup_ppc_half16, + + /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with + /// implied 2 zero bits for instrs like 'std'. + fixup_ppc_half16ds, + + /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call + /// to __tls_get_addr for the TLS general and local dynamic models, + /// or inserts the thread-pointer register number. + fixup_ppc_nofixup, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp new file mode 100644 index 000000000000..b95a2ac13e04 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -0,0 +1,82 @@ +//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the MCAsmInfoDarwin properties. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCMCAsmInfo.h" +#include "llvm/ADT/Triple.h" + +using namespace llvm; + +void PPCMCAsmInfoDarwin::anchor() { } + +PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { + if (is64Bit) { + PointerSize = CalleeSaveStackSlotSize = 8; + } + IsLittleEndian = false; + + CommentString = ";"; + ExceptionsType = ExceptionHandling::DwarfCFI; + + if (!is64Bit) + Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode. + + AssemblerDialect = 1; // New-Style mnemonics. + SupportsDebugInformation= true; // Debug information. + + // The installed assembler for OSX < 10.6 lacks some directives. + // FIXME: this should really be a check on the assembler characteristics + // rather than OS version + if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) + HasWeakDefCanBeHiddenDirective = false; + + UseIntegratedAssembler = true; +} + +void PPCLinuxMCAsmInfo::anchor() { } + +PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) { + if (is64Bit) { + PointerSize = CalleeSaveStackSlotSize = 8; + } + IsLittleEndian = T.getArch() == Triple::ppc64le; + + // ".comm align is in bytes but .align is pow-2." + AlignmentIsInBytes = false; + + CommentString = "#"; + + // Uses '.section' before '.bss' directive + UsesELFSectionDirectiveForBSS = true; + + // Debug Information + SupportsDebugInformation = true; + + DollarIsPC = true; + + // Set up DWARF directives + HasLEB128 = true; // Target asm supports leb128 directives (little-endian) + MinInstAlignment = 4; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + ZeroDirective = "\t.space\t"; + Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr; + AssemblerDialect = 1; // New-Style mnemonics. + + if (T.getOS() == llvm::Triple::FreeBSD || + (T.getOS() == llvm::Triple::NetBSD && !is64Bit) || + (T.getOS() == llvm::Triple::OpenBSD && !is64Bit)) + UseIntegratedAssembler = true; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h new file mode 100644 index 000000000000..754330b2c60f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -0,0 +1,37 @@ +//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MCAsmInfoDarwin class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef PPCTARGETASMINFO_H +#define PPCTARGETASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" +#include "llvm/MC/MCAsmInfoELF.h" + +namespace llvm { +class Triple; + + class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { + void anchor() override; + public: + explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&); + }; + + class PPCLinuxMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + public: + explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&); + }; + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp new file mode 100644 index 000000000000..435a93f78c1d --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -0,0 +1,322 @@ +//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCMCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOpcodes.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted"); + +namespace { +class PPCMCCodeEmitter : public MCCodeEmitter { + PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION; + + const MCInstrInfo &MCII; + const MCContext &CTX; + bool IsLittleEndian; + +public: + PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool isLittle) + : MCII(mcii), CTX(ctx), IsLittleEndian(isLittle) { + } + + ~PPCMCCodeEmitter() {} + + unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo, + 
SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override { + // For fast-isel, a float COPY_TO_REGCLASS can survive this long. + // It's just a nop to keep the register classes happy, so don't + // generate anything. + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + if (Opcode == TargetOpcode::COPY_TO_REGCLASS) + return; + + uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI); + + // Output the constant in big/little endian byte order. + unsigned Size = Desc.getSize(); + switch (Size) { + case 4: + if (IsLittleEndian) { + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); + } + break; + case 8: + // If we emit a pair of instructions, the first one is + // always in the top 32 bits, even on little-endian. + if (IsLittleEndian) { + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 56); + OS << (char)(Bits); + OS << (char)(Bits >> 8); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 24); + } else { + OS << (char)(Bits >> 56); + OS << (char)(Bits >> 48); + OS << (char)(Bits >> 40); + OS << (char)(Bits >> 32); + OS << (char)(Bits >> 24); + OS << (char)(Bits >> 16); + OS << (char)(Bits >> 8); + OS << (char)(Bits); + } + break; + default: + llvm_unreachable ("Invalid instruction size"); + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. + } + +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + Triple TT(STI.getTargetTriple()); + bool IsLittleEndian = TT.getArch() == Triple::ppc64le; + return new PPCMCCodeEmitter(MCII, Ctx, IsLittleEndian); +} + +unsigned PPCMCCodeEmitter:: +getDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_br24)); + return 0; +} + +unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. 
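+  // For example, a 'bne' against a not-yet-resolved label leaves the 14-bit
+  // BD field zero (the return value below) and attaches a fixup_ppc_brcond14
+  // for the assembler or linker to resolve.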
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_brcond14)); + return 0; +} + +unsigned PPCMCCodeEmitter:: +getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_br24abs)); + return 0; +} + +unsigned PPCMCCodeEmitter:: +getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the branch target. + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_brcond14abs)); + return 0; +} + +unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the immediate field. + Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16)); + return 0; +} + +unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Encode (imm, reg) as a memri, which has the low 16-bits as the + // displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 16; + + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return (getMachineOpValue(MI, MO, Fixups, STI) & 0xFFFF) | RegBits; + + // Add a fixup for the displacement field. + Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16)); + return RegBits; +} + + +unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Encode (imm, reg) as a memrix, which has the low 14-bits as the + // displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 14; + + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return ((getMachineOpValue(MI, MO, Fixups, STI) >> 2) & 0x3FFF) | RegBits; + + // Add a fixup for the displacement field. + Fixups.push_back(MCFixup::Create(IsLittleEndian? 0 : 2, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_half16ds)); + return RegBits; +} + + +unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups, STI); + + // Add a fixup for the TLS register, which simply provides a relocation + // hint to the linker that this statement is part of a relocation sequence. + // Return the thread-pointer register's encoding. 
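+  // (The thread pointer is r13 on 64-bit and r2 on 32-bit targets, hence
+  // the X13/R2 selection below.)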
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_nofixup)); + Triple TT(STI.getTargetTriple()); + bool isPPC64 = TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le; + return CTX.getRegisterInfo()->getEncodingValue(isPPC64 ? PPC::X13 : PPC::R2); +} + +unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // For special TLS calls, we need two fixups; one for the branch target + // (__tls_get_addr), which we create via getDirectBrEncoding as usual, + // and one for the TLSGD or TLSLD symbol, which is emitted here. + const MCOperand &MO = MI.getOperand(OpNo+1); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_nofixup)); + return getDirectBrEncoding(MI, OpNo, Fixups, STI); +} + +unsigned PPCMCCodeEmitter:: +get_crbitm_encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 || + MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) && + (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)); + return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); +} + + +unsigned PPCMCCodeEmitter:: +getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) { + // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand. + // The GPR operand should come through here though. + assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 && + MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) || + MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7); + return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); + } + + assert(MO.isImm() && + "Relocation required in an instruction that we cannot encode!"); + return MO.getImm(); +} + + +#include "PPCGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp new file mode 100644 index 000000000000..3ac0aca6b78c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp @@ -0,0 +1,133 @@ +//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCMCExpr.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppcmcexpr" + +const PPCMCExpr* +PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin); +} + +void PPCMCExpr::PrintImpl(raw_ostream &OS) const { + if (isDarwinSyntax()) { + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: OS << "lo16"; break; + case VK_PPC_HI: OS << "hi16"; break; + case VK_PPC_HA: OS << "ha16"; break; + } + + OS << '('; + getSubExpr()->print(OS); + OS << ')'; + } else { + getSubExpr()->print(OS); + + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: OS << "@l"; break; + case VK_PPC_HI: OS << "@h"; break; + case VK_PPC_HA: OS << "@ha"; break; + case VK_PPC_HIGHER: OS << "@higher"; break; + case VK_PPC_HIGHERA: OS << "@highera"; break; + case VK_PPC_HIGHEST: OS << "@highest"; break; + case VK_PPC_HIGHESTA: OS << "@highesta"; break; + } + } +} + +bool +PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + MCValue Value; + + if (!getSubExpr()->EvaluateAsRelocatable(Value, Layout)) + return false; + + if (Value.isAbsolute()) { + int64_t Result = Value.getConstant(); + switch (Kind) { + default: + llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: + Result = Result & 0xffff; + break; + case VK_PPC_HI: + Result = (Result >> 16) & 0xffff; + break; + case VK_PPC_HA: + Result = ((Result + 0x8000) >> 16) & 0xffff; + break; + case VK_PPC_HIGHER: + Result = (Result >> 32) & 0xffff; + break; + case VK_PPC_HIGHERA: + Result = ((Result + 0x8000) >> 32) & 0xffff; + break; + case VK_PPC_HIGHEST: + Result = (Result >> 48) & 0xffff; + break; + case VK_PPC_HIGHESTA: + Result = ((Result + 0x8000) >> 48) & 0xffff; + break; + } + Res = MCValue::get(Result); + } else { + if (!Layout) + return false; + + MCContext &Context = Layout->getAssembler().getContext(); + const MCSymbolRefExpr *Sym = Value.getSymA(); + MCSymbolRefExpr::VariantKind Modifier = Sym->getKind(); + if (Modifier != MCSymbolRefExpr::VK_None) + return false; + switch (Kind) { + default: + llvm_unreachable("Invalid kind!"); + case VK_PPC_LO: + Modifier = MCSymbolRefExpr::VK_PPC_LO; + break; + case VK_PPC_HI: + Modifier = MCSymbolRefExpr::VK_PPC_HI; + break; + case VK_PPC_HA: + Modifier = MCSymbolRefExpr::VK_PPC_HA; + break; + case VK_PPC_HIGHERA: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA; + break; + case VK_PPC_HIGHER: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHER; + break; + case VK_PPC_HIGHEST: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST; + break; + case VK_PPC_HIGHESTA: + Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA; + break; + } + Sym = MCSymbolRefExpr::Create(&Sym->getSymbol(), Modifier, Context); + Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant()); + } + + return true; +} + +void PPCMCExpr::visitUsedExpr(MCStreamer &Streamer) const { + Streamer.visitUsedExpr(*getSubExpr()); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h new file mode 100644 index 000000000000..bca408507e72 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -0,0 +1,96 @@ +//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===// +// +// The 
LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef PPCMCEXPR_H +#define PPCMCEXPR_H + +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" + +namespace llvm { + +class PPCMCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_PPC_None, + VK_PPC_LO, + VK_PPC_HI, + VK_PPC_HA, + VK_PPC_HIGHER, + VK_PPC_HIGHERA, + VK_PPC_HIGHEST, + VK_PPC_HIGHESTA + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + bool IsDarwin; + + explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr, + bool _IsDarwin) + : Kind(_Kind), Expr(_Expr), IsDarwin(_IsDarwin) {} + +public: + /// @name Construction + /// @{ + + static const PPCMCExpr *Create(VariantKind Kind, const MCExpr *Expr, + bool isDarwin, MCContext &Ctx); + + static const PPCMCExpr *CreateLo(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return Create(VK_PPC_LO, Expr, isDarwin, Ctx); + } + + static const PPCMCExpr *CreateHi(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return Create(VK_PPC_HI, Expr, isDarwin, Ctx); + } + + static const PPCMCExpr *CreateHa(const MCExpr *Expr, + bool isDarwin, MCContext &Ctx) { + return Create(VK_PPC_HA, Expr, isDarwin, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getKind - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// isDarwinSyntax - True if expression is to be printed using Darwin syntax. + bool isDarwinSyntax() const { return IsDarwin; } + + + /// @} + + void PrintImpl(raw_ostream &OS) const override; + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; + const MCSection *FindAssociatedSection() const override { + return getSubExpr()->FindAssociatedSection(); + } + + // There are no TLS PPCMCExprs at the moment. + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp new file mode 100644 index 000000000000..4c6780ff75a7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -0,0 +1,307 @@ +//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides PowerPC specific target descriptions.
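The absolute-value folding in PPCMCExpr::EvaluateAsRelocatableImpl earlier follows the standard PowerPC half-word split: @ha adds 0x8000 before shifting so that a lis/addi pair recombines correctly even when the low half is negative as a signed 16-bit immediate. A worked check of that identity:

#include <cassert>
#include <cstdint>

uint64_t lo(uint64_t V) { return V & 0xffff; }
uint64_t ha(uint64_t V) { return ((V + 0x8000) >> 16) & 0xffff; }

int main() {
  uint64_t Addr = 0x12348765;           // low half has its sign bit set
  int64_t SExtLo = int16_t(lo(Addr));   // addi sign-extends @l
  assert(ha(Addr) == 0x1235);           // one above the raw high half
  assert((ha(Addr) << 16) + SExtLo == Addr);
}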
+// +//===----------------------------------------------------------------------===// + +#include "PPCMCTargetDesc.h" +#include "InstPrinter/PPCInstPrinter.h" +#include "PPCMCAsmInfo.h" +#include "PPCTargetStreamer.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "PPCGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "PPCGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "PPCGenRegisterInfo.inc" + +// Pin the vtable to this file. +PPCTargetStreamer::~PPCTargetStreamer() {} +PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + +static MCInstrInfo *createPPCMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitPPCMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) { + Triple TheTriple(TT); + bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || + TheTriple.getArch() == Triple::ppc64le); + unsigned Flavour = isPPC64 ? 0 : 1; + unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR; + + MCRegisterInfo *X = new MCRegisterInfo(); + InitPPCMCRegisterInfo(X, RA, Flavour, Flavour); + return X; +} + +static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitPPCMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) { + Triple TheTriple(TT); + bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || + TheTriple.getArch() == Triple::ppc64le); + + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple); + else + MAI = new PPCLinuxMCAsmInfo(isPPC64, TheTriple); + + // Initial state of the frame pointer is R1. + unsigned Reg = isPPC64 ? 
PPC::X1 : PPC::R1; + MCCFIInstruction Inst = + MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + + if (RM == Reloc::Default) { + Triple T(TT); + if (T.isOSDarwin()) + RM = Reloc::DynamicNoPIC; + else + RM = Reloc::Static; + } + if (CM == CodeModel::Default) { + Triple T(TT); + if (!T.isOSDarwin() && + (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le)) + CM = CodeModel::Medium; + } + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +namespace { +class PPCTargetAsmStreamer : public PPCTargetStreamer { + formatted_raw_ostream &OS; + +public: + PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) + : PPCTargetStreamer(S), OS(OS) {} + void emitTCEntry(const MCSymbol &S) override { + OS << "\t.tc "; + OS << S.getName(); + OS << "[TC],"; + OS << S.getName(); + OS << '\n'; + } + void emitMachine(StringRef CPU) override { + OS << "\t.machine " << CPU << '\n'; + } + virtual void emitAbiVersion(int AbiVersion) override { + OS << "\t.abiversion " << AbiVersion << '\n'; + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + OS << "\t.localentry\t" << *S << ", " << *LocalOffset << '\n'; + } +}; + +class PPCTargetELFStreamer : public PPCTargetStreamer { +public: + PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + MCELFStreamer &getStreamer() { + return static_cast<MCELFStreamer &>(Streamer); + } + virtual void emitTCEntry(const MCSymbol &S) override { + // Creates a R_PPC64_TOC relocation + Streamer.EmitSymbolValue(&S, 8); + } + void emitMachine(StringRef CPU) override { + // FIXME: Is there anything to do in here or does this directive only + // limit the parser? + } + virtual void emitAbiVersion(int AbiVersion) override { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned Flags = MCA.getELFHeaderEFlags(); + Flags &= ~ELF::EF_PPC64_ABI; + Flags |= (AbiVersion & ELF::EF_PPC64_ABI); + MCA.setELFHeaderEFlags(Flags); + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + MCAssembler &MCA = getStreamer().getAssembler(); + MCSymbolData &Data = getStreamer().getOrCreateSymbolData(S); + + int64_t Res; + if (!LocalOffset->EvaluateAsAbsolute(Res, MCA)) + report_fatal_error(".localentry expression must be absolute."); + + unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res); + if (Res != ELF::decodePPC64LocalEntryOffset(Encoded)) + report_fatal_error(".localentry expression cannot be encoded."); + + // The "other" values are stored in the last 6 bits of the second byte. + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. + unsigned Other = MCELF::getOther(Data) << 2; + Other &= ~ELF::STO_PPC64_LOCAL_MASK; + Other |= Encoded; + MCELF::setOther(Data, Other >> 2); + + // For GAS compatibility, unless we already saw a .abiversion directive, + // set e_flags to indicate ELFv2 ABI. 
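The .localentry handling above packs the encoded offset into the top three bits of st_other (the ELFv2 STO_PPC64_LOCAL field); the << 2 / >> 2 shuffle is needed because MCELF::getOther/setOther deal in st_other with the two low visibility bits stripped. Under the ELFv2 ABI the field decodes to a power-of-two byte offset, so the common 8-byte distance between global and local entry points encodes as 3. A hedged sketch of the decode direction (the real encode/decode helpers live in llvm/Support/ELF.h):

#include <cassert>
#include <cstdint>

// Decode the 3-bit STO_PPC64_LOCAL field (bits 5..7 of st_other) into a
// byte offset; yields 0, 0, 4, 8, 16, 32, 64, 128 for v = 0..7.
unsigned localEntryOffset(uint8_t StOther) {
  unsigned V = (StOther & 0xe0) >> 5;
  return ((1u << V) >> 2) << 2;
}

int main() {
  assert(localEntryOffset(3 << 5) == 8);  // the usual two-instruction prologue
  assert(localEntryOffset(0) == 0);       // single entry point
}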
+ unsigned Flags = MCA.getELFHeaderEFlags(); + if ((Flags & ELF::EF_PPC64_ABI) == 0) + MCA.setELFHeaderEFlags(Flags | 2); + } +}; + +class PPCTargetMachOStreamer : public PPCTargetStreamer { +public: + PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} + void emitTCEntry(const MCSymbol &S) override { + llvm_unreachable("Unknown pseudo-op: .tc"); + } + void emitMachine(StringRef CPU) override { + // FIXME: We should update the CPUType, CPUSubType in the Object file if + // the new values are different from the defaults. + } + virtual void emitAbiVersion(int AbiVersion) override { + llvm_unreachable("Unknown pseudo-op: .abiversion"); + } + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) { + llvm_unreachable("Unknown pseudo-op: .localentry"); + } +}; +} + +// This is duplicated code. Refactor this. +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, + const MCSubtargetInfo &STI, + bool RelaxAll, + bool NoExecStack) { + if (Triple(TT).isOSDarwin()) { + MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll); + new PPCTargetMachOStreamer(*S); + return S; + } + + MCStreamer *S = + createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); + new PPCTargetELFStreamer(*S); + return S; +} + +static MCStreamer * +createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS, + bool isVerboseAsm, bool useDwarfDirectory, + MCInstPrinter *InstPrint, MCCodeEmitter *CE, + MCAsmBackend *TAB, bool ShowInst) { + + MCStreamer *S = llvm::createAsmStreamer( + Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst); + new PPCTargetAsmStreamer(*S, OS); + return S; +} + +static MCInstPrinter *createPPCMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + bool isDarwin = Triple(STI.getTargetTriple()).isOSDarwin(); + return new PPCInstPrinter(MAI, MII, MRI, isDarwin); +} + +extern "C" void LLVMInitializePowerPCTargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo); + RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo); + RegisterMCAsmInfoFn E(ThePPC64LETarget, createPPCMCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(ThePPC64LETarget, + createPPCMCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(ThePPC64LETarget, + createPPCMCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(ThePPC64LETarget, createPPCMCRegisterInfo); + + // Register the MC subtarget info. 
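The registrations above and below are one half of a contract; the other half is a client that resolves everything back out of TargetRegistry by triple. Roughly, for this vintage of the API (a sketch; the TargetInfo initializer is assumed to be linked in as well, since lookupTarget needs the Target objects registered first):

#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include <string>
using namespace llvm;

extern "C" void LLVMInitializePowerPCTargetInfo();
extern "C" void LLVMInitializePowerPCTargetMC();

int main() {
  LLVMInitializePowerPCTargetInfo();
  LLVMInitializePowerPCTargetMC();  // runs the registrations shown here
  std::string Err;
  const Target *T =
      TargetRegistry::lookupTarget("powerpc64le-unknown-linux-gnu", Err);
  if (!T) return 1;
  MCRegisterInfo *MRI = T->createMCRegInfo("powerpc64le-unknown-linux-gnu");
  MCSubtargetInfo *STI =
      T->createMCSubtargetInfo("powerpc64le-unknown-linux-gnu", "pwr8", "");
  return (MRI && STI) ? 0 : 1;
}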
+ TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target, + createPPCMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target, + createPPCMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(ThePPC64LETarget, + createPPCMCSubtargetInfo); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(ThePPC64LETarget, + createPPCMCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend); + TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend); + TargetRegistry::RegisterMCAsmBackend(ThePPC64LETarget, createPPCAsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer); + + // Register the asm streamer. + TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer); + TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer); + TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(ThePPC64LETarget, + createPPCMCInstPrinter); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h new file mode 100644 index 000000000000..474395b93637 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -0,0 +1,75 @@ +//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides PowerPC specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef PPCMCTARGETDESC_H +#define PPCMCTARGETDESC_H + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +#include "llvm/Support/DataTypes.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class StringRef; +class raw_ostream; + +extern Target ThePPC32Target; +extern Target ThePPC64Target; +extern Target ThePPC64LETarget; + +MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU); + +/// createPPCELFObjectWriter - Construct a PPC ELF object writer. +MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS, + bool Is64Bit, + bool IsLittleEndian, + uint8_t OSABI); +/// createPPCMachObjectWriter - Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); +} // End llvm namespace + +// Generated files will use "namespace PPC". To avoid symbol clash, +// undefine PPC here. PPC may be predefined on some hosts. +#undef PPC + +// Defines symbolic names for PowerPC registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "PPCGenRegisterInfo.inc" + +// Defines symbolic names for the PowerPC instructions. +// +#define GET_INSTRINFO_ENUM +#include "PPCGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "PPCGenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp new file mode 100644 index 000000000000..cff27baeb5ee --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -0,0 +1,389 @@ +//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MachO.h" + +using namespace llvm; + +namespace { +class PPCMachObjectWriter : public MCMachObjectTargetWriter { + bool RecordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + unsigned Log2Size, uint64_t &FixedValue); + + void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue); + +public: + PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/Is64Bit) {} + + void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override { + if (Writer->is64Bit()) { + report_fatal_error("Relocation emission for MachO/PPC64 unimplemented."); + } else + RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +/// computes the log2 of the size of the relocation, +/// used for relocation_info::r_length. +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + report_fatal_error("log2size(FixupKind): Unhandled fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: + return 0; + case FK_PCRel_2: + case FK_Data_2: + return 1; + case FK_PCRel_4: + case PPC::fixup_ppc_brcond14: + case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_br24: + case FK_Data_4: + return 2; + case FK_PCRel_8: + case FK_Data_8: + return 3; + } + return 0; +} + +/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum. 
+/// Outline based on PPCELFObjectWriter::getRelocTypeInner(). +static unsigned getRelocType(const MCValue &Target, + const MCFixupKind FixupKind, // from + // Fixup.getKind() + const bool IsPCRel) { + const MCSymbolRefExpr::VariantKind Modifier = + Target.isAbsolute() ? MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + // determine the type of the relocation + unsigned Type = MachO::GENERIC_RELOC_VANILLA; + if (IsPCRel) { // relative to PC + switch ((unsigned)FixupKind) { + default: + report_fatal_error("Unimplemented fixup kind (relative)"); + case PPC::fixup_ppc_br24: + Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24 + break; + case PPC::fixup_ppc_brcond14: + Type = MachO::PPC_RELOC_BR14; + break; + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: + llvm_unreachable("Unsupported modifier for half16 fixup"); + case MCSymbolRefExpr::VK_PPC_HA: + Type = MachO::PPC_RELOC_HA16; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = MachO::PPC_RELOC_LO16; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = MachO::PPC_RELOC_HI16; + break; + } + break; + } + } else { + switch ((unsigned)FixupKind) { + default: + report_fatal_error("Unimplemented fixup kind (absolute)!"); + case PPC::fixup_ppc_half16: + switch (Modifier) { + default: + llvm_unreachable("Unsupported modifier for half16 fixup"); + case MCSymbolRefExpr::VK_PPC_HA: + Type = MachO::PPC_RELOC_HA16_SECTDIFF; + break; + case MCSymbolRefExpr::VK_PPC_LO: + Type = MachO::PPC_RELOC_LO16_SECTDIFF; + break; + case MCSymbolRefExpr::VK_PPC_HI: + Type = MachO::PPC_RELOC_HI16_SECTDIFF; + break; + } + break; + case FK_Data_4: + break; + case FK_Data_2: + break; + } + } + return Type; +} + +static void makeRelocationInfo(MachO::any_relocation_info &MRE, + const uint32_t FixupOffset, const uint32_t Index, + const unsigned IsPCRel, const unsigned Log2Size, + const unsigned IsExtern, const unsigned Type) { + MRE.r_word0 = FixupOffset; + // The bitfield offsets that work (as determined by trial-and-error) + // are different than what is documented in the mach-o manuals. + // This appears to be an endianness issue; reversing the order of the + // documented bitfields in <llvm/Support/MachO.h> fixes this (but + // breaks x86/ARM assembly). + MRE.r_word1 = ((Index << 8) | // was << 0 + (IsPCRel << 7) | // was << 24 + (Log2Size << 5) | // was << 25 + (IsExtern << 4) | // was << 27 + (Type << 0)); // was << 28 +} + +static void +makeScatteredRelocationInfo(MachO::any_relocation_info &MRE, + const uint32_t Addr, const unsigned Type, + const unsigned Log2Size, const unsigned IsPCRel, + const uint32_t Value2) { + // For notes on bitfield positions and endianness, see: + // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry + MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) | + (IsPCRel << 30) | MachO::R_SCATTERED); + MRE.r_word1 = Value2; +} + +/// Compute fixup offset (address). 
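makeRelocationInfo above packs the classic struct relocation_info fields by hand, using the empirically determined shifts noted in its comment. A quick arithmetic check of the packing:

#include <cassert>
#include <cstdint>

uint32_t packWord1(uint32_t Index, unsigned IsPCRel, unsigned Log2Size,
                   unsigned IsExtern, unsigned Type) {
  return (Index << 8) | (IsPCRel << 7) | (Log2Size << 5) |
         (IsExtern << 4) | (Type << 0);
}

int main() {
  // Symbol index 2, PC-relative, 4-byte field (log2 = 2), local, type 3:
  assert(packWord1(2, 1, 2, 0, 3) == 0x2C3);
}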
+static uint32_t getFixupOffset(const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + // On Mach-O, ppc_fixup_half16 relocations must refer to the + // start of the instruction, not the second halfword, as ELF does + if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16) + FixupOffset &= ~uint32_t(3); + return FixupOffset; +} + +/// \return false if falling back to using non-scattered relocation, +/// otherwise true for normal scattered relocation. +/// based on X86MachObjectWriter::RecordScatteredRelocation +/// and ARMMachObjectWriter::RecordScatteredRelocation +bool PPCMachObjectWriter::RecordScatteredRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + unsigned Log2Size, uint64_t &FixedValue) { + // caller already computes these, can we just pass and reuse? + const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup); + const MCFixupKind FK = Fixup.getKind(); + const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK); + const unsigned Type = getRelocType(Target, FK, IsPCRel); + + // Is this a local or SECTDIFF relocation entry? + // SECTDIFF relocation entries have symbol subtractions, + // and require two entries, the first for the add-symbol value, + // the second for the subtract-symbol value. + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbolData *A_SD = &Asm.getSymbolData(*A); + + if (!A_SD->getFragment()) + report_fatal_error("symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + + uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); + uint64_t SecAddr = + Writer->getSectionAddress(A_SD->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + + if (!B_SD->getFragment()) + report_fatal_error("symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + + // FIXME: is Type correct? see include/llvm/Support/MachO.h + Value2 = Writer->getSymbolAddress(B_SD, Layout); + FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); + } + // FIXME: does FixedValue get used?? + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == MachO::PPC_RELOC_SECTDIFF || + Type == MachO::PPC_RELOC_HI16_SECTDIFF || + Type == MachO::PPC_RELOC_LO16_SECTDIFF || + Type == MachO::PPC_RELOC_HA16_SECTDIFF || + Type == MachO::PPC_RELOC_LO14_SECTDIFF || + Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) { + // X86 had this piece, but ARM does not + // If the offset is too large to fit in a scattered relocation, + // we're hosed. It's an unfortunate limitation of the MachO format. + if (FixupOffset > 0xffffff) { + char Buffer[32]; + format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); + Asm.getContext().FatalError(Fixup.getLoc(), + Twine("Section too large, can't encode " + "r_address (") + + Buffer + ") into 24 bits of scattered " + "relocation entry."); + llvm_unreachable("fatal error returned?!"); + } + + // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()? 
+ // see PPCMCExpr::EvaluateAsRelocatableImpl() + uint32_t other_half = 0; + switch (Type) { + case MachO::PPC_RELOC_LO16_SECTDIFF: + other_half = (FixedValue >> 16) & 0xffff; + // applyFixupOffset no longer extracts the high part because it now assumes + // this was already done. + // It looks like this is not true for the FixedValue needed with Mach-O + // relocs. + // So we need to adjust FixedValue again here. + FixedValue &= 0xffff; + break; + case MachO::PPC_RELOC_HA16_SECTDIFF: + other_half = FixedValue & 0xffff; + FixedValue = + ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff; + break; + case MachO::PPC_RELOC_HI16_SECTDIFF: + other_half = FixedValue & 0xffff; + FixedValue = (FixedValue >> 16) & 0xffff; + break; + default: + llvm_unreachable("Invalid PPC scattered relocation type."); + break; + } + + MachO::any_relocation_info MRE; + makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR, + Log2Size, IsPCRel, Value2); + Writer->addRelocation(Fragment->getParent(), MRE); + } else { + // If the offset is more than 24-bits, it won't fit in a scattered + // relocation offset field, so we fall back to using a non-scattered + // relocation. This is a bit risky, as if the offset reaches out of + // the block and the linker is doing scattered loading on this + // symbol, things can go badly. + // + // Required for 'as' compatibility. + if (FixupOffset > 0xffffff) + return false; + } + MachO::any_relocation_info MRE; + makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value); + Writer->addRelocation(Fragment->getParent(), MRE); + return true; +} + +// see PPCELFObjectWriter for a general outline of cases +void PPCMachObjectWriter::RecordPPCRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + const MCFixupKind FK = Fixup.getKind(); // unsigned + const unsigned Log2Size = getFixupKindLog2Size(FK); + const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK); + const unsigned RelocType = getRelocType(Target, FK, IsPCRel); + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB() && + // Q: are branch targets ever scattered? + RelocType != MachO::PPC_RELOC_BR24 && + RelocType != MachO::PPC_RELOC_BR14) { + RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + Log2Size, FixedValue); + return; + } + + // this doesn't seem right for RIT_PPC_BR24 + // Get the symbol data, if any. + const MCSymbolData *SD = nullptr; + if (Target.getSymA()) + SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + + // See <reloc.h>. + const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup); + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = RelocType; + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + report_fatal_error("FIXME: relocations to absolute targets " + "not yet implemented"); + // the above line stolen from ARM, not sure + } else { + // Resolve constant variables.
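The HA16_SECTDIFF case above is the usual PPC carry trick seen again: the high half is bumped by one when bit 15 of the value is set, and the displaced low half rides along in the GENERIC_RELOC_PAIR entry's other_half. Concretely:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t FixedValue = 0x00018000;           // low half >= 0x8000
  uint32_t other_half = FixedValue & 0xffff;  // 0x8000, carried by the PAIR
  uint32_t ha16 =
      ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
  assert(ha16 == 2);
  // A sign-extending low-half add recombines to the original value:
  assert((ha16 << 16) + int16_t(other_half) == 0x00018000);
}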
+ if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(SD)) { + IsExtern = 1; + Index = SD->getIndex(); + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!SD->Symbol->isUndefined()) + FixedValue -= Layout.getSymbolOffset(SD); + } else { + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = + Asm.getSectionData(SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&SymSD); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + } + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern, + Type); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/false); +} diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp new file mode 100644 index 000000000000..c2987b641c04 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp @@ -0,0 +1,86 @@ +//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PowerPC branch predicates. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCPredicates.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> +using namespace llvm; + +PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) { + switch (Opcode) { + case PPC::PRED_EQ: return PPC::PRED_NE; + case PPC::PRED_NE: return PPC::PRED_EQ; + case PPC::PRED_LT: return PPC::PRED_GE; + case PPC::PRED_GE: return PPC::PRED_LT; + case PPC::PRED_GT: return PPC::PRED_LE; + case PPC::PRED_LE: return PPC::PRED_GT; + case PPC::PRED_NU: return PPC::PRED_UN; + case PPC::PRED_UN: return PPC::PRED_NU; + case PPC::PRED_EQ_MINUS: return PPC::PRED_NE_PLUS; + case PPC::PRED_NE_MINUS: return PPC::PRED_EQ_PLUS; + case PPC::PRED_LT_MINUS: return PPC::PRED_GE_PLUS; + case PPC::PRED_GE_MINUS: return PPC::PRED_LT_PLUS; + case PPC::PRED_GT_MINUS: return PPC::PRED_LE_PLUS; + case PPC::PRED_LE_MINUS: return PPC::PRED_GT_PLUS; + case PPC::PRED_NU_MINUS: return PPC::PRED_UN_PLUS; + case PPC::PRED_UN_MINUS: return PPC::PRED_NU_PLUS; + case PPC::PRED_EQ_PLUS: return PPC::PRED_NE_MINUS; + case PPC::PRED_NE_PLUS: return PPC::PRED_EQ_MINUS; + case PPC::PRED_LT_PLUS: return PPC::PRED_GE_MINUS; + case PPC::PRED_GE_PLUS: return PPC::PRED_LT_MINUS; + case PPC::PRED_GT_PLUS: return PPC::PRED_LE_MINUS; + case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS; + case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS; + case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS; + + // Simple predicates for single condition-register bits. + case PPC::PRED_BIT_SET: return PPC::PRED_BIT_UNSET; + case PPC::PRED_BIT_UNSET: return PPC::PRED_BIT_SET; + } + llvm_unreachable("Unknown PPC branch opcode!"); +} + +PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) { + switch (Opcode) { + case PPC::PRED_EQ: return PPC::PRED_EQ; + case PPC::PRED_NE: return PPC::PRED_NE; + case PPC::PRED_LT: return PPC::PRED_GT; + case PPC::PRED_GE: return PPC::PRED_LE; + case PPC::PRED_GT: return PPC::PRED_LT; + case PPC::PRED_LE: return PPC::PRED_GE; + case PPC::PRED_NU: return PPC::PRED_NU; + case PPC::PRED_UN: return PPC::PRED_UN; + case PPC::PRED_EQ_MINUS: return PPC::PRED_EQ_MINUS; + case PPC::PRED_NE_MINUS: return PPC::PRED_NE_MINUS; + case PPC::PRED_LT_MINUS: return PPC::PRED_GT_MINUS; + case PPC::PRED_GE_MINUS: return PPC::PRED_LE_MINUS; + case PPC::PRED_GT_MINUS: return PPC::PRED_LT_MINUS; + case PPC::PRED_LE_MINUS: return PPC::PRED_GE_MINUS; + case PPC::PRED_NU_MINUS: return PPC::PRED_NU_MINUS; + case PPC::PRED_UN_MINUS: return PPC::PRED_UN_MINUS; + case PPC::PRED_EQ_PLUS: return PPC::PRED_EQ_PLUS; + case PPC::PRED_NE_PLUS: return PPC::PRED_NE_PLUS; + case PPC::PRED_LT_PLUS: return PPC::PRED_GT_PLUS; + case PPC::PRED_GE_PLUS: return PPC::PRED_LE_PLUS; + case PPC::PRED_GT_PLUS: return PPC::PRED_LT_PLUS; + case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS; + case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS; + case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS; + + case PPC::PRED_BIT_SET: + case PPC::PRED_BIT_UNSET: + llvm_unreachable("Invalid use of bit predicate code"); + } + llvm_unreachable("Unknown PPC branch opcode!"); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h new file mode 100644 index 000000000000..10e328a8116e --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -0,0 +1,68 @@ +//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// 
+// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC branch predicates. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H +#define LLVM_TARGET_POWERPC_PPCPREDICATES_H + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +// Generated files will use "namespace PPC". To avoid symbol clash, +// undefine PPC here. PPC may be predefined on some hosts. +#undef PPC + +namespace llvm { +namespace PPC { + /// Predicate - These are "(BI << 5) | BO" for various predicates. + enum Predicate { + PRED_LT = (0 << 5) | 12, + PRED_LE = (1 << 5) | 4, + PRED_EQ = (2 << 5) | 12, + PRED_GE = (0 << 5) | 4, + PRED_GT = (1 << 5) | 12, + PRED_NE = (2 << 5) | 4, + PRED_UN = (3 << 5) | 12, + PRED_NU = (3 << 5) | 4, + PRED_LT_MINUS = (0 << 5) | 14, + PRED_LE_MINUS = (1 << 5) | 6, + PRED_EQ_MINUS = (2 << 5) | 14, + PRED_GE_MINUS = (0 << 5) | 6, + PRED_GT_MINUS = (1 << 5) | 14, + PRED_NE_MINUS = (2 << 5) | 6, + PRED_UN_MINUS = (3 << 5) | 14, + PRED_NU_MINUS = (3 << 5) | 6, + PRED_LT_PLUS = (0 << 5) | 15, + PRED_LE_PLUS = (1 << 5) | 7, + PRED_EQ_PLUS = (2 << 5) | 15, + PRED_GE_PLUS = (0 << 5) | 7, + PRED_GT_PLUS = (1 << 5) | 15, + PRED_NE_PLUS = (2 << 5) | 7, + PRED_UN_PLUS = (3 << 5) | 15, + PRED_NU_PLUS = (3 << 5) | 7, + + // When dealing with individual condition-register bits, we have simple set + // and unset predicates. + PRED_BIT_SET = 1024, + PRED_BIT_UNSET = 1025 + }; + + /// Invert the specified predicate. != -> ==, < -> >=. + Predicate InvertPredicate(Predicate Opcode); + + /// Assume the condition register is set by MI(a,b), return the predicate if + /// we modify the instructions such that condition register is set by MI(b,a). + Predicate getSwappedPredicate(Predicate Opcode); +} +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h new file mode 100644 index 000000000000..ba5fa4f79b4e --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -0,0 +1,105 @@ +//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// PowerPC back-end. 
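The Predicate values just defined pack the two branch-instruction operands as (BI << 5) | BO: BI selects the bit within a CR field (LT=0, GT=1, EQ=2, SO/UN=3) and BO says how to branch on it (12 = branch if set, 4 = branch if clear; 14/6 and 15/7 add the static taken/not-taken hints). For the un-hinted predicates, inversion is just flipping bit 3 of BO, which is why InvertPredicate maps PRED_LT to PRED_GE; the hinted forms additionally swap the +/- hint, since the likely direction reverses too. In miniature:

#include <cassert>

unsigned BI(unsigned Pred) { return Pred >> 5; }
unsigned BO(unsigned Pred) { return Pred & 31; }

int main() {
  unsigned PRED_LT = (0 << 5) | 12, PRED_GE = (0 << 5) | 4;
  assert(BI(PRED_LT) == 0 && BO(PRED_LT) == 12);  // branch if the LT bit is set
  assert((PRED_LT ^ 8) == PRED_GE);               // invert = toggle BO bit 3
}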
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_POWERPC_H +#define LLVM_TARGET_POWERPC_H + +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include <string> + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { + class PPCTargetMachine; + class PassRegistry; + class FunctionPass; + class ImmutablePass; + class JITCodeEmitter; + class MachineInstr; + class AsmPrinter; + class MCInst; + + FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM); +#ifndef NDEBUG + FunctionPass *createPPCCTRLoopsVerify(); +#endif + FunctionPass *createPPCEarlyReturnPass(); + FunctionPass *createPPCVSXCopyPass(); + FunctionPass *createPPCVSXCopyCleanupPass(); + FunctionPass *createPPCVSXFMAMutatePass(); + FunctionPass *createPPCBranchSelectionPass(); + FunctionPass *createPPCISelDag(PPCTargetMachine &TM); + FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM, + JITCodeEmitter &MCE); + void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AsmPrinter &AP, bool isDarwin); + + /// \brief Creates a PPC-specific Target Transformation Info pass. + ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM); + + void initializePPCVSXFMAMutatePass(PassRegistry&); + extern char &PPCVSXFMAMutateID; + + namespace PPCII { + + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // PPC Specific MachineOperand flags. + MO_NO_FLAG, + + /// MO_PLT_OR_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" or "FOO@plt" symbol. This is + /// used for calls and jumps to external functions on Tiger and earlier, and + /// for PIC calls on Linux and ELF systems. + MO_PLT_OR_STUB = 1, + + /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to + /// the function's picbase, e.g. lo16(symbol-picbase). + MO_PIC_FLAG = 2, + + /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to + /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase). + MO_NLP_FLAG = 4, + + /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a + /// symbol with hidden visibility. This causes a different kind of + /// non-lazy-pointer to be generated. + MO_NLP_HIDDEN_FLAG = 8, + + /// The next are not flags but distinct values. + MO_ACCESS_MASK = 0xf0, + + /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol) + MO_LO = 1 << 4, + MO_HA = 2 << 4, + + MO_TPREL_LO = 4 << 4, + MO_TPREL_HA = 3 << 4, + + /// These values identify relocations on immediates folded + /// into memory operations. + MO_DTPREL_LO = 5 << 4, + MO_TLSLD_LO = 6 << 4, + MO_TOC_LO = 7 << 4, + + // Symbol for VK_PPC_TLS fixup attached to an ADD instruction + MO_TLS = 8 << 4 + }; + } // end namespace PPCII + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td new file mode 100644 index 000000000000..a9842b287cbb --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -0,0 +1,344 @@ +//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the top level entry point for the PowerPC target.
+// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing. +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// PowerPC Subtarget features. +// + +//===----------------------------------------------------------------------===// +// CPU Directives // +//===----------------------------------------------------------------------===// + +def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">; +def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">; +def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">; +def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; +def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">; +def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">; +def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; +def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; +def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; +def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; +def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E500mc", "">; +def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective", + "PPC::DIR_E5500", "">; +def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">; +def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">; +def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">; +def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">; +def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; +def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">; +def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; +def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; + +def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", + "Enable 64-bit instructions">; +def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", + "Enable 64-bit registers usage for ppc32 [beta]">; +def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true", + "Use condition-register bits individually">; +def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", + "Enable Altivec instructions">; +def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", + "Enable the MFOCRF instruction">; +def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", + "Enable the fsqrt instruction">; +def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true", + "Enable the fcpsgn instruction">; +def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true", + "Enable the fre instruction">; +def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true", + "Enable the fres instruction">; +def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true", + "Enable the frsqrte instruction">; +def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true", + "Enable the frsqrtes instruction">; +def FeatureRecipPrec : SubtargetFeature<"recipprec", 
"HasRecipPrec", "true", + "Assume higher precision reciprocal estimates">; +def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true", + "Enable the stfiwx instruction">; +def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true", + "Enable the lfiwax instruction">; +def FeatureFPRND : SubtargetFeature<"fprnd", "HasFPRND", "true", + "Enable the fri[mnpz] instructions">; +def FeatureFPCVT : SubtargetFeature<"fpcvt", "HasFPCVT", "true", + "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions">; +def FeatureISEL : SubtargetFeature<"isel","HasISEL", "true", + "Enable the isel instruction">; +def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "true", + "Enable the popcnt[dw] instructions">; +def FeatureLDBRX : SubtargetFeature<"ldbrx","HasLDBRX", "true", + "Enable the ldbrx instruction">; +def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true", + "Enable Book E instructions">; +def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", + "Enable QPX instructions">; +def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", + "Enable VSX instructions", + [FeatureAltivec]>; + +def DeprecatedMFTB : SubtargetFeature<"", "DeprecatedMFTB", "true", + "Treat mftb as deprecated">; +def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", + "Treat vector data stream cache control instructions as deprecated">; + +// Note: Future features to add when support is extended to more +// recent ISA levels: +// +// CMPB p6, p6x, p7 cmpb +// DFP p6, p6x, p7 decimal floating-point instructions +// POPCNTB p5 through p7 popcntb and related instructions +// VSX p7 vector-scalar instruction set + +//===----------------------------------------------------------------------===// +// Classes used for relation maps. +//===----------------------------------------------------------------------===// +// RecFormRel - Filter class used to relate non-record-form instructions with +// their record-form variants. +class RecFormRel; + +// AltVSXFMARel - Filter class used to relate the primary addend-killing VSX +// FMA instruction forms with their corresponding factor-killing forms. +class AltVSXFMARel { + bit IsVSXFMAAlt = 0; +} + +//===----------------------------------------------------------------------===// +// Relation Map Definitions. +//===----------------------------------------------------------------------===// + +def getRecordFormOpcode : InstrMapping { + let FilterClass = "RecFormRel"; + // Instructions with the same BaseName and Interpretation64Bit values + // form a row. + let RowFields = ["BaseName", "Interpretation64Bit"]; + // Instructions with the same RC value form a column. + let ColFields = ["RC"]; + // The key column are the non-record-form instructions. + let KeyCol = ["0"]; + // Value columns RC=1 + let ValueCols = [["1"]]; +} + +def getNonRecordFormOpcode : InstrMapping { + let FilterClass = "RecFormRel"; + // Instructions with the same BaseName and Interpretation64Bit values + // form a row. + let RowFields = ["BaseName", "Interpretation64Bit"]; + // Instructions with the same RC value form a column. + let ColFields = ["RC"]; + // The key column are the record-form instructions. + let KeyCol = ["1"]; + // Value columns are RC=0 + let ValueCols = [["0"]]; +} + +def getAltVSXFMAOpcode : InstrMapping { + let FilterClass = "AltVSXFMARel"; + // Instructions with the same BaseName and Interpretation64Bit values + // form a row. + let RowFields = ["BaseName"]; + // Instructions with the same RC value form a column. 
+ let ColFields = ["IsVSXFMAAlt"]; + // The key column are the (default) addend-killing instructions. + let KeyCol = ["0"]; + // Value columns IsVSXFMAAlt=1 + let ValueCols = [["1"]]; +} + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "PPCRegisterInfo.td" +include "PPCSchedule.td" +include "PPCInstrInfo.td" + +//===----------------------------------------------------------------------===// +// PowerPC processors supported. +// + +def : Processor<"generic", G3Itineraries, [Directive32]>; +def : ProcessorModel<"440", PPC440Model, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, + FeatureBookE, DeprecatedMFTB]>; +def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL, + FeatureFRES, FeatureFRSQRTE, + FeatureBookE, DeprecatedMFTB]>; +def : Processor<"601", G3Itineraries, [Directive601]>; +def : Processor<"602", G3Itineraries, [Directive602]>; +def : Processor<"603", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"603e", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"603ev", G3Itineraries, [Directive603, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"604", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"604e", G3Itineraries, [Directive604, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"620", G3Itineraries, [Directive620, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"750", G4Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g3", G3Itineraries, [Directive750, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE]>; +def : ProcessorModel<"970", G5Model, + [Directive970, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, + FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : ProcessorModel<"g5", G5Model, + [Directive970, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX, + FeatureFRES, FeatureFRSQRTE, + Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"e500mc", PPCE500mcModel, + [DirectiveE500mc, FeatureMFOCRF, + FeatureSTFIWX, FeatureBookE, FeatureISEL, + DeprecatedMFTB]>; +def : ProcessorModel<"e5500", PPCE5500Model, + [DirectiveE5500, FeatureMFOCRF, Feature64Bit, + FeatureSTFIWX, FeatureBookE, FeatureISEL, + DeprecatedMFTB]>; +def : ProcessorModel<"a2", PPCA2Model, + [DirectiveA2, FeatureBookE, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, Feature64Bit + /*, Feature64BitRegs */, DeprecatedMFTB]>; +def : ProcessorModel<"a2q", PPCA2Model, + [DirectiveA2, FeatureBookE, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, Feature64Bit + /*, 
Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>; +def : ProcessorModel<"pwr3", G5Model, + [DirectivePwr3, FeatureAltivec, + FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, + FeatureSTFIWX, Feature64Bit]>; +def : ProcessorModel<"pwr4", G5Model, + [DirectivePwr4, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRES, FeatureFRSQRTE, + FeatureSTFIWX, Feature64Bit]>; +def : ProcessorModel<"pwr5", G5Model, + [DirectivePwr5, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, Feature64Bit, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr5x", G5Model, + [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, + FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, + FeatureSTFIWX, FeatureFPRND, Feature64Bit, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr6", G5Model, + [DirectivePwr6, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr6x", G5Model, + [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF, + FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, + FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, + FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, Feature64Bit, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr7", P7Model, + [DirectivePwr7, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, + Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; +def : ProcessorModel<"pwr8", P7Model /* FIXME: Update to P8Model when available */, + [DirectivePwr8, FeatureAltivec, + FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, + FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, + FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, + FeatureFPRND, FeatureFPCVT, FeatureISEL, + FeaturePOPCNTD, FeatureLDBRX, + Feature64Bit /*, Feature64BitRegs */, + DeprecatedMFTB, DeprecatedDST]>; +def : Processor<"ppc", G3Itineraries, [Directive32]>; +def : ProcessorModel<"ppc64", G5Model, + [Directive64, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureFRES, + FeatureFRSQRTE, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; +def : ProcessorModel<"ppc64le", G5Model, + [Directive64, FeatureAltivec, + FeatureMFOCRF, FeatureFSqrt, FeatureFRES, + FeatureFRSQRTE, FeatureSTFIWX, + Feature64Bit /*, Feature64BitRegs */]>; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "PPCCallingConv.td" + +def PPCInstrInfo : InstrInfo { + let isLittleEndianEncoding = 1; + + // FIXME: Unset this when no longer needed! + let decodePositionallyEncodedOperands = 1; + + let noNamedPositionallyEncodedOperands = 1; +} + +def PPCAsmParser : AsmParser { + let ShouldEmitMatchRegisterName = 0; +} + +def PPCAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // We do not use hard coded registers in asm strings. However, some + // InstAlias definitions use immediate literals. Set RegisterPrefix + // so that those are not misinterpreted as registers. 
+ string RegisterPrefix = "%"; +} + +def PPC : Target { + // Information about the instructions. + let InstructionSet = PPCInstrInfo; + + let AssemblyParsers = [PPCAsmParser]; + let AssemblyParserVariants = [PPCAsmParserVariant]; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp new file mode 100644 index 000000000000..138402241957 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -0,0 +1,1397 @@ +//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to PowerPC assembly language. This printer is +// the output mechanism used by `llc'. +// +// Documentation at http://developer.apple.com/documentation/DeveloperTools/ +// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "InstPrinter/PPCInstPrinter.h" +#include "PPCMachineFunctionInfo.h" +#include "MCTargetDesc/PPCMCExpr.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "PPCTargetStreamer.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "asmprinter" + +namespace { + class PPCAsmPrinter : public AsmPrinter { + protected: + MapVector<MCSymbol*, MCSymbol*> TOC; + const PPCSubtarget &Subtarget; + uint64_t TOCLabelID; + public: + explicit PPCAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), + Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {} + + const char *getPassName() const override { + return "PowerPC Assembly Printer"; + } + + MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); + + void EmitInstruction(const MachineInstr *MI) override; + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + + bool PrintAsmOperand(const 
MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + }; + + /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux + class PPCLinuxAsmPrinter : public PPCAsmPrinter { + public: + explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : PPCAsmPrinter(TM, Streamer) {} + + const char *getPassName() const override { + return "Linux PPC Assembly Printer"; + } + + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; + + void EmitFunctionEntryLabel() override; + + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + }; + + /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac + /// OS X + class PPCDarwinAsmPrinter : public PPCAsmPrinter { + public: + explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : PPCAsmPrinter(TM, Streamer) {} + + const char *getPassName() const override { + return "Darwin PPC Assembly Printer"; + } + + bool doFinalization(Module &M) override; + void EmitStartOfAsmFile(Module &M) override; + + void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs); + }; +} // end of anonymous namespace + +/// stripRegisterPrefix - This method strips the character prefix from a +/// register name so that only the number is left. Used for Linux asm. +static const char *stripRegisterPrefix(const char *RegName) { + switch (RegName[0]) { + case 'r': + case 'f': + case 'v': + if (RegName[1] == 's') + return RegName + 2; + return RegName + 1; + case 'c': if (RegName[1] == 'r') return RegName + 2; + } + + return RegName; +} + +void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O) { + const DataLayout *DL = TM.getDataLayout(); + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (MO.getType()) { + case MachineOperand::MO_Register: { + const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); + // The Linux assembler (and possibly others) does not accept register mnemonics. + // FIXME - What about special registers used in mfspr/mtspr? + if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName); + O << RegName; + return; + } + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_ConstantPoolIndex: + O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() + << '_' << MO.getIndex(); + return; + case MachineOperand::MO_BlockAddress: + O << *GetBlockAddressSymbol(MO.getBlockAddress()); + return; + case MachineOperand::MO_GlobalAddress: { + // Computing the address of a global symbol, not calling it.
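+    // A sketch of the indirection chosen below, with illustrative names:
+    // a non-static, non-hidden external _G is printed as L_G$non_lazy_ptr
+    // rather than _G, and doFinalization later materializes
+    //   L_G$non_lazy_ptr: .indirect_symbol _G ; .long 0
+    // in the non-lazy symbol pointer section for dyld to fill in at load
+    // time.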
+ const GlobalValue *GV = MO.getGlobal(); + MCSymbol *SymToPrint; + + // External or weakly linked global variables need non-lazily-resolved stubs + if (TM.getRelocationModel() != Reloc::Static && + (GV->isDeclaration() || GV->isWeakForLinker())) { + if (!GV->hasHiddenVisibility()) { + SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>() + .getGVStubEntry(SymToPrint); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); + } else if (GV->isDeclaration() || GV->hasCommonLinkage() || + GV->hasAvailableExternallyLinkage()) { + SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>(). + getHiddenGVStubEntry(SymToPrint); + if (!StubSym.getPointer()) + StubSym = MachineModuleInfoImpl:: + StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); + } else { + SymToPrint = getSymbol(GV); + } + } else { + SymToPrint = getSymbol(GV); + } + + O << *SymToPrint; + + printOffset(MO.getOffset(), O); + return; + } + + default: + O << "<unknown operand type: " << (unsigned)MO.getType() << ">"; + return; + } +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + // See if this is a generic print operand + return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); + case 'c': // Don't print "$" before a global var name or constant. + break; // PPC never has a prefix. + case 'L': // Write second word of DImode reference. + // Verify that this operand has two consecutive registers. + if (!MI->getOperand(OpNo).isReg() || + OpNo+1 == MI->getNumOperands() || + !MI->getOperand(OpNo+1).isReg()) + return true; + ++OpNo; // Return the high-part. + break; + case 'I': + // Write 'i' if an integer constant, otherwise nothing. Used to print + // addi vs add, etc. + if (MI->getOperand(OpNo).isImm()) + O << "i"; + return false; + } + } + + printOperand(MI, OpNo, O); + return false; +} + +// At the moment, all inline asm memory operands are a single register. +// In any case, the output of this routine should always be just one +// assembler operand. + +bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'y': // A memory reference for an X-form instruction + { + const char *RegName = "r0"; + if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName); + O << RegName << ", "; + printOperand(MI, OpNo, O); + return false; + } + } + } + + assert(MI->getOperand(OpNo).isReg()); + O << "0("; + printOperand(MI, OpNo, O); + O << ")"; + return false; +} + + +/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry +/// exists for it. If not, create one. Then return a symbol that references +/// the TOC entry. 
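+/// For illustration (assuming the ELF ".L" private prefix): the first entry
+/// created is typically ".LC0", the next ".LC1", and so on; repeated calls
+/// for the same symbol return the same cached label from the TOC map.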
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) { + const DataLayout *DL = TM.getDataLayout(); + MCSymbol *&TOCEntry = TOC[Sym]; + + // To avoid name clash check if the name already exists. + while (!TOCEntry) { + if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) + + "C" + Twine(TOCLabelID++)) == nullptr) { + TOCEntry = GetTempSymbol("C", TOCLabelID); + } + } + + return TOCEntry; +} + + +/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to +/// the current output stream. +/// +void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { + MCInst TmpInst; + bool isPPC64 = Subtarget.isPPC64(); + + // Lower multi-instruction pseudo operations. + switch (MI->getOpcode()) { + default: break; + case TargetOpcode::DBG_VALUE: + llvm_unreachable("Should be handled target independently"); + case PPC::MovePCtoLR: + case PPC::MovePCtoLR8: { + // Transform %LR = MovePCtoLR + // Into this, where the label is the PIC base: + // bl L1$pb + // L1$pb: + MCSymbol *PICBase = MF->getPICBaseSymbol(); + + // Emit the 'bl'. + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we don't have to do + // a lot of extra uniquing. + .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); + + // Emit the label. + OutStreamer.EmitLabel(PICBase); + return; + } + case PPC::GetGBRO: { + // Get the offset from the GOT Base Register to the GOT + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + MCSymbol *PICOffset = MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol(); + TmpInst.setOpcode(PPC::LWZ); + const MCExpr *Exp = + MCSymbolRefExpr::Create(PICOffset, MCSymbolRefExpr::VK_None, OutContext); + const MCExpr *PB = + MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), + MCSymbolRefExpr::VK_None, + OutContext); + const MCOperand MO = TmpInst.getOperand(1); + TmpInst.getOperand(1) = MCOperand::CreateExpr(MCBinaryExpr::CreateSub(Exp, + PB, + OutContext)); + TmpInst.addOperand(MO); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::UpdateGBR: { + // Update the GOT Base Register to point to the GOT. It may be possible to + // merge this with the PPC::GetGBRO, doing it all in one step. + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + TmpInst.setOpcode(PPC::ADD4); + TmpInst.addOperand(TmpInst.getOperand(0)); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::LWZtoc: { + // Transform %X3 = LWZtoc <ga:@min1>, %X2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to LWZ, and the global address operand to be a + // reference to the GOT entry we will synthesize later. 
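+    // Net effect, as a sketch (register numbers and label names are
+    // illustrative):
+    //   lwz 3, <ga:@min1>(30)   becomes   lwz 3, (.LC0-.L.TOC.)(30)
+    // where .LC0 is the entry synthesized for @min1 in doFinalization.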
+ TmpInst.setOpcode(PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_None, + OutContext); + const MCExpr *PB = + MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")), + OutContext); + Exp = MCBinaryExpr::CreateSub(Exp, PB, OutContext); + TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::LDtocJTI: + case PPC::LDtocCPT: + case PPC::LDtoc: { + // Transform %X3 = LDtoc <ga:@min1>, %X2 + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to LD, and the global address operand to be a + // reference to the TOC entry we will synthesize later. + TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + + // Map symbol -> label of TOC entry + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC, + OutContext); + TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + + case PPC::ADDIStocHA: { + // Transform %Xd = ADDIStocHA %X2, <ga:@sym> + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to ADDIS8. If the global address is external, has + // common linkage, is a non-local function address, or is a jump table + // address, then generate a TOC entry and reference that. Otherwise + // reference the symbol directly. 
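+    // For illustration, assuming r2 carries the TOC pointer:
+    //   addis 3, 2, sym@toc@ha     ; symbol referenced directly
+    //   addis 3, 2, .LC0@toc@ha    ; via a synthesized TOC entry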
+ TmpInst.setOpcode(PPC::ADDIS8); + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) && + "Invalid operand for ADDIStocHA!"); + MCSymbol *MOSymbol = nullptr; + bool IsExternal = false; + bool IsNonLocalFunction = false; + bool IsCommon = false; + bool IsAvailExt = false; + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsCommon = GV->hasCommonLinkage(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); + IsAvailExt = GV->hasAvailableExternallyLinkage(); + } else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = GetJTISymbol(MO.getIndex()); + + if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || + MO.isJTI() || TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA, + OutContext); + TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::LDtocL: { + // Transform %Xd = LDtocL <ga:@sym>, %Xs + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to LD. If the global address is external, has + // common linkage, or is a jump table address, then reference the + // associated TOC entry. Otherwise reference the symbol directly. + TmpInst.setOpcode(PPC::LD); + const MachineOperand &MO = MI->getOperand(1); + assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) && + "Invalid operand for LDtocL!"); + MCSymbol *MOSymbol = nullptr; + + if (MO.isJTI()) + MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex())); + else if (MO.isCPI()) { + MOSymbol = GetCPISymbol(MO.getIndex()); + if (TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } + else if (MO.isGlobal()) { + const GlobalValue *GValue = MO.getGlobal(); + MOSymbol = getSymbol(GValue); + if (GValue->getType()->getElementType()->isFunctionTy() || + GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage() || + TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } + + const MCExpr *Exp = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, + OutContext); + TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::ADDItocL: { + // Transform %Xd = ADDItocL %Xs, <ga:@sym> + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to ADDI8. If the global address is external, then + // generate a TOC entry and reference that. Otherwise reference the + // symbol directly. 
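+    // For illustration, the matching low part of the address (pairing with
+    // the @toc@ha produced by ADDIStocHA above):
+    //   addi 3, 3, sym@toc@l    or    addi 3, 3, .LC0@toc@l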
+ TmpInst.setOpcode(PPC::ADDI8); + const MachineOperand &MO = MI->getOperand(2); + assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); + MCSymbol *MOSymbol = nullptr; + bool IsExternal = false; + bool IsNonLocalFunction = false; + + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + IsExternal = GV->isDeclaration(); + IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker()); + } else if (MO.isCPI()) + MOSymbol = GetCPISymbol(MO.getIndex()); + + if (IsNonLocalFunction || IsExternal || + TM.getCodeModel() == CodeModel::Large) + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + + const MCExpr *Exp = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, + OutContext); + TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case PPC::ADDISgotTprelHA: { + // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tprel@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA, + OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTprel)); + return; + } + case PPC::LDgotTprelL: + case PPC::LDgotTprelL32: { + // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + + // Change the opcode to LD (or LWZ for 32-bit). + TmpInst.setOpcode(isPPC64 ? PPC::LD : PPC::LWZ); + const MachineOperand &MO = MI->getOperand(1); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *Exp = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO, + OutContext); + TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + + case PPC::PPC32PICGOT: { + MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + MCSymbol *GOTRef = OutContext.CreateTempSymbol(); + MCSymbol *NextInstr = OutContext.CreateTempSymbol(); + + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we don't have to do + // a lot of extra uniquing.
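+      // For reference, the full sequence emitted for PPC32PICGOT is roughly
+      // (temporary label names are illustrative):
+      //   bl .Lnext
+      //   .Lgot_ref: .long _GLOBAL_OFFSET_TABLE_ - .Lgot_ref
+      //   .Lnext:    mflr rD          ; rD = address of .Lgot_ref
+      //              lwz  rT, 0(rD)   ; rT = GOT - .Lgot_ref
+      //              add  rD, rT, rD  ; rD = address of the GOT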
+ .addExpr(MCSymbolRefExpr::Create(NextInstr, OutContext))); + const MCExpr *OffsExpr = + MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(GOTSymbol, OutContext), + MCSymbolRefExpr::Create(GOTRef, OutContext), + OutContext); + OutStreamer.EmitLabel(GOTRef); + OutStreamer.EmitValue(OffsExpr, 4); + OutStreamer.EmitLabel(NextInstr); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR) + .addReg(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LWZ) + .addReg(MI->getOperand(1).getReg()) + .addImm(0) + .addReg(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADD4) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addReg(MI->getOperand(0).getReg())); + return; + } + case PPC::PPC32GOT: { + MCSymbol *GOTSymbol = OutContext.GetOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *SymGotTlsL = + MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, + OutContext); + const MCExpr *SymGotTlsHA = + MCSymbolRefExpr::Create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, + OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI) + .addReg(MI->getOperand(0).getReg()) + .addExpr(SymGotTlsL)); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()) + .addExpr(SymGotTlsHA)); + return; + } + case PPC::ADDIStlsgdHA: { + // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsGD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA, + OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::ADDItlsgdL: + // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@got@tlsgd@l + case PPC::ADDItlsgdL32: { + // Transform: %Rd = ADDItlsgdL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@got@tlsgd + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsGD = + MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ? + MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO : + MCSymbolRefExpr::VK_PPC_GOT_TLSGD, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? 
PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::GETtlsADDR: + // Transform: %X3 = GETtlsADDR %X3, <ga:@sym> + // Into: BL8_NOP_TLS __tls_get_addr(sym@tlsgd) + case PPC::GETtlsADDR32: { + // Transform: %R3 = GETtlsADDR32 %R3, <ga:@sym> + // Into: BL_TLS __tls_get_addr(sym@tlsgd)@PLT + + StringRef Name = "__tls_get_addr"; + MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + + if (!Subtarget.isPPC64() && !Subtarget.isDarwin() && + TM.getRelocationModel() == Reloc::PIC_) + Kind = MCSymbolRefExpr::VK_PLT; + const MCSymbolRefExpr *TlsRef = + MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? + PPC::BL8_NOP_TLS : PPC::BL_TLS) + .addExpr(TlsRef) + .addExpr(SymVar)); + return; + } + case PPC::ADDIStlsldHA: { + // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsLD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA, + OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::ADDItlsldL: + // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@got@tlsld@l + case PPC::ADDItlsldL32: { + // Transform: %Rd = ADDItlsldL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@got@tlsld + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymGotTlsLD = + MCSymbolRefExpr::Create(MOSymbol, Subtarget.isPPC64() ? + MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO : + MCSymbolRefExpr::VK_PPC_GOT_TLSLD, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::GETtlsldADDR: + // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym> + // Into: BL8_NOP_TLS __tls_get_addr(sym@tlsld) + case PPC::GETtlsldADDR32: { + // Transform: %R3 = GETtlsldADDR32 %R3, <ga:@sym> + // Into: BL_TLS __tls_get_addr(sym@tlsld)@PLT + + StringRef Name = "__tls_get_addr"; + MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + + if (!Subtarget.isPPC64() && !Subtarget.isDarwin() && + TM.getRelocationModel() == Reloc::PIC_) + Kind = MCSymbolRefExpr::VK_PLT; + + const MCSymbolRefExpr *TlsRef = + MCSymbolRefExpr::Create(TlsGetAddr, Kind, OutContext); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymVar = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? 
+ PPC::BL8_NOP_TLS : PPC::BL_TLS) + .addExpr(TlsRef) + .addExpr(SymVar)); + return; + } + case PPC::ADDISdtprelHA: + // Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym> + // Into: %Xd = ADDIS8 %X3, sym@dtprel@ha + case PPC::ADDISdtprelHA32: { + // Transform: %Rd = ADDISdtprelHA32 %R3, <ga:@sym> + // Into: %Rd = ADDIS %R3, sym@dtprel@ha + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDIS8 : PPC::ADDIS) + .addReg(MI->getOperand(0).getReg()) + .addReg(Subtarget.isPPC64() ? PPC::X3 : PPC::R3) + .addExpr(SymDtprel)); + return; + } + case PPC::ADDIdtprelL: + // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym> + // Into: %Xd = ADDI8 %Xs, sym@dtprel@l + case PPC::ADDIdtprelL32: { + // Transform: %Rd = ADDIdtprelL32 %Rs, <ga:@sym> + // Into: %Rd = ADDI %Rs, sym@dtprel@l + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO, + OutContext); + EmitToStreamer(OutStreamer, + MCInstBuilder(Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymDtprel)); + return; + } + case PPC::MFOCRF: + case PPC::MFOCRF8: + if (!Subtarget.hasMFOCRF()) { + // Transform: %R3 = MFOCRF %CR7 + // Into: %R3 = MFCR ;; cr7 + unsigned NewOpcode = + MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8; + OutStreamer.AddComment(PPCInstPrinter:: + getRegisterName(MI->getOperand(1).getReg())); + EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode) + .addReg(MI->getOperand(0).getReg())); + return; + } + break; + case PPC::MTOCRF: + case PPC::MTOCRF8: + if (!Subtarget.hasMFOCRF()) { + // Transform: %CR7 = MTOCRF %R3 + // Into: MTCRF mask, %R3 ;; cr7 + unsigned NewOpcode = + MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8; + unsigned Mask = 0x80 >> OutContext.getRegisterInfo() + ->getEncodingValue(MI->getOperand(0).getReg()); + OutStreamer.AddComment(PPCInstPrinter:: + getRegisterName(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, MCInstBuilder(NewOpcode) + .addImm(Mask) + .addReg(MI->getOperand(1).getReg())); + return; + } + break; + case PPC::LD: + case PPC::STD: + case PPC::LWA_32: + case PPC::LWA: { + // Verify alignment is legal, so we don't create relocations + // that can't be supported. + // FIXME: This test is currently disabled for Darwin. The test + // suite shows a handful of test cases that fail this check for + // Darwin. Those need to be investigated before this sanity test + // can be enabled for those subtargets. + if (!Subtarget.isDarwin()) { + unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; + const MachineOperand &MO = MI->getOperand(OpNum); + if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4) + llvm_unreachable("Global must be word-aligned for LD, STD, LWA!"); + } + // Now process the instruction normally. 
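+    // (Background, as a sketch: LD, STD, and LWA are DS-form instructions;
+    // their 16-bit displacement field keeps only 14 bits and is implicitly
+    // scaled by 4, so e.g.
+    //   ld 3, sym@toc@l(2)
+    // is only encodable when sym is at least word-aligned, hence the check
+    // above.)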
+ break; + } + } + + LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); + EmitToStreamer(OutStreamer, TmpInst); +} + +void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (Subtarget.isELFv2ABI()) { + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer()); + + if (TS) + TS->emitAbiVersion(2); + } + + if (Subtarget.isPPC64() || TM.getRelocationModel() != Reloc::PIC_) + return AsmPrinter::EmitStartOfAsmFile(M); + + // FIXME: The use of .got2 assumes large GOT model (-fPIC), which is not + // optimal for some cases. We should consider supporting small model (-fpic) + // as well in the future. + assert(TM.getCodeModel() != CodeModel::Small && + "Small code model PIC is currently unsupported."); + OutStreamer.SwitchSection(OutContext.getELFSection(".got2", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly())); + + MCSymbol *TOCSym = OutContext.GetOrCreateSymbol(Twine(".L.TOC.")); + MCSymbol *CurrentPos = OutContext.CreateTempSymbol(); + + OutStreamer.EmitLabel(CurrentPos); + + // The GOT pointer points to the middle of the GOT, in order to reference the + // entire 64kB range. 0x8000 is the midpoint. + const MCExpr *tocExpr = + MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(CurrentPos, OutContext), + MCConstantExpr::Create(0x8000, OutContext), + OutContext); + + OutStreamer.EmitAssignment(TOCSym, tocExpr); + + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); +} + +void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { + // linux/ppc32 - Normal entry label. + if (!Subtarget.isPPC64() && TM.getRelocationModel() != Reloc::PIC_) + return AsmPrinter::EmitFunctionEntryLabel(); + + if (!Subtarget.isPPC64()) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + if (PPCFI->usesPICBase()) { + MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); + MCSymbol *PICBase = MF->getPICBaseSymbol(); + OutStreamer.EmitLabel(RelocSymbol); + + const MCExpr *OffsExpr = + MCBinaryExpr::CreateSub( + MCSymbolRefExpr::Create(OutContext.GetOrCreateSymbol(Twine(".L.TOC.")), + OutContext), + MCSymbolRefExpr::Create(PICBase, OutContext), + OutContext); + OutStreamer.EmitValue(OffsExpr, 4); + OutStreamer.EmitLabel(CurrentFnSym); + return; + } else + return AsmPrinter::EmitFunctionEntryLabel(); + } + + // ELFv2 ABI - Normal entry label. + if (Subtarget.isELFv2ABI()) + return AsmPrinter::EmitFunctionEntryLabel(); + + // Emit an official procedure descriptor. + MCSectionSubPair Current = OutStreamer.getCurrentSection(); + const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly()); + OutStreamer.SwitchSection(Section); + OutStreamer.EmitLabel(CurrentFnSym); + OutStreamer.EmitValueToAlignment(8); + MCSymbol *Symbol1 = + OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName())); + // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function + // entry point. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext), + 8 /*size*/); + MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.")); + // Generates a R_PPC64_TOC relocation for TOC base insertion. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, + MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext), + 8/*size*/); + // Emit a null environment pointer. 
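+  // The complete ELFv1 descriptor built here is, as a sketch:
+  //   .quad .L.<func>        # function entry point
+  //   .quad .TOC.@tocbase    # TOC base
+  //   .quad 0                # environment pointer
+  // Only the final doubleword remains to be emitted below.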
+ OutStreamer.EmitIntValue(0, 8 /* size */); + OutStreamer.SwitchSection(Current.first, Current.second); + + MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol( + ".L." + Twine(CurrentFnSym->getName())); + OutStreamer.EmitLabel(RealFnSym); + CurrentFnSymForSize = RealFnSym; +} + + +bool PPCLinuxAsmPrinter::doFinalization(Module &M) { + const DataLayout *TD = TM.getDataLayout(); + + bool isPPC64 = TD->getPointerSizeInBits() == 64; + + PPCTargetStreamer &TS = + static_cast<PPCTargetStreamer &>(*OutStreamer.getTargetStreamer()); + + if (!TOC.empty()) { + const MCSectionELF *Section; + + if (isPPC64) + Section = OutStreamer.getContext().getELFSection(".toc", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly()); + else + Section = OutStreamer.getContext().getELFSection(".got2", + ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC, + SectionKind::getReadOnly()); + OutStreamer.SwitchSection(Section); + + for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(), + E = TOC.end(); I != E; ++I) { + OutStreamer.EmitLabel(I->second); + MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName()); + if (isPPC64) + TS.emitTCEntry(*S); + else + OutStreamer.EmitSymbolValue(S, 4); + } + } + + MachineModuleInfoELF &MMIELF = + MMI->getObjFileInfo<MachineModuleInfoELF>(); + + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(), + OutContext), + isPPC64 ? 8 : 4/*size*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + return AsmPrinter::doFinalization(M); +} + +/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2. +void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { + // In the ELFv2 ABI, in functions that use the TOC register, we need to + // provide two entry points. The ABI guarantees that when calling the + // local entry point, r2 is set up by the caller to contain the TOC base + // for this function, and when calling the global entry point, r12 is set + // up by the caller to hold the address of the global entry point. We + // thus emit a prefix sequence along the following lines: + // + // func: + // # global entry point + // addis r2,r12,(.TOC.-func)@ha + // addi r2,r2,(.TOC.-func)@l + // .localentry func, .-func + // # local entry point, followed by function body + // + // This ensures we have r2 set up correctly while executing the function + // body, no matter which entry point is called. + if (Subtarget.isELFv2ABI() + // Only do all that if the function uses r2 in the first place. 
+ && !MF->getRegInfo().use_empty(PPC::X2)) { + + MCSymbol *GlobalEntryLabel = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(GlobalEntryLabel); + const MCSymbolRefExpr *GlobalEntryLabelExp = + MCSymbolRefExpr::Create(GlobalEntryLabel, OutContext); + + MCSymbol *TOCSymbol = OutContext.GetOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); + + const MCExpr *TOCDeltaHi = + PPCMCExpr::CreateHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::CreateLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + + MCSymbol *LocalEntryLabel = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(LocalEntryLabel); + const MCSymbolRefExpr *LocalEntryLabelExp = + MCSymbolRefExpr::Create(LocalEntryLabel, OutContext); + const MCExpr *LocalOffsetExp = + MCBinaryExpr::CreateSub(LocalEntryLabelExp, + GlobalEntryLabelExp, OutContext); + + PPCTargetStreamer *TS = + static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer()); + + if (TS) + TS->emitLocalEntry(CurrentFnSym, LocalOffsetExp); + } +} + +/// EmitFunctionBodyEnd - Print the traceback table before the .size +/// directive. +/// +void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() { + // Only the 64-bit target requires a traceback table. For now, + // we only emit the word of zeroes that GDB requires to find + // the end of the function, and zeroes for the eight-byte + // mandatory fields. + // FIXME: We should fill in the eight-byte mandatory fields as described in + // the PPC64 ELF ABI (this is a low-priority item because GDB does not + // currently make use of these fields). + if (Subtarget.isPPC64()) { + OutStreamer.EmitIntValue(0, 4/*size*/); + OutStreamer.EmitIntValue(0, 8/*size*/); + } +} + +void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { + static const char *const CPUDirectives[] = { + "", + "ppc", + "ppc440", + "ppc601", + "ppc602", + "ppc603", + "ppc7400", + "ppc750", + "ppc970", + "ppcA2", + "ppce500mc", + "ppce5500", + "power3", + "power4", + "power5", + "power5x", + "power6", + "power6x", + "power7", + "ppc64", + "ppc64le" + }; + + unsigned Directive = Subtarget.getDarwinDirective(); + if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970) + Directive = PPC::DIR_970; + if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400) + Directive = PPC::DIR_7400; + if (Subtarget.isPPC64() && Directive < PPC::DIR_64) + Directive = PPC::DIR_64; + assert(Directive <= PPC::DIR_64 && "Directive out of range."); + + assert(Directive < array_lengthof(CPUDirectives) && + "CPUDirectives[] might not be up-to-date!"); + PPCTargetStreamer &TStreamer = + *static_cast<PPCTargetStreamer *>(OutStreamer.getTargetStreamer()); + TStreamer.emitMachine(CPUDirectives[Directive]); + + // Prime text sections so they are adjacent. This reduces the likelihood a + // large data or debug section causes a branch to exceed 16M limit. 
+ const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection()); + if (TM.getRelocationModel() == Reloc::PIC_) { + OutStreamer.SwitchSection( + OutContext.getMachOSection("__TEXT", "__picsymbolstub1", + MachO::S_SYMBOL_STUBS | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 32, SectionKind::getText())); + } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) { + OutStreamer.SwitchSection( + OutContext.getMachOSection("__TEXT","__symbol_stub1", + MachO::S_SYMBOL_STUBS | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 16, SectionKind::getText())); + } + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); +} + +static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) { + // Remove $stub suffix, add $lazy_ptr. + StringRef NoStub = Sym->getName().substr(0, Sym->getName().size()-5); + return Ctx.GetOrCreateSymbol(NoStub + "$lazy_ptr"); +} + +static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) { + // Add $tmp suffix to $stub, yielding $stub$tmp. + return Ctx.GetOrCreateSymbol(Sym->getName() + "$tmp"); +} + +void PPCDarwinAsmPrinter:: +EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { + bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isDarwin = Subtarget.isDarwin(); + + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + + // .lazy_symbol_pointer + const MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection(); + + // Output stubs for dynamically-linked functions + if (TM.getRelocationModel() == Reloc::PIC_) { + const MCSection *StubSection = + OutContext.getMachOSection("__TEXT", "__picsymbolstub1", + MachO::S_SYMBOL_STUBS | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 32, SectionKind::getText()); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.SwitchSection(StubSection); + EmitAlignment(4); + + MCSymbol *Stub = Stubs[i].first; + MCSymbol *RawSym = Stubs[i].second.getPointer(); + MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); + MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext); + + OutStreamer.EmitLabel(Stub); + OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext); + const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext); + const MCExpr *Sub = + MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext); + + // mflr r0 + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R0)); + // bcl 20, 31, AnonSymbol + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCLalways).addExpr(Anon)); + OutStreamer.EmitLabel(AnonSymbol); + // mflr r11 + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MFLR).addReg(PPC::R11)); + // addis r11, r11, ha16(LazyPtr - AnonSymbol) + const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::R11) + .addReg(PPC::R11) + .addExpr(SubHa16)); + // mtlr r0 + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTLR).addReg(PPC::R0)); + + // ldu r12, lo16(LazyPtr - AnonSymbol)(r11) + // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11) + const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? 
PPC::LDU : PPC::LWZU) + .addReg(PPC::R12) + .addExpr(SubLo16).addExpr(SubLo16) + .addReg(PPC::R11)); + // mtctr r12 + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12)); + // bctr + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR)); + + OutStreamer.SwitchSection(LSPSection); + OutStreamer.EmitLabel(LazyPtr); + OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + MCSymbol *DyldStubBindingHelper = + OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper")); + if (isPPC64) { + // .quad dyld_stub_binding_helper + OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8); + } else { + // .long dyld_stub_binding_helper + OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4); + } + } + OutStreamer.AddBlankLine(); + return; + } + + const MCSection *StubSection = + OutContext.getMachOSection("__TEXT","__symbol_stub1", + MachO::S_SYMBOL_STUBS | + MachO::S_ATTR_PURE_INSTRUCTIONS, + 16, SectionKind::getText()); + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + MCSymbol *Stub = Stubs[i].first; + MCSymbol *RawSym = Stubs[i].second.getPointer(); + MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); + const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext); + + OutStreamer.SwitchSection(StubSection); + EmitAlignment(4); + OutStreamer.EmitLabel(Stub); + OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + // lis r11, ha16(LazyPtr) + const MCExpr *LazyPtrHa16 = + PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LIS) + .addReg(PPC::R11) + .addExpr(LazyPtrHa16)); + + // ldu r12, lo16(LazyPtr)(r11) + // lwzu r12, lo16(LazyPtr)(r11) + const MCExpr *LazyPtrLo16 = + PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext); + EmitToStreamer(OutStreamer, MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU) + .addReg(PPC::R12) + .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16) + .addReg(PPC::R11)); + + // mtctr r12 + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR).addReg(PPC::R12)); + // bctr + EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTR)); + + OutStreamer.SwitchSection(LSPSection); + OutStreamer.EmitLabel(LazyPtr); + OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); + + MCSymbol *DyldStubBindingHelper = + OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper")); + if (isPPC64) { + // .quad dyld_stub_binding_helper + OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8); + } else { + // .long dyld_stub_binding_helper + OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4); + } + } + + OutStreamer.AddBlankLine(); +} + + +bool PPCDarwinAsmPrinter::doFinalization(Module &M) { + bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + + // Darwin/PPC always uses mach-o. + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) + EmitFunctionStubs(Stubs); + + if (MAI->doesSupportExceptionHandling() && MMI) { + // Add the (possibly multiple) personalities to the set of global values. + // Only referenced functions get into the Personalities list. 
+ const std::vector<const Function*> &Personalities = MMI->getPersonalities(); + for (std::vector<const Function*>::const_iterator I = Personalities.begin(), + E = Personalities.end(); I != E; ++I) { + if (*I) { + MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMIMacho.getGVStubEntry(NLPSym); + StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true); + } + } + } + + // Output stubs for dynamically-linked functions. + Stubs = MMIMacho.GetGVStubList(); + + // Output macho stubs for external and common global variables. + if (!Stubs.empty()) { + // Switch with ".non_lazy_symbol_pointer" directive. + OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); + EmitAlignment(isPPC64 ? 3 : 2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info pointers + // need to be indirect and pc-rel. We accomplish this by using NLPs. + // However, sometimes the types are local to the file. So we need to + // fill in the value for the NLP in those cases. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), + OutContext), + isPPC64 ? 8 : 4/*size*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(isPPC64 ? 3 : 2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr:: + Create(Stubs[i].second.getPointer(), + OutContext), + isPPC64 ? 8 : 4/*size*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never generates + // code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + + return AsmPrinter::doFinalization(M); +} + +/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code +/// for a MachineFunction to the given output stream, in a format that the +/// Darwin assembler can deal with. +/// +static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm, + MCStreamer &Streamer) { + const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>(); + + if (Subtarget->isDarwin()) + return new PPCDarwinAsmPrinter(tm, Streamer); + return new PPCLinuxAsmPrinter(tm, Streamer); +} + +// Force static initialization. 
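+// (Typical use, as a sketch: a driver calls llvm::InitializeAllAsmPrinters()
+// or this hook directly; afterwards Target::createAsmPrinter(TM, Streamer)
+// on any of the three PPC targets hands back one of the printers above.)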
+extern "C" void LLVMInitializePowerPCAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp new file mode 100644 index 000000000000..ee906712ee02 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -0,0 +1,211 @@ +//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that scans a machine function to determine which +// conditional branches need more than 16 bits of displacement to reach their +// target basic block. It does this in two passes; a calculation of basic block +// positions pass, and a branch pseudo op to machine branch opcode pass. This +// pass should be run last, just before the assembly printer. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "ppc-branch-select" + +STATISTIC(NumExpanded, "Number of branches expanded to long format"); + +namespace llvm { + void initializePPCBSelPass(PassRegistry&); +} + +namespace { + struct PPCBSel : public MachineFunctionPass { + static char ID; + PPCBSel() : MachineFunctionPass(ID) { + initializePPCBSelPass(*PassRegistry::getPassRegistry()); + } + + /// BlockSizes - The sizes of the basic blocks in the function. + std::vector<unsigned> BlockSizes; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "PowerPC Branch Selector"; + } + }; + char PPCBSel::ID = 0; +} + +INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector", + false, false) + +/// createPPCBranchSelectionPass - returns an instance of the Branch Selection +/// Pass +/// +FunctionPass *llvm::createPPCBranchSelectionPass() { + return new PPCBSel(); +} + +bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { + const PPCInstrInfo *TII = + static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo()); + // Give the blocks of the function a dense, in-order, numbering. + Fn.RenumberBlocks(); + BlockSizes.resize(Fn.getNumBlockIDs()); + + // Measure each MBB and compute a size for the entire function. + unsigned FuncSize = 0; + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock *MBB = MFI; + + unsigned BlockSize = 0; + for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); + MBBI != EE; ++MBBI) + BlockSize += TII->GetInstSizeInBytes(MBBI); + + BlockSizes[MBB->getNumber()] = BlockSize; + FuncSize += BlockSize; + } + + // If the entire function is smaller than the displacement of a branch field, + // we know we don't need to shrink any branches in this function. This is a + // common case. 
+ if (FuncSize < (1 << 15)) { + BlockSizes.clear(); + return false; + } + + // For each conditional branch, if the offset to its destination is larger + // than the offset field allows, transform it into a long branch sequence + // like this: + // short branch: + // bCC MBB + // long branch: + // b!CC $PC+8 + // b MBB + // + bool MadeChange = true; + bool EverMadeChange = false; + while (MadeChange) { + // Iteratively expand branches until we reach a fixed point. + MadeChange = false; + + for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + unsigned MBBStartOffset = 0; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineBasicBlock *Dest = nullptr; + if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm()) + Dest = I->getOperand(2).getMBB(); + else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) && + !I->getOperand(1).isImm()) + Dest = I->getOperand(1).getMBB(); + else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ || + I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) && + !I->getOperand(0).isImm()) + Dest = I->getOperand(0).getMBB(); + + if (!Dest) { + MBBStartOffset += TII->GetInstSizeInBytes(I); + continue; + } + + // Determine the offset from the current branch to the destination + // block. + int BranchSize; + if (Dest->getNumber() <= MBB.getNumber()) { + // If this is a backwards branch, the delta is the offset from the + // start of this block to this branch, plus the sizes of all blocks + // from this block to the dest. + BranchSize = MBBStartOffset; + + for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } else { + // Otherwise, add the size of the blocks between this block and the + // dest to the number of bytes left in this block. + BranchSize = -MBBStartOffset; + + for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i) + BranchSize += BlockSizes[i]; + } + + // If this branch is in range, ignore it. + if (isInt<16>(BranchSize)) { + MBBStartOffset += 4; + continue; + } + + // Otherwise, we have to expand it to a long branch. + MachineInstr *OldBranch = I; + DebugLoc dl = OldBranch->getDebugLoc(); + + if (I->getOpcode() == PPC::BCC) { + // The BCC operands are: + // 0. PPC branch predicate + // 1. CR register + // 2. Target MBB + PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); + unsigned CRReg = I->getOperand(1).getReg(); + + // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. + BuildMI(MBB, I, dl, TII->get(PPC::BCC)) + .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); + } else if (I->getOpcode() == PPC::BC) { + unsigned CRBit = I->getOperand(0).getReg(); + BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2); + } else if (I->getOpcode() == PPC::BCn) { + unsigned CRBit = I->getOperand(0).getReg(); + BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2); + } else if (I->getOpcode() == PPC::BDNZ) { + BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2); + } else if (I->getOpcode() == PPC::BDNZ8) { + BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2); + } else if (I->getOpcode() == PPC::BDZ) { + BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2); + } else if (I->getOpcode() == PPC::BDZ8) { + BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2); + } else { + llvm_unreachable("Unhandled branch type!"); + } + + // Uncond branch to the real destination. 
+ I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest); + + // Remove the old branch from the function. + OldBranch->eraseFromParent(); + + // Remember that this replacement sequence is 8 bytes, 4 more than the + // old branch; increase the size of the block by 4 and remember to iterate. + BlockSizes[MBB.getNumber()] += 4; + MBBStartOffset += 8; + ++NumExpanded; + MadeChange = true; + } + } + EverMadeChange |= MadeChange; + } + + BlockSizes.clear(); + return true; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp new file mode 100644 index 000000000000..ec1e34d91f93 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -0,0 +1,663 @@ +//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies loops where we can generate the PPC branch instructions +// that decrement and test the count register (CTR) (bdnz and friends). +// +// The pattern that defines the induction variable can change depending on +// prior optimizations. For example, the IndVarSimplify phase run by 'opt' +// normalizes induction variables, and the Loop Strength Reduction pass +// run by 'llc' may also make changes to the induction variable. +// +// Criteria for CTR loops: +// - Countable loops (w/ ind. var for a trip count) +// - Try inner-most loops first +// - No nested CTR loops. +// - No function calls in loops. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "PPC.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +#ifndef NDEBUG +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#endif + +#include <algorithm> +#include <vector> + +using namespace llvm; + +#define DEBUG_TYPE "ctrloops" + +#ifndef NDEBUG +static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); +#endif + +STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); + +namespace llvm { + void initializePPCCTRLoopsPass(PassRegistry&); +#ifndef NDEBUG + void initializePPCCTRLoopsVerifyPass(PassRegistry&); +#endif +} + +namespace { + struct PPCCTRLoops : public FunctionPass { + +#ifndef NDEBUG + static int Counter; +#endif + + public: + static char ID; + + PPCCTRLoops() : FunctionPass(ID), TM(nullptr) { + initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); + } + PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { +
initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolution>(); + } + + private: + bool mightUseCTR(const Triple &TT, BasicBlock *BB); + bool convertToCTRLoop(Loop *L); + + private: + PPCTargetMachine *TM; + LoopInfo *LI; + ScalarEvolution *SE; + const DataLayout *DL; + DominatorTree *DT; + const TargetLibraryInfo *LibInfo; + }; + + char PPCCTRLoops::ID = 0; +#ifndef NDEBUG + int PPCCTRLoops::Counter = 0; +#endif + +#ifndef NDEBUG + struct PPCCTRLoopsVerify : public MachineFunctionPass { + public: + static char ID; + + PPCCTRLoopsVerify() : MachineFunctionPass(ID) { + initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + MachineDominatorTree *MDT; + }; + + char PPCCTRLoopsVerify::ID = 0; +#endif // NDEBUG +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", + false, false) + +FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) { + return new PPCCTRLoops(TM); +} + +#ifndef NDEBUG +INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", + "PowerPC CTR Loops Verify", false, false) + +FunctionPass *llvm::createPPCCTRLoopsVerify() { + return new PPCCTRLoopsVerify(); +} +#endif // NDEBUG + +bool PPCCTRLoops::runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); + DL = DLP ? &DLP->getDataLayout() : nullptr; + LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>(); + + bool MadeChange = false; + + for (LoopInfo::iterator I = LI->begin(), E = LI->end(); + I != E; ++I) { + Loop *L = *I; + if (!L->getParentLoop()) + MadeChange |= convertToCTRLoop(L); + } + + return MadeChange; +} + +static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); + + return false; +} + +bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); + J != JE; ++J) { + if (CallInst *CI = dyn_cast<CallInst>(J)) { + if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) { + // Inline ASM is okay, unless it clobbers the ctr register. 
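Aside: the constraint scan that follows can be modeled in a few lines. Every non-input constraint code of an inline-asm call is compared, case-insensitively, against "{ctr}"; a match means the asm may write the counter register, so the loop is rejected. A simplified stand-in for the InlineAsm::ConstraintInfoVector walk (our helper, for illustration only):

    #include <cctype>
    #include <string>
    #include <vector>

    // Codes holds the constraint codes of one non-input operand, e.g. "{ctr}".
    static bool asmClobbersCTR(const std::vector<std::string> &Codes) {
      for (const std::string &C : Codes) {
        std::string Lower;
        for (char Ch : C)
          Lower.push_back((char)std::tolower((unsigned char)Ch));
        if (Lower == "{ctr}")     // the equals_lower("{ctr}") test below
          return true;
      }
      return false;
    }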
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); + for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { + InlineAsm::ConstraintInfo &C = CIV[i]; + if (C.Type != InlineAsm::isInput) + for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) + if (StringRef(C.Codes[j]).equals_lower("{ctr}")) + return true; + } + + continue; + } + + if (!TM) + return true; + const TargetLowering *TLI = TM->getTargetLowering(); + + if (Function *F = CI->getCalledFunction()) { + // Most intrinsics don't become function calls, but some might. + // sin, cos, exp and log are always calls. + unsigned Opcode; + if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { + switch (F->getIntrinsicID()) { + default: continue; + +// VisualStudio defines setjmp as _setjmp +#if defined(_MSC_VER) && defined(setjmp) && \ + !defined(setjmp_undefined_for_msvc) +# pragma push_macro("setjmp") +# undef setjmp +# define setjmp_undefined_for_msvc +#endif + + case Intrinsic::setjmp: + +#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) + // let's return it to _setjmp state +# pragma pop_macro("setjmp") +# undef setjmp_undefined_for_msvc +#endif + + case Intrinsic::longjmp: + + // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp + // because, although it does clobber the counter register, the + // control can't then return to inside the loop unless there is also + // an eh_sjlj_setjmp. + case Intrinsic::eh_sjlj_setjmp: + + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::powi: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + case Intrinsic::sin: + case Intrinsic::cos: + return true; + case Intrinsic::copysign: + if (CI->getArgOperand(0)->getType()->getScalarType()-> + isPPC_FP128Ty()) + return true; + else + continue; // ISD::FCOPYSIGN is never a library call. + case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; + case Intrinsic::floor: Opcode = ISD::FFLOOR; break; + case Intrinsic::ceil: Opcode = ISD::FCEIL; break; + case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; + case Intrinsic::rint: Opcode = ISD::FRINT; break; + case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; + case Intrinsic::round: Opcode = ISD::FROUND; break; + } + } + + // PowerPC does not use [US]DIVREM or other library calls for + // operations on regular types which are not otherwise library calls + // (i.e. soft float or atomics). If adapting for targets that do, + // additional care is required here. + + LibFunc::Func Func; + if (!F->hasLocalLinkage() && F->hasName() && LibInfo && + LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->hasOptimizedCodeGen(Func)) { + // Non-read-only functions are never treated as intrinsics. + if (!CI->onlyReadsMemory()) + return true; + + // Conversion happens only for FP calls. + if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) + return true; + + switch (Func) { + default: return true; + case LibFunc::copysign: + case LibFunc::copysignf: + continue; // ISD::FCOPYSIGN is never a library call. + case LibFunc::copysignl: + return true; + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + continue; // ISD::FABS is never a library call. 
+ case LibFunc::sqrt: + case LibFunc::sqrtf: + case LibFunc::sqrtl: + Opcode = ISD::FSQRT; break; + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + Opcode = ISD::FFLOOR; break; + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + Opcode = ISD::FNEARBYINT; break; + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + Opcode = ISD::FCEIL; break; + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + Opcode = ISD::FRINT; break; + case LibFunc::round: + case LibFunc::roundf: + case LibFunc::roundl: + Opcode = ISD::FROUND; break; + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + Opcode = ISD::FTRUNC; break; + } + + MVT VTy = + TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true); + if (VTy == MVT::Other) + return true; + + if (TLI->isOperationLegalOrCustom(Opcode, VTy)) + continue; + else if (VTy.isVector() && + TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType())) + continue; + + return true; + } + } + + return true; + } else if (isa<BinaryOperator>(J) && + J->getType()->getScalarType()->isPPC_FP128Ty()) { + // Most operations on ppc_f128 values become calls. + return true; + } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) || + isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) { + CastInst *CI = cast<CastInst>(J); + if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || + CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || + isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) || + isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType())) + return true; + } else if (isLargeIntegerTy(TT.isArch32Bit(), + J->getType()->getScalarType()) && + (J->getOpcode() == Instruction::UDiv || + J->getOpcode() == Instruction::SDiv || + J->getOpcode() == Instruction::URem || + J->getOpcode() == Instruction::SRem)) { + return true; + } else if (TT.isArch32Bit() && + isLargeIntegerTy(false, J->getType()->getScalarType()) && + (J->getOpcode() == Instruction::Shl || + J->getOpcode() == Instruction::AShr || + J->getOpcode() == Instruction::LShr)) { + // Only on PPC32, for 128-bit integers (specifically not 64-bit + // integers), these might be runtime calls. + return true; + } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) { + // On PowerPC, indirect jumps use the counter register. + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) { + if (!TM) + return true; + const TargetLowering *TLI = TM->getTargetLowering(); + + if (TLI->supportJumpTables() && + SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries()) + return true; + } + } + + return false; +} + +bool PPCCTRLoops::convertToCTRLoop(Loop *L) { + bool MadeChange = false; + + Triple TT = Triple(L->getHeader()->getParent()->getParent()-> + getTargetTriple()); + if (!TT.isArch32Bit() && !TT.isArch64Bit()) + return MadeChange; // Unknown arch. type. + + // Process nested loops first. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + MadeChange |= convertToCTRLoop(*I); + } + + // If a nested loop has been converted, then we can't convert this loop. + if (MadeChange) + return MadeChange; + +#ifndef NDEBUG + // Stop trying after reaching the limit (if any). + int Limit = CTRLoopLimit; + if (Limit >= 0) { + if (Counter >= CTRLoopLimit) + return false; + Counter++; + } +#endif + + // We don't want to spill/restore the counter register, and so we don't + // want to use the counter register if the loop contains calls. 
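Aside: before following the scan below, it helps to see the end state the pass builds toward (the IR construction appears further down): the preheader executes mtctr(ExitCount + 1) and the chosen exit branch becomes a decrement-and-test of CTR. A host-side semantic sketch, not PPC code:

    #include <cstdint>

    static uint64_t ctr;                         // stands in for the CTR register
    static void mtctr(uint64_t N) { ctr = N; }
    static bool is_decremented_ctr_nonzero() { return --ctr != 0; }

    // With SCEV's convention that ExitCount is the number of backedges taken,
    // the body must run ExitCount + 1 times, which is exactly what loading
    // CTR with ExitCount + 1 and looping while the decrement is nonzero does.
    static uint64_t runLoop(uint64_t ExitCount) {
      uint64_t BodyRuns = 0;
      mtctr(ExitCount + 1);
      do {
        ++BodyRuns;                              // the loop body
      } while (is_decremented_ctr_nonzero());
      return BodyRuns;                           // == ExitCount + 1
    }

This is why the code below adds 1 to the expanded exit count before emitting the ppc_mtctr intrinsic.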
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(TT, *I))
+      return MadeChange;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  BasicBlock *CountedExitBlock = nullptr;
+  const SCEV *ExitCount = nullptr;
+  BranchInst *CountedExitBranch = nullptr;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+                    (*I)->getName() << ": " << *EC << "\n");
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // exiting block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
+
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
+
+    if (NotAlways)
+      continue;
+
+    // Make sure this block ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
+        continue;
+
+      CountedExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    CountedExitBlock = *I;
+    ExitCount = EC;
+    break;
+  }
+
+  if (!CountedExitBlock)
+    return MadeChange;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one. If we already have a
+  // preheader, then we can use it (except if the preheader contains a use of
+  // the CTR register because some such uses might be reordered by the
+  // selection DAG after the mtctr instruction).
+  if (!Preheader || mightUseCTR(TT, Preheader))
+    Preheader = InsertPreheaderForLoop(L, this);
+  if (!Preheader)
+    return MadeChange;
+
+  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+  // Insert the count into the preheader and replace the condition used by the
+  // selected branch.
+  MadeChange = true;
+
+  SCEVExpander SCEVE(*SE, "loopcnt");
+  LLVMContext &C = SE->getContext();
+  Type *CountType = TT.isArch64Bit() ?
Type::getInt64Ty(C) : + Type::getInt32Ty(C); + if (!ExitCount->getType()->isPointerTy() && + ExitCount->getType() != CountType) + ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(CountType, 1)); + Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType, + Preheader->getTerminator()); + + IRBuilder<> CountBuilder(Preheader->getTerminator()); + Module *M = Preheader->getParent()->getParent(); + Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, + CountType); + CountBuilder.CreateCall(MTCTRFunc, ECValue); + + IRBuilder<> CondBuilder(CountedExitBranch); + Value *DecFunc = + Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero); + Value *NewCond = CondBuilder.CreateCall(DecFunc); + Value *OldCond = CountedExitBranch->getCondition(); + CountedExitBranch->setCondition(NewCond); + + // The false branch must exit the loop. + if (!L->contains(CountedExitBranch->getSuccessor(0))) + CountedExitBranch->swapSuccessors(); + + // The old condition may be dead now, and may have even created a dead PHI + // (the original induction variable). + RecursivelyDeleteTriviallyDeadInstructions(OldCond); + DeleteDeadPHIs(CountedExitBlock); + + ++NumCTRLoops; + return MadeChange; +} + +#ifndef NDEBUG +static bool clobbersCTR(const MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg()) { + if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) + return true; + } else if (MO.isRegMask()) { + if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8)) + return true; + } + } + + return false; +} + +static bool verifyCTRBranch(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator BI = I; + SmallSet<MachineBasicBlock *, 16> Visited; + SmallVector<MachineBasicBlock *, 8> Preds; + bool CheckPreds; + + if (I == MBB->begin()) { + Visited.insert(MBB); + goto queue_preds; + } else + --I; + +check_block: + Visited.insert(MBB); + if (I == MBB->end()) + goto queue_preds; + + CheckPreds = true; + for (MachineBasicBlock::iterator IE = MBB->begin();; --I) { + unsigned Opc = I->getOpcode(); + if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) { + CheckPreds = false; + break; + } + + if (I != BI && clobbersCTR(I)) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" << + MBB->getFullName() << ") instruction " << *I << + " clobbers CTR, invalidating " << "BB#" << + BI->getParent()->getNumber() << " (" << + BI->getParent()->getFullName() << ") instruction " << + *BI << "\n"); + return false; + } + + if (I == IE) + break; + } + + if (!CheckPreds && Preds.empty()) + return true; + + if (CheckPreds) { +queue_preds: + if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { + DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" << + BI->getParent()->getNumber() << " (" << + BI->getParent()->getFullName() << ") instruction " << + *BI << "\n"); + return false; + } + + for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), + PIE = MBB->pred_end(); PI != PIE; ++PI) + Preds.push_back(*PI); + } + + do { + MBB = Preds.pop_back_val(); + if (!Visited.count(MBB)) { + I = MBB->getLastNonDebugInstr(); + goto check_block; + } + } while (!Preds.empty()); + + return true; +} + +bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { + MDT = &getAnalysis<MachineDominatorTree>(); + + // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before + // 
any other instructions that might clobber the ctr register. + for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); + I != IE; ++I) { + MachineBasicBlock *MBB = I; + if (!MDT->isReachableFromEntry(MBB)) + continue; + + for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(), + MIIE = MBB->end(); MII != MIIE; ++MII) { + unsigned Opc = MII->getOpcode(); + if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || + Opc == PPC::BDZ8 || Opc == PPC::BDZ) + if (!verifyCTRBranch(MBB, MII)) + llvm_unreachable("Invalid PPC CTR loop!"); + } + } + + return false; +} +#endif // NDEBUG + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td new file mode 100644 index 000000000000..222760a0cb91 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -0,0 +1,199 @@ +//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the PowerPC 32- and 64-bit +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; +class CCIfNotSubtarget<string F, CCAction A> + : CCIf<!strconcat("!State.getTarget().getSubtarget<PPCSubtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention +//===----------------------------------------------------------------------===// + +// Return-value convention for PowerPC +def RetCC_PPC : CallingConv<[ + // On PPC64, integer return values are always promoted to i64 + CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>, + CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>, + + CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, + CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>, + CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>, + + // Floating point types returned as "direct" go into F1 .. F8; note that + // only the ELFv2 ABI fully utilizes all these registers. + CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + + // Vector types returned as "direct" go into V2 .. V9; note that only the + // ELFv2 ABI fully utilizes all these registers. + CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>, + CCIfType<[v2f64, v2i64], + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>> +]>; + + +// Note that we don't currently have calling conventions for 64-bit +// PowerPC, but handle all the complexities of the ABI in the lowering +// logic. FIXME: See if the logic can be simplified with use of CCs. +// This may require some extensions to current table generation. + +// Simple calling convention for 64-bit ELF PowerPC fast isel. +// Only handle ints and floats. All ints are promoted to i64. +// Vector types and quadword ints are not handled. 
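Aside: a TableGen CallingConv is an ordered rule list, and the effect of the fast-isel convention defined next can be simulated with two counters. The register names come from the def; the explicit "stack" fallback is our simplification (the real fast-isel path simply bails on cases the def omits):

    // Promoted ints take X3..X10 in order, floats take F1..F8; once a file
    // is exhausted the argument would land in the parameter save area
    // (8-byte slots).
    enum class Kind { Int, Float };

    struct CCState {
      int NextGPR = 0;                  // index into X3..X10
      int NextFPR = 0;                  // index into F1..F8
      int StackBytes = 0;
    };

    static const char *assign(CCState &S, Kind K) {
      static const char *GPR[] = {"X3","X4","X5","X6","X7","X8","X9","X10"};
      static const char *FPR[] = {"F1","F2","F3","F4","F5","F6","F7","F8"};
      if (K == Kind::Int && S.NextGPR < 8)
        return GPR[S.NextGPR++];
      if (K == Kind::Float && S.NextFPR < 8)
        return FPR[S.NextFPR++];
      S.StackBytes += 8;                // simplification, see note above
      return "stack";
    }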
+def CC_PPC64_ELF_FIS : CallingConv<[ + CCIfType<[i1], CCPromoteToType<i64>>, + CCIfType<[i8], CCPromoteToType<i64>>, + CCIfType<[i16], CCPromoteToType<i64>>, + CCIfType<[i32], CCPromoteToType<i64>>, + CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>, + CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>> +]>; + +// Simple return-value convention for 64-bit ELF PowerPC fast isel. +// All small ints are promoted to i64. Vector types, quadword ints, +// and multiple register returns are "supported" to avoid compile +// errors, but none are handled by the fast selector. +def RetCC_PPC64_ELF_FIS : CallingConv<[ + CCIfType<[i1], CCPromoteToType<i64>>, + CCIfType<[i8], CCPromoteToType<i64>>, + CCIfType<[i16], CCPromoteToType<i64>>, + CCIfType<[i32], CCPromoteToType<i64>>, + CCIfType<[i64], CCAssignToReg<[X3, X4]>>, + CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>, + CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>, + CCIfType<[v2f64, v2i64], + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9]>> +]>; + +//===----------------------------------------------------------------------===// +// PowerPC System V Release 4 32-bit ABI +//===----------------------------------------------------------------------===// + +def CC_PPC32_SVR4_Common : CallingConv<[ + CCIfType<[i1], CCPromoteToType<i32>>, + + // The ABI requires i64 to be passed in two adjacent registers with the first + // register having an odd register number. + CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>, + + // The first 8 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>, + + // Make sure the i64 words from a long double are either both passed in + // registers or both passed on the stack. + CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>, + + // FP values are passed in F1 - F8. + CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, + + // Split arguments have an alignment of 8 bytes on the stack. + CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // Floats are stored in double precision format, thus they have the same + // alignment and size as doubles. + CCIfType<[f32,f64], CCAssignToStack<8, 8>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>> +]>; + +// This calling convention puts vector arguments always on the stack. It is used +// to assign vector arguments which belong to the variable portion of the +// parameter list of a variable argument function. +def CC_PPC32_SVR4_VarArg : CallingConv<[ + CCDelegateTo<CC_PPC32_SVR4_Common> +]>; + +// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to +// put vector arguments in vector registers before putting them on the stack. +def CC_PPC32_SVR4 : CallingConv<[ + // The first 12 Vector arguments are passed in AltiVec registers. 
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], + CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>, + CCIfType<[v2f64, v2i64], + CCAssignToReg<[VSH2, VSH3, VSH4, VSH5, VSH6, VSH7, VSH8, VSH9, + VSH10, VSH11, VSH12, VSH13]>>, + + CCDelegateTo<CC_PPC32_SVR4_Common> +]>; + +// Helper "calling convention" to handle aggregate by value arguments. +// Aggregate by value arguments are always placed in the local variable space +// of the caller. This calling convention is only used to assign those stack +// offsets in the callers stack frame. +// +// Still, the address of the aggregate copy in the callers stack frame is passed +// in a GPR (or in the parameter list area if all GPRs are allocated) from the +// caller to the callee. The location for the address argument is assigned by +// the CC_PPC32_SVR4 calling convention. +// +// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are +// not passed by value. + +def CC_PPC32_SVR4_ByVal : CallingConv<[ + CCIfByVal<CCPassByVal<4, 4>>, + + CCCustom<"CC_PPC32_SVR4_Custom_Dummy"> +]>; + +def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27, + V28, V29, V30, V31)>; + +def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, + R29, R30, R31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + +def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>; + +def CSR_SVR432 : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20, + R21, R22, R23, R24, R25, R26, R27, R28, + R29, R30, R31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + +def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>; + +def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20, + X21, X22, X23, X24, X25, X26, X27, X28, + X29, X30, X31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + +def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>; + +def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, + X21, X22, X23, X24, X25, X26, X27, X28, + X29, X30, X31, F14, F15, F16, F17, F18, + F19, F20, F21, F22, F23, F24, F25, F26, + F27, F28, F29, F30, F31, CR2, CR3, CR4 + )>; + + +def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>; + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp new file mode 100644 index 000000000000..08755238f925 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp @@ -0,0 +1,293 @@ +//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to +// JIT-compile bitcode to native PowerPC. 
+// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCRelocations.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Module.h" +#include "llvm/PassManager.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + class PPCCodeEmitter : public MachineFunctionPass { + TargetMachine &TM; + JITCodeEmitter &MCE; + MachineModuleInfo *MMI; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static char ID; + + /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record + /// its address in the function into this pointer. + void *MovePCtoLROffset; + public: + + PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) + : MachineFunctionPass(ID), TM(tm), MCE(mce) {} + + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; + + + MachineRelocation GetRelocation(const MachineOperand &MO, + unsigned RelocID) const; + + /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr + unsigned getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const; + + unsigned get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getAbsDirectBrEncoding(const MachineInstr &MI, + unsigned OpNo) const; + unsigned getAbsCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const; + + unsigned getImm16Encoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const; + + const char *getPassName() const override { + return "PowerPC Machine Code Emitter"; + } + + /// runOnMachineFunction - emits the given MachineFunction to memory + /// + bool runOnMachineFunction(MachineFunction &MF) override; + + /// emitBasicBlock - emits the given MachineBasicBlock to memory + /// + void emitBasicBlock(MachineBasicBlock &MBB); + }; +} + +char PPCCodeEmitter::ID = 0; + +/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code +/// to the specified MCE object. 
+FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new PPCCodeEmitter(TM, JCE);
+}
+
+bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+  do {
+    MovePCtoLROffset = nullptr;
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      emitBasicBlock(*BB);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+    const MachineInstr &MI = *I;
+    MCE.processDebugLoc(MI.getDebugLoc(), true);
+    switch (MI.getOpcode()) {
+    default:
+      MCE.emitWordBE(getBinaryCodeForInstr(MI));
+      break;
+    case TargetOpcode::CFI_INSTRUCTION:
+      break;
+    case TargetOpcode::EH_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+      break;
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break; // pseudo opcode, no side effects
+    case PPC::MovePCtoLR:
+    case PPC::MovePCtoLR8:
+      assert(TM.getRelocationModel() == Reloc::PIC_);
+      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
+      MCE.emitWordBE(0x48000005); // bl 1
+      break;
+    }
+    MCE.processDebugLoc(MI.getDebugLoc(), false);
+  }
+}
+
+unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO,
+                                                unsigned RelocID) const {
+  // If in PIC mode, we need to encode the negated address of the
+  // 'movepctolr' into the unrelocated field. After relocation, we'll have
+  // &gv-&movepctolr-4 in the imm field. Once &movepctolr is added to the imm
+  // field, we get &gv. This doesn't happen for branch relocations, which are
+  // always implicitly pc relative.
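Aside: the constant described in the comment above can be checked with simple arithmetic. The "bl 1" emitted for MovePCtoLR leaves MovePCtoLROffset + 4 in the link register, and adding that LR value to the stored immediate is what makes the extra -4 cancel. A worked sketch under that reading of the comment:

    #include <cstdint>

    // GetRelocation stores Cst = -&movepctolr - 4 in the field; once the
    // global's address and the LR value (&movepctolr + 4) are added back in,
    // everything but &gv cancels.
    static intptr_t resolvePIC(intptr_t GV, intptr_t MovePCtoLR) {
      intptr_t Cst = -MovePCtoLR - 4;        // encoded by GetRelocation
      intptr_t LR  = MovePCtoLR + 4;         // what "bl 1" left behind
      return GV + Cst + LR;                  // == GV
    }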
+ intptr_t Cst = 0; + if (TM.getRelocationModel() == Reloc::PIC_) { + assert(MovePCtoLROffset && "MovePCtoLR not seen yet?"); + Cst = -(intptr_t)MovePCtoLROffset - 4; + } + + if (MO.isGlobal()) + return MachineRelocation::getGV(MCE.getCurrentPCOffset(), RelocID, + const_cast<GlobalValue *>(MO.getGlobal()), + Cst, isa<Function>(MO.getGlobal())); + if (MO.isSymbol()) + return MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + RelocID, MO.getSymbolName(), Cst); + if (MO.isCPI()) + return MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + RelocID, MO.getIndex(), Cst); + + if (MO.isMBB()) + return MachineRelocation::getBB(MCE.getCurrentPCOffset(), + RelocID, MO.getMBB()); + + assert(MO.isJTI()); + return MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), + RelocID, MO.getIndex(), Cst); +} + +unsigned PPCCodeEmitter::getDirectBrEncoding(const MachineInstr &MI, + unsigned OpNo) const { + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO); + + MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bx)); + return 0; +} + +unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI, + unsigned OpNo) const { + const MachineOperand &MO = MI.getOperand(OpNo); + MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bcx)); + return 0; +} + +unsigned PPCCodeEmitter::getAbsDirectBrEncoding(const MachineInstr &MI, + unsigned OpNo) const { + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO); + + llvm_unreachable("Absolute branch relocations unsupported on the old JIT."); +} + +unsigned PPCCodeEmitter::getAbsCondBrEncoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("Absolute branch relocations unsupported on the old JIT."); +} + +unsigned PPCCodeEmitter::getImm16Encoding(const MachineInstr &MI, + unsigned OpNo) const { + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO); + + unsigned RelocID; + switch (MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) { + default: llvm_unreachable("Unsupported target operand flags!"); + case PPCII::MO_LO: RelocID = PPC::reloc_absolute_low; break; + case PPCII::MO_HA: RelocID = PPC::reloc_absolute_high; break; + } + + MCE.addRelocation(GetRelocation(MO, RelocID)); + return 0; +} + +unsigned PPCCodeEmitter::getMemRIEncoding(const MachineInstr &MI, + unsigned OpNo) const { + // Encode (imm, reg) as a memri, which has the low 16-bits as the + // displacement and the next 5 bits as the register #. + assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 16; + + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return (getMachineOpValue(MI, MO) & 0xFFFF) | RegBits; + + // Add a fixup for the displacement field. + MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low)); + return RegBits; +} + +unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI, + unsigned OpNo) const { + // Encode (imm, reg) as a memrix, which has the low 14-bits as the + // displacement and the next 5 bits as the register #. 
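Aside: the two displacement encodings built here differ only in field width and implied alignment, and can be mirrored by a minimal encoder (function names are ours):

    #include <cassert>
    #include <cstdint>

    // memri: a 5-bit register number above a 16-bit signed displacement,
    // matching the "RegBits << 16 | (imm & 0xFFFF)" packing above.
    static unsigned encodeMemRI(unsigned RegNo, int16_t Disp) {
      return (RegNo << 16) | (static_cast<uint16_t>(Disp) & 0xFFFF);
    }

    // memrix: the DS-form displacement must be a multiple of 4, so its low
    // two bits are dropped and only 14 bits are stored, as in the code below.
    static unsigned encodeMemRIX(unsigned RegNo, int32_t Disp) {
      assert((Disp & 3) == 0 && "memrix displacement must be 4-byte aligned");
      return (RegNo << 14) | ((static_cast<uint32_t>(Disp) >> 2) & 0x3FFF);
    }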
+ assert(MI.getOperand(OpNo+1).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 14; + + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isImm()) + return ((getMachineOpValue(MI, MO) >> 2) & 0x3FFF) | RegBits; + + MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix)); + return RegBits; +} + + +unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("TLS not supported on the old JIT."); + return 0; +} + +unsigned PPCCodeEmitter::getTLSCallEncoding(const MachineInstr &MI, + unsigned OpNo) const { + llvm_unreachable("TLS not supported on the old JIT."); + return 0; +} + +unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const { + + if (MO.isReg()) { + // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand. + // The GPR operand should come through here though. + assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 && + MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) || + MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7); + return TM.getRegisterInfo()->getEncodingValue(MO.getReg()); + } + + assert(MO.isImm() && + "Relocation required in an instruction that we cannot encode!"); + return MO.getImm(); +} + +#include "PPCGenCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp new file mode 100644 index 000000000000..2e524d604789 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -0,0 +1,2282 @@ +//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// PPCGenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPCISelLowering.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" + +//===----------------------------------------------------------------------===// +// +// TBD: +// FastLowerArguments: Handle simple cases. +// PPCMaterializeGV: Handle TLS. +// SelectCall: Handle function pointers. +// SelectCall: Handle multi-register return values. +// SelectCall: Optimize away nops for local calls. +// processCallArgs: Handle bit-converted arguments. +// finishCall: Handle multi-register return values. +// PPCComputeAddress: Handle parameter references as FrameIndex's. 
+// PPCEmitCmp: Handle immediate as operand 1. +// SelectCall: Handle small byval arguments. +// SelectIntrinsicCall: Implement. +// SelectSelect: Implement. +// Consider factoring isTypeLegal into the base class. +// Implement switches and jump tables. +// +//===----------------------------------------------------------------------===// +using namespace llvm; + +#define DEBUG_TYPE "ppcfastisel" + +namespace { + +typedef struct Address { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FI; + } Base; + + long Offset; + + // Innocuous defaults for our address. + Address() + : BaseType(RegBase), Offset(0) { + Base.Reg = 0; + } +} Address; + +class PPCFastISel final : public FastISel { + + const TargetMachine &TM; + const TargetInstrInfo &TII; + const TargetLowering &TLI; + const PPCSubtarget *PPCSubTarget; + LLVMContext *Context; + + public: + explicit PPCFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo), + TM(FuncInfo.MF->getTarget()), + TII(*TM.getInstrInfo()), + TLI(*TM.getTargetLowering()), + PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()), + Context(&FuncInfo.Fn->getContext()) { } + + // Backend specific FastISel code. + private: + bool TargetSelectInstruction(const Instruction *I) override; + unsigned TargetMaterializeConstant(const Constant *C) override; + unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; + bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) override; + bool FastLowerArguments() override; + unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override; + unsigned FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm); + unsigned FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill); + unsigned FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill); + + // Instruction selection routines. + private: + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectIToFP(const Instruction *I, bool IsSigned); + bool SelectFPToI(const Instruction *I, bool IsSigned); + bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode); + bool SelectCall(const Instruction *I); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + + // Utility routines. 
+ private: + bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadTypeLegal(Type *Ty, MVT &VT); + bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value, + bool isZExt, unsigned DestReg); + bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + const TargetRegisterClass *RC, bool IsZExt = true, + unsigned FP64LoadOpc = PPC::LFD); + bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr); + bool PPCComputeAddress(const Value *Obj, Address &Addr); + void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset, + unsigned &IndexReg); + bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + unsigned DestReg, bool IsZExt); + unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT); + unsigned PPCMaterializeInt(const Constant *C, MVT VT); + unsigned PPCMaterialize32BitInt(int64_t Imm, + const TargetRegisterClass *RC); + unsigned PPCMaterialize64BitInt(int64_t Imm, + const TargetRegisterClass *RC); + unsigned PPCMoveToIntReg(const Instruction *I, MVT VT, + unsigned SrcReg, bool IsSigned); + unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned); + + // Call handling routines. + private: + bool processCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes, + bool IsVarArg); + void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes, bool IsVarArg); + CCAssignFn *usePPC32CCs(unsigned Flag); + + private: + #include "PPCGenFastISel.inc" + +}; + +} // end anonymous namespace + +#include "PPCGenCallingConv.inc" + +// Function whose sole purpose is to kill compiler warnings +// stemming from unused functions included from PPCGenCallingConv.inc. +CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) { + if (Flag == 1) + return CC_PPC32_SVR4; + else if (Flag == 2) + return CC_PPC32_SVR4_ByVal; + else if (Flag == 3) + return CC_PPC32_SVR4_VarArg; + else + return RetCC_PPC; +} + +static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) { + switch (Pred) { + // These are not representable with any single compare. + case CmpInst::FCMP_FALSE: + case CmpInst::FCMP_UEQ: + case CmpInst::FCMP_UGT: + case CmpInst::FCMP_UGE: + case CmpInst::FCMP_ULT: + case CmpInst::FCMP_ULE: + case CmpInst::FCMP_UNE: + case CmpInst::FCMP_TRUE: + default: + return Optional<PPC::Predicate>(); + + case CmpInst::FCMP_OEQ: + case CmpInst::ICMP_EQ: + return PPC::PRED_EQ; + + case CmpInst::FCMP_OGT: + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_SGT: + return PPC::PRED_GT; + + case CmpInst::FCMP_OGE: + case CmpInst::ICMP_UGE: + case CmpInst::ICMP_SGE: + return PPC::PRED_GE; + + case CmpInst::FCMP_OLT: + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_SLT: + return PPC::PRED_LT; + + case CmpInst::FCMP_OLE: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_SLE: + return PPC::PRED_LE; + + case CmpInst::FCMP_ONE: + case CmpInst::ICMP_NE: + return PPC::PRED_NE; + + case CmpInst::FCMP_ORD: + return PPC::PRED_NU; + + case CmpInst::FCMP_UNO: + return PPC::PRED_UN; + } +} + +// Determine whether the type Ty is simple enough to be handled by +// fast-isel, and return its equivalent machine type in VT. +// FIXME: Copied directly from ARM -- factor into base class? +bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT Evt = TLI.getValueType(Ty, true); + + // Only handle simple types. 
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type that can be sign- or zero-extended to a basic
+  // operation, go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address. Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      long TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = DL.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
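Aside: the folding walk above is easy to model standalone. A struct field contributes its StructLayout byte offset; an array or pointer step contributes index times element allocation size; any non-constant step aborts the fold. Illustrative types, not the LLVM API:

    #include <cstdint>
    #include <vector>

    struct Step {
      bool IsField;       // struct field (true) vs. scaled index (false)
      int64_t Value;      // field byte offset, or the index value
      int64_t Scale;      // element size for scaled steps; unused for fields
    };

    static int64_t foldGEPOffset(const std::vector<Step> &Steps) {
      int64_t Off = 0;
      for (const Step &S : Steps)
        Off += S.IsField ? S.Value : S.Value * S.Scale;
      return Off;
    }

    // p[2].f, where elements are 16 bytes and f sits 8 bytes in:
    // foldGEPOffset({{false, 2, 16}, {true, 8, 0}}) == 40.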
+ Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast<AllocaInst>(Obj); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = SI->second; + return true; + } + break; + } + } + + // FIXME: References to parameters fall through to the behavior + // below. They should be able to reference a frame index since + // they are stored to the stack, so we can get "ld rx, offset(r1)" + // instead of "addi ry, r1, offset / ld rx, 0(ry)". Obj will + // just contain the parameter. Try to handle this with a FI. + + // Try to get this in a register if nothing else has worked. + if (Addr.Base.Reg == 0) + Addr.Base.Reg = getRegForValue(Obj); + + // Prevent assignment of base register to X0, which is inappropriate + // for loads and stores alike. + if (Addr.Base.Reg != 0) + MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass); + + return Addr.Base.Reg != 0; +} + +// Fix up some addresses that can't be used directly. For example, if +// an offset won't fit in an instruction field, we may need to move it +// into an index register. +void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset, + unsigned &IndexReg) { + + // Check whether the offset fits in the instruction field. + if (!isInt<16>(Addr.Offset)) + UseOffset = false; + + // If this is a stack pointer and the offset needs to be simplified then + // put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) { + unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), + ResultReg).addFrameIndex(Addr.Base.FI).addImm(0); + Addr.Base.Reg = ResultReg; + Addr.BaseType = Address::RegBase; + } + + if (!UseOffset) { + IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context) + : Type::getInt64Ty(*Context)); + const ConstantInt *Offset = + ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset)); + IndexReg = PPCMaterializeInt(Offset, MVT::i64); + assert(IndexReg && "Unexpected error in PPCMaterializeInt!"); + } +} + +// Emit a load instruction if possible, returning true if we succeeded, +// otherwise false. See commentary below for how the register class of +// the load is determined. +bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, + const TargetRegisterClass *RC, + bool IsZExt, unsigned FP64LoadOpc) { + unsigned Opc; + bool UseOffset = true; + + // If ResultReg is given, it determines the register class of the load. + // Otherwise, RC is the register class to use. If the result of the + // load isn't anticipated in this block, both may be zero, in which + // case we must make a conservative guess. In particular, don't assign + // R0 or X0 to the result register, as the result may be used in a load, + // store, add-immediate, or isel that won't permit this. (Though + // perhaps the spill and reload of live-exit values would handle this?) + const TargetRegisterClass *UseRC = + (ResultReg ? MRI.getRegClass(ResultReg) : + (RC ? RC : + (VT == MVT::f64 ? &PPC::F8RCRegClass : + (VT == MVT::f32 ? &PPC::F4RCRegClass : + (VT == MVT::i64 ? 
&PPC::G8RC_and_G8RC_NOX0RegClass : + &PPC::GPRC_and_GPRC_NOR0RegClass))))); + + bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass); + + switch (VT.SimpleTy) { + default: // e.g., vector types not handled + return false; + case MVT::i8: + Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8; + break; + case MVT::i16: + Opc = (IsZExt ? + (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : + (Is32BitInt ? PPC::LHA : PPC::LHA8)); + break; + case MVT::i32: + Opc = (IsZExt ? + (Is32BitInt ? PPC::LWZ : PPC::LWZ8) : + (Is32BitInt ? PPC::LWA_32 : PPC::LWA)); + if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0)) + UseOffset = false; + break; + case MVT::i64: + Opc = PPC::LD; + assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && + "64-bit load with 32-bit target??"); + UseOffset = ((Addr.Offset & 3) == 0); + break; + case MVT::f32: + Opc = PPC::LFS; + break; + case MVT::f64: + Opc = FP64LoadOpc; + break; + } + + // If necessary, materialize the offset into a register and use + // the indexed form. Also handle stack pointers with special needs. + unsigned IndexReg = 0; + PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg); + if (ResultReg == 0) + ResultReg = createResultReg(UseRC); + + // Note: If we still have a frame index here, we know the offset is + // in range, as otherwise PPCSimplifyAddress would have converted it + // into a RegBase. + if (Addr.BaseType == Address::FrameIndexBase) { + + MachineMemOperand *MMO = + FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI), + MFI.getObjectAlignment(Addr.Base.FI)); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO); + + // Base reg with offset in range. + } else if (UseOffset) { + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addImm(Addr.Offset).addReg(Addr.Base.Reg); + + // Indexed form. + } else { + // Get the RR opcode corresponding to the RI one. FIXME: It would be + // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it + // is hard to get at. + switch (Opc) { + default: llvm_unreachable("Unexpected opcode!"); + case PPC::LBZ: Opc = PPC::LBZX; break; + case PPC::LBZ8: Opc = PPC::LBZX8; break; + case PPC::LHZ: Opc = PPC::LHZX; break; + case PPC::LHZ8: Opc = PPC::LHZX8; break; + case PPC::LHA: Opc = PPC::LHAX; break; + case PPC::LHA8: Opc = PPC::LHAX8; break; + case PPC::LWZ: Opc = PPC::LWZX; break; + case PPC::LWZ8: Opc = PPC::LWZX8; break; + case PPC::LWA: Opc = PPC::LWAX; break; + case PPC::LWA_32: Opc = PPC::LWAX_32; break; + case PPC::LD: Opc = PPC::LDX; break; + case PPC::LFS: Opc = PPC::LFSX; break; + case PPC::LFD: Opc = PPC::LFDX; break; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Addr.Base.Reg).addReg(IndexReg); + } + + return true; +} + +// Attempt to fast-select a load instruction. +bool PPCFastISel::SelectLoad(const Instruction *I) { + // FIXME: No atomic loads are supported. + if (cast<LoadInst>(I)->isAtomic()) + return false; + + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(I->getType(), VT)) + return false; + + // See if we can handle this address. + Address Addr; + if (!PPCComputeAddress(I->getOperand(0), Addr)) + return false; + + // Look at the currently assigned register for this instruction + // to determine the required register class. 
This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+  assert(SrcReg && "Nothing to store!");
+  unsigned Opc;
+  bool UseOffset = true;
+
+  const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+  bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+      break;
+    case MVT::i16:
+      Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+      break;
+    case MVT::i32:
+      assert(Is32BitInt && "Not GPRC for i32??");
+      Opc = PPC::STW;
+      break;
+    case MVT::i64:
+      Opc = PPC::STD;
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::STFS;
+      break;
+    case MVT::f64:
+      Opc = PPC::STFD;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form. Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+        .addReg(SrcReg)
+        .addImm(Addr.Offset)
+        .addFrameIndex(Addr.Base.FI)
+        .addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  else {
+    // Get the RR opcode corresponding to the RI one. FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::STB:  Opc = PPC::STBX;  break;
+      case PPC::STH:  Opc = PPC::STHX;  break;
+      case PPC::STW:  Opc = PPC::STWX;  break;
+      case PPC::STB8: Opc = PPC::STBX8; break;
+      case PPC::STH8: Opc = PPC::STHX8; break;
+      case PPC::STW8: Opc = PPC::STWX8; break;
+      case PPC::STD:  Opc = PPC::STDX;  break;
+      case PPC::STFS: Opc = PPC::STFSX; break;
+      case PPC::STFD: Opc = PPC::STFDX; break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+      .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomic stores are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
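Aside: PPCEmitStore above and PPCEmitLoad before it make the same addressing-mode decision, which distills to a single predicate. D-form needs a signed 16-bit offset; the DS-form opcodes (LD, STD, LWA) additionally need the low two bits clear; anything else materializes the offset into a register and switches to the X-form twin of the opcode. A sketch:

    #include <cstdint>

    enum class Form { DForm, XForm };

    // IsDSForm marks LD/STD/LWA-style opcodes whose displacement field drops
    // the low two bits (the "(Addr.Offset & 3) == 0" tests in the code).
    static Form chooseForm(int64_t Offset, bool IsDSForm) {
      bool Fits = Offset >= -32768 && Offset <= 32767;   // isInt<16>(Offset)
      if (IsDSForm && (Offset & 3) != 0)
        Fits = false;
      return Fits ? Form::DForm : Form::XForm;
    }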
+ Address Addr; + if (!PPCComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!PPCEmitStore(VT, SrcReg, Addr)) + return false; + + return true; +} + +// Attempt to fast-select a branch instruction. +bool PPCFastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *BrBB = FuncInfo.MBB; + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // For now, just try the simplest case where it's fed by a compare. + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate()); + if (!OptPPCPred) + return false; + + PPC::Predicate PPCPred = OptPPCPred.getValue(); + + // Take advantage of fall-through opportunities. + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + PPCPred = PPC::InvertPredicate(PPCPred); + } + + unsigned CondReg = createResultReg(&PPC::CRRCRegClass); + + if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(), + CondReg)) + return false; + + BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC)) + .addImm(PPCPred).addReg(CondReg).addMBB(TBB); + FastEmitBranch(FBB, DbgLoc); + FuncInfo.MBB->addSuccessor(TBB); + return true; + + } else if (const ConstantInt *CI = + dyn_cast<ConstantInt>(BI->getCondition())) { + uint64_t Imm = CI->getZExtValue(); + MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; + FastEmitBranch(Target, DbgLoc); + return true; + } + + // FIXME: ARM looks for a case where the block containing the compare + // has been split from the block containing the branch. If this happens, + // there is a vreg available containing the result of the compare. I'm + // not sure we can do much, as we've lost the predicate information with + // the compare instruction -- we have a 4-bit CR but don't know which bit + // to test here. + return false; +} + +// Attempt to emit a compare of the two source values. Signed and unsigned +// comparisons are supported. Return false if we can't handle it. +bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2, + bool IsZExt, unsigned DestReg) { + Type *Ty = SrcValue1->getType(); + EVT SrcEVT = TLI.getValueType(Ty, true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits()) + return false; + + // See if operand 2 is an immediate encodeable in the compare. + // FIXME: Operands are not in canonical order at -O0, so an immediate + // operand in position 1 is a lost opportunity for now. We are + // similar to ARM in this regard. + long Imm = 0; + bool UseImm = false; + + // Only 16-bit integer constants can be represented in compares for + // PowerPC. Others will be materialized into a register. + if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) { + if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || + SrcVT == MVT::i8 || SrcVT == MVT::i1) { + const APInt &CIVal = ConstInt->getValue(); + Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue(); + if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm))) + UseImm = true; + } + } + + unsigned CmpOpc; + bool NeedsExt = false; + switch (SrcVT.SimpleTy) { + default: return false; + case MVT::f32: + CmpOpc = PPC::FCMPUS; + break; + case MVT::f64: + CmpOpc = PPC::FCMPUD; + break; + case MVT::i1: + case MVT::i8: + case MVT::i16: + NeedsExt = true; + // Intentional fall-through. 
+ case MVT::i32: + if (!UseImm) + CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW; + else + CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI; + break; + case MVT::i64: + if (!UseImm) + CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD; + else + CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI; + break; + } + + unsigned SrcReg1 = getRegForValue(SrcValue1); + if (SrcReg1 == 0) + return false; + + unsigned SrcReg2 = 0; + if (!UseImm) { + SrcReg2 = getRegForValue(SrcValue2); + if (SrcReg2 == 0) + return false; + } + + if (NeedsExt) { + unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt)) + return false; + SrcReg1 = ExtReg; + + if (!UseImm) { + unsigned ExtReg = createResultReg(&PPC::GPRCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt)) + return false; + SrcReg2 = ExtReg; + } + } + + if (!UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg) + .addReg(SrcReg1).addReg(SrcReg2); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc), DestReg) + .addReg(SrcReg1).addImm(Imm); + + return true; +} + +// Attempt to fast-select a floating-point extend instruction. +bool PPCFastISel::SelectFPExt(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(Src->getType(), true); + EVT DestVT = TLI.getValueType(I->getType(), true); + + if (SrcVT != MVT::f32 || DestVT != MVT::f64) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // No code is generated for a FP extend. + UpdateValueMap(I, SrcReg); + return true; +} + +// Attempt to fast-select a floating-point truncate instruction. +bool PPCFastISel::SelectFPTrunc(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(Src->getType(), true); + EVT DestVT = TLI.getValueType(I->getType(), true); + + if (SrcVT != MVT::f64 || DestVT != MVT::f32) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // Round the result to single precision. + unsigned DestReg = createResultReg(&PPC::F4RCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), DestReg) + .addReg(SrcReg); + + UpdateValueMap(I, DestReg); + return true; +} + +// Move an i32 or i64 value in a GPR to an f64 value in an FPR. +// FIXME: When direct register moves are implemented (see PowerISA 2.08), +// those should be used instead of moving via a stack slot when the +// subtarget permits. +// FIXME: The code here is sloppy for the 4-byte case. Can use a 4-byte +// stack slot and 4-byte store/load sequence. Or just sext the 4-byte +// case to 8 bytes which produces tighter code but wastes stack space. +unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg, + bool IsSigned) { + + // If necessary, extend 32-bit int to 64-bit. + if (SrcVT == MVT::i32) { + unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned)) + return 0; + SrcReg = TmpReg; + } + + // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary. + Address Addr; + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = MFI.CreateStackObject(8, 8, false); + + // Store the value from the GPR. + if (!PPCEmitStore(MVT::i64, SrcReg, Addr)) + return 0; + + // Load the integer value into an FPR. The kind of load used depends + // on a number of conditions. 
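+  // In sketch form: the default LFD reload of the whole 8-byte slot is
+  // always correct, because the value stored above is a full 64-bit
+  // sign- or zero-extended integer.  For i32 sources we prefer the word
+  // loads when we can: LFIWZX (zero-extending) for unsigned values, and
+  // LFIWAX (algebraic, i.e. sign-extending) for signed values when the
+  // subtarget has it.  Offset 4 picks out the low-order word of the
+  // big-endian doubleword slot.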
+ unsigned LoadOpc = PPC::LFD; + + if (SrcVT == MVT::i32) { + if (!IsSigned) { + LoadOpc = PPC::LFIWZX; + Addr.Offset = 4; + } else if (PPCSubTarget->hasLFIWAX()) { + LoadOpc = PPC::LFIWAX; + Addr.Offset = 4; + } + } + + const TargetRegisterClass *RC = &PPC::F8RCRegClass; + unsigned ResultReg = 0; + if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc)) + return 0; + + return ResultReg; +} + +// Attempt to fast-select an integer-to-floating-point conversion. +bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) { + MVT DstVT; + Type *DstTy = I->getType(); + if (!isTypeLegal(DstTy, DstVT)) + return false; + + if (DstVT != MVT::f32 && DstVT != MVT::f64) + return false; + + Value *Src = I->getOperand(0); + EVT SrcEVT = TLI.getValueType(Src->getType(), true); + if (!SrcEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + + if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && + SrcVT != MVT::i32 && SrcVT != MVT::i64) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (SrcReg == 0) + return false; + + // We can only lower an unsigned convert if we have the newer + // floating-point conversion operations. + if (!IsSigned && !PPCSubTarget->hasFPCVT()) + return false; + + // FIXME: For now we require the newer floating-point conversion operations + // (which are present only on P7 and A2 server models) when converting + // to single-precision float. Otherwise we have to generate a lot of + // fiddly code to avoid double rounding. If necessary, the fiddly code + // can be found in PPCTargetLowering::LowerINT_TO_FP(). + if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT()) + return false; + + // Extend the input if necessary. + if (SrcVT == MVT::i8 || SrcVT == MVT::i16) { + unsigned TmpReg = createResultReg(&PPC::G8RCRegClass); + if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned)) + return false; + SrcVT = MVT::i64; + SrcReg = TmpReg; + } + + // Move the integer value to an FPR. + unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned); + if (FPReg == 0) + return false; + + // Determine the opcode for the conversion. + const TargetRegisterClass *RC = &PPC::F8RCRegClass; + unsigned DestReg = createResultReg(RC); + unsigned Opc; + + if (DstVT == MVT::f32) + Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS; + else + Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU; + + // Generate the convert. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(FPReg); + + UpdateValueMap(I, DestReg); + return true; +} + +// Move the floating-point value in SrcReg into an integer destination +// register, and return the register (or zero if we can't handle it). +// FIXME: When direct register moves are implemented (see PowerISA 2.08), +// those should be used instead of moving via a stack slot when the +// subtarget permits. +unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT, + unsigned SrcReg, bool IsSigned) { + // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary. + // Note that if have STFIWX available, we could use a 4-byte stack + // slot for i32, but this being fast-isel we'll just go with the + // easiest code gen possible. + Address Addr; + Addr.BaseType = Address::FrameIndexBase; + Addr.Base.FI = MFI.CreateStackObject(8, 8, false); + + // Store the value from the FPR. + if (!PPCEmitStore(MVT::f64, SrcReg, Addr)) + return 0; + + // Reload it into a GPR. If we want an i32, modify the address + // to have a 4-byte offset so we load from the right place. 
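+  // The doubleword slot is laid out big-endian here, so the i32 payload
+  // occupies bytes 4..7 of the slot; hence the offset of 4 below.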
+ if (VT == MVT::i32) + Addr.Offset = 4; + + // Look at the currently assigned register for this instruction + // to determine the required register class. + unsigned AssignedReg = FuncInfo.ValueMap[I]; + const TargetRegisterClass *RC = + AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr; + + unsigned ResultReg = 0; + if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned)) + return 0; + + return ResultReg; +} + +// Attempt to fast-select a floating-point-to-integer conversion. +bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) { + MVT DstVT, SrcVT; + Type *DstTy = I->getType(); + if (!isTypeLegal(DstTy, DstVT)) + return false; + + if (DstVT != MVT::i32 && DstVT != MVT::i64) + return false; + + // If we don't have FCTIDUZ and we need it, punt to SelectionDAG. + if (DstVT == MVT::i64 && !IsSigned && !PPCSubTarget->hasFPCVT()) + return false; + + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + if (!isTypeLegal(SrcTy, SrcVT)) + return false; + + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (SrcReg == 0) + return false; + + // Convert f32 to f64 if necessary. This is just a meaningless copy + // to get the register class right. COPY_TO_REGCLASS is needed since + // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream. + const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg); + if (InRC == &PPC::F4RCRegClass) { + unsigned TmpReg = createResultReg(&PPC::F8RCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg) + .addReg(SrcReg).addImm(PPC::F8RCRegClassID); + SrcReg = TmpReg; + } + + // Determine the opcode for the conversion, which takes place + // entirely within FPRs. + unsigned DestReg = createResultReg(&PPC::F8RCRegClass); + unsigned Opc; + + if (DstVT == MVT::i32) + if (IsSigned) + Opc = PPC::FCTIWZ; + else + Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ; + else + Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ; + + // Generate the convert. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(SrcReg); + + // Now move the integer value from a float register to an integer register. + unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned); + if (IntReg == 0) + return false; + + UpdateValueMap(I, IntReg); + return true; +} + +// Attempt to fast-select a binary integer operation that isn't already +// handled automatically. +bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { + EVT DestVT = TLI.getValueType(I->getType(), true); + + // We can get here in the case when we have a binary operation on a non-legal + // type and the target independent selector doesn't know how to handle it. + if (DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + // Look at the currently assigned register for this instruction + // to determine the required register class. If there is no register, + // make a conservative choice (don't assign R0). + unsigned AssignedReg = FuncInfo.ValueMap[I]; + const TargetRegisterClass *RC = + (AssignedReg ? MRI.getRegClass(AssignedReg) : + &PPC::GPRC_and_GPRC_NOR0RegClass); + bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); + + unsigned Opc; + switch (ISDOpcode) { + default: return false; + case ISD::ADD: + Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8; + break; + case ISD::OR: + Opc = IsGPRC ? PPC::OR : PPC::OR8; + break; + case ISD::SUB: + Opc = IsGPRC ? 
PPC::SUBF : PPC::SUBF8; + break; + } + + unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass); + unsigned SrcReg1 = getRegForValue(I->getOperand(0)); + if (SrcReg1 == 0) return false; + + // Handle case of small immediate operand. + if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) { + const APInt &CIVal = ConstInt->getValue(); + int Imm = (int)CIVal.getSExtValue(); + bool UseImm = true; + if (isInt<16>(Imm)) { + switch (Opc) { + default: + llvm_unreachable("Missing case!"); + case PPC::ADD4: + Opc = PPC::ADDI; + MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass); + break; + case PPC::ADD8: + Opc = PPC::ADDI8; + MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass); + break; + case PPC::OR: + Opc = PPC::ORI; + break; + case PPC::OR8: + Opc = PPC::ORI8; + break; + case PPC::SUBF: + if (Imm == -32768) + UseImm = false; + else { + Opc = PPC::ADDI; + MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass); + Imm = -Imm; + } + break; + case PPC::SUBF8: + if (Imm == -32768) + UseImm = false; + else { + Opc = PPC::ADDI8; + MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass); + Imm = -Imm; + } + break; + } + + if (UseImm) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), + ResultReg) + .addReg(SrcReg1) + .addImm(Imm); + UpdateValueMap(I, ResultReg); + return true; + } + } + } + + // Reg-reg case. + unsigned SrcReg2 = getRegForValue(I->getOperand(1)); + if (SrcReg2 == 0) return false; + + // Reverse operands for subtract-from. + if (ISDOpcode == ISD::SUB) + std::swap(SrcReg1, SrcReg2); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg1).addReg(SrcReg2); + UpdateValueMap(I, ResultReg); + return true; +} + +// Handle arguments to a call that we're attempting to fast-select. +// Return false if the arguments are too complex for us at the moment. +bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args, + SmallVectorImpl<unsigned> &ArgRegs, + SmallVectorImpl<MVT> &ArgVTs, + SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags, + SmallVectorImpl<unsigned> &RegArgs, + CallingConv::ID CC, + unsigned &NumBytes, + bool IsVarArg) { + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context); + + // Reserve space for the linkage area on the stack. + bool isELFv2ABI = PPCSubTarget->isELFv2ABI(); + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); + CCInfo.AllocateStack(LinkageSize, 8); + + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS); + + // Bail out if we can't handle any of the arguments. + for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { + CCValAssign &VA = ArgLocs[I]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // Skip vector arguments for now, as well as long double and + // uint128_t, and anything that isn't passed in a register. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 || ArgVT == MVT::i1 || + !VA.isRegLoc() || VA.needsCustom()) + return false; + + // Skip bit-converted arguments for now. + if (VA.getLocInfo() == CCValAssign::BCvt) + return false; + } + + // Get a count of how many bytes are to be pushed onto the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // The prolog code of the callee may store up to 8 GPR argument registers to + // the stack, allowing va_start to index over them in memory if its varargs. + // Because we cannot tell if this is needed on the caller side, we have to + // conservatively assume that it is needed. 
As such, make sure we have at + // least enough stack space for the caller to store the 8 GPRs. + // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area. + NumBytes = std::max(NumBytes, LinkageSize + 64); + + // Issue CALLSEQ_START. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TII.getCallFrameSetupOpcode())) + .addImm(NumBytes); + + // Prepare to assign register arguments. Every argument uses up a + // GPR protocol register even if it's passed in a floating-point + // register. + unsigned NextGPR = PPC::X3; + unsigned NextFPR = PPC::F1; + + // Process arguments. + for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) { + CCValAssign &VA = ArgLocs[I]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // Handle argument promotion and bitcasts. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false)) + llvm_unreachable("Failed to emit a sext!"); + ArgVT = DestVT; + Arg = TmpReg; + break; + } + case CCValAssign::AExt: + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true)) + llvm_unreachable("Failed to emit a zext!"); + ArgVT = DestVT; + Arg = TmpReg; + break; + } + case CCValAssign::BCvt: { + // FIXME: Not yet handled. + llvm_unreachable("Should have bailed before getting here!"); + break; + } + } + + // Copy this argument to the appropriate register. + unsigned ArgReg; + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) { + ArgReg = NextFPR++; + ++NextGPR; + } else + ArgReg = NextGPR++; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ArgReg).addReg(Arg); + RegArgs.push_back(ArgReg); + } + + return true; +} + +// For a call that we've determined we can fast-select, finish the +// call sequence and generate a copy to obtain the return value (if any). +void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, + const Instruction *I, CallingConv::ID CC, + unsigned &NumBytes, bool IsVarArg) { + // Issue CallSEQ_END. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TII.getCallFrameDestroyOpcode())) + .addImm(NumBytes).addImm(0); + + // Next, generate a copy to obtain the return value. + // FIXME: No multi-register return values yet, though I don't foresee + // any real difficulties there. + if (RetVT != MVT::isVoid) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + CCValAssign &VA = RVLocs[0]; + assert(RVLocs.size() == 1 && "No support for multi-reg return values!"); + assert(VA.isRegLoc() && "Can only return in registers!"); + + MVT DestVT = VA.getValVT(); + MVT CopyVT = DestVT; + + // Ints smaller than a register still arrive in a full 64-bit + // register, so make sure we recognize this. 
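+    // For example, a call returning i8 still defines all 64 bits of X3,
+    // so we note the copy type as i64 here and extract the narrow value
+    // via a GPRC copy below.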
+ if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) + CopyVT = MVT::i64; + + unsigned SourcePhysReg = VA.getLocReg(); + unsigned ResultReg = 0; + + if (RetVT == CopyVT) { + const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT); + ResultReg = createResultReg(CpyRC); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SourcePhysReg); + + // If necessary, round the floating result to single precision. + } else if (CopyVT == MVT::f64) { + ResultReg = createResultReg(TLI.getRegClassFor(RetVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::FRSP), + ResultReg).addReg(SourcePhysReg); + + // If only the low half of a general register is needed, generate + // a GPRC copy instead of a G8RC copy. (EXTRACT_SUBREG can't be + // used along the fast-isel path (not lowered), and downstream logic + // also doesn't like a direct subreg copy on a physical reg.) + } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) { + ResultReg = createResultReg(&PPC::GPRCRegClass); + // Convert physical register from G8RC to GPRC. + SourcePhysReg -= PPC::X0 - PPC::R0; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg) + .addReg(SourcePhysReg); + } + + assert(ResultReg && "ResultReg unset!"); + UsedRegs.push_back(SourcePhysReg); + UpdateValueMap(I, ResultReg); + } +} + +// Attempt to fast-select a call instruction. +bool PPCFastISel::SelectCall(const Instruction *I) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm. + if (isa<InlineAsm>(Callee)) + return false; + + // Allow SelectionDAG isel to handle tail calls. + if (CI->isTailCall()) + return false; + + // Obtain calling convention. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + bool IsVarArg = FTy->isVarArg(); + + // Not ready for varargs yet. + if (IsVarArg) + return false; + + // Handle simple calls for now, with legal return types and + // those that can be extended. + Type *RetTy = I->getType(); + MVT RetVT; + if (RetTy->isVoidTy()) + RetVT = MVT::isVoid; + else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 && + RetVT != MVT::i8) + return false; + + // FIXME: No multi-register return values yet. + if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 && + RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 && + RetVT != MVT::f64) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context); + CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS); + if (RVLocs.size() > 1) + return false; + } + + // Bail early if more than 8 arguments, as we only currently + // handle arguments passed in registers. + unsigned NumArgs = CS.arg_size(); + if (NumArgs > 8) + return false; + + // Set up the argument vectors. + SmallVector<Value*, 8> Args; + SmallVector<unsigned, 8> ArgRegs; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + + Args.reserve(NumArgs); + ArgRegs.reserve(NumArgs); + ArgVTs.reserve(NumArgs); + ArgFlags.reserve(NumArgs); + + for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end(); + II != IE; ++II) { + // FIXME: ARM does something for intrinsic calls here, check into that. + + unsigned AttrIdx = II - CS.arg_begin() + 1; + + // Only handle easy calls for now. 
It would be reasonably easy + // to handle <= 8-byte structures passed ByVal in registers, but we + // have to ensure they are right-justified in the register. + if (CS.paramHasAttr(AttrIdx, Attribute::InReg) || + CS.paramHasAttr(AttrIdx, Attribute::StructRet) || + CS.paramHasAttr(AttrIdx, Attribute::Nest) || + CS.paramHasAttr(AttrIdx, Attribute::ByVal)) + return false; + + ISD::ArgFlagsTy Flags; + if (CS.paramHasAttr(AttrIdx, Attribute::SExt)) + Flags.setSExt(); + if (CS.paramHasAttr(AttrIdx, Attribute::ZExt)) + Flags.setZExt(); + + Type *ArgTy = (*II)->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8) + return false; + + if (ArgVT.isVector()) + return false; + + unsigned Arg = getRegForValue(*II); + if (Arg == 0) + return false; + + unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*II); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Process the arguments. + SmallVector<unsigned, 8> RegArgs; + unsigned NumBytes; + + if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, + RegArgs, CC, NumBytes, IsVarArg)) + return false; + + // FIXME: No handling for function pointers yet. This requires + // implementing the function descriptor (OPD) setup. + const GlobalValue *GV = dyn_cast<GlobalValue>(Callee); + if (!GV) + return false; + + // Build direct call with NOP for TOC restore. + // FIXME: We can and should optimize away the NOP for local calls. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::BL8_NOP)); + // Add callee. + MIB.addGlobalAddress(GV); + + // Add implicit physical register uses to the call. + for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II) + MIB.addReg(RegArgs[II], RegState::Implicit); + + // Direct calls in the ELFv2 ABI need the TOC register live into the call. + if (PPCSubTarget->isELFv2ABI()) + MIB.addReg(PPC::X2, RegState::Implicit); + + // Add a register mask with the call-preserved registers. Proper + // defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CC)); + + // Finish off the call including any return values. + SmallVector<unsigned, 4> UsedRegs; + finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg); + + // Set all unused physregs defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +// Attempt to fast-select a return instruction. +bool PPCFastISel::SelectRet(const Instruction *I) { + + if (!FuncInfo.CanLowerReturn) + return false; + + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + CallingConv::ID CC = F.getCallingConv(); + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context); + CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS); + const Value *RV = Ret->getOperand(0); + + // FIXME: Only one output register for now. + if (ValLocs.size() > 1) + return false; + + // Special case for returning a constant integer of any size. + // Materialize the constant as an i64 and copy it to the return + // register. 
This avoids an unnecessary extend or truncate. + if (isa<ConstantInt>(*RV)) { + const Constant *C = cast<Constant>(RV); + unsigned SrcReg = PPCMaterializeInt(C, MVT::i64); + unsigned RetReg = ValLocs[0].getLocReg(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); + RetRegs.push_back(RetReg); + + } else { + unsigned Reg = getRegForValue(RV); + + if (Reg == 0) + return false; + + // Copy the result values into the output registers. + for (unsigned i = 0; i < ValLocs.size(); ++i) { + + CCValAssign &VA = ValLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + RetRegs.push_back(VA.getLocReg()); + unsigned SrcReg = Reg + VA.getValNo(); + + EVT RVEVT = TLI.getValueType(RV->getType()); + if (!RVEVT.isSimple()) + return false; + MVT RVVT = RVEVT.getSimpleVT(); + MVT DestVT = VA.getLocVT(); + + if (RVVT != DestVT && RVVT != MVT::i8 && + RVVT != MVT::i16 && RVVT != MVT::i32) + return false; + + if (RVVT != DestVT) { + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + llvm_unreachable("Full value assign but types don't match?"); + case CCValAssign::AExt: + case CCValAssign::ZExt: { + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true)) + return false; + SrcReg = TmpReg; + break; + } + case CCValAssign::SExt: { + const TargetRegisterClass *RC = + (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned TmpReg = createResultReg(RC); + if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false)) + return false; + SrcReg = TmpReg; + break; + } + } + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetRegs[i]) + .addReg(SrcReg); + } + } + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::BLR)); + + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + + return true; +} + +// Attempt to emit an integer extend of SrcReg into DestReg. Both +// signed and zero extensions are supported. Return false if we +// can't handle it. +bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + unsigned DestReg, bool IsZExt) { + if (DestVT != MVT::i32 && DestVT != MVT::i64) + return false; + if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32) + return false; + + // Signed extensions use EXTSB, EXTSH, EXTSW. + if (!IsZExt) { + unsigned Opc; + if (SrcVT == MVT::i8) + Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64; + else if (SrcVT == MVT::i16) + Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64; + else { + assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??"); + Opc = PPC::EXTSW_32_64; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(SrcReg); + + // Unsigned 32-bit extensions use RLWINM. + } else if (DestVT == MVT::i32) { + unsigned MB; + if (SrcVT == MVT::i8) + MB = 24; + else { + assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??"); + MB = 16; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLWINM), + DestReg) + .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31); + + // Unsigned 64-bit extensions use RLDICL (with a 32-bit source). 
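+  // For instance, an i16 -> i64 zero extension becomes roughly
+  //   rldicl Dst, Src, 0, 48
+  // which clears the 48 high-order bits and keeps the low halfword.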
+ } else { + unsigned MB; + if (SrcVT == MVT::i8) + MB = 56; + else if (SrcVT == MVT::i16) + MB = 48; + else + MB = 32; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(PPC::RLDICL_32_64), DestReg) + .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB); + } + + return true; +} + +// Attempt to fast-select an indirect branch instruction. +bool PPCFastISel::SelectIndirectBr(const Instruction *I) { + unsigned AddrReg = getRegForValue(I->getOperand(0)); + if (AddrReg == 0) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::MTCTR8)) + .addReg(AddrReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8)); + + const IndirectBrInst *IB = cast<IndirectBrInst>(I); + for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + + return true; +} + +// Attempt to fast-select an integer truncate instruction. +bool PPCFastISel::SelectTrunc(const Instruction *I) { + Value *Src = I->getOperand(0); + EVT SrcVT = TLI.getValueType(Src->getType(), true); + EVT DestVT = TLI.getValueType(I->getType(), true); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16) + return false; + + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) + return false; + + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + // The only interesting case is when we need to switch register classes. + if (SrcVT == MVT::i64) { + unsigned ResultReg = createResultReg(&PPC::GPRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), + ResultReg).addReg(SrcReg, 0, PPC::sub_32); + SrcReg = ResultReg; + } + + UpdateValueMap(I, SrcReg); + return true; +} + +// Attempt to fast-select an integer extend instruction. +bool PPCFastISel::SelectIntExt(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + + bool IsZExt = isa<ZExtInst>(I); + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) return false; + + EVT SrcEVT, DestEVT; + SrcEVT = TLI.getValueType(SrcTy, true); + DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + // If we know the register class needed for the result of this + // instruction, use it. Otherwise pick the register class of the + // correct size that does not contain X0/R0, since we don't know + // whether downstream uses permit that assignment. + unsigned AssignedReg = FuncInfo.ValueMap[I]; + const TargetRegisterClass *RC = + (AssignedReg ? MRI.getRegClass(AssignedReg) : + (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass : + &PPC::GPRC_and_GPRC_NOR0RegClass)); + unsigned ResultReg = createResultReg(RC); + + if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt)) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +// Attempt to fast-select an instruction that wasn't handled by +// the table-generated machinery. 
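+// This hook is invoked once per IR instruction after the table-generated
+// fast path has declined it; returning false sends the instruction (and
+// typically the rest of its block) back to the SelectionDAG selector.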
+bool PPCFastISel::TargetSelectInstruction(const Instruction *I) { + + switch (I->getOpcode()) { + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::SIToFP: + return SelectIToFP(I, /*IsSigned*/ true); + case Instruction::UIToFP: + return SelectIToFP(I, /*IsSigned*/ false); + case Instruction::FPToSI: + return SelectFPToI(I, /*IsSigned*/ true); + case Instruction::FPToUI: + return SelectFPToI(I, /*IsSigned*/ false); + case Instruction::Add: + return SelectBinaryIntOp(I, ISD::ADD); + case Instruction::Or: + return SelectBinaryIntOp(I, ISD::OR); + case Instruction::Sub: + return SelectBinaryIntOp(I, ISD::SUB); + case Instruction::Call: + if (dyn_cast<IntrinsicInst>(I)) + return false; + return SelectCall(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + // Here add other flavors of Instruction::XXX that automated + // cases don't catch. For example, switches are terminators + // that aren't yet handled. + default: + break; + } + return false; +} + +// Materialize a floating-point constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { + // No plans to handle long double here. + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + // All FP constants are loaded from the constant pool. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + assert(Align > 0 && "Unexpectedly missing alignment information!"); + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + CodeModel::Model CModel = TM.getCodeModel(); + + MachineMemOperand *MMO = + FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + (VT == MVT::f32) ? 4 : 8, Align); + + unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD; + unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + + // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)). + if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocCPT), + TmpReg) + .addConstantPoolIndex(Idx).addReg(PPC::X2); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addImm(0).addReg(TmpReg).addMemOperand(MMO); + } else { + // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx); + // But for large code model, we must generate a LDtocL followed + // by the LF[SD]. 
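+    // Sketched as assembly, the medium code model form is
+    //   addis  Tmp,  X2, Idx@toc@ha
+    //   lf[sd] Dest, Idx@toc@l(Tmp)
+    // while the large code model loads the pool address first:
+    //   addis  Tmp,  X2, Idx@toc@ha
+    //   ld     Tmp2, Idx@toc@l(Tmp)
+    //   lf[sd] Dest, 0(Tmp2)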
+ if (CModel == CodeModel::Large) { + unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), + TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addImm(0).addReg(TmpReg2); + } else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO) + .addReg(TmpReg) + .addMemOperand(MMO); + } + + return DestReg; +} + +// Materialize the address of a global value into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { + assert(VT == MVT::i64 && "Non-address!"); + const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass; + unsigned DestReg = createResultReg(RC); + + // Global values may be plain old object addresses, TLS object + // addresses, constant pool entries, or jump tables. How we generate + // code for these may depend on small, medium, or large code model. + CodeModel::Model CModel = TM.getCodeModel(); + + // FIXME: Jump tables are not yet required because fast-isel doesn't + // handle switches; if that changes, we need them as well. For now, + // what follows assumes everything's a generic (or TLS) global address. + + // FIXME: We don't yet handle the complexity of TLS. + if (GV->isThreadLocal()) + return 0; + + // For small code model, generate a simple TOC load. + if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtoc), + DestReg) + .addGlobalAddress(GV) + .addReg(PPC::X2); + else { + // If the address is an externally defined symbol, a symbol with common + // or externally available linkage, a non-local function address, or a + // jump table address (not yet needed), or if we are generating code + // for large code model, we generate: + // LDtocL(GV, ADDIStocHA(%X2, GV)) + // Otherwise we generate: + // ADDItocL(ADDIStocHA(%X2, GV), GV) + // Either way, start with the ADDIStocHA: + unsigned HighPartReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), + HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); + + // If/when switches are implemented, jump tables should be handled + // on the "if" path here. + if (CModel == CodeModel::Large || + (GV->getType()->getElementType()->isFunctionTy() && + (GV->isDeclaration() || GV->isWeakForLinker())) || + GV->isDeclaration() || GV->hasCommonLinkage() || + GV->hasAvailableExternallyLinkage()) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), + DestReg).addGlobalAddress(GV).addReg(HighPartReg); + else + // Otherwise generate the ADDItocL. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL), + DestReg).addReg(HighPartReg).addGlobalAddress(GV); + } + + return DestReg; +} + +// Materialize a 32-bit integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm, + const TargetRegisterClass *RC) { + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + unsigned ResultReg = createResultReg(RC); + bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass); + + if (isInt<16>(Imm)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? 
PPC::LI : PPC::LI8), ResultReg) + .addImm(Imm); + else if (Lo) { + // Both Lo and Hi have nonzero bits. + unsigned TmpReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg) + .addImm(Hi); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg) + .addReg(TmpReg).addImm(Lo); + } else + // Just Hi bits. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg) + .addImm(Hi); + + return ResultReg; +} + +// Materialize a 64-bit integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, + const TargetRegisterClass *RC) { + unsigned Remainder = 0; + unsigned Shift = 0; + + // If the value doesn't fit in 32 bits, see if we can shift it + // so that it fits in 32 bits. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros<uint64_t>(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + if (isInt<32>(ImmSh)) + Imm = ImmSh; + else { + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Handle the high-order 32 bits (if shifted) or the whole 32 bits + // (if not shifted). + unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC); + if (!Shift) + return TmpReg1; + + // If upper 32 bits were not zero, we've built them and need to shift + // them into place. + unsigned TmpReg2; + if (Imm) { + TmpReg2 = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::RLDICR), + TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift); + } else + TmpReg2 = TmpReg1; + + unsigned TmpReg3, Hi, Lo; + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + TmpReg3 = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORIS8), + TmpReg3).addReg(TmpReg2).addImm(Hi); + } else + TmpReg3 = TmpReg2; + + if ((Lo = Remainder & 0xFFFF)) { + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8), + ResultReg).addReg(TmpReg3).addImm(Lo); + return ResultReg; + } + + return TmpReg3; +} + + +// Materialize an integer constant into a register, and return +// the register number (or zero if we failed to handle it). +unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) { + // If we're using CR bit registers for i1 values, handle that as a special + // case first. + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { + const ConstantInt *CI = cast<ConstantInt>(C); + unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); + return ImmReg; + } + + if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && + VT != MVT::i8 && VT != MVT::i1) + return 0; + + const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : + &PPC::GPRCRegClass); + + // If the constant is in range, use a load-immediate. + const ConstantInt *CI = cast<ConstantInt>(C); + if (isInt<16>(CI->getSExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getSExtValue()); + return ImmReg; + } + + // Construct the constant piecewise. 
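+  // Roughly, a 64-bit constant such as 0x123456789ABCDEF0 is built by
+  // PPCMaterialize64BitInt above as
+  //   lis    Rx, 0x1234          (high halfword of the upper word)
+  //   ori    Rx, Rx, 0x5678      (Rx = 0x12345678)
+  //   rldicr Rx, Rx, 32, 31      (shift into the high-order word)
+  //   oris   Rx, Rx, 0x9ABC
+  //   ori    Rx, Rx, 0xDEF0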
+ int64_t Imm = CI->getZExtValue(); + + if (VT == MVT::i64) + return PPCMaterialize64BitInt(Imm, RC); + else if (VT == MVT::i32) + return PPCMaterialize32BitInt(Imm, RC); + + return 0; +} + +// Materialize a constant into a register, and return the register +// number (or zero if we failed to handle it). +unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return PPCMaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return PPCMaterializeGV(GV, VT); + else if (isa<ConstantInt>(C)) + return PPCMaterializeInt(C, VT); + + return 0; +} + +// Materialize the address created by an alloca into a register, and +// return the register number (or zero if we failed to handle it). +unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; + + MVT VT; + if (!isLoadTypeLegal(AI->getType(), VT)) return 0; + + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8), + ResultReg).addFrameIndex(SI->second).addImm(0); + return ResultReg; + } + + return 0; +} + +// Fold loads into extends when possible. +// FIXME: We can have multiple redundant extend/trunc instructions +// following a load. The folding only picks up one. Extend this +// to check subsequent instructions for the same pattern and remove +// them. Thus ResultReg should be the def reg for the last redundant +// instruction in a chain, and all intervening instructions can be +// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll +// to add ELF64-NOT: rldicl to the appropriate tests when this works. +bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + // Verify we have a legal type before going any further. + MVT VT; + if (!isLoadTypeLegal(LI->getType(), VT)) + return false; + + // Combine load followed by zero- or sign-extend. + bool IsZExt = false; + switch(MI->getOpcode()) { + default: + return false; + + case PPC::RLDICL: + case PPC::RLDICL_32_64: { + IsZExt = true; + unsigned MB = MI->getOperand(3).getImm(); + if ((VT == MVT::i8 && MB <= 56) || + (VT == MVT::i16 && MB <= 48) || + (VT == MVT::i32 && MB <= 32)) + break; + return false; + } + + case PPC::RLWINM: + case PPC::RLWINM8: { + IsZExt = true; + unsigned MB = MI->getOperand(3).getImm(); + if ((VT == MVT::i8 && MB <= 24) || + (VT == MVT::i16 && MB <= 16)) + break; + return false; + } + + case PPC::EXTSB: + case PPC::EXTSB8: + case PPC::EXTSB8_32_64: + /* There is no sign-extending load-byte instruction. */ + return false; + + case PPC::EXTSH: + case PPC::EXTSH8: + case PPC::EXTSH8_32_64: { + if (VT != MVT::i16 && VT != MVT::i8) + return false; + break; + } + + case PPC::EXTSW: + case PPC::EXTSW_32_64: { + if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) + return false; + break; + } + } + + // See if we can handle this address. 
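+  // If so, the fold below re-emits the memory access as a single zero-
+  // or sign-extending load that defines the extend's own result
+  // register, and then erases the now-redundant extend.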
+ Address Addr; + if (!PPCComputeAddress(LI->getOperand(0), Addr)) + return false; + + unsigned ResultReg = MI->getOperand(0).getReg(); + + if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt)) + return false; + + MI->eraseFromParent(); + return true; +} + +// Attempt to lower call arguments in a faster way than done by +// the selection DAG code. +bool PPCFastISel::FastLowerArguments() { + // Defer to normal argument lowering for now. It's reasonably + // efficient. Consider doing something like ARM to handle the + // case where all args fit in registers, no varargs, no float + // or vector args. + return false; +} + +// Handle materializing integer constants into a register. This is not +// automatically generated for PowerPC, so must be explicitly created here. +unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) { + + if (Opc != ISD::Constant) + return 0; + + // If we're using CR bit registers for i1 values, handle that as a special + // case first. + if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { + unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg); + return ImmReg; + } + + if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && + VT != MVT::i8 && VT != MVT::i1) + return 0; + + const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass : + &PPC::GPRCRegClass); + if (VT == MVT::i64) + return PPCMaterialize64BitInt(Imm, RC); + else + return PPCMaterialize32BitInt(Imm, RC); +} + +// Override for ADDI and ADDI8 to set the correct register class +// on RHS operand 0. The automatic infrastructure naively assumes +// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost +// for these cases. At the moment, none of the other automatically +// generated RI instructions require special treatment. However, once +// SelectSelect is implemented, "isel" requires similar handling. +// +// Also be conservative about the output register class. Avoid +// assigning R0 or X0 to the output register for GPRC and G8RC +// register classes, as any such result could be used in ADDI, etc., +// where those regs have another meaning. +unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode, + const TargetRegisterClass *RC, + unsigned Op0, bool Op0IsKill, + uint64_t Imm) { + if (MachineInstOpcode == PPC::ADDI) + MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass); + else if (MachineInstOpcode == PPC::ADDI8) + MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass); + + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC, + Op0, Op0IsKill, Imm); +} + +// Override for instructions with one register operand to avoid use of +// R0/X0. The automatic infrastructure isn't aware of the context so +// we must be conservative. +unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode, + const TargetRegisterClass* RC, + unsigned Op0, bool Op0IsKill) { + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill); +} + +// Override for instructions with two register operands to avoid use +// of R0/X0. 
The automatic infrastructure isn't aware of the context +// so we must be conservative. +unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode, + const TargetRegisterClass* RC, + unsigned Op0, bool Op0IsKill, + unsigned Op1, bool Op1IsKill) { + const TargetRegisterClass *UseRC = + (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass : + (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC)); + + return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill, + Op1, Op1IsKill); +} + +namespace llvm { + // Create the fast instruction selector for PowerPC64 ELF. + FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + const TargetMachine &TM = FuncInfo.MF->getTarget(); + + // Only available on 64-bit ELF for now. + const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>(); + if (Subtarget->isPPC64() && Subtarget->isSVR4ABI()) + return new PPCFastISel(FuncInfo, LibInfo); + + return nullptr; + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp new file mode 100644 index 000000000000..b2577a9c7cf7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -0,0 +1,1604 @@ +//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PPC implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "PPCFrameLowering.h" +#include "PPCInstrBuilder.h" +#include "PPCInstrInfo.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +/// VRRegNo - Map from a numbered VR register to its enum value. +/// +static const uint16_t VRRegNo[] = { + PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , + PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, + PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, + PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 +}; + +PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, + (STI.hasQPX() || STI.isBGQ()) ? 32 : 16, 0), + Subtarget(STI) {} + +// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack. +const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( + unsigned &NumEntries) const { + if (Subtarget.isDarwinABI()) { + NumEntries = 1; + if (Subtarget.isPPC64()) { + static const SpillSlot darwin64Offsets = {PPC::X31, -8}; + return &darwin64Offsets; + } else { + static const SpillSlot darwinOffsets = {PPC::R31, -4}; + return &darwinOffsets; + } + } + + // Early exit if not using the SVR4 ABI. 
+ if (!Subtarget.isSVR4ABI()) { + NumEntries = 0; + return nullptr; + } + + // Note that the offsets here overlap, but this is fixed up in + // processFunctionBeforeFrameFinalized. + + static const SpillSlot Offsets[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::R31, -4}, + {PPC::R30, -8}, + {PPC::R29, -12}, + {PPC::R28, -16}, + {PPC::R27, -20}, + {PPC::R26, -24}, + {PPC::R25, -28}, + {PPC::R24, -32}, + {PPC::R23, -36}, + {PPC::R22, -40}, + {PPC::R21, -44}, + {PPC::R20, -48}, + {PPC::R19, -52}, + {PPC::R18, -56}, + {PPC::R17, -60}, + {PPC::R16, -64}, + {PPC::R15, -68}, + {PPC::R14, -72}, + + // CR save area offset. We map each of the nonvolatile CR fields + // to the slot for CR2, which is the first of the nonvolatile CR + // fields to be assigned, so that we only allocate one save slot. + // See PPCRegisterInfo::hasReservedSpillSlot() for more information. + {PPC::CR2, -4}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + static const SpillSlot Offsets64[] = { + // Floating-point register save area offsets. + {PPC::F31, -8}, + {PPC::F30, -16}, + {PPC::F29, -24}, + {PPC::F28, -32}, + {PPC::F27, -40}, + {PPC::F26, -48}, + {PPC::F25, -56}, + {PPC::F24, -64}, + {PPC::F23, -72}, + {PPC::F22, -80}, + {PPC::F21, -88}, + {PPC::F20, -96}, + {PPC::F19, -104}, + {PPC::F18, -112}, + {PPC::F17, -120}, + {PPC::F16, -128}, + {PPC::F15, -136}, + {PPC::F14, -144}, + + // General register save area offsets. + {PPC::X31, -8}, + {PPC::X30, -16}, + {PPC::X29, -24}, + {PPC::X28, -32}, + {PPC::X27, -40}, + {PPC::X26, -48}, + {PPC::X25, -56}, + {PPC::X24, -64}, + {PPC::X23, -72}, + {PPC::X22, -80}, + {PPC::X21, -88}, + {PPC::X20, -96}, + {PPC::X19, -104}, + {PPC::X18, -112}, + {PPC::X17, -120}, + {PPC::X16, -128}, + {PPC::X15, -136}, + {PPC::X14, -144}, + + // VRSAVE save area offset. + {PPC::VRSAVE, -4}, + + // Vector register save area + {PPC::V31, -16}, + {PPC::V30, -32}, + {PPC::V29, -48}, + {PPC::V28, -64}, + {PPC::V27, -80}, + {PPC::V26, -96}, + {PPC::V25, -112}, + {PPC::V24, -128}, + {PPC::V23, -144}, + {PPC::V22, -160}, + {PPC::V21, -176}, + {PPC::V20, -192}}; + + if (Subtarget.isPPC64()) { + NumEntries = array_lengthof(Offsets64); + + return Offsets64; + } else { + NumEntries = array_lengthof(Offsets); + + return Offsets; + } +} + +/// RemoveVRSaveCode - We have found that this function does not need any code +/// to manipulate the VRSAVE register, even though it uses vector registers. +/// This can happen when the only registers used are known to be live in or out +/// of the function. Remove all of the VRSAVE related code from the function. +/// FIXME: The removal of the code results in a compile failure at -O0 when the +/// function contains a function call, as the GPR containing original VRSAVE +/// contents is spilled and reloaded around the call. Without the prolog code, +/// the spill instruction refers to an undefined register. 
This code needs +/// to account for all uses of that GPR. +static void RemoveVRSaveCode(MachineInstr *MI) { + MachineBasicBlock *Entry = MI->getParent(); + MachineFunction *MF = Entry->getParent(); + + // We know that the MTVRSAVE instruction immediately follows MI. Remove it. + MachineBasicBlock::iterator MBBI = MI; + ++MBBI; + assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE); + MBBI->eraseFromParent(); + + bool RemovedAllMTVRSAVEs = true; + // See if we can find and remove the MTVRSAVE instruction from all of the + // epilog blocks. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { + // If last instruction is a return instruction, add an epilogue + if (!I->empty() && I->back().isReturn()) { + bool FoundIt = false; + for (MBBI = I->end(); MBBI != I->begin(); ) { + --MBBI; + if (MBBI->getOpcode() == PPC::MTVRSAVE) { + MBBI->eraseFromParent(); // remove it. + FoundIt = true; + break; + } + } + RemovedAllMTVRSAVEs &= FoundIt; + } + } + + // If we found and removed all MTVRSAVE instructions, remove the read of + // VRSAVE as well. + if (RemovedAllMTVRSAVEs) { + MBBI = MI; + assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?"); + --MBBI; + assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?"); + MBBI->eraseFromParent(); + } + + // Finally, nuke the UPDATE_VRSAVE. + MI->eraseFromParent(); +} + +// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the +// instruction selector. Based on the vector registers that have been used, +// transform this into the appropriate ORI instruction. +static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { + MachineFunction *MF = MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + DebugLoc dl = MI->getDebugLoc(); + + unsigned UsedRegMask = 0; + for (unsigned i = 0; i != 32; ++i) + if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + UsedRegMask |= 1 << (31-i); + + // Live in and live out values already must be in the mask, so don't bother + // marking them. + for (MachineRegisterInfo::livein_iterator + I = MF->getRegInfo().livein_begin(), + E = MF->getRegInfo().livein_end(); I != E; ++I) { + unsigned RegNo = TRI->getEncodingValue(I->first); + if (VRRegNo[RegNo] == I->first) // If this really is a vector reg. + UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. + } + + // Live out registers appear as use operands on return instructions. + for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); + UsedRegMask != 0 && BI != BE; ++BI) { + const MachineBasicBlock &MBB = *BI; + if (MBB.empty() || !MBB.back().isReturn()) + continue; + const MachineInstr &Ret = MBB.back(); + for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { + const MachineOperand &MO = Ret.getOperand(I); + if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg())) + continue; + unsigned RegNo = TRI->getEncodingValue(MO.getReg()); + UsedRegMask &= ~(1 << (31-RegNo)); + } + } + + // If no registers are used, turn this into a copy. + if (UsedRegMask == 0) { + // Remove all VRSAVE code. 
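+    // (RemoveVRSaveCode deletes the UPDATE_VRSAVE itself, the MFVRSAVE
+    // that feeds it, and the MTVRSAVE copies in the epilog blocks.)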
+ RemoveVRSaveCode(MI); + return; + } + + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + + if ((UsedRegMask & 0xFFFF) == UsedRegMask) { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask); + } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask >> 16); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask >> 16); + } else { + if (DstReg != SrcReg) + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg) + .addImm(UsedRegMask >> 16); + else + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) + .addReg(SrcReg, RegState::Kill) + .addImm(UsedRegMask >> 16); + + BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg) + .addReg(DstReg, RegState::Kill) + .addImm(UsedRegMask & 0xFFFF); + } + + // Remove the old UPDATE_VRSAVE instruction. + MI->eraseFromParent(); +} + +static bool spillsCR(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->isCRSpilled(); +} + +static bool spillsVRSAVE(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->isVRSAVESpilled(); +} + +static bool hasSpills(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->hasSpills(); +} + +static bool hasNonRISpills(const MachineFunction &MF) { + const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + return FuncInfo->hasNonRISpills(); +} + +/// determineFrameLayout - Determine the size of the frame and maximum call +/// frame size. +unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, + bool UpdateMF, + bool UseEstimate) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes to allocate from the FrameInfo + unsigned FrameSize = + UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize(); + + // Get stack alignments. The frame must be aligned to the greatest of these: + unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI + unsigned MaxAlign = MFI->getMaxAlignment(); // algmt required by data in frame + unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1; + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + + // If we are a leaf function, and use up to 224 bytes of stack space, + // don't have a frame pointer, calls, or dynamic alloca then we do not need + // to adjust the stack pointer (we fit in the Red Zone). + // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate + // stackless code if all local vars are reg-allocated. + bool DisableRedZone = MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone); + if (!DisableRedZone && + (Subtarget.isPPC64() || // 32-bit SVR4, no stack- + !Subtarget.isSVR4ABI() || // allocated locals. + FrameSize == 0) && + FrameSize <= 224 && // Fits in red zone. + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. + !RegInfo->hasBasePointer(MF)) { // No special alignment. 
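+    // For example, a leaf function whose locals are all register-allocated
+    // and whose spill area fits in 224 bytes can live entirely in the Red
+    // Zone below the stack pointer, so no stack adjustment is emitted.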
+ // No need for frame + if (UpdateMF) + MFI->setStackSize(0); + return 0; + } + + // Get the maximum call frame size of all the calls. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + + // Maximum call frame needs to be at least big enough for linkage area. + unsigned minCallFrameSize = getLinkageSize(Subtarget.isPPC64(), + Subtarget.isDarwinABI(), + Subtarget.isELFv2ABI()); + maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize); + + // If we have dynamic alloca then maxCallFrameSize needs to be aligned so + // that allocations will be aligned. + if (MFI->hasVarSizedObjects()) + maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask; + + // Update maximum call frame size. + if (UpdateMF) + MFI->setMaxCallFrameSize(maxCallFrameSize); + + // Include call frame size in total. + FrameSize += maxCallFrameSize; + + // Make sure the frame is aligned. + FrameSize = (FrameSize + AlignMask) & ~AlignMask; + + // Update frame info. + if (UpdateMF) + MFI->setStackSize(FrameSize); + + return FrameSize; +} + +// hasFP - Return true if the specified function actually has a dedicated frame +// pointer register. +bool PPCFrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // FIXME: This is pretty much broken by design: hasFP() might be called really + // early, before the stack layout was calculated and thus hasFP() might return + // true or false here depending on the time of call. + return (MFI->getStackSize()) && needsFP(MF); +} + +// needsFP - Return true if the specified function should have a dedicated frame +// pointer register. This is true if the function has variable sized allocas or +// if frame pointer elimination is disabled. +bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Naked functions have no stack frame pushed, so we don't have a frame + // pointer. + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) + return false; + + return MF.getTarget().Options.DisableFramePointerElim(MF) || + MFI->hasVarSizedObjects() || + (MF.getTarget().Options.GuaranteedTailCallOpt && + MF.getInfo<PPCFunctionInfo>()->hasFastCall()); +} + +void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { + bool is31 = needsFP(MF); + unsigned FPReg = is31 ? PPC::R31 : PPC::R1; + unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1; + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + bool HasBP = RegInfo->hasBasePointer(MF); + unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg; + unsigned BP8Reg = HasBP ? 
(unsigned) PPC::X30 : FPReg; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) + for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) { + --MBBI; + for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) { + MachineOperand &MO = MBBI->getOperand(I); + if (!MO.isReg()) + continue; + + switch (MO.getReg()) { + case PPC::FP: + MO.setReg(FPReg); + break; + case PPC::FP8: + MO.setReg(FP8Reg); + break; + case PPC::BP: + MO.setReg(BPReg); + break; + case PPC::BP8: + MO.setReg(BP8Reg); + break; + + } + } + } +} + +void PPCFrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo()); + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + DebugLoc dl; + bool needsFrameMoves = MMI.hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; + + // Get processor type. + bool isPPC64 = Subtarget.isPPC64(); + // Get the ABI. + bool isDarwinABI = Subtarget.isDarwinABI(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); + assert((isDarwinABI || isSVR4ABI) && + "Currently only Darwin and SVR4 ABIs are supported for PowerPC."); + + // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, + // process it. + if (!isSVR4ABI) + for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { + if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { + HandleVRSaveUpdate(MBBI, TII); + break; + } + } + + // Move MBBI back to the beginning of the function. + MBBI = MBB.begin(); + + // Work out frame sizes. + unsigned FrameSize = determineFrameLayout(MF); + int NegFrameSize = -FrameSize; + if (!isInt<32>(NegFrameSize)) + llvm_unreachable("Unhandled stack size!"); + + if (MFI->isFrameAddressTaken()) + replaceFPWithRealFP(MF); + + // Check if the link register (LR) must be saved. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + bool MustSaveLR = FI->mustSaveLR(); + const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + // Do we have a frame pointer and/or base pointer for this function? + bool HasFP = hasFP(MF); + bool HasBP = RegInfo->hasBasePointer(MF); + + unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; + unsigned BPReg = RegInfo->getBaseRegister(MF); + unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; + unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; + unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg + // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) + const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8 + : PPC::MFLR ); + const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD + : PPC::STW ); + const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU + : PPC::STWU ); + const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX + : PPC::STWUX); + const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8 + : PPC::LIS ); + const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8 + : PPC::ORI ); + const MCInstrDesc& OrInst = TII.get(isPPC64 ? 
PPC::OR8 + : PPC::OR ); + const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8 + : PPC::SUBFC); + const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8 + : PPC::SUBFIC); + + // Regarding this assert: Even though LR is saved in the caller's frame (i.e., + // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no + // Red Zone, an asynchronous event (a form of "callee") could claim a frame & + // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR. + assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && + "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); + + int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI); + + int FPOffset = 0; + if (HasFP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = FFI->getObjectOffset(FPIndex); + } else { + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + } + } + + int BPOffset = 0; + if (HasBP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int BPIndex = FI->getBasePointerSaveIndex(); + assert(BPIndex && "No Base Pointer Save Slot!"); + BPOffset = FFI->getObjectOffset(BPIndex); + } else { + BPOffset = + PPCFrameLowering::getBasePointerSaveOffset(isPPC64, + isDarwinABI, + isPIC); + } + } + + // Get stack alignments. + unsigned MaxAlign = MFI->getMaxAlignment(); + if (HasBP && MaxAlign > 1) + assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) && + "Invalid alignment!"); + + // Frames of 32KB & larger require special handling because they cannot be + // indexed into with a simple STDU/STWU/STD/STW immediate offset operand. + bool isLargeFrame = !isInt<16>(NegFrameSize); + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg); + + assert((isPPC64 || MustSaveCRs.empty()) && + "Prologue CR saving supported only in 64-bit mode"); + + if (!MustSaveCRs.empty()) { // will only occur for PPC64 + // FIXME: In the ELFv2 ABI, we are not required to save all CR fields. + // If only one or two CR fields are clobbered, it could be more + // efficient to use mfocrf to selectively save just those fields. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg); + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill); + } + + if (HasFP) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(FPReg) + .addImm(FPOffset) + .addReg(SPReg); + + if (HasBP) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(BPReg) + .addImm(BPOffset) + .addReg(SPReg); + + if (MustSaveLR) + // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe. + BuildMI(MBB, MBBI, dl, StoreInst) + .addReg(ScratchReg) + .addImm(LROffset) + .addReg(SPReg); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8)) + .addReg(TempReg, getKillRegState(true)) + .addImm(8) + .addReg(SPReg); + + // Skip the rest if this is a leaf function & all spills fit in the Red Zone. + if (!FrameSize) return; + + // Adjust stack pointer: r1 += NegFrameSize. + // If there is a preferred stack alignment, align R1 now + + if (HasBP) { + // Save a copy of r1 as the base pointer. 
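+    // ("or BPReg, SP, SP" is the canonical PowerPC register copy: or-ing a
+    // register with itself is how "mr" is encoded.)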
+ BuildMI(MBB, MBBI, dl, OrInst, BPReg) + .addReg(SPReg) + .addReg(SPReg); + } + + if (HasBP && MaxAlign > 1) { + if (isPPC64) + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(64 - Log2_32(MaxAlign)); + else // PPC32... + BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg) + .addReg(SPReg) + .addImm(0) + .addImm(32 - Log2_32(MaxAlign)) + .addImm(31); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, TempReg) + .addReg(TempReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(TempReg, RegState::Kill); + } + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + + } else if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg) + .addReg(SPReg) + .addImm(NegFrameSize) + .addReg(SPReg); + + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(NegFrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(NegFrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg) + .addReg(SPReg, RegState::Kill) + .addReg(SPReg) + .addReg(ScratchReg); + } + + // Add the "machine moves" for the instructions we generated above, but in + // reverse order. + if (needsFrameMoves) { + // Show update of SP. + assert(NegFrameSize); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + if (HasFP) { + unsigned Reg = MRI->getDwarfRegNum(FPReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, FPOffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (HasBP) { + unsigned Reg = MRI->getDwarfRegNum(BPReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, BPOffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + if (MustSaveLR) { + unsigned Reg = MRI->getDwarfRegNum(LRReg, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, LROffset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + // If there is a frame pointer, copy R1 into R31 + if (HasFP) { + BuildMI(MBB, MBBI, dl, OrInst, FPReg) + .addReg(SPReg) + .addReg(SPReg); + + if (needsFrameMoves) { + // Mark effective beginning of when frame pointer is ready. + unsigned Reg = MRI->getDwarfRegNum(FPReg, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaRegister(nullptr, Reg)); + + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + if (needsFrameMoves) { + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue; + + // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just + // subregisters of CR2. 
We just need to emit a move of CR2. + if (PPC::CRBITRCRegClass.contains(Reg)) + continue; + + // For SVR4, don't emit a move for the CR spill slot if we haven't + // spilled CRs. + if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4) + && MustSaveCRs.empty()) + continue; + + // For 64-bit SVR4 when we have spilled CRs, the spill location + // is SP+8, not a frame-relative slot. + if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) { + // In the ELFv1 ABI, only CR2 is noted in CFI and stands in for + // the whole CR word. In the ELFv2 ABI, every CR that was + // actually saved gets its own CFI record. + unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2; + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(CRReg, true), 8)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + continue; + } + + int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx()); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, true), Offset)); + BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } +} + +void PPCFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI != MBB.end() && "Returning block has no terminator"); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo()); + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl; + + assert((RetOpcode == PPC::BLR || + RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8) && + "Can only insert epilog into returning blocks"); + + // Get alignment info so we know how to restore the SP. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Get the number of bytes allocated from the FrameInfo. + int FrameSize = MFI->getStackSize(); + + // Get processor type. + bool isPPC64 = Subtarget.isPPC64(); + // Get the ABI. + bool isDarwinABI = Subtarget.isDarwinABI(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; + + // Check if the link register (LR) has been saved. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + bool MustSaveLR = FI->mustSaveLR(); + const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs(); + // Do we have a frame pointer and/or base pointer for this function? + bool HasFP = hasFP(MF); + bool HasBP = RegInfo->hasBasePointer(MF); + + unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; + unsigned BPReg = RegInfo->getBaseRegister(MF); + unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; + unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg + const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8 + : PPC::MTLR ); + const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD + : PPC::LWZ ); + const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8 + : PPC::LIS ); + const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8 + : PPC::ORI ); + const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8 + : PPC::ADDI ); + const MCInstrDesc& AddInst = TII.get( isPPC64 ? 
PPC::ADD8 + : PPC::ADD4 ); + + int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI); + + int FPOffset = 0; + if (HasFP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = FFI->getObjectOffset(FPIndex); + } else { + FPOffset = + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + } + } + + int BPOffset = 0; + if (HasBP) { + if (isSVR4ABI) { + MachineFrameInfo *FFI = MF.getFrameInfo(); + int BPIndex = FI->getBasePointerSaveIndex(); + assert(BPIndex && "No Base Pointer Save Slot!"); + BPOffset = FFI->getObjectOffset(BPIndex); + } else { + BPOffset = + PPCFrameLowering::getBasePointerSaveOffset(isPPC64, + isDarwinABI, + isPIC); + } + } + + bool UsesTCRet = RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8; + + if (UsesTCRet) { + int MaxTCRetDelta = FI->getTailCallSPDelta(); + MachineOperand &StackAdjust = MBBI->getOperand(1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int Delta = StackAdj - MaxTCRetDelta; + assert((Delta >= 0) && "Delta must be positive"); + if (MaxTCRetDelta>0) + FrameSize += (StackAdj +Delta); + else + FrameSize += StackAdj; + } + + // Frames of 32KB & larger require special handling because they cannot be + // indexed into with a simple LD/LWZ immediate offset operand. + bool isLargeFrame = !isInt<16>(FrameSize); + + if (FrameSize) { + // In the prologue, the loaded (or persistent) stack pointer value is offset + // by the STDU/STDUX/STWU/STWUX instruction. Add this offset back now. + + // If this function contained a fastcc call and GuaranteedTailCallOpt is + // enabled (=> hasFastCall()==true) the fastcc call might contain a tail + // call which invalidates the stack pointer value in SP(0). So we use the + // value of R31 in this case. 
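+    // (needsFP() returns true whenever GuaranteedTailCallOpt is set and the
+    // function made a fast call, so a frame pointer is guaranteed here; see
+    // the assert below.)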
+ if (FI->hasFastCall()) { + assert(HasFP && "Expecting a valid frame pointer."); + if (!isLargeFrame) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(FPReg).addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(FrameSize >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(FrameSize & 0xFFFF); + BuildMI(MBB, MBBI, dl, AddInst) + .addReg(SPReg) + .addReg(FPReg) + .addReg(ScratchReg); + } + } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg) + .addImm(FrameSize); + } else { + BuildMI(MBB, MBBI, dl, LoadInst, SPReg) + .addImm(0) + .addReg(SPReg); + } + + } + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg) + .addImm(LROffset) + .addReg(SPReg); + + assert((isPPC64 || MustSaveCRs.empty()) && + "Epilogue CR restoring supported only in 64-bit mode"); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg) + .addImm(8) + .addReg(SPReg); + + if (HasFP) + BuildMI(MBB, MBBI, dl, LoadInst, FPReg) + .addImm(FPOffset) + .addReg(SPReg); + + if (HasBP) + BuildMI(MBB, MBBI, dl, LoadInst, BPReg) + .addImm(BPOffset) + .addReg(SPReg); + + if (!MustSaveCRs.empty()) // will only occur for PPC64 + for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i) + BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i]) + .addReg(TempReg, getKillRegState(i == e-1)); + + if (MustSaveLR) + BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg); + + // Callee pop calling convention. Pop parameter/linkage area. Used for tail + // call optimization + if (MF.getTarget().Options.GuaranteedTailCallOpt && RetOpcode == PPC::BLR && + MF.getFunction()->getCallingConv() == CallingConv::Fast) { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned CallerAllocatedAmt = FI->getMinReservedArea(); + + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg).addImm(CallerAllocatedAmt); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + .addImm(CallerAllocatedAmt >> 16); + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addImm(CallerAllocatedAmt & 0xFFFF); + BuildMI(MBB, MBBI, dl, AddInst) + .addReg(SPReg) + .addReg(FPReg) + .addReg(ScratchReg); + } + } else if (RetOpcode == PPC::TCRETURNdi) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); + } else if (RetOpcode == PPC::TCRETURNai) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); + } else if (RetOpcode == PPC::TCRETURNdi8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). 
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri8) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); + } else if (RetOpcode == PPC::TCRETURNai8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } +} + +/// MustSaveLR - Return true if this function requires that we save the LR +/// register onto the stack in the prolog and restore it in the epilog of the +/// function. +static bool MustSaveLR(const MachineFunction &MF, unsigned LR) { + const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>(); + + // We need a save/restore of LR if there is any def of LR (which is + // defined by calls, including the PIC setup sequence), or if there is + // some use of the LR stack slot (e.g. for builtin_return_address). + // (LR comes in 32 and 64 bit versions.) + MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR); + return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired(); +} + +void +PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *) const { + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + + // Save and clear the LR state. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned LR = RegInfo->getRARegister(); + FI->setMustSaveLR(MustSaveLR(MF, LR)); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.setPhysRegUnused(LR); + + // Save R31 if necessary + int FPSI = FI->getFramePointerSaveIndex(); + bool isPPC64 = Subtarget.isPPC64(); + bool isDarwinABI = Subtarget.isDarwinABI(); + bool isPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // If the frame pointer save index hasn't been defined yet. + if (!FPSI && needsFP(MF)) { + // Find out what the fix offset of the frame pointer save area. + int FPOffset = getFramePointerSaveOffset(isPPC64, isDarwinABI); + // Allocate the frame index for frame pointer save area. + FPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true); + // Save the result. + FI->setFramePointerSaveIndex(FPSI); + } + + int BPSI = FI->getBasePointerSaveIndex(); + if (!BPSI && RegInfo->hasBasePointer(MF)) { + int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI, isPIC); + // Allocate the frame index for the base pointer save area. + BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true); + // Save the result. + FI->setBasePointerSaveIndex(BPSI); + } + + // Reserve stack space to move the linkage area to in case of a tail call. + int TCSPDelta = 0; + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = FI->getTailCallSPDelta()) < 0) { + MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true); + } + + // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the + // function uses CR 2, 3, or 4. + if (!isPPC64 && !isDarwinABI && + (MRI.isPhysRegUsed(PPC::CR2) || + MRI.isPhysRegUsed(PPC::CR3) || + MRI.isPhysRegUsed(PPC::CR4))) { + int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true); + FI->setCRSpillFrameIndex(FrameIdx); + } +} + +void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const { + // Early exit if not using the SVR4 ABI. 
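+  // (The overlapping fixed offsets adjusted below come from
+  // getCalleeSavedSpillSlots, which returns no entries for other ABIs.)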
+ if (!Subtarget.isSVR4ABI()) { + addScavengingSpillSlot(MF, RS); + return; + } + + // Get callee saved register information. + MachineFrameInfo *FFI = MF.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo(); + + // Early exit if no callee saved registers are modified! + if (CSI.empty() && !needsFP(MF)) { + addScavengingSpillSlot(MF, RS); + return; + } + + unsigned MinGPR = PPC::R31; + unsigned MinG8R = PPC::X31; + unsigned MinFPR = PPC::F31; + unsigned MinVR = PPC::V31; + + bool HasGPSaveArea = false; + bool HasG8SaveArea = false; + bool HasFPSaveArea = false; + bool HasVRSAVESaveArea = false; + bool HasVRSaveArea = false; + + SmallVector<CalleeSavedInfo, 18> GPRegs; + SmallVector<CalleeSavedInfo, 18> G8Regs; + SmallVector<CalleeSavedInfo, 18> FPRegs; + SmallVector<CalleeSavedInfo, 18> VRegs; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (PPC::GPRCRegClass.contains(Reg)) { + HasGPSaveArea = true; + + GPRegs.push_back(CSI[i]); + + if (Reg < MinGPR) { + MinGPR = Reg; + } + } else if (PPC::G8RCRegClass.contains(Reg)) { + HasG8SaveArea = true; + + G8Regs.push_back(CSI[i]); + + if (Reg < MinG8R) { + MinG8R = Reg; + } + } else if (PPC::F8RCRegClass.contains(Reg)) { + HasFPSaveArea = true; + + FPRegs.push_back(CSI[i]); + + if (Reg < MinFPR) { + MinFPR = Reg; + } + } else if (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)) { + ; // do nothing, as we already know whether CRs are spilled + } else if (PPC::VRSAVERCRegClass.contains(Reg)) { + HasVRSAVESaveArea = true; + } else if (PPC::VRRCRegClass.contains(Reg)) { + HasVRSaveArea = true; + + VRegs.push_back(CSI[i]); + + if (Reg < MinVR) { + MinVR = Reg; + } + } else { + llvm_unreachable("Unknown RegisterClass!"); + } + } + + PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>(); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + + int64_t LowerBound = 0; + + // Take into account stack space reserved for tail calls. + int TCSPDelta = 0; + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (TCSPDelta = PFI->getTailCallSPDelta()) < 0) { + LowerBound = TCSPDelta; + } + + // The Floating-point register save area is right below the back chain word + // of the previous stack frame. + if (HasFPSaveArea) { + for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) { + int FI = FPRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8; + } + + // Check whether the frame pointer register is allocated. If so, make sure it + // is spilled to the correct offset. + if (needsFP(MF)) { + HasGPSaveArea = true; + + int FI = PFI->getFramePointerSaveIndex(); + assert(FI && "No Frame Pointer Save Slot!"); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + const PPCRegisterInfo *RegInfo = + static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo()); + if (RegInfo->hasBasePointer(MF)) { + HasGPSaveArea = true; + + int FI = PFI->getBasePointerSaveIndex(); + assert(FI && "No Base Pointer Save Slot!"); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + // General register save area starts right below the Floating-point + // register save area. + if (HasGPSaveArea || HasG8SaveArea) { + // Move general register save area spill slots down, taking into account + // the size of the Floating-point register save area. 
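+    // (At this point LowerBound has already been moved below the tail-call
+    // reserved area and the floating-point save area handled above.)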
+ for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) { + int FI = GPRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + // Move general register save area spill slots down, taking into account + // the size of the Floating-point register save area. + for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) { + int FI = G8Regs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + + unsigned MinReg = + std::min<unsigned>(TRI->getEncodingValue(MinGPR), + TRI->getEncodingValue(MinG8R)); + + if (Subtarget.isPPC64()) { + LowerBound -= (31 - MinReg + 1) * 8; + } else { + LowerBound -= (31 - MinReg + 1) * 4; + } + } + + // For 32-bit only, the CR save area is below the general register + // save area. For 64-bit SVR4, the CR save area is addressed relative + // to the stack pointer and hence does not need an adjustment here. + // Only CR2 (the first nonvolatile spilled) has an associated frame + // index so that we have a single uniform save area. + if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) { + // Adjust the frame index of the CR spill slot. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2) + // Leave Darwin logic as-is. + || (!Subtarget.isSVR4ABI() && + (PPC::CRBITRCRegClass.contains(Reg) || + PPC::CRRCRegClass.contains(Reg)))) { + int FI = CSI[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + LowerBound -= 4; // The CR save area is always 4 bytes long. + } + + if (HasVRSAVESaveArea) { + // FIXME SVR4: Is it actually possible to have multiple elements in CSI + // which have the VRSAVE register class? + // Adjust the frame index of the VRSAVE spill slot. + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + if (PPC::VRSAVERCRegClass.contains(Reg)) { + int FI = CSI[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + LowerBound -= 4; // The VRSAVE save area is always 4 bytes long. + } + + if (HasVRSaveArea) { + // Insert alignment padding, we need 16-byte alignment. + LowerBound = (LowerBound - 15) & ~(15); + + for (unsigned i = 0, e = VRegs.size(); i != e; ++i) { + int FI = VRegs[i].getFrameIdx(); + + FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI)); + } + } + + addScavengingSpillSlot(MF, RS); +} + +void +PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, + RegScavenger *RS) const { + // Reserve a slot closest to SP or frame pointer if we have a dynalloc or + // a large stack, which will require scavenging a register to materialize a + // large offset. + + // We need to have a scavenger spill slot for spills if the frame size is + // large. In case there is no free register for large-offset addressing, + // this slot is used for the necessary emergency spill. Also, we need the + // slot for dynamic stack allocations. + + // The scavenger might be invoked if the frame offset does not fit into + // the 16-bit immediate. We don't know the complete frame size here + // because we've not yet computed callee-saved register spills or the + // needed alignment padding. 
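+  // Use the estimating form (UpdateMF = false, UseEstimate = true) so this
+  // early query does not clobber the recorded frame size.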
+ unsigned StackSize = determineFrameLayout(MF, false, true); + MachineFrameInfo *MFI = MF.getFrameInfo(); + if (MFI->hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || + hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC; + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + // Might we have over-aligned allocas? + bool HasAlVars = MFI->hasVarSizedObjects() && + MFI->getMaxAlignment() > getStackAlignment(); + + // These kinds of spills might need two registers. + if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) + RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + + } +} + +bool +PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + + // Currently, this function only handles SVR4 32- and 64-bit ABIs. + // Return false otherwise to maintain pre-existing behavior. + if (!Subtarget.isSVR4ABI()) + return false; + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo()); + DebugLoc DL; + bool CRSpilled = false; + MachineInstrBuilder CRMIB; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + // Only Darwin actually uses the VRSAVE register, but it can still appear + // here if, for example, @llvm.eh.unwind.init() is used. If we're not on + // Darwin, ignore it. + if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) + continue; + + // CR2 through CR4 are the nonvolatile CR fields. + bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; + + // Add the callee-saved register as live-in; it's killed at the spill. + MBB.addLiveIn(Reg); + + if (CRSpilled && IsCRField) { + CRMIB.addReg(Reg, RegState::ImplicitKill); + continue; + } + + // Insert the spill to the stack frame. + if (IsCRField) { + PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>(); + if (Subtarget.isPPC64()) { + // The actual spill will happen at the start of the prologue. + FuncInfo->addMustSaveCR(Reg); + } else { + CRSpilled = true; + FuncInfo->setSpillsCR(); + + // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have + // the same frame index in PPCRegisterInfo::hasReservedSpillSlot. + CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12) + .addReg(Reg, RegState::ImplicitKill); + + MBB.insert(MI, CRMIB); + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW)) + .addReg(PPC::R12, + getKillRegState(true)), + CSI[i].getFrameIdx())); + } + } else { + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, + CSI[i].getFrameIdx(), RC, TRI); + } + } + return true; +} + +static void +restoreCRs(bool isPPC64, bool is31, + bool CR2Spilled, bool CR3Spilled, bool CR4Spilled, + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) { + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo()); + DebugLoc DL; + unsigned RestoreOp, MoveReg; + + if (isPPC64) + // This is handled during epilogue generation. 
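+    // (For 64-bit, emitEpilogue reloads the saved CR word with LWZ8 and
+    // restores the fields with MTOCRF8.)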
+ return; + else { + // 32-bit: FP-relative + MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ), + PPC::R12), + CSI[CSIIndex].getFrameIdx())); + RestoreOp = PPC::MTOCRF; + MoveReg = PPC::R12; + } + + if (CR2Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2) + .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled))); + + if (CR3Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3) + .addReg(MoveReg, getKillRegState(!CR4Spilled))); + + if (CR4Spilled) + MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4) + .addReg(MoveReg, getKillRegState(true))); +} + +void PPCFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo()); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + I->getOpcode() == PPC::ADJCALLSTACKUP) { + // Add (actually subtract) back the amount the callee popped on return. + if (int CalleeAmt = I->getOperand(1).getImm()) { + bool is64Bit = Subtarget.isPPC64(); + CalleeAmt *= -1; + unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1; + unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0; + unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI; + unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4; + unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS; + unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI; + MachineInstr *MI = I; + DebugLoc dl = MI->getDebugLoc(); + + if (isInt<16>(CalleeAmt)) { + BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg) + .addReg(StackReg, RegState::Kill) + .addImm(CalleeAmt); + } else { + MachineBasicBlock::iterator MBBI = I; + BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg) + .addImm(CalleeAmt >> 16); + BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg) + .addReg(TmpReg, RegState::Kill) + .addImm(CalleeAmt & 0xFFFF); + BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg) + .addReg(StackReg, RegState::Kill) + .addReg(TmpReg); + } + } + } + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + +bool +PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + + // Currently, this function only handles SVR4 32- and 64-bit ABIs. + // Return false otherwise to maintain pre-existing behavior. + if (!Subtarget.isSVR4ABI()) + return false; + + MachineFunction *MF = MBB.getParent(); + const PPCInstrInfo &TII = + *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo()); + bool CR2Spilled = false; + bool CR3Spilled = false; + bool CR4Spilled = false; + unsigned CSIIndex = 0; + + // Initialize insertion-point logic; we will be restoring in reverse + // order of spill. + MachineBasicBlock::iterator I = MI, BeforeI = I; + bool AtStart = I == MBB.begin(); + + if (!AtStart) + --BeforeI; + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + + // Only Darwin actually uses the VRSAVE register, but it can still appear + // here if, for example, @llvm.eh.unwind.init() is used. If we're not on + // Darwin, ignore it. + if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI()) + continue; + + if (Reg == PPC::CR2) { + CR2Spilled = true; + // The spill slot is associated only with CR2, which is the + // first nonvolatile spilled. Save it here. 
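+      // (CSIIndex is passed to restoreCRs below to locate the frame index
+      // shared by CR2, CR3 and CR4.)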
+      CSIIndex = i;
+      continue;
+    } else if (Reg == PPC::CR3) {
+      CR3Spilled = true;
+      continue;
+    } else if (Reg == PPC::CR4) {
+      CR4Spilled = true;
+      continue;
+    } else {
+      // When we first encounter a non-CR register after seeing at
+      // least one CR register, restore all spilled CRs together.
+      if ((CR2Spilled || CR3Spilled || CR4Spilled)
+          && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        bool is31 = needsFP(*MF);
+        restoreCRs(Subtarget.isPPC64(), is31,
+                   CR2Spilled, CR3Spilled, CR4Spilled,
+                   MBB, I, CSI, CSIIndex);
+        CR2Spilled = CR3Spilled = CR4Spilled = false;
+      }
+
+      // Default behavior for non-CR saves.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB.begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+    }
+
+    // Insert in reverse order.
+    if (AtStart)
+      I = MBB.begin();
+    else {
+      I = BeforeI;
+      ++I;
+    }
+  }
+
+  // If we haven't yet restored the CRs, do so now.
+  if (CR2Spilled || CR3Spilled || CR4Spilled) {
+    bool is31 = needsFP(*MF);
+    restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
+               MBB, I, CSI, CSIIndex);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 000000000000..c0c7d248f8d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,126 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering {
+  const PPCSubtarget &Subtarget;
+
+public:
+  PPCFrameLowering(const PPCSubtarget &STI);
+
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
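+  /// emitPrologue is also responsible for emitting the CFI instructions
+  /// used for unwind tables and debug frame information.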
+ void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + bool hasFP(const MachineFunction &MF) const override; + bool needsFP(const MachineFunction &MF) const; + void replaceFPWithRealFP(MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = nullptr) const override; + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS = nullptr) const override; + void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const override; + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + bool targetHandlesStackFrameRounding() const override { return true; } + + /// getReturnSaveOffset - Return the previous frame offset to save the + /// return address. + static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) { + if (isDarwinABI) + return isPPC64 ? 16 : 8; + // SVR4 ABI: + return isPPC64 ? 16 : 4; + } + + /// getTOCSaveOffset - Return the previous frame offset to save the + /// TOC register -- 64-bit SVR4 ABI only. + static unsigned getTOCSaveOffset(bool isELFv2ABI) { + return isELFv2ABI ? 24 : 40; + } + + /// getFramePointerSaveOffset - Return the previous frame offset to save the + /// frame pointer. + static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) { + // For the Darwin ABI: + // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area + // for saving the frame pointer (if needed.) While the published ABI has + // not used this slot since at least MacOSX 10.2, there is older code + // around that does use it, and that needs to continue to work. + if (isDarwinABI) + return isPPC64 ? -8U : -4U; + + // SVR4 ABI: First slot in the general register save area. + return isPPC64 ? -8U : -4U; + } + + /// getBasePointerSaveOffset - Return the previous frame offset to save the + /// base pointer. + static unsigned getBasePointerSaveOffset(bool isPPC64, + bool isDarwinABI, + bool isPIC) { + if (isDarwinABI) + return isPPC64 ? -16U : -8U; + + // SVR4 ABI: First slot in the general register save area. + return isPPC64 ? -16U : isPIC ? -12U : -8U; + } + + /// getLinkageSize - Return the size of the PowerPC ABI linkage area. + /// + static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI, + bool isELFv2ABI) { + if (isDarwinABI || isPPC64) + return (isELFv2ABI ? 4 : 6) * (isPPC64 ? 
8 : 4); + + // SVR4 ABI: + return 8; + } + + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; +}; +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp new file mode 100644 index 000000000000..d9b242cad265 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -0,0 +1,433 @@ +//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on PowerPC processors. +// +//===----------------------------------------------------------------------===// + +#include "PPCHazardRecognizers.h" +#include "PPC.h" +#include "PPCInstrInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "pre-RA-sched" + +bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) { + // FIXME: Move this. + if (isBCTRAfterSet(SU)) + return true; + + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->mayLoad()) + return false; + + // SU is a load; for any predecessors in this dispatch group, that are stores, + // and with which we have an ordering dependency, return true. + for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || !PredMCID->mayStore()) + continue; + + if (!SU->Preds[i].isNormalMemory() && !SU->Preds[i].isBarrier()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +bool PPCDispatchGroupSBHazardRecognizer::isBCTRAfterSet(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (!MCID) + return false; + + if (!MCID->isBranch()) + return false; + + // SU is a branch; for any predecessors in this dispatch group, with which we + // have a data dependence and set the counter register, return true. + for (unsigned i = 0, ie = (unsigned) SU->Preds.size(); i != ie; ++i) { + const MCInstrDesc *PredMCID = DAG->getInstrDesc(SU->Preds[i].getSUnit()); + if (!PredMCID || PredMCID->getSchedClass() != PPC::Sched::IIC_SprMTSPR) + continue; + + if (SU->Preds[i].isCtrl()) + continue; + + for (unsigned j = 0, je = CurGroup.size(); j != je; ++j) + if (SU->Preds[i].getSUnit() == CurGroup[j]) + return true; + } + + return false; +} + +// FIXME: Remove this when we don't need this: +namespace llvm { namespace PPC { extern int getNonRecordFormOpcode(uint16_t); } } + +// FIXME: A lot of code in PPCDispatchGroupSBHazardRecognizer is P7 specific. + +bool PPCDispatchGroupSBHazardRecognizer::mustComeFirst(const MCInstrDesc *MCID, + unsigned &NSlots) { + // FIXME: Indirectly, this information is contained in the itinerary, and + // we should derive it from there instead of separately specifying it + // here. 
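+  // Determine how many dispatch slots this instruction consumes: one by
+  // default, two for cracked forms such as update-form loads/stores, and
+  // four for serializing forms like the load/store-conditional
+  // instructions and mtcr.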
+ unsigned IIC = MCID->getSchedClass(); + switch (IIC) { + default: + NSlots = 1; + break; + case PPC::Sched::IIC_IntDivW: + case PPC::Sched::IIC_IntDivD: + case PPC::Sched::IIC_LdStLoadUpd: + case PPC::Sched::IIC_LdStLDU: + case PPC::Sched::IIC_LdStLFDU: + case PPC::Sched::IIC_LdStLFDUX: + case PPC::Sched::IIC_LdStLHA: + case PPC::Sched::IIC_LdStLHAU: + case PPC::Sched::IIC_LdStLWA: + case PPC::Sched::IIC_LdStSTDU: + case PPC::Sched::IIC_LdStSTFDU: + NSlots = 2; + break; + case PPC::Sched::IIC_LdStLoadUpdX: + case PPC::Sched::IIC_LdStLDUX: + case PPC::Sched::IIC_LdStLHAUX: + case PPC::Sched::IIC_LdStLWARX: + case PPC::Sched::IIC_LdStLDARX: + case PPC::Sched::IIC_LdStSTDUX: + case PPC::Sched::IIC_LdStSTDCX: + case PPC::Sched::IIC_LdStSTWCX: + case PPC::Sched::IIC_BrMCRX: // mtcr + // FIXME: Add sync/isync (here and in the itinerary). + NSlots = 4; + break; + } + + // FIXME: record-form instructions need a different itinerary class. + if (NSlots == 1 && PPC::getNonRecordFormOpcode(MCID->getOpcode()) != -1) + NSlots = 2; + + switch (IIC) { + default: + // All multi-slot instructions must come first. + return NSlots > 1; + case PPC::Sched::IIC_BrCR: // cr logicals + case PPC::Sched::IIC_SprMFCR: + case PPC::Sched::IIC_SprMFCRF: + case PPC::Sched::IIC_SprMTSPR: + return true; + } +} + +ScheduleHazardRecognizer::HazardType +PPCDispatchGroupSBHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + if (Stalls == 0 && isLoadAfterStore(SU)) + return NoopHazard; + + return ScoreboardHazardRecognizer::getHazardType(SU, Stalls); +} + +bool PPCDispatchGroupSBHazardRecognizer::ShouldPreferAnother(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + unsigned NSlots; + if (MCID && mustComeFirst(MCID, NSlots) && CurSlots) + return true; + + return ScoreboardHazardRecognizer::ShouldPreferAnother(SU); +} + +unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { + // We only need to fill out a maximum of 5 slots here: The 6th slot could + // only be a second branch, and otherwise the next instruction will start a + // new group. + if (isLoadAfterStore(SU) && CurSlots < 6) { + unsigned Directive = + DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + // If we're using a special group-terminating nop, then we need only one. + if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || + Directive == PPC::DIR_PWR8 ) + return 1; + + return 5 - CurSlots; + } + + return ScoreboardHazardRecognizer::PreEmitNoops(SU); +} + +void PPCDispatchGroupSBHazardRecognizer::EmitInstruction(SUnit *SU) { + const MCInstrDesc *MCID = DAG->getInstrDesc(SU); + if (MCID) { + if (CurSlots == 5 || (MCID->isBranch() && CurBranches == 1)) { + CurGroup.clear(); + CurSlots = CurBranches = 0; + } else { + DEBUG(dbgs() << "**** Adding to dispatch group: SU(" << + SU->NodeNum << "): "); + DEBUG(DAG->dumpNode(SU)); + + unsigned NSlots; + bool MustBeFirst = mustComeFirst(MCID, NSlots); + + // If this instruction must come first, but does not, then it starts a + // new group. 
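+        // (A nonzero CurSlots means the group already has instructions in
+        // it, so the hardware would have forced a group break here.)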
+      if (MustBeFirst && CurSlots) {
+        CurSlots = CurBranches = 0;
+        CurGroup.clear();
+      }
+
+      CurSlots += NSlots;
+      CurGroup.push_back(SU);
+
+      if (MCID->isBranch())
+        ++CurBranches;
+    }
+  }
+
+  return ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void PPCDispatchGroupSBHazardRecognizer::AdvanceCycle() {
+  return ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("Bottom-up scheduling not supported");
+}
+
+void PPCDispatchGroupSBHazardRecognizer::Reset() {
+  CurGroup.clear();
+  CurSlots = CurBranches = 0;
+  return ScoreboardHazardRecognizer::Reset();
+}
+
+void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
+  unsigned Directive =
+    DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+  // If the group has now filled all of its slots, or if we're using a special
+  // group-terminating nop, the group is complete.
+  if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 ||
+      Directive == PPC::DIR_PWR8 || CurSlots == 6) {
+    CurGroup.clear();
+    CurSlots = CurBranches = 0;
+  } else {
+    CurGroup.push_back(nullptr);
+    ++CurSlots;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor. Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions. The PPC970 can dispatch a peak of four non-branch
+// instructions and one branch instruction per cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, and
+// some instructions fill an entire dispatch group. Additionally, only branches
+// can issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970. These
+// conditions cause large performance penalties due to the misprediction,
+// recovery, and replay logic that has to run. These cases include setting a
+// CTR and branching through it in the same dispatch group, and storing to an
+// address, then loading from the same address within a dispatch group. To
+// avoid these conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const ScheduleDAG &DAG)
+  : DAG(DAG) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DEBUG(errs() << "=== Start of dispatch group\n");
+  NumIssued = 0;
+
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  const MCInstrDesc &MCID = DAG.TII->get(Opcode);
+
+  isLoad  = MCID.mayLoad();
+  isStore = MCID.mayStore();
+
+  uint64_t TSFlags = MCID.TSFlags;
+
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from an address that was
+/// previously stored to, as tracked by the StoreValue/StoreOffset/StoreSize
+/// arrays, return true.
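+/// For example, a 4-byte store to [r3+8] (covering bytes 8..11) followed by a
+/// 2-byte load from [r3+10] (bytes 10..11) overlaps, so this returns true.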
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                      const Value *LoadValue) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i])
+      return true;
+
+    // Okay, we don't have an exact match; if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion, for example).
+    if (StoreValue[i] == LoadValue) {
+      // Okay, the base pointers match, so we have [c1+r] vs [c2+r]. Check
+      // to see if the load and store actually overlap.
+      if (StoreOffset[i] < LoadOffset) {
+        if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true;
+      } else {
+        if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return Hazard for any non-branch instruction that would
+/// terminate the dispatch group, and NoopHazard for any instruction that
+/// would not terminate the dispatch group but would cause a pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return NoHazard;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // Only a branch can issue as the last instruction in a group, so a
+    // non-branch instruction cannot take that slot.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && Opcode == PPC::BCTRL)
+    return NoopHazard;
+
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    if (isLoadOfStoredAddress(MO->getSize(),
+                              MO->getOffset(), MO->getValue()))
+      return NoopHazard;
+  }
+
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
+
+  // Track the address stored to.
+  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    StoreSize[NumStores] = MO->getSize();
+    StoreOffset[NumStores] = MO->getOffset();
+    StoreValue[NumStores] = MO->getValue();
+    ++NumStores;
+  }
+
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::Reset() {
+  EndDispatchGroup();
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 000000000000..23f76c16d138
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,102 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+/// PPCDispatchGroupSBHazardRecognizer - This class implements a
+/// scoreboard-based hazard recognizer for out-of-order PPC processors with
+/// dispatch-group hazards.
+class PPCDispatchGroupSBHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ScheduleDAG *DAG;
+  SmallVector<SUnit *, 7> CurGroup;
+  unsigned CurSlots, CurBranches;
+
+  bool isLoadAfterStore(SUnit *SU);
+  bool isBCTRAfterSet(SUnit *SU);
+  bool mustComeFirst(const MCInstrDesc *MCID, unsigned &NSlots);
+public:
+  PPCDispatchGroupSBHazardRecognizer(const InstrItineraryData *ItinData,
+                                     const ScheduleDAG *DAG_) :
+    ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
+    CurSlots(0), CurBranches(0) {}
+
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  bool ShouldPreferAnother(SUnit* SU) override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+  void Reset() override;
+  void EmitNoop() override;
+};
+
+/// PPCHazardRecognizer970 - This class defines a finite-state automaton that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor. This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties
+/// (e.g. setting the CTR register and then branching through it within a
+/// dispatch group, or storing to and then loading from the same address
+/// within a dispatch group).
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const ScheduleDAG &DAG;
+
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+
+  // Various things that can cause a structural hazard.
+
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+
+  // StoreValue/StoreOffset/StoreSize - Keep track of the address of any store.
+  // If we see a load from the same address (or one that aliases it), disallow
+  // the load. We can have up to four stores in one dispatch group, hence we
+  // track up to 4.
+  //
+  // NumStores is zero if we haven't seen any stores yet. We track the base
+  // value, offset, and size of each store, which covers both [r+r] and [r+i]
+  // addressing.
+  const Value *StoreValue[4];
+  int64_t StoreOffset[4];
+  uint64_t StoreSize[4];
+  unsigned NumStores;
+
+public:
+  PPCHazardRecognizer970(const ScheduleDAG &DAG);
+  virtual HazardType getHazardType(SUnit *SU, int Stalls) override;
+  virtual void EmitInstruction(SUnit *SU) override;
+  virtual void AdvanceCycle() override;
+  virtual void Reset() override;
+
+private:
+  /// EndDispatchGroup - Called when we are finishing a dispatch group.
+  ///
+  void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified PowerPC opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,
+                                  bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+
+  bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                             const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 000000000000..490f6d2bcd4d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,2205 @@
+//===-- PPCISelDAGToDAG.cpp - PPC pattern matching inst selector ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-codegen"
+
+// FIXME: Remove this once the bug has been fixed!
+cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
+cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
+
+namespace llvm {
+  void initializePPCDAGToDAGISelPass(PassRegistry&);
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class PPCDAGToDAGISel : public SelectionDAGISel {
+    const PPCTargetMachine &TM;
+    const PPCTargetLowering *PPCLowering;
+    const PPCSubtarget *PPCSubTarget;
+    unsigned GlobalBaseReg;
+  public:
+    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+      : SelectionDAGISel(tm), TM(tm),
+        PPCLowering(TM.getTargetLowering()),
+        PPCSubTarget(TM.getSubtargetImpl()) {
+      initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Make sure we re-emit a set of the global base reg if necessary.
+      GlobalBaseReg = 0;
+      PPCLowering = TM.getTargetLowering();
+      PPCSubTarget = TM.getSubtargetImpl();
+      SelectionDAGISel::runOnMachineFunction(MF);
+
+      if (!PPCSubTarget->isSVR4ABI())
+        InsertVRSaveCode(MF);
+
+      return true;
+    }
+
+    void PostprocessISelDAG() override;
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, PPCLowering->getPointerTy());
+    }
+
+    /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
+    /// with any number of 0s on either side. The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+    /// 0x0F0F0000 is not, since all 1s are not contiguous.
+    static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// single rotate-and-mask operation.
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+
+    /// getGlobalBaseReg - Insert code into the entry MBB to materialize the
+    /// PIC base register. Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N) override;
+
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDValue N, SDValue &Disp,
+                       SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+    }
+
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field. Note that the operand at this point is already the
+    /// result of a prior SelectAddressRegImm call.
+    bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+      if (N.getOpcode() == ISD::TargetConstant ||
+          N.getOpcode() == ISD::TargetGlobalAddress) {
+        Out = N;
+        return true;
+      }
+
+      return false;
+    }
+
+    /// SelectAddrIdx - Given the specified address, check to see if it can be
+    /// represented as an indexed [r+r] operation. Returns false if it can
+    /// be represented by [r+imm], which is preferred.
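+    /// For example, (add r3, r4) can be matched as the indexed form [r3+r4],
+    /// whereas (add r3, 16) is rejected here because the displacement form
+    /// 16(r3) is preferred.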
+    bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrIdxOnly - Given the specified address, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmX4 - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement that is a multiple
+    /// of 4. Suitable for use by STD and friends.
+    bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+    }
+
+    // Select an address into a single register.
+    bool SelectAddr(SDValue N, SDValue &Base) {
+      Base = N;
+      return true;
+    }
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions. It is always correct to compute the value into
+    /// a register. The case of adding a (possibly relocatable) constant to a
+    /// register can be improved, but it is wrong to substitute Reg+Reg for
+    /// Reg in an asm, because the load or store opcode would have to change.
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override {
+      OutOps.push_back(Op);
+      return false;
+    }
+
+    void InsertVRSaveCode(MachineFunction &MF);
+
+    const char *getPassName() const override {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+    SDNode *SelectSETCC(SDNode *N);
+
+    void PeepholePPC64();
+    void PeepholeCROps();
+
+    bool AllUsersSelectZero(SDNode *N);
+    void SwapAllSelectUsers(SDNode *N);
+  };
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE. If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+  // Check to see if this function uses vector registers, which means we have
+  // to save and restore the VRSAVE register and update it with the regs we
+  // use.
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler. Detect them now.
+  bool HasVectorVReg = false;
+  for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  }
+  if (!HasVectorVReg) return;  // nothing to do.
+
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register. We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+ unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + + const TargetInstrInfo &TII = *TM.getInstrInfo(); + MachineBasicBlock &EntryBB = *Fn.begin(); + DebugLoc dl; + // Emit the following code into the entry block: + // InVRSAVE = MFVRSAVE + // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE + // MTVRSAVE UpdatedVRSAVE + MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point + BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE); + BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE), + UpdatedVRSAVE).addReg(InVRSAVE); + BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE); + + // Find all return blocks, outputting a restore in each epilog. + for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { + if (!BB->empty() && BB->back().isReturn()) { + IP = BB->end(); --IP; + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = IP; + while (I2 != BB->begin() && (--I2)->isTerminator()) + IP = I2; + + // Emit: MTVRSAVE InVRSave + BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE); + } + } +} + + +/// getGlobalBaseReg - Output the instructions required to put the +/// base address to use for accessing globals into a register. +/// +SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { + if (!GlobalBaseReg) { + const TargetInstrInfo &TII = *TM.getInstrInfo(); + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = MF->front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + DebugLoc dl; + + if (PPCLowering->getPointerTy() == MVT::i32) { + if (PPCSubTarget->isTargetELF()) + GlobalBaseReg = PPC::R30; + else + GlobalBaseReg = + RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg); + if (PPCSubTarget->isTargetELF()) { + unsigned TempReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); + BuildMI(FirstMBB, MBBI, dl, + TII.get(PPC::GetGBRO), TempReg).addReg(GlobalBaseReg); + BuildMI(FirstMBB, MBBI, dl, + TII.get(PPC::UpdateGBR)).addReg(GlobalBaseReg).addReg(TempReg); + MF->getInfo<PPCFunctionInfo>()->setUsesPICBase(true); + } + } else { + GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RC_NOX0RegClass); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8)); + BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg); + } + } + return CurDAG->getRegister(GlobalBaseReg, + PPCLowering->getPointerTy()).getNode(); +} + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. +static bool isIntS16Immediate(SDNode *N, short &Imm) { + if (N->getOpcode() != ISD::Constant) + return false; + + Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + if (N->getValueType(0) == MVT::i32) + return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + else + return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +} + +static bool isIntS16Immediate(SDValue Op, short &Imm) { + return isIntS16Immediate(Op.getNode(), Imm); +} + + +/// isInt32Immediate - This method tests to see if the node is a 32-bit constant +/// operand. If so Imm will receive the 32-bit value. 
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit
+/// constant operand. If so, Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if the value is a 32-bit
+// constant operand. If so, Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the 32-bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc
+         && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (!Val)
+    return false;
+
+  if (isShiftedMask_32(Val)) {
+    // look for the first non-zero bit
+    MB = countLeadingZeros(Val);
+    // look for the first zero bit after the run of ones
+    ME = countLeadingZeros((Val - 1) ^ Val);
+    return true;
+  } else {
+    Val = ~Val; // invert mask
+    if (isShiftedMask_32(Val)) {
+      // effectively look for the first zero bit
+      ME = countLeadingZeros(Val) - 1;
+      // effectively look for the first one bit after the run of zeros
+      MB = countLeadingZeros((Val - 1) ^ Val) + 1;
+      return true;
+    }
+  }
+  // no run present
+  return false;
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+                                      bool isShiftMask, unsigned &SH,
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift  = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
+  unsigned Opcode = N->getOpcode();
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+    return false;
+
+  if (Opcode == ISD::SHL) {
+    // apply shift left to mask if it comes first
+    if (isShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+  } else if (Opcode == ISD::SRL) {
+    // apply shift right to mask if it comes first
+    if (isShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // adjust for the left rotate
+    Shift = 32 - Shift;
+  } else if (Opcode == ISD::ROTL) {
+    Indeterminant = 0;
+  } else {
+    return false;
+  }
+
+  // if the mask doesn't intersect any Indeterminant bits
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap-arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
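+/// For example, (or (and x, 0xFFFFFF00), (and y, 0x000000FF)) can be selected
+/// as rlwimi with SH=0, MB=24, ME=31, inserting the low byte of y into x.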
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDLoc dl(N);
+
+  APInt LKZ, LKO, RKZ, RKO;
+  CurDAG->computeKnownBits(Op0, LKZ, LKO);
+  CurDAG->computeKnownBits(Op1, RKZ, RKO);
+
+  unsigned TargetMask = LKZ.getZExtValue();
+  unsigned InsertMask = RKZ.getZExtValue();
+
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to
+    // the RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+
+    unsigned MB, ME;
+    if (isRunOfOnes(InsertMask, MB, ME)) {
+      SDValue Tmp1, Tmp2;
+
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+        // The AND mask might not be a constant, and we need to make sure that
+        // if we're going to fold the masking with the insert, all bits not
+        // known to be zero in the mask are known to be one.
+        APInt MKZ, MKO;
+        CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
+        bool CanFoldMask = InsertMask == MKO.getZExtValue();
+
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          // Note that Value must be in range here (less than 32) because
+          // otherwise there would not be any bits set in InsertMask.
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        }
+      }
+
+      SH &= 31;
+      SDValue Ops[] = { Op0, Op1, getI32Imm(SH), getI32Imm(MB),
+                        getI32Imm(ME) };
+      return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
+    }
+  }
+  return nullptr;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
+                                  ISD::CondCode CC, SDLoc dl) {
+  // Always select the LHS.
+  unsigned Opc;
+
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+ if (isInt<16>((int)Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpw cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmplwi cr0,r0,0x5678 + // beq cr0,L6 + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS, + getI32Imm(Imm >> 16)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor, + getI32Imm(Imm & 0xFFFF)), 0); + } + Opc = PPC::CMPLW; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + Opc = PPC::CMPLW; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, + getI32Imm((int)SImm & 0xFFFF)), + 0); + Opc = PPC::CMPW; + } + } else if (LHS.getValueType() == MVT::i64) { + uint64_t Imm; + if (CC == ISD::SETEQ || CC == ISD::SETNE) { + if (isInt64Immediate(RHS.getNode(), Imm)) { + // SETEQ/SETNE comparison with 16-bit immediate, fold it. + if (isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + // If this is a 16-bit signed immediate, fold it. + if (isInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI32Imm(Imm & 0xFFFF)), 0); + + // For non-equality comparisons, the default code would materialize the + // constant, then compare against it, like this: + // lis r2, 4660 + // ori r2, r2, 22136 + // cmpd cr0, r3, r2 + // Since we are just comparing for equality, we can emit this instead: + // xoris r0,r3,0x1234 + // cmpldi cr0,r0,0x5678 + // beq cr0,L6 + if (isUInt<32>(Imm)) { + SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS, + getI64Imm(Imm >> 16)), 0); + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor, + getI64Imm(Imm & 0xFFFF)), 0); + } + } + Opc = PPC::CMPLD; + } else if (ISD::isUnsignedIntSetCC(CC)) { + if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS, + getI64Imm(Imm & 0xFFFF)), 0); + Opc = PPC::CMPLD; + } else { + short SImm; + if (isIntS16Immediate(RHS, SImm)) + return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, + getI64Imm(SImm & 0xFFFF)), + 0); + Opc = PPC::CMPD; + } + } else if (LHS.getValueType() == MVT::f32) { + Opc = PPC::FCMPUS; + } else { + assert(LHS.getValueType() == MVT::f64 && "Unknown vt!"); + Opc = PPCSubTarget->hasVSX() ? 
PPC::XSCMPUDP : PPC::FCMPUD; + } + return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); +} + +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { + switch (CC) { + case ISD::SETUEQ: + case ISD::SETONE: + case ISD::SETOLE: + case ISD::SETOGE: + llvm_unreachable("Should be lowered by legalize!"); + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOEQ: + case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETUNE: + case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETOLT: + case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETULE: + case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETOGT: + case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETUGE: + case ISD::SETGE: return PPC::PRED_GE; + case ISD::SETO: return PPC::PRED_NU; + case ISD::SETUO: return PPC::PRED_UN; + // These two are invalid for floating point. Assume we have int. + case ISD::SETULT: return PPC::PRED_LT; + case ISD::SETUGT: return PPC::PRED_GT; + } +} + +/// getCRIdxForSetCC - Return the index of the condition register field +/// associated with the SetCC condition, and whether or not the field is +/// treated as inverted. That is, lt = 0; ge = 0 inverted. +static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { + Invert = false; + switch (CC) { + default: llvm_unreachable("Unknown condition!"); + case ISD::SETOLT: + case ISD::SETLT: return 0; // Bit #0 = SETOLT + case ISD::SETOGT: + case ISD::SETGT: return 1; // Bit #1 = SETOGT + case ISD::SETOEQ: + case ISD::SETEQ: return 2; // Bit #2 = SETOEQ + case ISD::SETUO: return 3; // Bit #3 = SETUO + case ISD::SETUGE: + case ISD::SETGE: Invert = true; return 0; // !Bit #0 = SETUGE + case ISD::SETULE: + case ISD::SETLE: Invert = true; return 1; // !Bit #1 = SETULE + case ISD::SETUNE: + case ISD::SETNE: Invert = true; return 2; // !Bit #2 = SETUNE + case ISD::SETO: Invert = true; return 3; // !Bit #3 = SETO + case ISD::SETUEQ: + case ISD::SETOGE: + case ISD::SETOLE: + case ISD::SETONE: + llvm_unreachable("Invalid branch code: should be expanded by legalize"); + // These are invalid for floating point. Assume integer. + case ISD::SETULT: return 0; + case ISD::SETUGT: return 1; + } +} + +// getVCmpInst: return the vector compare instruction for the specified +// vector type and condition code. Since this is for altivec specific code, +// only support the altivec types (v16i8, v8i16, v4i32, and v4f32). +static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, + bool HasVSX, bool &Swap, bool &Negate) { + Swap = false; + Negate = false; + + if (VecVT.isFloatingPoint()) { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETLE: CC = ISD::SETGE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETOLE: CC = ISD::SETOGE; Swap = true; break; + case ISD::SETOLT: CC = ISD::SETOGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETUGT: CC = ISD::SETULT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETOEQ; Negate = true; break; + case ISD::SETULE: CC = ISD::SETOGT; Negate = true; break; + case ISD::SETULT: CC = ISD::SETOGE; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETOEQ: + if (VecVT == MVT::v4f32) + return HasVSX ? 
PPC::XVCMPEQSP : PPC::VCMPEQFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPEQDP; + break; + case ISD::SETGT: + case ISD::SETOGT: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGTSP : PPC::VCMPGTFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGTDP; + break; + case ISD::SETGE: + case ISD::SETOGE: + if (VecVT == MVT::v4f32) + return HasVSX ? PPC::XVCMPGESP : PPC::VCMPGEFP; + else if (VecVT == MVT::v2f64) + return PPC::XVCMPGEDP; + break; + default: + break; + } + llvm_unreachable("Invalid floating-point vector compare condition"); + } else { + /* Handle some cases by swapping input operands. */ + switch (CC) { + case ISD::SETGE: CC = ISD::SETLE; Swap = true; break; + case ISD::SETLT: CC = ISD::SETGT; Swap = true; break; + case ISD::SETUGE: CC = ISD::SETULE; Swap = true; break; + case ISD::SETULT: CC = ISD::SETUGT; Swap = true; break; + default: break; + } + /* Handle some cases by negating the result. */ + switch (CC) { + case ISD::SETNE: CC = ISD::SETEQ; Negate = true; break; + case ISD::SETUNE: CC = ISD::SETUEQ; Negate = true; break; + case ISD::SETLE: CC = ISD::SETGT; Negate = true; break; + case ISD::SETULE: CC = ISD::SETUGT; Negate = true; break; + default: break; + } + /* We have instructions implementing the remaining cases. */ + switch (CC) { + case ISD::SETEQ: + case ISD::SETUEQ: + if (VecVT == MVT::v16i8) + return PPC::VCMPEQUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPEQUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPEQUW; + break; + case ISD::SETGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTSB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTSH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTSW; + break; + case ISD::SETUGT: + if (VecVT == MVT::v16i8) + return PPC::VCMPGTUB; + else if (VecVT == MVT::v8i16) + return PPC::VCMPGTUH; + else if (VecVT == MVT::v4i32) + return PPC::VCMPGTUW; + break; + default: + break; + } + llvm_unreachable("Invalid integer vector compare condition"); + } +} + +SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) { + SDLoc dl(N); + unsigned Imm; + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = (PtrVT == MVT::i64); + + if (!PPCSubTarget->useCRBits() && + isInt32Immediate(N->getOperand(1), Imm)) { + // We can codegen setcc op, imm very efficiently compared to a brcond. + // Check for those cases here. 
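+    // For example, (setcc x, 0, seteq) lowers below to cntlzw + rlwinm:
+    // cntlzw returns 32 only when x == 0, and bit 5 is the only bit that
+    // distinguishes 32 from the results 0..31, so extracting that bit yields
+    // the i32 setcc result.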
+ // setcc op, 0 + if (Imm == 0) { + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: { + Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0); + SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETNE: { + if (isPPC64) break; + SDValue AD = + SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U)), 0); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, + AD.getValue(1)); + } + case ISD::SETLT: { + SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETGT: { + SDValue T = + SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0); + T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0); + SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + } + } else if (Imm == ~0U) { // setcc op, -1 + SDValue Op = N->getOperand(0); + switch (CC) { + default: break; + case ISD::SETEQ: + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(1)), 0); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(CurDAG->getMachineNode(PPC::LI, dl, + MVT::i32, + getI32Imm(0)), 0), + Op.getValue(1)); + case ISD::SETNE: { + if (isPPC64) break; + Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0); + SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + Op, getI32Imm(~0U)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0), + Op, SDValue(AD, 1)); + } + case ISD::SETLT: { + SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op, + getI32Imm(1)), 0); + SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD, + Op), 0); + SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + case ISD::SETGT: { + SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) }; + Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), + 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, + getI32Imm(1)); + } + } + } + } + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Altivec Vector compare instructions do not set any CR register by default and + // vector compare operations return the same type as the operands. + if (LHS.getValueType().isVector()) { + EVT VecVT = LHS.getValueType(); + bool Swap, Negate; + unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC, + PPCSubTarget->hasVSX(), Swap, Negate); + if (Swap) + std::swap(LHS, RHS); + + if (Negate) { + SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0); + return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : + PPC::VNOR, + VecVT, VCmp, VCmp); + } + + return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS); + } + + if (PPCSubTarget->useCRBits()) + return nullptr; + + bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + SDValue CCReg = SelectCC(LHS, RHS, CC, dl); + SDValue IntCR; + + // Force the ccreg into CR7. + SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32); + + SDValue InFlag(nullptr, 0); // Null incoming flag value. 
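+  // Copy the condition into CR7, move the CR contents into a GPR with mfocrf,
+  // and rotate the desired bit into the low position with rlwinm; for the EQ
+  // bit (Idx == 2), for example, this emits rlwinm Rd, Rcr, 31, 31, 31.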
+ CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, + InFlag).getValue(1); + + IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg, + CCReg), 0); + + SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31), + getI32Imm(31), getI32Imm(31) }; + if (!Inv) + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + + // Get the specified bit. + SDValue Tmp = + SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0); + return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1)); +} + + +// Select - Convert the specified operand from a target-independent to a +// target-specific node if it hasn't already been changed. +SDNode *PPCDAGToDAGISel::Select(SDNode *N) { + SDLoc dl(N); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return nullptr; // Already selected. + } + + switch (N->getOpcode()) { + default: break; + + case ISD::Constant: { + if (N->getValueType(0) == MVT::i64) { + // Get 64 bit value. + int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); + // Assume no remaining bits. + unsigned Remainder = 0; + // Assume no shift required. + unsigned Shift = 0; + + // If it can't be represented as a 32 bit value. + if (!isInt<32>(Imm)) { + Shift = countTrailingZeros<uint64_t>(Imm); + int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; + + // If the shifted value fits 32 bits. + if (isInt<32>(ImmSh)) { + // Go with the shifted value. + Imm = ImmSh; + } else { + // Still stuck with a 64 bit value. + Remainder = Imm; + Shift = 32; + Imm >>= 32; + } + } + + // Intermediate operand. + SDNode *Result; + + // Handle first 32 bits. + unsigned Lo = Imm & 0xFFFF; + unsigned Hi = (Imm >> 16) & 0xFFFF; + + // Simple value. + if (isInt<16>(Imm)) { + // Just the Lo bits. + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo)); + } else if (Lo) { + // Handle the Hi bits. + unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); + // And Lo bits. + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } else { + // Just the Hi bits. + Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); + } + + // If no shift, we're done. + if (!Shift) return Result; + + // Shift for next step if the upper 32-bits were not zero. + if (Imm) { + Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, + SDValue(Result, 0), + getI32Imm(Shift), + getI32Imm(63 - Shift)); + } + + // Add in the last bits as required. + if ((Hi = (Remainder >> 16) & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Hi)); + } + if ((Lo = Remainder & 0xFFFF)) { + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Lo)); + } + + return Result; + } + break; + } + + case ISD::SETCC: { + SDNode *SN = SelectSETCC(N); + if (SN) + return SN; + break; + } + case PPCISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case ISD::FrameIndex: { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0)); + unsigned Opc = N->getValueType(0) == MVT::i32 ? 
PPC::ADDI : PPC::ADDI8; + if (N->hasOneUse()) + return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI, + getSmallIPtrImm(0)); + return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI, + getSmallIPtrImm(0)); + } + + case PPCISD::MFOCRF: { + SDValue InFlag = N->getOperand(1); + return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, + N->getOperand(0), InFlag); + } + + case ISD::SDIV: { + // FIXME: since this depends on the setting of the carry flag from the srawi + // we should really be making notes about that for the scheduler. + // FIXME: It sure would be nice if we could cheaply recognize the + // srl/add/sra pattern the dag combiner will generate for this as + // sra/addze rather than having to handle sdiv ourselves. oh well. + unsigned Imm; + if (isInt32Immediate(N->getOperand(1), Imm)) { + SDValue N0 = N->getOperand(0); + if ((signed)Imm > 0 && isPowerOf2_32(Imm)) { + SDNode *Op = + CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue, + N0, getI32Imm(Log2_32(Imm))); + return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, + SDValue(Op, 0), SDValue(Op, 1)); + } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) { + SDNode *Op = + CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue, + N0, getI32Imm(Log2_32(-Imm))); + SDValue PT = + SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32, + SDValue(Op, 0), SDValue(Op, 1)), + 0); + return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT); + } + } + + // Other cases are autogenerated. + break; + } + + case ISD::LOAD: { + // Handle preincrement loads. + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT LoadedVT = LD->getMemoryVT(); + + // Normal loads are handled by code generated from the .td file. + if (LD->getAddressingMode() != ISD::PRE_INC) + break; + + SDValue Offset = LD->getOffset(); + if (Offset.getOpcode() == ISD::TargetConstant || + Offset.getOpcode() == ISD::TargetGlobalAddress) { + + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDU; break; + case MVT::f32: Opcode = PPC::LFSU; break; + case MVT::i32: Opcode = PPC::LWZU; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDU; break; + case MVT::i32: Opcode = PPC::LWZU8; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZU8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Offset, Base, Chain }; + return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(), + MVT::Other, Ops); + } else { + unsigned Opcode; + bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD; + if (LD->getValueType(0) != MVT::i64) { + // Handle PPC32 integer and normal FP loads. 
+ assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::f64: Opcode = PPC::LFDUX; break; + case MVT::f32: Opcode = PPC::LFSUX; break; + case MVT::i32: Opcode = PPC::LWZUX; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX; break; + } + } else { + assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!"); + assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) && + "Invalid sext update load"); + switch (LoadedVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid PPC load type!"); + case MVT::i64: Opcode = PPC::LDUX; break; + case MVT::i32: Opcode = isSExt ? PPC::LWAUX : PPC::LWZUX8; break; + case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break; + case MVT::i1: + case MVT::i8: Opcode = PPC::LBZUX8; break; + } + } + + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + SDValue Ops[] = { Base, Offset, Chain }; + return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), + PPCLowering->getPointerTy(), + MVT::Other, Ops); + } + } + + case ISD::AND: { + unsigned Imm, Imm2, SH, MB, ME; + uint64_t Imm64; + + // If this is an and of a value rotated between 0 and 31 bits and then and'd + // with a mask, emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { + SDValue Val = N->getOperand(0).getOperand(0); + SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + // If this is just a masked value where the input is not handled above, and + // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRunOfOnes(Imm, MB, ME) && + N->getOperand(0).getOpcode() != ISD::ROTL) { + SDValue Val = N->getOperand(0); + SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + // If this is a 64-bit zero-extension mask, emit rldicl. + if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && + isMask_64(Imm64)) { + SDValue Val = N->getOperand(0); + MB = 64 - CountTrailingOnes_64(Imm64); + SH = 0; + + // If the operand is a logical right shift, we can fold it into this + // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) + // for n <= mb. The right shift is really a left rotate followed by a + // mask, and this mask is a more-restrictive sub-mask of the mask implied + // by the shift. + if (Val.getOpcode() == ISD::SRL && + isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { + assert(Imm < 64 && "Illegal shift amount"); + Val = Val.getOperand(0); + SH = 64 - Imm; + } + + SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) }; + return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); + } + // AND X, 0 -> 0, not "rlwinm 32". + if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { + ReplaceUses(SDValue(N, 0), N->getOperand(1)); + return nullptr; + } + // ISD::OR doesn't get all the bitfield insertion fun. 
+ // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert + if (isInt32Immediate(N->getOperand(1), Imm) && + N->getOperand(0).getOpcode() == ISD::OR && + isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { + unsigned MB, ME; + Imm = ~(Imm^Imm2); + if (isRunOfOnes(Imm, MB, ME)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + N->getOperand(0).getOperand(1), + getI32Imm(0), getI32Imm(MB),getI32Imm(ME) }; + return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops); + } + } + + // Other cases are autogenerated. + break; + } + case ISD::OR: + if (N->getValueType(0) == MVT::i32) + if (SDNode *I = SelectBitfieldInsert(N)) + return I; + + // Other cases are autogenerated. + break; + case ISD::SHL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + + // Other cases are autogenerated. + break; + } + case ISD::SRL: { + unsigned Imm, SH, MB, ME; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) && + isRotateAndMask(N, Imm, true, SH, MB, ME)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) }; + return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + } + + // Other cases are autogenerated. + break; + } + // FIXME: Remove this once the ANDI glue bug is fixed: + case PPCISD::ANDIo_1_EQ_BIT: + case PPCISD::ANDIo_1_GT_BIT: { + if (!ANDIGlueBug) + break; + + EVT InVT = N->getOperand(0).getValueType(); + assert((InVT == MVT::i64 || InVT == MVT::i32) && + "Invalid input type for ANDIo_1_EQ_BIT"); + + unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo; + SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue, + N->getOperand(0), + CurDAG->getTargetConstant(1, InVT)), 0); + SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); + SDValue SRIdxVal = + CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ? + PPC::sub_eq : PPC::sub_gt, MVT::i32); + + return CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, + CR0Reg, SRIdxVal, + SDValue(AndI.getNode(), 1) /* glue */); + } + case ISD::SELECT_CC: { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); + EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = (PtrVT == MVT::i64); + + // If this is a select of i1 operands, we'll pattern match it. + if (PPCSubTarget->useCRBits() && + N->getOperand(0).getValueType() == MVT::i1) + break; + + // Handle the setcc cases here. select_cc lhs, 0, 1, 0, cc + if (!isPPC64) + if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1))) + if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2))) + if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3))) + if (N1C->isNullValue() && N3C->isNullValue() && + N2C->getZExtValue() == 1ULL && CC == ISD::SETNE && + // FIXME: Implement this optzn for PPC64. + N->getValueType(0) == MVT::i32) { + SDNode *Tmp = + CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue, + N->getOperand(0), getI32Imm(~0U)); + return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, + SDValue(Tmp, 0), N->getOperand(0), + SDValue(Tmp, 1)); + } + + SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl); + + if (N->getValueType(0) == MVT::i1) { + // An i1 select is: (c & t) | (!c & f). 
+ bool Inv; + unsigned Idx = getCRIdxForSetCC(CC, Inv); + + unsigned SRI; + switch (Idx) { + default: llvm_unreachable("Invalid CC index"); + case 0: SRI = PPC::sub_lt; break; + case 1: SRI = PPC::sub_gt; break; + case 2: SRI = PPC::sub_eq; break; + case 3: SRI = PPC::sub_un; break; + } + + SDValue CCBit = CurDAG->getTargetExtractSubreg(SRI, dl, MVT::i1, CCReg); + + SDValue NotCCBit(CurDAG->getMachineNode(PPC::CRNOR, dl, MVT::i1, + CCBit, CCBit), 0); + SDValue C = Inv ? NotCCBit : CCBit, + NotC = Inv ? CCBit : NotCCBit; + + SDValue CAndT(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, + C, N->getOperand(2)), 0); + SDValue NotCAndF(CurDAG->getMachineNode(PPC::CRAND, dl, MVT::i1, + NotC, N->getOperand(3)), 0); + + return CurDAG->SelectNodeTo(N, PPC::CROR, MVT::i1, CAndT, NotCAndF); + } + + unsigned BROpc = getPredicateForSetCC(CC); + + unsigned SelectCCOp; + if (N->getValueType(0) == MVT::i32) + SelectCCOp = PPC::SELECT_CC_I4; + else if (N->getValueType(0) == MVT::i64) + SelectCCOp = PPC::SELECT_CC_I8; + else if (N->getValueType(0) == MVT::f32) + SelectCCOp = PPC::SELECT_CC_F4; + else if (N->getValueType(0) == MVT::f64) + SelectCCOp = PPC::SELECT_CC_F8; + else + SelectCCOp = PPC::SELECT_CC_VRRC; + + SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3), + getI32Imm(BROpc) }; + return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops); + } + case ISD::VSELECT: + if (PPCSubTarget->hasVSX()) { + SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops); + } + + break; + case ISD::VECTOR_SHUFFLE: + if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || + N->getValueType(0) == MVT::v2i64)) { + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); + + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), + Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); + unsigned DM[2]; + + for (int i = 0; i < 2; ++i) + if (SVN->getMaskElt(i) <= 0 || SVN->getMaskElt(i) == 2) + DM[i] = 0; + else + DM[i] = 1; + + SDValue DMV = CurDAG->getTargetConstant(DM[1] | (DM[0] << 1), MVT::i32); + + if (Op1 == Op2 && DM[0] == 0 && DM[1] == 0 && + Op1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isa<LoadSDNode>(Op1.getOperand(0))) { + LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); + SDValue Base, Offset; + + if (LD->isUnindexed() && + SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { + SDValue Chain = LD->getChain(); + SDValue Ops[] = { Base, Offset, Chain }; + return CurDAG->SelectNodeTo(N, PPC::LXVDSX, + N->getValueType(0), Ops); + } + } + + SDValue Ops[] = { Op1, Op2, DMV }; + return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops); + } + + break; + case PPCISD::BDNZ: + case PPCISD::BDZ: { + bool IsPPC64 = PPCSubTarget->isPPC64(); + SDValue Ops[] = { N->getOperand(1), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ? + (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (IsPPC64 ? PPC::BDZ8 : PPC::BDZ), + MVT::Other, Ops); + } + case PPCISD::COND_BRANCH: { + // Op #0 is the Chain. + // Op #1 is the PPC::PRED_* number. + // Op #2 is the CR# + // Op #3 is the Dest MBB + // Op #4 is the Flag. + // Prevent PPC::PRED_* from being selected into LI. 
+ SDValue Pred = + getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); + SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), + N->getOperand(0), N->getOperand(4) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); + } + case ISD::BR_CC: { + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + unsigned PCC = getPredicateForSetCC(CC); + + if (N->getOperand(2).getValueType() == MVT::i1) { + unsigned Opc; + bool Swap; + switch (PCC) { + default: llvm_unreachable("Unexpected Boolean-operand predicate"); + case PPC::PRED_LT: Opc = PPC::CRANDC; Swap = true; break; + case PPC::PRED_LE: Opc = PPC::CRORC; Swap = true; break; + case PPC::PRED_EQ: Opc = PPC::CREQV; Swap = false; break; + case PPC::PRED_GE: Opc = PPC::CRORC; Swap = false; break; + case PPC::PRED_GT: Opc = PPC::CRANDC; Swap = false; break; + case PPC::PRED_NE: Opc = PPC::CRXOR; Swap = false; break; + } + + SDValue BitComp(CurDAG->getMachineNode(Opc, dl, MVT::i1, + N->getOperand(Swap ? 3 : 2), + N->getOperand(Swap ? 2 : 3)), 0); + return CurDAG->SelectNodeTo(N, PPC::BC, MVT::Other, + BitComp, N->getOperand(4), N->getOperand(0)); + } + + SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); + SDValue Ops[] = { getI32Imm(PCC), CondCode, + N->getOperand(4), N->getOperand(0) }; + return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); + } + case ISD::BRIND: { + // FIXME: Should custom lower this. + SDValue Chain = N->getOperand(0); + SDValue Target = N->getOperand(1); + unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8; + unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8; + Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target, + Chain), 0); + return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain); + } + case PPCISD::TOC_ENTRY: { + if (PPCSubTarget->isSVR4ABI() && !PPCSubTarget->isPPC64()) { + SDValue GA = N->getOperand(0); + return CurDAG->getMachineNode(PPC::LWZtoc, dl, MVT::i32, GA, + N->getOperand(1)); + } + assert (PPCSubTarget->isPPC64() && + "Only supported for 64-bit ABI and 32-bit SVR4"); + + // For medium and large code model, we generate two instructions as + // described below. Otherwise we allow SelectCodeCommon to handle this, + // selecting one of LDtoc, LDtocJTI, and LDtocCPT. + CodeModel::Model CModel = TM.getCodeModel(); + if (CModel != CodeModel::Medium && CModel != CodeModel::Large) + break; + + // The first source operand is a TargetGlobalAddress or a TargetJumpTable. 
+ // If it is an externally defined symbol, a symbol with common linkage, + // a non-local function address, or a jump table address, or if we are + // generating code for large code model, we generate: + // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) + // Otherwise we generate: + // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) + SDValue GA = N->getOperand(0); + SDValue TOCbase = N->getOperand(1); + SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, + TOCbase, GA); + + if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large) + return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, + SDValue(Tmp, 0)); + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { + const GlobalValue *GValue = G->getGlobal(); + if ((GValue->getType()->getElementType()->isFunctionTy() && + (GValue->isDeclaration() || GValue->isWeakForLinker())) || + GValue->isDeclaration() || GValue->hasCommonLinkage() || + GValue->hasAvailableExternallyLinkage()) + return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, + SDValue(Tmp, 0)); + } + + return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, + SDValue(Tmp, 0), GA); + } + case PPCISD::PPC32_PICGOT: { + // Generate a PIC-safe GOT reference. + assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() && + "PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4"); + return CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT, PPCLowering->getPointerTy(), MVT::i32); + } + case PPCISD::VADD_SPLAT: { + // This expands into one of three sequences, depending on whether + // the first operand is odd or even, positive or negative. + assert(isa<ConstantSDNode>(N->getOperand(0)) && + isa<ConstantSDNode>(N->getOperand(1)) && + "Invalid operand on VADD_SPLAT!"); + + int Elt = N->getConstantOperandVal(0); + int EltSize = N->getConstantOperandVal(1); + unsigned Opc1, Opc2, Opc3; + EVT VT; + + if (EltSize == 1) { + Opc1 = PPC::VSPLTISB; + Opc2 = PPC::VADDUBM; + Opc3 = PPC::VSUBUBM; + VT = MVT::v16i8; + } else if (EltSize == 2) { + Opc1 = PPC::VSPLTISH; + Opc2 = PPC::VADDUHM; + Opc3 = PPC::VSUBUHM; + VT = MVT::v8i16; + } else { + assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!"); + Opc1 = PPC::VSPLTISW; + Opc2 = PPC::VADDUWM; + Opc3 = PPC::VSUBUWM; + VT = MVT::v4i32; + } + + if ((Elt & 1) == 0) { + // Elt is even, in the range [-32,-18] + [16,30]. + // + // Convert: VADD_SPLAT elt, size + // Into: tmp = VSPLTIS[BHW] elt + // VADDU[BHW]M tmp, tmp + // Where: [BHW] = B for size = 1, H for size = 2, W for size = 4 + SDValue EltVal = getI32Imm(Elt >> 1); + SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + SDValue TmpVal = SDValue(Tmp, 0); + return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal); + + } else if (Elt > 0) { + // Elt is odd and positive, in the range [17,31]. + // + // Convert: VADD_SPLAT elt, size + // Into: tmp1 = VSPLTIS[BHW] elt-16 + // tmp2 = VSPLTIS[BHW] -16 + // VSUBU[BHW]M tmp1, tmp2 + SDValue EltVal = getI32Imm(Elt - 16); + SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + EltVal = getI32Imm(-16); + SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0), + SDValue(Tmp2, 0)); + + } else { + // Elt is odd and negative, in the range [-31,-17]. 
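+      // For example, Elt == -17 becomes VSPLTIS[BHW] -1 followed by
+      // VSPLTIS[BHW] -16 and an add: both -1 (= -17 + 16) and -16 lie in
+      // the legal [-16,15] splat range, and -1 + -16 recovers -17.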
+ // + // Convert: VADD_SPLAT elt, size + // Into: tmp1 = VSPLTIS[BHW] elt+16 + // tmp2 = VSPLTIS[BHW] -16 + // VADDU[BHW]M tmp1, tmp2 + SDValue EltVal = getI32Imm(Elt + 16); + SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + EltVal = getI32Imm(-16); + SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal); + return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0), + SDValue(Tmp2, 0)); + } + } + } + + return SelectCode(N); +} + +/// PostprocessISelDAG - Perform some late peephole optimizations +/// on the DAG representation. +void PPCDAGToDAGISel::PostprocessISelDAG() { + + // Skip peepholes at -O0. + if (TM.getOptLevel() == CodeGenOpt::None) + return; + + PeepholePPC64(); + PeepholeCROps(); +} + +// Check if all users of this node will become isel where the second operand +// is the constant zero. If this is so, and if we can negate the condition, +// then we can flip the true and false operands. This will allow the zero to +// be folded with the isel so that we don't need to materialize a register +// containing zero. +bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { + // If we're not using isel, then this does not matter. + if (!PPCSubTarget->hasISEL()) + return false; + + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (!User->isMachineOpcode()) + return false; + if (User->getMachineOpcode() != PPC::SELECT_I4 && + User->getMachineOpcode() != PPC::SELECT_I8) + return false; + + SDNode *Op2 = User->getOperand(2).getNode(); + if (!Op2->isMachineOpcode()) + return false; + + if (Op2->getMachineOpcode() != PPC::LI && + Op2->getMachineOpcode() != PPC::LI8) + return false; + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op2->getOperand(0)); + if (!C) + return false; + + if (!C->isNullValue()) + return false; + } + + return true; +} + +void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) { + SmallVector<SDNode *, 4> ToReplace; + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + assert((User->getMachineOpcode() == PPC::SELECT_I4 || + User->getMachineOpcode() == PPC::SELECT_I8) && + "Must have all select users"); + ToReplace.push_back(User); + } + + for (SmallVector<SDNode *, 4>::iterator UI = ToReplace.begin(), + UE = ToReplace.end(); UI != UE; ++UI) { + SDNode *User = *UI; + SDNode *ResNode = + CurDAG->getMachineNode(User->getMachineOpcode(), SDLoc(User), + User->getValueType(0), User->getOperand(0), + User->getOperand(2), + User->getOperand(1)); + + DEBUG(dbgs() << "CR Peephole replacing:\nOld: "); + DEBUG(User->dump(CurDAG)); + DEBUG(dbgs() << "\nNew: "); + DEBUG(ResNode->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + ReplaceUses(User, ResNode); + } +} + +void PPCDAGToDAGISel::PeepholeCROps() { + bool IsModified; + do { + IsModified = false; + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I); + if (!MachineNode || MachineNode->use_empty()) + continue; + SDNode *ResNode = MachineNode; + + bool Op1Set = false, Op1Unset = false, + Op1Not = false, + Op2Set = false, Op2Unset = false, + Op2Not = false; + + unsigned Opcode = MachineNode->getMachineOpcode(); + switch (Opcode) { + default: break; + case PPC::CRAND: + case PPC::CRNAND: + case PPC::CROR: + case PPC::CRXOR: + case PPC::CRNOR: + case PPC::CREQV: + case PPC::CRANDC: + case PPC::CRORC: { + SDValue Op = MachineNode->getOperand(1); + if (Op.isMachineOpcode()) { + if 
(Op.getMachineOpcode() == PPC::CRSET) + Op2Set = true; + else if (Op.getMachineOpcode() == PPC::CRUNSET) + Op2Unset = true; + else if (Op.getMachineOpcode() == PPC::CRNOR && + Op.getOperand(0) == Op.getOperand(1)) + Op2Not = true; + } + } // fallthrough + case PPC::BC: + case PPC::BCn: + case PPC::SELECT_I4: + case PPC::SELECT_I8: + case PPC::SELECT_F4: + case PPC::SELECT_F8: + case PPC::SELECT_VRRC: { + SDValue Op = MachineNode->getOperand(0); + if (Op.isMachineOpcode()) { + if (Op.getMachineOpcode() == PPC::CRSET) + Op1Set = true; + else if (Op.getMachineOpcode() == PPC::CRUNSET) + Op1Unset = true; + else if (Op.getMachineOpcode() == PPC::CRNOR && + Op.getOperand(0) == Op.getOperand(1)) + Op1Not = true; + } + } + break; + } + + bool SelectSwap = false; + switch (Opcode) { + default: break; + case PPC::CRAND: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // x & x = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Set) + // 1 & y = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Set) + // x & 1 = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Unset || Op2Unset) + // x & 0 = 0 & y = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Not) + // ~x & y = andc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0). + getOperand(0)); + else if (Op2Not) + // x & ~y = andc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRNAND: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // nand(x, x) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Set) + // nand(1, y) -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Set) + // nand(x, 1) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Unset || Op2Unset) + // nand(x, 0) = nand(0, y) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Not) + // nand(~x, y) = ~(~x & y) = x | ~y = orc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // nand(x, ~y) = ~x | y = orc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1). 
+ getOperand(0), + MachineNode->getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CROR: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // x | x = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Set || Op2Set) + // x | 1 = 1 | y = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Unset) + // 0 | y = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Unset) + // x | 0 = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // ~x | y = orc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0). + getOperand(0)); + else if (Op2Not) + // x | ~y = orc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRXOR: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // xor(x, x) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // xor(1, y) -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Set) + // xor(x, 1) -> nor(x, x) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Unset) + // xor(0, y) = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Unset) + // xor(x, 0) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // xor(~x, y) = eqv(x, y) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // xor(x, ~y) = eqv(x, y) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CREQV, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRNOR: + if (Op1Set || Op2Set) + // nor(1, y) -> 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Unset) + // nor(0, y) = ~y -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Unset) + // nor(x, 0) = ~x + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Not) + // nor(~x, y) = andc(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // nor(x, ~y) = andc(y, x) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1). 
+ getOperand(0), + MachineNode->getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CREQV: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // eqv(x, x) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // eqv(1, y) = y + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op2Set) + // eqv(x, 1) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Unset) + // eqv(0, y) = ~y -> nor(y, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op2Unset) + // eqv(x, 0) = ~x + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(0)); + else if (Op1Not) + // eqv(~x, y) = xor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // eqv(x, ~y) = xor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRXOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1)), + SelectSwap = true; + break; + case PPC::CRANDC: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // andc(x, x) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set) + // andc(1, y) = ~y + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op1Unset || Op2Set) + // andc(0, y) = andc(x, 1) = 0 + ResNode = CurDAG->getMachineNode(PPC::CRUNSET, SDLoc(MachineNode), + MVT::i1); + else if (Op2Unset) + // andc(x, 0) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Not) + // andc(~x, y) = ~(x | y) = nor(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // andc(x, ~y) = x & y + ResNode = CurDAG->getMachineNode(PPC::CRAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). 
+ getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRORC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0)), + SelectSwap = true; + break; + case PPC::CRORC: + if (MachineNode->getOperand(0) == MachineNode->getOperand(1)) + // orc(x, x) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op1Set || Op2Unset) + // orc(1, y) = orc(x, 0) = 1 + ResNode = CurDAG->getMachineNode(PPC::CRSET, SDLoc(MachineNode), + MVT::i1); + else if (Op2Set) + // orc(x, 1) = x + ResNode = MachineNode->getOperand(0).getNode(); + else if (Op1Unset) + // orc(0, y) = ~y + ResNode = CurDAG->getMachineNode(PPC::CRNOR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(1)); + else if (Op1Not) + // orc(~x, y) = ~(x & y) = nand(x, y) + ResNode = CurDAG->getMachineNode(PPC::CRNAND, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1)); + else if (Op2Not) + // orc(x, ~y) = x | y + ResNode = CurDAG->getMachineNode(PPC::CROR, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(0), + MachineNode->getOperand(1). + getOperand(0)); + else if (AllUsersSelectZero(MachineNode)) + ResNode = CurDAG->getMachineNode(PPC::CRANDC, SDLoc(MachineNode), + MVT::i1, MachineNode->getOperand(1), + MachineNode->getOperand(0)), + SelectSwap = true; + break; + case PPC::SELECT_I4: + case PPC::SELECT_I8: + case PPC::SELECT_F4: + case PPC::SELECT_F8: + case PPC::SELECT_VRRC: + if (Op1Set) + ResNode = MachineNode->getOperand(1).getNode(); + else if (Op1Unset) + ResNode = MachineNode->getOperand(2).getNode(); + else if (Op1Not) + ResNode = CurDAG->getMachineNode(MachineNode->getMachineOpcode(), + SDLoc(MachineNode), + MachineNode->getValueType(0), + MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(2), + MachineNode->getOperand(1)); + break; + case PPC::BC: + case PPC::BCn: + if (Op1Not) + ResNode = CurDAG->getMachineNode(Opcode == PPC::BC ? PPC::BCn : + PPC::BC, + SDLoc(MachineNode), + MVT::Other, + MachineNode->getOperand(0). + getOperand(0), + MachineNode->getOperand(1), + MachineNode->getOperand(2)); + // FIXME: Handle Op1Set, Op1Unset here too. + break; + } + + // If we're inverting this node because it is used only by selects that + // we'd like to swap, then swap the selects before the node replacement. + if (SelectSwap) + SwapAllSelectUsers(MachineNode); + + if (ResNode != MachineNode) { + DEBUG(dbgs() << "CR Peephole replacing:\nOld: "); + DEBUG(MachineNode->dump(CurDAG)); + DEBUG(dbgs() << "\nNew: "); + DEBUG(ResNode->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + ReplaceUses(MachineNode, ResNode); + IsModified = true; + } + } + if (IsModified) + CurDAG->RemoveDeadNodes(); + } while (IsModified); +} + +void PPCDAGToDAGISel::PeepholePPC64() { + // These optimizations are currently supported only for 64-bit SVR4. + if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64()) + return; + + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = --Position; + // Skip dead nodes and any non-machine opcodes. 
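+    // Only already-selected nodes matter here: the peephole below rewrites
+    // machine loads and stores, so anything still carrying a
+    // target-independent ISD opcode can be ignored.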
+ if (N->use_empty() || !N->isMachineOpcode()) + continue; + + unsigned FirstOp; + unsigned StorageOpcode = N->getMachineOpcode(); + + switch (StorageOpcode) { + default: continue; + + case PPC::LBZ: + case PPC::LBZ8: + case PPC::LD: + case PPC::LFD: + case PPC::LFS: + case PPC::LHA: + case PPC::LHA8: + case PPC::LHZ: + case PPC::LHZ8: + case PPC::LWA: + case PPC::LWZ: + case PPC::LWZ8: + FirstOp = 0; + break; + + case PPC::STB: + case PPC::STB8: + case PPC::STD: + case PPC::STFD: + case PPC::STFS: + case PPC::STH: + case PPC::STH8: + case PPC::STW: + case PPC::STW8: + FirstOp = 1; + break; + } + + // If this is a load or store with a zero offset, we may be able to + // fold an add-immediate into the memory operation. + if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) || + N->getConstantOperandVal(FirstOp) != 0) + continue; + + SDValue Base = N->getOperand(FirstOp + 1); + if (!Base.isMachineOpcode()) + continue; + + unsigned Flags = 0; + bool ReplaceFlags = true; + + // When the feeding operation is an add-immediate of some sort, + // determine whether we need to add relocation information to the + // target flags on the immediate operand when we fold it into the + // load instruction. + // + // For something like ADDItocL, the relocation information is + // inferred from the opcode; when we process it in the AsmPrinter, + // we add the necessary relocation there. A load, though, can receive + // relocation from various flavors of ADDIxxx, so we need to carry + // the relocation information in the target flags. + switch (Base.getMachineOpcode()) { + default: continue; + + case PPC::ADDI8: + case PPC::ADDI: + // In some cases (such as TLS) the relocation information + // is already in place on the operand, so copying the operand + // is sufficient. + ReplaceFlags = false; + // For these cases, the immediate may not be divisible by 4, in + // which case the fold is illegal for DS-form instructions. (The + // other cases provide aligned addresses and are always safe.) + if ((StorageOpcode == PPC::LWA || + StorageOpcode == PPC::LD || + StorageOpcode == PPC::STD) && + (!isa<ConstantSDNode>(Base.getOperand(1)) || + Base.getConstantOperandVal(1) % 4 != 0)) + continue; + break; + case PPC::ADDIdtprelL: + Flags = PPCII::MO_DTPREL_LO; + break; + case PPC::ADDItlsldL: + Flags = PPCII::MO_TLSLD_LO; + break; + case PPC::ADDItocL: + Flags = PPCII::MO_TOC_LO; + break; + } + + // We found an opportunity. Reverse the operands from the add + // immediate and substitute them into the load or store. If + // needed, update the target flags for the immediate operand to + // reflect the necessary relocation information. + DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); + DEBUG(Base->dump(CurDAG)); + DEBUG(dbgs() << "\nN: "); + DEBUG(N->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + SDValue ImmOpnd = Base.getOperand(1); + + // If the relocation information isn't already present on the + // immediate operand, add it now. + if (ReplaceFlags) { + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + SDLoc dl(GA); + const GlobalValue *GV = GA->getGlobal(); + // We can't perform this optimization for data whose alignment + // is insufficient for the instruction encoding. 
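+        // LD, STD and LWA are DS-form instructions: they encode a 14-bit
+        // displacement that the hardware implicitly scales by 4, so only
+        // multiple-of-4 offsets are representable (an offset of 8 is
+        // encoded as 2).  A global aligned to less than 4 bytes may need
+        // an offset with nonzero low bits, so the fold must be rejected.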
+ if (GV->getAlignment() < 4 && + (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || + StorageOpcode == PPC::LWA)) { + DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); + continue; + } + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + } else if (ConstantPoolSDNode *CP = + dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { + const Constant *C = CP->getConstVal(); + ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, + CP->getAlignment(), + 0, Flags); + } + } + + if (FirstOp == 1) // Store + (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, + Base.getOperand(0), N->getOperand(3)); + else // Load + (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), + N->getOperand(2)); + + // The add-immediate may now be dead, in which case remove it. + if (Base.getNode()->use_empty()) + CurDAG->RemoveDeadNode(Base.getNode()); + } +} + + +/// createPPCISelDag - This pass converts a legalized DAG into a +/// PowerPC-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { + return new PPCDAGToDAGISel(TM); +} + +static void initializePassOnce(PassRegistry &Registry) { + const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection"; + PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, + nullptr, false, false); + Registry.registerPass(*PI, true); +} + +void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) { + CALL_ONCE_INITIALIZATION(initializePassOnce); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp new file mode 100644 index 000000000000..1247e860f733 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -0,0 +1,9290 @@ +//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCISelLowering class. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
+// FIXME: Remove this once the bug has been fixed!
+extern cl::opt<bool> ANDIGlueBug;
+
+static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
+  // If it isn't a Mach-O file then it's going to be a Linux ELF
+  // object file.
+  if (TT.isOSDarwin())
+    return new TargetLoweringObjectFileMachO();
+
+  return new PPC64LinuxTargetObjectFile();
+}
+
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
+      Subtarget(*TM.getSubtargetImpl()) {
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+  // arguments are at least 4/8 bytes aligned.
+  bool isPPC64 = Subtarget.isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8:4);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc loads and stores.
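+  // A pre-increment access computes base + displacement, uses the result
+  // for the load or store, and writes it back to the base register in a
+  // single instruction (lwzu, stwu, and friends) -- in C terms, roughly
+  // "p += 4; v = *p;" fused into one operation.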
+ setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal); + setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal); + setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal); + + if (Subtarget.useCRBits()) { + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + if (isPPC64 || Subtarget.hasFPCVT()) { + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); + AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); + AddPromotedToType (ISD::UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + } else { + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); + } + + // PowerPC does not support direct load / store of condition registers + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::i1, Custom); + + // FIXME: Remove this once the ANDI glue bug is fixed: + if (ANDIGlueBug) + setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::i32, MVT::i1, Expand); + setTruncStoreAction(MVT::i16, MVT::i1, Expand); + setTruncStoreAction(MVT::i8, MVT::i1, Expand); + + addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass); + } + + // This is used in the ppcf128->int sequence. Note it has different semantics + // from FP_ROUND: that rounds to nearest, this rounds to zero. + setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom); + + // We do not currently implement these libm ops for PowerPC. + setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); + setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand); + setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand); + setOperationAction(ISD::FRINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand); + setOperationAction(ISD::FREM, MVT::ppcf128, Expand); + + // PowerPC has no SREM/UREM instructions + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. 
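+  // With MUL_LOHI and DIVREM expanded as well, legalization falls back on
+  // the subtract-of-product form, effectively rem = a - (a / b) * b, i.e.
+  // one divide followed by a multiply and a subtract.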
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + + // We don't support sin/cos/sqrt/fmod/pow + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FREM , MVT::f64, Expand); + setOperationAction(ISD::FPOW , MVT::f64, Expand); + setOperationAction(ISD::FMA , MVT::f64, Legal); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + setOperationAction(ISD::FREM , MVT::f32, Expand); + setOperationAction(ISD::FPOW , MVT::f32, Expand); + setOperationAction(ISD::FMA , MVT::f32, Legal); + + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); + + // If we're enabling GP optimizations, use hardware square root + if (!Subtarget.hasFSQRT() && + !(TM.Options.UnsafeFPMath && + Subtarget.hasFRSQRTE() && Subtarget.hasFRE())) + setOperationAction(ISD::FSQRT, MVT::f64, Expand); + + if (!Subtarget.hasFSQRT() && + !(TM.Options.UnsafeFPMath && + Subtarget.hasFRSQRTES() && Subtarget.hasFRES())) + setOperationAction(ISD::FSQRT, MVT::f32, Expand); + + if (Subtarget.hasFCPSGN()) { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal); + } else { + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + } + + if (Subtarget.hasFPRND()) { + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FROUND, MVT::f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Legal); + } + + // PowerPC does not have BSWAP, CTPOP or CTTZ + setOperationAction(ISD::BSWAP, MVT::i32 , Expand); + setOperationAction(ISD::CTTZ , MVT::i32 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i64 , Expand); + setOperationAction(ISD::CTTZ , MVT::i64 , Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + + if (Subtarget.hasPOPCNTD()) { + setOperationAction(ISD::CTPOP, MVT::i32 , Legal); + setOperationAction(ISD::CTPOP, MVT::i64 , Legal); + } else { + setOperationAction(ISD::CTPOP, MVT::i32 , Expand); + setOperationAction(ISD::CTPOP, MVT::i64 , Expand); + } + + // PowerPC does not have ROTR + setOperationAction(ISD::ROTR, MVT::i32 , Expand); + setOperationAction(ISD::ROTR, MVT::i64 , Expand); + + if (!Subtarget.useCRBits()) { + // PowerPC does not have Select + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::i64, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::f64, Expand); + } + + // PowerPC wants to turn select_cc of FP into fsel when possible. 
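+  // fsel computes frt = (fra >= 0.0) ? frc : frb without a branch, so a
+  // select_cc such as (a >= b) ? x : y can become a subtraction feeding a
+  // single fsel.  This is only exact when NaNs and signed zeros need not
+  // be honored, hence Custom rather than Legal.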
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + + // PowerPC wants to optimize integer setcc a bit + if (!Subtarget.useCRBits()) + setOperationAction(ISD::SETCC, MVT::i32, Custom); + + // PowerPC does not have BRCOND which requires SetCC + if (!Subtarget.useCRBits()) + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + + // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + + // PowerPC does not have [U|S]INT_TO_FP + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); + + setOperationAction(ISD::BITCAST, MVT::f32, Expand); + setOperationAction(ISD::BITCAST, MVT::i32, Expand); + setOperationAction(ISD::BITCAST, MVT::i64, Expand); + setOperationAction(ISD::BITCAST, MVT::f64, Expand); + + // We cannot sextinreg(i1). Expand to shifts. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); + + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + + // We want to legalize GlobalAddress and ConstantPool nodes into the + // appropriate instructions to materialize the address. + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + // TRAP is legal. + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // TRAMPOLINE is custom lowered. + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + + if (Subtarget.isSVR4ABI()) { + if (isPPC64) { + // VAARG always uses double-word chunks, so promote anything smaller. + setOperationAction(ISD::VAARG, MVT::i1, Promote); + AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i8, Promote); + AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i16, Promote); + AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64); + setOperationAction(ISD::VAARG, MVT::i32, Promote); + AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + } else { + // VAARG is custom lowered with the 32-bit SVR4 ABI. 
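+      // Under the 32-bit SVR4 ABI, va_list is a structure rather than a
+      // bare pointer; roughly (a sketch of the ABI-defined layout):
+      //   struct __va_list_tag {
+      //     unsigned char gpr;        // next GPR to use, 0..8 (r3-r10)
+      //     unsigned char fpr;        // next FPR to use, 0..8 (f1-f8)
+      //     char *overflow_arg_area;  // next argument passed on the stack
+      //     char *reg_save_area;      // where the argument registers spill
+      //   };
+      // va_arg must therefore branch between the register-save and overflow
+      // areas, which is why it needs custom lowering here.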
+ setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::i64, Custom); + } + } else + setOperationAction(ISD::VAARG, MVT::Other, Expand); + + if (Subtarget.isSVR4ABI() && !isPPC64) + // VACOPY is custom lowered with the 32-bit SVR4 ABI. + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + else + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + + // Use the default implementation. + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + + // We want to custom lower some of our intrinsics. + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // To handle counter-based loop conditions. + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom); + + // Comparisons that require checking two conditions. + setCondCodeAction(ISD::SETULT, MVT::f32, Expand); + setCondCodeAction(ISD::SETULT, MVT::f64, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); + setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOGE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOGE, MVT::f64, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f32, Expand); + setCondCodeAction(ISD::SETOLE, MVT::f64, Expand); + setCondCodeAction(ISD::SETONE, MVT::f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + + if (Subtarget.has64BitSupport()) { + // They also have instructions for converting between i64 and fp. + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); + // This is just the low 32 bits of a (signed) fp->i64 conversion. + // We cannot do this with Promote because i64 is not a legal type. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + } else { + // PowerPC does not have FP_TO_UINT on 32-bit implementations. + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + } + + // With the instructions enabled under FPCVT, we can do everything. + if (Subtarget.hasFPCVT()) { + if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + } + + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + } + + if (Subtarget.use64BitRegs()) { + // 64-bit PowerPC implementations can support i64 types directly + addRegisterClass(MVT::i64, &PPC::G8RCRegClass); + // BUILD_PAIR can't be handled natively, and should be expanded to shl/or + setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); + // 64-bit PowerPC wants to expand i128 shifts itself. 
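+    // SHL_PARTS splits a double-width shift into register-sized pieces;
+    // for 0 < Amt < 64 they combine as (a standalone sketch):
+    //   Hi = (Hi << Amt) | (Lo >> (64 - Amt));
+    //   Lo = Lo << Amt;
+    // Amt == 0 and Amt >= 64 need separate handling (Lo >> 64 would be
+    // undefined in C), which the custom lowering accounts for.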
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + } else { + // 32-bit PowerPC wants to expand i64 shifts itself. + setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); + } + + if (Subtarget.hasAltivec()) { + // First set operation action for all vector types to expand. Then we + // will selectively turn on ones that can be effectively codegen'd. + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)i; + + // add/sub are legal for all supported vector VT's. + setOperationAction(ISD::ADD , VT, Legal); + setOperationAction(ISD::SUB , VT, Legal); + + // We promote all shuffles to v16i8. + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote); + AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8); + + // We promote all non-typed operations to v4i32. + setOperationAction(ISD::AND , VT, Promote); + AddPromotedToType (ISD::AND , VT, MVT::v4i32); + setOperationAction(ISD::OR , VT, Promote); + AddPromotedToType (ISD::OR , VT, MVT::v4i32); + setOperationAction(ISD::XOR , VT, Promote); + AddPromotedToType (ISD::XOR , VT, MVT::v4i32); + setOperationAction(ISD::LOAD , VT, Promote); + AddPromotedToType (ISD::LOAD , VT, MVT::v4i32); + setOperationAction(ISD::SELECT, VT, Promote); + AddPromotedToType (ISD::SELECT, VT, MVT::v4i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType (ISD::STORE, VT, MVT::v4i32); + + // No other operations are legal. + setOperationAction(ISD::MUL , VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FNEG, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FABS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FFLOOR, VT, Expand); + setOperationAction(ISD::FCEIL, VT, Expand); + setOperationAction(ISD::FTRUNC, VT, Expand); + setOperationAction(ISD::FRINT, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); + setOperationAction(ISD::BUILD_VECTOR, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::CTTZ, VT, 
Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + + for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) { + MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j; + setTruncStoreAction(VT, InnerVT, Expand); + } + setLoadExtAction(ISD::SEXTLOAD, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, Expand); + } + + // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle + // with merges, splats, etc. + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); + + setOperationAction(ISD::AND , MVT::v4i32, Legal); + setOperationAction(ISD::OR , MVT::v4i32, Legal); + setOperationAction(ISD::XOR , MVT::v4i32, Legal); + setOperationAction(ISD::LOAD , MVT::v4i32, Legal); + setOperationAction(ISD::SELECT, MVT::v4i32, + Subtarget.useCRBits() ? Legal : Expand); + setOperationAction(ISD::STORE , MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); + addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); + addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass); + + setOperationAction(ISD::MUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + + if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + } + + setOperationAction(ISD::MUL, MVT::v4i32, Custom); + setOperationAction(ISD::MUL, MVT::v8i16, Custom); + setOperationAction(ISD::MUL, MVT::v16i8, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + + // Altivec does not contain unordered floating-point compare instructions + setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETO, MVT::v4f32, Expand); + setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand); + + if (Subtarget.hasVSX()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); + setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::MUL, MVT::v2f64, Legal); + setOperationAction(ISD::FMA, MVT::v2f64, Legal); + + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + + 
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i16, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); + + // Share the Altivec comparison restrictions. + setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETO, MVT::v2f64, Expand); + setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand); + + setOperationAction(ISD::LOAD, MVT::v2f64, Legal); + setOperationAction(ISD::STORE, MVT::v2f64, Legal); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal); + + addRegisterClass(MVT::f64, &PPC::VSFRCRegClass); + + addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass); + addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass); + + // VSX v2i64 only supports non-arithmetic operations. + setOperationAction(ISD::ADD, MVT::v2i64, Expand); + setOperationAction(ISD::SUB, MVT::v2i64, Expand); + + setOperationAction(ISD::SHL, MVT::v2i64, Expand); + setOperationAction(ISD::SRA, MVT::v2i64, Expand); + setOperationAction(ISD::SRL, MVT::v2i64, Expand); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64); + + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); + + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + + // Vector operation legalization checks the result type of + // SIGN_EXTEND_INREG, overall legalization checks the inner type. + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); + + addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); + } + } + + if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + } + + setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + + setBooleanContents(ZeroOrOneBooleanContent); + // Altivec instructions set fields to all zeros or all ones. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + if (!isPPC64) { + // These libcalls are not available in 32-bit. 
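+    // compiler-rt and libgcc generally provide the 128-bit shift helpers
+    // (__ashlti3 and friends) only on 64-bit targets, so the names are
+    // cleared here and the legalizer is forced to expand such shifts
+    // inline instead of emitting a call.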
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
+  }
+
+  if (isPPC64) {
+    setStackPointerRegisterToSaveRestore(PPC::X1);
+    setExceptionPointerRegister(PPC::X3);
+    setExceptionSelectorRegister(PPC::X4);
+  } else {
+    setStackPointerRegisterToSaveRestore(PPC::R1);
+    setExceptionPointerRegister(PPC::R3);
+    setExceptionSelectorRegister(PPC::R4);
+  }
+
+  // We have target-specific DAG combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  if (Subtarget.useCRBits())
+    setTargetDAGCombine(ISD::BRCOND);
+  setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+
+  if (Subtarget.useCRBits()) {
+    setTargetDAGCombine(ISD::TRUNCATE);
+    setTargetDAGCombine(ISD::SETCC);
+    setTargetDAGCombine(ISD::SELECT_CC);
+  }
+
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
+  // Darwin long double math library functions have $LDBL128 appended.
+  if (Subtarget.isDarwin()) {
+    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+  }
+
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  if (Subtarget.useCRBits())
+    setHasMultipleConditionRegisters();
+
+  setMinFunctionAlignment(2);
+  if (Subtarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
+  if (isPPC64 && Subtarget.isJITCodeModel())
+    // Temporary workaround for the inability of PPC64 JIT to handle jump
+    // tables.
+    setSupportJumpTables(false);
+
+  setInsertFencesForAtomic(true);
+
+  if (Subtarget.enableMachineScheduler())
+    setSchedulingPreference(Sched::Source);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  computeRegisterProperties();
+
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
+    MaxStoresPerMemset = 32;
+    MaxStoresPerMemsetOptSize = 16;
+    MaxStoresPerMemcpy = 32;
+    MaxStoresPerMemcpyOptSize = 8;
+    MaxStoresPerMemmove = 32;
+    MaxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+  }
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  if (MaxAlign == MaxMaxAlign)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+      MaxAlign = 32;
+    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == MaxMaxAlign)
+        break;
+    }
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
+  // Darwin passes everything on a 4-byte boundary.
+  if (Subtarget.isDarwin())
+    return 4;
+
+  // 16-byte and wider vectors are passed on a 16-byte boundary. The rest are
+  // passed on an 8-byte boundary on PPC64 and a 4-byte boundary on PPC32.
+  unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+  if (Subtarget.hasAltivec() || Subtarget.hasQPX())
+    getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
+  return Align;
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return nullptr;
+  case PPCISD::FSEL: return "PPCISD::FSEL";
+  case PPCISD::FCFID: return "PPCISD::FCFID";
+  case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
+  case PPCISD::FRE: return "PPCISD::FRE";
+  case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
+  case PPCISD::STFIWX: return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM: return "PPCISD::VPERM";
+  case PPCISD::Hi: return "PPCISD::Hi";
+  case PPCISD::Lo: return "PPCISD::Lo";
+  case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
+  case PPCISD::LOAD: return "PPCISD::LOAD";
+  case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC";
+  case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
+  case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL: return "PPCISD::SRL";
+  case PPCISD::SRA: return "PPCISD::SRA";
+  case PPCISD::SHL: return "PPCISD::SHL";
+  case PPCISD::CALL: return "PPCISD::CALL";
+  case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
+  case PPCISD::MTCTR: return "PPCISD::MTCTR";
+  case PPCISD::BCTRL: return "PPCISD::BCTRL";
+  case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
+  case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
+  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
+  case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
+  case PPCISD::VCMP: return "PPCISD::VCMP";
+  case PPCISD::VCMPo: return "PPCISD::VCMPo";
+  case PPCISD::LBRX: return "PPCISD::LBRX";
+  case PPCISD::STBRX: return "PPCISD::STBRX";
+  case PPCISD::LARX: return "PPCISD::LARX";
+  case PPCISD::STCX: return "PPCISD::STCX";
+  case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
+  case PPCISD::BDNZ: return "PPCISD::BDNZ";
+  case PPCISD::BDZ: return "PPCISD::BDZ";
+  case PPCISD::MFFS: return "PPCISD::MFFS";
+  case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
+  case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET: return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
+  case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA";
+  case PPCISD::LD_TOC_L: return "PPCISD::LD_TOC_L";
+  case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L";
+  case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
+  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
+  case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
+  case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
+  case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
+  case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
+  case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
+  case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
+  case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
+  case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
+  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
+  case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
+  case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
+  case PPCISD::SC: return "PPCISD::SC";
+  }
+}
+
+EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+        return CFP->getValueAPF().isZero();
+  }
+  return false;
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+  return Op < 0 || Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  if (ShuffleKind == 0) {
+    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i), i*2))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
+    for (unsigned i = 0; i != 8; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i),   i*2+j) ||
+          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+/// The ShuffleKind distinguishes between big-endian operations with
+/// two different inputs (0), either-endian operations with two identical
+/// inputs (1), and little-endian operations with two different inputs (2).
+/// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+                               SelectionDAG &DAG) {
+  if (ShuffleKind == 0) {
+    if (DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ), i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
+        return false;
+  } else if (ShuffleKind == 2) {
+    if (!DAG.getTarget().getDataLayout()->isLittleEndian())
+      return false;
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ), i*2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
+        return false;
+  } else if (ShuffleKind == 1) {
+    unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 2;
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ), i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+8), i*2+j)   ||
+          !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  if (N->getValueType(0) != MVT::v16i8)
+    return false;
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two
+/// different inputs (0), either-endian merges with two identical inputs (1),
+/// and little-endian merges with two different inputs (2). For the latter,
+/// the input operands are swapped (see PPCInstrAltivec.td).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             unsigned ShuffleKind, SelectionDAG &DAG) {
+  if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 0, 0);
+    else if (ShuffleKind == 2) // swapped
+      return isVMerge(N, UnitSize, 0, 16);
+    else
+      return false;
+  } else {
+    if (ShuffleKind == 1) // unary
+      return isVMerge(N, UnitSize, 8, 8);
+    else if (ShuffleKind == 0) // normal
+      return isVMerge(N, UnitSize, 8, 24);
+    else
+      return false;
+  }
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+/// The ShuffleKind distinguishes between big-endian merges with two +/// different inputs (0), either-endian merges with two identical inputs (1), +/// and little-endian merges with two different inputs (2). For the latter, +/// the input operands are swapped (see PPCInstrAltivec.td). +bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, + unsigned ShuffleKind, SelectionDAG &DAG) { + if (DAG.getTarget().getDataLayout()->isLittleEndian()) { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 8, 8); + else if (ShuffleKind == 2) // swapped + return isVMerge(N, UnitSize, 8, 24); + else + return false; + } else { + if (ShuffleKind == 1) // unary + return isVMerge(N, UnitSize, 0, 0); + else if (ShuffleKind == 0) // normal + return isVMerge(N, UnitSize, 0, 16); + else + return false; + } +} + + +/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift +/// amount, otherwise return -1. +/// The ShuffleKind distinguishes between big-endian operations with two +/// different inputs (0), either-endian operations with two identical inputs +/// (1), and little-endian operations with two different inputs (2). For the +/// latter, the input operands are swapped (see PPCInstrAltivec.td). +int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, + SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::v16i8) + return -1; + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + + // Find the first non-undef value in the shuffle mask. + unsigned i; + for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i) + /*search*/; + + if (i == 16) return -1; // all undef. + + // Otherwise, check to see if the rest of the elements are consecutively + // numbered from this value. + unsigned ShiftAmt = SVOp->getMaskElt(i); + if (ShiftAmt < i) return -1; + + ShiftAmt -= i; + bool isLE = DAG.getTarget().getDataLayout()->isLittleEndian(); + + if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) + return -1; + } else if (ShuffleKind == 1) { + // Check the rest of the elements to see if they are consecutive. + for (++i; i != 16; ++i) + if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15)) + return -1; + } else + return -1; + + if (ShuffleKind == 2 && isLE) + ShiftAmt = 16 - ShiftAmt; + + return ShiftAmt; +} + +/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a splat of a single element that is suitable for input to +/// VSPLTB/VSPLTH/VSPLTW. +bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { + assert(N->getValueType(0) == MVT::v16i8 && + (EltSize == 1 || EltSize == 2 || EltSize == 4)); + + // This is a splat operation if each element of the permute is the same, and + // if the value doesn't reference the second vector. + unsigned ElementBase = N->getMaskElt(0); + + // FIXME: Handle UNDEF elements too! + if (ElementBase >= 16) + return false; + + // Check that the indices are consecutive, in the case of a multi-byte element + // splatted with a v16i8 mask. 
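+  // For example (illustrative): a big-endian VSPLTH splat of halfword 3 uses
+  // the v16i8 mask <6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7, 6,7>; ElementBase is 6
+  // and every following 2-byte unit must repeat the first one.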
+  for (unsigned i = 1; i != EltSize; ++i)
+    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+      return false;
+
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getMaskElt(i) < 0) continue;
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getMaskElt(i+j) != N->getMaskElt(j))
+        return false;
+  }
+  return true;
+}
+
+/// isAllNegativeZeroVector - Returns true if all elements of build_vector
+/// are -0.0.
+bool PPC::isAllNegativeZeroVector(SDNode *N) {
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+
+  APInt APVal, APUndef;
+  unsigned BitSize;
+  bool HasAnyUndefs;
+
+  if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32, true))
+    if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+      return CFP->getValueAPF().isNegZero();
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
+                                SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(SVOp, EltSize));
+  if (DAG.getTarget().getDataLayout()->isLittleEndian())
+    return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
+  else
+    return SVOp->getMaskElt(0) / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDValue OpVal(nullptr, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+  // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDValue UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+    // See if all of the elements in the buildvector agree across chunks.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      // If the element isn't a constant, bail fully out.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+      if (!UniquedVals[i&(Multiple-1)].getNode())
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDValue();  // no match.
+    }
+
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.
+
+      LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+      LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+    }
+    // Finally, check the least significant entry.
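+    // For example (illustrative): checking a v8i16 build_vector
+    // {0,4, 0,4, 0,4, 0,4} with ByteSize == 4 gives Multiple == 2 and
+    // UniquedVals == {0, 4}; the leading entry is zero and 4 < 16, so the
+    // whole vector is expressible as vspltisw(4).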
+ if (LeadingZero) { + if (!UniquedVals[Multiple-1].getNode()) + return DAG.getTargetConstant(0, MVT::i32); // 0,0,0,undef + int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); + if (Val < 16) + return DAG.getTargetConstant(Val, MVT::i32); // 0,0,0,4 -> vspltisw(4) + } + if (LeadingOnes) { + if (!UniquedVals[Multiple-1].getNode()) + return DAG.getTargetConstant(~0U, MVT::i32); // -1,-1,-1,undef + int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue(); + if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2) + return DAG.getTargetConstant(Val, MVT::i32); + } + + return SDValue(); + } + + // Check to see if this buildvec has a single non-undef value in its elements. + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue; + if (!OpVal.getNode()) + OpVal = N->getOperand(i); + else if (OpVal != N->getOperand(i)) + return SDValue(); + } + + if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def. + + unsigned ValSizeInBytes = EltSize; + uint64_t Value = 0; + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) { + Value = CN->getZExtValue(); + } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) { + assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!"); + Value = FloatToBits(CN->getValueAPF().convertToFloat()); + } + + // If the splat value is larger than the element value, then we can never do + // this splat. The only case that we could fit the replicated bits into our + // immediate field for would be zero, and we prefer to use vxor for it. + if (ValSizeInBytes < ByteSize) return SDValue(); + + // If the element value is larger than the splat value, cut it in half and + // check to see if the two halves are equal. Continue doing this until we + // get to ByteSize. This allows us to handle 0x01010101 as 0x01. + while (ValSizeInBytes > ByteSize) { + ValSizeInBytes >>= 1; + + // If the top half equals the bottom half, we're still ok. + if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) != + (Value & ((1 << (8*ValSizeInBytes))-1))) + return SDValue(); + } + + // Properly sign extend the value. + int MaskVal = SignExtend32(Value, ByteSize * 8); + + // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros. + if (MaskVal == 0) return SDValue(); + + // Finally, if this value fits in a 5 bit sext field, return it + if (SignExtend32<5>(MaskVal) == MaskVal) + return DAG.getTargetConstant(MaskVal, MVT::i32); + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Addressing Mode Selection +//===----------------------------------------------------------------------===// + +/// isIntS16Immediate - This method tests to see if the node is either a 32-bit +/// or 64-bit immediate, and if the value can be accurately represented as a +/// sign extension from a 16-bit value. If so, this returns true and the +/// immediate. 
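+/// For example (illustrative), 32767 (0x7fff) qualifies, while 32768 (0x8000)
+/// does not: sign-extending its low 16 bits yields -32768, not 32768.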
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified address, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+                                            SDValue &Index,
+                                            SelectionDAG &DAG) const {
+  short imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+      return false;    // r+i
+
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i; fold the immediate if we can.
+
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are provably
+    // disjoint.
+    APInt LHSKnownZero, LHSKnownOne;
+    APInt RHSKnownZero, RHSKnownOne;
+    DAG.computeKnownBits(N.getOperand(0),
+                         LHSKnownZero, LHSKnownOne);
+
+    if (LHSKnownZero.getBoolValue()) {
+      DAG.computeKnownBits(N.getOperand(1),
+                           RHSKnownZero, RHSKnownOne);
+      // If all of the bits are known zero on the LHS or RHS, the add won't
+      // carry.
+      if (~(LHSKnownZero | RHSKnownZero) == 0) {
+        Base = N.getOperand(0);
+        Index = N.getOperand(1);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4).  The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot.  As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+  // FIXME: This does not handle the LWA case.
+  if (VT != MVT::i64)
+    return;
+
+  // NOTE: We'll exclude negative FIs here, which come from argument
+  // lowering, because there are no known test cases triggering this problem
+  // using packed structures (or similar).  We can remove this exclusion if
+  // we find such a test case.  The reason why this is so test-case driven is
+  // because this entire 'fixup' is only to prevent crashes (from the
+  // register scavenger) on not-really-valid inputs.  For example, if we have:
+  //   %a = alloca i1
+  //   %b = bitcast i1* %a to i64*
+  //   store i64 %v, i64* %b   ; for some i64 value %v
+  // then the store should really be marked as 'align 1', but is not.  If it
+  // were marked as 'align 1' then the indexed form would have been
+  // instruction-selected initially, and the problem this 'fixup' is preventing
+  // won't happen regardless.
+ if (FrameIdx < 0) + return; + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + if (Align >= 4) + return; + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasNonRISpills(); +} + +/// Returns true if the address N can be represented by a base register plus +/// a signed 16-bit displacement [r+imm], and if it is not better +/// represented as reg+reg. If Aligned is true, only accept displacements +/// suitable for STD and friends, i.e. multiples of 4. +bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, + SDValue &Base, + SelectionDAG &DAG, + bool Aligned) const { + // FIXME dl should come from parent load or store, not from address + SDLoc dl(N); + // If this can be more profitably realized as r+r, fail. + if (SelectAddressRegReg(N, Disp, Base, DAG)) + return false; + + if (N.getOpcode() == ISD::ADD) { + short imm = 0; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { + Disp = DAG.getTargetConstant(imm, N.getValueType()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } + return true; // [r+i] + } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) { + // Match LOAD (ADD (X, Lo(G))). + assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue() + && "Cannot handle constant offsets yet!"); + Disp = N.getOperand(1).getOperand(0); // The global address. + assert(Disp.getOpcode() == ISD::TargetGlobalAddress || + Disp.getOpcode() == ISD::TargetGlobalTLSAddress || + Disp.getOpcode() == ISD::TargetConstantPool || + Disp.getOpcode() == ISD::TargetJumpTable); + Base = N.getOperand(0); + return true; // [&g+r] + } + } else if (N.getOpcode() == ISD::OR) { + short imm = 0; + if (isIntS16Immediate(N.getOperand(1), imm) && + (!Aligned || (imm & 3) == 0)) { + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are + // provably disjoint. + APInt LHSKnownZero, LHSKnownOne; + DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne); + + if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) { + // If all of the bits are known zero on the LHS or RHS, the add won't + // carry. + if (FrameIndexSDNode *FI = + dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + fixupFuncForFI(DAG, FI->getIndex(), N.getValueType()); + } else { + Base = N.getOperand(0); + } + Disp = DAG.getTargetConstant(imm, N.getValueType()); + return true; + } + } + } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { + // Loading from a constant address. + + // If this address fits entirely in a 16-bit sext immediate field, codegen + // this as "d, 0" + short Imm; + if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + Disp = DAG.getTargetConstant(Imm, CN->getValueType(0)); + Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, + CN->getValueType(0)); + return true; + } + + // Handle 32-bit sext immediates with LIS + addr mode. 
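+    // For example (illustrative): the address 0x12348000 splits into
+    //   lis rX, 0x1235    ; (0x12348000 - (short)0x8000) >> 16
+    //   ... -32768(rX)    ; the sign-extended low 16 bits
+    // where the high part is adjusted because the low half is negative.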
+  if ((CN->getValueType(0) == MVT::i32 ||
+       (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+      (!Aligned || (CN->getZExtValue() & 3) == 0)) {
+    int Addr = (int)CN->getZExtValue();
+
+    // Otherwise, break this down into an LIS + disp.
+    Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
+
+    Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
+    unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+    Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
+    return true;
+  }
+  }
+
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+  } else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified address, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+                                                SDValue &Index,
+                                                SelectionDAG &DAG) const {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                         N.getValueType());
+  Index = N;
+  return true;
+}
+
+/// getPreIndexedAddressParts - Returns true by value, and sets the base
+/// pointer, offset pointer, and addressing mode by reference, if the node's
+/// address can be legally represented as a pre-indexed load/store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  if (DisablePPCPreinc) return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  unsigned Alignment;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getMemoryVT();
+    Alignment = LD->getAlignment();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+    Alignment = ST->getAlignment();
+    isLoad = false;
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors.
+  if (VT.isVector())
+    return false;
+
+  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
+
+    // Common code will reject creating a pre-inc form if the base pointer
+    // is a frame index, or if N is a store and the base pointer is either
+    // the same as or a predecessor of the value being stored.  Check for
+    // those situations here, and try with swapped Base/Offset instead.
+    bool Swap = false;
+
+    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
+      Swap = true;
+    else if (!isLoad) {
+      SDValue Val = cast<StoreSDNode>(N)->getValue();
+      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
+        Swap = true;
+    }
+
+    if (Swap)
+      std::swap(Base, Offset);
+
+    AM = ISD::PRE_INC;
+    return true;
+  }
+
+  // LDU/STU can only handle immediates that are a multiple of 4.
+ if (VT != MVT::i64) { + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + return false; + } else { + // LDU/STU need an address with at least 4-byte alignment. + if (Alignment < 4) + return false; + + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + return false; + } + + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { + // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of + // sext i32 to i64 when addr mode is r+i. + if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 && + LD->getExtensionType() == ISD::SEXTLOAD && + isa<ConstantSDNode>(Offset)) + return false; + } + + AM = ISD::PRE_INC; + return true; +} + +//===----------------------------------------------------------------------===// +// LowerOperation implementation +//===----------------------------------------------------------------------===// + +/// GetLabelAccessInfo - Return true if we should reference labels using a +/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags. +static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags, + unsigned &LoOpFlags, + const GlobalValue *GV = nullptr) { + HiOpFlags = PPCII::MO_HA; + LoOpFlags = PPCII::MO_LO; + + // Don't use the pic base if not in PIC relocation model. + bool isPIC = TM.getRelocationModel() == Reloc::PIC_; + + if (isPIC) { + HiOpFlags |= PPCII::MO_PIC_FLAG; + LoOpFlags |= PPCII::MO_PIC_FLAG; + } + + // If this is a reference to a global value that requires a non-lazy-ptr, make + // sure that instruction lowering adds it. + if (GV && TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM)) { + HiOpFlags |= PPCII::MO_NLP_FLAG; + LoOpFlags |= PPCII::MO_NLP_FLAG; + + if (GV->hasHiddenVisibility()) { + HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; + LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG; + } + } + + return isPIC; +} + +static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC, + SelectionDAG &DAG) { + EVT PtrVT = HiPart.getValueType(); + SDValue Zero = DAG.getConstant(0, PtrVT); + SDLoc DL(HiPart); + + SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero); + SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero); + + // With PIC, the first instruction is actually "GR+hi(&G)". + if (isPIC) + Hi = DAG.getNode(ISD::ADD, DL, PtrVT, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi); + + // Generate non-pic code that has direct accesses to the constant pool. + // The address of the global is just (hi(&g)+lo(&g)). + return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo); +} + +SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + const Constant *C = CP->getConstVal(); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. 
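+  // Roughly (illustrative), the TOC_ENTRY node later becomes a load out of
+  // the TOC, addressed relative to the TOC pointer in X2, e.g.:
+  //   addis rX, r2, .LC0@toc@ha
+  //   ld    rX, .LC0@toc@l(rX)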
+ if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA, + DAG.getRegister(PPC::X2, MVT::i64)); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), + PPCII::MO_PIC_FLAG); + SDLoc DL(CP); + return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + + SDValue CPIHi = + DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag); + SDValue CPILo = + DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag); + return LowerLabelRef(CPIHi, CPILo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + // 64-bit SVR4 ABI code is always position-independent. + // The actual address of the GlobalValue is stored in the TOC. + if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA, + DAG.getRegister(PPC::X2, MVT::i64)); + } + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + + if (isPIC && Subtarget.isSVR4ABI()) { + SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + PPCII::MO_PIC_FLAG); + SDLoc DL(GA); + return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), PtrVT, GA, + DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT)); + } + + SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag); + SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag); + return LowerLabelRef(JTIHi, JTILo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = Op.getValueType(); + + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + + unsigned MOHiFlag, MOLoFlag; + bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag); + SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag); + SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag); + return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG); +} + +SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + + // FIXME: TLS addresses currently use medium model code sequences, + // which is the most useful form. Eventually support for small and + // large models could be added if users need it, at the cost of + // additional complexity. + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + SDLoc dl(GA); + const GlobalValue *GV = GA->getGlobal(); + EVT PtrVT = getPointerTy(); + bool is64bit = Subtarget.isPPC64(); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GV); + + if (Model == TLSModel::LocalExec) { + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL_HA); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TPREL_LO); + SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2, + is64bit ? 
MVT::i64 : MVT::i32); + SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg); + return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi); + } + + if (Model == TLSModel::InitialExec) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_TLS); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, + PtrVT, GOTReg, TGA); + } else + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, + PtrVT, TGA, GOTPtr); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); + } + + if (Model == TLSModel::GeneralDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, + GOTPtr, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. + SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLS_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, is64bit ? PPC::X3 : PPC::R3, PtrVT); + } + + if (Model == TLSModel::LocalDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTPtr; + if (is64bit) { + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + } else { + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, + GOTPtr, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. + SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, + is64bit ? PPC::X3 : PPC::R3, GOTEntry); + SDValue ParmReg = DAG.getRegister(is64bit ? PPC::X3 : PPC::R3, + is64bit ? MVT::i64 : MVT::i32); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLSLD_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, is64bit ? 
PPC::X3 : PPC::R3, TLSAddr);
+    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
+                                      Chain, ParmReg, TGA);
+    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
+  }
+
+  llvm_unreachable("Unknown TLS model!");
+}
+
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GSDN);
+  const GlobalValue *GV = GSDN->getGlobal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
+
+  if (isPIC && Subtarget.isSVR4ABI()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
+                                            GSDN->getOffset(),
+                                            PPCII::MO_PIC_FLAG);
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i32, GA,
+                       DAG.getNode(PPCISD::GlobalBaseReg, DL, MVT::i32));
+  }
+
+  SDValue GAHi =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
+  SDValue GALo =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
+
+  SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG);
+
+  // If the global reference is actually to a non-lazy-pointer, we have to do an
+  // extra load to get the address of the global.
+  if (MOHiFlag & PPCII::MO_NLP_FLAG)
+    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
+                      false, false, false, 0);
+  return Ptr;
+}
+
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc dl(Op);
+
+  if (Op.getValueType() == MVT::v2i64) {
+    // When the operands themselves are v2i64 values, we need to do something
+    // special because VSX has no underlying comparison operations for these.
+    if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+      // Equality can be handled by casting to the legal type for Altivec
+      // comparisons, everything else needs to be expanded.
+      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+        return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                 DAG.getSetCC(dl, MVT::v4i32,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
+                   CC));
+      }
+
+      return SDValue();
+    }
+
+    // We handle most of these in the usual way.
+    return Op;
+  }
+
+  // If we're comparing for equality to zero, expose the fact that this is
+  // implemented as a ctlz/srl pair on PPC, so that the dag combiner can
+  // fold the new nodes.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (C->isNullValue() && CC == ISD::SETEQ) {
+      EVT VT = Op.getOperand(0).getValueType();
+      SDValue Zext = Op.getOperand(0);
+      if (VT.bitsLT(MVT::i32)) {
+        VT = MVT::i32;
+        Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
+      }
+      unsigned Log2b = Log2_32(VT.getSizeInBits());
+      SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
+      SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
+                                DAG.getConstant(Log2b, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+    }
+    // Leave comparisons against 0 and -1 alone for now, since they're usually
+    // optimized.  FIXME: revisit this when we can custom lower all setcc
+    // optimizations.
+    if (C->isAllOnesValue() || C->isNullValue())
+      return SDValue();
+  }
+
+  // If we have an integer seteq/setne, turn it into a compare against zero
+  // by xor'ing the rhs with the lhs, which is faster than setting a
+  // condition register, reading it back out, and masking the correct bit.  The
+  // normal approach here uses sub to do this instead of xor.  Using xor exposes
+  // the result to other bit-twiddling opportunities.
+  EVT LHSVT = Op.getOperand(0).getValueType();
+  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    EVT VT = Op.getValueType();
+    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
+                              Op.getOperand(1));
+    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+
+  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
+
+  // gpr_index
+  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    VAListPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = GprIndex.getValue(1);
+
+  if (VT == MVT::i64) {
+    // Check if GprIndex is even
+    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
+                                 DAG.getConstant(1, MVT::i32));
+    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
+                                DAG.getConstant(0, MVT::i32), ISD::SETNE);
+    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
+                                          DAG.getConstant(1, MVT::i32));
+    // Align GprIndex to be even if it isn't
+    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
+                           GprIndex);
+  }
+
+  // fpr index is 1 byte after gpr
+  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                               DAG.getConstant(1, MVT::i32));
+
+  // fpr
+  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    FprPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = FprIndex.getValue(1);
+
+  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                       DAG.getConstant(8, MVT::i32));
+
+  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                        DAG.getConstant(4, MVT::i32));
+
+  // areas
+  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
+                                     MachinePointerInfo(), false, false,
+                                     false, 0);
+  InChain = OverflowArea.getValue(1);
+
+  SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
+                                    MachinePointerInfo(), false, false,
+                                    false, 0);
+  InChain = RegSaveArea.getValue(1);
+
+  // select overflow_area if index >= 8
+  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
+                            DAG.getConstant(8, MVT::i32), ISD::SETLT);
+
+  // adjustment constant gpr_index * 4/8
+  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
+                                    VT.isInteger() ? GprIndex : FprIndex,
+                                    DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                                    MVT::i32));
+
+  // OurReg = RegSaveArea + RegConstant
+  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
+                               RegConstant);
+
+  // Floating types are 32 bytes into RegSaveArea
+  if (VT.isFloatingPoint())
+    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
+                         DAG.getConstant(32, MVT::i32));
+
+  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
+  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   VT.isInteger() ? GprIndex : FprIndex,
+                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1,
+                                                   MVT::i32));
+
+  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
+                              VT.isInteger() ? VAListPtr : FprPtr,
+                              MachinePointerInfo(SV),
+                              MVT::i8, false, false, 0);
+
+  // determine if we should load from reg_save_area or overflow_area
+  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
+
+  // increase overflow_area by 4/8 if gpr/fpr >= 8
+  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
+                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                                          MVT::i32));
+
+  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
+                             OverflowAreaPlusN);
+
+  InChain = DAG.getTruncStore(InChain, dl, OverflowArea,
+                              OverflowAreaPtr,
+                              MachinePointerInfo(),
+                              MVT::i32, false, false, 0);
+
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG,
+                                       const PPCSubtarget &Subtarget) const {
+  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
+
+  // We have to copy the entire va_list struct:
+  // 2*sizeof(char) + 2 bytes of padding + 2*sizeof(char*) = 12 bytes
+  return DAG.getMemcpy(Op.getOperand(0), Op,
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(12, MVT::i32), 8, false, true,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  SDLoc dl(Op);
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+  Type *IntPtrTy =
+    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
+                                                             *DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp; Args.push_back(Entry);
+
+  // TrampSize == (isPPC64 ? 48 : 40);
+  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
+                               isPPC64 ?
MVT::i64 : MVT::i32); + Args.push_back(Entry); + + Entry.Node = FPtr; Args.push_back(Entry); + Entry.Node = Nest; Args.push_back(Entry); + + // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol("__trampoline_setup", PtrVT), + std::move(Args), 0); + + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.second; +} + +SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const { + MachineFunction &MF = DAG.getMachineFunction(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + SDLoc dl(Op); + + if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), + MachinePointerInfo(SV), + false, false, 0); + } + + // For the 32-bit SVR4 ABI we follow the layout of the va_list struct. + // We suppose the given va_list is already allocated. + // + // typedef struct { + // char gpr; /* index into the array of 8 GPRs + // * stored in the register save area + // * gpr=0 corresponds to r3, + // * gpr=1 to r4, etc. + // */ + // char fpr; /* index into the array of 8 FPRs + // * stored in the register save area + // * fpr=0 corresponds to f1, + // * fpr=1 to f2, etc. + // */ + // char *overflow_arg_area; + // /* location on stack that holds + // * the next overflow argument + // */ + // char *reg_save_area; + // /* where r3:r10 and f1:f8 (if saved) + // * are stored + // */ + // } va_list[1]; + + + SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), MVT::i32); + SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), MVT::i32); + + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(), + PtrVT); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + PtrVT); + + uint64_t FrameOffset = PtrVT.getSizeInBits()/8; + SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT); + + uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1; + SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT); + + uint64_t FPROffset = 1; + SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT); + + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + + // Store first byte : number of int regs + SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, + Op.getOperand(1), + MachinePointerInfo(SV), + MVT::i8, false, false, 0); + uint64_t nextOffset = FPROffset; + SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1), + ConstFPROffset); + + // Store second byte : number of float regs + SDValue secondStore = + DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, + MachinePointerInfo(SV, nextOffset), MVT::i8, + false, false, 0); + nextOffset += StackOffset; + nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset); + + // Store second word : arguments given on stack + SDValue thirdStore = + DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, + MachinePointerInfo(SV, nextOffset), + false, false, 0); + nextOffset 
+= FrameOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+  // Store third word : arguments given in registers
+  return DAG.getStore(thirdStore, dl, FR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset),
+                      false, false, 0);
+
+}
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  return true;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                             MVT &LocVT,
+                                             CCValAssign::LocInfo &LocInfo,
+                                             ISD::ArgFlagsTy &ArgFlags,
+                                             CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // Skip one register if the first unallocated register has an even register
+  // number and there are still argument registers available which have not been
+  // allocated yet. RegNum is actually an index into ArgRegs, which means we
+  // need to skip a register if RegNum is odd.
+  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the first
+  // unallocated register has an odd register number and does not actually
+  // allocate a register for the current argument.
+  return false;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                               MVT &LocVT,
+                                               CCValAssign::LocInfo &LocInfo,
+                                               ISD::ArgFlagsTy &ArgFlags,
+                                               CCState &State) {
+  static const MCPhysReg ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // If there is only one floating-point register left we need to put both f64
+  // values of a split ppc_fp128 value on the stack.
+  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the two f64
+  // values a ppc_fp128 value is split into are both passed in registers or both
+  // passed on the stack and does not actually allocate a register for the
+  // current argument.
+  return false;
+}
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments
+/// on Darwin.
+static const MCPhysReg *GetFPR() {
+  static const MCPhysReg FPR[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+  };
+
+  return FPR;
+}
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  unsigned ArgSize = ArgVT.getStoreSize();
+  if (Flags.isByVal())
+    ArgSize = Flags.getByValSize();
+
+  // Round up to multiples of the pointer size, except for array members,
+  // which are always packed.
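+  // For example (illustrative): a 13-byte ByVal with an 8-byte pointer size
+  // reserves 16 bytes, but as an array member it stays at 13 bytes.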
+ if (!Flags.isInConsecutiveRegs()) + ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + + return ArgSize; +} + +/// CalculateStackSlotAlignment - Calculates the alignment of this argument +/// on the stack. +static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize) { + unsigned Align = PtrByteSize; + + // Altivec parameters are padded to a 16 byte boundary. + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + Align = 16; + + // ByVal parameters are aligned as requested. + if (Flags.isByVal()) { + unsigned BVAlign = Flags.getByValAlign(); + if (BVAlign > PtrByteSize) { + if (BVAlign % PtrByteSize != 0) + llvm_unreachable( + "ByVal alignment is not a multiple of the pointer size"); + + Align = BVAlign; + } + } + + // Array members are always packed to their original alignment. + if (Flags.isInConsecutiveRegs()) { + // If the array member was split into multiple registers, the first + // needs to be aligned to the size of the full type. (Except for + // ppcf128, which is only aligned as its f64 components.) + if (Flags.isSplit() && OrigVT != MVT::ppcf128) + Align = OrigVT.getStoreSize(); + else + Align = ArgVT.getStoreSize(); + } + + return Align; +} + +/// CalculateStackSlotUsed - Return whether this argument will use its +/// stack slot (instead of being passed in registers). ArgOffset, +/// AvailableFPRs, and AvailableVRs must hold the current argument +/// position, and will be updated to account for this argument. +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, + ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, + unsigned LinkageSize, + unsigned ParamAreaSize, + unsigned &ArgOffset, + unsigned &AvailableFPRs, + unsigned &AvailableVRs) { + bool UseMemory = false; + + // Respect alignment of argument on the stack. + unsigned Align = + CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + // If there's no space left in the argument save area, we must + // use memory (this check also catches zero-sized arguments). + if (ArgOffset >= LinkageSize + ParamAreaSize) + UseMemory = true; + + // Allocate argument on the stack. + ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // If we overran the argument save area, we must use memory + // (this check catches arguments passed partially in memory) + if (ArgOffset > LinkageSize + ParamAreaSize) + UseMemory = true; + + // However, if the argument is actually passed in an FPR or a VR, + // we don't use memory after all. + if (!Flags.isByVal()) { + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) + if (AvailableFPRs > 0) { + --AvailableFPRs; + return false; + } + if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || + ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || + ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) + if (AvailableVRs > 0) { + --AvailableVRs; + return false; + } + } + + return UseMemory; +} + +/// EnsureStackAlignment - Round stack frame size up from NumBytes to +/// ensure minimum alignment required for target. 
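+/// For example (illustrative), NumBytes == 52 with a 16-byte stack alignment
+/// is rounded up to 64.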
+static unsigned EnsureStackAlignment(const TargetMachine &Target, + unsigned NumBytes) { + unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment(); + unsigned AlignMask = TargetAlign - 1; + NumBytes = (NumBytes + AlignMask) & ~AlignMask; + return NumBytes; +} + +SDValue +PPCTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) + return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + else + return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + } else { + return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, + dl, DAG, InVals); + } +} + +SDValue +PPCTargetLowering::LowerFormalArguments_32SVR4( + SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // 32-bit SVR4 ABI Stack Frame Layout: + // +-----------------------------------+ + // +--> | Back chain | + // | +-----------------------------------+ + // | | Floating-point register save area | + // | +-----------------------------------+ + // | | General register save area | + // | +-----------------------------------+ + // | | CR save word | + // | +-----------------------------------+ + // | | VRSAVE save word | + // | +-----------------------------------+ + // | | Alignment padding | + // | +-----------------------------------+ + // | | Vector register save area | + // | +-----------------------------------+ + // | | Local variable space | + // | +-----------------------------------+ + // | | Parameter list area | + // | +-----------------------------------+ + // | | LR save word | + // | +-----------------------------------+ + // SP--> +--- | Back chain | + // +-----------------------------------+ + // + // Specifications: + // System V Application Binary Interface PowerPC Processor Supplement + // AltiVec Technology Programming Interface Manual + + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + // Potential tail calls could cause overwriting of argument stack slots. + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); + unsigned PtrByteSize = 4; + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Reserve space for the linkage area on the stack. + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false, false); + CCInfo.AllocateStack(LinkageSize, PtrByteSize); + + CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + // Arguments stored in registers. 
+ if (VA.isRegLoc()) { + const TargetRegisterClass *RC; + EVT ValVT = VA.getValVT(); + + switch (ValVT.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("ValVT not supported by formal arguments Lowering"); + case MVT::i1: + case MVT::i32: + RC = &PPC::GPRCRegClass; + break; + case MVT::f32: + RC = &PPC::F4RCRegClass; + break; + case MVT::f64: + if (Subtarget.hasVSX()) + RC = &PPC::VSFRCRegClass; + else + RC = &PPC::F8RCRegClass; + break; + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v4f32: + RC = &PPC::VRRCRegClass; + break; + case MVT::v2f64: + case MVT::v2i64: + RC = &PPC::VSHRCRegClass; + break; + } + + // Transform the arguments stored in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, + ValVT == MVT::i1 ? MVT::i32 : ValVT); + + if (ValVT == MVT::i1) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + + InVals.push_back(ArgValue); + } else { + // Argument stored in memory. + assert(VA.isMemLoc()); + + unsigned ArgSize = VA.getLocVT().getStoreSize(); + int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(), + isImmutable); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo(), + false, false, false, 0)); + } + } + + // Assign locations to all of the incoming aggregate by value arguments. + // Aggregates passed by value are stored in the local variable space of the + // caller's stack frame, right above the parameter list area. + SmallVector<CCValAssign, 16> ByValArgLocs; + CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ByValArgLocs, *DAG.getContext()); + + // Reserve stack space for the allocations in CCInfo. + CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); + + CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); + + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); + MinReservedArea = std::max(MinReservedArea, LinkageSize); + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized function's reserved stack space needs to be aligned so that + // taking the difference between two stack areas will result in an aligned + // stack. + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); + + SmallVector<SDValue, 8> MemOps; + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + if (isVarArg) { + static const MCPhysReg GPArgRegs[] = { + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + const unsigned NumGPArgRegs = array_lengthof(GPArgRegs); + + static const MCPhysReg FPArgRegs[] = { + PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, + PPC::F8 + }; + const unsigned NumFPArgRegs = array_lengthof(FPArgRegs); + + FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs, + NumGPArgRegs)); + FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs, + NumFPArgRegs)); + + // Make room for NumGPArgRegs and NumFPArgRegs. 
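//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the Depth computed
// next sizes the vararg register save area for every integer and FP argument
// register, i.e. on 32-bit SVR4:
//
//   8 GPRs (r3-r10) * 4 bytes + 8 FPRs (f1-f8) * 8 bytes == 96 bytes.

static_assert(8 * 4 + 8 * 8 == 96, "PPC32 vararg register save area");
//===----------------------------------------------------------------------===//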
+ int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 + + NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8; + + FuncInfo->setVarArgsStackOffset( + MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, + CCInfo.getNextStackOffset(), true)); + + FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + + // The fixed integer arguments of a variadic function are stored to the + // VarArgsFrameIndex on the stack so that they may be loaded by deferencing + // the result of va_next. + for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) { + // Get an existing live-in vreg, or add a new one. + unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]); + if (!VReg) + VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(), false, false, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT); + FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); + } + + // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6 + // is set. + // The double arguments are stored to the VarArgsFrameIndex + // on the stack. + for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) { + // Get an existing live-in vreg, or add a new one. + unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]); + if (!VReg) + VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(), false, false, 0); + MemOps.push_back(Store); + // Increment the address by eight for the next argument to store + SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8, + PtrVT); + FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); + } + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + + return Chain; +} + +// PPC64 passes i8, i16, and i32 values in i64 registers. Promote +// value to MVT::i64 and then truncate to the correct register size. +SDValue +PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, + SelectionDAG &DAG, SDValue ArgVal, + SDLoc dl) const { + if (Flags.isSExt()) + ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + else if (Flags.isZExt()) + ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal, + DAG.getValueType(ObjectVT)); + + return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal); +} + +SDValue +PPCTargetLowering::LowerFormalArguments_64SVR4( + SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // TODO: add description of PPC stack frame format, or at least some docs. + // + bool isELFv2ABI = Subtarget.isELFv2ABI(); + bool isLittleEndian = Subtarget.isLittleEndian(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + // Potential tail calls could cause overwriting of argument stack slots. 
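//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the isImmutable
// flag computed next, restated as a standalone predicate with an illustrative
// name. Fixed argument objects stay immutable unless guaranteed tail-call
// optimization with the fast calling convention may rewrite the incoming
// argument area in place.

static bool argSlotsImmutable(bool GuaranteedTCO, bool IsFastCC) {
  return !(GuaranteedTCO && IsFastCC);
}
//===----------------------------------------------------------------------===//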
+ bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); + unsigned PtrByteSize = 8; + + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false, + isELFv2ABI); + + static const MCPhysReg GPR[] = { + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + + static const MCPhysReg *FPR = GetFPR(); + + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + static const MCPhysReg VSRH[] = { + PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8, + PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13 + }; + + const unsigned Num_GPR_Regs = array_lengthof(GPR); + const unsigned Num_FPR_Regs = 13; + const unsigned Num_VR_Regs = array_lengthof(VR); + + // Do a first pass over the arguments to determine whether the ABI + // guarantees that our caller has allocated the parameter save area + // on its stack frame. In the ELFv1 ABI, this is always the case; + // in the ELFv2 ABI, it is true if this is a vararg function or if + // any parameter is located in a stack slot. + + bool HasParameterArea = !isELFv2ABI || isVarArg; + unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize; + unsigned NumBytes = LinkageSize; + unsigned AvailableFPRs = Num_FPR_Regs; + unsigned AvailableVRs = Num_VR_Regs; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) + if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytes, AvailableFPRs, AvailableVRs)) + HasParameterArea = true; + + // Add DAG nodes to load the arguments or copy them out of registers. On + // entry to a function on PPC, the arguments start after the linkage area, + // although the first ones are often in registers. + + unsigned ArgOffset = LinkageSize; + unsigned GPR_idx, FPR_idx = 0, VR_idx = 0; + SmallVector<SDValue, 8> MemOps; + Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { + SDValue ArgVal; + bool needsLoad = false; + EVT ObjectVT = Ins[ArgNo].VT; + EVT OrigVT = Ins[ArgNo].ArgVT; + unsigned ObjSize = ObjectVT.getStoreSize(); + unsigned ArgSize = ObjSize; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[ArgNo].OrigArgIndex; + + /* Respect alignment of argument on the stack. */ + unsigned Align = + CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize); + ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; + unsigned CurArgOffset = ArgOffset; + + /* Compute GPR index associated with argument offset. */ + GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx = std::min(GPR_idx, Num_GPR_Regs); + + // FIXME the codegen can be much improved in some cases. + // We do not have to keep everything in memory. + if (Flags.isByVal()) { + // ObjSize is the true size, ArgSize rounded up to multiple of registers. + ObjSize = Flags.getByValSize(); + ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // Empty aggregate parameters do not take up registers. Examples: + // struct { } a; + // union { } b; + // int c[0]; + // etc. However, we have to provide a place-holder in InVals, so + // pretend we have an 8-byte item at the current address for that + // purpose. 
+ if (!ObjSize) { + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(FIN); + continue; + } + + // Create a stack object covering all stack doublewords occupied + // by the argument. If the argument is (fully or partially) on + // the stack, or if the argument is fully in registers but the + // caller has allocated the parameter save anyway, we can refer + // directly to the caller's stack frame. Otherwise, create a + // local copy in our own frame. + int FI; + if (HasParameterArea || + ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize) + FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true); + else + FI = MFI->CreateStackObject(ArgSize, Align, false); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + + // Handle aggregates smaller than 8 bytes. + if (ObjSize < PtrByteSize) { + // The value of the object is its address, which differs from the + // address of the enclosing doubleword on big-endian systems. + SDValue Arg = FIN; + if (!isLittleEndian) { + SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, PtrVT); + Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff); + } + InVals.push_back(Arg); + + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store; + + if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + EVT ObjType = (ObjSize == 1 ? MVT::i8 : + (ObjSize == 2 ? MVT::i16 : MVT::i32)); + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, + MachinePointerInfo(FuncArg), + ObjType, false, false, 0); + } else { + // For sizes that don't fit a truncating store (3, 5, 6, 7), + // store the whole register as-is to the parameter save area + // slot. + Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg), + false, false, 0); + } + + MemOps.push_back(Store); + } + // Whether we copied from a register or not, advance the offset + // into the parameter save area by a full doubleword. + ArgOffset += PtrByteSize; + continue; + } + + // The value of the object is its address, which is the address of + // its first stack doubleword. + InVals.push_back(FIN); + + // Store whatever pieces of the object are in registers to memory. + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { + if (GPR_idx == Num_GPR_Regs) + break; + + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Addr = FIN; + if (j) { + SDValue Off = DAG.getConstant(j, PtrVT); + Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); + } + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + } + ArgOffset += ArgSize; + continue; + } + + switch (ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + // These can be scalar arguments or elements of an integer array type + // passed directly. Clang may use those instead of "byval" aggregate + // types to avoid forcing arguments to memory unnecessarily. + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + // PPC64 passes i8, i16, and i32 values in i64 registers. 
Promote + // value to MVT::i64 and then truncate to the correct register size. + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + ArgOffset += 8; + break; + + case MVT::f32: + case MVT::f64: + // These can be scalar arguments or elements of a float array type + // passed directly. The latter are used to implement ELFv2 homogenous + // float aggregates. + if (FPR_idx != Num_FPR_Regs) { + unsigned VReg; + + if (ObjectVT == MVT::f32) + VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); + else + VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ? + &PPC::VSFRCRegClass : + &PPC::F8RCRegClass); + + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++FPR_idx; + } else if (GPR_idx != Num_GPR_Regs) { + // This can only ever happen in the presence of f32 array types, + // since otherwise we never run out of FPRs before running out + // of GPRs. + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::f32) { + if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0)) + ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal, + DAG.getConstant(32, MVT::i32)); + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal); + } + + ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal); + } else { + needsLoad = true; + } + + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize; + ArgOffset += ArgSize; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ? + MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) : + MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++VR_idx; + } else { + needsLoad = true; + } + ArgOffset += 16; + break; + } + + // We need to load the argument to a virtual register if we determined + // above that we ran out of physical registers of the appropriate type. + if (needsLoad) { + if (ObjSize < ArgSize && !isLittleEndian) + CurArgOffset += ArgSize - ObjSize; + int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, false, 0); + } + + InVals.push_back(ArgVal); + } + + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea; + if (HasParameterArea) + MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); + else + MinReservedArea = LinkageSize; + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized functions' reserved stack space needs to be aligned so that + // taking the difference between two stack areas will result in an aligned + // stack. 
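//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the floor applied
// just above, with the concrete linkage-area sizes quoted later in this file
// (ELFv1: 48 bytes, ELFv2: 32 bytes). The name is illustrative.

#include <algorithm>

static unsigned minReservedArea64(bool HasParameterArea, unsigned ArgOffset,
                                  unsigned LinkageSize) {
  // With a parameter save area, never reserve less than the linkage area
  // plus all eight GPR doubleword slots.
  return HasParameterArea ? std::max(ArgOffset, LinkageSize + 8 * 8u)
                          : LinkageSize;
}

// ELFv1: at least 48 + 64 == 112 bytes; ELFv2 without a parameter area: 32.
//===----------------------------------------------------------------------===//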
+ MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + if (isVarArg) { + int Depth = ArgOffset; + + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(PtrByteSize, Depth, true)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing the + // result of va_next. + for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize; + GPR_idx < Num_GPR_Regs; ++GPR_idx) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(), false, false, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT); + FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); + } + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + + return Chain; +} + +SDValue +PPCTargetLowering::LowerFormalArguments_Darwin( + SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> + &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // TODO: add description of PPC stack frame format, or at least some docs. + // + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + bool isPPC64 = PtrVT == MVT::i64; + // Potential tail calls could cause overwriting of argument stack slots. + bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && + (CallConv == CallingConv::Fast)); + unsigned PtrByteSize = isPPC64 ? 8 : 4; + + unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true, + false); + unsigned ArgOffset = LinkageSize; + // Area that is at least reserved in caller of this function. + unsigned MinReservedArea = ArgOffset; + + static const MCPhysReg GPR_32[] = { // 32-bit registers. + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10, + }; + static const MCPhysReg GPR_64[] = { // 64-bit registers. + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10, + }; + + static const MCPhysReg *FPR = GetFPR(); + + static const MCPhysReg VR[] = { + PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, + PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 + }; + + const unsigned Num_GPR_Regs = array_lengthof(GPR_32); + const unsigned Num_FPR_Regs = 13; + const unsigned Num_VR_Regs = array_lengthof( VR); + + unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; + + const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; + + // In 32-bit non-varargs functions, the stack space for vectors is after the + // stack space for non-vectors. We do not use this space unless we have + // too many vectors to fit in registers, something that only occurs in + // constructed examples:), but we have to walk the arglist to figure + // that out...for the pathological case, compute VecArgOffset as the + // start of the vector parameter area. 
Computing VecArgOffset is the + // entire point of the following loop. + unsigned VecArgOffset = ArgOffset; + if (!isVarArg && !isPPC64) { + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; + ++ArgNo) { + EVT ObjectVT = Ins[ArgNo].VT; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + + if (Flags.isByVal()) { + // ObjSize is the true size, ArgSize rounded up to multiple of regs. + unsigned ObjSize = Flags.getByValSize(); + unsigned ArgSize = + ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + VecArgOffset += ArgSize; + continue; + } + + switch(ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + case MVT::f32: + VecArgOffset += 4; + break; + case MVT::i64: // PPC64 + case MVT::f64: + // FIXME: We are guaranteed to be !isPPC64 at this point. + // Does MVT::i64 apply? + VecArgOffset += 8; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + // Nothing to do, we're only looking at Nonvector args here. + break; + } + } + } + // We've found where the vector parameter area in memory is. Skip the + // first 12 parameters; these don't use that memory. + VecArgOffset = ((VecArgOffset+15)/16)*16; + VecArgOffset += 12*16; + + // Add DAG nodes to load the arguments or copy them out of registers. On + // entry to a function on PPC, the arguments start after the linkage area, + // although the first ones are often in registers. + + SmallVector<SDValue, 8> MemOps; + unsigned nAltivecParamsAtEnd = 0; + Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { + SDValue ArgVal; + bool needsLoad = false; + EVT ObjectVT = Ins[ArgNo].VT; + unsigned ObjSize = ObjectVT.getSizeInBits()/8; + unsigned ArgSize = ObjSize; + ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[ArgNo].OrigArgIndex; + + unsigned CurArgOffset = ArgOffset; + + // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. + if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || + ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { + if (isVarArg || isPPC64) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += CalculateStackSlotSize(ObjectVT, + Flags, + PtrByteSize); + } else nAltivecParamsAtEnd++; + } else + // Calculate min reserved area. + MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, + Flags, + PtrByteSize); + + // FIXME the codegen can be much improved in some cases. + // We do not have to keep everything in memory. + if (Flags.isByVal()) { + // ObjSize is the true size, ArgSize rounded up to multiple of registers. + ObjSize = Flags.getByValSize(); + ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + // Objects of size 1 and 2 are right justified, everything else is + // left justified. This means the memory address is adjusted forwards. + if (ObjSize==1 || ObjSize==2) { + CurArgOffset = CurArgOffset + (4 - ObjSize); + } + // The value of the object is its address. 
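//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): two Darwin ABI
// address computations from this stretch of code, restated as standalone
// functions with illustrative names.

static unsigned darwinSmallByValAddr(unsigned SlotOffset, unsigned ObjSize) {
  // 1- and 2-byte aggregates are right-justified in their 4-byte slot, so
  // the value's address is pushed forward; everything else is left-justified.
  return (ObjSize == 1 || ObjSize == 2) ? SlotOffset + (4 - ObjSize)
                                        : SlotOffset;
}

static unsigned darwinVecParamAreaStart(unsigned VecArgOffset) {
  // Align the accumulated non-vector offset to 16, then skip the memory
  // images of the first 12 vector parameters (those travel in v2-v13).
  VecArgOffset = ((VecArgOffset + 15) / 16) * 16;
  return VecArgOffset + 12 * 16;
}
//===----------------------------------------------------------------------===//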
+ int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + InVals.push_back(FIN); + if (ObjSize==1 || ObjSize==2) { + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg; + if (isPPC64) + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + else + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; + SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg), + ObjType, false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + } + + ArgOffset += PtrByteSize; + + continue; + } + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { + // Store whatever pieces of the object are in registers + // to memory. ArgOffset will be the address of the beginning + // of the object. + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg; + if (isPPC64) + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + else + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg, j), + false, false, 0); + MemOps.push_back(Store); + ++GPR_idx; + ArgOffset += PtrByteSize; + } else { + ArgOffset += ArgSize - (ArgOffset-CurArgOffset); + break; + } + } + continue; + } + + switch (ObjectVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unhandled argument type!"); + case MVT::i1: + case MVT::i32: + if (!isPPC64) { + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); + + if (ObjectVT == MVT::i1) + ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); + + ++GPR_idx; + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + // All int arguments reserve stack space in the Darwin ABI. + ArgOffset += PtrByteSize; + break; + } + // FALLTHROUGH + case MVT::i64: // PPC64 + if (GPR_idx != Num_GPR_Regs) { + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + + if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) + // PPC64 passes i8, i16, and i32 values in i64 registers. Promote + // value to MVT::i64 and then truncate to the correct register size. + ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); + + ++GPR_idx; + } else { + needsLoad = true; + ArgSize = PtrByteSize; + } + // All int arguments reserve stack space in the Darwin ABI. + ArgOffset += 8; + break; + + case MVT::f32: + case MVT::f64: + // Every 4 bytes of argument space consumes one of the GPRs available for + // argument passing. + if (GPR_idx != Num_GPR_Regs) { + ++GPR_idx; + if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) + ++GPR_idx; + } + if (FPR_idx != Num_FPR_Regs) { + unsigned VReg; + + if (ObjectVT == MVT::f32) + VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); + else + VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); + + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + ++FPR_idx; + } else { + needsLoad = true; + } + + // All FP arguments reserve stack space in the Darwin ABI. + ArgOffset += isPPC64 ? 
8 : ObjSize; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + // Note that vector arguments in registers don't reserve stack space, + // except in varargs functions. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); + ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); + if (isVarArg) { + while ((ArgOffset % 16) != 0) { + ArgOffset += PtrByteSize; + if (GPR_idx != Num_GPR_Regs) + GPR_idx++; + } + ArgOffset += 16; + GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? + } + ++VR_idx; + } else { + if (!isVarArg && !isPPC64) { + // Vectors go after all the nonvectors. + CurArgOffset = VecArgOffset; + VecArgOffset += 16; + } else { + // Vectors are aligned. + ArgOffset = ((ArgOffset+15)/16)*16; + CurArgOffset = ArgOffset; + ArgOffset += 16; + } + needsLoad = true; + } + break; + } + + // We need to load the argument to a virtual register if we determined above + // that we ran out of physical registers of the appropriate type. + if (needsLoad) { + int FI = MFI->CreateFixedObject(ObjSize, + CurArgOffset + (ArgSize - ObjSize), + isImmutable); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(), + false, false, false, 0); + } + + InVals.push_back(ArgVal); + } + + // Allow for Altivec parameters at the end, if needed. + if (nAltivecParamsAtEnd) { + MinReservedArea = ((MinReservedArea+15)/16)*16; + MinReservedArea += 16*nAltivecParamsAtEnd; + } + + // Area that is at least reserved in the caller of this function. + MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized functions' reserved stack space needs to be aligned so that + // taking the difference between two stack areas will result in an aligned + // stack. + MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea); + FuncInfo->setMinReservedArea(MinReservedArea); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + if (isVarArg) { + int Depth = ArgOffset; + + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(PtrVT.getSizeInBits()/8, + Depth, true)); + SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); + + // If this function is vararg, store any remaining integer argument regs + // to their spots on the stack so that they may be loaded by deferencing the + // result of va_next. + for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { + unsigned VReg; + + if (isPPC64) + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + else + VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(), false, false, 0); + MemOps.push_back(Store); + // Increment the address by four for the next argument to store + SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT); + FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); + } + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + + return Chain; +} + +/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be +/// adjusted to accommodate the arguments for the tailcall. 
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustment is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing byval parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+      if (Flags.isByVal()) return false;
+    }
+
+    // Non-PIC/GOT tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isBLACompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return nullptr;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      SignExtend32<26>(Addr) != Addr)
+    return nullptr;  // Top 6 bits have to be sext of immediate.
+
+  return DAG.getConstant((int)C->getZExtValue() >> 2,
+                         DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+  SDValue Arg;
+  SDValue FrameIdxOp;
+  int FrameIdx;
+
+  TailCallArgumentInfo() : FrameIdx(0) {}
+};
+
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+                                  SDValue Chain,
+                                  const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+                                  SmallVectorImpl<SDValue> &MemOpChains,
+                                  SDLoc dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to the frame pointer.
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
+                                       MachinePointerInfo::getFixedStack(FI),
+                                       false, false, 0));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, + MachineFunction &MF, + SDValue Chain, + SDValue OldRetAddr, + SDValue OldFP, + int SPDiff, + bool isPPC64, + bool isDarwinABI, + SDLoc dl) { + if (SPDiff) { + // Calculate the new stack slot for the return address. + int SlotSize = isPPC64 ? 8 : 4; + int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64, + isDarwinABI); + int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize, + NewRetAddrLoc, true); + EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); + Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(NewRetAddr), + false, false, 0); + + // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack + // slot as the FP is never overwritten. + if (isDarwinABI) { + int NewFPLoc = + SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI); + int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, + true); + SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); + Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, + MachinePointerInfo::getFixedStack(NewFPIdx), + false, false, 0); + } + } + return Chain; +} + +/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate +/// the position of the argument. +static void +CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, + SDValue Arg, int SPDiff, unsigned ArgOffset, + SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) { + int Offset = ArgOffset + SPDiff; + uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + SDValue FIN = DAG.getFrameIndex(FI, VT); + TailCallArgumentInfo Info; + Info.Arg = Arg; + Info.FrameIdxOp = FIN; + Info.FrameIdx = FI; + TailCallArguments.push_back(Info); +} + +/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address +/// stack slot. Returns the chain as result and the loaded frame pointers in +/// LROpOut/FPOpout. Used when tail calling. +SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, + int SPDiff, + SDValue Chain, + SDValue &LROpOut, + SDValue &FPOpOut, + bool isDarwinABI, + SDLoc dl) const { + if (SPDiff) { + // Load the LR and FP stack slot for later adjusting. + EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + LROpOut = getReturnAddrFrameIndex(DAG); + LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(), + false, false, false, 0); + Chain = SDValue(LROpOut.getNode(), 1); + + // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack + // slot as the FP is never overwritten. + if (isDarwinABI) { + FPOpOut = getFramePointerFrameIndex(DAG); + FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(), + false, false, false, 0); + Chain = SDValue(FPOpOut.getNode(), 1); + } + } + return Chain; +} + +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" of size "Size". Alignment information is +/// specified by the specific parameter attribute. The copy will be passed as +/// a byval function parameter. +/// Sometimes what we are copying is the end of a larger object, the part that +/// does not fit in registers. 
+static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + SDLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + false, false, MachinePointerInfo(), + MachinePointerInfo()); +} + +/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of +/// tail calls. +static void +LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, + SDValue Arg, SDValue PtrOff, int SPDiff, + unsigned ArgOffset, bool isPPC64, bool isTailCall, + bool isVector, SmallVectorImpl<SDValue> &MemOpChains, + SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, + SDLoc dl) { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + if (!isTailCall) { + if (isVector) { + SDValue StackPtr; + if (isPPC64) + StackPtr = DAG.getRegister(PPC::X1, MVT::i64); + else + StackPtr = DAG.getRegister(PPC::R1, MVT::i32); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getConstant(ArgOffset, PtrVT)); + } + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0)); + // Calculate and remember argument location. + } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, + TailCallArguments); +} + +static +void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain, + SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes, + SDValue LROp, SDValue FPOp, bool isDarwinABI, + SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) { + MachineFunction &MF = DAG.getMachineFunction(); + + // Emit a sequence of copyto/copyfrom virtual registers for arguments that + // might overwrite each other in case of tail call optimization. + SmallVector<SDValue, 8> MemOpChains2; + // Do not flag preceding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, + MemOpChains2, dl); + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); + + // Store the return address to the appropriate stack slot. + Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff, + isPPC64, isDarwinABI, dl); + + // Emit callseq_end just before tailcall node. + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag, dl); + InFlag = Chain.getValue(1); +} + +static +unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, + SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall, + SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass, + SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys, + const PPCSubtarget &Subtarget) { + + bool isPPC64 = Subtarget.isPPC64(); + bool isSVR4ABI = Subtarget.isSVR4ABI(); + bool isELFv2ABI = Subtarget.isELFv2ABI(); + + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + NodeTys.push_back(MVT::Other); // Returns a chain + NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. + + unsigned CallOpc = PPCISD::CALL; + + bool needIndirectCall = true; + if (!isSVR4ABI || !isPPC64) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { + // If this is an absolute destination address, use the munged value. 
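//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): "munged" means the
// absolute target is re-encoded as the word offset that a BLA instruction
// carries. A standalone restatement of the isBLACompatibleAddress test
// defined earlier; the names are illustrative.

static bool fitsInBLA(int Addr) {
  if (Addr & 3)
    return false;                                 // low 2 bits implicitly zero
  unsigned Low26 = (unsigned)Addr & 0x03FFFFFFu;  // the encodable bits
  int SExt = (Low26 & 0x02000000u) ? (int)(Low26 | 0xFC000000u) : (int)Low26;
  return SExt == Addr;                            // top 6 bits must be sext
}

static int blaImmediate(int Addr) { return Addr >> 2; } // the "munged" value
//===----------------------------------------------------------------------===//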
+ Callee = SDValue(Dest, 0); + needIndirectCall = false; + } + + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201 + // Use indirect calls for ALL functions calls in JIT mode, since the + // far-call stubs may be outside relocation limits for a BL instruction. + if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) { + unsigned OpFlags = 0; + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) && + (G->getGlobal()->isDeclaration() || + G->getGlobal()->isWeakForLinker())) || + (Subtarget.isTargetELF() && !isPPC64 && + !G->getGlobal()->hasLocalLinkage() && + DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = PPCII::MO_PLT_OR_STUB; + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress / + // TargetExternalSymbol node so that legalize doesn't hack it. + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, + Callee.getValueType(), + 0, OpFlags); + needIndirectCall = false; + } + } + + if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + if ((DAG.getTarget().getRelocationModel() != Reloc::Static && + (Subtarget.getTargetTriple().isMacOSX() && + Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) || + (Subtarget.isTargetELF() && !isPPC64 && + DAG.getTarget().getRelocationModel() == Reloc::PIC_) ) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = PPCII::MO_PLT_OR_STUB; + } + + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), + OpFlags); + needIndirectCall = false; + } + + if (needIndirectCall) { + // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair + // to do the call, we can't use PPCISD::CALL. + SDValue MTCTROps[] = {Chain, Callee, InFlag}; + + if (isSVR4ABI && isPPC64 && !isELFv2ABI) { + // Function pointers in the 64-bit SVR4 ABI do not point to the function + // entry point, but to the function descriptor (the function entry point + // address is part of the function descriptor though). + // The function descriptor is a three doubleword structure with the + // following fields: function entry point, TOC base address and + // environment pointer. + // Thus for a call through a function pointer, the following actions need + // to be performed: + // 1. Save the TOC of the caller in the TOC save area of its stack + // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). + // 2. Load the address of the function entry point from the function + // descriptor. + // 3. Load the TOC of the callee from the function descriptor into r2. + // 4. Load the environment pointer from the function descriptor into + // r11. + // 5. Branch to the function entry point address. + // 6. On return of the callee, the TOC of the caller needs to be + // restored (this is done in FinishCall()). + // + // All those operations are flagged together to ensure that no other + // operations can be scheduled in between. E.g. 
without flagging the + // operations together, a TOC access in the caller could be scheduled + // between the load of the callee TOC and the branch to the callee, which + // results in the TOC access going through the TOC of the callee instead + // of going through the TOC of the caller, which leads to incorrect code. + + // Load the address of the function entry point from the function + // descriptor. + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue); + SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, + makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); + Chain = LoadFuncPtr.getValue(1); + InFlag = LoadFuncPtr.getValue(2); + + // Load environment pointer into r11. + // Offset of the environment pointer within the function descriptor. + SDValue PtrOff = DAG.getIntPtrConstant(16); + + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); + SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr, + InFlag); + Chain = LoadEnvPtr.getValue(1); + InFlag = LoadEnvPtr.getValue(2); + + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, + InFlag); + Chain = EnvVal.getValue(0); + InFlag = EnvVal.getValue(1); + + // Load TOC of the callee into r2. We are using a target-specific load + // with r2 hard coded, because the result of a target-independent load + // would never go directly into r2, since r2 is a reserved register (which + // prevents the register allocator from allocating it), resulting in an + // additional register being allocated and an unnecessary move instruction + // being generated. + VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TOCOff = DAG.getIntPtrConstant(8); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); + SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, + AddTOC, InFlag); + Chain = LoadTOCPtr.getValue(0); + InFlag = LoadTOCPtr.getValue(1); + + MTCTROps[0] = Chain; + MTCTROps[1] = LoadFuncPtr; + MTCTROps[2] = InFlag; + } + + Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, + makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); + InFlag = Chain.getValue(1); + + NodeTys.clear(); + NodeTys.push_back(MVT::Other); + NodeTys.push_back(MVT::Glue); + Ops.push_back(Chain); + CallOpc = PPCISD::BCTRL; + Callee.setNode(nullptr); + // Add use of X11 (holding environment pointer) + if (isSVR4ABI && isPPC64 && !isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); + // Add CTR register as callee so a bctr can be emitted later. + if (isTailCall) + Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); + } + + // If this is a direct call, pass the chain and the callee. + if (Callee.getNode()) { + Ops.push_back(Chain); + Ops.push_back(Callee); + } + // If this is a tail call add stack pointer delta. + if (isTailCall) + Ops.push_back(DAG.getConstant(SPDiff, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Direct calls in the ELFv2 ABI need the TOC register live into the call. 
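//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the ELFv1 function
// descriptor dereferenced by the indirect-call sequence above. Offsets 0, 8
// and 16 match the TOCOff/PtrOff constants used there; the struct itself is
// purely illustrative, not an LLVM type.

#include <cstdint>

struct PPC64ELFv1FunctionDescriptor {
  uint64_t EntryPoint;     // loaded and moved to CTR (steps 2 and 5)
  uint64_t TOCBase;        // loaded into r2 (step 3), offset 8
  uint64_t EnvironmentPtr; // loaded into r11 (step 4), offset 16
};

static_assert(sizeof(PPC64ELFv1FunctionDescriptor) == 24,
              "three doublewords");
//===----------------------------------------------------------------------===//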
+ if (Callee.getNode() && isELFv2ABI) + Ops.push_back(DAG.getRegister(PPC::X2, PtrVT)); + + return CallOpc; +} + +static +bool isLocalCall(const SDValue &Callee) +{ + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) + return !G->getGlobal()->isDeclaration() && + !G->getGlobal()->isWeakForLinker(); + return false; +} + +SDValue +PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, + VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +SDValue +PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl, + bool isTailCall, bool isVarArg, + SelectionDAG &DAG, + SmallVector<std::pair<unsigned, SDValue>, 8> + &RegsToPass, + SDValue InFlag, SDValue Chain, + SDValue &Callee, + int SPDiff, unsigned NumBytes, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<SDValue> &InVals) const { + + bool isELFv2ABI = Subtarget.isELFv2ABI(); + std::vector<EVT> NodeTys; + SmallVector<SDValue, 8> Ops; + unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff, + isTailCall, RegsToPass, Ops, NodeTys, + Subtarget); + + // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls + if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); + + // When performing tail call optimization the callee pops its arguments off + // the stack. Account for this here so these bytes can be pushed back on in + // PPCFrameLowering::eliminateCallFramePseudoInstr. + int BytesCalleePops = + (CallConv == CallingConv::Fast && + getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; + + // Add a register mask operand representing the call-preserved registers. + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + // Emit tail call. 
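//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the BytesCalleePops
// rule above, as a standalone function with an illustrative name. Only a
// fastcc callee under guaranteed tail-call optimization pops its own
// argument bytes; every other callee pops nothing.

static int bytesCalleePops(bool IsFastCC, bool GuaranteedTCO,
                           unsigned NumBytes) {
  return (IsFastCC && GuaranteedTCO) ? (int)NumBytes : 0;
}
//===----------------------------------------------------------------------===//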
+  if (isTailCall) {
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+           "Expecting a global address, external symbol, absolute value or register");
+
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
+  }
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. If caller and callee belong to the same module (and have the
+  // same TOC), the NOP will remain unchanged.
+
+  bool needsTOCRestore = false;
+  if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
+    if (CallOpc == PPCISD::BCTRL) {
+      // This is a call through a function pointer.
+      // Restore the caller TOC from the save area into R2.
+      // See PrepareCall() for more information about calls through function
+      // pointers in the 64-bit SVR4 ABI.
+      // We are using a target-specific load with r2 hard coded, because the
+      // result of a target-independent load would never go directly into r2,
+      // since r2 is a reserved register (which prevents the register allocator
+      // from allocating it), resulting in an additional register being
+      // allocated and an unnecessary move instruction being generated.
+      needsTOCRestore = true;
+    } else if ((CallOpc == PPCISD::CALL) &&
+               (!isLocalCall(Callee) ||
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
+      // Otherwise insert NOP for non-local calls.
+ CallOpc = PPCISD::CALL_NOP; + } + } + + Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); + InFlag = Chain.getValue(1); + + if (needsTOCRestore) { + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); + unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(BytesCalleePops, true), + InFlag, dl); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, + Ins, dl, DAG, InVals); +} + +SDValue +PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &dl = CLI.DL; + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &isTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool isVarArg = CLI.IsVarArg; + + if (isTailCall) + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, + Ins, DAG); + + if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + if (Subtarget.isSVR4ABI()) { + if (Subtarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); + else + return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); + } + + return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, + isTailCall, Outs, OutVals, Ins, + dl, DAG, InVals); +} + +SDValue +PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description + // of the 32-bit SVR4 ABI stack frame layout. + + assert((CallConv == CallingConv::C || + CallConv == CallingConv::Fast) && "Unknown calling convention!"); + + unsigned PtrByteSize = 4; + + MachineFunction &MF = DAG.getMachineFunction(); + + // Mark this function as potentially containing a function that contains a + // tail call. As a consequence the frame pointer will be used for dynamicalloc + // and restoring the callers stack pointer in this functions epilog. This is + // done because by tail calling the called function might overwrite the value + // in this function's (MF) stack pointer stack slot 0(SP). + if (getTargetMachine().Options.GuaranteedTailCallOpt && + CallConv == CallingConv::Fast) + MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); + + // Count how many bytes are to be pushed on the stack, including the linkage + // area, parameter list area and the part of the local variable space which + // contains copies of aggregates which are passed by value. 
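//===----------------------------------------------------------------------===//
// Illustrative aside (editorial, not part of this patch): the count assembled
// below folds three pieces into one running offset. The 8-byte figure for the
// 32-bit SVR4 linkage area (back chain word + LR save word) is inferred from
// the frame layout sketched earlier in this file; the names are illustrative.

static unsigned outgoingBytes32(unsigned ParamListBytes,
                                unsigned ByValCopyBytes) {
  const unsigned LinkageSize32 = 8; // back chain + LR save word
  return LinkageSize32 + ParamListBytes + ByValCopyBytes;
}
//===----------------------------------------------------------------------===//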
+ + // Assign locations to all of the outgoing arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Reserve space for the linkage area on the stack. + CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false, false), + PtrByteSize); + + if (isVarArg) { + // Handle fixed and variable vector arguments differently. + // Fixed vector arguments go into registers as long as registers are + // available. Variable vector arguments always go into memory. + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + bool Result; + + if (Outs[i].IsFixed) { + Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, + CCInfo); + } else { + Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, + ArgFlags, CCInfo); + } + + if (Result) { +#ifndef NDEBUG + errs() << "Call operand #" << i << " has unhandled type " + << EVT(ArgVT).getEVTString() << "\n"; +#endif + llvm_unreachable(nullptr); + } + } + } else { + // All arguments are treated the same. + CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); + } + + // Assign locations to all of the outgoing aggregate by value arguments. + SmallVector<CCValAssign, 16> ByValArgLocs; + CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ByValArgLocs, *DAG.getContext()); + + // Reserve stack space for the allocations in CCInfo. + CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); + + CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); + + // Size of the linkage area, parameter list area and the part of the local + // space variable where copies of aggregates which are passed by value are + // stored. + unsigned NumBytes = CCByValInfo.getNextStackOffset(); + + // Calculate by how many bytes the stack has to be adjusted in case of tail + // call optimization. + int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes); + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), + dl); + SDValue CallSeqStart = Chain; + + // Load the return address and frame pointer so it can be moved somewhere else + // later. + SDValue LROp, FPOp; + Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false, + dl); + + // Set up a copy of the stack pointer for use loading and storing any + // arguments that may not fit in the registers available for argument + // passing. + SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<TailCallArgumentInfo, 8> TailCallArguments; + SmallVector<SDValue, 8> MemOpChains; + + bool seenFloatArg = false; + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, j = 0, e = ArgLocs.size(); + i != e; + ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + if (Flags.isByVal()) { + // Argument is an aggregate which is passed by value, thus we need to + // create a copy of it in the local variable space of the current stack + // frame (which is the stack frame of the caller) and pass the address of + // this copy to the callee. 
+ assert((j < ByValArgLocs.size()) && "Index out of bounds!"); + CCValAssign &ByValVA = ByValArgLocs[j++]; + assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!"); + + // Memory reserved in the local variable space of the callers stack frame. + unsigned LocMemOffset = ByValVA.getLocMemOffset(); + + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + + // Create a copy of the argument in the local area of the current + // stack frame. + SDValue MemcpyCall = + CreateCopyOfByValArgument(Arg, PtrOff, + CallSeqStart.getNode()->getOperand(0), + Flags, DAG, dl); + + // This must go outside the CALLSEQ_START..END. + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, + CallSeqStart.getNode()->getOperand(1), + SDLoc(MemcpyCall)); + DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), + NewCallSeqStart.getNode()); + Chain = CallSeqStart = NewCallSeqStart; + + // Pass the address of the aggregate copy on the stack either in a + // physical register or in the parameter list area of the current stack + // frame to the callee. + Arg = PtrOff; + } + + if (VA.isRegLoc()) { + if (Arg.getValueType() == MVT::i1) + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg); + + seenFloatArg |= VA.getLocVT().isFloatingPoint(); + // Put argument in a physical register. + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + // Put argument in the parameter list area of the current stack frame. + assert(VA.isMemLoc()); + unsigned LocMemOffset = VA.getLocMemOffset(); + + if (!isTailCall) { + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), + false, false, 0)); + } else { + // Calculate and remember argument location. + CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset, + TailCallArguments); + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // Set CR bit 6 to true if this is a vararg call with floating args passed in + // registers. + if (isVarArg) { + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, InFlag }; + + Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, + dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1)); + + InFlag = Chain.getValue(1); + } + + if (isTailCall) + PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, + false, TailCallArguments); + + return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG, + RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes, + Ins, InVals); +} + +// Copy an argument into memory, being careful to do this outside the +// call sequence for the call to which the argument belongs. 
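+ // (The generated memcpy may itself be lowered to a libcall, and call
+ // sequences cannot nest, so the copy has to be chained in ahead of the
+ // CALLSEQ_START by rewriting that node's chain operand, as done here.)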
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+ SDValue CallSeqStart,
+ ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG,
+ SDLoc dl) const {
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+ // The MEMCPY must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1),
+ SDLoc(MemcpyCall));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ bool isELFv2ABI = Subtarget.isELFv2ABI();
+ bool isLittleEndian = Subtarget.isLittleEndian();
+ unsigned NumOps = Outs.size();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ unsigned PtrByteSize = 8;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Mark this function as potentially containing a function that contains a
+ // tail call. As a consequence, the frame pointer will be used for dynamic
+ // stack allocation and for restoring the caller's stack pointer in this
+ // function's epilogue, because the tail-called function might overwrite
+ // the value in this function's (MF) stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
+ // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
+ // area is 32 bytes reserved space for [SP][CR][LR][TOC].
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false,
+ isELFv2ABI);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ if (Flags.isInConsecutiveRegsLast())
+ NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+ }
+
+ unsigned NumBytesActuallyUsed = NumBytes;
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is
+ // varargs. Because we cannot tell if this is needed on the caller side, we
+ // have to conservatively assume that it is needed. As such, make sure we
+ // have at least enough stack space for the caller to store the 8 GPRs.
+ // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail calls need the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
+ SDValue CallSeqStart = Chain;
+
+ // Load the return address and frame pointer so they can be moved somewhere
+ // else later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+ dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point arguments
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
+
+ static const MCPhysReg GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg *FPR = GetFPR();
+
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+ static const MCPhysReg VSRH[] = {
+ PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
+ PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
+ };
+
+ const unsigned NumGPRs = array_lengthof(GPR);
+ const unsigned NumFPRs = 13;
+ const unsigned NumVRs = array_lengthof(VR);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+ SmallVector<SDValue, 8> MemOpChains;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ EVT OrigVT = Outs[i].ArgVT;
+
+ /* Respect alignment of argument on the stack. */
+ unsigned Align =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+
+ /* Compute GPR index associated with argument offset. */
+ GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
+ GPR_idx = std::min(GPR_idx, NumGPRs);
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff;
+
+ PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+ // Promote integers to 64-bit values.
+ if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
+ // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+ }
+
+ // FIXME memcpy is used way more than necessary. Correctness first.
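+ // Worked example of the GPR_idx computation above (ELFv2,
+ // LinkageSize == 32): an argument laid out at ArgOffset 48 yields
+ //   GPR_idx = (48 - 32) / 8 == 2, i.e. X5,
+ // since X3 and X4 shadow the first two doublewords of the parameter
+ // save area.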
+ // Note: "by value" is code for passing a structure by value, not + // basic types. + if (Flags.isByVal()) { + // Note: Size includes alignment padding, so + // struct x { short a; char b; } + // will have Size = 4. With #pragma pack(1), it will have Size = 3. + // These are the proper values we need for right-justifying the + // aggregate in a parameter register. + unsigned Size = Flags.getByValSize(); + + // An empty aggregate parameter takes up no storage and no + // registers. + if (Size == 0) + continue; + + // All aggregates smaller than 8 bytes must be passed right-justified. + if (Size==1 || Size==2 || Size==4) { + EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32); + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + MachinePointerInfo(), VT, + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); + + ArgOffset += PtrByteSize; + continue; + } + } + + if (GPR_idx == NumGPRs && Size < 8) { + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; + continue; + } + // Copy entire object into memory. There are cases where gcc-generated + // code assumes it is there, even if it could be put entirely into + // registers. (This is not what the doc says.) + + // FIXME: The above statement is likely due to a misunderstanding of the + // documents. All arguments must be copied into the parameter area BY + // THE CALLEE in the event that the callee takes the address of any + // formal argument. That has not yet been implemented. However, it is + // reasonable to use the stack area as a staging area for the register + // load. + + // Skip this for small aggregates, as we will use the same slot for a + // right-justified copy, below. + if (Size >= 8) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // When a register is available, pass a small aggregate right-justified. + if (Size < 8 && GPR_idx != NumGPRs) { + // The easiest way to get this right-justified in a register + // is to copy the structure into the rightmost portion of a + // local variable slot, then load the whole slot into the + // register. + // FIXME: The memcpy seems to produce pretty awful code for + // small aggregates, particularly for packed ones. + // FIXME: It would be preferable to use the slot in the + // parameter save area instead of a new local variable. + SDValue AddPtr = PtrOff; + if (!isLittleEndian) { + SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType()); + AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + } + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + + // Load the slot into the register. + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load)); + + // Done with this argument. + ArgOffset += PtrByteSize; + continue; + } + + // For aggregates larger than PtrByteSize, copy the pieces of the + // object that fit into registers from the parameter save area. 
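+ // E.g. (hypothetical sizes) a 24-byte aggregate with two GPRs remaining:
+ // the loop below loads doublewords 0 and 1 into GPRs, then breaks, and
+ // ArgOffset skips past the final doubleword, which stays only in the
+ // stack copy made above.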
+ for (unsigned j=0; j<Size; j+=PtrByteSize) {
+ SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (Arg.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i1:
+ case MVT::i32:
+ case MVT::i64:
+ // These can be scalar arguments or elements of an integer array type
+ // passed directly. Clang may use those instead of "byval" aggregate
+ // types to avoid forcing arguments to memory unnecessarily.
+ if (GPR_idx != NumGPRs) {
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ }
+ ArgOffset += PtrByteSize;
+ break;
+ case MVT::f32:
+ case MVT::f64: {
+ // These can be scalar arguments or elements of a float array type
+ // passed directly. The latter are used to implement ELFv2 homogenous
+ // float aggregates.
+
+ // Named arguments go into FPRs first, and once they overflow, the
+ // remaining arguments go into GPRs and then the parameter save area.
+ // Unnamed arguments for vararg functions always go to GPRs and
+ // then the parameter save area. For now, put all arguments to vararg
+ // routines always in both locations (FPR *and* GPR or stack slot).
+ bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+
+ // First load the argument into the next available FPR.
+ if (FPR_idx != NumFPRs)
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+ // Next, load the argument into GPR or stack slot if needed.
+ if (!NeedGPROrStack)
+ ;
+ else if (GPR_idx != NumGPRs) {
+ // In the non-vararg case, this can only ever happen in the
+ // presence of f32 array types, since otherwise we never run
+ // out of FPRs before running out of GPRs.
+ SDValue ArgVal;
+
+ // Double values are always passed in a single GPR.
+ if (Arg.getValueType() != MVT::f32) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+
+ // Non-array float values are extended and passed in a GPR.
+ } else if (!Flags.isInConsecutiveRegs()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+
+ // If we have an array of floats, we collect every odd element
+ // together with its predecessor into one GPR.
+ } else if (ArgOffset % PtrByteSize != 0) {
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
+ Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ if (!isLittleEndian)
+ std::swap(Lo, Hi);
+ ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+
+ // The final element, if even, goes into the first half of a GPR.
+ } else if (Flags.isInConsecutiveRegsLast()) {
+ ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+ ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
+ if (!isLittleEndian)
+ ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
+ DAG.getConstant(32, MVT::i32));
+
+ // Non-final even elements are skipped; they will be handled
+ // together with the subsequent argument on the next go-around.
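+ // Concretely (illustrative): for the ELFv2 homogenous aggregate
+ //   struct { float f[4]; }
+ // passed partly in GPRs, elements 0/1 and 2/3 are each combined by the
+ // BUILD_PAIR above into one doubleword; the halves are swapped on
+ // big-endian targets so that the even element lands in the high half.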
+ } else + ArgVal = SDValue(); + + if (ArgVal.getNode()) + RegsToPass.push_back(std::make_pair(GPR[GPR_idx], ArgVal)); + } else { + // Single-precision floating-point values are mapped to the + // second (rightmost) word of the stack doubleword. + if (Arg.getValueType() == MVT::f32 && + !isLittleEndian && !Flags.isInConsecutiveRegs()) { + SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); + } + + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, isTailCall, false, MemOpChains, + TailCallArguments, dl); + } + // When passing an array of floats, the array occupies consecutive + // space in the argument area; only round up to the next doubleword + // at the end of the array. Otherwise, each float takes 8 bytes. + ArgOffset += (Arg.getValueType() == MVT::f32 && + Flags.isInConsecutiveRegs()) ? 4 : 8; + if (Flags.isInConsecutiveRegsLast()) + ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; + break; + } + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2f64: + case MVT::v2i64: + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + + // For a varargs call, named arguments go into VRs or on the stack as + // usual; unnamed arguments always go to the stack or the corresponding + // GPRs when within range. For now, we always put the value in both + // locations (or even all three). + if (isVarArg) { + // We could elide this store in the case where the object fits + // entirely in R registers. Maybe later. + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + if (VR_idx != NumVRs) { + SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + + unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || + Arg.getSimpleValueType() == MVT::v2i64) ? + VSRH[VR_idx] : VR[VR_idx]; + ++VR_idx; + + RegsToPass.push_back(std::make_pair(VReg, Load)); + } + ArgOffset += 16; + for (unsigned i=0; i<16; i+=PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, PtrVT)); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs Altivec params go into VRs or on the stack. + if (VR_idx != NumVRs) { + unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 || + Arg.getSimpleValueType() == MVT::v2i64) ? + VSRH[VR_idx] : VR[VR_idx]; + ++VR_idx; + + RegsToPass.push_back(std::make_pair(VReg, Arg)); + } else { + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + true, isTailCall, true, MemOpChains, + TailCallArguments, dl); + } + ArgOffset += 16; + break; + } + } + + assert(NumBytesActuallyUsed == ArgOffset); + (void)NumBytesActuallyUsed; + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // Check if this is an indirect call (MTCTR/BCTRL). + // See PrepareCall() for more information about calls through function + // pointers in the 64-bit SVR4 ABI. 
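+ // Sketch of the sequence being set up here (ELFv1-style, illustrative):
+ //   std 2, 40(1)      ; save the caller's TOC pointer (emitted below)
+ //   mtctr <entry>     ; target address from the function descriptor
+ //   bctrl
+ //   ld 2, 40(1)       ; TOC restored after return (see FinishCall)
+ // On ELFv2 the callee additionally expects its own entry address in R12
+ // so it can derive its TOC pointer; hence the extra X12 operand below.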
+ if (!isTailCall &&
+ !dyn_cast<GlobalAddressSDNode>(Callee) &&
+ !dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ // Load r2 into a virtual register and store it to the TOC save area.
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+ // TOC save area offset.
+ unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset(isELFv2ABI);
+ SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+ false, false, 0);
+ // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
+ // This does not mean the MTCTR instruction must use R12; it's easier
+ // to model this as an extra parameter, so do that.
+ if (isELFv2ABI)
+ RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+ }
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+ FPOp, true, TailCallArguments);
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+ Ins, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SDLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ unsigned NumOps = Outs.size();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = PtrVT == MVT::i64;
+ unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Mark this function as potentially containing a function that contains a
+ // tail call. As a consequence, the frame pointer will be used for dynamic
+ // stack allocation and for restoring the caller's stack pointer in this
+ // function's epilogue, because the tail-called function might overwrite
+ // the value in this function's (MF) stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. We start with 24/48 bytes, which is
+ // prereserved space for [SP][CR][LR][3 x unused].
+ unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true,
+ false);
+ unsigned NumBytes = LinkageSize;
+
+ // Add up all the space actually used.
+ // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+ // they all go in registers, but we must reserve stack space for them for
+ // possible use by the caller. In varargs or 64-bit calls, parameters are
+ // assigned stack space in order, with padding so Altivec parameters are
+ // 16-byte aligned.
+ unsigned nAltivecParamsAtEnd = 0;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ EVT ArgVT = Outs[i].VT;
+ // Varargs Altivec parameters are padded to a 16 byte boundary.
+ if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
+ ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
+ ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
+ if (!isVarArg && !isPPC64) {
+ // Non-varargs Altivec parameters go after all the non-Altivec
+ // parameters; handle those later so we know how much padding we need.
+ nAltivecParamsAtEnd++;
+ continue;
+ }
+ // Varargs and 64-bit Altivec parameters are padded to a 16 byte boundary.
+ NumBytes = ((NumBytes+15)/16)*16;
+ }
+ NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+ }
+
+ // Allow for Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ NumBytes = ((NumBytes+15)/16)*16;
+ NumBytes += 16*nAltivecParamsAtEnd;
+ }
+
+ // The prolog code of the callee may store up to 8 GPR argument registers to
+ // the stack, allowing va_start to index over them in memory if it is
+ // varargs. Because we cannot tell if this is needed on the caller side, we
+ // have to conservatively assume that it is needed. As such, make sure we
+ // have at least enough stack space for the caller to store the 8 GPRs.
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+
+ // Tail calls need the stack to be aligned.
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ dl);
+ SDValue CallSeqStart = Chain;
+
+ // Load the return address and frame pointer so they can be moved somewhere
+ // else later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+ dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr;
+ if (isPPC64)
+ StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+ else
+ StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point arguments
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = LinkageSize;
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ static const MCPhysReg GPR_32[] = { // 32-bit registers.
+ PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+ };
+ static const MCPhysReg GPR_64[] = { // 64-bit registers.
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const MCPhysReg *FPR = GetFPR();
+
+ static const MCPhysReg VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+ const unsigned NumGPRs = array_lengthof(GPR_32);
+ const unsigned NumFPRs = 13;
+ const unsigned NumVRs = array_lengthof(VR);
+
+ const MCPhysReg *GPR = isPPC64 ?
GPR_64 : GPR_32; + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<TailCallArgumentInfo, 8> TailCallArguments; + + SmallVector<SDValue, 8> MemOpChains; + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + + // PtrOff will be used to store the current argument to the stack if a + // register cannot be found for it. + SDValue PtrOff; + + PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType()); + + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + + // On PPC64, promote integers to 64-bit values. + if (isPPC64 && Arg.getValueType() == MVT::i32) { + // FIXME: Should this use ANY_EXTEND if neither sext nor zext? + unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); + } + + // FIXME memcpy is used way more than necessary. Correctness first. + // Note: "by value" is code for passing a structure by value, not + // basic types. + if (Flags.isByVal()) { + unsigned Size = Flags.getByValSize(); + // Very small objects are passed right-justified. Everything else is + // passed left-justified. + if (Size==1 || Size==2) { + EVT VT = (Size==1) ? MVT::i8 : MVT::i16; + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, + MachinePointerInfo(), VT, + false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + + ArgOffset += PtrByteSize; + } else { + SDValue Const = DAG.getConstant(PtrByteSize - Size, + PtrOff.getValueType()); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, + CallSeqStart, + Flags, DAG, dl); + ArgOffset += PtrByteSize; + } + continue; + } + // Copy entire object into memory. There are cases where gcc-generated + // code assumes it is there, even if it could be put entirely into + // registers. (This is not what the doc says.) + Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, + CallSeqStart, + Flags, DAG, dl); + + // For small aggregates (Darwin only) and aggregates >= PtrByteSize, + // copy the pieces of the object that fit into registers from the + // parameter save area. 
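+ // E.g. (hypothetical, 32-bit Darwin): a 12-byte aggregate with GPRs still
+ // free has each of its three words loaded from the copy made above into
+ // R-registers by the loop below, advancing ArgOffset by 4 per word.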
+ for (unsigned j=0; j<Size; j+=PtrByteSize) { + SDValue Const = DAG.getConstant(j, PtrOff.getValueType()); + SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + ArgOffset += PtrByteSize; + } else { + ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; + break; + } + } + continue; + } + + switch (Arg.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unexpected ValueType for argument!"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + if (GPR_idx != NumGPRs) { + if (Arg.getValueType() == MVT::i1) + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); + + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); + } else { + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, false, MemOpChains, + TailCallArguments, dl); + } + ArgOffset += PtrByteSize; + break; + case MVT::f32: + case MVT::f64: + if (FPR_idx != NumFPRs) { + RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + + if (isVarArg) { + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + + // Float varargs are always shadowed in available integer registers + if (GPR_idx != NumGPRs) { + SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, + MachinePointerInfo(), false, false, + false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ + SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType()); + PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + } else { + // If we have any FPRs remaining, we may also have GPRs remaining. + // Args passed in FPRs consume either 1 (f32) or 2 (f64) available + // GPRs. + if (GPR_idx != NumGPRs) + ++GPR_idx; + if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && + !isPPC64) // PPC64 has 64-bit GPR's obviously :) + ++GPR_idx; + } + } else + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, false, MemOpChains, + TailCallArguments, dl); + if (isPPC64) + ArgOffset += 8; + else + ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; + break; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + if (isVarArg) { + // These go aligned on the stack, or in the corresponding R registers + // when within range. The Darwin PPC ABI doc claims they also go in + // V registers; in fact gcc does this only for arguments that are + // prototyped, not for those that match the ... We do it for all + // arguments, seems to work. + while (ArgOffset % 16 !=0) { + ArgOffset += PtrByteSize; + if (GPR_idx != NumGPRs) + GPR_idx++; + } + // We could elide this store in the case where the object fits + // entirely in R registers. Maybe later. 
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, + DAG.getConstant(ArgOffset, PtrVT)); + SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo(), false, false, 0); + MemOpChains.push_back(Store); + if (VR_idx != NumVRs) { + SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, + MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); + } + ArgOffset += 16; + for (unsigned i=0; i<16; i+=PtrByteSize) { + if (GPR_idx == NumGPRs) + break; + SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, + DAG.getConstant(i, PtrVT)); + SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(), + false, false, false, 0); + MemOpChains.push_back(Load.getValue(1)); + RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); + } + break; + } + + // Non-varargs Altivec params generally go in registers, but have + // stack space allocated at the end. + if (VR_idx != NumVRs) { + // Doesn't have GPR space allocated. + RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); + } else if (nAltivecParamsAtEnd==0) { + // We are emitting Altivec params in order. + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, true, MemOpChains, + TailCallArguments, dl); + ArgOffset += 16; + } + break; + } + } + // If all Altivec parameters fit in registers, as they usually do, + // they get stack space following the non-Altivec parameters. We + // don't track this here because nobody below needs it. + // If there are more Altivec parameters than fit in registers emit + // the stores here. + if (!isVarArg && nAltivecParamsAtEnd > NumVRs) { + unsigned j = 0; + // Offset is aligned; skip 1st 12 params which go in V registers. + ArgOffset = ((ArgOffset+15)/16)*16; + ArgOffset += 12*16; + for (unsigned i = 0; i != NumOps; ++i) { + SDValue Arg = OutVals[i]; + EVT ArgType = Outs[i].VT; + if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || + ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { + if (++j > NumVRs) { + SDValue PtrOff; + // We are emitting Altivec params in order. + LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, + isPPC64, isTailCall, true, MemOpChains, + TailCallArguments, dl); + ArgOffset += 16; + } + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); + + // On Darwin, R12 must contain the address of an indirect callee. This does + // not mean the MTCTR instruction must use R12; it's easier to model this as + // an extra parameter, so do that. + if (!isTailCall && + !dyn_cast<GlobalAddressSDNode>(Callee) && + !dyn_cast<ExternalSymbolSDNode>(Callee) && + !isBLACompatibleAddress(Callee, DAG)) + RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : + PPC::R12), Callee)); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. 
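+ // A sketch of the DAG this produces (node names illustrative):
+ //   t1: ch,glue = CopyToReg t0, R3, Arg0
+ //   t2: ch,glue = CopyToReg t1, F1, Arg1, t1:1
+ //   ...
+ // Each copy consumes the previous copy's glue result, so the scheduler
+ // cannot separate the register setup from the call that consumes it.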
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
+ FPOp, true, TailCallArguments);
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+ Ins, InVals);
+}
+
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ SDLoc dl, SelectionDAG &DAG) const {
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+ SDValue Flag;
+ SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ SDValue Arg = OutVals[i];
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) const {
+ // When we pop the dynamic allocation we need to restore the SP link.
+ SDLoc dl(Op);
+
+ // Get the correct type for pointers.
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Construct the stack pointer operand.
+ bool isPPC64 = Subtarget.isPPC64();
+ unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+ SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+ // Get the operands for the STACKRESTORE.
+ SDValue Chain = Op.getOperand(0);
+ SDValue SaveSP = Op.getOperand(1);
+
+ // Load the old link SP.
+ SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+
+ // Restore the stack pointer.
+ Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+ // Store the old link SP.
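+ // In effect (pseudocode): link = *(SP); SP = SaveSP; *(SP) = link;
+ // which keeps the back-chain word at 0(SP) valid after the restore.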
+ return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
+ false, false, 0);
+}
+
+
+
+SDValue
+PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Get the current return address save index.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int RASI = FI->getReturnAddrSaveIndex();
+
+ // If the return address save index hasn't been defined yet.
+ if (!RASI) {
+ // Find out the fixed offset of the return address save area.
+ int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+ // Allocate the frame index for the return address save area.
+ RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+ // Save the result.
+ FI->setReturnAddrSaveIndex(RASI);
+ }
+ return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+ // Get the current frame pointer save index. The users of this index will be
+ // primarily DYNALLOC instructions.
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ int FPSI = FI->getFramePointerSaveIndex();
+
+ // If the frame pointer save index hasn't been defined yet.
+ if (!FPSI) {
+ // Find out the fixed offset of the frame pointer save area.
+ int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
+ isDarwinABI);
+
+ // Allocate the frame index for the frame pointer save area.
+ FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+ // Save the result.
+ FI->setFramePointerSaveIndex(FPSI);
+ }
+ return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) const {
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ // Get the correct type for pointers.
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ // Negate the size.
+ SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+ DAG.getConstant(0, PtrVT), Size);
+ // Construct a node for the frame pointer save index.
+ SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+ // Build a DYNALLOC node.
+ SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+ return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::i1 &&
+ "Custom lowering only for i1 loads");
+
+ // First, load 8 bits into a pointer-sized value, then truncate to 1 bit.
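+ // (i1 is a legal type here because boolean values are tracked in CR bits,
+ // but PPC has no 1-bit memory access, so the load is done as an 8-bit
+ // extending load; LowerSTORE below is the mirror image of this.)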
+ + SDLoc dl(Op); + LoadSDNode *LD = cast<LoadSDNode>(Op); + + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand *MMO = LD->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain, + BasePtr, MVT::i8, MMO); + SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD); + + SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) }; + return DAG.getMergeValues(Ops, dl); +} + +SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOperand(1).getValueType() == MVT::i1 && + "Custom lowering only for i1 stores"); + + // First, zero extend to 32 bits, then use a truncating store to 8 bits. + + SDLoc dl(Op); + StoreSDNode *ST = cast<StoreSDNode>(Op); + + SDValue Chain = ST->getChain(); + SDValue BasePtr = ST->getBasePtr(); + SDValue Value = ST->getValue(); + MachineMemOperand *MMO = ST->getMemOperand(); + + Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value); + return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO); +} + +// FIXME: Remove this once the ANDI glue bug is fixed: +SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::i1 && + "Custom lowering only for i1 results"); + + SDLoc DL(Op); + return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, + Op.getOperand(0)); +} + +/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when +/// possible. +SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + // Not FP? Not a fsel. + if (!Op.getOperand(0).getValueType().isFloatingPoint() || + !Op.getOperand(2).getValueType().isFloatingPoint()) + return Op; + + // We might be able to do better than this under some circumstances, but in + // general, fsel-based lowering of select is a finite-math-only optimization. + // For more information, see section F.3 of the 2.06 ISA specification. + if (!DAG.getTarget().Options.NoInfsFPMath || + !DAG.getTarget().Options.NoNaNsFPMath) + return Op; + + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + + EVT ResVT = Op.getValueType(); + EVT CmpVT = Op.getOperand(0).getValueType(); + SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1); + SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); + SDLoc dl(Op); + + // If the RHS of the comparison is a 0.0, we don't need to do the + // subtraction at all. + SDValue Sel1; + if (isFloatingPointZero(RHS)) + switch (CC) { + default: break; // SETUO etc aren't handled by fsel. 
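+ // (Recall the fsel semantics: fsel FRT,FRA,FRC,FRB yields FRC when
+ // FRA >= 0.0 and FRB otherwise, with NaN inputs taking the FRB path;
+ // that behavior on unordered inputs is why the NoInfs/NoNaNs check
+ // above is required.)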
+ case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV); + case ISD::SETULT: + case ISD::SETLT: + std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + case ISD::SETOGE: + case ISD::SETGE: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV); + case ISD::SETUGT: + case ISD::SETGT: + std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + case ISD::SETOLE: + case ISD::SETLE: + if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV); + } + + SDValue Cmp; + switch (CC) { + default: break; // SETUO etc aren't handled by fsel. + case ISD::SETNE: + std::swap(TV, FV); + case ISD::SETEQ: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits + Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, + DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); + case ISD::SETULT: + case ISD::SETLT: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + case ISD::SETOGE: + case ISD::SETGE: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + case ISD::SETUGT: + case ISD::SETGT: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); + case ISD::SETOLE: + case ISD::SETLE: + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits + Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); + return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); + } + return Op; +} + +// FIXME: Split this code up when LegalizeDAGTypes lands. +SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, + SDLoc dl) const { + assert(Op.getOperand(0).getValueType().isFloatingPoint()); + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::f32) + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + + SDValue Tmp; + switch (Op.getSimpleValueType().SimpleTy) { + default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); + case MVT::i32: + Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ : + (Subtarget.hasFPCVT() ? 
PPCISD::FCTIWUZ : + PPCISD::FCTIDZ), + dl, MVT::f64, Src); + break; + case MVT::i64: + assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && + "i64 FP_TO_UINT is supported only with FPCVT"); + Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : + PPCISD::FCTIDUZ, + dl, MVT::f64, Src); + break; + } + + // Convert the FP value to an int value through memory. + bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && + (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); + SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); + int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); + + // Emit a store to the stack slot. + SDValue Chain; + if (i32Stack) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4); + SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; + Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, + DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); + } else + Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, + MPI, false, false, 0); + + // Result is a load from the stack slot. If loading 4 bytes, make sure to + // add in a bias. + if (Op.getValueType() == MVT::i32 && !i32Stack) { + FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, + DAG.getConstant(4, FIPtr.getValueType())); + MPI = MachinePointerInfo(); + } + + return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI, + false, false, false, 0); +} + +SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + // Don't handle ppc_fp128 here; let it be lowered to a libcall. + if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) + return SDValue(); + + if (Op.getOperand(0).getValueType() == MVT::i1) + return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), + DAG.getConstantFP(1.0, Op.getValueType()), + DAG.getConstantFP(0.0, Op.getValueType())); + + assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && + "UINT_TO_FP is supported only with FPCVT"); + + // If we have FCFIDS, then use it when converting to single-precision. + // Otherwise, convert to double-precision and then round. + unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + (Op.getOpcode() == ISD::UINT_TO_FP ? + PPCISD::FCFIDUS : PPCISD::FCFIDS) : + (Op.getOpcode() == ISD::UINT_TO_FP ? + PPCISD::FCFIDU : PPCISD::FCFID); + MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ? + MVT::f32 : MVT::f64; + + if (Op.getOperand(0).getValueType() == MVT::i64) { + SDValue SINT = Op.getOperand(0); + // When converting to single-precision, we actually need to convert + // to double-precision first and then round to single-precision. + // To avoid double-rounding effects during that operation, we have + // to prepare the input operand. Bits that might be truncated when + // converting to double-precision are replaced by a bit that won't + // be lost at this stage, but is below the single-precision rounding + // position. + // + // However, if -enable-unsafe-fp-math is in effect, accept double + // rounding to avoid the extra overhead. + if (Op.getValueType() == MVT::f32 && + !Subtarget.hasFPCVT() && + !DAG.getTarget().Options.UnsafeFPMath) { + + // Twiddle input to make sure the low 11 bits are zero. 
(If this + // is the case, we are guaranteed the value will fit into the 53 bit + // mantissa of an IEEE double-precision value without rounding.) + // If any of those low 11 bits were not zero originally, make sure + // bit 12 (value 2048) is set instead, so that the final rounding + // to single-precision gets the correct result. + SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64, + SINT, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::ADD, dl, MVT::i64, + Round, DAG.getConstant(2047, MVT::i64)); + Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT); + Round = DAG.getNode(ISD::AND, dl, MVT::i64, + Round, DAG.getConstant(-2048, MVT::i64)); + + // However, we cannot use that value unconditionally: if the magnitude + // of the input value is small, the bit-twiddling we did above might + // end up visibly changing the output. Fortunately, in that case, we + // don't need to twiddle bits since the original input will convert + // exactly to double-precision floating-point already. Therefore, + // construct a conditional to use the original value if the top 11 + // bits are all sign-bit copies, and use the rounded value computed + // above otherwise. + SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64, + SINT, DAG.getConstant(53, MVT::i32)); + Cond = DAG.getNode(ISD::ADD, dl, MVT::i64, + Cond, DAG.getConstant(1, MVT::i64)); + Cond = DAG.getSetCC(dl, MVT::i32, + Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT); + + SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT); + } + + SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); + + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) + FP = DAG.getNode(ISD::FP_ROUND, dl, + MVT::f32, FP, DAG.getIntPtrConstant(0)); + return FP; + } + + assert(Op.getOperand(0).getValueType() == MVT::i32 && + "Unhandled INT_TO_FP type in custom expander!"); + // Since we only generate this in 64-bit mode, we can take advantage of + // 64-bit registers. In particular, sign extend the input value into the + // 64-bit register with extsw, store the WHOLE 64-bit value into the stack + // then lfd it and fcfid it. + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + SDValue Ld; + if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { + int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && + "Expected an i32 store"); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), + MachineMemOperand::MOLoad, 4, 4); + SDValue Ops[] = { Store, FIdx }; + Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? + PPCISD::LFIWZX : PPCISD::LFIWAX, + dl, DAG.getVTList(MVT::f64, MVT::Other), + Ops, MVT::i32, MMO); + } else { + assert(Subtarget.isPPC64() && + "i32->FP without LFIWAX supported only on PPC64"); + + int FrameIdx = FrameInfo->CreateStackObject(8, 8, false); + SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); + + SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, + Op.getOperand(0)); + + // STD the extended value into the stack slot. 
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, 0); + + // Load the value as a double. + Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(FrameIdx), + false, false, false, 0); + } + + // FCFID it and return it. + SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) + FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0)); + return FP; +} + +SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + /* + The rounding mode is in bits 30:31 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to 0 + 10 Round to +inf + 11 Round to -inf + + FLT_ROUNDS, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we do: + ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1)) + */ + + MachineFunction &MF = DAG.getMachineFunction(); + EVT VT = Op.getValueType(); + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + + // Save FP Control Word to register + EVT NodeTys[] = { + MVT::f64, // return register + MVT::Glue // unused in this context + }; + SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None); + + // Save FP register to stack slot + int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, + StackSlot, MachinePointerInfo(), false, false,0); + + // Load FP Control Word from low 32 bits of stack slot. + SDValue Four = DAG.getConstant(4, PtrVT); + SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); + SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(), + false, false, false, 0); + + // Transform as necessary + SDValue CWD1 = + DAG.getNode(ISD::AND, dl, MVT::i32, + CWD, DAG.getConstant(3, MVT::i32)); + SDValue CWD2 = + DAG.getNode(ISD::SRL, dl, MVT::i32, + DAG.getNode(ISD::AND, dl, MVT::i32, + DAG.getNode(ISD::XOR, dl, MVT::i32, + CWD, DAG.getConstant(3, MVT::i32)), + DAG.getConstant(3, MVT::i32)), + DAG.getConstant(1, MVT::i32)); + + SDValue RetVal = + DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2); + + return DAG.getNode((VT.getSizeInBits() < 16 ? + ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); +} + +SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + SDLoc dl(Op); + assert(Op.getNumOperands() == 3 && + VT == Op.getOperand(1).getValueType() && + "Unexpected SHL!"); + + // Expand into a bunch of logical ops. Note that these ops + // depend on the PPC behavior for oversized shift amounts. 
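+ // Illustratively, for 32-bit halves (Lo, Hi) the nodes below compute:
+ //   OutHi = (Hi << Amt) | (Lo >> (32 - Amt)) | (Lo << (Amt - 32))
+ //   OutLo = Lo << Amt
+ // Only one of the two Lo terms can be nonzero: PPC's shift instructions
+ // take a 6-bit shift amount and produce 0 for amounts 32..63, so the
+ // (Amt - 32) term contributes only when Amt >= 32.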
+ SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5); + SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); + SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc dl(Op); + unsigned BitWidth = VT.getSizeInBits(); + assert(Op.getNumOperands() == 3 && + VT == Op.getOperand(1).getValueType() && + "Unexpected SRL!"); + + // Expand into a bunch of logical ops. Note that these ops + // depend on the PPC behavior for oversized shift amounts. + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5); + SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6); + SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + assert(Op.getNumOperands() == 3 && + VT == Op.getOperand(1).getValueType() && + "Unexpected SRA!"); + + // Expand into a bunch of logical ops, followed by a select_cc. + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); + SDValue Amt = Op.getOperand(2); + EVT AmtVT = Amt.getValueType(); + + SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT, + DAG.getConstant(BitWidth, AmtVT), Amt); + SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt); + SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1); + SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3); + SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt, + DAG.getConstant(-BitWidth, AmtVT)); + SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5); + SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt); + SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT), + Tmp4, Tmp6, ISD::SETLE); + SDValue OutOps[] = { OutLo, OutHi }; + return DAG.getMergeValues(OutOps, dl); +} + +//===----------------------------------------------------------------------===// +// Vector related lowering. +// + +/// BuildSplatI - Build a canonical splati of Val with an element size of +/// SplatSize. Cast the result to VT. +static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, + SelectionDAG &DAG, SDLoc dl) { + assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); + + static const EVT VTys[] = { // canonical VT to use for each size. 
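+    // (Indexed by SplatSize-1; there is no 3-byte splat size, hence the
+    // MVT::Other placeholder in the third slot.)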
+ MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 + }; + + EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; + + // Force vspltis[hw] -1 to vspltisb -1 to canonicalize. + if (Val == -1) + SplatSize = 1; + + EVT CanonicalVT = VTys[SplatSize-1]; + + // Build a canonical splat for this value. + SDValue Elt = DAG.getConstant(Val, MVT::i32); + SmallVector<SDValue, 8> Ops; + Ops.assign(CanonicalVT.getVectorNumElements(), Elt); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops); + return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res); +} + +/// BuildIntrinsicOp - Return a unary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, + SelectionDAG &DAG, SDLoc dl, + EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, MVT::i32), Op); +} + +/// BuildIntrinsicOp - Return a binary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS, + SelectionDAG &DAG, SDLoc dl, + EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = LHS.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, MVT::i32), LHS, RHS); +} + +/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the +/// specified intrinsic ID. +static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, + SDValue Op2, SelectionDAG &DAG, + SDLoc dl, EVT DestVT = MVT::Other) { + if (DestVT == MVT::Other) DestVT = Op0.getValueType(); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT, + DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2); +} + + +/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified +/// amount. The result has the specified value type. +static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, + EVT VT, SelectionDAG &DAG, SDLoc dl) { + // Force LHS/RHS to be the right type. + LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS); + RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS); + + int Ops[16]; + for (unsigned i = 0; i != 16; ++i) + Ops[i] = i + Amt; + SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops); + return DAG.getNode(ISD::BITCAST, dl, VT, T); +} + +// If this is a case we can't handle, return null and let the default +// expansion code take care of it. If we CAN select this case, and if it +// selects to a single instruction, return Op. Otherwise, if we can codegen +// this case more efficiently than a constant pool load, lower it to the +// sequence of ops that should be used. +SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); + + // Check if this is a splat of a constant value. + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, true) || SplatBitSize > 32) + return SDValue(); + + unsigned SplatBits = APSplatBits.getZExtValue(); + unsigned SplatUndef = APSplatUndef.getZExtValue(); + unsigned SplatSize = SplatBitSize / 8; + + // First, handle single instruction cases. + + // All zeros? + if (SplatBits == 0) { + // Canonicalize all zero vectors to be v4i32. 
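+    // (A v4i32 zero is cheap to materialize -- it is typically selected as a
+    // vxor of a register with itself -- and using one canonical type lets CSE
+    // merge zero vectors of every element type.)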
+ if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) { + SDValue Z = DAG.getConstant(0, MVT::i32); + Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z); + Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z); + } + return Op; + } + + // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. + int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> + (32-SplatBitSize)); + if (SextVal >= -16 && SextVal <= 15) + return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); + + + // Two instruction sequences. + + // If this value is in the range [-32,30] and is even, use: + // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) + // If this value is in the range [17,31] and is odd, use: + // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) + // If this value is in the range [-31,-17] and is odd, use: + // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) + // Note the last two are three-instruction sequences. + if (SextVal >= -32 && SextVal <= 31) { + // To avoid having these optimizations undone by constant folding, + // we convert to a pseudo that will be expanded later into one of + // the above forms. + SDValue Elt = DAG.getConstant(SextVal, MVT::i32); + EVT VT = (SplatSize == 1 ? MVT::v16i8 : + (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32)); + SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32); + SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); + if (VT == Op.getValueType()) + return RetVal; + else + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal); + } + + // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is + // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important + // for fneg/fabs. + if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) { + // Make -1 and vspltisw -1: + SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl); + + // Make the VSLW intrinsic, computing 0x8000_0000. + SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV, + OnesV, DAG, dl); + + // xor by OnesV to invert it. + Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + + // The remaining cases assume either big endian element order or + // a splat-size that equates to the element size of the vector + // to be built. An example that doesn't work for little endian is + // {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits + // and a vector element size of 16 bits. The code below will + // produce the vector in big endian element order, which for little + // endian is {-1, 0, -1, 0, -1, 0, -1, 0}. + + // For now, just avoid these optimizations in that case. + // FIXME: Develop correct optimizations for LE with mismatched + // splat and element sizes. + + if (Subtarget.isLittleEndian() && + SplatSize != Op.getValueType().getVectorElementType().getSizeInBits()) + return SDValue(); + + // Check to see if this is a wide variety of vsplti*, binop self cases. + static const signed char SplatCsts[] = { + -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, + -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 + }; + + for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { + // Indirect through the SplatCsts array so that we favor 'vsplti -1' for + // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' + int i = SplatCsts[idx]; + + // Figure out what shift amount will be used by altivec if shifted by i in + // this splat size. + unsigned TypeShiftAmt = i & (SplatBitSize-1); + + // vsplti + shl self. 
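+    // e.g. a v8i16 splat of 0x00A0 lands here with i == 5: TypeShiftAmt is
+    // 5 & 15 == 5 and (5 << 5) == 160 == 0x00A0, so it becomes vspltish 5
+    // followed by vslh (each element shifting itself by its own low bits).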
+ if (SextVal == (int)((unsigned)i << TypeShiftAmt)) { + SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0, + Intrinsic::ppc_altivec_vslw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + + // vsplti + srl self. + if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { + SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0, + Intrinsic::ppc_altivec_vsrw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + + // vsplti + sra self. + if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { + SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, + Intrinsic::ppc_altivec_vsraw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + + // vsplti + rol self. + if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | + ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { + SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl); + static const unsigned IIDs[] = { // Intrinsic to use for each size. + Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0, + Intrinsic::ppc_altivec_vrlw + }; + Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + + // t = vsplti c, result = vsldoi t, t, 1 + if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) { + SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl); + } + // t = vsplti c, result = vsldoi t, t, 2 + if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) { + SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl); + } + // t = vsplti c, result = vsldoi t, t, 3 + if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) { + SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl); + return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl); + } + } + + return SDValue(); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. 
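+/// Each PFEntry packs a cost in bits 31:30, an operation number in bits
+/// 29:26, and two 13-bit operand IDs (bits 25:13 and 12:0) that index back
+/// into the table, so this routine recurses until it reaches OP_COPY leaves.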
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + + enum { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VMRGHW, + OP_VMRGLW, + OP_VSPLTISW0, + OP_VSPLTISW1, + OP_VSPLTISW2, + OP_VSPLTISW3, + OP_VSLDOI4, + OP_VSLDOI8, + OP_VSLDOI12 + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1*9+2)*9+3) return LHS; + assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + + int ShufIdxs[16]; + switch (OpNum) { + default: llvm_unreachable("Unknown i32 permute!"); + case OP_VMRGHW: + ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3; + ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19; + ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7; + ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23; + break; + case OP_VMRGLW: + ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11; + ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27; + ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15; + ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31; + break; + case OP_VSPLTISW0: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+0; + break; + case OP_VSPLTISW1: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+4; + break; + case OP_VSPLTISW2: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+8; + break; + case OP_VSPLTISW3: + for (unsigned i = 0; i != 16; ++i) + ShufIdxs[i] = (i&3)+12; + break; + case OP_VSLDOI4: + return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl); + case OP_VSLDOI8: + return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl); + case OP_VSLDOI12: + return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl); + } + EVT VT = OpLHS.getValueType(); + OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS); + OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS); + SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs); + return DAG.getNode(ISD::BITCAST, dl, VT, T); +} + +/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this +/// is a shuffle we can handle in a single instruction, return it. Otherwise, +/// return the code it can be lowered into. Worst case, it can always be +/// lowered into a vperm. +SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + EVT VT = Op.getValueType(); + bool isLittleEndian = Subtarget.isLittleEndian(); + + // Cases that are handled by instructions that take permute immediates + // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be + // selected by the instruction selector. 
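+  // e.g. a v16i8 mask that is 5 in every element is a byte splat
+  // (isSplatShuffleMask with EltSize 1) and is selected directly as
+  // vspltb $vD, $vB, 5, needing no permute-control vector at all.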
+ if (V2.getOpcode() == ISD::UNDEF) { + if (PPC::isSplatShuffleMask(SVOp, 1) || + PPC::isSplatShuffleMask(SVOp, 2) || + PPC::isSplatShuffleMask(SVOp, 4) || + PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG)) { + return Op; + } + } + + // Altivec has a variety of "shuffle immediates" that take two vector inputs + // and produce a fixed permutation. If any of these match, do not lower to + // VPERM. + unsigned int ShuffleKind = isLittleEndian ? 2 : 0; + if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) || + PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 || + PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) || + PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG)) + return Op; + + // Check to see if this is a shuffle of 4-byte values. If so, we can use our + // perfect shuffle table to emit an optimal matching sequence. + ArrayRef<int> PermMask = SVOp->getMask(); + + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i*4+j] < 0) + continue; // Undef, ignore it. + + unsigned ByteSource = PermMask[i*4+j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } + + if (EltNo == 8) { + EltNo = ByteSource/4; + } else if (EltNo != ByteSource/4) { + isFourElementShuffle = false; + break; + } + } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle && !isLittleEndian) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = + PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be computed. + // For example, if the perm mask can be hoisted out of a loop or is already + // used (perhaps because there are multiple permutes with the same shuffle + // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of + // the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can be + // generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. 
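+  // e.g. the identity mask <0,1,2,3> gives PFIndexes {0,1,2,3} and
+  // PFTableIndex == 0*729 + 1*81 + 2*9 + 3 == 102, whose entry should decode
+  // to a cost-0 OP_COPY of the LHS (see GeneratePerfectShuffle above).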
+ if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant + // vector that will get spilled to the constant pool. + if (V2.getOpcode() == ISD::UNDEF) V2 = V1; + + // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except + // that it is in input element units, not in bytes. Convert now. + + // For little endian, the order of the input vectors is reversed, and + // the permutation mask is complemented with respect to 31. This is + // necessary to produce proper semantics with the big-endian-biased vperm + // instruction. + EVT EltVT = V1.getValueType().getVectorElementType(); + unsigned BytesPerElement = EltVT.getSizeInBits()/8; + + SmallVector<SDValue, 16> ResultMask; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i]; + + for (unsigned j = 0; j != BytesPerElement; ++j) + if (isLittleEndian) + ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j), + MVT::i32)); + else + ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j, + MVT::i32)); + } + + SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, + ResultMask); + if (isLittleEndian) + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V2, V1, VPermMask); + else + return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), + V1, V2, VPermMask); +} + +/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an +/// altivec comparison. If it is, return true and fill in Opc/isDot with +/// information about the intrinsic. +static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, + bool &isDot) { + unsigned IntrinsicID = + cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); + CompareOpc = -1; + isDot = false; + switch (IntrinsicID) { + default: return false; + // Comparison predicates. + case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; + case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; + + // Normal Comparisons. 
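+  // These use the same opcode numbers as the predicate forms above, but with
+  // isDot == 0 they select the non-record forms that leave CR6 untouched.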
+ case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; + case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; + } + return true; +} + +/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom +/// lower, do it, otherwise return null. +SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + // If this is a lowered altivec predicate compare, CompareOpc is set to the + // opcode number of the comparison. + SDLoc dl(Op); + int CompareOpc; + bool isDot; + if (!getAltivecCompareInfo(Op, CompareOpc, isDot)) + return SDValue(); // Don't custom lower most intrinsics. + + // If this is a non-dot comparison, make the VCMP node and we are done. + if (!isDot) { + SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(CompareOpc, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp); + } + + // Create the PPCISD altivec 'dot' comparison node. + SDValue Ops[] = { + Op.getOperand(2), // LHS + Op.getOperand(3), // RHS + DAG.getConstant(CompareOpc, MVT::i32) + }; + EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; + SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + + // Now that we have the comparison, emit a copy from the CR to a GPR. + // This is flagged to the above dot comparison. + SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32, + DAG.getRegister(PPC::CR6, MVT::i32), + CompNode.getValue(1)); + + // Unpack the result based on how the target uses it. + unsigned BitNo; // Bit # of CR6. + bool InvertBit; // Invert result? + switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { + default: // Can't happen, don't crash on invalid number though. + case 0: // Return the value of the EQ bit of CR6. + BitNo = 0; InvertBit = false; + break; + case 1: // Return the inverted value of the EQ bit of CR6. + BitNo = 0; InvertBit = true; + break; + case 2: // Return the value of the LT bit of CR6. + BitNo = 2; InvertBit = false; + break; + case 3: // Return the inverted value of the LT bit of CR6. + BitNo = 2; InvertBit = true; + break; + } + + // Shift the bit into the low position. + Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags, + DAG.getConstant(8-(3-BitNo), MVT::i32)); + // Isolate the bit. + Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags, + DAG.getConstant(1, MVT::i32)); + + // If we are supposed to, toggle the bit. 
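+  // (MFOCRF leaves CR6 in bits 24-27 of the 32-bit result, numbering bits
+  // big-endian from the MSB, so the EQ bit (BitNo 0) sits 5 bits and the LT
+  // bit (BitNo 2) sits 7 bits above the LSB -- hence 8-(3-BitNo) above.)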
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, MVT::i32));
+  return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
+  // instructions), but for smaller types, we need to first extend up to v2i32
+  // before going any farther.
+  if (Op.getValueType() == MVT::v2i64) {
+    EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    if (ExtVT != MVT::v2i32) {
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
+                       DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
+                                        ExtVT.getVectorElementType(), 4)));
+      Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
+      Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
+                       DAG.getValueType(MVT::v2i32));
+    }
+
+    return Op;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  EVT PtrVT = getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  // Store the input value into Value#0 of the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                               Op.getOperand(0), FIdx, MachinePointerInfo(),
+                               false, false, 0);
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl); // +16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                      LHS, RHS, DAG, dl, MVT::v4i32);
+
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+    bool isLittleEndian = Subtarget.isLittleEndian();
+
+    // Multiply the even 8-bit parts, producing 16-bit products.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                         LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit products.
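+    // (vmuleub multiplies byte lanes 0,2,...,14 and vmuloub lanes
+    // 1,3,...,15, counting lanes in big-endian element order.)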
+ SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub, + LHS, RHS, DAG, dl, MVT::v8i16); + OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts); + + // Merge the results together. Because vmuleub and vmuloub are + // instructions with a big-endian bias, we must reverse the + // element numbering and reverse the meaning of "odd" and "even" + // when generating little endian code. + int Ops[16]; + for (unsigned i = 0; i != 8; ++i) { + if (isLittleEndian) { + Ops[i*2 ] = 2*i; + Ops[i*2+1] = 2*i+16; + } else { + Ops[i*2 ] = 2*i+1; + Ops[i*2+1] = 2*i+1+16; + } + } + if (isLittleEndian) + return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops); + else + return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops); + } else { + llvm_unreachable("Unknown mul to lower!"); + } +} + +/// LowerOperation - Provide custom lowering hooks for some operations. +/// +SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG, Subtarget); + + case ISD::VAARG: + return LowerVAARG(Op, DAG, Subtarget); + + case ISD::VACOPY: + return LowerVACOPY(Op, DAG, Subtarget); + + case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); + case ISD::DYNAMIC_STACKALLOC: + return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); + + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, + SDLoc(Op)); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + + // Lower 64-bit shifts. + case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); + case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); + case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); + + // Vector-related lowering. + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); + + // For counter-based loop handling. + case ISD::INTRINSIC_W_CHAIN: return SDValue(); + + // Frame & Return address. 
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + } +} + +void PPCTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + const TargetMachine &TM = getTargetMachine(); + SDLoc dl(N); + switch (N->getOpcode()) { + default: + llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::INTRINSIC_W_CHAIN: { + if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != + Intrinsic::ppc_is_decremented_ctr_nonzero) + break; + + assert(N->getValueType(0) == MVT::i1 && + "Unexpected result type for CTR decrement intrinsic"); + EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0)); + SDVTList VTs = DAG.getVTList(SVT, MVT::Other); + SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), + N->getOperand(1)); + + Results.push_back(NewInt); + Results.push_back(NewInt.getValue(1)); + break; + } + case ISD::VAARG: { + if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI() + || TM.getSubtarget<PPCSubtarget>().isPPC64()) + return; + + EVT VT = N->getValueType(0); + + if (VT == MVT::i64) { + SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget); + + Results.push_back(NewNode); + Results.push_back(NewNode.getValue(1)); + } + return; + } + case ISD::FP_ROUND_INREG: { + assert(N->getValueType(0) == MVT::ppcf128); + assert(N->getOperand(0).getValueType() == MVT::ppcf128); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, + MVT::f64, N->getOperand(0), + DAG.getIntPtrConstant(0)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, + MVT::f64, N->getOperand(0), + DAG.getIntPtrConstant(1)); + + // Add the two halves of the long double in round-to-zero mode. + SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); + + // We know the low half is about to be thrown away, so just use something + // convenient. + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, + FPreg, FPreg)); + return; + } + case ISD::FP_TO_SINT: + // LowerFP_TO_INT() can only handle f32 and f64. + if (N->getOperand(0).getValueType() == MVT::ppcf128) + return; + Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); + return; + } +} + + +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + bool is64bit, unsigned BinOpcode) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *F = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned incr = MI->getOperand(3).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loopMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned TmpReg = (!BinOpcode) ? incr : + RegInfo.createVirtualRegister( + is64bit ? 
(const TargetRegisterClass *) &PPC::G8RCRegClass : + (const TargetRegisterClass *) &PPC::GPRCRegClass); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // l[wd]arx dest, ptr + // add r0, dest, incr + // st[wd]cx. r0, ptr + // bne- loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest) + .addReg(ptrA).addReg(ptrB); + if (BinOpcode) + BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); + BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX)) + .addReg(TmpReg).addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + return BB; +} + +MachineBasicBlock * +PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, + MachineBasicBlock *BB, + bool is8bit, // operation + unsigned BinOpcode) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + // In 64 bit mode we have to use 64 bits for addresses, even though the + // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address + // registers without caring whether they're 32 or 64, but here we're + // doing actual arithmetic on the addresses. + bool is64bit = Subtarget.isPPC64(); + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *F = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned incr = MI->getOperand(3).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loopMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = + is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass : + (const TargetRegisterClass *) &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned ShiftReg = RegInfo.createVirtualRegister(RC); + unsigned Incr2Reg = RegInfo.createVirtualRegister(RC); + unsigned MaskReg = RegInfo.createVirtualRegister(RC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + unsigned Ptr1Reg; + unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // The 4-byte load must be aligned, while a char or short may be + // anywhere in the word. Hence all this nasty bookkeeping code. 
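+  // e.g. in the sequence below, a byte whose address has EA & 3 == 1 (big
+  // endian) gets shift1 == (EA << 3) & 0x18 == 8 and shift == 8 ^ 24 == 16,
+  // so the operation is carried out in bits 23:16 of the aligned word and
+  // the mask becomes 0x00FF0000.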
+ // add ptr1, ptrA, ptrB [copy if ptrA==0] + // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] + // xori shift, shift1, 24 [16] + // rlwinm ptr, ptr1, 0, 0, 29 + // slw incr2, incr, shift + // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] + // slw mask, mask2, shift + // loopMBB: + // lwarx tmpDest, ptr + // add tmp, tmpDest, incr2 + // andc tmp2, tmpDest, mask + // and tmp3, tmp, mask + // or tmp4, tmp3, tmp2 + // stwcx. tmp4, ptr + // bne- loopMBB + // fallthrough --> exitMBB + // srw dest, tmpDest, shift + if (ptrA != ZeroReg) { + Ptr1Reg = RegInfo.createVirtualRegister(RC); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) + .addReg(ptrA).addReg(ptrB); + } else { + Ptr1Reg = ptrB; + } + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) + .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) + .addReg(Shift1Reg).addImm(is8bit ? 24 : 16); + if (is64bit) + BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(61); + else + BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); + BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg) + .addReg(incr).addReg(ShiftReg); + if (is8bit) + BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); + else { + BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); + BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535); + } + BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) + .addReg(Mask2Reg).addReg(ShiftReg); + + BB = loopMBB; + BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + if (BinOpcode) + BuildMI(BB, dl, TII->get(BinOpcode), TmpReg) + .addReg(Incr2Reg).addReg(TmpDestReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg) + .addReg(TmpReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg) + .addReg(Tmp3Reg).addReg(Tmp2Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)) + .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB); + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... 
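+  // (An atomic RMW must return the *old* value: TmpDestReg still holds the
+  // word as originally loaded, and the srw below moves the addressed
+  // byte/halfword back into the low-order bits of dest.)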
+ BB = exitMBB; + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg) + .addReg(ShiftReg); + return BB; +} + +llvm::MachineBasicBlock* +PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + // For v = setjmp(buf), we generate + // + // thisMBB: + // SjLjSetup mainMBB + // bl mainMBB + // v_restore = 1 + // b sinkMBB + // + // mainMBB: + // buf[LabelOffset] = LR + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Note that the structure of the jmp_buf used here is not compatible + // with that used by libc, and is not designed to be. Specifically, it + // stores only those 'reserved' registers that LLVM does not otherwise + // understand how to spill. Also, by convention, by the time this + // intrinsic is called, Clang has already stored the frame address in the + // first slot of the buffer and stack address in the third. Following the + // X86 target code, we'll store the jump address in the second slot. We also + // need to save the TOC pointer (R2) to handle jumps between shared + // libraries, and that will be stored in the fourth slot. The thread + // identifier (R13) is not affected. + + // thisMBB: + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t TOCOffset = 3 * PVT.getStoreSize(); + const int64_t BPOffset = 4 * PVT.getStoreSize(); + + // Prepare IP either in reg. + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + unsigned LabelReg = MRI.createVirtualRegister(PtrRC); + unsigned BufReg = MI->getOperand(1).getReg(); + + if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD)) + .addReg(PPC::X2) + .addImm(TOCOffset) + .addReg(BufReg); + MIB.setMemRefs(MMOBegin, MMOEnd); + } + + // Naked functions never have a base pointer, and so we use r1. For all + // other functions, this decision must be delayed until during PEI. + unsigned BaseReg; + if (MF->getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::Naked)) + BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1; + else + BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP; + + MIB = BuildMI(*thisMBB, MI, DL, + TII->get(Subtarget.isPPC64() ? 
PPC::STD : PPC::STW)) + .addReg(BaseReg) + .addImm(BPOffset) + .addReg(BufReg); + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB); + const PPCRegisterInfo *TRI = + static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo()); + MIB.addRegMask(TRI->getNoPreservedMask()); + + BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1); + + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup)) + .addMBB(mainMBB); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB); + + thisMBB->addSuccessor(mainMBB, /* weight */ 0); + thisMBB->addSuccessor(sinkMBB, /* weight */ 1); + + // mainMBB: + // mainDstReg = 0 + MIB = BuildMI(mainMBB, DL, + TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg); + + // Store IP + if (Subtarget.isPPC64()) { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD)) + .addReg(LabelReg) + .addImm(LabelOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW)) + .addReg(LabelReg) + .addImm(LabelOffset) + .addReg(BufReg); + } + + MIB.setMemRefs(MMOBegin, MMOEnd); + + BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(PPC::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(thisMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31; + unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1; + unsigned BP = (PVT == MVT::i64) ? PPC::X30 : + (Subtarget.isSVR4ABI() && + MF->getTarget().getRelocationModel() == Reloc::PIC_ ? + PPC::R29 : PPC::R30); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + const int64_t TOCOffset = 3 * PVT.getStoreSize(); + const int64_t BPOffset = 4 * PVT.getStoreSize(); + + unsigned BufReg = MI->getOperand(0).getReg(); + + // Reload FP (the jumped-to function may not have had a + // frame pointer, and if so, then its r31 will be restored + // as necessary). 
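+  // The offsets mirror what emitEHSjLjSetJmp (and Clang) stored: slot 0
+  // holds FP, slot 1 the IP label, slot 2 SP, slot 3 the TOC pointer (R2),
+  // and slot 4 BP, each slot PVT.getStoreSize() bytes wide.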
+ if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP) + .addImm(0) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP) + .addImm(0) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload IP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp) + .addImm(LabelOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp) + .addImm(LabelOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload SP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP) + .addImm(SPOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP) + .addImm(SPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload BP + if (PVT == MVT::i64) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP) + .addImm(BPOffset) + .addReg(BufReg); + } else { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP) + .addImm(BPOffset) + .addReg(BufReg); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + + // Reload TOC + if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) { + MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2) + .addImm(TOCOffset) + .addReg(BufReg); + + MIB.setMemRefs(MMOBegin, MMOEnd); + } + + // Jump + BuildMI(*MBB, MI, DL, + TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp); + BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR)); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * +PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 || + MI->getOpcode() == PPC::EH_SjLj_SetJmp64) { + return emitEHSjLjSetJmp(MI, BB); + } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 || + MI->getOpcode() == PPC::EH_SjLj_LongJmp64) { + return emitEHSjLjLongJmp(MI, BB); + } + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + // To "insert" these instructions we actually have to insert their + // control-flow patterns. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + MachineFunction *F = BB->getParent(); + + if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8)) { + SmallVector<MachineOperand, 2> Cond; + if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8) + Cond.push_back(MI->getOperand(4)); + else + Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET)); + Cond.push_back(MI->getOperand(1)); + + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), + Cond, MI->getOperand(2).getReg(), + MI->getOperand(3).getReg()); + } else if (MI->getOpcode() == PPC::SELECT_CC_I4 || + MI->getOpcode() == PPC::SELECT_CC_I8 || + MI->getOpcode() == PPC::SELECT_CC_F4 || + MI->getOpcode() == PPC::SELECT_CC_F8 || + MI->getOpcode() == PPC::SELECT_CC_VRRC || + MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8 || + MI->getOpcode() == PPC::SELECT_F4 || + MI->getOpcode() == PPC::SELECT_F8 || + MI->getOpcode() == PPC::SELECT_VRRC) { + // The incoming instruction knows the destination vreg to set, the + // condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. + + // thisMBB: + // ... 
+ // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + DebugLoc dl = MI->getDebugLoc(); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + if (MI->getOpcode() == PPC::SELECT_I4 || + MI->getOpcode() == PPC::SELECT_I8 || + MI->getOpcode() == PPC::SELECT_F4 || + MI->getOpcode() == PPC::SELECT_F8 || + MI->getOpcode() == PPC::SELECT_VRRC) { + BuildMI(BB, dl, TII->get(PPC::BC)) + .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + } else { + unsigned SelectPred = MI->getOperand(4).getImm(); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB); + } + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BB = sinkMBB; + BuildMI(*BB, BB->begin(), dl, + TII->get(PPC::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + } + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::AND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::AND8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::OR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::OR8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::XOR); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16) + BB 
= EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::NAND); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8); + + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32) + BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF); + else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64) + BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8); + + else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8) + BB = EmitPartwordAtomicBinary(MI, BB, true, 0); + else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16) + BB = EmitPartwordAtomicBinary(MI, BB, false, 0); + else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32) + BB = EmitAtomicBinary(MI, BB, false, 0); + else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64) + BB = EmitAtomicBinary(MI, BB, true, 0); + + else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 || + MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) { + bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned oldval = MI->getOperand(3).getReg(); + unsigned newval = MI->getOperand(4).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loop1MBB); + F->insert(It, loop2MBB); + F->insert(It, midMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loop1MBB); + + // loop1MBB: + // l[wd]arx dest, ptr + // cmp[wd] dest, oldval + // bne- midMBB + // loop2MBB: + // st[wd]cx. newval, ptr + // bne- loopMBB + // b exitBB + // midMBB: + // st[wd]cx. dest, ptr + // exitBB: + BB = loop1MBB; + BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest) + .addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) + .addReg(oldval).addReg(dest); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(midMBB); + + BB = loop2MBB; + BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX)) + .addReg(newval).addReg(ptrA).addReg(ptrB); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + BB = midMBB; + BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX)) + .addReg(dest).addReg(ptrA).addReg(ptrB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 || + MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) { + // We must use 64-bit registers for addresses when targeting 64-bit, + // since we're actually doing arithmetic on them. Other registers + // can be 32-bit. 
+ bool is64bit = Subtarget.isPPC64(); + bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptrA = MI->getOperand(1).getReg(); + unsigned ptrB = MI->getOperand(2).getReg(); + unsigned oldval = MI->getOperand(3).getReg(); + unsigned newval = MI->getOperand(4).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, loop1MBB); + F->insert(It, loop2MBB); + F->insert(It, midMBB); + F->insert(It, exitMBB); + exitMBB->splice(exitMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + const TargetRegisterClass *RC = + is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass : + (const TargetRegisterClass *) &PPC::GPRCRegClass; + unsigned PtrReg = RegInfo.createVirtualRegister(RC); + unsigned Shift1Reg = RegInfo.createVirtualRegister(RC); + unsigned ShiftReg = RegInfo.createVirtualRegister(RC); + unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC); + unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC); + unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC); + unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC); + unsigned MaskReg = RegInfo.createVirtualRegister(RC); + unsigned Mask2Reg = RegInfo.createVirtualRegister(RC); + unsigned Mask3Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC); + unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC); + unsigned TmpDestReg = RegInfo.createVirtualRegister(RC); + unsigned Ptr1Reg; + unsigned TmpReg = RegInfo.createVirtualRegister(RC); + unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loop1MBB); + + // The 4-byte load must be aligned, while a char or short may be + // anywhere in the word. Hence all this nasty bookkeeping code. + // add ptr1, ptrA, ptrB [copy if ptrA==0] + // rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27] + // xori shift, shift1, 24 [16] + // rlwinm ptr, ptr1, 0, 0, 29 + // slw newval2, newval, shift + // slw oldval2, oldval,shift + // li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535] + // slw mask, mask2, shift + // and newval3, newval2, mask + // and oldval3, oldval2, mask + // loop1MBB: + // lwarx tmpDest, ptr + // and tmp, tmpDest, mask + // cmpw tmp, oldval3 + // bne- midMBB + // loop2MBB: + // andc tmp2, tmpDest, mask + // or tmp4, tmp2, newval3 + // stwcx. tmp4, ptr + // bne- loop1MBB + // b exitBB + // midMBB: + // stwcx. tmpDest, ptr + // exitBB: + // srw dest, tmpDest, shift + if (ptrA != ZeroReg) { + Ptr1Reg = RegInfo.createVirtualRegister(RC); + BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg) + .addReg(ptrA).addReg(ptrB); + } else { + Ptr1Reg = ptrB; + } + BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg) + .addImm(3).addImm(27).addImm(is8bit ? 28 : 27); + BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg) + .addReg(Shift1Reg).addImm(is8bit ? 
24 : 16); + if (is64bit) + BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(61); + else + BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg) + .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29); + BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg) + .addReg(newval).addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg) + .addReg(oldval).addReg(ShiftReg); + if (is8bit) + BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255); + else { + BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0); + BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg) + .addReg(Mask3Reg).addImm(65535); + } + BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg) + .addReg(Mask2Reg).addReg(ShiftReg); + BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg) + .addReg(NewVal2Reg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg) + .addReg(OldVal2Reg).addReg(MaskReg); + + BB = loop1MBB; + BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::AND),TmpReg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) + .addReg(TmpReg).addReg(OldVal3Reg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB); + BB->addSuccessor(loop2MBB); + BB->addSuccessor(midMBB); + + BB = loop2MBB; + BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg) + .addReg(TmpDestReg).addReg(MaskReg); + BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg) + .addReg(Tmp2Reg).addReg(NewVal3Reg); + BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg) + .addReg(ZeroReg).addReg(PtrReg); + BuildMI(BB, dl, TII->get(PPC::BCC)) + .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB); + BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB); + BB->addSuccessor(loop1MBB); + BB->addSuccessor(exitMBB); + + BB = midMBB; + BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg) + .addReg(ZeroReg).addReg(PtrReg); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg) + .addReg(ShiftReg); + } else if (MI->getOpcode() == PPC::FADDrtz) { + // This pseudo performs an FADD with rounding mode temporarily forced + // to round-to-zero. We emit this via custom inserter since the FPSCR + // is not modeled at the SelectionDAG level. + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + unsigned Src2 = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass); + + // Save FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); + + // Set rounding mode to round-to-zero. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); + + // Perform addition. + BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); + + // Restore FPSCR value. + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg); + } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT || + MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) { + unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 || + MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ? 
+ PPC::ANDIo8 : PPC::ANDIo; + bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT || + MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8); + + MachineRegisterInfo &RegInfo = F->getRegInfo(); + unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ? + &PPC::GPRCRegClass : + &PPC::G8RCRegClass); + + DebugLoc dl = MI->getDebugLoc(); + BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) + .addReg(MI->getOperand(1).getReg()).addImm(1); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), + MI->getOperand(0).getReg()) + .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); + } else { + llvm_unreachable("Unexpected instr type to insert"); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op, + DAGCombinerInfo &DCI) const { + if (DCI.isAfterLegalizeVectorOps()) + return SDValue(); + + EVT VT = Op.getValueType(); + + if ((VT == MVT::f32 && Subtarget.hasFRES()) || + (VT == MVT::f64 && Subtarget.hasFRE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { + + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal, we need to find the zero of the function: + // F(X) = A X - 1 [which has a zero at X = 1/A] + // => + // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form + // does not require additional intermediate precision] + + // Convergence is quadratic, so we essentially double the number of digits + // correct after every iteration. The minimum architected relative + // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has + // 23 digits and double has 52 digits. + int Iterations = Subtarget.hasRecipPrec() ? 
1 : 3; + if (VT.getScalarType() == MVT::f64) + ++Iterations; + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(Op); + + SDValue FPOne = + DAG.getConstantFP(1.0, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && + "Unknown vector type"); + FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + FPOne, FPOne, FPOne, FPOne); + } + + SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op); + DCI.AddToWorklist(Est.getNode()); + + // Newton iterations: Est = Est + Est (1 - Arg * Est) + for (int i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst); + DCI.AddToWorklist(Est.getNode()); + } + + return Est; + } + + return SDValue(); +} + +SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op, + DAGCombinerInfo &DCI) const { + if (DCI.isAfterLegalizeVectorOps()) + return SDValue(); + + EVT VT = Op.getValueType(); + + if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || + (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || + (VT == MVT::v4f32 && Subtarget.hasAltivec()) || + (VT == MVT::v2f64 && Subtarget.hasVSX())) { + + // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) + // For the reciprocal sqrt, we need to find the zero of the function: + // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] + // => + // X_{i+1} = X_i (1.5 - A X_i^2 / 2) + // As a result, we precompute A/2 prior to the iteration loop. + + // Convergence is quadratic, so we essentially double the number of digits + // correct after every iteration. The minimum architected relative + // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has + // 23 digits and double has 52 digits. + int Iterations = Subtarget.hasRecipPrec() ? 1 : 3; + if (VT.getScalarType() == MVT::f64) + ++Iterations; + + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(Op); + + SDValue FPThreeHalves = + DAG.getConstantFP(1.5, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && + "Unknown vector type"); + FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, + FPThreeHalves, FPThreeHalves, + FPThreeHalves, FPThreeHalves); + } + + SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op); + DCI.AddToWorklist(Est.getNode()); + + // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that + // this entire sequence requires only one FP constant. + SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op); + DCI.AddToWorklist(HalfArg.getNode()); + + HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op); + DCI.AddToWorklist(HalfArg.getNode()); + + // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) + for (int i = 0; i < Iterations; ++i) { + SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst); + DCI.AddToWorklist(NewEst.getNode()); + + Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst); + DCI.AddToWorklist(Est.getNode()); + } + + return Est; + } + + return SDValue(); +} + +// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does +// not enforce equality of the chain operands. 
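+// For example, with Bytes == 4 and Dist == 1, a 4-byte access to a frame
+// slot at offset 8 is consecutive to one at offset 4, and a load of GV+12
+// is consecutive to a load of GV+8; the frame-index, base-plus-constant,
+// and global-address forms are the three cases checked below.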
+static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  EVT VT = LS->getMemoryVT();
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LS->getBasePtr();
+  SDValue BaseLoc = Base->getBasePtr();
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+
+  // Handle X+C
+  if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
+      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+    return true;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const GlobalValue *GV1 = nullptr;
+  const GlobalValue *GV2 = nullptr;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+// Return true if there is a nearby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking through
+// token factors and other loads (but nothing else). As a result, a true
+// result indicates that it is safe to create a new consecutive load adjacent
+// to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDValue Chain = LD->getChain();
+  EVT VT = LD->getMemoryVT();
+
+  SmallSet<SDNode *, 16> LoadRoots;
+  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+  SmallSet<SDNode *, 16> Visited;
+
+  // First, search up the chain, branching to follow all token-factor operands.
+  // If we find a consecutive load, then we're done; otherwise, record all
+  // nodes just above the top-level loads and token factors.
+  while (!Queue.empty()) {
+    SDNode *ChainNext = Queue.pop_back_val();
+    if (!Visited.insert(ChainNext))
+      continue;
+
+    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+        return true;
+
+      if (!Visited.count(ChainLD->getChain().getNode()))
+        Queue.push_back(ChainLD->getChain().getNode());
+    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+      for (const SDUse &O : ChainNext->ops())
+        if (!Visited.count(O.getNode()))
+          Queue.push_back(O.getNode());
+    } else
+      LoadRoots.insert(ChainNext);
+  }
+
+  // Second, search down the chain, starting from the top-level nodes recorded
+  // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look through
+  // all loads (just the chain uses) and token factors to find a consecutive
+  // load.
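+  // As a sketch of the traversal: if LD's chain is
+  //   LD.chain = TokenFactor(ld1.chain, ld2.chain)
+  // then the first phase visits ld1 and ld2 (testing each against LD) and
+  // records their chain inputs as LoadRoots; the second phase then walks
+  // forward from those roots through chained loads and token factors, so
+  // a consecutive load hanging off a sibling branch of the chain is still
+  // found.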
+  Visited.clear();
+  Queue.clear();
+
+  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+       IE = LoadRoots.end(); I != IE; ++I) {
+    Queue.push_back(*I);
+
+    while (!Queue.empty()) {
+      SDNode *LoadRoot = Queue.pop_back_val();
+      if (!Visited.insert(LoadRoot))
+        continue;
+
+      if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+          return true;
+
+      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+           UE = LoadRoot->use_end(); UI != UE; ++UI)
+        if (((isa<LoadSDNode>(*UI) &&
+            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+            UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+          Queue.push_back(*UI);
+    }
+  }
+
+  return false;
+}
+
+SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+
+  assert(Subtarget.useCRBits() &&
+         "Expecting to be tracking CR bits");
+  // If we're tracking CR bits, we need to be careful that we don't have:
+  //   trunc(binary-ops(zext(x), zext(y)))
+  // or
+  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...))
+  // such that we're unnecessarily moving things into GPRs when it would be
+  // better to keep them in CR bits.
+
+  // Note that trunc here can be an actual i1 trunc, or can be the effective
+  // truncation that comes from a setcc or select_cc.
+  if (N->getOpcode() == ISD::TRUNCATE &&
+      N->getValueType(0) != MVT::i1)
+    return SDValue();
+
+  if (N->getOperand(0).getValueType() != MVT::i32 &&
+      N->getOperand(0).getValueType() != MVT::i64)
+    return SDValue();
+
+  if (N->getOpcode() == ISD::SETCC ||
+      N->getOpcode() == ISD::SELECT_CC) {
+    // If we're looking at a comparison, then we need to make sure that the
+    // high bits (all except for the first) don't matter to the result.
+    ISD::CondCode CC =
+      cast<CondCodeSDNode>(N->getOperand(
+        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
+    unsigned OpBits = N->getOperand(0).getValueSizeInBits();
+
+    if (ISD::isSignedIntSetCC(CC)) {
+      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
+          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
+        return SDValue();
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (!DAG.MaskedValueIsZero(N->getOperand(0),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
+          !DAG.MaskedValueIsZero(N->getOperand(1),
+                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
+        return SDValue();
+    } else {
+      // This is neither a signed nor an unsigned comparison; just make sure
+      // that the high bits are equal.
+      APInt Op1Zero, Op1One;
+      APInt Op2Zero, Op2One;
+      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+
+      // We don't really care about what is known about the first bit (if
+      // anything), so clear it in all masks prior to comparing them.
+      Op1Zero.clearBit(0); Op1One.clearBit(0);
+      Op2Zero.clearBit(0); Op2One.clearBit(0);
+
+      if (Op1Zero != Op2Zero || Op1One != Op2One)
+        return SDValue();
+    }
+  }
+
+  // We now know that the higher-order bits are irrelevant; we just need to
+  // make sure that all of the intermediate operations are bit operations, and
+  // all inputs are extensions.
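+  // For example (a sketch of the pattern being targeted):
+  //   trunc(xor(zext(a:i1), zext(b:i1))):i1
+  // can be computed as xor(a, b) directly on CR bits, erasing the zexts
+  // and the trunc, instead of materializing the intermediate values in
+  // GPRs.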
+ if (N->getOperand(0).getOpcode() != ISD::AND && + N->getOperand(0).getOpcode() != ISD::OR && + N->getOperand(0).getOpcode() != ISD::XOR && + N->getOperand(0).getOpcode() != ISD::SELECT && + N->getOperand(0).getOpcode() != ISD::SELECT_CC && + N->getOperand(0).getOpcode() != ISD::TRUNCATE && + N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND && + N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() != ISD::ANY_EXTEND) + return SDValue(); + + if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) && + N->getOperand(1).getOpcode() != ISD::AND && + N->getOperand(1).getOpcode() != ISD::OR && + N->getOperand(1).getOpcode() != ISD::XOR && + N->getOperand(1).getOpcode() != ISD::SELECT && + N->getOperand(1).getOpcode() != ISD::SELECT_CC && + N->getOperand(1).getOpcode() != ISD::TRUNCATE && + N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND && + N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND && + N->getOperand(1).getOpcode() != ISD::ANY_EXTEND) + return SDValue(); + + SmallVector<SDValue, 4> Inputs; + SmallVector<SDValue, 8> BinOps, PromOps; + SmallPtrSet<SDNode *, 16> Visited; + + for (unsigned i = 0; i < 2; ++i) { + if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) && + N->getOperand(i).getOperand(0).getValueType() == MVT::i1) || + isa<ConstantSDNode>(N->getOperand(i))) + Inputs.push_back(N->getOperand(i)); + else + BinOps.push_back(N->getOperand(i)); + + if (N->getOpcode() == ISD::TRUNCATE) + break; + } + + // Visit all inputs, collect all binary operations (and, or, xor and + // select) that are all fed by extensions. + while (!BinOps.empty()) { + SDValue BinOp = BinOps.back(); + BinOps.pop_back(); + + if (!Visited.insert(BinOp.getNode())) + continue; + + PromOps.push_back(BinOp); + + for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { + // The condition of the select is not promoted. + if (BinOp.getOpcode() == ISD::SELECT && i == 0) + continue; + if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) + continue; + + if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && + BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || + isa<ConstantSDNode>(BinOp.getOperand(i))) { + Inputs.push_back(BinOp.getOperand(i)); + } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || + BinOp.getOperand(i).getOpcode() == ISD::OR || + BinOp.getOperand(i).getOpcode() == ISD::XOR || + BinOp.getOperand(i).getOpcode() == ISD::SELECT || + BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC || + BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || + BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND || + BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) { + BinOps.push_back(BinOp.getOperand(i)); + } else { + // We have an input that is not an extension or another binary + // operation; we'll abort this transformation. + return SDValue(); + } + } + } + + // Make sure that this is a self-contained cluster of operations (which + // is not quite the same thing as saying that everything has only one + // use). 
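+  // For instance, if zext(a:i1) in the cluster also feeds a store that is
+  // not part of the cluster, then replacing its uses with the i1 value a
+  // would change the store's operand type, so the transformation must be
+  // abandoned.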
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), + UE = Inputs[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == Inputs[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == Inputs[i] || + User->getOperand(1) == Inputs[i]) + return SDValue(); + } + } + } + + for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { + for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), + UE = PromOps[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == PromOps[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == PromOps[i] || + User->getOperand(1) == PromOps[i]) + return SDValue(); + } + } + } + + // Replace all inputs with the extension operand. + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + // Constants may have users outside the cluster of to-be-promoted nodes, + // and so we need to replace those as we do the promotions. + if (isa<ConstantSDNode>(Inputs[i])) + continue; + else + DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); + } + + // Replace all operations (these are all the same, but have a different + // (i1) return type). DAG.getNode will validate that the types of + // a binary operator match, so go through the list in reverse so that + // we've likely promoted both operands first. Any intermediate truncations or + // extensions disappear. + while (!PromOps.empty()) { + SDValue PromOp = PromOps.back(); + PromOps.pop_back(); + + if (PromOp.getOpcode() == ISD::TRUNCATE || + PromOp.getOpcode() == ISD::SIGN_EXTEND || + PromOp.getOpcode() == ISD::ZERO_EXTEND || + PromOp.getOpcode() == ISD::ANY_EXTEND) { + if (!isa<ConstantSDNode>(PromOp.getOperand(0)) && + PromOp.getOperand(0).getValueType() != MVT::i1) { + // The operand is not yet ready (see comment below). 
+ PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + SDValue RepValue = PromOp.getOperand(0); + if (isa<ConstantSDNode>(RepValue)) + RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue); + + DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue); + continue; + } + + unsigned C; + switch (PromOp.getOpcode()) { + default: C = 0; break; + case ISD::SELECT: C = 1; break; + case ISD::SELECT_CC: C = 2; break; + } + + if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && + PromOp.getOperand(C).getValueType() != MVT::i1) || + (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && + PromOp.getOperand(C+1).getValueType() != MVT::i1)) { + // The to-be-promoted operands of this node have not yet been + // promoted (this should be rare because we're going through the + // list backward, but if one of the operands has several users in + // this cluster of to-be-promoted nodes, it is possible). + PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), + PromOp.getNode()->op_end()); + + // If there are any constant inputs, make sure they're replaced now. + for (unsigned i = 0; i < 2; ++i) + if (isa<ConstantSDNode>(Ops[C+i])) + Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]); + + DAG.ReplaceAllUsesOfValueWith(PromOp, + DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops)); + } + + // Now we're left with the initial truncation itself. + if (N->getOpcode() == ISD::TRUNCATE) + return N->getOperand(0); + + // Otherwise, this is a comparison. The operands to be compared have just + // changed type (to i1), but everything else is the same. + return SDValue(N, 0); +} + +SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + + // If we're tracking CR bits, we need to be careful that we don't have: + // zext(binary-ops(trunc(x), trunc(y))) + // or + // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...) + // such that we're unnecessarily moving things into CR bits that can more + // efficiently stay in GPRs. Note that if we're not certain that the high + // bits are set as required by the final extension, we still may need to do + // some masking to get the proper behavior. + + // This same functionality is important on PPC64 when dealing with + // 32-to-64-bit extensions; these occur often when 32-bit values are used as + // the return values of functions. Because it is so similar, it is handled + // here as well. + + if (N->getValueType(0) != MVT::i32 && + N->getValueType(0) != MVT::i64) + return SDValue(); + + if (!((N->getOperand(0).getValueType() == MVT::i1 && + Subtarget.useCRBits()) || + (N->getOperand(0).getValueType() == MVT::i32 && + Subtarget.isPPC64()))) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::AND && + N->getOperand(0).getOpcode() != ISD::OR && + N->getOperand(0).getOpcode() != ISD::XOR && + N->getOperand(0).getOpcode() != ISD::SELECT && + N->getOperand(0).getOpcode() != ISD::SELECT_CC) + return SDValue(); + + SmallVector<SDValue, 4> Inputs; + SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps; + SmallPtrSet<SDNode *, 16> Visited; + + // Visit all inputs, collect all binary operations (and, or, xor and + // select) that are all fed by truncations. 
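+  // A sketch of the pattern being collected: in
+  //   zext(or(trunc(x:i64):i1, trunc(y:i64):i1)):i64
+  // both operands of the 'or' are truncations, so the cluster can be
+  // recomputed as or(x, y) directly in i64, subject to the masking check
+  // performed below.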
+ while (!BinOps.empty()) { + SDValue BinOp = BinOps.back(); + BinOps.pop_back(); + + if (!Visited.insert(BinOp.getNode())) + continue; + + PromOps.push_back(BinOp); + + for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) { + // The condition of the select is not promoted. + if (BinOp.getOpcode() == ISD::SELECT && i == 0) + continue; + if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3) + continue; + + if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || + isa<ConstantSDNode>(BinOp.getOperand(i))) { + Inputs.push_back(BinOp.getOperand(i)); + } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || + BinOp.getOperand(i).getOpcode() == ISD::OR || + BinOp.getOperand(i).getOpcode() == ISD::XOR || + BinOp.getOperand(i).getOpcode() == ISD::SELECT || + BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) { + BinOps.push_back(BinOp.getOperand(i)); + } else { + // We have an input that is not a truncation or another binary + // operation; we'll abort this transformation. + return SDValue(); + } + } + } + + // Make sure that this is a self-contained cluster of operations (which + // is not quite the same thing as saying that everything has only one + // use). + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(), + UE = Inputs[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == Inputs[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == Inputs[i] || + User->getOperand(1) == Inputs[i]) + return SDValue(); + } + } + } + + for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { + for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(), + UE = PromOps[i].getNode()->use_end(); + UI != UE; ++UI) { + SDNode *User = *UI; + if (User != N && !Visited.count(User)) + return SDValue(); + + // Make sure that we're not going to promote the non-output-value + // operand(s) or SELECT or SELECT_CC. + // FIXME: Although we could sometimes handle this, and it does occur in + // practice that one of the condition inputs to the select is also one of + // the outputs, we currently can't deal with this. + if (User->getOpcode() == ISD::SELECT) { + if (User->getOperand(0) == PromOps[i]) + return SDValue(); + } else if (User->getOpcode() == ISD::SELECT_CC) { + if (User->getOperand(0) == PromOps[i] || + User->getOperand(1) == PromOps[i]) + return SDValue(); + } + } + } + + unsigned PromBits = N->getOperand(0).getValueSizeInBits(); + bool ReallyNeedsExt = false; + if (N->getOpcode() != ISD::ANY_EXTEND) { + // If all of the inputs are not already sign/zero extended, then + // we'll still need to do that at the end. 
+ for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + unsigned OpBits = + Inputs[i].getOperand(0).getValueSizeInBits(); + assert(PromBits < OpBits && "Truncation not to a smaller bit count?"); + + if ((N->getOpcode() == ISD::ZERO_EXTEND && + !DAG.MaskedValueIsZero(Inputs[i].getOperand(0), + APInt::getHighBitsSet(OpBits, + OpBits-PromBits))) || + (N->getOpcode() == ISD::SIGN_EXTEND && + DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) < + (OpBits-(PromBits-1)))) { + ReallyNeedsExt = true; + break; + } + } + } + + // Replace all inputs, either with the truncation operand, or a + // truncation or extension to the final output type. + for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { + // Constant inputs need to be replaced with the to-be-promoted nodes that + // use them because they might have users outside of the cluster of + // promoted nodes. + if (isa<ConstantSDNode>(Inputs[i])) + continue; + + SDValue InSrc = Inputs[i].getOperand(0); + if (Inputs[i].getValueType() == N->getValueType(0)) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc); + else if (N->getOpcode() == ISD::SIGN_EXTEND) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0))); + else if (N->getOpcode() == ISD::ZERO_EXTEND) + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0))); + else + DAG.ReplaceAllUsesOfValueWith(Inputs[i], + DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); + } + + // Replace all operations (these are all the same, but have a different + // (promoted) return type). DAG.getNode will validate that the types of + // a binary operator match, so go through the list in reverse so that + // we've likely promoted both operands first. + while (!PromOps.empty()) { + SDValue PromOp = PromOps.back(); + PromOps.pop_back(); + + unsigned C; + switch (PromOp.getOpcode()) { + default: C = 0; break; + case ISD::SELECT: C = 1; break; + case ISD::SELECT_CC: C = 2; break; + } + + if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) && + PromOp.getOperand(C).getValueType() != N->getValueType(0)) || + (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) && + PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) { + // The to-be-promoted operands of this node have not yet been + // promoted (this should be rare because we're going through the + // list backward, but if one of the operands has several users in + // this cluster of to-be-promoted nodes, it is possible). + PromOps.insert(PromOps.begin(), PromOp); + continue; + } + + SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(), + PromOp.getNode()->op_end()); + + // If this node has constant inputs, then they'll need to be promoted here. + for (unsigned i = 0; i < 2; ++i) { + if (!isa<ConstantSDNode>(Ops[C+i])) + continue; + if (Ops[C+i].getValueType() == N->getValueType(0)) + continue; + + if (N->getOpcode() == ISD::SIGN_EXTEND) + Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); + else if (N->getOpcode() == ISD::ZERO_EXTEND) + Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); + else + Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0)); + } + + DAG.ReplaceAllUsesOfValueWith(PromOp, + DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops)); + } + + // Now we're left with the initial extension itself. + if (!ReallyNeedsExt) + return N->getOperand(0); + + // To zero extend, just mask off everything except for the first bit (in the + // i1 case). 
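+  // For i1 -> i64, for example, the zero-extension case below emits
+  // (and x, 1), while the sign-extension case emits (sra (shl x, 63), 63):
+  // shift left so the low bit becomes the sign bit, then arithmetic-shift
+  // it back across the whole register.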
+ if (N->getOpcode() == ISD::ZERO_EXTEND) + return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0), + DAG.getConstant(APInt::getLowBitsSet( + N->getValueSizeInBits(0), PromBits), + N->getValueType(0))); + + assert(N->getOpcode() == ISD::SIGN_EXTEND && + "Invalid extension type"); + EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0)); + SDValue ShiftCst = + DAG.getConstant(N->getValueSizeInBits(0)-PromBits, ShiftAmountTy); + return DAG.getNode(ISD::SRA, dl, N->getValueType(0), + DAG.getNode(ISD::SHL, dl, N->getValueType(0), + N->getOperand(0), ShiftCst), ShiftCst); +} + +SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + const TargetMachine &TM = getTargetMachine(); + SelectionDAG &DAG = DCI.DAG; + SDLoc dl(N); + switch (N->getOpcode()) { + default: break; + case PPCISD::SHL: + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (C->isNullValue()) // 0 << V -> 0. + return N->getOperand(0); + } + break; + case PPCISD::SRL: + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (C->isNullValue()) // 0 >>u V -> 0. + return N->getOperand(0); + } + break; + case PPCISD::SRA: + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + if (C->isNullValue() || // 0 >>s V -> 0. + C->isAllOnesValue()) // -1 >>s V -> -1. + return N->getOperand(0); + } + break; + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + return DAGCombineExtBoolTrunc(N, DCI); + case ISD::TRUNCATE: + case ISD::SETCC: + case ISD::SELECT_CC: + return DAGCombineTruncBoolExt(N, DCI); + case ISD::FDIV: { + assert(TM.Options.UnsafeFPMath && + "Reciprocal estimates require UnsafeFPMath"); + + if (N->getOperand(1).getOpcode() == ISD::FSQRT) { + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI); + if (RV.getNode()) { + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND && + N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) { + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), + DCI); + if (RV.getNode()) { + DCI.AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)), + N->getValueType(0), RV); + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND && + N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) { + SDValue RV = + DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0), + DCI); + if (RV.getNode()) { + DCI.AddToWorklist(RV.getNode()); + RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)), + N->getValueType(0), RV, + N->getOperand(1).getOperand(1)); + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + } + + SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI); + if (RV.getNode()) { + DCI.AddToWorklist(RV.getNode()); + return DAG.getNode(ISD::FMUL, dl, N->getValueType(0), + N->getOperand(0), RV); + } + + } + break; + case ISD::FSQRT: { + assert(TM.Options.UnsafeFPMath && + "Reciprocal estimates require UnsafeFPMath"); + + // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the + // reciprocal sqrt. 
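+    // Both estimates converge quadratically (the relative error roughly
+    // squares each iteration), which is where the iteration counts above
+    // come from: from the architected minimum accuracy of 2^-5, three
+    // iterations give roughly 2^-40, covering the 24-bit f32 significand,
+    // while with hasRecipPrec() (2^-14) one iteration (roughly 2^-28)
+    // suffices; f64 takes one extra iteration in either case.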
+ SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI); + if (RV.getNode()) { + DCI.AddToWorklist(RV.getNode()); + RV = DAGCombineFastRecip(RV, DCI); + if (RV.getNode()) { + // Unfortunately, RV is now NaN if the input was exactly 0. Select out + // this case and force the answer to 0. + + EVT VT = RV.getValueType(); + + SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType()); + if (VT.isVector()) { + assert(VT.getVectorNumElements() == 4 && "Unknown vector type"); + Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero); + } + + SDValue ZeroCmp = + DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT), + N->getOperand(0), Zero, ISD::SETEQ); + DCI.AddToWorklist(ZeroCmp.getNode()); + DCI.AddToWorklist(RV.getNode()); + + RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT, + ZeroCmp, Zero, RV); + return RV; + } + } + + } + break; + case ISD::SINT_TO_FP: + if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) { + if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) { + // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores. + // We allow the src/dst to be either f32/f64, but the intermediate + // type must be i64. + if (N->getOperand(0).getValueType() == MVT::i64 && + N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) { + SDValue Val = N->getOperand(0).getOperand(0); + if (Val.getValueType() == MVT::f32) { + Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + } + + Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + if (N->getValueType(0) == MVT::f32) { + Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val, + DAG.getIntPtrConstant(0)); + DCI.AddToWorklist(Val.getNode()); + } + return Val; + } else if (N->getOperand(0).getValueType() == MVT::i32) { + // If the intermediate type is i32, we can avoid the load/store here + // too. + } + } + } + break; + case ISD::STORE: + // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)). + if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() && + !cast<StoreSDNode>(N)->isTruncatingStore() && + N->getOperand(1).getOpcode() == ISD::FP_TO_SINT && + N->getOperand(1).getValueType() == MVT::i32 && + N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) { + SDValue Val = N->getOperand(1).getOperand(0); + if (Val.getValueType() == MVT::f32) { + Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + } + Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val); + DCI.AddToWorklist(Val.getNode()); + + SDValue Ops[] = { + N->getOperand(0), Val, N->getOperand(2), + DAG.getValueType(N->getOperand(1).getValueType()) + }; + + Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, + DAG.getVTList(MVT::Other), Ops, + cast<StoreSDNode>(N)->getMemoryVT(), + cast<StoreSDNode>(N)->getMemOperand()); + DCI.AddToWorklist(Val.getNode()); + return Val; + } + + // Turn STORE (BSWAP) -> sthbrx/stwbrx. + if (cast<StoreSDNode>(N)->isUnindexed() && + N->getOperand(1).getOpcode() == ISD::BSWAP && + N->getOperand(1).getNode()->hasOneUse() && + (N->getOperand(1).getValueType() == MVT::i32 || + N->getOperand(1).getValueType() == MVT::i16 || + (TM.getSubtarget<PPCSubtarget>().hasLDBRX() && + TM.getSubtarget<PPCSubtarget>().isPPC64() && + N->getOperand(1).getValueType() == MVT::i64))) { + SDValue BSwapOp = N->getOperand(1).getOperand(0); + // Do an any-extend to 32-bits if this is a half-word input. 
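+      // For example, a store of i16 (bswap x) becomes sthbrx, which takes
+      // a 32-bit GPR and stores its low halfword byte-reversed; the
+      // any-extend to i32 is needed only to satisfy the register class,
+      // since the high bits are ignored by the store.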
+ if (BSwapOp.getValueType() == MVT::i16) + BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp); + + SDValue Ops[] = { + N->getOperand(0), BSwapOp, N->getOperand(2), + DAG.getValueType(N->getOperand(1).getValueType()) + }; + return + DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other), + Ops, cast<StoreSDNode>(N)->getMemoryVT(), + cast<StoreSDNode>(N)->getMemOperand()); + } + break; + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT VT = LD->getValueType(0); + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty); + if (ISD::isNON_EXTLoad(N) && VT.isVector() && + TM.getSubtarget<PPCSubtarget>().hasAltivec() && + (VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v4f32) && + LD->getAlignment() < ABIAlignment) { + // This is a type-legal unaligned Altivec load. + SDValue Chain = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + bool isLittleEndian = Subtarget.isLittleEndian(); + + // This implements the loading of unaligned vectors as described in + // the venerable Apple Velocity Engine overview. Specifically: + // https://developer.apple.com/hardwaredrivers/ve/alignment.html + // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html + // + // The general idea is to expand a sequence of one or more unaligned + // loads into an alignment-based permutation-control instruction (lvsl + // or lvsr), a series of regular vector loads (which always truncate + // their input address to an aligned address), and a series of + // permutations. The results of these permutations are the requested + // loaded values. The trick is that the last "extra" load is not taken + // from the address you might suspect (sizeof(vector) bytes after the + // last requested load), but rather sizeof(vector) - 1 bytes after the + // last requested vector. The point of this is to avoid a page fault if + // the base address happened to be aligned. This works because if the + // base address is aligned, then adding less than a full vector length + // will cause the last vector in the sequence to be (re)loaded. + // Otherwise, the next vector will be fetched as you might suspect was + // necessary. + + // We might be able to reuse the permutation generation from + // a different base address offset from this one by an aligned amount. + // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this + // optimization later. + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8); + + // Refine the alignment of the original load (a "new" load created here + // which was identical to the first except for the alignment would be + // merged with the existing node regardless). + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(LD->getPointerInfo(), + LD->getMemOperand()->getFlags(), + LD->getMemoryVT().getStoreSize(), + ABIAlignment); + LD->refineAlignment(MMO); + SDValue BaseLoad = SDValue(LD, 0); + + // Note that the value of IncOffset (which is provided to the next + // load's pointer info offset value, and thus used to calculate the + // alignment), and the value of IncValue (which is actually used to + // increment the pointer value) are different! This is because we + // require the next load to appear to be aligned, even though it + // is actually offset from the base pointer by a lesser amount. 
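+      // A worked example of the trick: for a v4i32 load from base 0x1003,
+      // the first lvx truncates the address to 0x1000, and the extra load
+      // uses 0x1003 + 15 = 0x1012, which truncates to 0x1010 -- the same
+      // vector an increment of 16 would fetch. If instead the base is
+      // already aligned at 0x1000, then 0x1000 + 15 = 0x100F re-loads the
+      // vector at 0x1000 rather than touching 0x1010, avoiding a fault on
+      // a page the program never actually addressed.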
+ int IncOffset = VT.getSizeInBits() / 8; + int IncValue = IncOffset; + + // Walk (both up and down) the chain looking for another load at the real + // (aligned) offset (the alignment of the other load does not matter in + // this case). If found, then do not use the offset reduction trick, as + // that will prevent the loads from being later combined (as they would + // otherwise be duplicates). + if (!findConsecutiveLoad(LD, DAG)) + --IncValue; + + SDValue Increment = DAG.getConstant(IncValue, getPointerTy()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + + SDValue ExtraLoad = + DAG.getLoad(VT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncOffset), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), ABIAlignment); + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + BaseLoad.getValue(1), ExtraLoad.getValue(1)); + + if (BaseLoad.getValueType() != MVT::v4i32) + BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad); + + if (ExtraLoad.getValueType() != MVT::v4i32) + ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad); + + // Because vperm has a big-endian bias, we must reverse the order + // of the input vectors and complement the permute control vector + // when generating little endian code. We have already handled the + // latter by using lvsr instead of lvsl, so just reverse BaseLoad + // and ExtraLoad here. + SDValue Perm; + if (isLittleEndian) + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + ExtraLoad, BaseLoad, PermCntl, DAG, dl); + else + Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm, + BaseLoad, ExtraLoad, PermCntl, DAG, dl); + + if (VT != MVT::v4i32) + Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm); + + // Now we need to be really careful about how we update the users of the + // original load. We cannot just call DCI.CombineTo (or + // DAG.ReplaceAllUsesWith for that matter), because the load still has + // uses created here (the permutation for example) that need to stay. + SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); + while (UI != UE) { + SDUse &Use = UI.getUse(); + SDNode *User = *UI; + // Note: BaseLoad is checked here because it might not be N, but a + // bitcast of N. + if (User == Perm.getNode() || User == BaseLoad.getNode() || + User == TF.getNode() || Use.getResNo() > 1) { + ++UI; + continue; + } + + SDValue To = Use.getResNo() ? TF : Perm; + ++UI; + + SmallVector<SDValue, 8> Ops; + for (const SDUse &O : User->ops()) { + if (O == Use) + Ops.push_back(To); + else + Ops.push_back(O); + } + + DAG.UpdateNodeOperands(User, Ops); + } + + return SDValue(N, 0); + } + } + break; + case ISD::INTRINSIC_WO_CHAIN: { + bool isLittleEndian = Subtarget.isLittleEndian(); + Intrinsic::ID Intr = (isLittleEndian ? + Intrinsic::ppc_altivec_lvsr : + Intrinsic::ppc_altivec_lvsl); + if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr && + N->getOperand(1)->getOpcode() == ISD::ADD) { + SDValue Add = N->getOperand(1); + + if (DAG.MaskedValueIsZero(Add->getOperand(1), + APInt::getAllOnesValue(4 /* 16 byte alignment */).zext( + Add.getValueType().getScalarType().getSizeInBits()))) { + SDNode *BasePtr = Add->getOperand(0).getNode(); + for (SDNode::use_iterator UI = BasePtr->use_begin(), + UE = BasePtr->use_end(); UI != UE; ++UI) { + if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == + Intr) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. 
The results will be the same, so use the + // one we've just found instead. + + return SDValue(*UI, 0); + } + } + } + } + } + + break; + case ISD::BSWAP: + // Turn BSWAP (LOAD) -> lhbrx/lwbrx. + if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) && + N->getOperand(0).hasOneUse() && + (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 || + (TM.getSubtarget<PPCSubtarget>().hasLDBRX() && + TM.getSubtarget<PPCSubtarget>().isPPC64() && + N->getValueType(0) == MVT::i64))) { + SDValue Load = N->getOperand(0); + LoadSDNode *LD = cast<LoadSDNode>(Load); + // Create the byte-swapping load. + SDValue Ops[] = { + LD->getChain(), // Chain + LD->getBasePtr(), // Ptr + DAG.getValueType(N->getValueType(0)) // VT + }; + SDValue BSLoad = + DAG.getMemIntrinsicNode(PPCISD::LBRX, dl, + DAG.getVTList(N->getValueType(0) == MVT::i64 ? + MVT::i64 : MVT::i32, MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + + // If this is an i16 load, insert the truncate. + SDValue ResVal = BSLoad; + if (N->getValueType(0) == MVT::i16) + ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad); + + // First, combine the bswap away. This makes the value produced by the + // load dead. + DCI.CombineTo(N, ResVal); + + // Next, combine the load away, we give it a bogus result value but a real + // chain result. The result value is dead because the bswap is dead. + DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1)); + + // Return N so it doesn't get rechecked! + return SDValue(N, 0); + } + + break; + case PPCISD::VCMP: { + // If a VCMPo node already exists with exactly the same operands as this + // node, use its result instead of this node (VCMPo computes both a CR6 and + // a normal output). + // + if (!N->getOperand(0).hasOneUse() && + !N->getOperand(1).hasOneUse() && + !N->getOperand(2).hasOneUse()) { + + // Scan all of the users of the LHS, looking for VCMPo's that match. + SDNode *VCMPoNode = nullptr; + + SDNode *LHSN = N->getOperand(0).getNode(); + for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); + UI != E; ++UI) + if (UI->getOpcode() == PPCISD::VCMPo && + UI->getOperand(1) == N->getOperand(1) && + UI->getOperand(2) == N->getOperand(2) && + UI->getOperand(0) == N->getOperand(0)) { + VCMPoNode = *UI; + break; + } + + // If there is no VCMPo node, or if the flag value has a single use, don't + // transform this. + if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) + break; + + // Look at the (necessarily single) use of the flag value. If it has a + // chain, this transformation is more complex. Note that multiple things + // could use the value result, which we should ignore. + SDNode *FlagUser = nullptr; + for (SDNode::use_iterator UI = VCMPoNode->use_begin(); + FlagUser == nullptr; ++UI) { + assert(UI != VCMPoNode->use_end() && "Didn't find user!"); + SDNode *User = *UI; + for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { + if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { + FlagUser = User; + break; + } + } + } + + // If the user is a MFOCRF instruction, we know this is safe. + // Otherwise we give up for right now. + if (FlagUser->getOpcode() == PPCISD::MFOCRF) + return SDValue(VCMPoNode, 0); + } + break; + } + case ISD::BRCOND: { + SDValue Cond = N->getOperand(1); + SDValue Target = N->getOperand(2); + + if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero) { + + // We now need to make the intrinsic dead (it cannot be instruction + // selected). 
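+      // Rerouting the intrinsic's chain output to its chain input (below)
+      // leaves this BRCOND as the sole remaining user; replacing the
+      // branch with PPCISD::BDNZ then drops that last use, so the
+      // intrinsic is dead, as the assert verifies.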
+ DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); + assert(Cond.getNode()->hasOneUse() && + "Counter decrement has more than one use"); + + return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, + N->getOperand(0), Target); + } + } + break; + case ISD::BR_CC: { + // If this is a branch on an altivec predicate comparison, lower this so + // that we don't have to do a MFOCRF: instead, branch directly on CR6. This + // lowering is done pre-legalize, because the legalizer lowers the predicate + // compare down to code that is difficult to reassemble. + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); + + // Sometimes the promoted value of the intrinsic is ANDed by some non-zero + // value. If so, pass-through the AND to get to the intrinsic. + if (LHS.getOpcode() == ISD::AND && + LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa<ConstantSDNode>(LHS.getOperand(1)) && + !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()-> + isZero()) + LHS = LHS.getOperand(0); + + if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && + cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == + Intrinsic::ppc_is_decremented_ctr_nonzero && + isa<ConstantSDNode>(RHS)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Counter decrement comparison is not EQ or NE"); + + unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); + bool isBDNZ = (CC == ISD::SETEQ && Val) || + (CC == ISD::SETNE && !Val); + + // We now need to make the intrinsic dead (it cannot be instruction + // selected). + DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); + assert(LHS.getNode()->hasOneUse() && + "Counter decrement has more than one use"); + + return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, + N->getOperand(0), N->getOperand(4)); + } + + int CompareOpc; + bool isDot; + + if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && + isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && + getAltivecCompareInfo(LHS, CompareOpc, isDot)) { + assert(isDot && "Can't compare against a vector result!"); + + // If this is a comparison against something other than 0/1, then we know + // that the condition is never/always true. + unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (Val != 0 && Val != 1) { + if (CC == ISD::SETEQ) // Cond never true, remove branch. + return N->getOperand(0); + // Always !=, turn it into an unconditional branch. + return DAG.getNode(ISD::BR, dl, MVT::Other, + N->getOperand(0), N->getOperand(4)); + } + + bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); + + // Create the PPCISD altivec 'dot' comparison node. + SDValue Ops[] = { + LHS.getOperand(2), // LHS of compare + LHS.getOperand(3), // RHS of compare + DAG.getConstant(CompareOpc, MVT::i32) + }; + EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; + SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + + // Unpack the result based on how the target uses it. + PPC::Predicate CompOpc; + switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { + default: // Can't happen, don't crash on invalid number though. + case 0: // Branch on the value of the EQ bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; + break; + case 1: // Branch on the inverted value of the EQ bit of CR6. + CompOpc = BranchOnWhenPredTrue ? 
PPC::PRED_NE : PPC::PRED_EQ; + break; + case 2: // Branch on the value of the LT bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE; + break; + case 3: // Branch on the inverted value of the LT bit of CR6. + CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT; + break; + } + + return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0), + DAG.getConstant(CompOpc, MVT::i32), + DAG.getRegister(PPC::CR6, MVT::i32), + N->getOperand(4), CompNode.getValue(1)); + } + break; + } + } + + return SDValue(); +} + +//===----------------------------------------------------------------------===// +// Inline Assembly Support +//===----------------------------------------------------------------------===// + +void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0); + switch (Op.getOpcode()) { + default: break; + case PPCISD::LBRX: { + // lhbrx is known to have the top bits cleared out. + if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16) + KnownZero = 0xFFFF0000; + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + default: break; + case Intrinsic::ppc_altivec_vcmpbfp_p: + case Intrinsic::ppc_altivec_vcmpeqfp_p: + case Intrinsic::ppc_altivec_vcmpequb_p: + case Intrinsic::ppc_altivec_vcmpequh_p: + case Intrinsic::ppc_altivec_vcmpequw_p: + case Intrinsic::ppc_altivec_vcmpgefp_p: + case Intrinsic::ppc_altivec_vcmpgtfp_p: + case Intrinsic::ppc_altivec_vcmpgtsb_p: + case Intrinsic::ppc_altivec_vcmpgtsh_p: + case Intrinsic::ppc_altivec_vcmpgtsw_p: + case Intrinsic::ppc_altivec_vcmpgtub_p: + case Intrinsic::ppc_altivec_vcmpgtuh_p: + case Intrinsic::ppc_altivec_vcmpgtuw_p: + KnownZero = ~1U; // All bits but the low one are known to be zero. + break; + } + } + } +} + + +/// getConstraintType - Given a constraint, return the type of +/// constraint it is for this target. +PPCTargetLowering::ConstraintType +PPCTargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'b': + case 'r': + case 'f': + case 'v': + case 'y': + return C_RegisterClass; + case 'Z': + // FIXME: While Z does indicate a memory constraint, it specifically + // indicates an r+r address (used in conjunction with the 'y' modifier + // in the replacement string). Currently, we're forcing the base + // register to be r0 in the asm printer (which is interpreted as zero) + // and forming the complete address in the second register. This is + // suboptimal. + return C_Memory; + } + } else if (Constraint == "wc") { // individual CR bits. + return C_RegisterClass; + } else if (Constraint == "wa" || Constraint == "wd" || + Constraint == "wf" || Constraint == "ws") { + return C_RegisterClass; // VSX registers. + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. 
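+/// For example (hypothetical usage), in
+///   asm("add %0,%1,%2" : "=r"(d) : "r"(a), "b"(c));
+/// the "b" operand is weighted CW_Register only when its value has
+/// integer type, and a "wc" operand is weighted CW_Register only for an
+/// i1 value.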
+TargetLowering::ConstraintWeight +PPCTargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + + // Look at the constraint type. + if (StringRef(constraint) == "wc" && type->isIntegerTy(1)) + return CW_Register; // an individual CR bit. + else if ((StringRef(constraint) == "wa" || + StringRef(constraint) == "wd" || + StringRef(constraint) == "wf") && + type->isVectorTy()) + return CW_Register; + else if (StringRef(constraint) == "ws" && type->isDoubleTy()) + return CW_Register; + + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'b': + if (type->isIntegerTy()) + weight = CW_Register; + break; + case 'f': + if (type->isFloatTy()) + weight = CW_Register; + break; + case 'd': + if (type->isDoubleTy()) + weight = CW_Register; + break; + case 'v': + if (type->isVectorTy()) + weight = CW_Register; + break; + case 'y': + weight = CW_Register; + break; + case 'Z': + weight = CW_Memory; + break; + } + return weight; +} + +std::pair<unsigned, const TargetRegisterClass*> +PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const { + if (Constraint.size() == 1) { + // GCC RS6000 Constraint Letters + switch (Constraint[0]) { + case 'b': // R1-R31 + if (VT == MVT::i64 && Subtarget.isPPC64()) + return std::make_pair(0U, &PPC::G8RC_NOX0RegClass); + return std::make_pair(0U, &PPC::GPRC_NOR0RegClass); + case 'r': // R0-R31 + if (VT == MVT::i64 && Subtarget.isPPC64()) + return std::make_pair(0U, &PPC::G8RCRegClass); + return std::make_pair(0U, &PPC::GPRCRegClass); + case 'f': + if (VT == MVT::f32 || VT == MVT::i32) + return std::make_pair(0U, &PPC::F4RCRegClass); + if (VT == MVT::f64 || VT == MVT::i64) + return std::make_pair(0U, &PPC::F8RCRegClass); + break; + case 'v': + return std::make_pair(0U, &PPC::VRRCRegClass); + case 'y': // crrc + return std::make_pair(0U, &PPC::CRRCRegClass); + } + } else if (Constraint == "wc") { // an individual CR bit. + return std::make_pair(0U, &PPC::CRBITRCRegClass); + } else if (Constraint == "wa" || Constraint == "wd" || + Constraint == "wf") { + return std::make_pair(0U, &PPC::VSRCRegClass); + } else if (Constraint == "ws") { + return std::make_pair(0U, &PPC::VSFRCRegClass); + } + + std::pair<unsigned, const TargetRegisterClass*> R = + TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers + // (which we call X[0-9]+). If a 64-bit value has been requested, and a + // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent + // register. + // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use + // the AsmName field from *RegisterInfo.td, then this would not be necessary. + if (R.first && VT == MVT::i64 && Subtarget.isPPC64() && + PPC::GPRCRegClass.contains(R.first)) { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + return std::make_pair(TRI->getMatchingSuperReg(R.first, + PPC::sub_32, &PPC::G8RCRegClass), + &PPC::G8RCRegClass); + } + + return R; +} + + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. 
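Before LowerAsmOperandForConstraint below, a summary of the register-class dispatch made by getRegForInlineAsmConstraint above. This is a sketch only: the strings stand in for the TargetRegisterClass objects, Is64BitValue models VT == MVT::i64, IsPPC64 models Subtarget.isPPC64(), and the helper name is ours:

#include <cstdio>

const char *regClassForConstraint(char C, bool Is64BitValue, bool IsPPC64) {
  switch (C) {
  case 'b': return Is64BitValue && IsPPC64 ? "G8RC_NOX0" : "GPRC_NOR0";
  case 'r': return Is64BitValue && IsPPC64 ? "G8RC" : "GPRC";
  case 'f': return Is64BitValue ? "F8RC" : "F4RC"; // f64/i64 vs. f32/i32
  case 'v': return "VRRC";
  case 'y': return "CRRC"; // condition-register constraint
  default:  return "<defer to TargetLowering>";
  }
}

int main() {
  std::printf("'r' + i64 on ppc64 -> %s\n",
              regClassForConstraint('r', true, true)); // G8RC
  return 0;
}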
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result;
+
+ // Only support length 1 constraints.
+ if (Constraint.length() > 1) return;
+
+ char Letter = Constraint[0];
+ switch (Letter) {
+ default: break;
+ case 'I':
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P': {
+ ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+ if (!CST) return; // Must be an immediate to match.
+ unsigned Value = CST->getZExtValue();
+ switch (Letter) {
+ default: llvm_unreachable("Unknown constraint letter!");
+ case 'I': // "I" is a signed 16-bit constant.
+ if ((short)Value == (int)Value)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
+ case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
+ if ((short)Value == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
+ if ((Value >> 16) == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'M': // "M" is a constant that is greater than 31.
+ if (Value > 31)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'N': // "N" is a positive constant that is an exact power of two.
+ if ((int)Value > 0 && isPowerOf2_32(Value))
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'O': // "O" is the constant zero.
+ if (Value == 0)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
+ if ((short)-Value == (int)-Value)
+ Result = DAG.getTargetConstant(Value, Op.getValueType());
+ break;
+ }
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+
+ // Handle standard constraint letters.
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ // FIXME: PPC does not allow r+i addressing modes for vectors!
+
+ // PPC allows a sign-extended 16-bit immediate field.
+ if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+ return false;
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // PPC only supports r+r:
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default:
+ // No other scales are supported.
+ return false;
+ }
+
+ return true;
+}
+
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+ // Make sure the function does not optimize away the store of the RA to
+ // the stack.
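An aside on the constraint-letter switch earlier in this hunk (the body of LowerRETURNADDR continues below): the I/J/K/L/M/N/O/P constraints are pure bit tests on the immediate. Restated as standalone C++ so the ranges are easy to exercise; the function name is ours, and the casts reproduce the (short)/(int) comparisons in the source:

#include <cassert>
#include <cstdint>

bool matchesPPCImmConstraint(char Letter, uint32_t Value) {
  switch (Letter) {
  case 'I': return (int16_t)Value == (int32_t)Value; // signed 16-bit
  case 'J':                                          // high-order 16 bits only
  case 'L': return (int16_t)Value == 0;              // signed 16-bit << 16
  case 'K': return (Value >> 16) == 0;               // low-order 16 bits only
  case 'M': return Value > 31;
  case 'N': return (int32_t)Value > 0 && (Value & (Value - 1)) == 0; // 2^n
  case 'O': return Value == 0;
  case 'P': return (int16_t)-Value == (int32_t)-Value; // -imm fits 16 bits
  default:  return false;
  }
}

int main() {
  assert(matchesPPCImmConstraint('I', 0xFFFF8000u));  // -32768
  assert(matchesPPCImmConstraint('L', 0x00010000u));  // 1 << 16
  assert(!matchesPPCImmConstraint('K', 0x00010000u)); // bit 16 is set
  return 0;
}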
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ FuncInfo->setLRStoreRequired();
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset =
+ DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI),
+ isPPC64? MVT::i64 : MVT::i32);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Just load the return address off the stack.
+ SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ bool isPPC64 = PtrVT == MVT::i64;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ // Naked functions never have a frame pointer, and so we use r1. For all
+ // other functions, this decision must be delayed until during PEI.
+ unsigned FrameReg;
+ if (MF.getFunction()->getAttributes().hasAttribute(
+ AttributeSet::FunctionIndex, Attribute::Naked))
+ FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
+ else
+ FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
+
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
+ PtrVT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+ FrameAddr, MachinePointerInfo(), false, false,
+ false, 0);
+ return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
+ EVT VT) const {
+ bool isPPC64 = Subtarget.isPPC64();
+ bool isDarwinABI = Subtarget.isDarwinABI();
+
+ if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
+ (!isPPC64 && VT != MVT::i32))
+ report_fatal_error("Invalid register global variable type");
+
+ bool is64Bit = isPPC64 && VT == MVT::i64;
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+ .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
+ .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+ (is64Bit ? PPC::X13 : PPC::R13))
+ .Default(0);
+
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // The PowerPC target isn't yet aware of offsets.
+ return false;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero, that means the destination
+/// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
+/// means there isn't a need to check it against the alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic +/// target-independent logic. +EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + if (Subtarget.isPPC64()) { + return MVT::i64; + } else { + return MVT::i32; + } +} + +/// \brief Returns true if it is beneficial to convert a load of a constant +/// to just the constant itself. +bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0 || BitSize > 64) + return false; + return true; +} + +bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + return NumBits1 == 64 && NumBits2 == 32; +} + +bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + +bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const { + return isInt<16>(Imm) || isUInt<16>(Imm); +} + +bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, + unsigned, + bool *Fast) const { + if (DisablePPCUnaligned) + return false; + + // PowerPC supports unaligned memory access for simple non-vector types. + // Although accessing unaligned addresses is not as efficient as accessing + // aligned addresses, it is generally more efficient than manual expansion, + // and generally only traps for software emulation when crossing page + // boundaries. + + if (!VT.isSimple()) + return false; + + if (VT.getSimpleVT().isVector()) { + if (Subtarget.hasVSX()) { + if (VT != MVT::v2f64 && VT != MVT::v2i64) + return false; + } else { + return false; + } + } + + if (VT == MVT::ppcf128) + return false; + + if (Fast) + *Fast = true; + + return true; +} + +bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +bool +PPCTargetLowering::shouldExpandBuildVectorWithShuffles( + EVT VT , unsigned DefinedValues) const { + if (VT == MVT::v2i64) + return false; + + return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); +} + +Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const { + if (DisableILPPref || Subtarget.enableMachineScheduler()) + return TargetLowering::getSchedulingPreference(N); + + return Sched::ILP; +} + +// Create a fast isel object. 
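Before the fast-isel hook below, a note on the immediate-legality hooks above: isLegalICmpImmediate and isLegalAddImmediate both accept anything that fits either 16-bit immediate field, the signed field used by cmpi/addi or the unsigned field used by cmpli/ori. A plain restatement of the isInt<16>/isUInt<16> checks (helper names ours):

#include <cassert>
#include <cstdint>

bool isInt16(int64_t x)  { return x >= -32768 && x <= 32767; } // llvm::isInt<16>
bool isUInt16(int64_t x) { return x >= 0 && x <= 65535; }      // llvm::isUInt<16>

bool isLegalPPCImmediate(int64_t Imm) { return isInt16(Imm) || isUInt16(Imm); }

int main() {
  assert(isLegalPPCImmediate(-32768)); // signed field (cmpi/addi)
  assert(isLegalPPCImmediate(65535));  // unsigned field (cmpli/ori)
  assert(!isLegalPPCImmediate(65536)); // must be materialized in a register
  return 0;
}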
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) const {
+ return PPC::createFastISel(FuncInfo, LibInfo);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 000000000000..74d3c651538e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,717 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+ namespace PPCISD {
+ enum NodeType {
+ // Start the numbering where the builtin ops and target ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// FSEL - Traditional three-operand fsel node.
+ ///
+ FSEL,
+
+ /// FCFID - The FCFID instruction, taking an f64 operand and producing
+ /// an f64 value containing the FP representation of the integer that
+ /// was temporarily in the f64 operand.
+ FCFID,
+
+ /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+ /// unsigned integers and single-precision outputs.
+ FCFIDU, FCFIDS, FCFIDUS,
+
+ /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+ /// operand, producing an f64 value containing the integer representation
+ /// of that FP value.
+ FCTIDZ, FCTIWZ,
+
+ /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
+ /// unsigned integers.
+ FCTIDUZ, FCTIWUZ,
+
+ /// Reciprocal estimate instructions (unary FP ops).
+ FRE, FRSQRTE,
+
+ // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+ // three v4f32 operands and producing a v4f32 result.
+ VMADDFP, VNMSUBFP,
+
+ /// VPERM - The PPC VPERM Instruction.
+ ///
+ VPERM,
+
+ /// Hi/Lo - These represent the high and low 16-bit parts of a global
+ /// address respectively. These nodes have two operands, the first of
+ /// which must be a TargetGlobalAddress, and the second of which must be a
+ /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C',
+ /// though these are usually folded into other nodes.
+ Hi, Lo,
+
+ TOC_ENTRY,
+
+ /// The following two target-specific nodes are used for calls through
+ /// function pointers in the 64-bit SVR4 ABI.
+
+ /// Like a regular LOAD but additionally taking/producing a flag.
+ LOAD,
+
+ /// Like LOAD (taking/producing a flag), but using r2 as hard-coded
+ /// destination.
+ LOAD_TOC,
+
+ /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+ /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+ /// compute an allocation on the stack.
+ DYNALLOC,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+ /// shift amounts. These nodes are generated by the multi-precision shift
+ /// code.
+ SRL, SRA, SHL,
+
+ /// CALL - A direct function call.
+ /// CALL_NOP is a call with the special NOP which follows 64-bit
+ /// SVR4 calls.
+ CALL, CALL_NOP,
+
+ /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+ /// MTCTR instruction.
+ MTCTR,
+
+ /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+ /// BCTRL instruction.
+ BCTRL,
+
+ /// Return with a flag operand, matched by 'blr'
+ RET_FLAG,
+
+ /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+ /// This copies the bits corresponding to the specified CRREG into the
+ /// resultant GPR. Bits corresponding to other CR regs are undefined.
+ MFOCRF,
+
+ // FIXME: Remove these once the ANDI glue bug is fixed:
+ /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
+ /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
+ /// implement truncation of i32 or i64 to i1.
+ ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
+
+ // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+ /// instructions. For lack of a better number, we use the opcode number
+ /// encoding for the OPC field to identify the compare. For example, 838
+ /// is VCMPGTSH.
+ VCMP,
+
+ /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+ /// altivec VCMP*o instructions. For lack of a better number, we use the
+ /// opcode number encoding for the OPC field to identify the compare. For
+ /// example, 838 is VCMPGTSH.
+ VCMPo,
+
+ /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+ /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the
+ /// condition register to branch on, OPC is the branch opcode to use (e.g.
+ /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+ /// an optional input flag argument.
+ COND_BRANCH,
+
+ /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+ /// loops.
+ BDNZ, BDZ,
+
+ /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+ /// towards zero. Used only as part of the long double-to-int
+ /// conversion sequence.
+ FADDRTZ,
+
+ /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+ MFFS,
+
+ /// LARX = This corresponds to the PPC l{w|d}arx instruction: load and
+ /// reserve indexed. This is used to implement atomic operations.
+ LARX,
+
+ /// STCX = This corresponds to the PPC stcx. instruction: store conditional
+ /// indexed. This is used to implement atomic operations.
+ STCX,
+
+ /// TC_RETURN - A tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN,
+
+ /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+ CR6SET,
+ CR6UNSET,
+
+ /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+ /// on PPC32.
+ PPC32_GOT,
+
+ /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+ /// local dynamic TLS on PPC32.
+ PPC32_PICGOT,
+
+ /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+ /// TLS model, produces an ADDIS8 instruction that adds the GOT
+ /// base to sym\@got\@tprel\@ha.
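An aside on the counter-loop nodes above (the TLS enumerators continue below): BDNZ/BDZ are the back ends of hardware counted loops, where mtctr seeds the CTR register with the trip count and each bdnz decrements it and branches while it stays nonzero. In C terms the shape is the do/while below; this is a model only, with a placeholder body:

#include <cstdint>

void ctrLoopModel(uint64_t TripCount, uint64_t *Out) {
  uint64_t CTR = TripCount;   // mtctr
  if (CTR == 0) return;       // the CTR-loops pass only forms nonzero-trip loops
  do {
    *Out += 1;                // loop body (placeholder)
  } while (--CTR != 0);       // bdnz: decrement CTR, branch while nonzero
}

int main() {
  uint64_t N = 0;
  ctrLoopModel(5, &N);
  return N == 5 ? 0 : 1;
}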
+ ADDIS_GOT_TPREL_HA, + + /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec + /// TLS model, produces a LD instruction with base register G8RReg + /// and offset sym\@got\@tprel\@l. This completes the addition that + /// finds the offset of "sym" relative to the thread pointer. + LD_GOT_TPREL_L, + + /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS + /// model, produces an ADD instruction that adds the contents of + /// G8RReg to the thread pointer. Symbol contains a relocation + /// sym\@tls which is to be replaced by the thread pointer and + /// identifies to the linker that the instruction is part of a + /// TLS sequence. + ADD_TLS, + + /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym\@got\@tlsgd\@ha. + ADDIS_TLSGD_HA, + + /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@tlsgd\@l. + ADDI_TLSGD_L, + + /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS + /// model, produces a call to __tls_get_addr(sym\@tlsgd). + GET_TLS_ADDR, + + /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym\@got\@tlsld\@ha. + ADDIS_TLSLD_HA, + + /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@tlsld\@l. + ADDI_TLSLD_L, + + /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS + /// model, produces a call to __tls_get_addr(sym\@tlsld). + GET_TLSLD_ADDR, + + /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the + /// local-dynamic TLS model, produces an ADDIS8 instruction + /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed + /// to tie this in place following a copy to %X3 from the result + /// of a GET_TLSLD_ADDR. + ADDIS_DTPREL_HA, + + /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym\@got\@dtprel\@l. + ADDI_DTPREL_L, + + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded + /// during instruction selection to optimize a BUILD_VECTOR into + /// operations on splats. This is necessary to avoid losing these + /// optimizations due to constant folding. + VADD_SPLAT, + + /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned + /// operand identifies the operating system entry point. + SC, + + /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a + /// byte-swapping store instruction. It byte-swaps the low "Type" bits of + /// the GPRC input, then stores it through Ptr. Type can be either i16 or + /// i32. + STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE, + + /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a + /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, + /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 + /// or i32. + LBRX, + + /// STFIWX - The STFIWX instruction. The first operand is an input token + /// chain, then an f64 value to store, then an address to store it to. + STFIWX, + + /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point + /// load which sign-extends from a 32-bit integer value into the + /// destination 64-bit register. 
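An aside on the TLS sequence nodes above (the enum continues below): each model ends in simple pointer arithmetic. A sketch under stated assumptions: GotTprelOffset stands for the value loaded from the sym@got@tprel GOT slot, ThreadPointer for r13, and the tls_index argument is treated as opaque; all names here are hypothetical.

#include <cstdint>

// Initial exec: ADDIS_GOT_TPREL_HA + LD_GOT_TPREL_L load an offset from the
// GOT; ADD_TLS adds it to the thread pointer.
uintptr_t initialExecAddr(uintptr_t ThreadPointer, intptr_t GotTprelOffset) {
  return ThreadPointer + GotTprelOffset;
}

// General dynamic: materialize a tls_index argument and call __tls_get_addr
// (modeled with a plain function pointer here).
uintptr_t generalDynamicAddr(uintptr_t (*TlsGetAddr)(void *), void *TlsIndex) {
  return TlsGetAddr(TlsIndex);
}

int main() {
  unsigned char Block[16] = {0};        // pretend TLS block
  uintptr_t TP = (uintptr_t)Block + 8;  // pretend thread-pointer bias
  return initialExecAddr(TP, -8) == (uintptr_t)Block ? 0 : 1;
}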
+ LFIWAX,
+
+ /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+ /// load which zero-extends from a 32-bit integer value into the
+ /// destination 64-bit register.
+ LFIWZX,
+
+ /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
+ /// produces an ADDIS8 instruction that adds the TOC base register to
+ /// sym\@toc\@ha.
+ ADDIS_TOC_HA,
+
+ /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
+ /// produces a LD instruction with base register G8RReg and offset
+ /// sym\@toc\@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+ LD_TOC_L,
+
+ /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
+ /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
+ /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+ ADDI_TOC_L
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace PPC {
+ /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUHUM instruction.
+ bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+ /// VPKUWUM instruction.
+ bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ unsigned ShuffleKind, SelectionDAG &DAG);
+
+ /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+ /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+ bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+ unsigned ShuffleKind, SelectionDAG &DAG);
+
+ /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
+ /// shift amount, otherwise return -1.
+ int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
+ SelectionDAG &DAG);
+
+ /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a splat of a single element that is suitable for input to
+ /// VSPLTB/VSPLTH/VSPLTW.
+ bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+ /// isAllNegativeZeroVector - Returns true if all elements of build_vector
+ /// are -0.0.
+ bool isAllNegativeZeroVector(SDNode *N);
+
+ /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+ /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+ unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+
+ /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+ /// formed by using a vspltis[bhw] instruction of the specified element
+ /// size, return the constant being splatted. The ByteSize field indicates
+ /// the number of bytes of each element [124] -> [bhw].
+ SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+ }
+
+ class PPCSubtarget;
+ class PPCTargetLowering : public TargetLowering {
+ const PPCSubtarget &Subtarget;
+
+ public:
+ explicit PPCTargetLowering(PPCTargetMachine &TM);
+
+ /// getTargetNodeName() - This method returns the name of a target specific
+ /// DAG node.
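An aside on isSplatShuffleMask above (the class declaration continues below): its contract is that a 16-entry byte shuffle mask is a splat for element size EltSize when it repeats a single EltSize-aligned group of byte indices across all lanes. A minimal sketch that ignores undef lanes (real masks may carry -1 entries, which the LLVM version tolerates):

#include <cassert>
#include <vector>

bool isSplatMask(const std::vector<int> &Mask, unsigned EltSize) {
  if (Mask.size() != 16 || EltSize == 0 || 16 % EltSize != 0)
    return false;
  int Base = Mask[0];
  if (Base < 0 || Base % (int)EltSize != 0) // must start on an element boundary
    return false;
  for (unsigned i = 0; i != 16; ++i)
    if (Mask[i] != Base + (int)(i % EltSize))
      return false;
  return true;
}

int main() {
  std::vector<int> M;
  for (int Rep = 0; Rep != 4; ++Rep)
    for (int B = 0; B != 4; ++B)
      M.push_back(12 + B);      // splat of 4-byte element 3 -> vspltw 3
  assert(isSplatMask(M, 4));
  return 0;
}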
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+
+ /// getPreIndexedAddressParts - Returns true by value, and returns the base
+ /// pointer, offset pointer, and addressing mode by reference, if the node's
+ /// address can be legally represented as a pre-indexed load / store address.
+ bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+ SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG) const override;
+
+ /// SelectAddressRegReg - Given the specified address, check to see if it
+ /// can be represented as an indexed [r+r] operation. Returns false if it
+ /// can be more efficiently represented with [r+imm].
+ bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ /// SelectAddressRegImm - Returns true if the address N can be represented
+ /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+ /// is not better represented as reg+reg. If Aligned is true, only accept
+ /// displacements suitable for STD and friends, i.e. multiples of 4.
+ bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+ SelectionDAG &DAG, bool Aligned) const;
+
+ /// SelectAddressRegRegOnly - Given the specified address, force it to be
+ /// represented as an indexed [r+r] operation.
+ bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+ SelectionDAG &DAG) const;
+
+ Sched::Preference getSchedulingPreference(SDNode *N) const override;
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// ReplaceNodeResults - Replace the results of a node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+ void computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const override;
+ MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *MBB, bool is64Bit,
+ unsigned BinOpcode) const;
+ MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ bool is8bit, unsigned Opcode) const;
+
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ ConstraintType
+ getConstraintType(const std::string &Constraint) const override;
+
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
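An aside on the Aligned flag of SelectAddressRegImm above (the declarations continue below): it boils down to a displacement test. A D-form [r+imm] access needs a signed 16-bit offset, and the DS-form used by LD/STD additionally needs the low two bits clear. As a standalone check (the function name is ours):

#include <cassert>
#include <cstdint>

bool fitsDForm(int64_t Disp, bool Aligned) {
  bool Fits16 = Disp >= -32768 && Disp <= 32767;  // signed 16-bit field
  return Fits16 && (!Aligned || (Disp & 3) == 0); // DS-form: multiple of 4
}

int main() {
  assert(fitsDForm(-32768, false)); // lwz-style D-form
  assert(!fitsDForm(32768, false)); // needs [r+r] or an addis pair
  assert(!fitsDForm(6, true));      // std cannot encode a non-multiple of 4
  assert(fitsDForm(8, true));
  return 0;
}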
+ ConstraintWeight getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const override;
+
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT VT) const override;
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. This is the actual
+ /// alignment, not its logarithm.
+ unsigned getByValTypeAlignment(Type *Ty) const override;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops.
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+ /// isLegalICmpImmediate - Return true if the specified immediate is a legal
+ /// icmp immediate, that is, the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// isLegalAddImmediate - Return true if the specified immediate is a legal
+ /// add immediate, that is, the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate an i64 value in
+ /// register X1 to i32 by referencing its sub-register R1.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ /// \brief Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero, that means the destination
+ /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero it
+ /// means there isn't a need to check it against the alignment requirement,
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
+ /// It returns EVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ EVT
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ MachineFunction &MF) const override;
+
+ /// Is unaligned memory access allowed for the given type, and is it fast
+ /// relative to software emulation.
+ bool allowsUnalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + bool *Fast = nullptr) const override; + + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + + // Should we expand the build vector with shuffles? + bool + shouldExpandBuildVectorWithShuffles(EVT VT, + unsigned DefinedValues) const override; + + /// createFastISel - This method returns a target-specific FastISel object, + /// or null if the target does not support "fast" instruction selection. + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; + + /// \brief Returns true if an argument of type Ty needs to be passed in a + /// contiguous block of registers in calling convention CallConv. + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override { + // We support any array type as "consecutive" block in the parameter + // save area. The element type defines the alignment requirement and + // whether the argument should go in GPRs, FPRs, or VRs if available. + // + // Note that clang uses this capability both to implement the ELFv2 + // homogeneous float/vector aggregate ABI, and to avoid having to use + // "byval" when passing aggregates that might fully fit in registers. + return Ty->isArrayTy(); + } + + private: + SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const; + SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const; + + bool + IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + + SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG, + int SPDiff, + SDValue Chain, + SDValue &LROpOut, + SDValue &FPOpOut, + bool isDarwinABI, + SDLoc dl) const; + + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; + SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const; + SDValue LowerINT_TO_FP(SDValue Op, 
SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall, + bool isVarArg, + SelectionDAG &DAG, + SmallVector<std::pair<unsigned, SDValue>, 8> + &RegsToPass, + SDValue InFlag, SDValue Chain, + SDValue &Callee, + int SPDiff, unsigned NumBytes, + const SmallVectorImpl<ISD::InputArg> &Ins, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + SDValue + LowerCall(TargetLowering::CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + + bool + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + + SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc dl, SelectionDAG &DAG) const override; + + SDValue + extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG, + SDValue ArgVal, SDLoc dl) const; + + SDValue + LowerFormalArguments_Darwin(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerFormalArguments_64SVR4(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerFormalArguments_32SVR4(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue + createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff, + SDValue CallSeqStart, ISD::ArgFlagsTy Flags, + SelectionDAG &DAG, SDLoc dl) const; + + SDValue + LowerCall_Darwin(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerCall_64SVR4(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + 
SmallVectorImpl<SDValue> &InVals) const; + SDValue + LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, + bool isVarArg, bool isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + + SDValue DAGCombineExtBoolTrunc(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const; + SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const; + + CCAssignFn *useFastISelCCs(unsigned Flag) const; + }; + + namespace PPC { + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo); + } + + bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); + + bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); + + bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +} + +#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td new file mode 100644 index 000000000000..9ed384f56244 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -0,0 +1,1137 @@ +//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the PowerPC 64-bit instructions. These patterns are used +// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 64-bit operands. +// +def s16imm64 : Operand<i64> { + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS16ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def u16imm64 : Operand<i64> { + let PrintMethod = "printU16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCU16ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<16>"; +} +def s17imm64 : Operand<i64> { + // This operand type is used for addis/lis to allow the assembler parser + // to accept immediates in the range -65536..65535 for compatibility with + // the GNU assembler. The operand is treated as 16-bit otherwise. 
+ let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS17ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def tocentry : Operand<iPTR> { + let MIOperandInfo = (ops i64imm:$imm); +} +def tlsreg : Operand<i64> { + let EncoderMethod = "getTLSRegEncoding"; + let ParserMatchClass = PPCTLSRegOperand; +} +def tlsgd : Operand<i64> {} +def tlscall : Operand<i64> { + let PrintMethod = "printTLSCall"; + let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym); + let EncoderMethod = "getTLSCallEncoding"; +} + +//===----------------------------------------------------------------------===// +// 64-bit transformation functions. +// + +def SHL64 : SDNodeXForm<imm, [{ + // Transformation function: 63 - imm + return getI32Imm(63 - N->getZExtValue()); +}]>; + +def SRL64 : SDNodeXForm<imm, [{ + // Transformation function: 64 - imm + return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue()) : getI32Imm(0); +}]>; + +def HI32_48 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned short)(N->getZExtValue() >> 32)); +}]>; + +def HI48_64 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned short)(N->getZExtValue() >> 48)); +}]>; + + +//===----------------------------------------------------------------------===// +// Calls. +// + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { + let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in { + def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, + Requires<[In64BitMode]>; + def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond), + "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB, + []>, + Requires<[In64BitMode]>; + + def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi), + "bcctr 12, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi), + "bcctr 4, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + } +} + +let Defs = [LR8] in + def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>, + PPC970_Unit_BRU; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { + let Defs = [CTR8], Uses = [CTR8] in { + def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst">; + def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst">; + } + + let isReturn = 1, Defs = [CTR8], Uses = [CTR8, LR8, RM] in { + def BDZLR8 : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins), + "bdzlr", IIC_BrB, []>; + def BDNZLR8 : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins), + "bdnzlr", IIC_BrB, []>; + } +} + + + +let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL8 : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. 
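The SHL64/SRL64 transformation functions above feed the rotate-and-mask encodings (rldicr/rldicl style) that the 64-bit constant-shift patterns use. Restated in plain C++ with a couple of checks:

#include <cassert>

unsigned shl64Xform(unsigned Imm) { return 63 - Imm; }           // SHL64
unsigned srl64Xform(unsigned Imm) { return Imm ? 64 - Imm : 0; } // SRL64

int main() {
  assert(shl64Xform(1) == 62);  // left shift by 1: mask ends at bit 62
  assert(srl64Xform(8) == 56);  // right shift by 8: rotate left by 56
  assert(srl64Xform(0) == 0);   // guard: never emit a rotate of 64
  return 0;
}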
+ + def BL8_TLS : IForm<18, 0, 1, (outs), (ins tlscall:$func), + "bl $func", IIC_BrB, []>; + + def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall (i64 imm:$func))]>; + } + let Uses = [RM], isCodeGenOnly = 1 in { + def BL8_NOP : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + + def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins tlscall:$func), + "bl $func\n\tnop", IIC_BrB, []>; + + def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24, + (outs), (ins abscalltarget:$func), + "bla $func\n\tnop", IIC_BrB, + [(PPCcall_nop (i64 imm:$func))]>; + } + let Uses = [CTR8, RM] in { + def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl)]>, + Requires<[In64BitMode]>; + + let isCodeGenOnly = 1 in { + def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond), + "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB, + []>, + Requires<[In64BitMode]>; + + def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi), + "bcctrl 12, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi), + "bcctrl 4, $bi, 0", IIC_BrB, []>, + Requires<[In64BitMode]>; + } + } +} +} // Interpretation64Bit + +// FIXME: Duplicating this for the asm parser should be unnecessary, but the +// previous definition must be marked as CodeGen only to prevent decoding +// conflicts. +let Interpretation64Bit = 1, isAsmParserOnly = 1 in +let isCall = 1, PPC970_Unit = 7, Defs = [LR8], Uses = [RM] in +def BL8_TLS_ : IForm<18, 0, 1, (outs), (ins tlscall:$func), + "bl $func", IIC_BrB, []>; + +// Calls +def : Pat<(PPCcall (i64 tglobaladdr:$dst)), + (BL8 tglobaladdr:$dst)>; +def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)), + (BL8_NOP tglobaladdr:$dst)>; + +def : Pat<(PPCcall (i64 texternalsym:$dst)), + (BL8 texternalsym:$dst)>; +def : Pat<(PPCcall_nop (i64 texternalsym:$dst)), + (BL8_NOP texternalsym:$dst)>; + +// Atomic operations +let usesCustomInserter = 1 in { + let Defs = [CR0] in { + def ATOMIC_LOAD_ADD_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_ADD_I64", + [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_SUB_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_SUB_I64", + [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_OR_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_OR_I64", + [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_XOR_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_XOR_I64", + [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_AND_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_AND_i64", + [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>; + def ATOMIC_LOAD_NAND_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$incr), "#ATOMIC_LOAD_NAND_I64", + [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>; + + def ATOMIC_CMP_SWAP_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$old, g8rc:$new), "#ATOMIC_CMP_SWAP_I64", + [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>; + + def ATOMIC_SWAP_I64 : Pseudo< + (outs g8rc:$dst), (ins memrr:$ptr, g8rc:$new), "#ATOMIC_SWAP_I64", + [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>; + } +} + +// Instructions to support atomic 
operations +def LDARX : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr), + "ldarx $rD, $ptr", IIC_LdStLDARX, + [(set i64:$rD, (PPClarx xoaddr:$ptr))]>; + +let Defs = [CR0] in +def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst), + "stdcx. $rS, $dst", IIC_LdStSTDCX, + [(PPCstcx i64:$rS, xoaddr:$dst)]>, + isDOT; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNdi8 :Pseudo< (outs), + (ins calltarget:$dst, i32imm:$offset), + "#TC_RETURNd8 $dst $offset", + []>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), + "#TC_RETURNa8 $func $offset", + [(PPCtc_return (i64 imm:$func), imm:$offset)]>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset), + "#TC_RETURNr8 $dst $offset", + []>; + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, + isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in +def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, + Requires<[In64BitMode]>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILB8 : IForm<18, 0, 0, (outs), (ins calltarget:$dst), + "b $dst", IIC_BrB, + []>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILBA8 : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst), + "ba $dst", IIC_BrB, + []>; +} // Interpretation64Bit + +def : Pat<(PPCtc_return (i64 tglobaladdr:$dst), imm:$imm), + (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm), + (TCRETURNdi8 texternalsym:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), + (TCRETURNri8 CTRRC8:$dst, imm:$imm)>; + + +// 64-bit CR instructions +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +let neverHasSideEffects = 1 in { +def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), + "mtocrf $FXM, $ST", IIC_BrMCRX>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), + "mtcrf $FXM, $rS", IIC_BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; + +let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), + "mfocrf $rT, $FXM", IIC_SprMFCRF>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), + "mfcr $rT", IIC_SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; +} // neverHasSideEffects = 1 + +let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + let Defs = [CTR8] in + def EH_SjLj_SetJmp64 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + "#EH_SJLJ_SETJMP64", + [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in + def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf), + "#EH_SJLJ_LONGJMP64", + [(PPCeh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; +} + +//===----------------------------------------------------------------------===// +// 64-bit SPR manipulation instrs. 
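Stepping back to the LDARX/STDCX pair defined above, before the SPR section continues: every ATOMIC_* pseudo in this file expands to a load-reserve / store-conditional retry loop. The portable C++ analogue is a compare_exchange_weak loop, which on PPC64 compiles to exactly this ldarx/stdcx. shape; weak failure corresponds to stdcx. losing the reservation.

#include <atomic>
#include <cstdint>

// Model of the expansion of ATOMIC_LOAD_ADD_I64: ldarx, add, stdcx., retry.
uint64_t atomicAddModel(std::atomic<uint64_t> &V, uint64_t Incr) {
  uint64_t Old = V.load(std::memory_order_relaxed); // ldarx
  while (!V.compare_exchange_weak(Old, Old + Incr)) // stdcx. ; bne- retry
    ;                                               // Old is reloaded on failure
  return Old;                                       // the pseudo's $dst
}

int main() {
  std::atomic<uint64_t> V{41};
  return atomicAddModel(V, 1) == 41 && V.load() == 42 ? 0 : 1;
}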
+ +let Uses = [CTR8] in { +def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs g8rc:$rT), (ins), + "mfctr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in { +def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let hasSideEffects = 1, Defs = [CTR8] in { +let Pattern = [(int_ppc_mtctr i64:$rS)] in +def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let Pattern = [(set i64:$rT, readcyclecounter)] in +def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins), + "mfspr $rT, 268", IIC_SprMFTB>, + PPC970_DGroup_First, PPC970_Unit_FXU; +// Note that encoding mftb using mfspr is now the preferred form, +// and has been since at least ISA v2.03. The mftb instruction has +// now been phased out. Using mfspr, however, is known not to work on +// the POWER3. + +let Defs = [X1], Uses = [X1] in +def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", + [(set i64:$result, + (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; + +let Defs = [LR8] in { +def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), + "mtlr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Uses = [LR8] in { +def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins), + "mflr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +} // Interpretation64Bit + +//===----------------------------------------------------------------------===// +// Fixed point instructions. +// + +let PPC970_Unit = 1 in { // FXU Operations. +let Interpretation64Bit = 1 in { +let neverHasSideEffects = 1 in { +let isCodeGenOnly = 1 in { + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { +def LI8 : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm), + "li $rD, $imm", IIC_IntSimple, + [(set i64:$rD, imm64SExt16:$imm)]>; +def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm), + "lis $rD, $imm", IIC_IntSimple, + [(set i64:$rD, imm16ShiftedSExt:$imm)]>; +} + +// Logical ops. 
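A note on the MFTB8 definition above, before the fixed-point section that follows: its pattern matches the generic readcyclecounter node, which clang exposes as a builtin. A minimal use is sketched below; the builtin is clang-specific, and that it lowers on PPC64 to the mfspr $rT, 268 form noted above is the assumption here.

#include <cstdint>
#include <cstdio>

uint64_t readTimeBase() {
  return __builtin_readcyclecounter(); // clang builtin -> readcyclecounter node
}

int main() {
  std::printf("timebase: %llu\n", (unsigned long long)readTimeBase());
  return 0;
}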
+let isCommutable = 1 in { +defm NAND8: XForm_6r<31, 476, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "nand", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>; +defm AND8 : XForm_6r<31, 28, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "and", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (and i64:$rS, i64:$rB))]>; +} // isCommutable +defm ANDC8: XForm_6r<31, 60, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "andc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>; +let isCommutable = 1 in { +defm OR8 : XForm_6r<31, 444, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "or", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (or i64:$rS, i64:$rB))]>; +defm NOR8 : XForm_6r<31, 124, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "nor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>; +} // isCommutable +defm ORC8 : XForm_6r<31, 412, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "orc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>; +let isCommutable = 1 in { +defm EQV8 : XForm_6r<31, 284, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "eqv", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>; +defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB), + "xor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i64:$rA, (xor i64:$rS, i64:$rB))]>; +} // let isCommutable = 1 + +// Logical ops with immediate. +let Defs = [CR0] in { +def ANDIo8 : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "andi. $dst, $src1, $src2", IIC_IntGeneral, + [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "andis. $dst, $src1, $src2", IIC_IntGeneral, + [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>, + isDOT; +} +def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "ori $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>; +def ORIS8 : DForm_4<25, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "oris $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>; +def XORI8 : DForm_4<26, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "xori $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>; +def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "xoris $dst, $src1, $src2", IIC_IntSimple, + [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>; + +let isCommutable = 1 in +defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "add", "$rT, $rA, $rB", IIC_IntSimple, + [(set i64:$rT, (add i64:$rA, i64:$rB))]>; +// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the +// initial-exec thread-local storage model. 
+def ADD8TLS : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, + [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>; + +let isCommutable = 1 in +defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "addc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (addc i64:$rA, i64:$rB))]>, + PPC970_DGroup_Cracked; + +let Defs = [CARRY] in +def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "addic $rD, $rA, $imm", IIC_IntGeneral, + [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>; +def ADDI8 : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm), + "addi $rD, $rA, $imm", IIC_IntSimple, + [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>; +def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm), + "addis $rD, $rA, $imm", IIC_IntSimple, + [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>; + +let Defs = [CARRY] in { +def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "subfic $rD, $rA, $imm", IIC_IntGeneral, + [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>; +defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subfc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (subc i64:$rB, i64:$rA))]>, + PPC970_DGroup_Cracked; +} +defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subf", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (sub i64:$rB, i64:$rA))]>; +defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "neg", "$rT, $rA", IIC_IntSimple, + [(set i64:$rT, (ineg i64:$rA))]>; +let Uses = [CARRY] in { +let isCommutable = 1 in +defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "adde", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, i64:$rB))]>; +defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "addme", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, -1))]>; +defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "addze", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (adde i64:$rA, 0))]>; +defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "subfe", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i64:$rT, (sube i64:$rB, i64:$rA))]>; +defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "subfme", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (sube -1, i64:$rA))]>; +defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$rT), (ins g8rc:$rA), + "subfze", "$rT, $rA", IIC_IntGeneral, + [(set i64:$rT, (sube 0, i64:$rA))]>; +} +} // isCodeGenOnly + +// FIXME: Duplicating this for the asm parser should be unnecessary, but the +// previous definition must be marked as CodeGen only to prevent decoding +// conflicts. 
+let isAsmParserOnly = 1 in +def ADD8TLS_ : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, []>; + +let isCommutable = 1 in { +defm MULHD : XOForm_1r<31, 73, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulhd", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>; +defm MULHDU : XOForm_1r<31, 9, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulhdu", "$rT, $rA, $rB", IIC_IntMulHWU, + [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>; +} // isCommutable +} +} // Interpretation64Bit + +let isCompare = 1, neverHasSideEffects = 1 in { + def CMPD : XForm_16_ext<31, 0, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB), + "cmpd $crD, $rA, $rB", IIC_IntCompare>, isPPC64; + def CMPLD : XForm_16_ext<31, 32, (outs crrc:$crD), (ins g8rc:$rA, g8rc:$rB), + "cmpld $crD, $rA, $rB", IIC_IntCompare>, isPPC64; + def CMPDI : DForm_5_ext<11, (outs crrc:$crD), (ins g8rc:$rA, s16imm64:$imm), + "cmpdi $crD, $rA, $imm", IIC_IntCompare>, isPPC64; + def CMPLDI : DForm_6_ext<10, (outs crrc:$dst), (ins g8rc:$src1, u16imm64:$src2), + "cmpldi $dst, $src1, $src2", + IIC_IntCompare>, isPPC64; +} + +let neverHasSideEffects = 1 in { +defm SLD : XForm_6r<31, 27, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "sld", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64; +defm SRD : XForm_6r<31, 539, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "srd", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64; +defm SRAD : XForm_6rc<31, 794, (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB), + "srad", "$rA, $rS, $rB", IIC_IntRotateD, + [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS), + "extsb", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i8))]>; +defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS), + "extsh", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i16))]>; +} // Interpretation64Bit + +// For fast-isel: +let isCodeGenOnly = 1 in { +def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS), + "extsb $rA, $rS", IIC_IntSimple, []>, isPPC64; +def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS), + "extsh $rA, $rS", IIC_IntSimple, []>, isPPC64; +} // isCodeGenOnly for fast-isel + +defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS), + "extsw", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), + "extsw", "$rA, $rS", IIC_IntSimple, + [(set i64:$rA, (sext i32:$rS))]>, isPPC64; + +defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), + "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, + [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), + "cntlzd", "$rA, $rS", IIC_IntGeneral, + [(set i64:$rA, (ctlz i64:$rS))]>; +def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS), + "popcntd $rA, $rS", IIC_IntGeneral, + [(set i64:$rA, (ctpop i64:$rS))]>; + +// popcntw also does a population count on the high 32 bits (storing the +// results in the high 32-bits of the output). We'll ignore that here (which is +// safe because we never separately use the high part of the 64-bit registers). 
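+// For example, if the underlying 64-bit register held 0x0000000300000004,
+// popcntw would produce 0x0000000200000001; only the low-word count matters
+// for the i32 pattern below.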
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS), + "popcntw $rA, $rS", IIC_IntGeneral, + [(set i32:$rA, (ctpop i32:$rS))]>; + +defm DIVD : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divd", "$rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +defm DIVDU : XOForm_1r<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "divdu", "$rT, $rA, $rB", IIC_IntDivD, + [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +let isCommutable = 1 in +defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB), + "mulld", "$rT, $rA, $rB", IIC_IntMulHD, + [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm), + "mulli $rD, $rA, $imm", IIC_IntMulLI, + [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>; +} + +let neverHasSideEffects = 1 in { +let isCommutable = 1 in { +defm RLDIMI : MDForm_1r<30, 3, (outs g8rc:$rA), + (ins g8rc:$rSi, g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldimi", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64, RegConstraint<"$rSi = $rA">, + NoEncode<"$rSi">; +} + +// Rotate instructions. +defm RLDCL : MDSForm_1r<30, 8, + (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE), + "rldcl", "$rA, $rS, $rB, $MBE", IIC_IntRotateD, + []>, isPPC64; +defm RLDCR : MDSForm_1r<30, 9, + (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE), + "rldcr", "$rA, $rS, $rB, $MBE", IIC_IntRotateD, + []>, isPPC64; +defm RLDICL : MDForm_1r<30, 0, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def RLDICL_32_64 : MDForm_1<30, 0, + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +// End fast-isel. +defm RLDICR : MDForm_1r<30, 1, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; +defm RLDIC : MDForm_1r<30, 2, + (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), + "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA), + (ins g8rc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>; + +let isCommutable = 1 in { +// RLWIMI can be commuted if the rotate amount is zero. +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA), + (ins g8rc:$rSi, g8rc:$rS, u5imm:$SH, u5imm:$MB, + u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", + IIC_IntRotate, []>, PPC970_DGroup_Cracked, + RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; +} + +let isSelect = 1 in +def ISEL8 : AForm_4<31, 15, + (outs g8rc:$rT), (ins g8rc_nox0:$rA, g8rc:$rB, crbitrc:$cond), + "isel $rT, $rA, $rB, $cond", IIC_IntGeneral, + []>; +} // Interpretation64Bit +} // neverHasSideEffects = 1 +} // End FXU Operations. + + +//===----------------------------------------------------------------------===// +// Load/Store instructions. +// + + +// Sign extending loads. 
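+// lha/lhax sign-extend the loaded halfword and lwa/lwax the loaded word into
+// all 64 bits of the result.  Note that lwa is DS-form, so its displacement
+// must be a multiple of 4; that is what the aligned4 fragments below enforce.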
+let canFoldAsLoad = 1, PPC970_Unit = 2 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src), + "lha $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi16 iaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src), + "lwa $rD, $src", IIC_LdStLWA, + [(set i64:$rD, + (aligned4sextloadi32 ixaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAX8: XForm_1<31, 343, (outs g8rc:$rD), (ins memrr:$src), + "lhax $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src), + "lwax $rD, $src", IIC_LdStLHA, + [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64, + PPC970_DGroup_Cracked; +// For fast-isel: +let isCodeGenOnly = 1, mayLoad = 1 in { +def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src), + "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64, + PPC970_DGroup_Cracked; +def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src), + "lwax $rD, $src", IIC_LdStLHA, []>, isPPC64, + PPC970_DGroup_Cracked; +} // end fast-isel isCodeGenOnly + +// Update forms. +let mayLoad = 1, neverHasSideEffects = 1 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAU8 : DForm_1<43, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memri:$addr), + "lhau $rD, $addr", IIC_LdStLHAU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +// NO LWAU! + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in +def LHAUX8 : XForm_1<31, 375, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LWAUX : XForm_1<31, 373, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; +} +} + +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +// Zero extending loads. +let canFoldAsLoad = 1, PPC970_Unit = 2 in { +def LBZ8 : DForm_1<34, (outs g8rc:$rD), (ins memri:$src), + "lbz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi8 iaddr:$src))]>; +def LHZ8 : DForm_1<40, (outs g8rc:$rD), (ins memri:$src), + "lhz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi16 iaddr:$src))]>; +def LWZ8 : DForm_1<32, (outs g8rc:$rD), (ins memri:$src), + "lwz $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64; + +def LBZX8 : XForm_1<31, 87, (outs g8rc:$rD), (ins memrr:$src), + "lbzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi8 xaddr:$src))]>; +def LHZX8 : XForm_1<31, 279, (outs g8rc:$rD), (ins memrr:$src), + "lhzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi16 xaddr:$src))]>; +def LWZX8 : XForm_1<31, 23, (outs g8rc:$rD), (ins memrr:$src), + "lwzx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (zextloadi32 xaddr:$src))]>; + + +// Update forms. 
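+// These also write the computed effective address back into the base
+// register (e.g. lhau rD, d(rA) leaves rA = rA + d), which is what the
+// RegConstraint/NoEncode pairs on the $ea_result operands model.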
+let mayLoad = 1, neverHasSideEffects = 1 in { +def LBZU8 : DForm_1<35, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lbzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +def LHZU8 : DForm_1<41, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; +def LWZU8 : DForm_1<33, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lwzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LBZUX8 : XForm_1<31, 119, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LHZUX8 : XForm_1<31, 311, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +def LWZUX8 : XForm_1<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +} +} +} // Interpretation64Bit + + +// Full 8-byte loads. +let canFoldAsLoad = 1, PPC970_Unit = 2 in { +def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), + "ld $rD, $src", IIC_LdStLD, + [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64; +// The following three definitions are selected for small code model only. +// Otherwise, we need to create two instructions to form a 32-bit offset, +// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select(). +def LDtoc: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), + "#LDtoc", + [(set i64:$rD, + (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; +def LDtocJTI: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), + "#LDtocJTI", + [(set i64:$rD, + (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64; +def LDtocCPT: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc:$reg), + "#LDtocCPT", + [(set i64:$rD, + (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64; + +let hasSideEffects = 1, isCodeGenOnly = 1, RST = 2, Defs = [X2] in +def LDinto_toc: DSForm_1<58, 0, (outs), (ins memrix:$src), + "ld 2, $src", IIC_LdStLD, + [(PPCload_toc ixaddr:$src)]>, isPPC64; + +def LDX : XForm_1<31, 21, (outs g8rc:$rD), (ins memrr:$src), + "ldx $rD, $src", IIC_LdStLD, + [(set i64:$rD, (load xaddr:$src))]>, isPPC64; +def LDBRX : XForm_1<31, 532, (outs g8rc:$rD), (ins memrr:$src), + "ldbrx $rD, $src", IIC_LdStLoad, + [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64; + +let mayLoad = 1, neverHasSideEffects = 1 in { +def LDU : DSForm_1<58, 1, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr), + "ldu $rD, $addr", IIC_LdStLDU, + []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64, + NoEncode<"$ea_result">; + +def LDUX : XForm_1<31, 53, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "ldux $rD, $addr", IIC_LdStLDUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">, isPPC64; +} +} + +def : Pat<(PPCload ixaddr:$src), + (LD ixaddr:$src)>; +def : Pat<(PPCload xaddr:$src), + (LDX xaddr:$src)>; + +// Support for medium and large code model. 
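+// When the TOC offset does not fit in 16 bits, the access is split into a
+// high and a low part; for the medium code model the generated code looks
+// roughly like:
+//   addis 3, 2, sym@toc@ha
+//   ld    3, sym@toc@l(3)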
+def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDIStocHA", + [(set i64:$rD, + (PPCaddisTocHA i64:$reg, tglobaladdr:$disp))]>, + isPPC64; +def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), + "#LDtocL", + [(set i64:$rD, + (PPCldTocL tglobaladdr:$disp, i64:$reg))]>, isPPC64; +def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), + "#ADDItocL", + [(set i64:$rD, + (PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64; + +// Support for thread-local storage. +def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDISgotTprelHA", + [(set i64:$rD, + (PPCaddisGotTprelHA i64:$reg, + tglobaltlsaddr:$disp))]>, + isPPC64; +def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg), + "#LDgotTprelL", + [(set i64:$rD, + (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>, + isPPC64; +def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g), + (ADD8TLS $in, tglobaltlsaddr:$g)>; +def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDIStlsgdHA", + [(set i64:$rD, + (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDItlsgdL", + [(set i64:$rD, + (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), + "#GETtlsADDR", + [(set i64:$rD, + (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; +def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDIStlsldHA", + [(set i64:$rD, + (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDItlsldL", + [(set i64:$rD, + (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), + "#GETtlsldADDR", + [(set i64:$rD, + (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; +def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDISdtprelHA", + [(set i64:$rD, + (PPCaddisDtprelHA i64:$reg, + tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#ADDIdtprelL", + [(set i64:$rD, + (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; + +let PPC970_Unit = 2 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +// Truncating stores. +def STB8 : DForm_1<38, (outs), (ins g8rc:$rS, memri:$src), + "stb $rS, $src", IIC_LdStStore, + [(truncstorei8 i64:$rS, iaddr:$src)]>; +def STH8 : DForm_1<44, (outs), (ins g8rc:$rS, memri:$src), + "sth $rS, $src", IIC_LdStStore, + [(truncstorei16 i64:$rS, iaddr:$src)]>; +def STW8 : DForm_1<36, (outs), (ins g8rc:$rS, memri:$src), + "stw $rS, $src", IIC_LdStStore, + [(truncstorei32 i64:$rS, iaddr:$src)]>; +def STBX8 : XForm_8<31, 215, (outs), (ins g8rc:$rS, memrr:$dst), + "stbx $rS, $dst", IIC_LdStStore, + [(truncstorei8 i64:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STHX8 : XForm_8<31, 407, (outs), (ins g8rc:$rS, memrr:$dst), + "sthx $rS, $dst", IIC_LdStStore, + [(truncstorei16 i64:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STWX8 : XForm_8<31, 151, (outs), (ins g8rc:$rS, memrr:$dst), + "stwx $rS, $dst", IIC_LdStStore, + [(truncstorei32 i64:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +} // Interpretation64Bit + +// Normal 8-byte stores. 
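+// std is DS-form, so its displacement must also be a multiple of 4; offsets
+// that may be misaligned instead go through the X-form stdx (see the
+// unaligned4store pattern at the end of this file).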
+def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst), + "std $rS, $dst", IIC_LdStSTD, + [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64; +def STDX : XForm_8<31, 149, (outs), (ins g8rc:$rS, memrr:$dst), + "stdx $rS, $dst", IIC_LdStSTD, + [(store i64:$rS, xaddr:$dst)]>, isPPC64, + PPC970_DGroup_Cracked; +def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst), + "stdbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64, + PPC970_DGroup_Cracked; +} + +// Stores with Update (pre-inc). +let PPC970_Unit = 2, mayStore = 1 in { +let Interpretation64Bit = 1, isCodeGenOnly = 1 in { +def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst), + "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; + +def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +} // Interpretation64Bit + +def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrix:$dst), + "stdu $rS, $dst", IIC_LdStSTDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">, + isPPC64; + +def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memrr:$dst), + "stdux $rS, $dst", IIC_LdStSTDUX, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked, isPPC64; +} + +// Patterns to match the pre-inc stores. We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. +def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STDU $rS, iaddroff:$ptroff, $ptrreg)>; + +def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STBUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STHUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STWUX8 $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STDUX $rS, $ptrreg, $ptroff)>; + + +//===----------------------------------------------------------------------===// +// Floating point instructions. 
+// + + +let PPC970_Unit = 3, neverHasSideEffects = 1, + Uses = [RM] in { // FPU Operations. +defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), + "fcfid", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64; +defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), + "fctid", "$frD, $frB", IIC_FPGeneral, + []>, isPPC64; +defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), + "fctidz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; + +defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB), + "fcfidu", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64; +defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB), + "fcfids", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64; +defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB), + "fcfidus", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64; +defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiduz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64; +defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwuz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64; +} + + +//===----------------------------------------------------------------------===// +// Instruction Patterns +// + +// Extensions and truncates to/from 32-bit regs. +def : Pat<(i64 (zext i32:$in)), + (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32), + 0, 32)>; +def : Pat<(i64 (anyext i32:$in)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>; +def : Pat<(i32 (trunc i64:$in)), + (EXTRACT_SUBREG $in, sub_32)>; + +// Implement the 'not' operation with the NOR instruction. +// (we could use the default xori pattern, but nor has lower latency on some +// cores (such as the A2)). +def i64not : OutPatFrag<(ops node:$in), + (NOR8 $in, $in)>; +def : Pat<(not i64:$in), + (i64not $in)>; + +// Extending loads with i64 targets. +def : Pat<(zextloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ8 iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX8 xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ8 iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX8 xaddr:$src)>; +def : Pat<(extloadi32 iaddr:$src), + (LWZ8 iaddr:$src)>; +def : Pat<(extloadi32 xaddr:$src), + (LWZX8 xaddr:$src)>; + +// Standard shifts. These are represented separately from the real shifts above +// so that we can distinguish between shifts that allow 6-bit and 7-bit shift +// amounts. +def : Pat<(sra i64:$rS, i32:$rB), + (SRAD $rS, $rB)>; +def : Pat<(srl i64:$rS, i32:$rB), + (SRD $rS, $rB)>; +def : Pat<(shl i64:$rS, i32:$rB), + (SLD $rS, $rB)>; + +// SHL/SRL +def : Pat<(shl i64:$in, (i32 imm:$imm)), + (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>; +def : Pat<(srl i64:$in, (i32 imm:$imm)), + (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>; + +// ROTL +def : Pat<(rotl i64:$in, i32:$sh), + (RLDCL $in, $sh, 0)>; +def : Pat<(rotl i64:$in, (i32 imm:$imm)), + (RLDICL $in, imm:$imm, 0)>; + +// Hi and Lo for Darwin Global Addresses. 
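+// PPChi/PPClo select the high and low 16 bits of a symbol's address; the
+// patterns below fold them into lis/li immediates, or into addis/addi when
+// added to another value.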
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI8 tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in , 0), (LI8 tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in , 0), (LI8 tjumptable:$in)>; +def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>; +def : Pat<(PPClo tblockaddress:$in, 0), (LI8 tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in), + (ADDIS8 $in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in), + (ADDI8 $in, tglobaltlsaddr:$g)>; +def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS8 $in, tglobaladdr:$g)>; +def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)), + (ADDIS8 $in, tconstpool:$g)>; +def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)), + (ADDIS8 $in, tjumptable:$g)>; +def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)), + (ADDIS8 $in, tblockaddress:$g)>; + +// Patterns to match r+r indexed loads and stores for +// addresses without at least 4-byte alignment. +def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)), + (LWAX xoaddr:$src)>; +def : Pat<(i64 (unaligned4load xoaddr:$src)), + (LDX xoaddr:$src)>; +def : Pat<(unaligned4store i64:$rS, xoaddr:$dst), + (STDX $rS, xoaddr:$dst)>; + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td new file mode 100644 index 000000000000..b271b5d5aa21 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -0,0 +1,939 @@ +//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Altivec extension to the PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Altivec transformation functions and pattern fragments. +// + +// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be +// of that type. +def vnot_ppc : PatFrag<(ops node:$in), + (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>; + +def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); +}]>; +def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 0, *CurDAG); +}]>; +def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG); +}]>; +def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 1, *CurDAG); +}]>; + +// These fragments are provided for little-endian, where the inputs must be +// swapped for correct semantics. 
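+// (The trailing argument of 2 in the predicates below selects this
+// swapped-operand check; 0 is the normal form and 1 the unary form above.)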
+def vpkuhum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG); +}]>; +def vpkuwum_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), 2, *CurDAG); +}]>; + +def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG); +}]>; +def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG); +}]>; +def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG); +}]>; +def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 0, *CurDAG); +}]>; +def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 0, *CurDAG); +}]>; +def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 0, *CurDAG); +}]>; + + +def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG); +}]>; +def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG); +}]>; +def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG); +}]>; +def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 1, *CurDAG); +}]>; +def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 1, *CurDAG); +}]>; +def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 1, *CurDAG); +}]>; + + +// These fragments are provided for little-endian, where the inputs must be +// swapped for correct semantics. 
+def vmrglb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG); +}]>; +def vmrglh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG); +}]>; +def vmrglw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG); +}]>; +def vmrghb_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, 2, *CurDAG); +}]>; +def vmrghh_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, 2, *CurDAG); +}]>; +def vmrghw_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, 2, *CurDAG); +}]>; + + +def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 0, *CurDAG)); +}]>; +def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 0, *CurDAG) != -1; +}], VSLDOI_get_imm>; + + +/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into +/// vector_shuffle(X,undef,mask) by the dag combiner. +def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 1, *CurDAG)); +}]>; +def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 1, *CurDAG) != -1; +}], VSLDOI_unary_get_imm>; + + +/// VSLDOI_swapped* - These fragments are provided for little-endian, where +/// the inputs must be swapped for correct semantics. +def VSLDOI_swapped_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::isVSLDOIShuffleMask(N, 2, *CurDAG)); +}]>; +def vsldoi_swapped_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isVSLDOIShuffleMask(N, 2, *CurDAG) != -1; +}], VSLDOI_get_imm>; + + +// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm. +def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG)); +}]>; +def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1); +}], VSPLTB_get_imm>; +def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG)); +}]>; +def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2); +}], VSPLTH_get_imm>; +def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{ + return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG)); +}]>; +def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4); +}], VSPLTW_get_imm>; + + +// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm. 
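+// For example, a build_vector splatting the byte 5 becomes vspltisb $vD, 5.
+// get_VSPLTI_elt returns the splat value when it fits the signed 5-bit
+// immediate, and a null node otherwise.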
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG); +}]>; +def vecspltisb : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0; +}], VSPLTISB_get_imm>; + +// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm. +def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG); +}]>; +def vecspltish : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0; +}], VSPLTISH_get_imm>; + +// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm. +def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG); +}]>; +def vecspltisw : PatLeaf<(build_vector), [{ + return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0; +}], VSPLTISW_get_imm>; + +//===----------------------------------------------------------------------===// +// Helpers for defining instructions that directly correspond to intrinsics. + +// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type. +class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>; + +// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>; + +// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two +// input types and an output type. +class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType In1Ty, ValueType In2Ty> + : VAForm_1a<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + !strconcat(opc, " $vD, $vA, $vB, $vC"), IIC_VecFP, + [(set OutTy:$vD, + (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>; + +// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type. +class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>; + +// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>; + +// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two +// input types and an output type. +class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType In1Ty, ValueType In2Ty> + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + !strconcat(opc, " $vD, $vA, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>; + +// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type. 
+class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID> + : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB), + !strconcat(opc, " $vD, $vB"), IIC_VecFP, + [(set v4f32:$vD, (IntID v4f32:$vB))]>; + +// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the +// inputs doesn't match the type of the output. +class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy, + ValueType InTy> + : VXForm_2<xo, (outs vrrc:$vD), (ins vrrc:$vB), + !strconcat(opc, " $vD, $vB"), IIC_VecFP, + [(set OutTy:$vD, (IntID InTy:$vB))]>; + +//===----------------------------------------------------------------------===// +// Instruction Definitions. + +def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">; +let Predicates = [HasAltivec] in { + +let isCodeGenOnly = 1 in { +def DSS : DSS_Form<822, (outs), + (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2), + "dss $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSSALL : DSS_Form<822, (outs), + (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2), + "dssall", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DST : DSS_Form<342, (outs), + (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB), + "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTT : DSS_Form<342, (outs), + (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB), + "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTST : DSS_Form<374, (outs), + (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB), + "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTSTT : DSS_Form<374, (outs), + (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB), + "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; + +def DST64 : DSS_Form<342, (outs), + (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTT64 : DSS_Form<342, (outs), + (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dstt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTST64 : DSS_Form<374, (outs), + (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dstst $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +def DSTSTT64 : DSS_Form<374, (outs), + (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB), + "dststt $rA, $rB, $STRM", IIC_LdStLoad /*FIXME*/, []>, + Deprecated<DeprecatedDST>; +} + +def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), + "mfvscr $vD", IIC_LdStStore, + [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; +def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), + "mtvscr $vB", IIC_LdStLoad, + [(int_ppc_altivec_mtvscr v4i32:$vB)]>; + +let canFoldAsLoad = 1, PPC970_Unit = 2 in { // Loads. 
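+// Note that the vector loads ignore the low four bits of the effective
+// address (they access the containing 16-byte-aligned quadword); lvsl/lvsr
+// below produce the permute control vectors used to realign unaligned data.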
+def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src), + "lvebx $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>; +def LVEHX: XForm_1<31, 39, (outs vrrc:$vD), (ins memrr:$src), + "lvehx $vD, $src", IIC_LdStLoad, + [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>; +def LVEWX: XForm_1<31, 71, (outs vrrc:$vD), (ins memrr:$src), + "lvewx $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>; +def LVX : XForm_1<31, 103, (outs vrrc:$vD), (ins memrr:$src), + "lvx $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>; +def LVXL : XForm_1<31, 359, (outs vrrc:$vD), (ins memrr:$src), + "lvxl $vD, $src", IIC_LdStLoad, + [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>; +} + +def LVSL : XForm_1<31, 6, (outs vrrc:$vD), (ins memrr:$src), + "lvsl $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>, + PPC970_Unit_LSU; +def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src), + "lvsr $vD, $src", IIC_LdStLoad, + [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>, + PPC970_Unit_LSU; + +let PPC970_Unit = 2 in { // Stores. +def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst), + "stvebx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>; +def STVEHX: XForm_8<31, 167, (outs), (ins vrrc:$rS, memrr:$dst), + "stvehx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>; +def STVEWX: XForm_8<31, 199, (outs), (ins vrrc:$rS, memrr:$dst), + "stvewx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>; +def STVX : XForm_8<31, 231, (outs), (ins vrrc:$rS, memrr:$dst), + "stvx $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>; +def STVXL : XForm_8<31, 487, (outs), (ins vrrc:$rS, memrr:$dst), + "stvxl $rS, $dst", IIC_LdStStore, + [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>; +} + +let PPC970_Unit = 5 in { // VALU Operations. +// VA-Form instructions. 3-input AltiVec ops. +let isCommutable = 1 in { +def VMADDFP : VAForm_1<46, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB), + "vmaddfp $vD, $vA, $vC, $vB", IIC_VecFP, + [(set v4f32:$vD, + (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>; + +// FIXME: The fma+fneg pattern won't match because fneg is not legal. +def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB), + "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP, + [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC, + (fneg v4f32:$vB))))]>; + +def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>; +def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs, + v8i16>; +def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>; +} // isCommutable + +def VPERM : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm, + v4i32, v4i32, v16i8>; +def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>; + +// Shuffles. +def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH), + "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP, + [(set v16i8:$vD, + (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>; + +// VX-Form instructions. AltiVec arithmetic ops. 
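+// The vaddu*m forms below are modulo (wrapping) adds; the saturating
+// vadds*s/vaddu*s forms can set SAT in the VSCR and are modeled via their
+// intrinsics.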
+let isCommutable = 1 in { +def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddfp $vD, $vA, $vB", IIC_VecFP, + [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>; + +def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vaddubm $vD, $vA, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>; +def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vadduhm $vD, $vA, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>; +def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vadduwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>; + +def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>; +def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>; +def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>; +def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>; +def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>; +def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>; +def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>; +} // isCommutable + +let isCommutable = 1 in +def VAND : VXForm_1<1028, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vand $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>; +def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vandc $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (and v4i32:$vA, + (vnot_ppc v4i32:$vB)))]>; + +def VCFSX : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vcfsx $vD, $vB, $UIMM", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>; +def VCFUX : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vcfux $vD, $vB, $UIMM", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>; +def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vctsxs $vD, $vB, $UIMM", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>; +def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vctuxs $vD, $vB, $UIMM", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>; + +// Defines with the UIM field set to 0 for floating-point +// to integer (fp_to_sint/fp_to_uint) conversions and integer +// to floating-point (sint_to_fp/uint_to_fp) conversions. 
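+// (With UIMM = 0 no 2^UIMM scaling is applied, so these match the plain
+// conversions.)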
+let isCodeGenOnly = 1, VA = 0 in { +def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB), + "vcfsx $vD, $vB, 0", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>; +def VCTUXS_0 : VXForm_1<906, (outs vrrc:$vD), (ins vrrc:$vB), + "vctuxs $vD, $vB, 0", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>; +def VCFUX_0 : VXForm_1<778, (outs vrrc:$vD), (ins vrrc:$vB), + "vcfux $vD, $vB, 0", IIC_VecFP, + [(set v4f32:$vD, + (int_ppc_altivec_vcfux v4i32:$vB, 0))]>; +def VCTSXS_0 : VXForm_1<970, (outs vrrc:$vD), (ins vrrc:$vB), + "vctsxs $vD, $vB, 0", IIC_VecFP, + [(set v4i32:$vD, + (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>; +} +def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>; +def VLOGEFP : VX2_Int_SP<458, "vlogefp", int_ppc_altivec_vlogefp>; + +let isCommutable = 1 in { +def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>; +def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>; +def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>; +def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>; +def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>; +def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>; + +def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>; +def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>; +def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>; +def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>; +def VMAXUB : VX1_Int_Ty< 2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>; +def VMAXUH : VX1_Int_Ty< 66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>; +def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>; +def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>; +def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>; +def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>; +def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>; +def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>; +def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>; +def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>; +} // isCommutable + +def VMRGHB : VXForm_1< 12, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghb $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGHH : VXForm_1< 76, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghh $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGHW : VXForm_1<140, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrghw $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLB : VXForm_1<268, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglb $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLH : VXForm_1<332, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglh $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>; +def VMRGLW : VXForm_1<396, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmrglw $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>; + +def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm, + v4i32, v16i8, v4i32>; +def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm, + v4i32, v8i16, v4i32>; +def VMSUMSHS : 
VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs, + v4i32, v8i16, v4i32>; +def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm, + v4i32, v16i8, v4i32>; +def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm, + v4i32, v8i16, v4i32>; +def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs, + v4i32, v8i16, v4i32>; + +let isCommutable = 1 in { +def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb, + v8i16, v16i8>; +def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh, + v4i32, v8i16>; +def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub, + v8i16, v16i8>; +def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh, + v4i32, v8i16>; +def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb, + v8i16, v16i8>; +def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh, + v4i32, v8i16>; +def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub, + v8i16, v16i8>; +def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh, + v4i32, v8i16>; +} // isCommutable + +def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>; +def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>; +def VRFIN : VX2_Int_SP<522, "vrfin", int_ppc_altivec_vrfin>; +def VRFIP : VX2_Int_SP<650, "vrfip", int_ppc_altivec_vrfip>; +def VRFIZ : VX2_Int_SP<586, "vrfiz", int_ppc_altivec_vrfiz>; +def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>; + +def VSUBCUW : VX1_Int_Ty<1408, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>; + +def VSUBFP : VXForm_1<74, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubfp $vD, $vA, $vB", IIC_VecGeneral, + [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>; +def VSUBUBM : VXForm_1<1024, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsububm $vD, $vA, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>; +def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubuhm $vD, $vA, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>; +def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vsubuwm $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>; + +def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>; +def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>; +def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>; +def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>; +def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>; +def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>; + +def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>; +def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>; + +def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs, + v4i32, v16i8, v4i32>; +def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs, + v4i32, v8i16, v4i32>; +def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs, + v4i32, v16i8, v4i32>; + +def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vnor $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA, + v4i32:$vB)))]>; +let isCommutable = 1 in { +def VOR : VXForm_1<1156, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vor $vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>; +def VXOR : VXForm_1<1220, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vxor 
$vD, $vA, $vB", IIC_VecFP, + [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>; +} // isCommutable + +def VRLB : VX1_Int_Ty< 4, "vrlb", int_ppc_altivec_vrlb, v16i8>; +def VRLH : VX1_Int_Ty< 68, "vrlh", int_ppc_altivec_vrlh, v8i16>; +def VRLW : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>; + +def VSL : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl, v4i32 >; +def VSLO : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>; + +def VSLB : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>; +def VSLH : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>; +def VSLW : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>; + +def VSPLTB : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vspltb $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>; +def VSPLTH : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vsplth $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>; +def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB), + "vspltw $vD, $vB, $UIMM", IIC_VecPerm, + [(set v16i8:$vD, + (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>; + +def VSR : VX1_Int_Ty< 708, "vsr" , int_ppc_altivec_vsr, v4i32>; +def VSRO : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>; + +def VSRAB : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>; +def VSRAH : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>; +def VSRAW : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>; +def VSRB : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>; +def VSRH : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>; +def VSRW : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>; + + +def VSPLTISB : VXForm_3<780, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltisb $vD, $SIMM", IIC_VecPerm, + [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>; +def VSPLTISH : VXForm_3<844, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltish $vD, $SIMM", IIC_VecPerm, + [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>; +def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM), + "vspltisw $vD, $SIMM", IIC_VecPerm, + [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>; + +// Vector Pack. +def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx, + v8i16, v4i32>; +def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, + v16i8, v8i16>; +def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, + v16i8, v8i16>; +def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, + v16i8, v4i32>; +def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, + v8i16, v4i32>; +def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpkuhum $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, + (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>; +def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus, + v16i8, v8i16>; +def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vpkuwum $vD, $vA, $vB", IIC_VecFP, + [(set v16i8:$vD, + (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>; +def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus, + v8i16, v4i32>; + +// Vector Unpack. 
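+// The h/l variants expand the high or low half of the source vector:
+// vupk[hl]s[bh] sign-extend each element to the next wider type, and
+// vupk[hl]px expand 1/5/5/5 pixel values.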
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
+                          v4i32, v8i16>;
+def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
+                          v8i16, v16i8>;
+def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
+                          v4i32, v8i16>;
+def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
+                          v4i32, v8i16>;
+def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
+                          v8i16, v16i8>;
+def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
+                          v4i32, v8i16>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr,
+              IIC_VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
+
+// f32 element comparisons.
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw.
$vD, $vA, $vB", v4i32>; + +let isCodeGenOnly = 1 in { +def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v16i8:$vD, (v16i8 immAllZerosV))]>; +def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v8i16:$vD, (v8i16 immAllZerosV))]>; +def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins), + "vxor $vD, $vD, $vD", IIC_VecFP, + [(set v4i32:$vD, (v4i32 immAllZerosV))]>; + +let IMM=-1 in { +def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v16i8:$vD, (v16i8 immAllOnesV))]>; +def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v8i16:$vD, (v8i16 immAllOnesV))]>; +def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins), + "vspltisw $vD, -1", IIC_VecFP, + [(set v4i32:$vD, (v4i32 immAllOnesV))]>; +} +} +} // VALU Operations. + +//===----------------------------------------------------------------------===// +// Additional Altivec Patterns +// + +// DS* intrinsics +def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>; +def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>; + +// * 32-bit +def : Pat<(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM), + (DST 0, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM), + (DSTT 1, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM), + (DSTST 0, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM), + (DSTSTT 1, imm:$STRM, $rA, $rB)>; + +// * 64-bit +def : Pat<(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM), + (DST64 0, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM), + (DSTT64 1, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dstst i64:$rA, i32:$rB, imm:$STRM), + (DSTST64 0, imm:$STRM, $rA, $rB)>; +def : Pat<(int_ppc_altivec_dststt i64:$rA, i32:$rB, imm:$STRM), + (DSTSTT64 1, imm:$STRM, $rA, $rB)>; + +// Loads. +def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>; + +// Stores. +def : Pat<(store v4i32:$rS, xoaddr:$dst), + (STVX $rS, xoaddr:$dst)>; + +// Bit conversions. +def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>; + +def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>; + +def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>; + +def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>; + +// Shuffles. + +// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x) +def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef), + (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>; +def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef), + (VPKUWUM $vA, $vA)>; +def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef), + (VPKUHUM $vA, $vA)>; + +// Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands. 
+// These fragments are matched for little-endian, where the inputs must +// be swapped for correct semantics. +def:Pat<(vsldoi_swapped_shuffle:$in v16i8:$vA, v16i8:$vB), + (VSLDOI $vB, $vA, (VSLDOI_swapped_get_imm $in))>; +def:Pat<(vpkuwum_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VPKUWUM $vB, $vA)>; +def:Pat<(vpkuhum_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VPKUHUM $vB, $vA)>; + +// Match vmrg*(x,x) +def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef), + (VMRGLB $vA, $vA)>; +def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef), + (VMRGLH $vA, $vA)>; +def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef), + (VMRGLW $vA, $vA)>; +def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef), + (VMRGHB $vA, $vA)>; +def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef), + (VMRGHH $vA, $vA)>; +def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef), + (VMRGHW $vA, $vA)>; + +// Match vmrg*(y,x), i.e., swapped operands. These fragments +// are matched for little-endian, where the inputs must be +// swapped for correct semantics. +def:Pat<(vmrglb_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLB $vB, $vA)>; +def:Pat<(vmrglh_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLH $vB, $vA)>; +def:Pat<(vmrglw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGLW $vB, $vA)>; +def:Pat<(vmrghb_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHB $vB, $vA)>; +def:Pat<(vmrghh_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHH $vB, $vA)>; +def:Pat<(vmrghw_swapped_shuffle v16i8:$vA, v16i8:$vB), + (VMRGHW $vB, $vA)>; + +// Logical Operations +def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>; + +def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)), + (VNOR $A, $B)>; +def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)), + (VANDC $A, $B)>; + +def : Pat<(fmul v4f32:$vA, v4f32:$vB), + (VMADDFP $vA, $vB, + (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>; + +// Fused multiply add and multiply sub for packed float. 
These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Raphson refinement (used by the
+// divide and sqrt expansions).
+def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
+          (VPERM $vA, $vB, $vC)>;
+
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
+// Vector shifts
+def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSLW $vA, $vB))>;
+
+def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRW $vA, $vB))>;
+
+def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRAW $vA, $vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
+          (VCTSXS_0 $vA)>;
+def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
+          (VCTUXS_0 $vA)>;
+def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
+          (VCFSX_0 $vA)>;
+def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
+          (VCFUX_0 $vA)>;
+
+// Floating-point rounding
+def : Pat<(v4f32 (ffloor v4f32:$vA)),
+          (VRFIM $vA)>;
+def : Pat<(v4f32 (fceil v4f32:$vA)),
+          (VRFIP $vA)>;
+def : Pat<(v4f32 (ftrunc v4f32:$vA)),
+          (VRFIZ $vA)>;
+def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
+          (VRFIN $vA)>;
+
+} // end HasAltivec
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 000000000000..b424d1101416
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aids for building PPC insts ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference carries the FrameIndex in place of a base register until it is
+/// resolved, and a constant offset can be supplied as well.
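+///
+/// A typical use (illustrative), loading stack slot FI into register Reg:
+///   addFrameReference(BuildMI(MBB, MI, DL, TII.get(PPC::LWZ), Reg), FI);
+/// which appends ".addImm(0).addFrameIndex(FI)" to the instruction being
+/// built.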
+/// +static inline const MachineInstrBuilder& +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0, + bool mem = true) { + if (mem) + return MIB.addImm(Offset).addFrameIndex(FI); + else + return MIB.addFrameIndex(FI).addImm(Offset); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td new file mode 100644 index 000000000000..1e4396cd1017 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -0,0 +1,1300 @@ +//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// PowerPC instruction formats + +class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : Instruction { + field bits<32> Inst; + field bits<32> SoftFail = 0; + let Size = 4; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Namespace = "PPC"; + let Inst{0-5} = opcode; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; + + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + let TSFlags{0} = PPC970_First; + let TSFlags{1} = PPC970_Single; + let TSFlags{2} = PPC970_Cracked; + let TSFlags{5-3} = PPC970_Unit; + + // Fields used for relation models. + string BaseName = ""; + + // For cases where multiple instruction definitions really represent the + // same underlying instruction but with one definition for 64-bit arguments + // and one for 32-bit arguments, this bit breaks the degeneracy between + // the two forms and allows TableGen to generate mapping tables. + bit Interpretation64Bit = 0; +} + +class PPC970_DGroup_First { bits<1> PPC970_First = 1; } +class PPC970_DGroup_Single { bits<1> PPC970_Single = 1; } +class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; } +class PPC970_MicroCode; + +class PPC970_Unit_Pseudo { bits<3> PPC970_Unit = 0; } +class PPC970_Unit_FXU { bits<3> PPC970_Unit = 1; } +class PPC970_Unit_LSU { bits<3> PPC970_Unit = 2; } +class PPC970_Unit_FPU { bits<3> PPC970_Unit = 3; } +class PPC970_Unit_CRU { bits<3> PPC970_Unit = 4; } +class PPC970_Unit_VALU { bits<3> PPC970_Unit = 5; } +class PPC970_Unit_VPERM { bits<3> PPC970_Unit = 6; } +class PPC970_Unit_BRU { bits<3> PPC970_Unit = 7; } + +// Two joined instructions; used to emit two adjacent instructions as one. +// The itinerary from the first instruction is used for scheduling and +// classification. 
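+// (For example, the BL8+NOP call sequence, a "bl" followed by a nop that the
+// linker may rewrite into a TOC restore, is emitted as one such pair; see
+// IForm_and_DForm_4_zero below.)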
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : Instruction { + field bits<64> Inst; + field bits<64> SoftFail = 0; + let Size = 8; + + bit PPC64 = 0; // Default value, override with isPPC64 + + let Namespace = "PPC"; + let Inst{0-5} = opcode1; + let Inst{32-37} = opcode2; + let OutOperandList = OOL; + let InOperandList = IOL; + let AsmString = asmstr; + let Itinerary = itin; + + bits<1> PPC970_First = 0; + bits<1> PPC970_Single = 0; + bits<1> PPC970_Cracked = 0; + bits<3> PPC970_Unit = 0; + + /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to + /// these must be reflected there! See comments there for what these are. + let TSFlags{0} = PPC970_First; + let TSFlags{1} = PPC970_Single; + let TSFlags{2} = PPC970_Cracked; + let TSFlags{5-3} = PPC970_Unit; + + // Fields used for relation models. + string BaseName = ""; + bit Interpretation64Bit = 0; +} + +// 1.7.1 I-Form +class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; +} + +// 1.7.2 B-Form +class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. + bits<3> CR; + bits<14> BD; + + bits<5> BI; + let BI{0-1} = BIBO{5-6}; + let BI{2-4} = CR{0-2}; + + let Inst{6-10} = BIBO{4-0}; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL, + string asmstr> + : BForm<opcode, aa, lk, OOL, IOL, asmstr> { + let BIBO{4-0} = bo; + let BIBO{6-5} = 0; + let CR = 0; +} + +class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<14> BD; + + let Inst{6-10} = bo; + let Inst{11-15} = bi; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_3<bits<6> opcode, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<5> BO; + bits<5> BI; + bits<14> BD; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +class BForm_4<bits<6> opcode, bits<5> bo, bit aa, bit lk, + dag OOL, dag IOL, string asmstr> + : I<opcode, OOL, IOL, asmstr, IIC_BrB> { + bits<5> BI; + bits<14> BD; + + let Inst{6-10} = bo; + let Inst{11-15} = BI; + let Inst{16-29} = BD; + let Inst{30} = aa; + let Inst{31} = lk; +} + +// 1.7.3 SC-Form +class SCForm<bits<6> opcode, bits<1> xo, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<7> LEV; + + let Pattern = pattern; + + let Inst{20-26} = LEV; + let Inst{30} = xo; +} + +// 1.7.4 D-Form +class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> B; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<21> Addr; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = Addr{20-16}; // Base 
Reg + let Inst{16-31} = Addr{15-0}; // Displacement +} + +class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<16> C; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + + +class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> { + + // Even though ADDICo does not really have an RC bit, provide + // the declaration of one here so that isDOT has something to set. + bit RC = 0; +} + +class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<16> B; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = 0; + let Inst{16-31} = B; +} + +class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> B; + bits<5> A; + bits<16> C; + + let Pattern = pattern; + + let Inst{6-10} = A; + let Inst{11-15} = B; + let Inst{16-31} = C; +} + +class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let Addr = 0; +} + +class DForm_4_fixedreg_zero<bits<6> opcode, bits<5> R, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : DForm_4<opcode, OOL, IOL, asmstr, itin, pattern> { + let A = R; + let B = R; + let C = 0; +} + +class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<21> Addr; + + let Pattern = pattern; + bits<24> LI; + + let Inst{6-29} = LI; + let Inst{30} = aa; + let Inst{31} = lk; + + let Inst{38-42} = A; + let Inst{43-47} = Addr{20-16}; // Base Reg + let Inst{48-63} = Addr{15-0}; // Displacement +} + +// This is used to emit BL8+NOP. +class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2, + dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : IForm_and_DForm_1<opcode1, aa, lk, opcode2, + OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let Addr = 0; +} + +class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<16> I; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-31} = I; +} + +class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_5<opcode, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + +class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_5<opcode, OOL, IOL, asmstr, itin>; + +class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : DForm_6<opcode, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + + +// 1.7.5 DS-Form +class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RST; + bits<19> DS_RA; + + let Pattern = pattern; + + let Inst{6-10} = RST; + let Inst{11-15} = DS_RA{18-14}; // Register # + let Inst{16-29} = DS_RA{13-0}; // Displacement. 
+ let Inst{30-31} = xo; +} + + +// 1.7.6 X-Form +class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RST; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +// This is the same as XForm_base_r3xo, but the first two operands are swapped +// when code is emitted. +class XForm_base_r3xo_swapped + <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> RST; + bits<5> B; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + + +class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; + +class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; +} + +class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let A = 0; + let B = 0; +} + +class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>; + +class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let Pattern = pattern; +} + +class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> { + let B = 0; + let Pattern = pattern; +} + +class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<1> L; + bits<5> RA; + bits<5> RB; + + let Inst{6-8} = BF; + let Inst{9} = 0; + let Inst{10} = L; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RS; + bits<1> L; + + let Inst{6-10} = RS; + let Inst{15} = L; + let Inst{21-30} = xo; +} + +class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> { + let L = PPC64; +} + +class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<5> FRA; + bits<5> FRB; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + 
InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + let Pattern = pattern; + let Inst{6-10} = 31; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<2> L; + + let Pattern = pattern; + let Inst{6-8} = 0; + let Inst{9-10} = L; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let L = 0; +} + +class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { +} + +class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let A = 0; +} + +class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { +} + +// This is used for MFFS, MTFSB0, MTFSB1. 42 is arbitrary; this series of +// numbers presumably relates to some document, but I haven't found it. +class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RST; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} +class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let Pattern = pattern; + bits<5> FM; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FM; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; + let A = 0; + let B = 0; +} + +class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let RST = 0; + let A = 0; +} + +// XX*-Form (VSX) +class XX1Form<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = XT{5}; +} + +class XX2Form<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = 0; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX2Form_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, 
asmstr, itin> { + bits<3> CR; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = CR; + let Inst{9-15} = 0; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class XX2Form_2<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XB; + bits<2> D; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-13} = 0; + let Inst{14-15} = D; + let Inst{16-20} = XB{4-0}; + let Inst{21-29} = xo; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> CR; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = CR; + let Inst{9-10} = 0; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class XX3Form_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<2> D; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21} = 0; + let Inst{22-23} = D; + let Inst{24-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21} = RC; + let Inst{22-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX4Form<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<6> XC; + + let Pattern = pattern; + + let Inst{6-10} = XT{4-0}; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-25} = XC{4-0}; + let Inst{26-27} = xo; + let Inst{28} = XC{5}; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +// DCB_Form - Form X instruction, used for dcb* instructions. +class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6-10} = immfield; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + + +// DSS_Form - Form X instruction, used for altivec dss* instructions. 
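+// (Judging from the DS* patterns earlier in this diff, the leading immediate
+// operand supplies the T bit: dss, dst, and dstst pass 0, while dssall and
+// the transient dstt/dststt variants pass 1.)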
+class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<31, OOL, IOL, asmstr, itin> { + bits<1> T; + bits<2> STRM; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + + let Inst{6} = T; + let Inst{7-8} = 0; + let Inst{9-10} = STRM; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// 1.7.7 XL-Form +class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> CRD; + bits<5> CRA; + bits<5> CRB; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRA; + let Inst{16-20} = CRB; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> CRD; + + let Pattern = pattern; + + let Inst{6-10} = CRD; + let Inst{11-15} = CRD; + let Inst{16-20} = CRD; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> BO; + bits<5> BI; + bits<2> BH; + + let Pattern = pattern; + + let Inst{6-10} = BO; + let Inst{11-15} = BI; + let Inst{16-18} = 0; + let Inst{19-20} = BH; + let Inst{21-30} = xo; + let Inst{31} = lk; +} + +class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + bits<7> BIBO; // 2 bits of BI and 5 bits of BO. + bits<3> CR; + + let BO = BIBO{4-0}; + let BI{0-1} = BIBO{5-6}; + let BI{2-4} = CR{0-2}; + let BH = 0; +} + +class XLForm_2_br2<bits<6> opcode, bits<10> xo, bits<5> bo, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + let BO = bo; + let BH = 0; +} + +class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo, bits<5> bi, bit lk, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> { + let BO = bo; + let BI = bi; + let BH = 0; +} + +class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<3> BFA; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-13} = BFA; + let Inst{14-15} = 0; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +// 1.7.8 XFX-Form +class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<10> SPR; + + let Inst{6-10} = RT; + let Inst{11} = SPR{4}; + let Inst{12} = SPR{3}; + let Inst{13} = SPR{2}; + let Inst{14} = SPR{1}; + let Inst{15} = SPR{0}; + let Inst{16} = SPR{9}; + let Inst{17} = SPR{8}; + let Inst{18} = SPR{7}; + let Inst{19} = SPR{6}; + let Inst{20} = SPR{5}; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> { + let SPR = spr; +} + +class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + + let 
Inst{6-10} = RT; + let Inst{11-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<8> FXM; + bits<5> rS; + + let Inst{6-10} = rS; + let Inst{11} = 0; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> ST; + bits<8> FXM; + + let Inst{6-10} = ST; + let Inst{11} = 1; + let Inst{12-19} = FXM; + let Inst{20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin> + : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>; + +class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, + dag OOL, dag IOL, string asmstr, InstrItinClass itin> + : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> { + let SPR = spr; +} + +// XFL-Form - MTFSF +// This is probably 1.7.9, but I don't have the reference that uses this +// numbering scheme... +class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag>pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<8> FM; + bits<5> rT; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6} = 0; + let Inst{7-14} = FM; + let Inst{15} = 0; + let Inst{16-20} = rT; + let Inst{21-30} = xo; + let Inst{31} = RC; +} + +// 1.7.10 XS-Form - SRADI. +class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> A; + bits<5> RS; + bits<6> SH; + + bit RC = 0; // set by isDOT + let Pattern = pattern; + + let Inst{6-10} = RS; + let Inst{11-15} = A; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + +// 1.7.11 XO-Form +class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<5> RB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21} = oe; + let Inst{22-30} = xo; + let Inst{31} = RC; +} + +class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, + dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> + : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> { + let RB = 0; +} + +// 1.7.12 A-Form +class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> FRT; + bits<5> FRA; + bits<5> FRC; + bits<5> FRB; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = FRT; + let Inst{11-15} = FRA; + let Inst{16-20} = FRB; + let Inst{21-25} = FRC; + let Inst{26-30} = xo; + let Inst{31} = RC; +} + +class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRC = 0; +} + +class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let FRB = 0; +} + +class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string 
asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RT; + bits<5> RA; + bits<5> RB; + bits<5> COND; + + let Pattern = pattern; + + let Inst{6-10} = RT; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-25} = COND; + let Inst{26-30} = xo; + let Inst{31} = 0; +} + +// 1.7.13 M-Form +class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<5> RB; + bits<5> MB; + bits<5> ME; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-25} = MB; + let Inst{26-30} = ME; + let Inst{31} = RC; +} + +class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> { +} + +// 1.7.14 MD-Form +class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<6> SH; + bits<6> MBE; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = SH{4,3,2,1,0}; + let Inst{21-26} = MBE{4,3,2,1,0,5}; + let Inst{27-29} = xo; + let Inst{30} = SH{5}; + let Inst{31} = RC; +} + +class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> RA; + bits<5> RS; + bits<5> RB; + bits<6> MBE; + + let Pattern = pattern; + + bit RC = 0; // set by isDOT + + let Inst{6-10} = RS; + let Inst{11-15} = RA; + let Inst{16-20} = RB; + let Inst{21-26} = MBE{4,3,2,1,0,5}; + let Inst{27-30} = xo; + let Inst{31} = RC; +} + + +// E-1 VA-Form + +// VAForm_1 - DACB ordering. +class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VC; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +// VAForm_1a - DABC ordering. 
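+// (The two VA classes differ only in the order the register operands are
+// listed, matching the assembly syntax: vmaddfp is written
+// "vmaddfp $vD, $vA, $vC, $vB" (DACB) while vperm is
+// "vperm $vD, $vA, $vB, $vC" (DABC); VA, VB, and VC occupy the same encoding
+// bits either way.)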
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<5> VC; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-25} = VC; + let Inst{26-31} = xo; +} + +class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bits<4> SH; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = 0; + let Inst{22-25} = SH; + let Inst{26-31} = xo; +} + +// E-2 VX-Form +class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> { + let VA = VD; + let VB = VD; +} + + +class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> IMM; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = IMM; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr. +class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = 0; + let Inst{16-20} = 0; + let Inst{21-31} = xo; +} + +/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr. 
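+/// The VD and VA fields are hard-wired to zero in the encoding below, so the
+/// instruction names only its single VB source.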
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-10} = 0; + let Inst{11-15} = 0; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + +// E-4 VXR-Form +class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VD; + bits<5> VA; + bits<5> VB; + bit RC = 0; + + let Pattern = pattern; + + let Inst{6-10} = VD; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21} = RC; + let Inst{22-31} = xo; +} + +//===----------------------------------------------------------------------===// +class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern> + : I<0, OOL, IOL, asmstr, NoItinerary> { + let isCodeGenOnly = 1; + let PPC64 = 0; + let Pattern = pattern; + let Inst{31-0} = 0; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp new file mode 100644 index 000000000000..9bac91d7d412 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -0,0 +1,2246 @@ +//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "MCTargetDesc/PPCPredicates.h" +#include "PPC.h" +#include "PPCHazardRecognizers.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-instr-info" + +#define GET_INSTRMAP_INFO +#define GET_INSTRINFO_CTOR_DTOR +#include "PPCGenInstrInfo.inc" + +static cl:: +opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden, + cl::desc("Disable analysis for CTR loops")); + +static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt", +cl::desc("Disable compare instruction optimization"), cl::Hidden); + +static cl::opt<bool> DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation", +cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden); + +static cl::opt<bool> VSXSelfCopyCrash("crash-on-ppc-vsx-self-copy", +cl::desc("Causes the backend to crash instead of generating a nop VSX copy"), +cl::Hidden); + +// Pin the vtable to this file. 
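+// (An out-of-line definition of one virtual method keys the class's vtable
+// to this translation unit instead of emitting a copy wherever the header is
+// included.)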
+void PPCInstrInfo::anchor() {} + +PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI) + : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP), + Subtarget(STI), RI(STI) {} + +/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for +/// this target when scheduling the DAG. +ScheduleHazardRecognizer * +PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const { + unsigned Directive = + static_cast<const PPCSubtarget *>(STI)->getDarwinDirective(); + if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 || + Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) { + const InstrItineraryData *II = + &static_cast<const PPCSubtarget *>(STI)->getInstrItineraryData(); + return new ScoreboardHazardRecognizer(II, DAG); + } + + return TargetInstrInfo::CreateTargetHazardRecognizer(STI, DAG); +} + +/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer +/// to use for this target when scheduling the DAG. +ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer( + const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + unsigned Directive = + DAG->TM.getSubtarget<PPCSubtarget>().getDarwinDirective(); + + if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8) + return new PPCDispatchGroupSBHazardRecognizer(II, DAG); + + // Most subtargets use a PPC970 recognizer. + if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 && + Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) { + assert(DAG->TII && "No InstrInfo?"); + + return new PPCHazardRecognizer970(*DAG); + } + + return new ScoreboardHazardRecognizer(II, DAG); +} + + +int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const { + int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx, + UseMI, UseIdx); + + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); + unsigned Reg = DefMO.getReg(); + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + bool IsRegCR; + if (TRI->isVirtualRegister(Reg)) { + const MachineRegisterInfo *MRI = + &DefMI->getParent()->getParent()->getRegInfo(); + IsRegCR = MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRRCRegClass) || + MRI->getRegClass(Reg)->hasSuperClassEq(&PPC::CRBITRCRegClass); + } else { + IsRegCR = PPC::CRRCRegClass.contains(Reg) || + PPC::CRBITRCRegClass.contains(Reg); + } + + if (UseMI->isBranch() && IsRegCR) { + if (Latency < 0) + Latency = getInstrLatency(ItinData, DefMI); + + // On some cores, there is an additional delay between writing to a condition + // register, and using it from a branch. + unsigned Directive = Subtarget.getDarwinDirective(); + switch (Directive) { + default: break; + case PPC::DIR_7400: + case PPC::DIR_750: + case PPC::DIR_970: + case PPC::DIR_E5500: + case PPC::DIR_PWR4: + case PPC::DIR_PWR5: + case PPC::DIR_PWR5X: + case PPC::DIR_PWR6: + case PPC::DIR_PWR6X: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: + Latency += 2; + break; + } + } + + return Latency; +} + +// Detect 32 -> 64-bit extensions where we may reuse the low sub-register. 
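+// For example, after "DstReg = EXTSW SrcReg" the low 32 bits of DstReg hold
+// SrcReg, so the coalescer can treat DstReg:sub_32 as a copy of SrcReg.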
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: return false; + case PPC::EXTSW: + case PPC::EXTSW_32_64: + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = PPC::sub_32; + return true; + } +} + +unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + // Note: This list must be kept consistent with LoadRegFromStackSlot. + switch (MI->getOpcode()) { + default: break; + case PPC::LD: + case PPC::LWZ: + case PPC::LFS: + case PPC::LFD: + case PPC::RESTORE_CR: + case PPC::RESTORE_CRBIT: + case PPC::LVX: + case PPC::LXVD2X: + case PPC::RESTORE_VRSAVE: + // Check for the operands added by addFrameReference (the immediate is the + // offset which defaults to 0). + if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() && + MI->getOperand(2).isFI()) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + // Note: This list must be kept consistent with StoreRegToStackSlot. + switch (MI->getOpcode()) { + default: break; + case PPC::STD: + case PPC::STW: + case PPC::STFS: + case PPC::STFD: + case PPC::SPILL_CR: + case PPC::SPILL_CRBIT: + case PPC::STVX: + case PPC::STXVD2X: + case PPC::SPILL_VRSAVE: + // Check for the operands added by addFrameReference (the immediate is the + // offset which defaults to 0). + if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() && + MI->getOperand(2).isFI()) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +// commuteInstruction - We can commute rlwimi instructions, but only if the +// rotate amt is zero. We also have to munge the immediates a bit. +MachineInstr * +PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + MachineFunction &MF = *MI->getParent()->getParent(); + + // Normal instructions can be commuted the obvious way. + if (MI->getOpcode() != PPC::RLWIMI && + MI->getOpcode() != PPC::RLWIMIo && + MI->getOpcode() != PPC::RLWIMI8 && + MI->getOpcode() != PPC::RLWIMI8o) + return TargetInstrInfo::commuteInstruction(MI, NewMI); + + // Cannot commute if it has a non-zero rotate count. + if (MI->getOperand(3).getImm() != 0) + return nullptr; + + // If we have a zero rotate count, we have: + // M = mask(MB,ME) + // Op0 = (Op1 & ~M) | (Op2 & M) + // Change this to: + // M = mask((ME+1)&31, (MB-1)&31) + // Op0 = (Op2 & ~M) | (Op1 & M) + + // Swap op1/op2 + unsigned Reg0 = MI->getOperand(0).getReg(); + unsigned Reg1 = MI->getOperand(1).getReg(); + unsigned Reg2 = MI->getOperand(2).getReg(); + unsigned SubReg1 = MI->getOperand(1).getSubReg(); + unsigned SubReg2 = MI->getOperand(2).getSubReg(); + bool Reg1IsKill = MI->getOperand(1).isKill(); + bool Reg2IsKill = MI->getOperand(2).isKill(); + bool ChangeReg0 = false; + // If machine instrs are no longer in two-address forms, update + // destination register as well. + if (Reg0 == Reg1) { + // Must be two address instruction! + assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) && + "Expecting a two-address instruction!"); + assert(MI->getOperand(0).getSubReg() == SubReg1 && "Tied subreg mismatch"); + Reg2IsKill = false; + ChangeReg0 = true; + } + + // Masks. 
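+  // Worked example (illustrative): with MB=24, ME=31 the mask covers bits
+  // 24..31 (IBM bit numbering, bit 31 = LSB); the commuted mask
+  // mask((31+1)&31, (24-1)&31) = mask(0,23) is its complement, so swapping
+  // Op1 and Op2 leaves Op0 unchanged.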
+ unsigned MB = MI->getOperand(4).getImm(); + unsigned ME = MI->getOperand(5).getImm(); + + if (NewMI) { + // Create a new instruction. + unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg(); + bool Reg0IsDead = MI->getOperand(0).isDead(); + return BuildMI(MF, MI->getDebugLoc(), MI->getDesc()) + .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead)) + .addReg(Reg2, getKillRegState(Reg2IsKill)) + .addReg(Reg1, getKillRegState(Reg1IsKill)) + .addImm((ME+1) & 31) + .addImm((MB-1) & 31); + } + + if (ChangeReg0) { + MI->getOperand(0).setReg(Reg2); + MI->getOperand(0).setSubReg(SubReg2); + } + MI->getOperand(2).setReg(Reg1); + MI->getOperand(1).setReg(Reg2); + MI->getOperand(2).setSubReg(SubReg1); + MI->getOperand(1).setSubReg(SubReg2); + MI->getOperand(2).setIsKill(Reg1IsKill); + MI->getOperand(1).setIsKill(Reg2IsKill); + + // Swap the mask around. + MI->getOperand(4).setImm((ME+1) & 31); + MI->getOperand(5).setImm((MB-1) & 31); + return MI; +} + +bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + // For VSX A-Type FMA instructions, it is the first two operands that can be + // commuted, however, because the non-encoded tied input operand is listed + // first, the operands to swap are actually the second and third. + + int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode()); + if (AltOpc == -1) + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + + SrcOpIdx1 = 2; + SrcOpIdx2 = 3; + return true; +} + +void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // This function is used for scheduling, and the nop wanted here is the type + // that terminates dispatch groups on the POWER cores. + unsigned Directive = Subtarget.getDarwinDirective(); + unsigned Opcode; + switch (Directive) { + default: Opcode = PPC::NOP; break; + case PPC::DIR_PWR6: Opcode = PPC::NOP_GT_PWR6; break; + case PPC::DIR_PWR7: Opcode = PPC::NOP_GT_PWR7; break; + case PPC::DIR_PWR8: Opcode = PPC::NOP_GT_PWR7; break; /* FIXME: Update when P8 InstrScheduling model is ready */ + } + + DebugLoc DL; + BuildMI(MBB, MI, DL, get(Opcode)); +} + +// Branch analysis. +// Note: If the condition register is set to CTR or CTR8 then this is a +// BDNZ (imm == 1) or BDZ (imm == 0) branch. +bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + bool isPPC64 = Subtarget.isPPC64(); + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (LastInst->getOpcode() == PPC::B) { + if (!LastInst->getOperand(0).isMBB()) + return true; + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else if (LastInst->getOpcode() == PPC::BCC) { + if (!LastInst->getOperand(2).isMBB()) + return true; + // Block ends with fall-through condbranch. 
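+      // (Cond is encoded as <predicate imm, condition register>; the same
+      // convention is used in the two-terminator cases below.)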
+      TBB = LastInst->getOperand(2).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BC) {
+      if (!LastInst->getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BCn) {
+      if (!LastInst->getOperand(1).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDNZ8 ||
+               LastInst->getOpcode() == PPC::BDNZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(1));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDZ8 ||
+               LastInst->getOpcode() == PPC::BDZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(0));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    }
+
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with PPC::B and PPC::BCC, handle it.
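+  // That is (roughly), assembly of the form:
+  //   bc 12, 2, .LBB0_1    ; conditional branch (BCC)
+  //   b .LBB0_2            ; unconditional branch (B)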
+  if (SecondLastInst->getOpcode() == PPC::BCC &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(2).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BC &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if (SecondLastInst->getOpcode() == PPC::BCn &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(1).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_UNSET));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDNZ) &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(1));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDZ) &&
+             LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two PPC::B branches, handle it.  The second one is
+  // not executed, so remove it.
+  if (SecondLastInst->getOpcode() == PPC::B &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ)
+    return 0;
+
+  // Remove the branch.
+ I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (I->getOpcode() != PPC::BCC && + I->getOpcode() != PPC::BC && I->getOpcode() != PPC::BCn && + I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ && + I->getOpcode() != PPC::BDZ8 && I->getOpcode() != PPC::BDZ) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +unsigned +PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 2 || Cond.size() == 0) && + "PPC branch conditions have two components!"); + + bool isPPC64 = Subtarget.isPPC64(); + + // One-way branch. + if (!FBB) { + if (Cond.empty()) // Unconditional branch + BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB); + else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_SET) + BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) + BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + else // Conditional branch + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + return 1; + } + + // Two-way Conditional Branch. + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + BuildMI(&MBB, DL, get(Cond[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_SET) + BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB); + else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET) + BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB); + else + BuildMI(&MBB, DL, get(PPC::BCC)) + .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB); + BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB); + return 2; +} + +// Select analysis. +bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg, + int &CondCycles, int &TrueCycles, int &FalseCycles) const { + if (!Subtarget.hasISEL()) + return false; + + if (Cond.size() != 2) + return false; + + // If this is really a bdnz-like condition, then it cannot be turned into a + // select. + if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8) + return false; + + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // isel is for regular integer GPRs only. + if (!PPC::GPRCRegClass.hasSubClassEq(RC) && + !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) && + !PPC::G8RCRegClass.hasSubClassEq(RC) && + !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) + return false; + + // FIXME: These numbers are for the A2, how well they work for other cores is + // an open question. On the A2, the isel instruction has a 2-cycle latency + // but single-cycle throughput. These numbers are used in combination with + // the MispredictPenalty setting from the active SchedMachineModel. 
+ CondCycles = 1; + TrueCycles = 1; + FalseCycles = 1; + + return true; +} + +void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc dl, + unsigned DestReg, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg) const { + assert(Cond.size() == 2 && + "PPC branch conditions have two components!"); + + assert(Subtarget.hasISEL() && + "Cannot insert select on target without ISEL support"); + + // Get the register classes. + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + assert(RC && "TrueReg and FalseReg must have overlapping register classes"); + + bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC); + assert((Is64Bit || + PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) && + "isel is for regular integer GPRs only"); + + unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; + unsigned SelectPred = Cond[0].getImm(); + + unsigned SubIdx; + bool SwapOps; + switch (SelectPred) { + default: llvm_unreachable("invalid predicate for isel"); + case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break; + case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; + } + + unsigned FirstReg = SwapOps ? FalseReg : TrueReg, + SecondReg = SwapOps ? TrueReg : FalseReg; + + // The first input register of isel cannot be r0. If it is a member + // of a register class that can be r0, then copy it first (the + // register allocator should eliminate the copy). + if (MRI.getRegClass(FirstReg)->contains(PPC::R0) || + MRI.getRegClass(FirstReg)->contains(PPC::X0)) { + const TargetRegisterClass *FirstRC = + MRI.getRegClass(FirstReg)->contains(PPC::X0) ? + &PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass; + unsigned OldFirstReg = FirstReg; + FirstReg = MRI.createVirtualRegister(FirstRC); + BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg) + .addReg(OldFirstReg); + } + + BuildMI(MBB, MI, dl, get(OpCode), DestReg) + .addReg(FirstReg).addReg(SecondReg) + .addReg(Cond[1].getReg(), 0, SubIdx); +} + +void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // We can end up with self copies and similar things as a result of VSX copy + // legalization. Promote them here. 
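+  // For example, a copy with an F8RC destination and a VSLRC source is
+  // retargeted below to the matching VSX super-register, so that a single
+  // VSX copy opcode can be selected; a resulting self-copy would indicate
+  // a VSX legalization bug (hence the VSXSelfCopyCrash checks below).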
+ const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (PPC::F8RCRegClass.contains(DestReg) && + PPC::VSLRCRegClass.contains(SrcReg)) { + unsigned SuperReg = + TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass); + + if (VSXSelfCopyCrash && SrcReg == SuperReg) + llvm_unreachable("nop VSX copy"); + + DestReg = SuperReg; + } else if (PPC::VRRCRegClass.contains(DestReg) && + PPC::VSHRCRegClass.contains(SrcReg)) { + unsigned SuperReg = + TRI->getMatchingSuperReg(DestReg, PPC::sub_128, &PPC::VSRCRegClass); + + if (VSXSelfCopyCrash && SrcReg == SuperReg) + llvm_unreachable("nop VSX copy"); + + DestReg = SuperReg; + } else if (PPC::F8RCRegClass.contains(SrcReg) && + PPC::VSLRCRegClass.contains(DestReg)) { + unsigned SuperReg = + TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass); + + if (VSXSelfCopyCrash && DestReg == SuperReg) + llvm_unreachable("nop VSX copy"); + + SrcReg = SuperReg; + } else if (PPC::VRRCRegClass.contains(SrcReg) && + PPC::VSHRCRegClass.contains(DestReg)) { + unsigned SuperReg = + TRI->getMatchingSuperReg(SrcReg, PPC::sub_128, &PPC::VSRCRegClass); + + if (VSXSelfCopyCrash && DestReg == SuperReg) + llvm_unreachable("nop VSX copy"); + + SrcReg = SuperReg; + } + + unsigned Opc; + if (PPC::GPRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::OR; + else if (PPC::G8RCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::OR8; + else if (PPC::F4RCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::FMR; + else if (PPC::CRRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::MCRF; + else if (PPC::VRRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::VOR; + else if (PPC::VSRCRegClass.contains(DestReg, SrcReg)) + // There are two different ways this can be done: + // 1. xxlor : This has lower latency (on the P7), 2 cycles, but can only + // issue in VSU pipeline 0. + // 2. xmovdp/xmovsp: This has higher latency (on the P7), 6 cycles, but + // can go to either pipeline. + // We'll always use xxlor here, because in practically all cases where + // copies are generated, they are close enough to some use that the + // lower-latency form is preferable. + Opc = PPC::XXLOR; + else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::XXLORf; + else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) + Opc = PPC::CROR; + else + llvm_unreachable("Impossible reg-to-reg copy"); + + const MCInstrDesc &MCID = get(Opc); + if (MCID.getNumOperands() == 3) + BuildMI(MBB, I, DL, MCID, DestReg) + .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc)); + else + BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); +} + +// This function returns true if a CR spill is necessary and false otherwise. +bool +PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF, + unsigned SrcReg, bool isKill, + int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const{ + // Note: If additional store instructions are added here, + // update isStoreToStackSlot. 
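+  // Informal summary of the mapping below: GPRC->STW, G8RC->STD, F8RC->STFD,
+  // F4RC->STFS, CRRC->SPILL_CR, CRBITRC->SPILL_CRBIT, VRRC->STVX,
+  // VSRC->STXVD2X, VSFRC->STXSDX, VRSAVERC->SPILL_VRSAVE.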
+ + DebugLoc DL; + if (PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + return true; + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CRBIT)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + return true; + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXVD2X)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXSDX)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + NonRI = true; + } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isDarwin() && + "VRSAVE only needs spill/restore on Darwin"); + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE)) + .addReg(SrcReg, + getKillRegState(isKill)), + FrameIdx)); + SpillsVRS = true; + } else { + llvm_unreachable("Unknown regclass!"); + } + + return false; +} + +void +PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + SmallVector<MachineInstr*, 4> NewMIs; + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasSpills(); + + bool NonRI = false, SpillsVRS = false; + if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs, + NonRI, SpillsVRS)) + FuncInfo->setSpillsCR(); + + if (SpillsVRS) + FuncInfo->setSpillsVRSAVE(); + + if (NonRI) + FuncInfo->setHasNonRISpills(); + + for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) + MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), + MachineMemOperand::MOStore, + MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); +} + +bool +PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const{ + // Note: If additional load instructions are added here, + // update isLoadFromStackSlot. 
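+  // Informal summary of the mapping below: GPRC->LWZ, G8RC->LD, F8RC->LFD,
+  // F4RC->LFS, CRRC->RESTORE_CR, CRBITRC->RESTORE_CRBIT, VRRC->LVX,
+  // VSRC->LXVD2X, VSFRC->LXSDX, VRSAVERC->RESTORE_VRSAVE.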
+ + if (PPC::GPRCRegClass.hasSubClassEq(RC) || + PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), + DestReg), FrameIdx)); + } else if (PPC::G8RCRegClass.hasSubClassEq(RC) || + PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg), + FrameIdx)); + } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg), + FrameIdx)); + } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg), + FrameIdx)); + } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_CR), DestReg), + FrameIdx)); + return true; + } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_CRBIT), DestReg), + FrameIdx)); + return true; + } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXVD2X), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) { + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXSDX), DestReg), + FrameIdx)); + NonRI = true; + } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isDarwin() && + "VRSAVE only needs spill/restore on Darwin"); + NewMIs.push_back(addFrameReference(BuildMI(MF, DL, + get(PPC::RESTORE_VRSAVE), + DestReg), + FrameIdx)); + SpillsVRS = true; + } else { + llvm_unreachable("Unknown regclass!"); + } + + return false; +} + +void +PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + SmallVector<MachineInstr*, 4> NewMIs; + DebugLoc DL; + if (MI != MBB.end()) DL = MI->getDebugLoc(); + + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); + FuncInfo->setHasSpills(); + + bool NonRI = false, SpillsVRS = false; + if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs, + NonRI, SpillsVRS)) + FuncInfo->setSpillsCR(); + + if (SpillsVRS) + FuncInfo->setSpillsVRSAVE(); + + if (NonRI) + FuncInfo->setHasNonRISpills(); + + for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) + MBB.insert(MI, NewMIs[i]); + + const MachineFrameInfo &MFI = *MF.getFrameInfo(); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); + NewMIs.back()->addMemOperand(MF, MMO); +} + +bool PPCInstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Invalid PPC branch opcode!"); + if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR) + Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0); + else + // Leave the CR# the same, but invert the condition. + Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm())); + return false; +} + +bool PPCInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const { + // For some instructions, it is legal to fold ZERO into the RA register field. 
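+  // Roughly, the rewrite this enables is (register names illustrative):
+  //   %vreg2 = LI 0
+  //   LWZX %vreg3, %vreg2, %vreg4   ==>   LWZX %vreg3, %ZERO, %vreg4
+  // because an RA value of r0 encodes a literal zero in PPC addressing,
+  // which the ZERO/ZERO8 registers model.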
+  // A zero immediate should always be loaded with a single li.
+  unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
+    return false;
+  if (!DefMI->getOperand(1).isImm())
+    return false;
+  if (DefMI->getOperand(1).getImm() != 0)
+    return false;
+
+  // Note that we cannot here invert the arguments of an isel in order to fold
+  // a ZERO into what is presented as the second argument. All we have here
+  // is the condition bit, and that might come from a CR-logical bit operation.
+
+  const MCInstrDesc &UseMCID = UseMI->getDesc();
+
+  // Only fold into real machine instructions.
+  if (UseMCID.isPseudo())
+    return false;
+
+  unsigned UseIdx;
+  for (UseIdx = 0; UseIdx < UseMI->getNumOperands(); ++UseIdx)
+    if (UseMI->getOperand(UseIdx).isReg() &&
+        UseMI->getOperand(UseIdx).getReg() == Reg)
+      break;
+
+  assert(UseIdx < UseMI->getNumOperands() && "Cannot find Reg in UseMI");
+  assert(UseIdx < UseMCID.getNumOperands() && "No operand description for Reg");
+
+  const MCOperandInfo *UseInfo = &UseMCID.OpInfo[UseIdx];
+
+  // We can fold the zero if this register requires a GPRC_NOR0/G8RC_NOX0
+  // register (which might also be specified as a pointer class kind).
+  if (UseInfo->isLookupPtrRegClass()) {
+    if (UseInfo->RegClass /* Kind */ != 1)
+      return false;
+  } else {
+    if (UseInfo->RegClass != PPC::GPRC_NOR0RegClassID &&
+        UseInfo->RegClass != PPC::G8RC_NOX0RegClassID)
+      return false;
+  }
+
+  // Make sure this is not tied to an output register (or otherwise
+  // constrained). This is true for ST?UX registers, for example, which
+  // are tied to their output registers.
+  if (UseInfo->Constraints != 0)
+    return false;
+
+  unsigned ZeroReg;
+  if (UseInfo->isLookupPtrRegClass()) {
+    bool isPPC64 = Subtarget.isPPC64();
+    ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
+  } else {
+    ZeroReg = UseInfo->RegClass == PPC::G8RC_NOX0RegClassID ?
+              PPC::ZERO8 : PPC::ZERO;
+  }
+
+  bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
+  UseMI->getOperand(UseIdx).setReg(ZeroReg);
+
+  if (DeleteDef)
+    DefMI->eraseFromParent();
+
+  return true;
+}
+
+static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+       I != IE; ++I)
+    if (I->definesRegister(PPC::CTR) || I->definesRegister(PPC::CTR8))
+      return true;
+  return false;
+}
+
+// We should make sure that, if we're going to predicate both sides of a
+// condition (a diamond), neither side defines the counter register. We
+// can predicate counter-decrement-based branches, but while that predicates
+// the branching, it does not predicate the counter decrement. If we tried to
+// merge the diamond into one predicated block, we'd decrement the counter
+// twice.
+bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                     unsigned NumT, unsigned ExtraT,
+                     MachineBasicBlock &FMBB,
+                     unsigned NumF, unsigned ExtraF,
+                     const BranchProbability &Probability) const {
+  return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB));
+}
+
+
+bool PPCInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // The predicated branches are identified by their type, not really by the
+  // explicit presence of a predicate. Furthermore, some of them can be
+  // predicated more than once. Because if-conversion won't try to predicate
+  // any instruction which already claims to be predicated (by returning true
+  // here), always return false. In doing so, we let isPredicable() be the
+  // final word on whether or not the instruction can be (further) predicated.
+ + return false; +} + +bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + if (!MI->isTerminator()) + return false; + + // Conditional branch is a special case. + if (MI->isBranch() && !MI->isBarrier()) + return true; + + return !isPredicated(MI); +} + +bool PPCInstrInfo::PredicateInstruction( + MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + unsigned OpC = MI->getOpcode(); + if (OpC == PPC::BLR) { + if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { + bool isPPC64 = Subtarget.isPPC64(); + MI->setDesc(get(Pred[0].getImm() ? + (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) : + (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR))); + } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { + MI->setDesc(get(PPC::BCLR)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()); + } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { + MI->setDesc(get(PPC::BCLRn)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()); + } else { + MI->setDesc(get(PPC::BCCLR)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addImm(Pred[0].getImm()) + .addReg(Pred[1].getReg()); + } + + return true; + } else if (OpC == PPC::B) { + if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) { + bool isPPC64 = Subtarget.isPPC64(); + MI->setDesc(get(Pred[0].getImm() ? + (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : + (isPPC64 ? PPC::BDZ8 : PPC::BDZ))); + } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { + MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); + MI->RemoveOperand(0); + + MI->setDesc(get(PPC::BC)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()) + .addMBB(MBB); + } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { + MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); + MI->RemoveOperand(0); + + MI->setDesc(get(PPC::BCn)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()) + .addMBB(MBB); + } else { + MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); + MI->RemoveOperand(0); + + MI->setDesc(get(PPC::BCC)); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addImm(Pred[0].getImm()) + .addReg(Pred[1].getReg()) + .addMBB(MBB); + } + + return true; + } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || + OpC == PPC::BCTRL || OpC == PPC::BCTRL8) { + if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) + llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); + + bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; + bool isPPC64 = Subtarget.isPPC64(); + + if (Pred[0].getImm() == PPC::PRED_BIT_SET) { + MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : + (setLR ? PPC::BCCTRL : PPC::BCCTR))); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()); + return true; + } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { + MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) : + (setLR ? PPC::BCCTRLn : PPC::BCCTRn))); + MachineInstrBuilder(*MI->getParent()->getParent(), MI) + .addReg(Pred[1].getReg()); + return true; + } + + MI->setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) : + (setLR ? 
PPC::BCCCTRL : PPC::BCCCTR)));
+    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  return false;
+}
+
+bool PPCInstrInfo::SubsumesPredicate(
+                     const SmallVectorImpl<MachineOperand> &Pred1,
+                     const SmallVectorImpl<MachineOperand> &Pred2) const {
+  assert(Pred1.size() == 2 && "Invalid PPC first predicate");
+  assert(Pred2.size() == 2 && "Invalid PPC second predicate");
+
+  if (Pred1[1].getReg() == PPC::CTR8 || Pred1[1].getReg() == PPC::CTR)
+    return false;
+  if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
+    return false;
+
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
+  PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
+  PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
+
+  if (P1 == P2)
+    return true;
+
+  // Does P1 subsume P2, e.g. GE subsumes GT.
+  if (P1 == PPC::PRED_LE &&
+      (P2 == PPC::PRED_LT || P2 == PPC::PRED_EQ))
+    return true;
+  if (P1 == PPC::PRED_GE &&
+      (P2 == PPC::PRED_GT || P2 == PPC::PRED_EQ))
+    return true;
+
+  return false;
+}
+
+bool PPCInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // Note: At the present time, the contents of Pred from this function are
+  // unused by IfConversion. This implementation follows ARM by pushing the
+  // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of
+  // predicate, instructions defining CTR or CTR8 are also included as
+  // predicate-defining instructions.
+
+  const TargetRegisterClass *RCs[] =
+    { &PPC::CRRCRegClass, &PPC::CRBITRCRegClass,
+      &PPC::CTRRCRegClass, &PPC::CTRRC8RegClass };
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    for (unsigned c = 0; c < array_lengthof(RCs) && !Found; ++c) {
+      const TargetRegisterClass *RC = RCs[c];
+      if (MO.isReg()) {
+        if (MO.isDef() && RC->contains(MO.getReg())) {
+          Pred.push_back(MO);
+          Found = true;
+        }
+      } else if (MO.isRegMask()) {
+        for (TargetRegisterClass::iterator I = RC->begin(),
+             IE = RC->end(); I != IE; ++I)
+          if (MO.clobbersPhysReg(*I)) {
+            Pred.push_back(MO);
+            Found = true;
+          }
+      }
+    }
+  }
+
+  return Found;
+}
+
+bool PPCInstrInfo::isPredicable(MachineInstr *MI) const {
+  unsigned OpC = MI->getOpcode();
+  switch (OpC) {
+  default:
+    return false;
+  case PPC::B:
+  case PPC::BLR:
+  case PPC::BCTR:
+  case PPC::BCTR8:
+  case PPC::BCTRL:
+  case PPC::BCTRL8:
+    return true;
+  }
+}
+
+bool PPCInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                  unsigned &SrcReg, unsigned &SrcReg2,
+                                  int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  switch (Opc) {
+  default: return false;
+  case PPC::CMPWI:
+  case PPC::CMPLWI:
+  case PPC::CMPDI:
+  case PPC::CMPLDI:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(2).getImm();
+    Mask = 0xFFFF;
+    return true;
+  case PPC::CMPW:
+  case PPC::CMPLW:
+  case PPC::CMPD:
+  case PPC::CMPLD:
+  case PPC::FCMPUS:
+  case PPC::FCMPUD:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    return true;
+  }
+}
+
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
+                                        unsigned SrcReg, unsigned SrcReg2,
+                                        int Mask, int Value,
+                                        const MachineRegisterInfo *MRI) const {
+  if (DisableCmpOpt)
+    return false;
+
+  int OpC = CmpInstr->getOpcode();
+  unsigned CRReg = CmpInstr->getOperand(0).getReg();
+
+  // FP record forms set CR1 based on the exception
status bits, not a + // comparison with zero. + if (OpC == PPC::FCMPUS || OpC == PPC::FCMPUD) + return false; + + // The record forms set the condition register based on a signed comparison + // with zero (so says the ISA manual). This is not as straightforward as it + // seems, however, because this is always a 64-bit comparison on PPC64, even + // for instructions that are 32-bit in nature (like slw for example). + // So, on PPC32, for unsigned comparisons, we can use the record forms only + // for equality checks (as those don't depend on the sign). On PPC64, + // we are restricted to equality for unsigned 64-bit comparisons and for + // signed 32-bit comparisons the applicability is more restricted. + bool isPPC64 = Subtarget.isPPC64(); + bool is32BitSignedCompare = OpC == PPC::CMPWI || OpC == PPC::CMPW; + bool is32BitUnsignedCompare = OpC == PPC::CMPLWI || OpC == PPC::CMPLW; + bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD; + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) return false; + int MIOpC = MI->getOpcode(); + + bool equalityOnly = false; + bool noSub = false; + if (isPPC64) { + if (is32BitSignedCompare) { + // We can perform this optimization only if MI is sign-extending. + if (MIOpC == PPC::SRAW || MIOpC == PPC::SRAWo || + MIOpC == PPC::SRAWI || MIOpC == PPC::SRAWIo || + MIOpC == PPC::EXTSB || MIOpC == PPC::EXTSBo || + MIOpC == PPC::EXTSH || MIOpC == PPC::EXTSHo || + MIOpC == PPC::EXTSW || MIOpC == PPC::EXTSWo) { + noSub = true; + } else + return false; + } else if (is32BitUnsignedCompare) { + // We can perform this optimization, equality only, if MI is + // zero-extending. + if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo || + MIOpC == PPC::SLW || MIOpC == PPC::SLWo || + MIOpC == PPC::SRW || MIOpC == PPC::SRWo) { + noSub = true; + equalityOnly = true; + } else + return false; + } else + equalityOnly = is64BitUnsignedCompare; + } else + equalityOnly = is32BitUnsignedCompare; + + if (equalityOnly) { + // We need to check the uses of the condition register in order to reject + // non-equality comparisons. + for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), + IE = MRI->use_instr_end(); I != IE; ++I) { + MachineInstr *UseMI = &*I; + if (UseMI->getOpcode() == PPC::BCC) { + unsigned Pred = UseMI->getOperand(0).getImm(); + if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE) + return false; + } else if (UseMI->getOpcode() == PPC::ISEL || + UseMI->getOpcode() == PPC::ISEL8) { + unsigned SubIdx = UseMI->getOperand(3).getSubReg(); + if (SubIdx != PPC::sub_eq) + return false; + } else + return false; + } + } + + MachineBasicBlock::iterator I = CmpInstr; + + // Scan forward to find the first use of the compare. + for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end(); + I != EL; ++I) { + bool FoundUse = false; + for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), + JE = MRI->use_instr_end(); J != JE; ++J) + if (&*J == &*I) { + FoundUse = true; + break; + } + + if (FoundUse) + break; + } + + // There are two possible candidates which can be changed to set CR[01]. + // One is MI, the other is a SUB instruction. + // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1). + MachineInstr *Sub = nullptr; + if (SrcReg2 != 0) + // MI is not a candidate for CMPrr. + MI = nullptr; + // FIXME: Conservatively refuse to convert an instruction which isn't in the + // same BB as the comparison. 
This is to allow the check below to avoid calls + // (and other explicit clobbers); instead we should really check for these + // more explicitly (in at least a few predecessors). + else if (MI->getParent() != CmpInstr->getParent() || Value != 0) { + // PPC does not have a record-form SUBri. + return false; + } + + // Search for Sub. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + --I; + + // Get ready to iterate backward from CmpInstr. + MachineBasicBlock::iterator E = MI, + B = CmpInstr->getParent()->begin(); + + for (; I != E && !noSub; --I) { + const MachineInstr &Instr = *I; + unsigned IOpC = Instr.getOpcode(); + + if (&*I != CmpInstr && ( + Instr.modifiesRegister(PPC::CR0, TRI) || + Instr.readsRegister(PPC::CR0, TRI))) + // This instruction modifies or uses the record condition register after + // the one we want to change. While we could do this transformation, it + // would likely not be profitable. This transformation removes one + // instruction, and so even forcing RA to generate one move probably + // makes it unprofitable. + return false; + + // Check whether CmpInstr can be made redundant by the current instruction. + if ((OpC == PPC::CMPW || OpC == PPC::CMPLW || + OpC == PPC::CMPD || OpC == PPC::CMPLD) && + (IOpC == PPC::SUBF || IOpC == PPC::SUBF8) && + ((Instr.getOperand(1).getReg() == SrcReg && + Instr.getOperand(2).getReg() == SrcReg2) || + (Instr.getOperand(1).getReg() == SrcReg2 && + Instr.getOperand(2).getReg() == SrcReg))) { + Sub = &*I; + break; + } + + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + // Return false if no candidates exist. + if (!MI && !Sub) + return false; + + // The single candidate is called MI. + if (!MI) MI = Sub; + + int NewOpC = -1; + MIOpC = MI->getOpcode(); + if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8) + NewOpC = MIOpC; + else { + NewOpC = PPC::getRecordFormOpcode(MIOpC); + if (NewOpC == -1 && PPC::getNonRecordFormOpcode(MIOpC) != -1) + NewOpC = MIOpC; + } + + // FIXME: On the non-embedded POWER architectures, only some of the record + // forms are fast, and we should use only the fast ones. + + // The defining instruction has a record form (or is already a record + // form). It is possible, however, that we'll need to reverse the condition + // code of the users. + if (NewOpC == -1) + return false; + + SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate; + SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate; + + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP + // needs to be updated to be based on SUB. Push the condition code + // operands to OperandsToUpdate. If it is safe to remove CmpInstr, the + // condition code of these operands will be modified. + bool ShouldSwap = false; + if (Sub) { + ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && + Sub->getOperand(2).getReg() == SrcReg; + + // The operands to subf are the opposite of sub, so only in the fixed-point + // case, invert the order. 
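+    // (subf RT, RA, RB computes RB - RA rather than RA - RB, so a subf whose
+    // (RA, RB) match (SrcReg, SrcReg2) in order already computes the swapped
+    // difference; hence the inversion of ShouldSwap below.)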
+ ShouldSwap = !ShouldSwap; + } + + if (ShouldSwap) + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { + MachineInstr *UseMI = &*I; + if (UseMI->getOpcode() == PPC::BCC) { + PPC::Predicate Pred = (PPC::Predicate) UseMI->getOperand(0).getImm(); + assert((!equalityOnly || + Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE) && + "Invalid predicate for equality-only optimization"); + PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)), + PPC::getSwappedPredicate(Pred))); + } else if (UseMI->getOpcode() == PPC::ISEL || + UseMI->getOpcode() == PPC::ISEL8) { + unsigned NewSubReg = UseMI->getOperand(3).getSubReg(); + assert((!equalityOnly || NewSubReg == PPC::sub_eq) && + "Invalid CR bit for equality-only optimization"); + + if (NewSubReg == PPC::sub_lt) + NewSubReg = PPC::sub_gt; + else if (NewSubReg == PPC::sub_gt) + NewSubReg = PPC::sub_lt; + + SubRegsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(3)), + NewSubReg)); + } else // We need to abort on a user we don't understand. + return false; + } + + // Create a new virtual register to hold the value of the CR set by the + // record-form instruction. If the instruction was not previously in + // record form, then set the kill flag on the CR. + CmpInstr->eraseFromParent(); + + MachineBasicBlock::iterator MII = MI; + BuildMI(*MI->getParent(), std::next(MII), MI->getDebugLoc(), + get(TargetOpcode::COPY), CRReg) + .addReg(PPC::CR0, MIOpC != NewOpC ? RegState::Kill : 0); + + if (MIOpC != NewOpC) { + // We need to be careful here: we're replacing one instruction with + // another, and we need to make sure that we get all of the right + // implicit uses and defs. On the other hand, the caller may be holding + // an iterator to this instruction, and so we can't delete it (this is + // specifically the case if this is the instruction directly after the + // compare). + + const MCInstrDesc &NewDesc = get(NewOpC); + MI->setDesc(NewDesc); + + if (NewDesc.ImplicitDefs) + for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + *ImpDefs; ++ImpDefs) + if (!MI->definesRegister(*ImpDefs)) + MI->addOperand(*MI->getParent()->getParent(), + MachineOperand::CreateReg(*ImpDefs, true, true)); + if (NewDesc.ImplicitUses) + for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + *ImpUses; ++ImpUses) + if (!MI->readsRegister(*ImpUses)) + MI->addOperand(*MI->getParent()->getParent(), + MachineOperand::CreateReg(*ImpUses, false, true)); + } + + // Modify the condition code of operands in OperandsToUpdate. + // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to + // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. + for (unsigned i = 0, e = PredsToUpdate.size(); i < e; i++) + PredsToUpdate[i].first->setImm(PredsToUpdate[i].second); + + for (unsigned i = 0, e = SubRegsToUpdate.size(); i < e; i++) + SubRegsToUpdate[i].first->setSubReg(SubRegsToUpdate[i].second); + + return true; +} + +/// GetInstSize - Return the number of bytes of code the specified +/// instruction may be. This returns the maximum number of bytes. 
+///
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == PPC::INLINEASM) {
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  } else {
+    const MCInstrDesc &Desc = get(Opcode);
+    return Desc.getSize();
+  }
+}
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "ppc-vsx-fma-mutate"
+
+namespace {
+  // PPCVSXFMAMutate pass - The default (A-type) VSX FMA opcodes kill their
+  // addend operand. When the addend comes from a copy and one of the product
+  // operands is killed instead, switch to the M-type form so that the copy
+  // can be eliminated.
+  struct PPCVSXFMAMutate : public MachineFunctionPass {
+    static char ID;
+    PPCVSXFMAMutate() : MachineFunctionPass(ID) {
+      initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+    }
+
+    LiveIntervals *LIS;
+
+    const PPCTargetMachine *TM;
+    const PPCInstrInfo *TII;
+
+protected:
+    bool processBlock(MachineBasicBlock &MBB) {
+      bool Changed = false;
+
+      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+      for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
+           I != IE; ++I) {
+        MachineInstr *MI = I;
+
+        // The default (A-type) VSX FMA form kills the addend (it is taken from
+        // the target register, which is then updated to reflect the result of
+        // the FMA). If the instruction, however, kills one of the registers
+        // used for the product, then we can use the M-form instruction (which
+        // will take that value from the to-be-defined register).
+
+        int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
+        if (AltOpc == -1)
+          continue;
+
+        // This pass is run after register coalescing, and so we're looking for
+        // a situation like this:
+        //   ...
+        //   %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //     %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        //   ...
+        //   %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19,
+        //     %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19
+        //   ...
+        // Where we can eliminate the copy by changing from the A-type to the
+        // M-type instruction. Specifically, for this example, this means:
+        //   %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16,
+        //     %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16
+        // is replaced by:
+        //   %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9,
+        //     %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9
+        // and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9
+
+        SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
+
+        VNInfo *AddendValNo =
+          LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
+        MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
+
+        // The addend and this instruction must be in the same block.
+
+        if (!AddendMI || AddendMI->getParent() != MI->getParent())
+          continue;
+
+        // The addend must be a full copy within the same register class.
+
+        if (!AddendMI->isFullCopy())
+          continue;
+
+        unsigned AddendSrcReg = AddendMI->getOperand(1).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) {
+          if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
+              MRI.getRegClass(AddendSrcReg))
+            continue;
+        } else {
+          // If AddendSrcReg is a physical register, make sure the destination
+          // register class contains it.
+ if (!MRI.getRegClass(AddendMI->getOperand(0).getReg()) + ->contains(AddendSrcReg)) + continue; + } + + // In theory, there could be other uses of the addend copy before this + // fma. We could deal with this, but that would require additional + // logic below and I suspect it will not occur in any relevant + // situations. + bool OtherUsers = false; + for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI); + J != JE; --J) + if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) { + OtherUsers = true; + break; + } + + if (OtherUsers) + continue; + + // Find one of the product operands that is killed by this instruction. + + unsigned KilledProdOp = 0, OtherProdOp = 0; + if (LIS->getInterval(MI->getOperand(2).getReg()) + .Query(FMAIdx).isKill()) { + KilledProdOp = 2; + OtherProdOp = 3; + } else if (LIS->getInterval(MI->getOperand(3).getReg()) + .Query(FMAIdx).isKill()) { + KilledProdOp = 3; + OtherProdOp = 2; + } + + // If there are no killed product operands, then this transformation is + // likely not profitable. + if (!KilledProdOp) + continue; + + // In order to replace the addend here with the source of the copy, + // it must still be live here. + if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx)) + continue; + + // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. + + unsigned AddReg = AddendMI->getOperand(1).getReg(); + unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg(); + unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg(); + + unsigned AddSubReg = AddendMI->getOperand(1).getSubReg(); + unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg(); + unsigned OtherProdSubReg = MI->getOperand(OtherProdOp).getSubReg(); + + bool AddRegKill = AddendMI->getOperand(1).isKill(); + bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill(); + bool OtherProdRegKill = MI->getOperand(OtherProdOp).isKill(); + + bool AddRegUndef = AddendMI->getOperand(1).isUndef(); + bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef(); + bool OtherProdRegUndef = MI->getOperand(OtherProdOp).isUndef(); + + unsigned OldFMAReg = MI->getOperand(0).getReg(); + + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && + "Addend copy not tied to old FMA output!"); + + DEBUG(dbgs() << "VSX FMA Mutation:\n " << *MI;); + + MI->getOperand(0).setReg(KilledProdReg); + MI->getOperand(1).setReg(KilledProdReg); + MI->getOperand(3).setReg(AddReg); + MI->getOperand(2).setReg(OtherProdReg); + + MI->getOperand(0).setSubReg(KilledProdSubReg); + MI->getOperand(1).setSubReg(KilledProdSubReg); + MI->getOperand(3).setSubReg(AddSubReg); + MI->getOperand(2).setSubReg(OtherProdSubReg); + + MI->getOperand(1).setIsKill(KilledProdRegKill); + MI->getOperand(3).setIsKill(AddRegKill); + MI->getOperand(2).setIsKill(OtherProdRegKill); + + MI->getOperand(1).setIsUndef(KilledProdRegUndef); + MI->getOperand(3).setIsUndef(AddRegUndef); + MI->getOperand(2).setIsUndef(OtherProdRegUndef); + + MI->setDesc(TII->get(AltOpc)); + + DEBUG(dbgs() << " -> " << *MI); + + // The killed product operand was killed here, so we can reuse it now + // for the result of the fma. + + LiveInterval &FMAInt = LIS->getInterval(OldFMAReg); + VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot()); + for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end(); + UI != UE;) { + MachineOperand &UseMO = *UI; + MachineInstr *UseMI = UseMO.getParent(); + ++UI; + + // Don't replace the result register of the copy we're about to erase. 
+ if (UseMI == AddendMI) + continue; + + UseMO.setReg(KilledProdReg); + UseMO.setSubReg(KilledProdSubReg); + } + + // Extend the live intervals of the killed product operand to hold the + // fma result. + + LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg); + for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end(); + AI != AE; ++AI) { + // Don't add the segment that corresponds to the original copy. + if (AI->valno == AddendValNo) + continue; + + VNInfo *NewFMAValNo = + NewFMAInt.getNextValue(AI->start, + LIS->getVNInfoAllocator()); + + NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end, + NewFMAValNo)); + } + DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); + + FMAInt.removeValNo(FMAValNo); + DEBUG(dbgs() << " trimmed: " << FMAInt << '\n'); + + // Remove the (now unused) copy. + + DEBUG(dbgs() << " removing: " << *AddendMI << '\n'); + LIS->RemoveMachineInstrFromMaps(AddendMI); + AddendMI->eraseFromParent(); + + Changed = true; + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast<const PPCTargetMachine *>(&MF.getTarget()); + // If we don't have VSX then go ahead and return without doing + // anything. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; + + LIS = &getAnalysis<LiveIntervals>(); + + TII = TM->getInstrInfo(); + + bool Changed = false; + + if (DisableVSXFMAMutate) + return Changed; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.addPreserved<LiveIntervals>(); + AU.addRequired<SlotIndexes>(); + AU.addPreserved<SlotIndexes>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE, + "PowerPC VSX FMA Mutation", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, + "PowerPC VSX FMA Mutation", false, false) + +char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID; + +char PPCVSXFMAMutate::ID = 0; +FunctionPass* +llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); } + +#undef DEBUG_TYPE +#define DEBUG_TYPE "ppc-vsx-copy" + +namespace llvm { + void initializePPCVSXCopyPass(PassRegistry&); +} + +namespace { + // PPCVSXCopy pass - For copies between VSX registers and non-VSX registers + // (Altivec and scalar floating-point registers), we need to transform the + // copies into subregister copies with other restrictions. 
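+  // Sketch of the rewrite for a copy *to* a VSX register from an Altivec one
+  // (register names illustrative):
+  //   %vsreg = COPY %vrreg
+  // becomes
+  //   %tmp   = SUBREG_TO_REG 1, %vrreg, sub_128
+  //   %vsreg = COPY %tmp
+  // The from-VSX direction is handled symmetrically with a subregister
+  // extraction.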
+ struct PPCVSXCopy : public MachineFunctionPass { + static char ID; + PPCVSXCopy() : MachineFunctionPass(ID) { + initializePPCVSXCopyPass(*PassRegistry::getPassRegistry()); + } + + const PPCTargetMachine *TM; + const PPCInstrInfo *TII; + + bool IsRegInClass(unsigned Reg, const TargetRegisterClass *RC, + MachineRegisterInfo &MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + return RC->hasSubClassEq(MRI.getRegClass(Reg)); + } else if (RC->contains(Reg)) { + return true; + } + + return false; + } + + bool IsVSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSRCRegClass, MRI); + } + + bool IsVRReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VRRCRegClass, MRI); + } + + bool IsF8Reg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); + } + +protected: + bool processBlock(MachineBasicBlock &MBB) { + bool Changed = false; + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); + I != IE; ++I) { + MachineInstr *MI = I; + if (!MI->isFullCopy()) + continue; + + MachineOperand &DstMO = MI->getOperand(0); + MachineOperand &SrcMO = MI->getOperand(1); + + if ( IsVSReg(DstMO.getReg(), MRI) && + !IsVSReg(SrcMO.getReg(), MRI)) { + // This is a copy *to* a VSX register from a non-VSX register. + Changed = true; + + const TargetRegisterClass *SrcRC = + IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass : + &PPC::VSLRCRegClass; + assert((IsF8Reg(SrcMO.getReg(), MRI) || + IsVRReg(SrcMO.getReg(), MRI)) && + "Unknown source for a VSX copy"); + + unsigned NewVReg = MRI.createVirtualRegister(SrcRC); + BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg) + .addImm(1) // add 1, not 0, because there is no implicit clearing + // of the high bits. + .addOperand(SrcMO) + .addImm(IsVRReg(SrcMO.getReg(), MRI) ? PPC::sub_128 : + PPC::sub_64); + + // The source of the original copy is now the new virtual register. + SrcMO.setReg(NewVReg); + } else if (!IsVSReg(DstMO.getReg(), MRI) && + IsVSReg(SrcMO.getReg(), MRI)) { + // This is a copy *from* a VSX register to a non-VSX register. + Changed = true; + + const TargetRegisterClass *DstRC = + IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass : + &PPC::VSLRCRegClass; + assert((IsF8Reg(DstMO.getReg(), MRI) || + IsVRReg(DstMO.getReg(), MRI)) && + "Unknown destination for a VSX copy"); + + // Copy the VSX value into a new VSX register of the correct subclass. + unsigned NewVReg = MRI.createVirtualRegister(DstRC); + BuildMI(MBB, MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg) + .addOperand(SrcMO); + + // Transform the original copy into a subregister extraction copy. + SrcMO.setReg(NewVReg); + SrcMO.setSubReg(IsVRReg(DstMO.getReg(), MRI) ? PPC::sub_128 : + PPC::sub_64); + } + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast<const PPCTargetMachine *>(&MF.getTarget()); + // If we don't have VSX on the subtarget, don't do anything. 
+ if (!TM->getSubtargetImpl()->hasVSX()) + return false; + TII = TM->getInstrInfo(); + + bool Changed = false; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS(PPCVSXCopy, DEBUG_TYPE, + "PowerPC VSX Copy Legalization", false, false) + +char PPCVSXCopy::ID = 0; +FunctionPass* +llvm::createPPCVSXCopyPass() { return new PPCVSXCopy(); } + +#undef DEBUG_TYPE +#define DEBUG_TYPE "ppc-vsx-copy-cleanup" + +namespace llvm { + void initializePPCVSXCopyCleanupPass(PassRegistry&); +} + +namespace { + // PPCVSXCopyCleanup pass - We sometimes end up generating self copies of VSX + // registers (mostly because the ABI code still places all values into the + // "traditional" floating-point and vector registers). Remove them here. + struct PPCVSXCopyCleanup : public MachineFunctionPass { + static char ID; + PPCVSXCopyCleanup() : MachineFunctionPass(ID) { + initializePPCVSXCopyCleanupPass(*PassRegistry::getPassRegistry()); + } + + const PPCTargetMachine *TM; + const PPCInstrInfo *TII; + +protected: + bool processBlock(MachineBasicBlock &MBB) { + bool Changed = false; + + SmallVector<MachineInstr *, 4> ToDelete; + for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); + I != IE; ++I) { + MachineInstr *MI = I; + if (MI->getOpcode() == PPC::XXLOR && + MI->getOperand(0).getReg() == MI->getOperand(1).getReg() && + MI->getOperand(0).getReg() == MI->getOperand(2).getReg()) + ToDelete.push_back(MI); + } + + if (!ToDelete.empty()) + Changed = true; + + for (unsigned i = 0, ie = ToDelete.size(); i != ie; ++i) { + DEBUG(dbgs() << "Removing VSX self-copy: " << *ToDelete[i]); + ToDelete[i]->eraseFromParent(); + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast<const PPCTargetMachine *>(&MF.getTarget()); + // If we don't have VSX don't bother doing anything here. + if (!TM->getSubtargetImpl()->hasVSX()) + return false; + TII = TM->getInstrInfo(); + + bool Changed = false; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS(PPCVSXCopyCleanup, DEBUG_TYPE, + "PowerPC VSX Copy Cleanup", false, false) + +char PPCVSXCopyCleanup::ID = 0; +FunctionPass* +llvm::createPPCVSXCopyCleanupPass() { return new PPCVSXCopyCleanup(); } + +#undef DEBUG_TYPE +#define DEBUG_TYPE "ppc-early-ret" +STATISTIC(NumBCLR, "Number of early conditional returns"); +STATISTIC(NumBLR, "Number of early returns"); + +namespace llvm { + void initializePPCEarlyReturnPass(PassRegistry&); +} + +namespace { + // PPCEarlyReturn pass - For simple functions without epilogue code, move + // returns up, and create conditional returns, to avoid unnecessary + // branch-to-blr sequences. 
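+  // In MI terms, for a return block containing only a BLR:
+  //   B <ret-block>              ==>  BLR
+  //   BCC pred, cr, <ret-block>  ==>  BCCLR pred, cr
+  //   BC/BCn bit, <ret-block>    ==>  BCLR/BCLRn bit
+  // and the return block itself is erased once nothing references it.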
+ struct PPCEarlyReturn : public MachineFunctionPass { + static char ID; + PPCEarlyReturn() : MachineFunctionPass(ID) { + initializePPCEarlyReturnPass(*PassRegistry::getPassRegistry()); + } + + const PPCTargetMachine *TM; + const PPCInstrInfo *TII; + +protected: + bool processBlock(MachineBasicBlock &ReturnMBB) { + bool Changed = false; + + MachineBasicBlock::iterator I = ReturnMBB.begin(); + I = ReturnMBB.SkipPHIsAndLabels(I); + + // The block must be essentially empty except for the blr. + if (I == ReturnMBB.end() || I->getOpcode() != PPC::BLR || + I != ReturnMBB.getLastNonDebugInstr()) + return Changed; + + SmallVector<MachineBasicBlock*, 8> PredToRemove; + for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(), + PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) { + bool OtherReference = false, BlockChanged = false; + for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) { + if (J->getOpcode() == PPC::B) { + if (J->getOperand(0).getMBB() == &ReturnMBB) { + // This is an unconditional branch to the return. Replace the + // branch with a blr. + BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BLR)); + MachineBasicBlock::iterator K = J--; + K->eraseFromParent(); + BlockChanged = true; + ++NumBLR; + continue; + } + } else if (J->getOpcode() == PPC::BCC) { + if (J->getOperand(2).getMBB() == &ReturnMBB) { + // This is a conditional branch to the return. Replace the branch + // with a bclr. + BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) + .addImm(J->getOperand(0).getImm()) + .addReg(J->getOperand(1).getReg()); + MachineBasicBlock::iterator K = J--; + K->eraseFromParent(); + BlockChanged = true; + ++NumBCLR; + continue; + } + } else if (J->getOpcode() == PPC::BC || J->getOpcode() == PPC::BCn) { + if (J->getOperand(1).getMBB() == &ReturnMBB) { + // This is a conditional branch to the return. Replace the branch + // with a bclr. + BuildMI(**PI, J, J->getDebugLoc(), + TII->get(J->getOpcode() == PPC::BC ? + PPC::BCLR : PPC::BCLRn)) + .addReg(J->getOperand(0).getReg()); + MachineBasicBlock::iterator K = J--; + K->eraseFromParent(); + BlockChanged = true; + ++NumBCLR; + continue; + } + } else if (J->isBranch()) { + if (J->isIndirectBranch()) { + if (ReturnMBB.hasAddressTaken()) + OtherReference = true; + } else + for (unsigned i = 0; i < J->getNumOperands(); ++i) + if (J->getOperand(i).isMBB() && + J->getOperand(i).getMBB() == &ReturnMBB) + OtherReference = true; + } else if (!J->isTerminator() && !J->isDebugValue()) + break; + + if (J == (*PI)->begin()) + break; + + --J; + } + + if ((*PI)->canFallThrough() && (*PI)->isLayoutSuccessor(&ReturnMBB)) + OtherReference = true; + + // Predecessors are stored in a vector and can't be removed here. + if (!OtherReference && BlockChanged) { + PredToRemove.push_back(*PI); + } + + if (BlockChanged) + Changed = true; + } + + for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) + PredToRemove[i]->removeSuccessor(&ReturnMBB); + + if (Changed && !ReturnMBB.hasAddressTaken()) { + // We now might be able to merge this blr-only block into its + // by-layout predecessor. + if (ReturnMBB.pred_size() == 1 && + (*ReturnMBB.pred_begin())->isLayoutSuccessor(&ReturnMBB)) { + // Move the blr into the preceding block. 
+ MachineBasicBlock &PrevMBB = **ReturnMBB.pred_begin(); + PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I); + PrevMBB.removeSuccessor(&ReturnMBB); + } + + if (ReturnMBB.pred_empty()) + ReturnMBB.eraseFromParent(); + } + + return Changed; + } + +public: + bool runOnMachineFunction(MachineFunction &MF) override { + TM = static_cast<const PPCTargetMachine *>(&MF.getTarget()); + TII = TM->getInstrInfo(); + + bool Changed = false; + + // If the function does not have at least two blocks, then there is + // nothing to do. + if (MF.size() < 2) + return Changed; + + for (MachineFunction::iterator I = MF.begin(); I != MF.end();) { + MachineBasicBlock &B = *I++; + if (processBlock(B)) + Changed = true; + } + + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +INITIALIZE_PASS(PPCEarlyReturn, DEBUG_TYPE, + "PowerPC Early-Return Creation", false, false) + +char PPCEarlyReturn::ID = 0; +FunctionPass* +llvm::createPPCEarlyReturnPass() { return new PPCEarlyReturn(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h new file mode 100644 index 000000000000..83f14c6cf214 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -0,0 +1,235 @@ +//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC_INSTRUCTIONINFO_H +#define POWERPC_INSTRUCTIONINFO_H + +#include "PPC.h" +#include "PPCRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "PPCGenInstrInfo.inc" + +namespace llvm { + +/// PPCII - This namespace holds all of the PowerPC target-specific +/// per-instruction flags. These must match the corresponding definitions in +/// PPC.td and PPCInstrFormats.td. +namespace PPCII { +enum { + // PPC970 Instruction Flags. These flags describe the characteristics of the + // PowerPC 970 (aka G5) dispatch groups and how they are formed out of + // raw machine instructions. + + /// PPC970_First - This instruction starts a new dispatch group, so it will + /// always be the first one in the group. + PPC970_First = 0x1, + + /// PPC970_Single - This instruction starts a new dispatch group and + /// terminates it, so it will be the sole instruction in the group. + PPC970_Single = 0x2, + + /// PPC970_Cracked - This instruction is cracked into two pieces, requiring + /// two dispatch pipes to be available to issue. + PPC970_Cracked = 0x4, + + /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that + /// an instruction is issued to. + PPC970_Shift = 3, + PPC970_Mask = 0x07 << PPC970_Shift +}; +enum PPC970_Unit { + /// These are the various PPC970 execution unit pipelines. Each instruction + /// is one of these. 
+ PPC970_Pseudo = 0 << PPC970_Shift, // Pseudo instruction + PPC970_FXU = 1 << PPC970_Shift, // Fixed Point (aka Integer/ALU) Unit + PPC970_LSU = 2 << PPC970_Shift, // Load Store Unit + PPC970_FPU = 3 << PPC970_Shift, // Floating Point Unit + PPC970_CRU = 4 << PPC970_Shift, // Control Register Unit + PPC970_VALU = 5 << PPC970_Shift, // Vector ALU + PPC970_VPERM = 6 << PPC970_Shift, // Vector Permute Unit + PPC970_BRU = 7 << PPC970_Shift // Branch Unit +}; +} // end namespace PPCII + + +class PPCInstrInfo : public PPCGenInstrInfo { + PPCSubtarget &Subtarget; + const PPCRegisterInfo RI; + + bool StoreRegToStackSlot(MachineFunction &MF, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const; + bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + SmallVectorImpl<MachineInstr*> &NewMIs, + bool &NonRI, bool &SpillsVRS) const; + virtual void anchor(); +public: + explicit PPCInstrInfo(PPCSubtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + const PPCRegisterInfo &getRegisterInfo() const { return RI; } + + ScheduleHazardRecognizer * + CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI, + const ScheduleDAG *DAG) const override; + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const override; + + int getOperandLatency(const InstrItineraryData *ItinData, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, + unsigned UseIdx) const override; + int getOperandLatency(const InstrItineraryData *ItinData, + SDNode *DefNode, unsigned DefIdx, + SDNode *UseNode, unsigned UseIdx) const override { + return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx, + UseNode, UseIdx); + } + + bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const override; + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + // commuteInstruction - We can commute rlwimi instructions, but only if the + // rotate amt is zero. We also have to munge the immediates a bit. + MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; + + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const override; + + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + + + // Branch analysis. + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const override; + + // Select analysis. 
+ bool canInsertSelect(const MachineBasicBlock&, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned, unsigned, int&, int&, int&) const override; + void insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl<MachineOperand> &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + + void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + + bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, + unsigned Reg, MachineRegisterInfo *MRI) const override; + + // If conversion by predication (only supported by some branch instructions). + // All of the profitability checks always return true; it is always + // profitable to use the predicated branches. + bool isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + const BranchProbability &Probability) const override { + return true; + } + + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumT, unsigned ExtraT, + MachineBasicBlock &FMBB, + unsigned NumF, unsigned ExtraF, + const BranchProbability &Probability) const override; + + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, + const BranchProbability + &Probability) const override { + return true; + } + + bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const override { + return false; + } + + // Predication support. + bool isPredicated(const MachineInstr *MI) const override; + + bool isUnpredicatedTerminator(const MachineInstr *MI) const override; + + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const override; + + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const override; + + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const override; + + bool isPredicable(MachineInstr *MI) const override; + + // Comparison optimization. + + + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; + + bool optimizeCompareInstr(MachineInstr *CmpInstr, + unsigned SrcReg, unsigned SrcReg2, + int Mask, int Value, + const MachineRegisterInfo *MRI) const override; + + /// GetInstSize - Return the number of bytes of code the specified + /// instruction may be. This returns the maximum number of bytes. 
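The commuteInstruction remark above ("only if the rotate amt is zero... munge the immediates a bit") follows from rlwimi's semantics: with a zero shift, rA = (rS & MASK(MB, ME)) | (rA & ~MASK(MB, ME)), and the complement of a proper (non-full) PowerPC mask MASK(MB, ME) is again a single wrap-around run, MASK(ME+1, MB-1). So the two register operands can swap roles if the mask immediates are rewritten accordingly. The following self-contained check of that identity is plain C++ written for this note, not LLVM code.

    #include <cassert>
    #include <cstdint>

    // MASK(MB, ME) from the PowerPC ISA: ones from bit MB through bit ME,
    // bit 0 being the most significant; the run wraps around when MB > ME.
    static uint32_t ppcMask(unsigned MB, unsigned ME) {
      uint32_t FromMB = 0xFFFFFFFFu >> MB;          // bits MB..31 set
      uint32_t UpToME = 0xFFFFFFFFu << (31 - ME);   // bits 0..ME set
      return MB <= ME ? (FromMB & UpToME) : (FromMB | UpToME);
    }

    // rlwimi rA, rS, SH, MB, ME: rotate rS left by SH, insert under the mask.
    static uint32_t rlwimi(uint32_t rA, uint32_t rS, unsigned SH,
                           unsigned MB, unsigned ME) {
      uint32_t Rot = SH ? (rS << SH) | (rS >> (32 - SH)) : rS;
      uint32_t M = ppcMask(MB, ME);
      return (Rot & M) | (rA & ~M);
    }

    int main() {
      // With SH == 0, swapping the register operands is legal provided the
      // mask is complemented: (MB, ME) becomes (ME + 1, MB - 1), mod 32.
      uint32_t A = 0x12345678, S = 0xCAFEBABE;
      unsigned MB = 8, ME = 23;
      assert(rlwimi(A, S, 0, MB, ME) ==
             rlwimi(S, A, 0, (ME + 1) % 32, (MB + 31) % 32));
      return 0;
    }

The full mask (MB = 0, ME = 31) has no complementary run, so that degenerate case cannot be expressed this way.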
+ /// + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td new file mode 100644 index 000000000000..636ac5d571fa --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -0,0 +1,3424 @@ +//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the subset of the 32-bit PowerPC instruction set, as used +// by the PowerPC instruction selector. +// +//===----------------------------------------------------------------------===// + +include "PPCInstrFormats.td" + +//===----------------------------------------------------------------------===// +// PowerPC specific type constraints. +// +def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx + SDTCisVT<0, f64>, SDTCisPtrTy<1> +]>; +def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x + SDTCisVT<0, f64>, SDTCisPtrTy<1> +]>; + +def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; +def SDT_PPCvperm : SDTypeProfile<1, 3, [ + SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2> +]>; + +def SDT_PPCvcmp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32> +]>; + +def SDT_PPCcondbr : SDTypeProfile<0, 3, [ + SDTCisVT<0, i32>, SDTCisVT<2, OtherVT> +]>; + +def SDT_PPClbrx : SDTypeProfile<1, 2, [ + SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT> +]>; +def SDT_PPCstbrx : SDTypeProfile<0, 3, [ + SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT> +]>; + +def SDT_PPClarx : SDTypeProfile<1, 1, [ + SDTCisInt<0>, SDTCisPtrTy<1> +]>; +def SDT_PPCstcx : SDTypeProfile<0, 2, [ + SDTCisInt<0>, SDTCisPtrTy<1> +]>; + +def SDT_PPCTC_ret : SDTypeProfile<0, 2, [ + SDTCisPtrTy<0>, SDTCisVT<1, i32> +]>; + +def tocentry32 : Operand<iPTR> { + let MIOperandInfo = (ops i32imm:$imm); +} + +//===----------------------------------------------------------------------===// +// PowerPC specific DAG Nodes. +// + +def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; +def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + +def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; +def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; +def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; +def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>; +def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; +def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; +def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; +def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, + [SDNPHasChain, SDNPMayStore]>; +def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, + [SDNPHasChain, SDNPMayLoad]>; +def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, + [SDNPHasChain, SDNPMayLoad]>; + +// Extract FPSCR (not modeled at the DAG level). +def PPCmffs : SDNode<"PPCISD::MFFS", + SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>; + +// Perform FADD in round-to-zero mode. +def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; + + +def PPCfsel : SDNode<"PPCISD::FSEL", + // Type constraint for fsel. 
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; +def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; +def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>; +def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>; +def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>; + +def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>; + +def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; +def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, + [SDNPMayLoad]>; +def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; +def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; +def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; +def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; +def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; +def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; +def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; +def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp, + [SDNPHasChain]>; +def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; + +def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift +// amounts. These nodes are generated by the multi-precision shift code. +def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>; +def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>; +def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>; + +// These are target-independent nodes, but have target-specific formats. +def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; +def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPSideEffect, + SDNPInGlue, SDNPOutGlue]>; +def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + +def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc, + [SDNPHasChain, SDNPSideEffect]>; + +def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; +def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>; + +def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, + [SDNPHasChain, 
SDNPOptInGlue]>; + +def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, + [SDNPHasChain, SDNPMayLoad]>; +def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, + [SDNPHasChain, SDNPMayStore]>; + +// Instructions to set/unset CR bit 6 for SVR4 vararg calls +def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// Instructions to support atomic operations +def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx, + [SDNPHasChain, SDNPMayLoad]>; +def PPCstcx : SDNode<"PPCISD::STCX", SDT_PPCstcx, + [SDNPHasChain, SDNPMayStore]>; + +// Instructions to support medium and large code model +def PPCaddisTocHA : SDNode<"PPCISD::ADDIS_TOC_HA", SDTIntBinOp, []>; +def PPCldTocL : SDNode<"PPCISD::LD_TOC_L", SDTIntBinOp, [SDNPMayLoad]>; +def PPCaddiTocL : SDNode<"PPCISD::ADDI_TOC_L", SDTIntBinOp, []>; + + +// Instructions to support dynamic alloca. +def SDTDynOp : SDTypeProfile<1, 2, []>; +def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// PowerPC specific transformation functions and pattern fragments. +// + +def SHL32 : SDNodeXForm<imm, [{ + // Transformation function: 31 - imm + return getI32Imm(31 - N->getZExtValue()); +}]>; + +def SRL32 : SDNodeXForm<imm, [{ + // Transformation function: 32 - imm + return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue()) : getI32Imm(0); +}]>; + +def LO16 : SDNodeXForm<imm, [{ + // Transformation function: get the low 16 bits. + return getI32Imm((unsigned short)N->getZExtValue()); +}]>; + +def HI16 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + return getI32Imm((unsigned)N->getZExtValue() >> 16); +}]>; + +def HA16 : SDNodeXForm<imm, [{ + // Transformation function: shift the immediate value down into the low bits. + signed int Val = N->getZExtValue(); + return getI32Imm((Val - (signed short)Val) >> 16); +}]>; +def MB : SDNodeXForm<imm, [{ + // Transformation function: get the start bit of a mask + unsigned mb = 0, me; + (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me); + return getI32Imm(mb); +}]>; + +def ME : SDNodeXForm<imm, [{ + // Transformation function: get the end bit of a mask + unsigned mb, me = 0; + (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me); + return getI32Imm(me); +}]>; +def maskimm32 : PatLeaf<(imm), [{ + // maskImm predicate - True if immediate is a run of ones. + unsigned mb, me; + if (N->getValueType(0) == MVT::i32) + return isRunOfOnes((unsigned)N->getZExtValue(), mb, me); + else + return false; +}]>; + +def imm32SExt16 : Operand<i32>, ImmLeaf<i32, [{ + // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit + // sign extended field. Used by instructions like 'addi'. + return (int32_t)Imm == (short)Imm; +}]>; +def imm64SExt16 : Operand<i64>, ImmLeaf<i64, [{ + // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit + // sign extended field. Used by instructions like 'addi'. + return (int64_t)Imm == (short)Imm; +}]>; +def immZExt16 : PatLeaf<(imm), [{ + // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended + // field. Used by instructions like 'ori'. + return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue(); +}], LO16>; + +// imm16Shifted* - These match immediates where the low 16-bits are zero. There +// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. 
These two forms are +// identical in 32-bit mode, but in 64-bit mode, they return true if the +// immediate fits into a sign/zero extended 32-bit immediate (with the low bits +// clear). +def imm16ShiftedZExt : PatLeaf<(imm), [{ + // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the + // immediate are set. Used by instructions like 'xoris'. + return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0; +}], HI16>; + +def imm16ShiftedSExt : PatLeaf<(imm), [{ + // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the + // immediate are set. Used by instructions like 'addis'. Identical to + // imm16ShiftedZExt in 32-bit mode. + if (N->getZExtValue() & 0xFFFF) return false; + if (N->getValueType(0) == MVT::i32) + return true; + // For 64-bit, make sure it is sext right. + return N->getZExtValue() == (uint64_t)(int)N->getZExtValue(); +}], HI16>; + +def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{ + // imm64ZExt32 predicate - True if the i64 immediate fits in a 32-bit + // zero extended field. + return isUInt<32>(Imm); +}]>; + +// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require +// restricted memrix (4-aligned) constants are alignment sensitive. If these +// offsets are hidden behind TOC entries then the values of the lower-order +// bits cannot be checked directly. As a result, we need to also incorporate +// an alignment check into the relevant patterns. + +def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 4; +}]>; +def aligned4pre_store : PatFrag< + (ops node:$val, node:$base, node:$offset), + (pre_store node:$val, node:$base, node:$offset), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 4; +}]>; + +def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4; +}]>; +def unaligned4store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() < 4; +}]>; +def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4; +}]>; + +//===----------------------------------------------------------------------===// +// PowerPC Flag Definitions. + +class isPPC64 { bit PPC64 = 1; } +class isDOT { bit RC = 1; } + +class RegConstraint<string C> { + string Constraints = C; +} +class NoEncode<string E> { + string DisableEncoding = E; +} + + +//===----------------------------------------------------------------------===// +// PowerPC Operand Definitions. + +// In the default PowerPC assembler syntax, registers are specified simply +// by number, so they cannot be distinguished from immediate values (without +// looking at the opcode). This means that the default operand matching logic +// for the asm parser does not work, and we need to specify custom matchers. +// Since those can only be specified with RegisterOperand classes and not +// directly on the RegisterClass, all instruction patterns used by the asm +// parser need to use a RegisterOperand (instead of a RegisterClass) for +// all their register operands.
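One more note on the aligned4*/unaligned4* fragments above, before the operand definitions are laid out: the 4-byte requirement is an encoding fact. DS-form memory instructions (ld, std, ldu, lwa, and friends) carry only 14 displacement bits, implicitly shifted left by two at decode time. The helper below sketches the encodability test; it is an illustrative function written for this note, not an LLVM API.

    #include <cstdint>
    #include <optional>

    // A DS-form displacement is a signed 16-bit value whose low two bits are
    // zero; only its top 14 bits are stored in the instruction word. An offset
    // that is not representable needs an X-form (reg + reg) access instead,
    // and when the offset hides behind a TOC entry the low bits are unknown,
    // hence the alignment predicate on the memory operation itself.
    static std::optional<uint16_t> encodeDSField(int32_t Offset) {
      if (Offset < -32768 || Offset > 32767 || (Offset & 3))
        return std::nullopt;
      return static_cast<uint16_t>((Offset >> 2) & 0x3FFF);
    }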
+// For this purpose, we define one RegisterOperand for each RegisterClass, +// using the same name as the class, just in lower case. + +def PPCRegGPRCAsmOperand : AsmOperandClass { + let Name = "RegGPRC"; let PredicateMethod = "isRegNumber"; +} +def gprc : RegisterOperand<GPRC> { + let ParserMatchClass = PPCRegGPRCAsmOperand; +} +def PPCRegG8RCAsmOperand : AsmOperandClass { + let Name = "RegG8RC"; let PredicateMethod = "isRegNumber"; +} +def g8rc : RegisterOperand<G8RC> { + let ParserMatchClass = PPCRegG8RCAsmOperand; +} +def PPCRegGPRCNoR0AsmOperand : AsmOperandClass { + let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber"; +} +def gprc_nor0 : RegisterOperand<GPRC_NOR0> { + let ParserMatchClass = PPCRegGPRCNoR0AsmOperand; +} +def PPCRegG8RCNoX0AsmOperand : AsmOperandClass { + let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber"; +} +def g8rc_nox0 : RegisterOperand<G8RC_NOX0> { + let ParserMatchClass = PPCRegG8RCNoX0AsmOperand; +} +def PPCRegF8RCAsmOperand : AsmOperandClass { + let Name = "RegF8RC"; let PredicateMethod = "isRegNumber"; +} +def f8rc : RegisterOperand<F8RC> { + let ParserMatchClass = PPCRegF8RCAsmOperand; +} +def PPCRegF4RCAsmOperand : AsmOperandClass { + let Name = "RegF4RC"; let PredicateMethod = "isRegNumber"; +} +def f4rc : RegisterOperand<F4RC> { + let ParserMatchClass = PPCRegF4RCAsmOperand; +} +def PPCRegVRRCAsmOperand : AsmOperandClass { + let Name = "RegVRRC"; let PredicateMethod = "isRegNumber"; +} +def vrrc : RegisterOperand<VRRC> { + let ParserMatchClass = PPCRegVRRCAsmOperand; +} +def PPCRegCRBITRCAsmOperand : AsmOperandClass { + let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber"; +} +def crbitrc : RegisterOperand<CRBITRC> { + let ParserMatchClass = PPCRegCRBITRCAsmOperand; +} +def PPCRegCRRCAsmOperand : AsmOperandClass { + let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber"; +} +def crrc : RegisterOperand<CRRC> { + let ParserMatchClass = PPCRegCRRCAsmOperand; +} + +def PPCU2ImmAsmOperand : AsmOperandClass { + let Name = "U2Imm"; let PredicateMethod = "isU2Imm"; + let RenderMethod = "addImmOperands"; +} +def u2imm : Operand<i32> { + let PrintMethod = "printU2ImmOperand"; + let ParserMatchClass = PPCU2ImmAsmOperand; +} +def PPCS5ImmAsmOperand : AsmOperandClass { + let Name = "S5Imm"; let PredicateMethod = "isS5Imm"; + let RenderMethod = "addImmOperands"; +} +def s5imm : Operand<i32> { + let PrintMethod = "printS5ImmOperand"; + let ParserMatchClass = PPCS5ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<5>"; +} +def PPCU5ImmAsmOperand : AsmOperandClass { + let Name = "U5Imm"; let PredicateMethod = "isU5Imm"; + let RenderMethod = "addImmOperands"; +} +def u5imm : Operand<i32> { + let PrintMethod = "printU5ImmOperand"; + let ParserMatchClass = PPCU5ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<5>"; +} +def PPCU6ImmAsmOperand : AsmOperandClass { + let Name = "U6Imm"; let PredicateMethod = "isU6Imm"; + let RenderMethod = "addImmOperands"; +} +def u6imm : Operand<i32> { + let PrintMethod = "printU6ImmOperand"; + let ParserMatchClass = PPCU6ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<6>"; +} +def PPCS16ImmAsmOperand : AsmOperandClass { + let Name = "S16Imm"; let PredicateMethod = "isS16Imm"; + let RenderMethod = "addImmOperands"; +} +def s16imm : Operand<i32> { + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS16ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def PPCU16ImmAsmOperand : AsmOperandClass { + let Name = 
"U16Imm"; let PredicateMethod = "isU16Imm"; + let RenderMethod = "addImmOperands"; +} +def u16imm : Operand<i32> { + let PrintMethod = "printU16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCU16ImmAsmOperand; + let DecoderMethod = "decodeUImmOperand<16>"; +} +def PPCS17ImmAsmOperand : AsmOperandClass { + let Name = "S17Imm"; let PredicateMethod = "isS17Imm"; + let RenderMethod = "addImmOperands"; +} +def s17imm : Operand<i32> { + // This operand type is used for addis/lis to allow the assembler parser + // to accept immediates in the range -65536..65535 for compatibility with + // the GNU assembler. The operand is treated as 16-bit otherwise. + let PrintMethod = "printS16ImmOperand"; + let EncoderMethod = "getImm16Encoding"; + let ParserMatchClass = PPCS17ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<16>"; +} +def PPCDirectBrAsmOperand : AsmOperandClass { + let Name = "DirectBr"; let PredicateMethod = "isDirectBr"; + let RenderMethod = "addBranchTargetOperands"; +} +def directbrtarget : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def absdirectbrtarget : Operand<OtherVT> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def PPCCondBrAsmOperand : AsmOperandClass { + let Name = "CondBr"; let PredicateMethod = "isCondBr"; + let RenderMethod = "addBranchTargetOperands"; +} +def condbrtarget : Operand<OtherVT> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getCondBrEncoding"; + let ParserMatchClass = PPCCondBrAsmOperand; +} +def abscondbrtarget : Operand<OtherVT> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsCondBrEncoding"; + let ParserMatchClass = PPCCondBrAsmOperand; +} +def calltarget : Operand<iPTR> { + let PrintMethod = "printBranchOperand"; + let EncoderMethod = "getDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def abscalltarget : Operand<iPTR> { + let PrintMethod = "printAbsBranchOperand"; + let EncoderMethod = "getAbsDirectBrEncoding"; + let ParserMatchClass = PPCDirectBrAsmOperand; +} +def PPCCRBitMaskOperand : AsmOperandClass { + let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask"; +} +def crbitm: Operand<i8> { + let PrintMethod = "printcrbitm"; + let EncoderMethod = "get_crbitm_encoding"; + let DecoderMethod = "decodeCRBitMOperand"; + let ParserMatchClass = PPCCRBitMaskOperand; +} +// Address operands +// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode). +def PPCRegGxRCNoR0Operand : AsmOperandClass { + let Name = "RegGxRCNoR0"; let PredicateMethod = "isRegNumber"; +} +def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> { + let ParserMatchClass = PPCRegGxRCNoR0Operand; +} +// A version of ptr_rc usable with the asm parser. 
+def PPCRegGxRCOperand : AsmOperandClass { + let Name = "RegGxRC"; let PredicateMethod = "isRegNumber"; +} +def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> { + let ParserMatchClass = PPCRegGxRCOperand; +} + +def PPCDispRIOperand : AsmOperandClass { + let Name = "DispRI"; let PredicateMethod = "isS16Imm"; + let RenderMethod = "addImmOperands"; +} +def dispRI : Operand<iPTR> { + let ParserMatchClass = PPCDispRIOperand; +} +def PPCDispRIXOperand : AsmOperandClass { + let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4"; + let RenderMethod = "addImmOperands"; +} +def dispRIX : Operand<iPTR> { + let ParserMatchClass = PPCDispRIXOperand; +} + +def memri : Operand<iPTR> { + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getMemRIEncoding"; + let DecoderMethod = "decodeMemRIOperands"; +} +def memrr : Operand<iPTR> { + let PrintMethod = "printMemRegReg"; + let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg); +} +def memrix : Operand<iPTR> { // memri where the imm is 4-aligned. + let PrintMethod = "printMemRegImm"; + let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg); + let EncoderMethod = "getMemRIXEncoding"; + let DecoderMethod = "decodeMemRIXOperands"; +} + +// A single-register address. This is used with the SjLj +// pseudo-instructions. +def memr : Operand<iPTR> { + let MIOperandInfo = (ops ptr_rc:$ptrreg); +} +def PPCTLSRegOperand : AsmOperandClass { + let Name = "TLSReg"; let PredicateMethod = "isTLSReg"; + let RenderMethod = "addTLSRegOperands"; +} +def tlsreg32 : Operand<i32> { + let EncoderMethod = "getTLSRegEncoding"; + let ParserMatchClass = PPCTLSRegOperand; +} +def tlsgd32 : Operand<i32> {} +def tlscall32 : Operand<i32> { + let PrintMethod = "printTLSCall"; + let MIOperandInfo = (ops calltarget:$func, tlsgd32:$sym); + let EncoderMethod = "getTLSCallEncoding"; +} + +// PowerPC Predicate operand. +def pred : Operand<OtherVT> { + let PrintMethod = "printPredicateOperand"; + let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg); +} + +// Define PowerPC specific addressing mode. +def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; +def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; +def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" + +// The address in a single register. This is used with the SjLj +// pseudo-instructions. +def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>; + +/// This is just the offset part of iaddr, used for preinc. +def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>; + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Predicate Definitions. +def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">; +def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">; +def IsBookE : Predicate<"PPCSubTarget->isBookE()">; +def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">; + +//===----------------------------------------------------------------------===// +// PowerPC Multiclass Definitions. + +multiclass XForm_6r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_6rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XForm_6<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_10rc<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XForm_10<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XForm_10<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_11r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_11<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XForm_11<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_1r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_1rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XOForm_1<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_3r<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XOForm_3rc<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XOForm_3<opcode, xo, oe, OOL, IOL, + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MForm_2r<bits<6> opcode, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MForm_2<opcode, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MForm_2<opcode, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MDForm_1r<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MDForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MDForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass MDSForm_1r<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : MDSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR0] in + def o : MDSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XSForm_1rc<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + let Defs = [CARRY] in + def NAME : XSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CARRY, CR0] in + def o : XSForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_26<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : XForm_26<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XForm_28<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : XForm_28<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_1<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_2r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_2<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_2<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : AForm_3<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>, RecFormRel; + let Defs = [CR1] in + def o : AForm_3<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT, RecFormRel; + } +} + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Definitions. + +// Pseudo-instructions: + +let hasCtrlDep = 1 in { +let Defs = [R1], Uses = [R1] in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", + [(callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", + [(callseq_end timm:$amt1, timm:$amt2)]>; +} + +def UPDATE_VRSAVE : Pseudo<(outs gprc:$rD), (ins gprc:$rS), + "UPDATE_VRSAVE $rD, $rS", []>; +} + +let Defs = [R1], Uses = [R1] in +def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", + [(set i32:$result, + (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; + +// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after +// instruction selection into a branch sequence. +let usesCustomInserter = 1, // Expanded after instruction selection. + PPC970_Single = 1 in { + // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes + // because either operand might become the first operand in an isel, and + // that operand cannot be r0. + def SELECT_CC_I4 : Pseudo<(outs gprc:$dst), (ins crrc:$cond, + gprc_nor0:$T, gprc_nor0:$F, + i32imm:$BROPC), "#SELECT_CC_I4", + []>; + def SELECT_CC_I8 : Pseudo<(outs g8rc:$dst), (ins crrc:$cond, + g8rc_nox0:$T, g8rc_nox0:$F, + i32imm:$BROPC), "#SELECT_CC_I8", + []>; + def SELECT_CC_F4 : Pseudo<(outs f4rc:$dst), (ins crrc:$cond, f4rc:$T, f4rc:$F, + i32imm:$BROPC), "#SELECT_CC_F4", + []>; + def SELECT_CC_F8 : Pseudo<(outs f8rc:$dst), (ins crrc:$cond, f8rc:$T, f8rc:$F, + i32imm:$BROPC), "#SELECT_CC_F8", + []>; + def SELECT_CC_VRRC: Pseudo<(outs vrrc:$dst), (ins crrc:$cond, vrrc:$T, vrrc:$F, + i32imm:$BROPC), "#SELECT_CC_VRRC", + []>; + + // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition + // register bit directly. 
+ def SELECT_I4 : Pseudo<(outs gprc:$dst), (ins crbitrc:$cond, + gprc_nor0:$T, gprc_nor0:$F), "#SELECT_I4", + [(set i32:$dst, (select i1:$cond, i32:$T, i32:$F))]>; + def SELECT_I8 : Pseudo<(outs g8rc:$dst), (ins crbitrc:$cond, + g8rc_nox0:$T, g8rc_nox0:$F), "#SELECT_I8", + [(set i64:$dst, (select i1:$cond, i64:$T, i64:$F))]>; + def SELECT_F4 : Pseudo<(outs f4rc:$dst), (ins crbitrc:$cond, + f4rc:$T, f4rc:$F), "#SELECT_F4", + [(set f32:$dst, (select i1:$cond, f32:$T, f32:$F))]>; + def SELECT_F8 : Pseudo<(outs f8rc:$dst), (ins crbitrc:$cond, + f8rc:$T, f8rc:$F), "#SELECT_F8", + [(set f64:$dst, (select i1:$cond, f64:$T, f64:$F))]>; + def SELECT_VRRC: Pseudo<(outs vrrc:$dst), (ins crbitrc:$cond, + vrrc:$T, vrrc:$F), "#SELECT_VRRC", + [(set v4i32:$dst, + (select i1:$cond, v4i32:$T, v4i32:$F))]>; +} + +// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to +// scavenge a register for it. +let mayStore = 1 in { +def SPILL_CR : Pseudo<(outs), (ins crrc:$cond, memri:$F), + "#SPILL_CR", []>; +def SPILL_CRBIT : Pseudo<(outs), (ins crbitrc:$cond, memri:$F), + "#SPILL_CRBIT", []>; +} + +// RESTORE_CR - Indicate that we're restoring the CR register (previously +// spilled), so we'll need to scavenge a register for it. +let mayLoad = 1 in { +def RESTORE_CR : Pseudo<(outs crrc:$cond), (ins memri:$F), + "#RESTORE_CR", []>; +def RESTORE_CRBIT : Pseudo<(outs crbitrc:$cond), (ins memri:$F), + "#RESTORE_CRBIT", []>; +} + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { + let isReturn = 1, Uses = [LR, RM] in + def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, + [(retflag)]>; + let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in { + def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>; + + let isCodeGenOnly = 1 in { + def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond), + "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCCTR : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$bi), + "bcctr 12, $bi, 0", IIC_BrB, []>; + def BCCTRn : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$bi), + "bcctr 4, $bi, 0", IIC_BrB, []>; + } + } +} + +let Defs = [LR] in + def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>, + PPC970_Unit_BRU; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { + let isBarrier = 1 in { + def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst), + "b $dst", IIC_BrB, + [(br bb:$dst)]>; + def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst), + "ba $dst", IIC_BrB, []>; + } + + // BCC represents an arbitrary conditional branch on a predicate. + // FIXME: should be able to write a pattern for PPCcondbranch, but can't use + // a two-value operand where a dag node expects two operands. 
:( + let isCodeGenOnly = 1 in { + def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}${cond:pm} ${cond:reg}, $dst" + /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>; + def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst), + "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">; + + let isReturn = 1, Uses = [LR, RM] in + def BCCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond), + "b${cond:cc}lr${cond:pm} ${cond:reg}", IIC_BrB, []>; + } + + let isCodeGenOnly = 1 in { + let Pattern = [(brcond i1:$bi, bb:$dst)] in + def BC : BForm_4<16, 12, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), + "bc 12, $bi, $dst">; + + let Pattern = [(brcond (not i1:$bi), bb:$dst)] in + def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), + "bc 4, $bi, $dst">; + + let isReturn = 1, Uses = [LR, RM] in + def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi), + "bclr 12, $bi, 0", IIC_BrB, []>; + def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi), + "bclr 4, $bi, 0", IIC_BrB, []>; + } + + let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in { + def BDZLR : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins), + "bdzlr", IIC_BrB, []>; + def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins), + "bdnzlr", IIC_BrB, []>; + def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins), + "bdzlr+", IIC_BrB, []>; + def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins), + "bdnzlr+", IIC_BrB, []>; + def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins), + "bdzlr-", IIC_BrB, []>; + def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins), + "bdnzlr-", IIC_BrB, []>; + } + + let Defs = [CTR], Uses = [CTR] in { + def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz $dst">; + def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz $dst">; + def BDZA : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdza $dst">; + def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza $dst">; + def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz+ $dst">; + def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz+ $dst">; + def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdza+ $dst">; + def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza+ $dst">; + def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst), + "bdz- $dst">; + def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst), + "bdnz- $dst">; + def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdza- $dst">; + def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst), + "bdnza- $dst">; + } +} + +// The unconditional BCL used by the SjLj setjmp code. +let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in { + let Defs = [LR], Uses = [RM] in { + def BCLalways : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst), + "bcl 20, 31, $dst">; + } +} + +let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. 
+ def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall (i32 imm:$func))]>; + + let isCodeGenOnly = 1 in { + def BL_TLS : IForm<18, 0, 1, (outs), (ins tlscall32:$func), + "bl $func", IIC_BrB, []>; + def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst), + "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">; + def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst), + "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">; + + def BCL : BForm_4<16, 12, 0, 1, (outs), + (ins crbitrc:$bi, condbrtarget:$dst), + "bcl 12, $bi, $dst">; + def BCLn : BForm_4<16, 4, 0, 1, (outs), + (ins crbitrc:$bi, condbrtarget:$dst), + "bcl 4, $bi, $dst">; + } + } + let Uses = [CTR, RM] in { + def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl)]>, + Requires<[In32BitMode]>; + + let isCodeGenOnly = 1 in { + def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond), + "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCCTRL : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$bi), + "bcctrl 12, $bi, 0", IIC_BrB, []>; + def BCCTRLn : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$bi), + "bcctrl 4, $bi, 0", IIC_BrB, []>; + } + } + let Uses = [LR, RM] in { + def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins), + "blrl", IIC_BrB, []>; + + let isCodeGenOnly = 1 in { + def BCCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond), + "b${cond:cc}lrl${cond:pm} ${cond:reg}", IIC_BrB, + []>; + + def BCLRL : XLForm_2_br2<19, 16, 12, 1, (outs), (ins crbitrc:$bi), + "bclrl 12, $bi, 0", IIC_BrB, []>; + def BCLRLn : XLForm_2_br2<19, 16, 4, 1, (outs), (ins crbitrc:$bi), + "bclrl 4, $bi, 0", IIC_BrB, []>; + } + } + let Defs = [CTR], Uses = [CTR, RM] in { + def BDZL : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl $dst">; + def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl $dst">; + def BDZLA : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla $dst">; + def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla $dst">; + def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl+ $dst">; + def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl+ $dst">; + def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla+ $dst">; + def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla+ $dst">; + def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst), + "bdzl- $dst">; + def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst), + "bdnzl- $dst">; + def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdzla- $dst">; + def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst), + "bdnzla- $dst">; + } + let Defs = [CTR], Uses = [CTR, LR, RM] in { + def BDZLRL : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins), + "bdzlrl", IIC_BrB, []>; + def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins), + "bdnzlrl", IIC_BrB, []>; + def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins), + "bdzlrl+", IIC_BrB, []>; + def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins), + "bdnzlrl+", IIC_BrB, []>; + def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins), + "bdzlrl-", IIC_BrB, []>; + def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins), + "bdnzlrl-", IIC_BrB, []>; + } +} + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNdi :Pseudo< (outs), + (ins calltarget:$dst, 
i32imm:$offset), + "#TC_RETURNd $dst $offset", + []>; + + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset), + "#TC_RETURNa $func $offset", + [(PPCtc_return (i32 imm:$func), imm:$offset)]>; + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in +def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), + "#TC_RETURNr $dst $offset", + []>; + + +let isCodeGenOnly = 1 in { + +let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, + isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in +def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>, Requires<[In32BitMode]>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILB : IForm<18, 0, 0, (outs), (ins calltarget:$dst), + "b $dst", IIC_BrB, + []>; + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, + isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in +def TAILBA : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst), + "ba $dst", IIC_BrB, + []>; + +} + +let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { + let Defs = [CTR] in + def EH_SjLj_SetJmp32 : Pseudo<(outs gprc:$dst), (ins memr:$buf), + "#EH_SJLJ_SETJMP32", + [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, + Requires<[In32BitMode]>; + let isTerminator = 1 in + def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf), + "#EH_SJLJ_LONGJMP32", + [(PPCeh_sjlj_longjmp addr:$buf)]>, + Requires<[In32BitMode]>; +} + +let isBranch = 1, isTerminator = 1 in { + def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} + +// System call. +let PPC970_Unit = 7 in { + def SC : SCForm<17, 1, (outs), (ins i32imm:$lev), + "sc $lev", IIC_BrB, [(PPCsc (i32 imm:$lev))]>; +} + +// DCB* instructions. 
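The dcbt entry in this group is also how the selector implements the generic prefetch node, via the anonymous Pat that closes the group: a read prefetch (rw = 0) of the data cache (cache type 1) becomes dcbt. In source terms that is the standard builtin; the snippet below illustrates how front ends typically reach that node and is not part of this commit.

    // Compiled for PowerPC, this read prefetch lowers to "dcbt 0, rP".
    void warmCacheLine(const void *P) {
      __builtin_prefetch(P, /*rw=*/0, /*locality=*/3);
    }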
+def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
+ IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBF : DCB_Form<86, 0, (outs), (ins memrr:$dst), "dcbf $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBI : DCB_Form<470, 0, (outs), (ins memrr:$dst), "dcbi $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBST : DCB_Form<54, 0, (outs), (ins memrr:$dst), "dcbst $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBT : DCB_Form<278, 0, (outs), (ins memrr:$dst), "dcbt $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst), "dcbtst $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZ : DCB_Form<1014, 0, (outs), (ins memrr:$dst), "dcbz $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst",
+ IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+ PPC970_DGroup_Single;
+
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
+ (DCBT xoaddr:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+ let Defs = [CR0] in {
+ def ATOMIC_LOAD_ADD_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I8",
+ [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I8",
+ [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I8",
+ [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I8",
+ [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_XOR_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I8",
+ [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I8",
+ [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_ADD_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I16",
+ [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I16",
+ [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I16",
+ [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I16",
+ [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_XOR_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I16",
+ [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I16",
+ [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_ADD_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_ADD_I32",
+ [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_SUB_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_SUB_I32",
+ [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_AND_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_AND_I32",
+ [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_OR_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_OR_I32",
+ [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_XOR_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_XOR_I32",
+ [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+ def ATOMIC_LOAD_NAND_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$incr), "#ATOMIC_LOAD_NAND_I32",
+ [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+
+ def ATOMIC_CMP_SWAP_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I8",
+ [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ def ATOMIC_CMP_SWAP_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+ [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+ def ATOMIC_CMP_SWAP_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$old, gprc:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+ [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+ def ATOMIC_SWAP_I8 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I8",
+ [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+ def ATOMIC_SWAP_I16 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I16",
+ [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+ def ATOMIC_SWAP_I32 : Pseudo<
+ (outs gprc:$dst), (ins memrr:$ptr, gprc:$new), "#ATOMIC_SWAP_I32",
+ [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+ }
+}
+
+// Instructions to support atomic operations
+def LWARX : XForm_1<31, 20, (outs gprc:$rD), (ins memrr:$src),
+ "lwarx $rD, $src", IIC_LdStLWARX,
+ [(set i32:$rD, (PPClarx xoaddr:$src))]>;
+
+let Defs = [CR0] in
+def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
+ "stwcx. $rS, $dst", IIC_LdStSTWCX,
+ [(PPCstcx i32:$rS, xoaddr:$dst)]>,
+ isDOT;
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+def TRAP : XForm_24<31, 4, (outs), (ins), "trap", IIC_LdStLoad, [(trap)]>;
+
+def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
+ "twi $to, $rA, $imm", IIC_IntTrapW, []>;
+def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
+ "tw $to, $rA, $rB", IIC_IntTrapW, []>;
+def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
+ "tdi $to, $rA, $imm", IIC_IntTrapD, []>;
+def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
+ "td $to, $rA, $rB", IIC_IntTrapD, []>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads.
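+// In D-form addressing the effective address is (rA|0) + d16, with d16 a
+// sign-extended 16-bit displacement; e.g. "lwz r3, 8(r1)" (illustrative
+// registers) loads the word at r1 + 8 into r3.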
+let canFoldAsLoad = 1, PPC970_Unit = 2 in { +def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src), + "lbz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi8 iaddr:$src))]>; +def LHA : DForm_1<42, (outs gprc:$rD), (ins memri:$src), + "lha $rD, $src", IIC_LdStLHA, + [(set i32:$rD, (sextloadi16 iaddr:$src))]>, + PPC970_DGroup_Cracked; +def LHZ : DForm_1<40, (outs gprc:$rD), (ins memri:$src), + "lhz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi16 iaddr:$src))]>; +def LWZ : DForm_1<32, (outs gprc:$rD), (ins memri:$src), + "lwz $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (load iaddr:$src))]>; + +def LFS : DForm_1<48, (outs f4rc:$rD), (ins memri:$src), + "lfs $rD, $src", IIC_LdStLFD, + [(set f32:$rD, (load iaddr:$src))]>; +def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src), + "lfd $rD, $src", IIC_LdStLFD, + [(set f64:$rD, (load iaddr:$src))]>; + + +// Unindexed (r+i) Loads with Update (preinc). +let mayLoad = 1, neverHasSideEffects = 1 in { +def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lbzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAU : DForm_1<43, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhau $rD, $addr", IIC_LdStLHAU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZU : DForm_1<41, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lhzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZU : DForm_1<33, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lwzu $rD, $addr", IIC_LdStLoadUpd, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSU : DForm_1<49, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lfsu $rD, $addr", IIC_LdStLFDU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDU : DForm_1<51, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr), + "lfdu $rD, $addr", IIC_LdStLFDU, + []>, RegConstraint<"$addr.reg = $ea_result">, + NoEncode<"$ea_result">; + + +// Indexed (r+r) Loads with Update (preinc). +def LBZUX : XForm_1<31, 119, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lbzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHAUX : XForm_1<31, 375, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhaux $rD, $addr", IIC_LdStLHAUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LHZUX : XForm_1<31, 311, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lhzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LWZUX : XForm_1<31, 55, (outs gprc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lwzux $rD, $addr", IIC_LdStLoadUpdX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFSUX : XForm_1<31, 567, (outs f4rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lfsux $rD, $addr", IIC_LdStLFDUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; + +def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result), + (ins memrr:$addr), + "lfdux $rD, $addr", IIC_LdStLFDUX, + []>, RegConstraint<"$addr.ptrreg = $ea_result">, + NoEncode<"$ea_result">; +} +} + +// Indexed (r+r) Loads. 
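+// In X-form (r+r) addressing the effective address is (rA|0) + rB; e.g.
+// "lwzx r3, r4, r5" (illustrative registers) loads the word at r4 + r5.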
+// +let canFoldAsLoad = 1, PPC970_Unit = 2 in { +def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src), + "lbzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi8 xaddr:$src))]>; +def LHAX : XForm_1<31, 343, (outs gprc:$rD), (ins memrr:$src), + "lhax $rD, $src", IIC_LdStLHA, + [(set i32:$rD, (sextloadi16 xaddr:$src))]>, + PPC970_DGroup_Cracked; +def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src), + "lhzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (zextloadi16 xaddr:$src))]>; +def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src), + "lwzx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (load xaddr:$src))]>; + + +def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src), + "lhbrx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>; +def LWBRX : XForm_1<31, 534, (outs gprc:$rD), (ins memrr:$src), + "lwbrx $rD, $src", IIC_LdStLoad, + [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>; + +def LFSX : XForm_25<31, 535, (outs f4rc:$frD), (ins memrr:$src), + "lfsx $frD, $src", IIC_LdStLFD, + [(set f32:$frD, (load xaddr:$src))]>; +def LFDX : XForm_25<31, 599, (outs f8rc:$frD), (ins memrr:$src), + "lfdx $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (load xaddr:$src))]>; + +def LFIWAX : XForm_25<31, 855, (outs f8rc:$frD), (ins memrr:$src), + "lfiwax $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (PPClfiwax xoaddr:$src))]>; +def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src), + "lfiwzx $frD, $src", IIC_LdStLFD, + [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>; +} + +// Load Multiple +def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src), + "lmw $rD, $src", IIC_LdStLMW, []>; + +//===----------------------------------------------------------------------===// +// PPC32 Store Instructions. +// + +// Unindexed (r+i) Stores. +let PPC970_Unit = 2 in { +def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src), + "stb $rS, $src", IIC_LdStStore, + [(truncstorei8 i32:$rS, iaddr:$src)]>; +def STH : DForm_1<44, (outs), (ins gprc:$rS, memri:$src), + "sth $rS, $src", IIC_LdStStore, + [(truncstorei16 i32:$rS, iaddr:$src)]>; +def STW : DForm_1<36, (outs), (ins gprc:$rS, memri:$src), + "stw $rS, $src", IIC_LdStStore, + [(store i32:$rS, iaddr:$src)]>; +def STFS : DForm_1<52, (outs), (ins f4rc:$rS, memri:$dst), + "stfs $rS, $dst", IIC_LdStSTFD, + [(store f32:$rS, iaddr:$dst)]>; +def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst), + "stfd $rS, $dst", IIC_LdStSTFD, + [(store f64:$rS, iaddr:$dst)]>; +} + +// Unindexed (r+i) Stores with Update (preinc). +let PPC970_Unit = 2, mayStore = 1 in { +def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "stbu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STHU : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "sthu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STWU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst), + "stwu $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STFSU : DForm_1<53, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memri:$dst), + "stfsu $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +def STFDU : DForm_1<55, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memri:$dst), + "stfdu $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">; +} + +// Patterns to match the pre-inc stores. 
We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. +def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STBU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STHU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STWU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STFSU $rS, iaddroff:$ptroff, $ptrreg)>; +def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), + (STFDU $rS, iaddroff:$ptroff, $ptrreg)>; + +// Indexed (r+r) Stores. +let PPC970_Unit = 2 in { +def STBX : XForm_8<31, 215, (outs), (ins gprc:$rS, memrr:$dst), + "stbx $rS, $dst", IIC_LdStStore, + [(truncstorei8 i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STHX : XForm_8<31, 407, (outs), (ins gprc:$rS, memrr:$dst), + "sthx $rS, $dst", IIC_LdStStore, + [(truncstorei16 i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; +def STWX : XForm_8<31, 151, (outs), (ins gprc:$rS, memrr:$dst), + "stwx $rS, $dst", IIC_LdStStore, + [(store i32:$rS, xaddr:$dst)]>, + PPC970_DGroup_Cracked; + +def STHBRX: XForm_8<31, 918, (outs), (ins gprc:$rS, memrr:$dst), + "sthbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>, + PPC970_DGroup_Cracked; +def STWBRX: XForm_8<31, 662, (outs), (ins gprc:$rS, memrr:$dst), + "stwbrx $rS, $dst", IIC_LdStStore, + [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>, + PPC970_DGroup_Cracked; + +def STFIWX: XForm_28<31, 983, (outs), (ins f8rc:$frS, memrr:$dst), + "stfiwx $frS, $dst", IIC_LdStSTFD, + [(PPCstfiwx f64:$frS, xoaddr:$dst)]>; + +def STFSX : XForm_28<31, 663, (outs), (ins f4rc:$frS, memrr:$dst), + "stfsx $frS, $dst", IIC_LdStSTFD, + [(store f32:$frS, xaddr:$dst)]>; +def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst), + "stfdx $frS, $dst", IIC_LdStSTFD, + [(store f64:$frS, xaddr:$dst)]>; +} + +// Indexed (r+r) Stores with Update (preinc). +let PPC970_Unit = 2, mayStore = 1 in { +def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "stbux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "sthux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst), + "stwux $rS, $dst", IIC_LdStStoreUpd, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins f4rc:$rS, memrr:$dst), + "stfsux $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins f8rc:$rS, memrr:$dst), + "stfdux $rS, $dst", IIC_LdStSTFDU, []>, + RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">, + PPC970_DGroup_Cracked; +} + +// Patterns to match the pre-inc stores. We can't put the patterns on +// the instruction definitions directly as ISel wants the address base +// and offset to be separate operands, not a single complex operand. 
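+// For example, "stwux r3, r4, r5" (illustrative registers) stores r3 at
+// r4 + r5 and then writes that effective address back into r4; the patterns
+// below hand ISel the base and offset as separate operands.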
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STBUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STHUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STWUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STFSUX $rS, $ptrreg, $ptroff)>; +def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), + (STFDUX $rS, $ptrreg, $ptroff)>; + +// Store Multiple +def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst), + "stmw $rS, $dst", IIC_LdStLMW, []>; + +def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L), + "sync $L", IIC_LdStSync, []>, Requires<[IsNotBookE]>; + +let isCodeGenOnly = 1 in { + def MSYNC : XForm_24_sync<31, 598, (outs), (ins), + "msync", IIC_LdStSync, []>, Requires<[IsBookE]> { + let L = 0; + } +} + +def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[IsNotBookE]>; +def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[IsBookE]>; + +//===----------------------------------------------------------------------===// +// PPC32 Arithmetic Instructions. +// + +let PPC970_Unit = 1 in { // FXU Operations. +def ADDI : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm), + "addi $rD, $rA, $imm", IIC_IntSimple, + [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>; +let BaseName = "addic" in { +let Defs = [CARRY] in +def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "addic $rD, $rA, $imm", IIC_IntGeneral, + [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>, + RecFormRel, PPC970_DGroup_Cracked; +let Defs = [CARRY, CR0] in +def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "addic. $rD, $rA, $imm", IIC_IntGeneral, + []>, isDOT, RecFormRel; +} +def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm), + "addis $rD, $rA, $imm", IIC_IntSimple, + [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>; +let isCodeGenOnly = 1 in +def LA : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym), + "la $rD, $sym($rA)", IIC_IntGeneral, + [(set i32:$rD, (add i32:$rA, + (PPClo tglobaladdr:$sym, 0)))]>; +def MULLI : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "mulli $rD, $rA, $imm", IIC_IntMulLI, + [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>; +let Defs = [CARRY] in +def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), + "subfic $rD, $rA, $imm", IIC_IntGeneral, + [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>; + +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { + def LI : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm), + "li $rD, $imm", IIC_IntSimple, + [(set i32:$rD, imm32SExt16:$imm)]>; + def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm), + "lis $rD, $imm", IIC_IntSimple, + [(set i32:$rD, imm16ShiftedSExt:$imm)]>; +} +} + +let PPC970_Unit = 1 in { // FXU Operations. +let Defs = [CR0] in { +def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "andi. $dst, $src1, $src2", IIC_IntGeneral, + [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>, + isDOT; +def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "andis. 
$dst, $src1, $src2", IIC_IntGeneral, + [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>, + isDOT; +} +def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "ori $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>; +def ORIS : DForm_4<25, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "oris $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>; +def XORI : DForm_4<26, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "xori $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>; +def XORIS : DForm_4<27, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + "xoris $dst, $src1, $src2", IIC_IntSimple, + [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>; + +def NOP : DForm_4_zero<24, (outs), (ins), "nop", IIC_IntSimple, + []>; +let isCodeGenOnly = 1 in { +// The POWER6 and POWER7 have special group-terminating nops. +def NOP_GT_PWR6 : DForm_4_fixedreg_zero<24, 1, (outs), (ins), + "ori 1, 1, 0", IIC_IntSimple, []>; +def NOP_GT_PWR7 : DForm_4_fixedreg_zero<24, 2, (outs), (ins), + "ori 2, 2, 0", IIC_IntSimple, []>; +} + +let isCompare = 1, neverHasSideEffects = 1 in { + def CMPWI : DForm_5_ext<11, (outs crrc:$crD), (ins gprc:$rA, s16imm:$imm), + "cmpwi $crD, $rA, $imm", IIC_IntCompare>; + def CMPLWI : DForm_6_ext<10, (outs crrc:$dst), (ins gprc:$src1, u16imm:$src2), + "cmplwi $dst, $src1, $src2", IIC_IntCompare>; +} +} + +let PPC970_Unit = 1, neverHasSideEffects = 1 in { // FXU Operations. +let isCommutable = 1 in { +defm NAND : XForm_6r<31, 476, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "nand", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>; +defm AND : XForm_6r<31, 28, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "and", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (and i32:$rS, i32:$rB))]>; +} // isCommutable +defm ANDC : XForm_6r<31, 60, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "andc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>; +let isCommutable = 1 in { +defm OR : XForm_6r<31, 444, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "or", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (or i32:$rS, i32:$rB))]>; +defm NOR : XForm_6r<31, 124, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "nor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>; +} // isCommutable +defm ORC : XForm_6r<31, 412, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "orc", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>; +let isCommutable = 1 in { +defm EQV : XForm_6r<31, 284, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "eqv", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>; +defm XOR : XForm_6r<31, 316, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "xor", "$rA, $rS, $rB", IIC_IntSimple, + [(set i32:$rA, (xor i32:$rS, i32:$rB))]>; +} // isCommutable +defm SLW : XForm_6r<31, 24, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "slw", "$rA, $rS, $rB", IIC_IntGeneral, + [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>; +defm SRW : XForm_6r<31, 536, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "srw", "$rA, $rS, $rB", IIC_IntGeneral, + [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>; +defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB), + "sraw", "$rA, $rS, $rB", IIC_IntShift, + [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>; +} + +let PPC970_Unit = 1 in { // FXU Operations. 
+let neverHasSideEffects = 1 in { +defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH), + "srawi", "$rA, $rS, $SH", IIC_IntShift, + [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>; +defm CNTLZW : XForm_11r<31, 26, (outs gprc:$rA), (ins gprc:$rS), + "cntlzw", "$rA, $rS", IIC_IntGeneral, + [(set i32:$rA, (ctlz i32:$rS))]>; +defm EXTSB : XForm_11r<31, 954, (outs gprc:$rA), (ins gprc:$rS), + "extsb", "$rA, $rS", IIC_IntSimple, + [(set i32:$rA, (sext_inreg i32:$rS, i8))]>; +defm EXTSH : XForm_11r<31, 922, (outs gprc:$rA), (ins gprc:$rS), + "extsh", "$rA, $rS", IIC_IntSimple, + [(set i32:$rA, (sext_inreg i32:$rS, i16))]>; +} +let isCompare = 1, neverHasSideEffects = 1 in { + def CMPW : XForm_16_ext<31, 0, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB), + "cmpw $crD, $rA, $rB", IIC_IntCompare>; + def CMPLW : XForm_16_ext<31, 32, (outs crrc:$crD), (ins gprc:$rA, gprc:$rB), + "cmplw $crD, $rA, $rB", IIC_IntCompare>; +} +} +let PPC970_Unit = 3 in { // FPU Operations. +//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB), +// "fcmpo $crD, $fA, $fB", IIC_FPCompare>; +let isCompare = 1, neverHasSideEffects = 1 in { + def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), + "fcmpu $crD, $fA, $fB", IIC_FPCompare>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "fcmpu $crD, $fA, $fB", IIC_FPCompare>; +} + +let Uses = [RM] in { + let neverHasSideEffects = 1 in { + defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiw", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCfctiwz f64:$frB))]>; + + defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), + "frsp", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (fround f64:$frB))]>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (frnd f64:$frB))]>; + defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB), + "frin", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (frnd f32:$frB))]>; + } + + let neverHasSideEffects = 1 in { + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB), + "frip", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (fceil f64:$frB))]>; + defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB), + "frip", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (fceil f32:$frB))]>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB), + "friz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (ftrunc f64:$frB))]>; + defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB), + "friz", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (ftrunc f32:$frB))]>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in + defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB), + "frim", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (ffloor f64:$frB))]>; + defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB), + "frim", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (ffloor f32:$frB))]>; + + defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB), + "fsqrt", "$frD, $frB", IIC_FPSqrtD, + [(set f64:$frD, (fsqrt f64:$frB))]>; + defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), 
(ins f4rc:$frB),
+ "fsqrts", "$frD, $frB", IIC_FPSqrtS,
+ [(set f32:$frD, (fsqrt f32:$frB))]>;
+ }
+ }
+}
+
+/// Note that FMR is defined as a pseudo-op on the PPC970 because such copies
+/// are often coalesced away and we don't want the dispatch group builder to
+/// think that they will fill slots (which could cause the load of an LSU
+/// reject to sneak into a d-group with a store).
+let neverHasSideEffects = 1 in
+defm FMR : XForm_26r<63, 72, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fmr", "$frD, $frB", IIC_FPGeneral,
+ []>, // (set f32:$frD, f32:$frB)
+ PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3, neverHasSideEffects = 1 in { // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+defm FABSS : XForm_26r<63, 264, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fabs f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FABSD : XForm_26r<63, 264, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fabs f64:$frB))]>;
+defm FNABSS : XForm_26r<63, 136, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNABSD : XForm_26r<63, 136, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fnabs", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+defm FNEGS : XForm_26r<63, 40, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fneg", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fneg f32:$frB))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fneg", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fneg f64:$frB))]>;
+
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
+// Reciprocal estimates.
+defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fre", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfre f64:$frB))]>;
+defm FRES : XForm_26r<59, 24, (outs f4rc:$frD), (ins f4rc:$frB),
+ "fres", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfre f32:$frB))]>;
+defm FRSQRTE : XForm_26r<63, 26, (outs f8rc:$frD), (ins f8rc:$frB),
+ "frsqrte", "$frD, $frB", IIC_FPGeneral,
+ [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB),
+ "frsqrtes", "$frD, $frB", IIC_FPGeneral,
+ [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions. Condition register logical ops.
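+// These operate on individual CR bits; e.g. "cror 0, 1, 2" (illustrative
+// operands) sets bit 0 (cr0.lt) to the OR of bit 1 (cr0.gt) and bit 2
+// (cr0.eq).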
+// +let neverHasSideEffects = 1 in +def MCRF : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA), + "mcrf $BF, $BFA", IIC_BrMCR>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +let isCommutable = 1 in { +def CRAND : XLForm_1<19, 257, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crand $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (and i1:$CRA, i1:$CRB))]>; + +def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crnand $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (and i1:$CRA, i1:$CRB)))]>; + +def CROR : XLForm_1<19, 449, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "cror $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (or i1:$CRA, i1:$CRB))]>; + +def CRXOR : XLForm_1<19, 193, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crxor $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (xor i1:$CRA, i1:$CRB))]>; + +def CRNOR : XLForm_1<19, 33, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crnor $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (or i1:$CRA, i1:$CRB)))]>; + +def CREQV : XLForm_1<19, 289, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "creqv $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (not (xor i1:$CRA, i1:$CRB)))]>; +} // isCommutable + +def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crandc $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (and i1:$CRA, (not i1:$CRB)))]>; + +def CRORC : XLForm_1<19, 417, (outs crbitrc:$CRD), + (ins crbitrc:$CRA, crbitrc:$CRB), + "crorc $CRD, $CRA, $CRB", IIC_BrCR, + [(set i1:$CRD, (or i1:$CRA, (not i1:$CRB)))]>; + +let isCodeGenOnly = 1 in { +def CRSET : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins), + "creqv $dst, $dst, $dst", IIC_BrCR, + [(set i1:$dst, 1)]>; + +def CRUNSET: XLForm_1_ext<19, 193, (outs crbitrc:$dst), (ins), + "crxor $dst, $dst, $dst", IIC_BrCR, + [(set i1:$dst, 0)]>; + +let Defs = [CR1EQ], CRD = 6 in { +def CR6SET : XLForm_1_ext<19, 289, (outs), (ins), + "creqv 6, 6, 6", IIC_BrCR, + [(PPCcr6set)]>; + +def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins), + "crxor 6, 6, 6", IIC_BrCR, + [(PPCcr6unset)]>; +} +} + +// XFX-Form instructions. Instructions that deal with SPRs. 
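+// SPRs are addressed by number in these forms; e.g. "mflr rT" is encoded as
+// "mfspr rT, 8" and "mfctr rT" as "mfspr rT, 9", which is how the *_ext
+// variants below hard-wire their SPR fields.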
+// + +def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR), + "mfspr $RT, $SPR", IIC_SprMFSPR>; +def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT), + "mtspr $SPR, $RT", IIC_SprMTSPR>; + +def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR), + "mftb $RT, $SPR", IIC_SprMFTB>, Deprecated<DeprecatedMFTB>; + +let Uses = [CTR] in { +def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins), + "mfctr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in { +def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in { +let Pattern = [(int_ppc_mtctr i32:$rS)] in +def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), + "mtctr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let Defs = [LR] in { +def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS), + "mtlr $rS", IIC_SprMTSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} +let Uses = [LR] in { +def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins), + "mflr $rT", IIC_SprMFSPR>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +let isCodeGenOnly = 1 in { + // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed + // like a GPR on the PPC970. As such, copies in and out have the same + // performance characteristics as an OR instruction. + def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS), + "mtspr 256, $rS", IIC_IntGeneral>, + PPC970_DGroup_Single, PPC970_Unit_FXU; + def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins), + "mfspr $rT, 256", IIC_IntGeneral>, + PPC970_DGroup_First, PPC970_Unit_FXU; + + def MTVRSAVEv : XFXForm_7_ext<31, 467, 256, + (outs VRSAVERC:$reg), (ins gprc:$rS), + "mtspr 256, $rS", IIC_IntGeneral>, + PPC970_DGroup_Single, PPC970_Unit_FXU; + def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), + (ins VRSAVERC:$reg), + "mfspr $rT, 256", IIC_IntGeneral>, + PPC970_DGroup_First, PPC970_Unit_FXU; +} + +// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register, +// so we'll need to scavenge a register for it. +let mayStore = 1 in +def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F), + "#SPILL_VRSAVE", []>; + +// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously +// spilled), so we'll need to scavenge a register for it. +let mayLoad = 1 in +def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), + "#RESTORE_VRSAVE", []>; + +let neverHasSideEffects = 1 in { +def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), + "mtocrf $FXM, $ST", IIC_BrMCRX>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), + "mtcrf $FXM, $rS", IIC_BrMCRX>, + PPC970_MicroCode, PPC970_Unit_CRU; + +let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), + "mfocrf $rT, $FXM", IIC_SprMFCRF>, + PPC970_DGroup_First, PPC970_Unit_CRU; + +def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), + "mfcr $rT", IIC_SprMFCR>, + PPC970_MicroCode, PPC970_Unit_CRU; +} // neverHasSideEffects = 1 + +// Pseudo instruction to perform FADD in round-to-zero mode. 
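+// A sketch of the assumed expansion (see the FPSCR note below): save FPSCR
+// with mffs, force round-to-zero with "mtfsb1 31" / "mtfsb0 30", perform the
+// fadd, then restore the rounding mode with mtfsf.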
+let usesCustomInserter = 1, Uses = [RM] in { + def FADDrtz: Pseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "", + [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>; +} + +// The above pseudo gets expanded to make use of the following instructions +// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level. +let Uses = [RM], Defs = [RM] in { + def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), + "mtfsb0 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), + "mtfsb1 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + def MTFSF : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT), + "mtfsf $FM, $rT", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; +} +let Uses = [RM] in { + def MFFS : XForm_42<63, 583, (outs f8rc:$rT), (ins), + "mffs $rT", IIC_IntMFFS, + [(set f64:$rT, (PPCmffs))]>, + PPC970_DGroup_Single, PPC970_Unit_FPU; +} + + +let PPC970_Unit = 1, neverHasSideEffects = 1 in { // FXU Operations. +// XO-Form instructions. Arithmetic instructions that can set overflow bit +let isCommutable = 1 in +defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "add", "$rT, $rA, $rB", IIC_IntSimple, + [(set i32:$rT, (add i32:$rA, i32:$rB))]>; +let isCodeGenOnly = 1 in +def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB), + "add $rT, $rA, $rB", IIC_IntSimple, + [(set i32:$rT, (add i32:$rA, tglobaltlsaddr:$rB))]>; +let isCommutable = 1 in +defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "addc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (addc i32:$rA, i32:$rB))]>, + PPC970_DGroup_Cracked; + +defm DIVW : XOForm_1r<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divw", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +defm DIVWU : XOForm_1r<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divwu", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>, + PPC970_DGroup_First, PPC970_DGroup_Cracked; +let isCommutable = 1 in { +defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mulhw", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>; +defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU, + [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>; +defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mullw", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i32:$rT, (mul i32:$rA, i32:$rB))]>; +} // isCommutable +defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subf", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (sub i32:$rB, i32:$rA))]>; +defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subfc", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (subc i32:$rB, i32:$rA))]>, + PPC970_DGroup_Cracked; +defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$rT), (ins gprc:$rA), + "neg", "$rT, $rA", IIC_IntSimple, + [(set i32:$rT, (ineg i32:$rA))]>; +let Uses = [CARRY] in { +let isCommutable = 1 in +defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "adde", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (adde i32:$rA, i32:$rB))]>; +defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$rT), (ins gprc:$rA), + "addme", "$rT, $rA", IIC_IntGeneral, + [(set i32:$rT, (adde i32:$rA, -1))]>; +defm 
ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "addze", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (adde i32:$rA, 0))]>;
+defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "subfe", "$rT, $rA, $rB", IIC_IntGeneral,
+ [(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
+defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "subfme", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (sube -1, i32:$rA))]>;
+defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
+ "subfze", "$rT, $rA", IIC_IntGeneral,
+ [(set i32:$rT, (sube 0, i32:$rA))]>;
+}
+}
+
+// A-Form instructions. Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3, neverHasSideEffects = 1 in { // FPU Operations.
+let Uses = [RM] in {
+let isCommutable = 1 in {
+ defm FMADD : AForm_1r<63, 29,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+ defm FMADDS : AForm_1r<59, 29,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+ defm FMSUB : AForm_1r<63, 28,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT,
+ (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+ defm FMSUBS : AForm_1r<59, 28,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT,
+ (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+ defm FNMADD : AForm_1r<63, 31,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT,
+ (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+ defm FNMADDS : AForm_1r<59, 31,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT,
+ (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+ defm FNMSUB : AForm_1r<63, 30,
+ (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
+ "fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
+ [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+ (fneg f64:$FRB))))]>;
+ defm FNMSUBS : AForm_1r<59, 30,
+ (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
+ "fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
+ [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+ (fneg f32:$FRB))))]>;
+} // isCommutable
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result. To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float),
+// with 4/8 byte forms for the result and operand type.
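+// Recall the fsel semantics: "fsel FRT, FRA, FRC, FRB" sets FRT to FRC when
+// FRA >= 0.0 and to FRB otherwise, which is why a single f8rc comparison
+// operand suffices.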
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in +defm FSELD : AForm_1r<63, 23, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB), + "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral, + [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>; +defm FSELS : AForm_1r<63, 23, + (outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB), + "fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>; +let Uses = [RM] in { + let isCommutable = 1 in { + defm FADD : AForm_2r<63, 21, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub, + [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>; + defm FADDS : AForm_2r<59, 21, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>; + } // isCommutable + defm FDIV : AForm_2r<63, 18, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD, + [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>; + defm FDIVS : AForm_2r<59, 18, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS, + [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>; + let isCommutable = 1 in { + defm FMUL : AForm_3r<63, 25, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC), + "fmul", "$FRT, $FRA, $FRC", IIC_FPFused, + [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>; + defm FMULS : AForm_3r<59, 25, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC), + "fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral, + [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>; + } // isCommutable + defm FSUB : AForm_2r<63, 20, + (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), + "fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub, + [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>; + defm FSUBS : AForm_2r<59, 20, + (outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB), + "fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral, + [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>; + } +} + +let neverHasSideEffects = 1 in { +let PPC970_Unit = 1 in { // FXU Operations. + let isSelect = 1 in + def ISEL : AForm_4<31, 15, + (outs gprc:$rT), (ins gprc_nor0:$rA, gprc:$rB, crbitrc:$cond), + "isel $rT, $rA, $rB, $cond", IIC_IntGeneral, + []>; +} + +let PPC970_Unit = 1 in { // FXU Operations. +// M-Form instructions. rotate and mask instructions. +// +let isCommutable = 1 in { +// RLWIMI can be commuted if the rotate amount is zero. +defm RLWIMI : MForm_2r<20, (outs gprc:$rA), + (ins gprc:$rSi, gprc:$rS, u5imm:$SH, u5imm:$MB, + u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME", + IIC_IntRotate, []>, PPC970_DGroup_Cracked, + RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">; +} +let BaseName = "rlwinm" in { +def RLWINM : MForm_2<21, + (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>, RecFormRel; +let Defs = [CR0] in +def RLWINMo : MForm_2<21, + (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), + "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, + []>, isDOT, RecFormRel, PPC970_DGroup_Cracked; +} +defm RLWNM : MForm_2r<23, (outs gprc:$rA), + (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME), + "rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral, + []>; +} +} // neverHasSideEffects = 1 + +//===----------------------------------------------------------------------===// +// PowerPC Instruction Patterns +// + +// Arbitrary immediate support. Implement in terms of LIS/ORI. 
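+// For example, materializing 0x12345678 becomes "lis r3, 0x1234" followed by
+// "ori r3, r3, 0x5678" (illustrative register choice).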
+def : Pat<(i32 imm:$imm), + (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>; + +// Implement the 'not' operation with the NOR instruction. +def i32not : OutPatFrag<(ops node:$in), + (NOR $in, $in)>; +def : Pat<(not i32:$in), + (i32not $in)>; + +// ADD an arbitrary immediate. +def : Pat<(add i32:$in, imm:$imm), + (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>; +// OR an arbitrary immediate. +def : Pat<(or i32:$in, imm:$imm), + (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// XOR an arbitrary immediate. +def : Pat<(xor i32:$in, imm:$imm), + (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>; +// SUBFIC +def : Pat<(sub imm32SExt16:$imm, i32:$in), + (SUBFIC $in, imm:$imm)>; + +// SHL/SRL +def : Pat<(shl i32:$in, (i32 imm:$imm)), + (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>; +def : Pat<(srl i32:$in, (i32 imm:$imm)), + (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>; + +// ROTL +def : Pat<(rotl i32:$in, i32:$sh), + (RLWNM $in, $sh, 0, 31)>; +def : Pat<(rotl i32:$in, (i32 imm:$imm)), + (RLWINM $in, imm:$imm, 0, 31)>; + +// RLWNM +def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm), + (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>; + +// Calls +def : Pat<(PPCcall (i32 tglobaladdr:$dst)), + (BL tglobaladdr:$dst)>; +def : Pat<(PPCcall (i32 texternalsym:$dst)), + (BL texternalsym:$dst)>; + + +def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), + (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm), + (TCRETURNdi texternalsym:$dst, imm:$imm)>; + +def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm), + (TCRETURNri CTRRC:$dst, imm:$imm)>; + + + +// Hi and Lo for Darwin Global Addresses. +def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>; +def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>; +def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>; +def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>; +def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>; +def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>; +def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>; +def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>; +def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in), + (ADDIS $in, tglobaltlsaddr:$g)>; +def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in), + (ADDI $in, tglobaltlsaddr:$g)>; +def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)), + (ADDIS $in, tglobaladdr:$g)>; +def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)), + (ADDIS $in, tconstpool:$g)>; +def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)), + (ADDIS $in, tjumptable:$g)>; +def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)), + (ADDIS $in, tblockaddress:$g)>; + +// Support for thread-local storage. +def PPC32GOT: Pseudo<(outs gprc:$rD), (ins), "#PPC32GOT", + [(set i32:$rD, (PPCppc32GOT))]>; + +// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode. +// This uses two output registers, the first as the real output, the second as a +// temporary register, used internally in code generation. 
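+// (Sketch of the usual PPC32 PIC idiom this is assumed to expand to: a
+// "bcl 20, 31, $+4" / "mflr" pair materializes the current address, from
+// which the GOT address is computed.)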
+def PPC32PICGOT: Pseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT", + []>, NoEncode<"$rT">; + +def LDgotTprelL32: Pseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg), + "#LDgotTprelL32", + [(set i32:$rD, + (PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>; +def : Pat<(PPCaddTls i32:$in, tglobaltlsaddr:$g), + (ADD4TLS $in, tglobaltlsaddr:$g)>; + +def ADDItlsgdL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), + "#ADDItlsgdL32", + [(set i32:$rD, + (PPCaddiTlsgdL i32:$reg, tglobaltlsaddr:$disp))]>; +def GETtlsADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), + "#GETtlsADDR32", + [(set i32:$rD, + (PPCgetTlsAddr i32:$reg, tglobaltlsaddr:$sym))]>; +def ADDItlsldL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), + "#ADDItlsldL32", + [(set i32:$rD, + (PPCaddiTlsldL i32:$reg, tglobaltlsaddr:$disp))]>; +def GETtlsldADDR32 : Pseudo<(outs gprc:$rD), (ins gprc:$reg, tlsgd32:$sym), + "#GETtlsldADDR32", + [(set i32:$rD, + (PPCgetTlsldAddr i32:$reg, tglobaltlsaddr:$sym))]>; +def ADDIdtprelL32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), + "#ADDIdtprelL32", + [(set i32:$rD, + (PPCaddiDtprelL i32:$reg, tglobaltlsaddr:$disp))]>; +def ADDISdtprelHA32 : Pseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, s16imm:$disp), + "#ADDISdtprelHA32", + [(set i32:$rD, + (PPCaddisDtprelHA i32:$reg, + tglobaltlsaddr:$disp))]>; + +// Support for Position-independent code +def LWZtoc: Pseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), + "#LWZtoc", + [(set i32:$rD, + (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +// Get Global (GOT) Base Register offset, from the word immediately preceding +// the function label. +def GetGBRO: Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#GetGBRO", []>; +// Update the Global(GOT) Base Register with the above offset. +def UpdateGBR: Pseudo<(outs gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; + + +// Standard shifts. These are represented separately from the real shifts above +// so that we can distinguish between shifts that allow 5-bit and 6-bit shift +// amounts. +def : Pat<(sra i32:$rS, i32:$rB), + (SRAW $rS, $rB)>; +def : Pat<(srl i32:$rS, i32:$rB), + (SRW $rS, $rB)>; +def : Pat<(shl i32:$rS, i32:$rB), + (SLW $rS, $rB)>; + +def : Pat<(zextloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(zextloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi1 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi1 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi8 iaddr:$src), + (LBZ iaddr:$src)>; +def : Pat<(extloadi8 xaddr:$src), + (LBZX xaddr:$src)>; +def : Pat<(extloadi16 iaddr:$src), + (LHZ iaddr:$src)>; +def : Pat<(extloadi16 xaddr:$src), + (LHZX xaddr:$src)>; +def : Pat<(f64 (extloadf32 iaddr:$src)), + (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>; +def : Pat<(f64 (extloadf32 xaddr:$src)), + (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; + +def : Pat<(f64 (fextend f32:$src)), + (COPY_TO_REGCLASS $src, F8RC)>; + +def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>, Requires<[IsNotBookE]>; +def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[IsBookE]>; + +// Additional FNMSUB patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), + (FNMSUB $A, $C, $B)>; +def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B), + (FNMSUBS $A, $C, $B)>; +def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B), + (FNMSUBS $A, $C, $B)>; + +// FCOPYSIGN's operand types need not agree. 
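+// fcopysign only reads the sign bit of its second operand, and f32 and f64
+// share the FP register file, so the COPY_TO_REGCLASS below is enough to
+// reconcile the operand types.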
+def : Pat<(fcopysign f64:$frB, f32:$frA), + (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>; +def : Pat<(fcopysign f32:$frB, f64:$frA), + (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>; + +include "PPCInstrAltivec.td" +include "PPCInstr64Bit.td" +include "PPCInstrVSX.td" + +def crnot : OutPatFrag<(ops node:$in), + (CRNOR $in, $in)>; +def : Pat<(not i1:$in), + (crnot $in)>; + +// Patterns for arithmetic i1 operations. +def : Pat<(add i1:$a, i1:$b), + (CRXOR $a, $b)>; +def : Pat<(sub i1:$a, i1:$b), + (CRXOR $a, $b)>; +def : Pat<(mul i1:$a, i1:$b), + (CRAND $a, $b)>; + +// We're sometimes asked to materialize i1 -1, which is just 1 in this case +// (-1 is used to mean all bits set). +def : Pat<(i1 -1), (CRSET)>; + +// i1 extensions, implemented in terms of isel. +def : Pat<(i32 (zext i1:$in)), + (SELECT_I4 $in, (LI 1), (LI 0))>; +def : Pat<(i32 (sext i1:$in)), + (SELECT_I4 $in, (LI -1), (LI 0))>; + +def : Pat<(i64 (zext i1:$in)), + (SELECT_I8 $in, (LI8 1), (LI8 0))>; +def : Pat<(i64 (sext i1:$in)), + (SELECT_I8 $in, (LI8 -1), (LI8 0))>; + +// FIXME: We should choose either a zext or a sext based on other constants +// already around. +def : Pat<(i32 (anyext i1:$in)), + (SELECT_I4 $in, (LI 1), (LI 0))>; +def : Pat<(i64 (anyext i1:$in)), + (SELECT_I8 $in, (LI8 1), (LI8 0))>; + +// match setcc on i1 variables. +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLT)), + (CRANDC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULT)), + (CRANDC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETLE)), + (CRORC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETULE)), + (CRORC $s2, $s1)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETEQ)), + (CREQV $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGE)), + (CRORC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGE)), + (CRORC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETGT)), + (CRANDC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETUGT)), + (CRANDC $s1, $s2)>; +def : Pat<(i1 (setcc i1:$s1, i1:$s2, SETNE)), + (CRXOR $s1, $s2)>; + +// match setcc on non-i1 (non-vector) variables. Note that SETUEQ, SETOGE, +// SETOLE, SETONE, SETULT and SETUGT should be expanded by legalize for +// floating-point types. + +multiclass CRNotPat<dag pattern, dag result> { + def : Pat<pattern, (crnot result)>; + def : Pat<(not pattern), result>; + + // We can also fold the crnot into an extension: + def : Pat<(i32 (zext pattern)), + (SELECT_I4 result, (LI 0), (LI 1))>; + def : Pat<(i32 (sext pattern)), + (SELECT_I4 result, (LI 0), (LI -1))>; + + // We can also fold the crnot into an extension: + def : Pat<(i64 (zext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 1))>; + def : Pat<(i64 (sext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 -1))>; + + // FIXME: We should choose either a zext or a sext based on other constants + // already around. + def : Pat<(i32 (anyext pattern)), + (SELECT_I4 result, (LI 0), (LI 1))>; + + def : Pat<(i64 (anyext pattern)), + (SELECT_I8 result, (LI8 0), (LI8 1))>; +} + +// FIXME: Because of what seems like a bug in TableGen's type-inference code, +// we need to write imm:$imm in the output patterns below, not just $imm, or +// else the resulting matcher will not correctly add the immediate operand +// (making it a register operand instead). + +// extended SETCC. 
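+// For example, the SETEQ-against-zero case below turns
+// (zext (setcc x, 0, SETEQ)) into "cntlzw rT, rX" followed by
+// "rlwinm rT, rT, 27, 31, 31": cntlzw yields 32 only for x == 0, and the
+// rotate-and-mask extracts bit 5 (32 >> 5 == 1).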
+multiclass ExtSetCCPat<CondCode cc, PatFrag pfrag, + OutPatFrag rfrag, OutPatFrag rfrag8> { + def : Pat<(i32 (zext (i1 (pfrag i32:$s1, cc)))), + (rfrag $s1)>; + def : Pat<(i64 (zext (i1 (pfrag i64:$s1, cc)))), + (rfrag8 $s1)>; + def : Pat<(i64 (zext (i1 (pfrag i32:$s1, cc)))), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>; + def : Pat<(i32 (zext (i1 (pfrag i64:$s1, cc)))), + (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>; + + def : Pat<(i32 (anyext (i1 (pfrag i32:$s1, cc)))), + (rfrag $s1)>; + def : Pat<(i64 (anyext (i1 (pfrag i64:$s1, cc)))), + (rfrag8 $s1)>; + def : Pat<(i64 (anyext (i1 (pfrag i32:$s1, cc)))), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (rfrag $s1), sub_32)>; + def : Pat<(i32 (anyext (i1 (pfrag i64:$s1, cc)))), + (EXTRACT_SUBREG (rfrag8 $s1), sub_32)>; +} + +// Note that we do all inversions below with i(32|64)not, instead of using +// (xori x, 1) because on the A2 nor has single-cycle latency while xori +// has 2-cycle latency. + +defm : ExtSetCCPat<SETEQ, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (CNTLZW $in), 27, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (CNTLZD $in), 58, 63)> >; + +defm : ExtSetCCPat<SETNE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not (CNTLZD $in)), 58, 63)> >; + +defm : ExtSetCCPat<SETLT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM $in, 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL $in, 1, 63)> >; + +defm : ExtSetCCPat<SETGE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not $in), 1, 63)> >; + +defm : ExtSetCCPat<SETGT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (ANDC (NEG $in), $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (ANDC8 (NEG8 $in), $in), 1, 63)> >; + +defm : ExtSetCCPat<SETLE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, 0, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (ORC $in, (NEG $in)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (ORC8 $in, (NEG8 $in)), 1, 63)> >; + +defm : ExtSetCCPat<SETLT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (AND $in, (ADDI $in, 1)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (AND8 $in, (ADDI8 $in, 1)), 1, 63)> >; + +defm : ExtSetCCPat<SETGE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (NAND $in, (ADDI $in, 1)), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (NAND8 $in, (ADDI8 $in, 1)), 1, 63)> >; + +defm : ExtSetCCPat<SETGT, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM (i32not $in), 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL (i64not $in), 1, 63)> >; + +defm : ExtSetCCPat<SETLE, + PatFrag<(ops node:$in, node:$cc), + (setcc $in, -1, $cc)>, + OutPatFrag<(ops node:$in), + (RLWINM $in, 1, 31, 31)>, + OutPatFrag<(ops node:$in), + (RLDICL $in, 1, 63)> >; + +// SETCC for i32. 
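+// Each pattern below compares into a full CR field and then extracts the
+// lt/gt/eq bit as the i1 result; e.g. (setcc x, y, SETLT) becomes a cmpw
+// whose sub_lt bit is the answer.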
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i32:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+// When comparing for equality against a constant that does not fit in a
+// 16-bit immediate, the default code would materialize the constant, then
+// compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpw cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmplwi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i32:$s1, imm:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, imm32SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPWI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, imm:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i32:$s1, i32:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i32:$s1, i32:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPW $s1, $s2), sub_eq)>;
+
+// SETCC for i64.
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGT)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGT)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+def : Pat<(i1 (setcc i64:$s1, immZExt16:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+// When comparing for equality against a constant that does not fit in a
+// 16-bit immediate, the default code would materialize the constant, then
+// compare against it, like this:
+//   lis r2, 4660
+//   ori r2, r2, 22136
+//   cmpd cr0, r3, r2
+//   beq cr0,L6
+// Since we are just comparing for equality, we can emit this instead:
+//   xoris r0,r3,0x1234
+//   cmpldi cr0,r0,0x5678
+//   beq cr0,L6
+
+def : Pat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETEQ)),
+          (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                  (LO16 imm:$imm)), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETUGE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETGE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETULE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETLE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64SExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPDI $s1, imm:$imm), sub_eq)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, immZExt16:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI $s1, imm:$imm), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)),
+                (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)),
+                                        (LO16 imm:$imm)), sub_eq)>;
+
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETULT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETLT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETUGT)),
+          (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)),
+          (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETUGE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETGE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_lt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETULE)),
+                (EXTRACT_SUBREG (CMPLD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>;
+defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
+                (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>;
+
+// SETCC for f32.
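+// Orientation for the floating-point patterns below: fcmpu sets four bits in
+// the target CR field -- lt, gt, eq and un (unordered, i.e. at least one
+// operand is a NaN). Ordered predicates such as SETOLT read one bit directly;
+// their complements (e.g. SETUGE == !SETOLT) come from CRNotPat, which
+// inverts the same bit with crnot.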
+def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; + +// SETCC for f64. +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; +defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), + (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; + +// match select on i1 variables: +def : Pat<(i1 (select i1:$cond, i1:$tval, i1:$fval)), + (CROR (CRAND $cond , $tval), + (CRAND (crnot $cond), $fval))>; + +// match selectcc on i1 variables: +// select (lhs == rhs), tval, fval is: +// ((lhs == rhs) & tval) | (!(lhs == rhs) & fval) +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLT)), + (CROR (CRAND (CRANDC $rhs, $lhs), $tval), + (CRAND (CRORC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETLE)), + (CROR (CRAND (CRORC $rhs, $lhs), $tval), + (CRAND (CRANDC $lhs, $rhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETEQ)), + (CROR (CRAND (CREQV $lhs, $rhs), $tval), + (CRAND (CRXOR $lhs, $rhs), 
$fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGE)), + (CROR (CRAND (CRORC $lhs, $rhs), $tval), + (CRAND (CRANDC $rhs, $lhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETGT)), + (CROR (CRAND (CRANDC $lhs, $rhs), $tval), + (CRAND (CRORC $rhs, $lhs), $fval))>; +def : Pat <(i1 (selectcc i1:$lhs, i1:$rhs, i1:$tval, i1:$fval, SETNE)), + (CROR (CRAND (CREQV $lhs, $rhs), $fval), + (CRAND (CRXOR $lhs, $rhs), $tval))>; + +// match selectcc on i1 variables with non-i1 output. +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLT)), + (SELECT_I4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETLE)), + (SELECT_I4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETEQ)), + (SELECT_I4 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGE)), + (SELECT_I4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETGT)), + (SELECT_I4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i32 (selectcc i1:$lhs, i1:$rhs, i32:$tval, i32:$fval, SETNE)), + (SELECT_I4 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLT)), + (SELECT_I8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETLE)), + (SELECT_I8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETEQ)), + (SELECT_I8 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGE)), + (SELECT_I8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETGT)), + (SELECT_I8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(i64 (selectcc i1:$lhs, i1:$rhs, i64:$tval, i64:$fval, SETNE)), + (SELECT_I8 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)), + (SELECT_F4 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)), + (SELECT_F4 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)), + (SELECT_F4 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)), + (SELECT_F4 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)), + (SELECT_F4 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)), + (SELECT_F4 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)), + (SELECT_F8 (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)), + (SELECT_F8 (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)), + (SELECT_F8 (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)), + (SELECT_F8 (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)), + (SELECT_F8 (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), + (SELECT_F8 (CRXOR $lhs, $rhs), $tval, $fval)>; + +def : Pat<(v4i32 (selectcc 
i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLT)), + (SELECT_VRRC (CRANDC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETLE)), + (SELECT_VRRC (CRORC $rhs, $lhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETEQ)), + (SELECT_VRRC (CREQV $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGE)), + (SELECT_VRRC (CRORC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETGT)), + (SELECT_VRRC (CRANDC $lhs, $rhs), $tval, $fval)>; +def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)), + (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>; + +let usesCustomInserter = 1 in { +def ANDIo_1_EQ_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDIo_1_EQ_BIT", + [(set i1:$dst, (trunc (not i32:$in)))]>; +def ANDIo_1_GT_BIT : Pseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDIo_1_GT_BIT", + [(set i1:$dst, (trunc i32:$in))]>; + +def ANDIo_1_EQ_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDIo_1_EQ_BIT8", + [(set i1:$dst, (trunc (not i64:$in)))]>; +def ANDIo_1_GT_BIT8 : Pseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDIo_1_GT_BIT8", + [(set i1:$dst, (trunc i64:$in))]>; +} + +def : Pat<(i1 (not (trunc i32:$in))), + (ANDIo_1_EQ_BIT $in)>; +def : Pat<(i1 (not (trunc i64:$in))), + (ANDIo_1_EQ_BIT8 $in)>; + +//===----------------------------------------------------------------------===// +// PowerPC Instructions used for assembler/disassembler only +// + +def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins), + "isync", IIC_SprISYNC, []>; + +def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src), + "icbi $src", IIC_LdStICBI, []>; + +def EIEIO : XForm_24_eieio<31, 854, (outs), (ins), + "eieio", IIC_LdStLoad, []>; + +def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L), + "wait $L", IIC_LdStLoad, []>; + +def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L), + "mtmsr $RS, $L", IIC_SprMTMSR>; + +def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins), + "mfmsr $RT", IIC_SprMFMSR, []>; + +def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L), + "mtmsrd $RS, $L", IIC_SprMTMSRD>; + +def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB), + "slbie $RB", IIC_SprSLBIE, []>; + +def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB), + "slbmte $RS, $RB", IIC_SprSLBMTE, []>; + +def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB), + "slbmfee $RT, $RB", IIC_SprSLBMFEE, []>; + +def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>; + +def TLBSYNC : XForm_0<31, 566, (outs), (ins), + "tlbsync", IIC_SprTLBSYNC, []>; + +def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB), + "tlbiel $RB", IIC_SprTLBIEL, []>; + +def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB), + "tlbie $RB,$RS", IIC_SprTLBIE, []>; + +//===----------------------------------------------------------------------===// +// PowerPC Assembler Instruction Aliases +// + +// Pseudo-instructions for alternate assembly syntax (never used by codegen). +// These are aliases that require C++ handling to convert to the target +// instruction, while InstAliases can be handled directly by tblgen. 
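+// For instance, "subi rA, rB, imm" below must be a PPCAsmPseudo because the
+// parser has to negate the immediate in C++ and emit "addi rA, rB, -imm",
+// whereas "mr rA, rB" is a plain InstAlias that tblgen expands to
+// "or rA, rB, rB" with no extra logic.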
+class PPCAsmPseudo<string asm, dag iops> + : Instruction { + let Namespace = "PPC"; + bit PPC64 = 0; // Default value, override with isPPC64 + + let OutOperandList = (outs); + let InOperandList = iops; + let Pattern = []; + let AsmString = asm; + let isAsmParserOnly = 1; + let isPseudo = 1; +} + +def : InstAlias<"sc", (SC 0)>; + +def : InstAlias<"sync", (SYNC 0)>, Requires<[IsNotBookE]>; +def : InstAlias<"msync", (SYNC 0)>, Requires<[IsNotBookE]>; +def : InstAlias<"lwsync", (SYNC 1)>, Requires<[IsNotBookE]>; +def : InstAlias<"ptesync", (SYNC 2)>, Requires<[IsNotBookE]>; + +def : InstAlias<"wait", (WAIT 0)>; +def : InstAlias<"waitrsv", (WAIT 1)>; +def : InstAlias<"waitimpl", (WAIT 2)>; + +def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; +def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; +def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>; +def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>; + +def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>; +def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>; + +def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>; +def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>; + +def : InstAlias<"xnop", (XORI R0, R0, 0)>; + +def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; + +def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; + +def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>; + +def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>; + +def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; +def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm", + (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; + +def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"subc. 
$rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; + +def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>; +def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>; + +def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>; +def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>; +def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>; +def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>; + +def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>; +def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>; +def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>; +def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>; + +def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>; +def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>; +def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>; +def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>; + +def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>; +def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>; +def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>; +def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>; + +def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>; + +def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>; +def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>; + +def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>; + +def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>; +def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>; + +def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>; +def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>; +def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>; +def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>; + +def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>; + +def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b", + (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; +def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$n)>; +def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; +def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. 
$rA, $rS, $b, $n", + (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; + +def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; +def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; +def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; +def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; +def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; +def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; + +def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; +def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; +def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; +def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", + (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; + +def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; +def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; +def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; +def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; +def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; + +// These generic branch instruction forms are used for the assembler parser only. +// Defs and Uses are conservative, since we don't know the BO value. 
+let PPC970_Unit = 7 in { + let Defs = [CTR], Uses = [CTR, RM] in { + def gBC : BForm_3<16, 0, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), + "bc $bo, $bi, $dst">; + def gBCA : BForm_3<16, 1, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst), + "bca $bo, $bi, $dst">; + } + let Defs = [LR, CTR], Uses = [CTR, RM] in { + def gBCL : BForm_3<16, 0, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst), + "bcl $bo, $bi, $dst">; + def gBCLA : BForm_3<16, 1, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst), + "bcla $bo, $bi, $dst">; + } + let Defs = [CTR], Uses = [CTR, LR, RM] in + def gBCLR : XLForm_2<19, 16, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bclr $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [LR, CTR], Uses = [CTR, LR, RM] in + def gBCLRL : XLForm_2<19, 16, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bclrl $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [CTR], Uses = [CTR, LR, RM] in + def gBCCTR : XLForm_2<19, 528, 0, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bcctr $bo, $bi, $bh", IIC_BrB, []>; + let Defs = [LR, CTR], Uses = [CTR, LR, RM] in + def gBCCTRL : XLForm_2<19, 528, 1, (outs), + (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh), + "bcctrl $bo, $bi, $bh", IIC_BrB, []>; +} +def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>; +def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>; + +multiclass BranchSimpleMnemonic1<string name, string pm, int bo> { + def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>; + def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>; +} +multiclass BranchSimpleMnemonic2<string name, string pm, int bo> + : BranchSimpleMnemonic1<name, pm, bo> { + def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>; + def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>; +} +defm : BranchSimpleMnemonic2<"t", "", 12>; +defm : BranchSimpleMnemonic2<"f", "", 4>; +defm : BranchSimpleMnemonic2<"t", "-", 14>; +defm : BranchSimpleMnemonic2<"f", "-", 6>; +defm : BranchSimpleMnemonic2<"t", "+", 15>; +defm : BranchSimpleMnemonic2<"f", "+", 7>; +defm : BranchSimpleMnemonic1<"dnzt", "", 8>; +defm : BranchSimpleMnemonic1<"dnzf", "", 0>; +defm : BranchSimpleMnemonic1<"dzt", "", 10>; +defm : BranchSimpleMnemonic1<"dzf", "", 2>; + +multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> { + def : InstAlias<"b"#name#pm#" $cc, $dst", + (BCC bibo, crrc:$cc, condbrtarget:$dst)>; + def : InstAlias<"b"#name#pm#" $dst", + (BCC bibo, CR0, condbrtarget:$dst)>; + + def : InstAlias<"b"#name#"a"#pm#" $cc, $dst", + (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"a"#pm#" $dst", + (BCCA bibo, CR0, abscondbrtarget:$dst)>; + + def : InstAlias<"b"#name#"lr"#pm#" $cc", + (BCCLR bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"lr"#pm, + (BCCLR bibo, CR0)>; + + def : InstAlias<"b"#name#"ctr"#pm#" $cc", + (BCCCTR bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"ctr"#pm, + (BCCCTR bibo, 
CR0)>; + + def : InstAlias<"b"#name#"l"#pm#" $cc, $dst", + (BCCL bibo, crrc:$cc, condbrtarget:$dst)>; + def : InstAlias<"b"#name#"l"#pm#" $dst", + (BCCL bibo, CR0, condbrtarget:$dst)>; + + def : InstAlias<"b"#name#"la"#pm#" $cc, $dst", + (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>; + def : InstAlias<"b"#name#"la"#pm#" $dst", + (BCCLA bibo, CR0, abscondbrtarget:$dst)>; + + def : InstAlias<"b"#name#"lrl"#pm#" $cc", + (BCCLRL bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"lrl"#pm, + (BCCLRL bibo, CR0)>; + + def : InstAlias<"b"#name#"ctrl"#pm#" $cc", + (BCCCTRL bibo, crrc:$cc)>; + def : InstAlias<"b"#name#"ctrl"#pm, + (BCCCTRL bibo, CR0)>; +} +multiclass BranchExtendedMnemonic<string name, int bibo> { + defm : BranchExtendedMnemonicPM<name, "", bibo>; + defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>; + defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>; +} +defm : BranchExtendedMnemonic<"lt", 12>; +defm : BranchExtendedMnemonic<"gt", 44>; +defm : BranchExtendedMnemonic<"eq", 76>; +defm : BranchExtendedMnemonic<"un", 108>; +defm : BranchExtendedMnemonic<"so", 108>; +defm : BranchExtendedMnemonic<"ge", 4>; +defm : BranchExtendedMnemonic<"nl", 4>; +defm : BranchExtendedMnemonic<"le", 36>; +defm : BranchExtendedMnemonic<"ng", 36>; +defm : BranchExtendedMnemonic<"ne", 68>; +defm : BranchExtendedMnemonic<"nu", 100>; +defm : BranchExtendedMnemonic<"ns", 100>; + +def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>; +def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>; +def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm64:$imm)>; +def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>; +def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm64:$imm)>; +def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>; + +def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>; +def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>; +def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>; +def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm64:$imm)>; +def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>; +def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>; +def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>; + +multiclass TrapExtendedMnemonic<string name, int to> { + def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>; + def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>; + def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>; + def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>; +} +defm : TrapExtendedMnemonic<"lt", 16>; +defm : TrapExtendedMnemonic<"le", 20>; +defm : TrapExtendedMnemonic<"eq", 4>; +defm : TrapExtendedMnemonic<"ge", 12>; +defm : TrapExtendedMnemonic<"gt", 8>; +defm : TrapExtendedMnemonic<"nl", 12>; +defm : TrapExtendedMnemonic<"ne", 24>; +defm : TrapExtendedMnemonic<"ng", 20>; +defm : TrapExtendedMnemonic<"llt", 2>; +defm : TrapExtendedMnemonic<"lle", 6>; +defm : TrapExtendedMnemonic<"lge", 5>; +defm : TrapExtendedMnemonic<"lgt", 1>; +defm : TrapExtendedMnemonic<"lnl", 5>; +defm : TrapExtendedMnemonic<"lng", 6>; +defm 
: TrapExtendedMnemonic<"u", 31>; + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td new file mode 100644 index 000000000000..49bcc4876d33 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -0,0 +1,816 @@ +//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the VSX extension to the PowerPC instruction set. +// +//===----------------------------------------------------------------------===// + +def PPCRegVSRCAsmOperand : AsmOperandClass { + let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber"; +} +def vsrc : RegisterOperand<VSRC> { + let ParserMatchClass = PPCRegVSRCAsmOperand; +} + +def PPCRegVSFRCAsmOperand : AsmOperandClass { + let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber"; +} +def vsfrc : RegisterOperand<VSFRC> { + let ParserMatchClass = PPCRegVSFRCAsmOperand; +} + +multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list<dag> pattern> { + let BaseName = asmbase in { + def NAME : XX3Form_Rc<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), itin, + pattern>; + let Defs = [CR6] in + def o : XX3Form_Rc<opcode, xo, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), itin, + []>, isDOT; + } +} + +def HasVSX : Predicate<"PPCSubTarget->hasVSX()">; +let Predicates = [HasVSX] in { +let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. +let neverHasSideEffects = 1 in { // VSX instructions don't have side effects. 
+let Uses = [RM] in { + + // Load indexed instructions + let mayLoad = 1, canFoldAsLoad = 1 in { + def LXSDX : XForm_1<31, 588, + (outs vsfrc:$XT), (ins memrr:$src), + "lxsdx $XT, $src", IIC_LdStLFD, + [(set f64:$XT, (load xoaddr:$src))]>; + + def LXVD2X : XForm_1<31, 844, + (outs vsrc:$XT), (ins memrr:$src), + "lxvd2x $XT, $src", IIC_LdStLFD, + [(set v2f64:$XT, (load xoaddr:$src))]>; + + def LXVDSX : XForm_1<31, 332, + (outs vsrc:$XT), (ins memrr:$src), + "lxvdsx $XT, $src", IIC_LdStLFD, []>; + + def LXVW4X : XForm_1<31, 780, + (outs vsrc:$XT), (ins memrr:$src), + "lxvw4x $XT, $src", IIC_LdStLFD, []>; + } + + // Store indexed instructions + let mayStore = 1 in { + def STXSDX : XX1Form<31, 716, + (outs), (ins vsfrc:$XT, memrr:$dst), + "stxsdx $XT, $dst", IIC_LdStSTFD, + [(store f64:$XT, xoaddr:$dst)]>; + + def STXVD2X : XX1Form<31, 972, + (outs), (ins vsrc:$XT, memrr:$dst), + "stxvd2x $XT, $dst", IIC_LdStSTFD, + [(store v2f64:$XT, xoaddr:$dst)]>; + + def STXVW4X : XX1Form<31, 908, + (outs), (ins vsrc:$XT, memrr:$dst), + "stxvw4x $XT, $dst", IIC_LdStSTFD, []>; + } + + // Add/Mul Instructions + let isCommutable = 1 in { + def XSADDDP : XX3Form<60, 32, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsadddp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>; + def XSMULDP : XX3Form<60, 48, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmuldp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>; + + def XVADDDP : XX3Form<60, 96, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvadddp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>; + + def XVADDSP : XX3Form<60, 64, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvaddsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>; + + def XVMULDP : XX3Form<60, 112, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmuldp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>; + + def XVMULSP : XX3Form<60, 80, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>; + } + + // Subtract Instructions + def XSSUBDP : XX3Form<60, 40, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xssubdp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>; + + def XVSUBDP : XX3Form<60, 104, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubdp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>; + def XVSUBSP : XX3Form<60, 72, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>; + + // FMA Instructions + let BaseName = "XSMADDADP" in { + let isCommutable = 1 in + def XSMADDADP : XX3Form<60, 33, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMADDMDP : XX3Form<60, 41, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSMSUBADP" in { + let isCommutable = 1 in + def XSMSUBADP : XX3Form<60, 49, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + 
AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSMSUBMDP : XX3Form<60, 57, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMADDADP" in { + let isCommutable = 1 in + def XSNMADDADP : XX3Form<60, 161, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMADDMDP : XX3Form<60, 169, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XSNMSUBADP" in { + let isCommutable = 1 in + def XSNMSUBADP : XX3Form<60, 177, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XSNMSUBMDP : XX3Form<60, 185, + (outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB), + "xsnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMADDADP" in { + let isCommutable = 1 in + def XVMADDADP : XX3Form<60, 97, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMADDMDP : XX3Form<60, 105, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMADDASP" in { + let isCommutable = 1 in + def XVMADDASP : XX3Form<60, 65, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMADDMSP : XX3Form<60, 73, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMSUBADP" in { + let isCommutable = 1 in + def XVMSUBADP : XX3Form<60, 113, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMSUBMDP : XX3Form<60, 121, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVMSUBASP" in { + let isCommutable = 1 in + def XVMSUBASP : XX3Form<60, 81, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVMSUBMSP : XX3Form<60, 89, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, 
NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMADDADP" in { + let isCommutable = 1 in + def XVNMADDADP : XX3Form<60, 225, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMADDMDP : XX3Form<60, 233, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMADDASP" in { + let isCommutable = 1 in + def XVNMADDASP : XX3Form<60, 193, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi)))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMADDMSP : XX3Form<60, 201, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmaddmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMSUBADP" in { + let isCommutable = 1 in + def XVNMSUBADP : XX3Form<60, 241, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubadp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMSUBMDP : XX3Form<60, 249, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubmdp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + let BaseName = "XVNMSUBASP" in { + let isCommutable = 1 in + def XVNMSUBASP : XX3Form<60, 209, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubasp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + let IsVSXFMAAlt = 1 in + def XVNMSUBMSP : XX3Form<60, 217, + (outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB), + "xvnmsubmsp $XT, $XA, $XB", IIC_VecFP, []>, + RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, + AltVSXFMARel; + } + + // Division Instructions + def XSDIVDP : XX3Form<60, 56, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsdivdp $XT, $XA, $XB", IIC_FPDivD, + [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>; + def XSSQRTDP : XX2Form<60, 75, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xssqrtdp $XT, $XB", IIC_FPSqrtD, + [(set f64:$XT, (fsqrt f64:$XB))]>; + + def XSREDP : XX2Form<60, 90, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsredp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfre f64:$XB))]>; + def XSRSQRTEDP : XX2Form<60, 74, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrsqrtedp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfrsqrte f64:$XB))]>; + + def XSTDIVDP : XX3Form_1<60, 61, + (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; + def XSTSQRTDP : XX2Form_1<60, 106, + (outs crrc:$crD), (ins vsfrc:$XB), + "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; + + def XVDIVDP : XX3Form<60, 120, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvdivdp $XT, $XA, $XB", IIC_FPDivD, + [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>; + def XVDIVSP : XX3Form<60, 88, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvdivsp $XT, $XA, $XB", IIC_FPDivS, + [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>; + + def XVSQRTDP : XX2Form<60, 203, + 
(outs vsrc:$XT), (ins vsrc:$XB), + "xvsqrtdp $XT, $XB", IIC_FPSqrtD, + [(set v2f64:$XT, (fsqrt v2f64:$XB))]>; + def XVSQRTSP : XX2Form<60, 139, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvsqrtsp $XT, $XB", IIC_FPSqrtS, + [(set v4f32:$XT, (fsqrt v4f32:$XB))]>; + + def XVTDIVDP : XX3Form_1<60, 125, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; + def XVTDIVSP : XX3Form_1<60, 93, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>; + + def XVTSQRTDP : XX2Form_1<60, 234, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>; + def XVTSQRTSP : XX2Form_1<60, 170, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>; + + def XVREDP : XX2Form<60, 218, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvredp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (PPCfre v2f64:$XB))]>; + def XVRESP : XX2Form<60, 154, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvresp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (PPCfre v4f32:$XB))]>; + + def XVRSQRTEDP : XX2Form<60, 202, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrsqrtedp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (PPCfrsqrte v2f64:$XB))]>; + def XVRSQRTESP : XX2Form<60, 138, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrsqrtesp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (PPCfrsqrte v4f32:$XB))]>; + + // Compare Instructions + def XSCMPODP : XX3Form_1<60, 43, + (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xscmpodp $crD, $XA, $XB", IIC_FPCompare, []>; + def XSCMPUDP : XX3Form_1<60, 35, + (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), + "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; + + defm XVCMPEQDP : XX3Form_Rcr<60, 99, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + defm XVCMPEQSP : XX3Form_Rcr<60, 67, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + defm XVCMPGEDP : XX3Form_Rcr<60, 115, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + defm XVCMPGESP : XX3Form_Rcr<60, 83, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + defm XVCMPGTDP : XX3Form_Rcr<60, 107, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + defm XVCMPGTSP : XX3Form_Rcr<60, 75, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, []>; + + // Move Instructions + def XSABSDP : XX2Form<60, 345, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsabsdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fabs f64:$XB))]>; + def XSNABSDP : XX2Form<60, 361, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsnabsdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fneg (fabs f64:$XB)))]>; + def XSNEGDP : XX2Form<60, 377, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsnegdp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fneg f64:$XB))]>; + def XSCPSGNDP : XX3Form<60, 176, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xscpsgndp $XT, $XA, $XB", IIC_VecFP, + [(set f64:$XT, (fcopysign f64:$XB, f64:$XA))]>; + + def XVABSDP : XX2Form<60, 473, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvabsdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fabs v2f64:$XB))]>; + + def XVABSSP : XX2Form<60, 409, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvabssp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fabs v4f32:$XB))]>; + + def XVCPSGNDP : XX3Form<60, 240, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcpsgndp $XT, $XA, $XB", IIC_VecFP, + [(set v2f64:$XT, (fcopysign v2f64:$XB, 
v2f64:$XA))]>; + def XVCPSGNSP : XX3Form<60, 208, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvcpsgnsp $XT, $XA, $XB", IIC_VecFP, + [(set v4f32:$XT, (fcopysign v4f32:$XB, v4f32:$XA))]>; + + def XVNABSDP : XX2Form<60, 489, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnabsdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg (fabs v2f64:$XB)))]>; + def XVNABSSP : XX2Form<60, 425, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnabssp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg (fabs v4f32:$XB)))]>; + + def XVNEGDP : XX2Form<60, 505, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnegdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fneg v2f64:$XB))]>; + def XVNEGSP : XX2Form<60, 441, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvnegsp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fneg v4f32:$XB))]>; + + // Conversion Instructions + def XSCVDPSP : XX2Form<60, 265, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsp $XT, $XB", IIC_VecFP, []>; + def XSCVDPSXDS : XX2Form<60, 344, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsxds $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctidz f64:$XB))]>; + def XSCVDPSXWS : XX2Form<60, 88, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpsxws $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiwz f64:$XB))]>; + def XSCVDPUXDS : XX2Form<60, 328, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpuxds $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiduz f64:$XB))]>; + def XSCVDPUXWS : XX2Form<60, 72, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvdpuxws $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; + def XSCVSPDP : XX2Form<60, 329, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvspdp $XT, $XB", IIC_VecFP, []>; + def XSCVSXDDP : XX2Form<60, 376, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvsxddp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfcfid f64:$XB))]>; + def XSCVUXDDP : XX2Form<60, 360, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xscvuxddp $XT, $XB", IIC_VecFP, + [(set f64:$XT, (PPCfcfidu f64:$XB))]>; + + def XVCVDPSP : XX2Form<60, 393, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsp $XT, $XB", IIC_VecFP, []>; + def XVCVDPSXDS : XX2Form<60, 472, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsxds $XT, $XB", IIC_VecFP, + [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>; + def XVCVDPSXWS : XX2Form<60, 216, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpsxws $XT, $XB", IIC_VecFP, []>; + def XVCVDPUXDS : XX2Form<60, 456, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpuxds $XT, $XB", IIC_VecFP, + [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>; + def XVCVDPUXWS : XX2Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvdpuxws $XT, $XB", IIC_VecFP, []>; + + def XVCVSPDP : XX2Form<60, 457, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspdp $XT, $XB", IIC_VecFP, []>; + def XVCVSPSXDS : XX2Form<60, 408, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspsxds $XT, $XB", IIC_VecFP, []>; + def XVCVSPSXWS : XX2Form<60, 152, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspsxws $XT, $XB", IIC_VecFP, []>; + def XVCVSPUXDS : XX2Form<60, 392, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspuxds $XT, $XB", IIC_VecFP, []>; + def XVCVSPUXWS : XX2Form<60, 136, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvspuxws $XT, $XB", IIC_VecFP, []>; + def XVCVSXDDP : XX2Form<60, 504, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxddp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>; + def XVCVSXDSP : XX2Form<60, 440, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxdsp $XT, $XB", IIC_VecFP, []>; + def XVCVSXWDP : XX2Form<60, 248, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxwdp $XT, $XB", IIC_VecFP, []>; + def XVCVSXWSP : XX2Form<60, 184, + (outs vsrc:$XT), (ins 
vsrc:$XB), + "xvcvsxwsp $XT, $XB", IIC_VecFP, []>; + def XVCVUXDDP : XX2Form<60, 488, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxddp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>; + def XVCVUXDSP : XX2Form<60, 424, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxdsp $XT, $XB", IIC_VecFP, []>; + def XVCVUXWDP : XX2Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxwdp $XT, $XB", IIC_VecFP, []>; + def XVCVUXWSP : XX2Form<60, 168, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxwsp $XT, $XB", IIC_VecFP, []>; + + // Rounding Instructions + def XSRDPI : XX2Form<60, 73, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpi $XT, $XB", IIC_VecFP, + [(set f64:$XT, (frnd f64:$XB))]>; + def XSRDPIC : XX2Form<60, 107, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpic $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fnearbyint f64:$XB))]>; + def XSRDPIM : XX2Form<60, 121, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpim $XT, $XB", IIC_VecFP, + [(set f64:$XT, (ffloor f64:$XB))]>; + def XSRDPIP : XX2Form<60, 105, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpip $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fceil f64:$XB))]>; + def XSRDPIZ : XX2Form<60, 89, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpiz $XT, $XB", IIC_VecFP, + [(set f64:$XT, (ftrunc f64:$XB))]>; + + def XVRDPI : XX2Form<60, 201, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpi $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (frnd v2f64:$XB))]>; + def XVRDPIC : XX2Form<60, 235, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpic $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + def XVRDPIM : XX2Form<60, 249, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpim $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (ffloor v2f64:$XB))]>; + def XVRDPIP : XX2Form<60, 233, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpip $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fceil v2f64:$XB))]>; + def XVRDPIZ : XX2Form<60, 217, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrdpiz $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (ftrunc v2f64:$XB))]>; + + def XVRSPI : XX2Form<60, 137, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspi $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (frnd v4f32:$XB))]>; + def XVRSPIC : XX2Form<60, 171, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspic $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; + def XVRSPIM : XX2Form<60, 185, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspim $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (ffloor v4f32:$XB))]>; + def XVRSPIP : XX2Form<60, 169, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspip $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fceil v4f32:$XB))]>; + def XVRSPIZ : XX2Form<60, 153, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspiz $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (ftrunc v4f32:$XB))]>; + + // Max/Min Instructions + let isCommutable = 1 in { + def XSMAXDP : XX3Form<60, 160, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmaxdp $XT, $XA, $XB", IIC_VecFP, []>; + def XSMINDP : XX3Form<60, 168, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmindp $XT, $XA, $XB", IIC_VecFP, []>; + + def XVMAXDP : XX3Form<60, 224, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxdp $XT, $XA, $XB", IIC_VecFP, []>; + def XVMINDP : XX3Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmindp $XT, $XA, $XB", IIC_VecFP, []>; + + def XVMAXSP : XX3Form<60, 192, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxsp $XT, $XA, $XB", IIC_VecFP, []>; + def XVMINSP : XX3Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvminsp $XT, $XA, $XB", IIC_VecFP, []>; + } // isCommutable +} // Uses = [RM] + + // Logical Instructions + 
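+  // For example, xxlandc below computes XA & ~XB and xxlnor ~(XA | XB);
+  // XXLORf is a codegen-only xxlor on the scalar (VSFRC) class, presumably
+  // so f64 copies can be selected without leaving the VSX register file.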
let isCommutable = 1 in + def XXLAND : XX3Form<60, 130, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxland $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (and v4i32:$XA, v4i32:$XB))]>; + def XXLANDC : XX3Form<60, 138, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlandc $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (and v4i32:$XA, + (vnot_ppc v4i32:$XB)))]>; + let isCommutable = 1 in { + def XXLNOR : XX3Form<60, 162, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlnor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (vnot_ppc (or v4i32:$XA, + v4i32:$XB)))]>; + def XXLOR : XX3Form<60, 146, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (or v4i32:$XA, v4i32:$XB))]>; + let isCodeGenOnly = 1 in + def XXLORf: XX3Form<60, 146, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xxlor $XT, $XA, $XB", IIC_VecGeneral, []>; + def XXLXOR : XX3Form<60, 154, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxlxor $XT, $XA, $XB", IIC_VecGeneral, + [(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>; + } // isCommutable + + // Permutation Instructions + def XXMRGHW : XX3Form<60, 18, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxmrghw $XT, $XA, $XB", IIC_VecPerm, []>; + def XXMRGLW : XX3Form<60, 50, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xxmrglw $XT, $XA, $XB", IIC_VecPerm, []>; + + def XXPERMDI : XX3Form_2<60, 10, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM), + "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>; + def XXSEL : XX4Form<60, 3, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC), + "xxsel $XT, $XA, $XB, $XC", IIC_VecPerm, []>; + + def XXSLDWI : XX3Form_2<60, 2, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$SHW), + "xxsldwi $XT, $XA, $XB, $SHW", IIC_VecPerm, []>; + def XXSPLTW : XX2Form_2<60, 164, + (outs vsrc:$XT), (ins vsrc:$XB, u2imm:$UIM), + "xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>; +} // neverHasSideEffects +} // AddedComplexity + +def : InstAlias<"xvmovdp $XT, $XB", + (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; +def : InstAlias<"xvmovsp $XT, $XB", + (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; + +def : InstAlias<"xxspltd $XT, $XB, 0", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>; +def : InstAlias<"xxspltd $XT, $XB, 1", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>; +def : InstAlias<"xxmrghd $XT, $XA, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>; +def : InstAlias<"xxmrgld $XT, $XA, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>; +def : InstAlias<"xxswapd $XT, $XB", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>; + +let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
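+// A worked example for the vector_extract patterns below: element 0 of a
+// v2f64 is just the register's scalar (sub_64) subregister, while element 1
+// is reached by first swapping the two doublewords:
+//   xxpermdi vT, vS, vS, 2       # DM=2 swaps vS's doublewords
+// and then reading vT's sub_64 subregister.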
+def : Pat<(v2f64 (scalar_to_vector f64:$A)), + (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; + +def : Pat<(f64 (vector_extract v2f64:$S, 0)), + (f64 (EXTRACT_SUBREG $S, sub_64))>; +def : Pat<(f64 (vector_extract v2f64:$S, 1)), + (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; + +// Additional fnmsub patterns: -a*c + b == -(a*c - b) +def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B), + (XSNMSUBADP $B, $C, $A)>; +def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B), + (XSNMSUBADP $B, $C, $A)>; + +def : Pat<(fma (fneg v2f64:$A), v2f64:$C, v2f64:$B), + (XVNMSUBADP $B, $C, $A)>; +def : Pat<(fma v2f64:$A, (fneg v2f64:$C), v2f64:$B), + (XVNMSUBADP $B, $C, $A)>; + +def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), + (XVNMSUBASP $B, $C, $A)>; +def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), + (XVNMSUBASP $B, $C, $A)>; + +def : Pat<(v2f64 (bitconvert v4f32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v4i32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v8i16:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2f64 (bitconvert v16i8:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; + +def : Pat<(v4f32 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v4i32 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v8i16 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v16i8 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +def : Pat<(v2i64 (bitconvert v4f32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2i64 (bitconvert v4i32:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2i64 (bitconvert v8i16:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; +def : Pat<(v2i64 (bitconvert v16i8:$A)), + (COPY_TO_REGCLASS $A, VSRC)>; + +def : Pat<(v4f32 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v4i32 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v8i16 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v16i8 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +def : Pat<(v2f64 (bitconvert v2i64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; +def : Pat<(v2i64 (bitconvert v2f64:$A)), + (COPY_TO_REGCLASS $A, VRRC)>; + +// sign extension patterns +// To extend "in place" from v2i32 to v2i64, we have input data like: +// | undef | i32 | undef | i32 | +// but xvcvsxwdp expects the input in big-Endian format: +// | i32 | undef | i32 | undef | +// so we need to shift everything to the left by one i32 (word) before +// the conversion. +def : Pat<(sext_inreg v2i64:$C, v2i32), + (XVCVDPSXDS (XVCVSXWDP (XXSLDWI $C, $C, 1)))>; +def : Pat<(v2f64 (sint_to_fp (sext_inreg v2i64:$C, v2i32))), + (XVCVSXWDP (XXSLDWI $C, $C, 1))>; + +} // AddedComplexity +} // HasVSX + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp new file mode 100644 index 000000000000..e5f113a0c030 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp @@ -0,0 +1,482 @@ +//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the 32-bit PowerPC target. 
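+// It also provides the matching 64-bit compilation callbacks and stubs;
+// the two are selected at runtime through PPCJITInfo::is64Bit.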
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "jit"
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+PPCJITInfo::PPCJITInfo(PPCSubtarget &STI)
+    : Subtarget(STI), is64Bit(STI.isPPC64()) {
+  useGOT = 0;
+}
+
+#define BUILD_ADDIS(RD,RS,IMM16) \
+  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+#define BUILD_MTSPR(RS,SPR) \
+  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+#define BUILD_B(TARGET, LINK) \
+  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)
+
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+  unsigned *AtI = (unsigned*)(intptr_t)At;
+
+  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
+  } else if (!is64Bit) {
+    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
+    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
+  } else {
+    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
+    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
+    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
+    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
+  }
+}
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+// The first clause of the preprocessor directive looks wrong, but it is
+// necessary when compiling this code on non-PowerPC hosts.
+#if (!defined(__ppc__) && !defined(__powerpc__)) || defined(__powerpc64__) || defined(__ppc64__)
+void PPC32CompilationCallback() {
+  llvm_unreachable("This is not a 32bit PowerPC, you can't execute this!");
+}
+#elif !defined(__ELF__)
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead, we
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13].
+    // FIXME: need to save v[0-19] for altivec?
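+    // As the offsets used below suggest, the 208 bytes appear to break
+    // down as 24 (Darwin linkage) + 32 (parameter area) + 16 (alignment
+    // pad) + 104 (f1-f13 at 72..175) + 32 (r3-r10 at 176..207).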
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   PowerPC32 ABI linkage    - 24 bytes
+    //   parameters               - 32 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          - 32 bytes
+    "mflr r0\n"
+    "stw r0, 8(r1)\n"
+    "stwu r1, -208(r1)\n"
+    // Save all int arg registers
+    "stw r10, 204(r1)\n" "stw r9, 200(r1)\n"
+    "stw r8, 196(r1)\n"  "stw r7, 192(r1)\n"
+    "stw r6, 188(r1)\n"  "stw r5, 184(r1)\n"
+    "stw r4, 180(r1)\n"  "stw r3, 176(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 168(r1)\n" "stfd f12, 160(r1)\n"
+    "stfd f11, 152(r1)\n" "stfd f10, 144(r1)\n"
+    "stfd f9, 136(r1)\n"  "stfd f8, 128(r1)\n"
+    "stfd f7, 120(r1)\n"  "stfd f6, 112(r1)\n"
+    "stfd f5, 104(r1)\n"  "stfd f4, 96(r1)\n"
+    "stfd f3, 88(r1)\n"   "stfd f2, 80(r1)\n"
+    "stfd f1, 72(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr r3, r0\n"
+    "lwz r2, 208(r1)\n" // stub's frame
+    "lwz r4, 8(r2)\n"   // stub's lr
+    "li r5, 0\n"        // 0 == 32 bit
+    "bl _LLVMPPCCompilationCallback\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "lwz r10, 204(r1)\n" "lwz r9, 200(r1)\n"
+    "lwz r8, 196(r1)\n"  "lwz r7, 192(r1)\n"
+    "lwz r6, 188(r1)\n"  "lwz r5, 184(r1)\n"
+    "lwz r4, 180(r1)\n"  "lwz r3, 176(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 168(r1)\n" "lfd f12, 160(r1)\n"
+    "lfd f11, 152(r1)\n" "lfd f10, 144(r1)\n"
+    "lfd f9, 136(r1)\n"  "lfd f8, 128(r1)\n"
+    "lfd f7, 120(r1)\n"  "lfd f6, 112(r1)\n"
+    "lfd f5, 104(r1)\n"  "lfd f4, 96(r1)\n"
+    "lfd f3, 88(r1)\n"   "lfd f2, 80(r1)\n"
+    "lfd f1, 72(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz r1, 208(r1)\n"
+    "lwz r2, 8(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+
+#else
+// ELF PPC 32 support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead, we
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 8 doubles f[1-8].
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   8 double registers       - 64 bytes
+    //   8 int registers          - 32 bytes
+    "mflr 0\n"
+    "stw 0, 4(1)\n"
+    "stwu 1, -104(1)\n"
+    // Save all int arg registers
+    "stw 10, 100(1)\n" "stw 9, 96(1)\n"
+    "stw 8, 92(1)\n"   "stw 7, 88(1)\n"
+    "stw 6, 84(1)\n"   "stw 5, 80(1)\n"
+    "stw 4, 76(1)\n"   "stw 3, 72(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 8, 64(1)\n"
+    "stfd 7, 56(1)\n"  "stfd 6, 48(1)\n"
+    "stfd 5, 40(1)\n"  "stfd 4, 32(1)\n"
+    "stfd 3, 24(1)\n"  "stfd 2, 16(1)\n"
+    "stfd 1, 8(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
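+    // r5 doubles as scratch below: the first load walks the back chain to
+    // the stub's frame, the second picks up the LR word the stub saved
+    // there, and only then is r5 set to the is64Bit flag.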
+ "mr 3, 0\n" + "lwz 5, 104(1)\n" // stub's frame + "lwz 4, 4(5)\n" // stub's lr + "li 5, 0\n" // 0 == 32 bit + "bl LLVMPPCCompilationCallback\n" + "mtctr 3\n" + // Restore all int arg registers + "lwz 10, 100(1)\n" "lwz 9, 96(1)\n" + "lwz 8, 92(1)\n" "lwz 7, 88(1)\n" + "lwz 6, 84(1)\n" "lwz 5, 80(1)\n" + "lwz 4, 76(1)\n" "lwz 3, 72(1)\n" + // Restore all FP arg registers + "lfd 8, 64(1)\n" + "lfd 7, 56(1)\n" "lfd 6, 48(1)\n" + "lfd 5, 40(1)\n" "lfd 4, 32(1)\n" + "lfd 3, 24(1)\n" "lfd 2, 16(1)\n" + "lfd 1, 8(1)\n" + // Pop 3 frames off the stack and branch to target + "lwz 1, 104(1)\n" + "lwz 0, 4(1)\n" + "mtlr 0\n" + "bctr\n" + ); +#endif + +#if !defined(__powerpc64__) && !defined(__ppc64__) +void PPC64CompilationCallback() { + llvm_unreachable("This is not a 64bit PowerPC, you can't execute this!"); +} +#else +# ifdef __ELF__ +asm( + ".text\n" + ".align 2\n" + ".globl PPC64CompilationCallback\n" +#if _CALL_ELF == 2 + ".type PPC64CompilationCallback,@function\n" +"PPC64CompilationCallback:\n" +#else + ".section \".opd\",\"aw\",@progbits\n" + ".align 3\n" +"PPC64CompilationCallback:\n" + ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n" + ".size PPC64CompilationCallback,24\n" + ".previous\n" + ".align 4\n" + ".type PPC64CompilationCallback,@function\n" +".L.PPC64CompilationCallback:\n" +#endif +# else +asm( + ".text\n" + ".align 2\n" + ".globl _PPC64CompilationCallback\n" +"_PPC64CompilationCallback:\n" +# endif + // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the + // FIXME: need to save v[0-19] for altivec? + // Set up a proper stack frame + // Layout + // PowerPC64 ABI linkage - 48 bytes + // parameters - 64 bytes + // 13 double registers - 104 bytes + // 8 int registers - 64 bytes + "mflr 0\n" + "std 0, 16(1)\n" + "stdu 1, -280(1)\n" + // Save all int arg registers + "std 10, 272(1)\n" "std 9, 264(1)\n" + "std 8, 256(1)\n" "std 7, 248(1)\n" + "std 6, 240(1)\n" "std 5, 232(1)\n" + "std 4, 224(1)\n" "std 3, 216(1)\n" + // Save all call-clobbered FP regs. + "stfd 13, 208(1)\n" "stfd 12, 200(1)\n" + "stfd 11, 192(1)\n" "stfd 10, 184(1)\n" + "stfd 9, 176(1)\n" "stfd 8, 168(1)\n" + "stfd 7, 160(1)\n" "stfd 6, 152(1)\n" + "stfd 5, 144(1)\n" "stfd 4, 136(1)\n" + "stfd 3, 128(1)\n" "stfd 2, 120(1)\n" + "stfd 1, 112(1)\n" + // Arguments to Compilation Callback: + // r3 - our lr (address of the call instruction in stub plus 4) + // r4 - stub's lr (address of instruction that called the stub plus 4) + // r5 - is64Bit - always 1. + "mr 3, 0\n" // return address (still in r0) + "ld 5, 280(1)\n" // stub's frame + "ld 4, 16(5)\n" // stub's lr + "li 5, 1\n" // 1 == 64 bit +# ifdef __ELF__ + "bl LLVMPPCCompilationCallback\n" + "nop\n" +# else + "bl _LLVMPPCCompilationCallback\n" +# endif + "mtctr 3\n" + // Restore all int arg registers + "ld 10, 272(1)\n" "ld 9, 264(1)\n" + "ld 8, 256(1)\n" "ld 7, 248(1)\n" + "ld 6, 240(1)\n" "ld 5, 232(1)\n" + "ld 4, 224(1)\n" "ld 3, 216(1)\n" + // Restore all FP arg registers + "lfd 13, 208(1)\n" "lfd 12, 200(1)\n" + "lfd 11, 192(1)\n" "lfd 10, 184(1)\n" + "lfd 9, 176(1)\n" "lfd 8, 168(1)\n" + "lfd 7, 160(1)\n" "lfd 6, 152(1)\n" + "lfd 5, 144(1)\n" "lfd 4, 136(1)\n" + "lfd 3, 128(1)\n" "lfd 2, 120(1)\n" + "lfd 1, 112(1)\n" + // Pop 3 frames off the stack and branch to target + "ld 1, 280(1)\n" + "ld 0, 16(1)\n" + "mtlr 0\n" + // XXX: any special TOC handling in the ELF case for JIT? 
+ "bctr\n" + ); +#endif + +extern "C" { +LLVM_LIBRARY_VISIBILITY void * +LLVMPPCCompilationCallback(unsigned *StubCallAddrPlus4, + unsigned *OrigCallAddrPlus4, + bool is64Bit) { + // Adjust the pointer to the address of the call instruction in the stub + // emitted by emitFunctionStub, rather than the instruction after it. + unsigned *StubCallAddr = StubCallAddrPlus4 - 1; + unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1; + + void *Target = JITCompilerFunction(StubCallAddr); + + // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite + // it to branch directly to the destination. If so, rewrite it so it does not + // need to go through the stub anymore. + unsigned OrigCallInst = *OrigCallAddr; + if ((OrigCallInst >> 26) == 18) { // Direct call. + intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2; + + if (Offset >= -(1 << 23) && Offset < (1 << 23)) { // In range? + // Clear the original target out. + OrigCallInst &= (63 << 26) | 3; + // Fill in the new target. + OrigCallInst |= (Offset & ((1 << 24)-1)) << 2; + // Replace the call. + *OrigCallAddr = OrigCallInst; + } + } + + // Assert that we are coming from a stub that was created with our + // emitFunctionStub. + if ((*StubCallAddr >> 26) == 18) + StubCallAddr -= 3; + else { + assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!"); + StubCallAddr -= is64Bit ? 9 : 6; + } + + // Rewrite the stub with an unconditional branch to the target, for any users + // who took the address of the stub. + EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit); + sys::Memory::InvalidateInstructionCache(StubCallAddr, 7*4); + + // Put the address of the target function to call and the address to return to + // after calling the target function in a place that is easy to get on the + // stack after we restore all regs. + return Target; +} +} + + + +TargetJITInfo::LazyResolverFn +PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) { + JITCompilerFunction = Fn; + return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback; +} + +TargetJITInfo::StubLayout PPCJITInfo::getStubLayout() { + // The stub contains up to 10 4-byte instructions, aligned at 4 bytes: 3 + // instructions to save the caller's address if this is a lazy-compilation + // stub, plus a 1-, 4-, or 7-instruction sequence to load an arbitrary address + // into a register and jump through it. + StubLayout Result = {10*4, 4}; + return Result; +} + +#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \ +defined(__APPLE__) +extern "C" void sys_icache_invalidate(const void *Addr, size_t len); +#endif + +void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn, + JITCodeEmitter &JCE) { + // If this is just a call to an external function, emit a branch instead of a + // call. The code is the same except for one bit of the last instruction. 
+ if (Fn != (void*)(intptr_t)PPC32CompilationCallback && + Fn != (void*)(intptr_t)PPC64CompilationCallback) { + void *Addr = (void*)JCE.getCurrentPCValue(); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + EmitBranchToAt((intptr_t)Addr, (intptr_t)Fn, false, is64Bit); + sys::Memory::InvalidateInstructionCache(Addr, 7*4); + return Addr; + } + + void *Addr = (void*)JCE.getCurrentPCValue(); + if (is64Bit) { + JCE.emitWordBE(0xf821ffb1); // stdu r1,-80(r1) + JCE.emitWordBE(0x7d6802a6); // mflr r11 + JCE.emitWordBE(0xf9610060); // std r11, 96(r1) + } else if (Subtarget.isDarwinABI()){ + JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) + JCE.emitWordBE(0x7d6802a6); // mflr r11 + JCE.emitWordBE(0x91610028); // stw r11, 40(r1) + } else { + JCE.emitWordBE(0x9421ffe0); // stwu r1,-32(r1) + JCE.emitWordBE(0x7d6802a6); // mflr r11 + JCE.emitWordBE(0x91610024); // stw r11, 36(r1) + } + intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue(); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + JCE.emitWordBE(0); + EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit); + sys::Memory::InvalidateInstructionCache(Addr, 10*4); + return Addr; +} + + +void PPCJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4; + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((PPC::RelocationType)MR->getRelocationType()) { + default: llvm_unreachable("Unknown relocation type!"); + case PPC::reloc_pcrel_bx: + // PC-relative relocation for b and bl instructions. + ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2; + assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) && + "Relocation out of range!"); + *RelocPos |= (ResultPtr & ((1 << 24)-1)) << 2; + break; + case PPC::reloc_pcrel_bcx: + // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other + // bcx instructions. + ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2; + assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) && + "Relocation out of range!"); + *RelocPos |= (ResultPtr & ((1 << 14)-1)) << 2; + break; + case PPC::reloc_absolute_high: // high bits of ref -> low 16 of instr + case PPC::reloc_absolute_low: { // low bits of ref -> low 16 of instr + ResultPtr += MR->getConstantVal(); + + // If this is a high-part access, get the high-part. + if (MR->getRelocationType() == PPC::reloc_absolute_high) { + // If the low part will have a carry (really a borrow) from the low + // 16-bits into the high 16, add a bit to borrow from. + if (((int)ResultPtr << 16) < 0) + ResultPtr += 1 << 16; + ResultPtr >>= 16; + } + + // Do the addition then mask, so the addition does not overflow the 16-bit + // immediate section of the instruction. + unsigned LowBits = (*RelocPos + ResultPtr) & 65535; + unsigned HighBits = *RelocPos & ~65535; + *RelocPos = LowBits | HighBits; // Slam into low 16-bits + break; + } + case PPC::reloc_absolute_low_ix: { // low bits of ref -> low 14 of instr + ResultPtr += MR->getConstantVal(); + // Do the addition then mask, so the addition does not overflow the 16-bit + // immediate section of the instruction. + unsigned LowBits = (*RelocPos + ResultPtr) & 0xFFFC; + unsigned HighBits = *RelocPos & 0xFFFF0003; + *RelocPos = LowBits | HighBits; // Slam into low 14-bits. 
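+      // The 0xFFFC / 0xFFFF0003 masks match DS-form encodings (e.g. ld,
+      // std, lwa), where the two low bits of the displacement halfword
+      // belong to the opcode, so the fixed-up offset must stay 4-byte
+      // aligned.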
+ break; + } + } + } +} + +void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit); + sys::Memory::InvalidateInstructionCache(Old, 7*4); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h new file mode 100644 index 000000000000..b6b37ffb852b --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h @@ -0,0 +1,46 @@ +//===-- PPCJITInfo.h - PowerPC impl. of the JIT interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC_JITINFO_H +#define POWERPC_JITINFO_H + +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { +class PPCSubtarget; +class PPCJITInfo : public TargetJITInfo { +protected: + PPCSubtarget &Subtarget; + bool is64Bit; + +public: + PPCJITInfo(PPCSubtarget &STI); + + StubLayout getStubLayout() override; + void *emitFunctionStub(const Function *F, void *Fn, + JITCodeEmitter &JCE) override; + LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; + void relocate(void *Function, MachineRelocation *MR, unsigned NumRelocs, + unsigned char *GOTBase) override; + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + void replaceMachineCodeForFunction(void *Old, void *New) override; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp new file mode 100644 index 000000000000..668041371780 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -0,0 +1,216 @@ +//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower PPC MachineInstrs to their corresponding +// MCInst records. 
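+// Most of the work is mapping each MachineOperand to an MCOperand and
+// wrapping symbol references in the appropriate expression kind (TLS,
+// TOC, PLT, Darwin stubs, ha16/lo16).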
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "MCTargetDesc/PPCMCExpr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
+  return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
+  const TargetMachine &TM = AP.TM;
+  Mangler *Mang = AP.Mang;
+  const DataLayout *DL = TM.getDataLayout();
+  MCContext &Ctx = AP.OutContext;
+  bool isDarwin = TM.getSubtarget<PPCSubtarget>().isDarwin();
+
+  SmallString<128> Name;
+  StringRef Suffix;
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB) {
+    if (isDarwin)
+      Suffix = "$stub";
+  } else if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
+    Suffix = "$non_lazy_ptr";
+
+  if (!Suffix.empty())
+    Name += DL->getPrivateGlobalPrefix();
+
+  unsigned PrefixLen = Name.size();
+
+  if (!MO.isGlobal()) {
+    assert(MO.isSymbol() && "Isn't a symbol reference");
+    Mang->getNameWithPrefix(Name, MO.getSymbolName());
+  } else {
+    const GlobalValue *GV = MO.getGlobal();
+    TM.getNameWithPrefix(Name, GV, *Mang);
+  }
+
+  unsigned OrigLen = Name.size() - PrefixLen;
+
+  Name += Suffix;
+  MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+  StringRef OrigName = StringRef(Name).substr(PrefixLen, OrigLen);
+
+  // If the target flags on the operand change the name of the symbol, do that
+  // before we return the symbol.
+  if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && isDarwin) {
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI(AP).getFnStubEntry(Sym);
+    if (StubSym.getPointer())
+      return Sym;
+
+    if (MO.isGlobal()) {
+      StubSym =
+        MachineModuleInfoImpl::
+          StubValueTy(AP.getSymbol(MO.getGlobal()),
+                      !MO.getGlobal()->hasInternalLinkage());
+    } else {
+      StubSym =
+        MachineModuleInfoImpl::
+          StubValueTy(Ctx.GetOrCreateSymbol(OrigName), false);
+    }
+    return Sym;
+  }
+
+  // If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
+  // then add the suffix.
+  if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
+    MachineModuleInfoMachO &MachO = getMachOMMI(AP);
+
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ?
+ MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym); + + if (!StubSym.getPointer()) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = MachineModuleInfoImpl:: + StubValueTy(AP.getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + + return Sym; +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + AsmPrinter &Printer, bool isDarwin) { + MCContext &Ctx = Printer.OutContext; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK; + + switch (access) { + case PPCII::MO_TPREL_LO: + RefKind = MCSymbolRefExpr::VK_PPC_TPREL_LO; + break; + case PPCII::MO_TPREL_HA: + RefKind = MCSymbolRefExpr::VK_PPC_TPREL_HA; + break; + case PPCII::MO_DTPREL_LO: + RefKind = MCSymbolRefExpr::VK_PPC_DTPREL_LO; + break; + case PPCII::MO_TLSLD_LO: + RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO; + break; + case PPCII::MO_TOC_LO: + RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO; + break; + case PPCII::MO_TLS: + RefKind = MCSymbolRefExpr::VK_PPC_TLS; + break; + } + + if (MO.getTargetFlags() == PPCII::MO_PLT_OR_STUB && !isDarwin) + RefKind = MCSymbolRefExpr::VK_PLT; + + const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx); + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(MO.getOffset(), Ctx), + Ctx); + + // Subtract off the PIC base if required. + if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + + const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); + Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx); + } + + // Add ha16() / lo16() markers if required. 
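+  // As an illustration (not emitted verbatim here), a 32-bit address is
+  // typically materialized as
+  //   lis r3, sym@ha    ; addi r3, r3, sym@l       (ELF syntax)
+  //   lis r3, ha16(sym) ; la   r3, lo16(sym)(r3)   (Darwin syntax)
+  // where @ha differs from a plain high half by compensating for the sign
+  // of the low 16 bits.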
+ switch (access) { + case PPCII::MO_LO: + Expr = PPCMCExpr::CreateLo(Expr, isDarwin, Ctx); + break; + case PPCII::MO_HA: + Expr = PPCMCExpr::CreateHa(Expr, isDarwin, Ctx); + break; + } + + return MCOperand::CreateExpr(Expr); +} + +void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AsmPrinter &AP, bool isDarwin) { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), AP.OutContext)); + break; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin); + break; + case MachineOperand::MO_BlockAddress: + MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP, + isDarwin); + break; + case MachineOperand::MO_RegisterMask: + continue; + } + + OutMI.addOperand(MCOp); + } +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp new file mode 100644 index 000000000000..9da1b1b5c754 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -0,0 +1,23 @@ +//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PPCMachineFunctionInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +void PPCFunctionInfo::anchor() { } + +MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { + const DataLayout *DL = MF.getTarget().getDataLayout(); + return MF.getContext().GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+ + Twine(MF.getFunctionNumber())+"$poff"); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h new file mode 100644 index 000000000000..9a2cec744274 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -0,0 +1,191 @@ +//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of MachineFunctionInfo. 
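+// The state tracked here is mostly frame-lowering bookkeeping: save-slot
+// frame indices, spill flags, varargs positions, and CR/LR save
+// requirements.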
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private PowerPC target-specific information for each
+/// MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored. Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub).
+  int FramePointerSaveIndex;
+
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  ///
+  int ReturnAddrSaveIndex;
+
+  /// Frame index where the old base pointer is stored.
+  int BasePointerSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function. This is only valid after the initial scan of the function by
+  /// PEI.
+  bool MustSaveLR;
+
+  /// Does this function have any stack spills?
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms?
+  bool HasNonRISpills;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// Indicates whether VRSAVE is spilled in the current function.
+  bool SpillsVRSAVE;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code. This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call? Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// VarArgsStackOffset - StackOffset for start of stack
+  /// arguments.
+  int VarArgsStackOffset;
+  /// VarArgsNumGPR - Index of the first unused integer
+  /// register for parameter passing.
+  unsigned VarArgsNumGPR;
+  /// VarArgsNumFPR - Index of the first unused double
+  /// register for parameter passing.
+  unsigned VarArgsNumFPR;
+
+  /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
+  int CRSpillFrameIndex;
+
+  /// If any of CR[2-4] need to be saved in the prologue and restored in the
+  /// epilogue then they are added to this array. This is used for the
+  /// 64-bit SVR4 ABI.
+  SmallVector<unsigned, 3> MustSaveCRs;
+
+  /// Hold onto our MachineFunction context.
+  MachineFunction &MF;
+
+  /// Whether this uses the PIC Base register or not.
+ bool UsesPICBase; + +public: + explicit PPCFunctionInfo(MachineFunction &MF) + : FramePointerSaveIndex(0), + ReturnAddrSaveIndex(0), + BasePointerSaveIndex(0), + HasSpills(false), + HasNonRISpills(false), + SpillsCR(false), + SpillsVRSAVE(false), + LRStoreRequired(false), + MinReservedArea(0), + TailCallSPDelta(0), + HasFastCall(false), + VarArgsFrameIndex(0), + VarArgsStackOffset(0), + VarArgsNumGPR(0), + VarArgsNumFPR(0), + CRSpillFrameIndex(0), + MF(MF), + UsesPICBase(0) {} + + int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } + void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } + + int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; } + void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; } + + int getBasePointerSaveIndex() const { return BasePointerSaveIndex; } + void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; } + + unsigned getMinReservedArea() const { return MinReservedArea; } + void setMinReservedArea(unsigned size) { MinReservedArea = size; } + + int getTailCallSPDelta() const { return TailCallSPDelta; } + void setTailCallSPDelta(int size) { TailCallSPDelta = size; } + + /// MustSaveLR - This is set when the prolog/epilog inserter does its initial + /// scan of the function. It is true if the LR/LR8 register is ever explicitly + /// defined/clobbered in the machine function (e.g. by calls and movpctolr, + /// which is used in PIC generation), or if the LR stack slot is explicitly + /// referenced by builtin_return_address. + void setMustSaveLR(bool U) { MustSaveLR = U; } + bool mustSaveLR() const { return MustSaveLR; } + + void setHasSpills() { HasSpills = true; } + bool hasSpills() const { return HasSpills; } + + void setHasNonRISpills() { HasNonRISpills = true; } + bool hasNonRISpills() const { return HasNonRISpills; } + + void setSpillsCR() { SpillsCR = true; } + bool isCRSpilled() const { return SpillsCR; } + + void setSpillsVRSAVE() { SpillsVRSAVE = true; } + bool isVRSAVESpilled() const { return SpillsVRSAVE; } + + void setLRStoreRequired() { LRStoreRequired = true; } + bool isLRStoreRequired() const { return LRStoreRequired; } + + void setHasFastCall() { HasFastCall = true; } + bool hasFastCall() const { return HasFastCall;} + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; } + + int getVarArgsStackOffset() const { return VarArgsStackOffset; } + void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; } + + unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; } + void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; } + + unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; } + void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; } + + int getCRSpillFrameIndex() const { return CRSpillFrameIndex; } + void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; } + + const SmallVectorImpl<unsigned> & + getMustSaveCRs() const { return MustSaveCRs; } + void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); } + + void setUsesPICBase(bool uses) { UsesPICBase = uses; } + bool usesPICBase() const { return UsesPICBase; } + + MCSymbol *getPICOffsetSymbol() const; +}; + +} // end of namespace llvm + + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h new file mode 100644 index 000000000000..17b836d1ed97 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h @@ -0,0 +1,6586 @@ 
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle without using vperm. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 292 entries have cost 1 +// 1384 entries have cost 2 +// 3061 entries have cost 3 +// 1733 entries have cost 4 +// 60 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. +static const unsigned PerfectShuffleTable[6561+1] = { + 202162278U, // <0,0,0,0>: Cost 1 vspltisw0 LHS + 1140850790U, // <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS + 2617247181U, // <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0> + 2635163787U, // <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0> + 1543507254U, // <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS + 2281701705U, // <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5> + 2617250133U, // <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0> + 2659054575U, // <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,0,u>: Cost 1 vspltisw0 LHS + 1141686282U, // <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1> + 67944550U, // <0,0,1,1>: Cost 1 vmrghw LHS, LHS + 1685241958U, // <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2215870716U, // <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1141727570U, // <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 2215428562U, // <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7> + 2215428589U, // <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7> + 2659062768U, // <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1> + 67945117U, // <0,0,1,u>: Cost 1 vmrghw LHS, LHS + 2684356045U, // <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0> + 2216009830U, // <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2216009901U, // <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2> + 2698290853U, // <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0> + 3289751890U, // <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5> + 3758098275U, // <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1> + 2684356538U, // <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7> + 3758098410U, // <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1> + 2216010397U, // <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS + 2702272651U, // <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0> + 2216656998U, // <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 3844669704U, // <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3> + 2216657148U, // <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0> + 2684357122U, // <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6> + 3732820066U, // <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0> + 3778005624U, // <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7> + 3374713464U, // <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7> + 2216657565U, // <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217361408U, // <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0> + 1143619686U, // <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 3291103405U, // <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2> + 3827269988U, // <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5> + 1143619922U, // <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1610616118U, // <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 3758099833U, // <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2> + 3854107016U, // <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5> + 
1143620253U, // <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2284396544U, // <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0> + 2218025062U, // <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS + 3758100203U, // <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3> + 3395966100U, // <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3> + 3804549052U, // <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5> + 2302314964U, // <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5> + 2785821138U, // <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 3395966428U, // <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7> + 2787148260U, // <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7> + 2684358997U, // <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0> + 2218631270U, // <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2684359162U, // <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3> + 3758101042U, // <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5> + 3732843830U, // <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS + 3758101227U, // <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1> + 2684359480U, // <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6> + 2724836173U, // <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0> + 2725499806U, // <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0> + 2726163439U, // <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0> + 2219311206U, // <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS + 3868557900U, // <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3> + 3377400112U, // <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3> + 2684360038U, // <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6> + 3732852834U, // <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0> + 3871507060U, // <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7> + 2303658616U, // <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7> + 2726163439U, // <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0> + 202162278U, // <0,0,u,0>: Cost 1 vspltisw0 LHS + 72589414U, // <0,0,u,1>: Cost 1 vmrghw LHS, LHS + 1685242525U, // <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS + 2220073212U, // <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0> + 1146331474U, // <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5> + 1610619034U, // <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS + 2785821138U, // <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7> + 2659120119U, // <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u> + 72589981U, // <0,0,u,u>: Cost 1 vmrghw LHS, LHS + 2698297344U, // <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0> + 1624555622U, // <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 2758984428U, // <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1> + 2635237524U, // <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0> + 2693652818U, // <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5> + 2281701714U, // <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5> + 2698297846U, // <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7> + 2659128312U, // <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0> + 1624556189U, // <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS + 1543585802U, // <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1> + 1141728052U, // <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1141728150U, // <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2295644334U, // <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3> + 1543589174U, // <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS + 2290999634U, // <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5> + 2617332135U, // <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1> + 2617332720U, // <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1> + 1142171004U, // <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0> + 1561509990U, // <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS + 2623308516U, // <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2> + 
2698298984U, // <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1561513270U, // <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS + 2647199304U, // <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2> + 2698299322U, // <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7> + 1585402874U, // <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2698299540U, // <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0> + 3290399540U, // <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1> + 2698299720U, // <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0> + 2698299804U, // <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3> + 2698299906U, // <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6> + 3832726521U, // <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0> + 2724842160U, // <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0> + 2706926275U, // <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1> + 2698300190U, // <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2> + 2635268198U, // <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS + 2217362228U, // <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1> + 2217362326U, // <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0> + 2635270296U, // <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4> + 2635271478U, // <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS + 1624558902U, // <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2659160910U, // <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1> + 2659161084U, // <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4> + 1624559145U, // <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 3832726639U, // <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1> + 2714889871U, // <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1> + 2302314646U, // <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 3834717321U, // <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0> + 3832726679U, // <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5> + 2717544403U, // <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1> + 2718208036U, // <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1> + 3792613493U, // <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1> + 2719535302U, // <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1> + 2659172454U, // <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS + 3832726735U, // <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7> + 2724844026U, // <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3> + 3775361608U, // <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0> + 2659175734U, // <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS + 3832726771U, // <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7> + 2724844344U, // <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6> + 1651102542U, // <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1> + 1651766175U, // <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1> + 2724844536U, // <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0> + 3377397770U, // <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1> + 2698302636U, // <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0> + 2728162531U, // <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1> + 2724844902U, // <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6> + 3377398098U, // <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5> + 2724845076U, // <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0> + 2724845164U, // <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7> + 2724845186U, // <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2> + 1561559142U, // <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS + 1146331956U, // <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1> + 1146332054U, // <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1561562422U, // <0,1,u,4>: Cost 2 vsldoi4 
<3,0,1,u>, RHS + 1624561818U, // <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS + 2220074191U, // <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7> + 1585452032U, // <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2214593997U, // <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0> + 2214675999U, // <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1> + 2214594152U, // <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2> + 1207959654U, // <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 3709054262U, // <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS + 3375350836U, // <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5> + 2214594490U, // <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7> + 3288336362U, // <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1> + 1207959659U, // <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS + 2215871994U, // <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2215470623U, // <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1141728872U, // <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1141728934U, // <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2215872323U, // <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2215872405U, // <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1141729210U, // <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2215430122U, // <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1141729368U, // <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3> + 3289736698U, // <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0> + 3289744927U, // <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1> + 2216011368U, // <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2> + 2216019622U, // <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1> + 3289769795U, // <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5> + 3289778069U, // <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6> + 2216044474U, // <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7> + 3732960259U, // <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2> + 2216061016U, // <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3> + 2758985382U, // <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1> + 2758985392U, // <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2> + 3290400360U, // <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2> + 2758985408U, // <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0> + 2758985422U, // <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5> + 2785822424U, // <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6> + 3290400698U, // <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7> + 2765915876U, // <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0> + 2758985453U, // <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0> + 3291104762U, // <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0> + 2217362979U, // <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2217363048U, // <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2> + 2217363110U, // <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1> + 3291105087U, // <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1> + 3291105173U, // <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6> + 2217363386U, // <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7> + 3788639688U, // <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0> + 2217363515U, // <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1> + 3376054371U, // <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0> + 3788639888U, // <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2> + 3376055912U, // <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2> + 2302312550U, // <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS + 3376054375U, // <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4> + 3374728244U, // <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5> + 3805229154U, // <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0> + 3376055512U, // <0,2,5,7>: Cost 4 vmrglw 
<3,4,0,5>, <1,6,2,7> + 2302312555U, // <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS + 3709100134U, // <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS + 3709100950U, // <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0> + 3709102010U, // <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7> + 2758985658U, // <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7> + 3709103414U, // <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS + 3732992098U, // <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0> + 3292374970U, // <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7> + 3798594383U, // <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2> + 2758985703U, // <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7> + 3788641274U, // <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2> + 3377398508U, // <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1> + 3377398590U, // <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2> + 2303656038U, // <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 3709111606U, // <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS + 3377398836U, // <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5> + 3803903447U, // <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2> + 3293054954U, // <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1> + 2303656043U, // <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 2220074490U, // <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0> + 2220074527U, // <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1> + 1146332776U, // <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1146332838U, // <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1> + 2220074819U, // <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5> + 2220074901U, // <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6> + 1146333114U, // <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2220074986U, // <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1146333243U, // <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1> + 2629410816U, // <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0> + 2753530006U, // <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2> + 2629412301U, // <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0> + 2214594972U, // <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3> + 2758985908U, // <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5> + 3733016674U, // <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0> + 3777364488U, // <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7> + 2281703354U, // <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7> + 2758985941U, // <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2> + 1141729430U, // <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2215471334U, // <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2215471425U, // <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1141729692U, // <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1141729794U, // <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2215430738U, // <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5> + 2215430776U, // <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7> + 2295646138U, // <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7> + 1141730078U, // <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2758986032U, // <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3> + 3709141910U, // <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0> + 3289753921U, // <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2> + 2770929992U, // <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0> + 3289754114U, // <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6> + 3362095460U, // <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5> + 3832727910U, // <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3> + 3365414842U, // <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7> + 2771298677U, // <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0> + 2216659094U, // <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2> + 3290409190U, // <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, 
<3,1,1,1> + 2703624496U, // <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3> + 2216683932U, // <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3> + 2216692226U, // <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6> + 3733041250U, // <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0> + 3832727988U, // <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0> + 3374712762U, // <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7> + 2216725278U, // <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2> + 2217363606U, // <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2> + 3291105510U, // <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1> + 3291105601U, // <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2> + 2217363868U, // <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3> + 2217363970U, // <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6> + 2758986242U, // <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6> + 3727077685U, // <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4> + 3364767674U, // <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7> + 2217364254U, // <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2> + 3832728102U, // <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6> + 3405916003U, // <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1> + 3376055840U, // <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2> + 3376055679U, // <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3> + 3376055194U, // <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4> + 3859565138U, // <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5> + 2727514210U, // <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 3376056250U, // <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7> + 2727514210U, // <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0> + 2758986360U, // <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 3709174678U, // <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0> + 3795284411U, // <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3> + 3709175980U, // <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6> + 3833096860U, // <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7> + 3376728235U, // <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5> + 3859565229U, // <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6> + 2773879472U, // <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0> + 2758986360U, // <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7> + 2303656854U, // <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0> + 3807229018U, // <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u> + 2727515284U, // <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3> + 3377399410U, // <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3> + 3377398682U, // <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4> + 3801257409U, // <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7> + 3377399980U, // <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6> + 3375409082U, // <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7> + 2731497082U, // <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3> + 1146333334U, // <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 2220075238U, // <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1> + 2220075329U, // <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2> + 1146333596U, // <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1146333698U, // <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2758986566U, // <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6> + 2803739472U, // <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7> + 2295703482U, // <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7> + 1146333982U, // <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2> + 2214595473U, // <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0> + 2693677158U, // <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS + 3839437689U, // <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3> + 3709200559U, // <0,4,0,3>: 
Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0> + 2693677394U, // <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5> + 1140854070U, // <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 3767419409U, // <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7> + 3854109604U, // <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1> + 1140854313U, // <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS + 1141689234U, // <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2215431114U, // <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3> + 2215431221U, // <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635466928U, // <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1> + 1141689552U, // <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 67947830U, // <0,4,1,5>: Cost 1 vmrghw LHS, RHS + 2215431545U, // <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2659357716U, // <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1> + 67948073U, // <0,4,1,u>: Cost 1 vmrghw LHS, RHS + 3767420369U, // <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4> + 3767420451U, // <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5> + 3767420520U, // <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2> + 2698323625U, // <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4> + 3709218102U, // <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS + 2216013110U, // <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 3767420858U, // <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7> + 3774719981U, // <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4> + 2216013353U, // <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS + 3767421078U, // <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2> + 3776710880U, // <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4> + 3833097325U, // <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4> + 3767421340U, // <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3> + 3767421442U, // <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6> + 2216660278U, // <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 3833097361U, // <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4> + 3780692678U, // <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4> + 2216660521U, // <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS + 2617573416U, // <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4> + 2217364450U, // <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0> + 3691316771U, // <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5> + 3709233331U, // <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4> + 2785823952U, // <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4> + 1143622966U, // <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 3691319723U, // <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5> + 3854109932U, // <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5> + 1143623209U, // <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS + 2635497574U, // <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS + 2635498390U, // <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0> + 3709240936U, // <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2> + 2635499700U, // <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5> + 2635500854U, // <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS + 2785824044U, // <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6> + 1685245238U, // <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659390488U, // <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5> + 1685245256U, // <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 3839438161U, // <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7> + 3798610347U, // <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5> + 3798610426U, // <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3> + 3795956237U, // <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4> + 3733138742U, // <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS + 2218634550U, // <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS + 3798610744U, // <0,4,6,6>: Cost 
4 vsldoi8 <6,7,0,4>, <6,6,6,6> + 2724868945U, // <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4> + 2725532578U, // <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4> + 3383371465U, // <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0> + 3800601668U, // <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4> + 3775386826U, // <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3> + 3801928934U, // <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4> + 3721202998U, // <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS + 2780368328U, // <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 3383372686U, // <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6> + 3854110170U, // <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0> + 2780368328U, // <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0> + 1146334098U, // <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2220076002U, // <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0> + 2220076085U, // <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2> + 2635524279U, // <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u> + 1146334416U, // <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 72592694U, // <0,4,u,5>: Cost 1 vmrghw LHS, RHS + 1685245481U, // <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2659415067U, // <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u> + 72592937U, // <0,4,u,u>: Cost 1 vmrghw LHS, RHS + 2281704337U, // <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0> + 2704965734U, // <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 3778707666U, // <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3> + 3778707708U, // <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0> + 2687050057U, // <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5> + 2214596612U, // <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5> + 2785824372U, // <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1> + 3854110332U, // <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0> + 2704966301U, // <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 1567768678U, // <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS + 2312236570U, // <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1> + 2215431915U, // <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641512598U, // <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2> + 1567771538U, // <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1> + 1141690372U, // <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1141690466U, // <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2641515514U, // <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2> + 1141690615U, // <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5> + 3772736973U, // <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0> + 3778709024U, // <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2> + 3778709096U, // <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2> + 3778709158U, // <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1> + 3772737275U, // <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5> + 3859566351U, // <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3> + 3778709434U, // <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7> + 3805251562U, // <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1> + 3775391807U, // <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5> + 2704967830U, // <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2> + 3776719073U, // <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5> + 3777382706U, // <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5> + 3778709887U, // <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1> + 2704968148U, // <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5> + 3857428317U, // <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0> + 3364096514U, // <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6> + 3780700871U, // <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5> + 2707622680U, // <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, 
<3,u,0,5> + 2728856466U, // <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1> + 3697361674U, // <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4> + 3697362601U, // <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4> + 3364766635U, // <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3> + 2217365428U, // <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6> + 2704969014U, // <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 2785824700U, // <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5> + 3364766963U, // <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7> + 2704969257U, // <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS + 3846148050U, // <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0> + 2326203282U, // <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1> + 3291746027U, // <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3> + 3376054482U, // <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3> + 3790655366U, // <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5> + 2785824772U, // <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5> + 2724876386U, // <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0> + 3858903057U, // <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0> + 2736820484U, // <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0> + 2659467366U, // <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS + 3859566643U, // <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7> + 3798618618U, // <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3> + 3852857410U, // <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4> + 2659470646U, // <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS + 2659471458U, // <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 3832729696U, // <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7> + 1712083042U, // <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0> + 1712156779U, // <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0> + 2731512826U, // <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2> + 3859566717U, // <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0> + 3798619284U, // <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3> + 3778712803U, // <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1> + 2728858936U, // <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5> + 3859566753U, // <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0> + 3377398135U, // <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6> + 3798619686U, // <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0> + 2731513468U, // <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5> + 1567826022U, // <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS + 2704971566U, // <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS + 2220076779U, // <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3> + 2641569942U, // <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2> + 1567828889U, // <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u> + 1146335236U, // <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1146335330U, // <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 1713410308U, // <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0> + 1713484045U, // <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0> + 2214596949U, // <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0> + 2214678951U, // <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1> + 2214597114U, // <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3> + 3852857653U, // <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4> + 3832729919U, // <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5> + 3721293427U, // <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0> + 2214597432U, // <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6> + 1207962934U, // <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 1207962935U, // <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS + 2215432481U, // <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2> + 2215432615U, // <0,6,1,1>: Cost 3 vmrghw 
LHS, <6,1,7,1> + 1141690874U, // <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2215432754U, // <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2215432817U, // <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5> + 2215432939U, // <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1> + 1141691192U, // <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221905718U, // <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 1221905719U, // <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS + 3852857787U, // <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3> + 3289764265U, // <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3> + 3289690618U, // <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3> + 3862589907U, // <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0> + 3733253430U, // <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS + 3733254242U, // <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0> + 3777390522U, // <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7> + 2785825274U, // <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3> + 2785825283U, // <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3> + 3777390742U, // <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2> + 3863106066U, // <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0> + 3777390899U, // <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6> + 3290436146U, // <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5> + 3779381762U, // <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6> + 3779381798U, // <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6> + 3733262920U, // <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0> + 2300972342U, // <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2300972343U, // <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS + 3802606482U, // <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1> + 2217365931U, // <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2217366010U, // <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3> + 3291107890U, // <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5> + 3291099805U, // <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4> + 3777391926U, // <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS + 2217366328U, // <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6> + 2291027254U, // <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 2291027255U, // <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS + 3852858033U, // <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6> + 3395964532U, // <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1> + 3864507069U, // <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0> + 3376056678U, // <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3> + 3721334070U, // <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS + 3395964860U, // <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5> + 3864802017U, // <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0> + 2302315830U, // <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 2302315831U, // <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS + 3852858108U, // <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0> + 3398624745U, // <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1> + 2218668538U, // <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3> + 3292418610U, // <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5> + 3733286198U, // <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS + 3797299889U, // <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6> + 2785825592U, // <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6> + 2785825602U, // <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7> + 2785825611U, // <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7> + 2785825614U, // <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1> + 2758988632U, // <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2> + 3377400084U, // <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2> + 2792166248U, // <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0> + 2785825654U, 
// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5> + 2785825664U, // <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 3859567493U, // <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2> + 2303659318U, // <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303659319U, // <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS + 2785825695U, // <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1> + 2220077479U, // <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1> + 1146335738U, // <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2792829881U, // <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0> + 2785825735U, // <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5> + 2785825664U, // <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6> + 1146336056U, // <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1221963062U, // <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS + 1221963063U, // <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS + 2653593600U, // <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0> + 2706309222U, // <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 3709421498U, // <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7> + 2281705978U, // <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3> + 2785825816U, // <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5> + 2785825826U, // <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6> + 2653598037U, // <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0> + 2214598252U, // <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7> + 2706309789U, // <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 1141691386U, // <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2215433290U, // <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1> + 2706310038U, // <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0> + 2322190842U, // <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3> + 1141691750U, // <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2215433654U, // <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5> + 2653606230U, // <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1> + 1141692012U, // <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1141692034U, // <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 2785825940U, // <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3> + 3768108576U, // <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2> + 3780052584U, // <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2> + 2794820780U, // <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0> + 3859641528U, // <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3> + 3733327970U, // <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0> + 3778062266U, // <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7> + 3733328944U, // <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2> + 2795189465U, // <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0> + 2324861026U, // <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0> + 3780053233U, // <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3> + 3780053296U, // <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3> + 3778062725U, // <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7> + 3780053506U, // <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6> + 3803941469U, // <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7> + 2706311800U, // <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7> + 3398603586U, // <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7> + 2707639066U, // <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7> + 2217366522U, // <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2> + 3727369110U, // <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0> + 3291108500U, // <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3> + 3727370872U, // <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7> + 2217366886U, // <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6> + 2706312502U, // <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 3786026321U, // <0,7,4,6>: Cost 
4 vsldoi8 <4,6,0,7>, <4,6,0,7> + 2217367148U, // <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7> + 2706312745U, // <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2322223202U, // <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0> + 3399946987U, // <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1> + 3291780244U, // <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3> + 3727378582U, // <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2> + 3727379766U, // <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS + 3859568054U, // <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5> + 2785826241U, // <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7> + 3395965762U, // <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7> + 2787153363U, // <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7> + 2785826268U, // <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7> + 3780055420U, // <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3> + 3859568110U, // <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7> + 3874534903U, // <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7> + 3859641856U, // <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7> + 3733360738U, // <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0> + 3859568145U, // <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6> + 2797770260U, // <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0> + 2797843997U, // <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0> + 2785826342U, // <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0> + 3727393686U, // <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0> + 3868563003U, // <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3> + 3377397988U, // <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3> + 2219349350U, // <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6> + 3859568217U, // <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6> + 2730202588U, // <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7> + 2785826412U, // <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7> + 2731529854U, // <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7> + 1146336250U, // <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 2706315054U, // <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS + 2653660845U, // <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u> + 2322248186U, // <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3> + 1146336614U, // <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2706315418U, // <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS + 2653663581U, // <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u> + 1146336876U, // <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1146336898U, // <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2> + 202162278U, // <0,u,0,0>: Cost 1 vspltisw0 LHS + 1624612966U, // <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS + 2629780986U, // <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0> + 1207959708U, // <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS + 1544097078U, // <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS + 1140856986U, // <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS + 2698355253U, // <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7> + 1207962952U, // <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS + 202162278U, // <0,u,0,u>: Cost 1 vspltisw0 LHS + 1142134483U, // <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2> + 67950382U, // <0,u,1,1>: Cost 1 vmrghw LHS, LHS + 1142175624U, // <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 1142175676U, // <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1> + 1142134847U, // <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 67950746U, // <0,u,1,5>: Cost 1 vmrghw LHS, RHS + 1142175952U, // <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221905736U, // <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS + 67950949U, // <0,u,1,u>: Cost 1 vmrghw LHS, LHS + 1562026086U, // <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS + 2216015662U, // 
<0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS + 2698356328U, // <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2> + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1562029366U, // <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS + 2216016026U, // <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS + 2698356666U, // <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7> + 1585919033U, // <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2758989756U, // <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1> + 2216662830U, // <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS + 2703665461U, // <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u> + 2758989782U, // <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0> + 2758989796U, // <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5> + 2216663194U, // <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS + 2706319993U, // <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u> + 2300972360U, // <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS + 2216663397U, // <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS + 2217367251U, // <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2> + 1143625518U, // <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS + 2217367432U, // <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3> + 2217367484U, // <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1> + 1143619922U, // <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1143625882U, // <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS + 2217367760U, // <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7> + 2291027272U, // <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS + 1143626085U, // <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS + 2635792486U, // <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS + 2635793302U, // <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0> + 2302314646U, // <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2> + 2635794648U, // <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5> + 2635795766U, // <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS + 2717601754U, // <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u> + 1685248154U, // <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2302315848U, // <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS + 1685248172U, // <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS + 2759358645U, // <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7> + 2218637102U, // <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS + 2724901370U, // <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3> + 2758990032U, // <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7> + 2659691830U, // <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS + 2659471458U, // <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0> + 2724901688U, // <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6> + 1651159893U, // <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u> + 1651823526U, // <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u> + 2785827072U, // <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1> + 2803964168U, // <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0> + 2727556249U, // <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u> + 2303656092U, // <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS + 2785827112U, // <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5> + 2785827122U, // <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6> + 2730210781U, // <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u> + 2303659336U, // <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS + 2303656097U, // <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS + 202162278U, // <0,u,u,0>: Cost 1 vspltisw0 LHS + 72595246U, // <0,u,u,1>: Cost 1 vmrghw LHS, LHS + 1146337160U, // <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3> + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1146337343U, // <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6> + 72595610U, // <0,u,u,5>: Cost 1 vmrghw LHS, RHS + 
1146337488U, // <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7> + 1221963080U, // <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2756853760U, // <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0> + 1677803530U, // <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1> + 3759497387U, // <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0> + 2686419196U, // <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0> + 2751766565U, // <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1> + 2687746462U, // <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0> + 3776086518U, // <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7> + 2689073728U, // <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0> + 1678319689U, // <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1> + 2287091712U, // <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0> + 1147568230U, // <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS + 1683112038U, // <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 3294970108U, // <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0> + 2623892790U, // <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS + 2647781007U, // <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1> + 2791948430U, // <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 3721524218U, // <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2> + 1683112092U, // <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2222112768U, // <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0> + 1148371046U, // <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 3356862524U, // <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2> + 2702345894U, // <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1> + 2222113106U, // <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5> + 2299709908U, // <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 3760162746U, // <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7> + 3369470584U, // <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7> + 1148371613U, // <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 2686421142U, // <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2> + 2283128486U, // <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1> + 3296305326U, // <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3> + 3760163199U, // <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1> + 3760163330U, // <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6> + 3779406377U, // <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0> + 3865690416U, // <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7> + 3366824568U, // <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7> + 2707655452U, // <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0> + 2734861202U, // <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1> + 2756854098U, // <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5> + 3830595931U, // <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5> + 3296968960U, // <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4> + 3830595949U, // <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5> + 2686422326U, // <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 3297378806U, // <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7> + 3810594248U, // <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0> + 2686422569U, // <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2284470272U, // <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0> + 2284471974U, // <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1> + 3809267435U, // <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3> + 3297968384U, // <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4> + 2284471977U, // <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4> + 3721555603U, // <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5> + 3792679010U, // <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0> + 3792679037U, // <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0> + 
2284471981U, // <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u> + 3356893184U, // <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0> + 2224676966U, // <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS + 3298295985U, // <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6> + 3298345212U, // <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0> + 2224972114U, // <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 3808604907U, // <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1> + 3799978808U, // <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6> + 2726237006U, // <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1> + 2224677522U, // <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1> + 2726237176U, // <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0> + 2285815462U, // <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1> + 3805951193U, // <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0> + 3807941859U, // <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1> + 3799979366U, // <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6> + 3803297165U, // <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0> + 3799979540U, // <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0> + 3799979628U, // <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7> + 2731546240U, // <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0> + 2284494848U, // <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0> + 1683112594U, // <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1> + 1683112605U, // <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 2734200772U, // <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0> + 2757075629U, // <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1> + 2686425242U, // <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS + 2791948430U, // <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7> + 2736855304U, // <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0> + 1683112659U, // <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1610694666U, // <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1> + 1616003174U, // <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS + 2283767958U, // <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2> + 3357507596U, // <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3> + 2689745234U, // <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5> + 3357507922U, // <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5> + 3294397647U, // <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7> + 3373433334U, // <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7> + 1616003730U, // <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1> + 1550221414U, // <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,1,1>: Cost 1 vspltisw1 LHS + 2287093910U, // <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2> + 2287092615U, // <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3> + 1550224694U, // <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 2287092050U, // <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5> + 2689746127U, // <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7> + 2659800138U, // <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1> + 269271142U, // <1,1,1,u>: Cost 1 vspltisw1 LHS + 2222113516U, // <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1> + 2756854663U, // <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3> + 1148371862U, // <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689746598U, // <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1> + 2618002742U, // <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS + 2299707730U, // <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5> + 2689746874U, // <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7> + 3361506511U, // <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7> + 1148371862U, // <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689747094U, // <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2> + 
2691074278U, // <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1> + 3356870806U, // <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2> + 2283126958U, // <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3> + 2689747458U, // <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6> + 3356868946U, // <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5> + 3811265144U, // <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7> + 3362841807U, // <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7> + 2689747742U, // <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2> + 2623987814U, // <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS + 2758181931U, // <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5> + 2223408022U, // <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 3697731734U, // <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2> + 2283798784U, // <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4> + 1616006454U, // <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 3297379535U, // <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7> + 3373466102U, // <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7> + 1616006697U, // <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2760762479U, // <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1> + 2284470282U, // <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1> + 2284472470U, // <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2> + 3358212270U, // <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3> + 2284470285U, // <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4> + 1210728786U, // <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2737524834U, // <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0> + 3360867535U, // <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7> + 1210728786U, // <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 3697746022U, // <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS + 2756854991U, // <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7> + 2737525242U, // <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3> + 3839149281U, // <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7> + 3697749302U, // <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS + 3356893522U, // <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5> + 2283151537U, // <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6> + 2791949566U, // <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0> + 2792613127U, // <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0> + 2737525754U, // <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2> + 2291786386U, // <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1> + 3365528292U, // <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2> + 3365528455U, // <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3> + 2737526118U, // <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6> + 3365527890U, // <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5> + 3365528377U, // <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6> + 2291786959U, // <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7> + 2737526402U, // <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2> + 1550221414U, // <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,1,u,1>: Cost 1 vspltisw1 LHS + 1148371862U, // <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 2689750972U, // <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1> + 1550224694U, // <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1616009370U, // <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS + 2689751248U, // <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7> + 2736863497U, // <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1> + 269271142U, // <1,1,u,u>: Cost 1 vspltisw1 LHS + 2702360576U, // <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0> + 1628618854U, // <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2685771949U, // <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2> + 2283765862U, // 
<1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 2702360914U, // <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5> + 3788046813U, // <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0> + 2688426481U, // <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2> + 2726249024U, // <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1628619421U, // <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS + 2690417380U, // <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2> + 2702361396U, // <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1> + 2287093352U, // <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2> + 1213349990U, // <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 3764159522U, // <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5> + 3295053672U, // <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6> + 2221311930U, // <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7> + 3799991593U, // <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7> + 1213349995U, // <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS + 2624045158U, // <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS + 2702362144U, // <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2> + 2283120232U, // <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2> + 1225965670U, // <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2624048438U, // <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS + 3356860763U, // <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5> + 2222114746U, // <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7> + 2299708632U, // <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7> + 1225965675U, // <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS + 470597734U, // <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544340276U, // <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544341096U, // <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544341916U, // <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3> + 470601014U, // <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592119300U, // <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592119802U, // <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592120314U, // <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470603566U, // <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708335471U, // <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2> + 3838043908U, // <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5> + 3357541992U, // <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2> + 2283798630U, // <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2726251728U, // <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4> + 1628622134U, // <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 3297077178U, // <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7> + 2726251976U, // <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1628622377U, // <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 2714308168U, // <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2> + 3297633827U, // <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5> + 2284471912U, // <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2> + 1210728550U, // <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 3776106420U, // <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6> + 2726252548U, // <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5> + 2726252642U, // <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0> + 3799994538U, // <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0> + 1210728555U, // <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720280865U, // <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2> + 2702365096U, // <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2> + 2726253050U, // <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3> + 2283151462U, // <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 3697823030U, // <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS + 3298715497U, // <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7> + 2726253368U, // <1,2,6,6>: Cost 3 vsldoi8 
<7,0,1,2>, <6,6,6,6> + 2724926296U, // <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2> + 2283151467U, // <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652511738U, // <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2> + 3371500916U, // <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1> + 3365529192U, // <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2> + 2291785830U, // <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2726253926U, // <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6> + 3788051845U, // <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1> + 3794023894U, // <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1> + 2726254119U, // <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1> + 1657820802U, // <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2> + 470638699U, // <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1544381236U, // <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544382056U, // <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544382614U, // <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 470641974U, // <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1628625050U, // <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS + 1592160762U, // <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592161274U, // <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 470644526U, // <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS + 2769389708U, // <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1> + 2685780070U, // <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2685780142U, // <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3> + 2686443775U, // <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3> + 2769684656U, // <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1> + 3357507940U, // <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5> + 3759522294U, // <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7> + 3357509562U, // <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7> + 2685780637U, // <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2287092630U, // <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0> + 2221312230U, // <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1> + 2691752839U, // <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3> + 2287093362U, // <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3> + 2287092634U, // <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4> + 3360835107U, // <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5> + 3759523041U, // <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7> + 2287093690U, // <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7> + 2287092638U, // <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u> + 2222114966U, // <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2> + 2222115057U, // <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3> + 2630092320U, // <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2> + 2685781670U, // <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1> + 2222115330U, // <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6> + 3373449572U, // <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5> + 2222115448U, // <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7> + 2299709370U, // <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7> + 2222115614U, // <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2> + 2771380607U, // <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1> + 3356874468U, // <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1> + 3759524168U, // <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0> + 2283792796U, // <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3> + 3356869530U, // <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4> + 3721760428U, // <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3> + 3296496248U, // <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7> + 3356870586U, // <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7> + 2771970503U, // <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1> + 
2772044240U, // <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1> + 3362186135U, // <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1> + 3297151280U, // <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3> + 3357542002U, // <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3> + 3357540626U, // <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4> + 2685783350U, // <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 3357546622U, // <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6> + 3357542330U, // <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7> + 2685783593U, // <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2284471190U, // <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0> + 3358213015U, // <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1> + 2630116899U, // <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5> + 2284471922U, // <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3> + 2284471194U, // <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4> + 2284471843U, // <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5> + 3358218366U, // <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6> + 2284472250U, // <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7> + 2284471198U, // <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u> + 2224752790U, // <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2> + 3832736385U, // <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7> + 3703866916U, // <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6> + 3356894834U, // <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3> + 3356894106U, // <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4> + 3356894755U, // <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5> + 3356899130U, // <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6> + 2283153338U, // <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2283153338U, // <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7> + 2774035139U, // <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1> + 3703874767U, // <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7> + 3703875109U, // <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7> + 3365529202U, // <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3> + 3365528474U, // <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4> + 3789387159U, // <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1> + 3865692927U, // <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7> + 3363538874U, // <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7> + 2774625035U, // <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1> + 2284495766U, // <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0> + 2685785902U, // <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2630141478U, // <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u> + 2283169880U, // <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3> + 2284495770U, // <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4> + 2685786266U, // <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS + 2222115448U, // <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7> + 2284496826U, // <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7> + 2685786469U, // <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS + 2684461069U, // <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4> + 2686451814U, // <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 3759530159U, // <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4> + 2686451968U, // <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2684461394U, // <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5> + 1701989266U, // <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1> + 3776119286U, // <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7> + 2689106500U, // <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4> + 1702210477U, // <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1> + 2221312914U, // <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1> + 2691097399U, // <1,4,1,1>: Cost 3 
vsldoi8 <1,1,1,4>, <1,1,1,4> + 3760194454U, // <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0> + 3766166489U, // <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4> + 2334870736U, // <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4> + 1147571510U, // <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 3760194794U, // <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7> + 3867315188U, // <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0> + 1147571753U, // <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS + 2222115730U, // <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1> + 2222115812U, // <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 3760195176U, // <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2> + 2702378662U, // <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1> + 2323598544U, // <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4> + 1148374326U, // <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 3760195514U, // <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7> + 3373451932U, // <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7> + 1148374569U, // <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS + 2702379160U, // <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4> + 3760195840U, // <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0> + 3776121160U, // <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0> + 3760195996U, // <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3> + 2686454274U, // <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6> + 3356870350U, // <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5> + 3800009392U, // <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0> + 3366824604U, // <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7> + 2707688224U, // <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4> + 2775731368U, // <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0> + 3830820018U, // <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1> + 3691980454U, // <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1> + 3357541282U, // <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3> + 2781039824U, // <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4> + 2686455094U, // <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 3357541528U, // <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6> + 3810627020U, // <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4> + 2686455337U, // <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS + 2624217190U, // <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS + 2284470309U, // <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1> + 2618246822U, // <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1> + 3358212297U, // <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3> + 2284470312U, // <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4> + 2284470637U, // <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5> + 1683115318U, // <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3721851898U, // <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2> + 1683115336U, // <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 3794039075U, // <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4> + 3830820186U, // <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7> + 3800011258U, // <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3> + 3807973938U, // <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5> + 3298716880U, // <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4> + 2224680246U, // <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 3800011576U, // <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6> + 2726269774U, // <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1> + 2224680489U, // <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726269948U, // <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4> + 3383444141U, // <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1> + 3805983961U, // <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0> + 3807974667U, // <1,4,7,3>: 
Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5> + 2736887142U, // <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6> + 3365528403U, // <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5> + 3800012308U, // <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0> + 3800012396U, // <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7> + 2731579012U, // <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4> + 2624241766U, // <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS + 2686457646U, // <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS + 2618271398U, // <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1> + 2734233544U, // <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4> + 2689775679U, // <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6> + 1152355638U, // <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS + 1683115561U, // <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2736888076U, // <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4> + 1683115579U, // <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 2687123456U, // <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0> + 1613381734U, // <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759538352U, // <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5> + 3760865532U, // <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0> + 1613381970U, // <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2687787427U, // <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5> + 2781777524U, // <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1> + 3733828717U, // <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0> + 1613382301U, // <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2781040271U, // <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1> + 2687124276U, // <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1> + 2687124374U, // <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0> + 3760866297U, // <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0> + 2693096491U, // <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5> + 2687124591U, // <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1> + 2687124723U, // <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7> + 3360834803U, // <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7> + 2687124860U, // <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0> + 2323598792U, // <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0> + 2687125027U, // <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5> + 2687125096U, // <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2> + 2687125158U, // <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1> + 2642185188U, // <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2> + 2323598554U, // <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5> + 2687125434U, // <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7> + 3373450483U, // <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7> + 2687125563U, // <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1> + 2687125654U, // <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2> + 2312990234U, // <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1> + 3760867649U, // <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2> + 2687125916U, // <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3> + 2687126018U, // <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6> + 3386731738U, // <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5> + 3356871170U, // <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6> + 3808643779U, // <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1> + 2687126302U, // <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2> + 2642198630U, // <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS + 2687126498U, // <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0> + 3715941923U, // <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5> + 3709970701U, // <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4> + 2687126736U, // <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, 
<4,4,4,4> + 1613385014U, // <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2283801090U, // <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 3733861489U, // <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4> + 1613385257U, // <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2624290918U, // <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS + 2624291676U, // <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5> + 3698034211U, // <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5> + 2284471211U, // <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3> + 2624294198U, // <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS + 2284471132U, // <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5> + 2284472834U, // <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6> + 2284471539U, // <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7> + 2284471216U, // <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u> + 2785316900U, // <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1> + 2781040691U, // <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7> + 2734903802U, // <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3> + 3848736834U, // <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4> + 3298717620U, // <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6> + 3298717700U, // <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5> + 2734904120U, // <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6> + 2781040738U, // <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0> + 2781040747U, // <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0> + 2734904314U, // <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2> + 2315677210U, // <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1> + 3808646292U, // <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3> + 3808646371U, // <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1> + 2734904678U, // <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6> + 3389418714U, // <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5> + 3365528656U, // <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6> + 2734904940U, // <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7> + 2734904962U, // <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2> + 2687129299U, // <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2> + 1613387566U, // <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 2687129480U, // <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3> + 2687129532U, // <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1> + 1661163546U, // <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5> + 1613387930U, // <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS + 2687129808U, // <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7> + 2781040900U, // <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0> + 1613388133U, // <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS + 3759546368U, // <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0> + 2685804646U, // <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2685804721U, // <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6> + 3861270834U, // <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1> + 3759546706U, // <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5> + 2687795620U, // <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6> + 2688459253U, // <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6> + 2283769142U, // <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 2685805213U, // <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 3698073702U, // <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS + 3759547188U, // <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1> + 2221314554U, // <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3> + 3759547401U, // <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7> + 3698076982U, // <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS + 3767510141U, // <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6> + 2334872376U, // <1,6,1,6>: Cost 3 vmrglw 
<u,u,1,1>, <6,6,6,6> + 1213353270U, // <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 1213353271U, // <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS + 3704053862U, // <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS + 3759547961U, // <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0> + 2222117370U, // <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3> + 3759548070U, // <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1> + 3704057142U, // <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS + 3373451057U, // <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5> + 2685806522U, // <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7> + 1225968950U, // <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1225968951U, // <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS + 3759548566U, // <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2> + 3842912793U, // <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7> + 3759548774U, // <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3> + 3759548828U, // <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3> + 3759548930U, // <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6> + 3809315421U, // <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7> + 3386733368U, // <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6> + 2283130166U, // <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS + 2283130167U, // <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS + 3704070246U, // <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS + 3862229608U, // <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5> + 3704071741U, // <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4> + 3721988610U, // <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6> + 3704073526U, // <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS + 2685807926U, // <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3865621141U, // <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5> + 2283801910U, // <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 2685808169U, // <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 3710050406U, // <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS + 3710051571U, // <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7> + 3405989597U, // <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2> + 3358214502U, // <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3> + 3710053686U, // <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS + 3721998025U, // <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5> + 2332250936U, // <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6> + 1210731830U, // <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 1210731831U, // <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS + 2791289597U, // <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1> + 3698115430U, // <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6> + 3698116538U, // <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7> + 3356894132U, // <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3> + 3698117942U, // <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS + 3722006218U, // <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6> + 2781041464U, // <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6> + 2283154742U, // <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283154743U, // <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS + 1718211406U, // <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1> + 2792026967U, // <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1> + 2765411170U, // <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3> + 3854783336U, // <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0> + 2781041526U, // <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5> + 3365528664U, // <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5> + 2791953290U, // <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7> + 2291789110U, // <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1718801302U, // <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1> + 1718875039U, // 
<1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1> + 2685810478U, // <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS + 2792764337U, // <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1> + 3759552444U, // <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1> + 2781041607U, // <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5> + 2685810842U, // <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS + 2689792208U, // <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7> + 1210756406U, // <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 1210756407U, // <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS + 2793280496U, // <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1> + 2694439014U, // <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 3393343912U, // <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2> + 3397325306U, // <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3> + 2793575444U, // <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1> + 3722030797U, // <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0> + 2688467446U, // <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7> + 2689131079U, // <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7> + 2694439570U, // <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1> + 2654265354U, // <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1> + 2794017866U, // <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1> + 3768181639U, // <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3> + 2334872058U, // <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3> + 2654268726U, // <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS + 3792069797U, // <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1> + 2694440143U, // <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7> + 2334872386U, // <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7> + 2695767409U, // <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7> + 2654273638U, // <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2222117973U, // <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3> + 2299711912U, // <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2654275734U, // <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2> + 2654276918U, // <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS + 3385397675U, // <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5> + 2654278056U, // <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2> + 2323599627U, // <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7> + 2654279470U, // <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS + 2795271395U, // <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1> + 3768183059U, // <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1> + 3728025254U, // <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1> + 3768183196U, // <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3> + 3768183298U, // <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6> + 3792071255U, // <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1> + 3780127361U, // <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7> + 3847779617U, // <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0> + 2795861291U, // <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1> + 2795935028U, // <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1> + 3728032975U, // <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7> + 3839153480U, // <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3> + 3397358074U, // <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3> + 3854783835U, // <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4> + 2694442294U, // <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 3786100058U, // <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7> + 3722065254U, // <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6> + 2694442537U, // <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654298214U, // <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS + 3854783893U, // <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u> + 
3710126010U, // <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7> + 2332250618U, // <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3> + 2654301494U, // <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS + 2284474795U, // <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5> + 2718330931U, // <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7> + 2332250946U, // <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7> + 2719658197U, // <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7> + 2332921954U, // <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0> + 3768185254U, // <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0> + 3710134202U, // <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7> + 3710134561U, // <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6> + 3710135606U, // <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS + 3864884745U, // <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7> + 3854784017U, // <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6> + 2791953940U, // <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0> + 2792617501U, // <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0> + 2797925927U, // <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1> + 3365528426U, // <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1> + 3728058022U, // <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1> + 3365528509U, // <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3> + 3854784079U, // <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5> + 3722088148U, // <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7> + 3728060845U, // <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7> + 2781042284U, // <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7> + 2798515823U, // <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1> + 2654322705U, // <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u> + 2694444846U, // <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 2299711912U, // <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2> + 2323649018U, // <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3> + 2654326070U, // <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS + 2694445210U, // <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS + 2654327214U, // <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u> + 2323649346U, // <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7> + 2694445413U, // <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS + 1610752017U, // <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u> + 1613406310U, // <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 2685821107U, // <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u> + 2283765916U, // <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS + 1613406549U, // <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u> + 1725880054U, // <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1> + 2688475639U, // <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u> + 2283769160U, // <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS + 1613406877U, // <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS + 1550221414U, // <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS + 269271142U, // <1,u,1,1>: Cost 1 vspltisw1 LHS + 1683117870U, // <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS + 1213350044U, // <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS + 1550224694U, // <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS + 1147574426U, // <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS + 2687149326U, // <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7> + 1213353288U, // <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS + 269271142U, // <1,u,1,u>: Cost 1 vspltisw1 LHS + 2222118611U, // <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2> + 1148376878U, // <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS + 1148371862U, // <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 1225965724U, // <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS + 2222118975U, // <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6> 
+ 1148377242U, // <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS + 2687150010U, // <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7> + 1225968968U, // <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS + 1148377445U, // <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS + 471040156U, // <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1544782644U, // <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1544783464U, // <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544784022U, // <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471043382U, // <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1592561668U, // <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1592562170U, // <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1592562682U, // <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 471045934U, // <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS + 2708384629U, // <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u> + 2687151101U, // <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0> + 2223408022U, // <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0> + 2283798684U, // <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS + 2642422785U, // <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4> + 1613409590U, // <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2283801090U, // <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6> + 2283801928U, // <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS + 1613409833U, // <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 2284471235U, // <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0> + 2284472046U, // <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1> + 2284472533U, // <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2> + 1210728604U, // <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS + 2284471239U, // <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4> + 1210728786U, // <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 1683118234U, // <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210731848U, // <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS + 1210728609U, // <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS + 2720330023U, // <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u> + 2757376190U, // <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7> + 2726302202U, // <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3> + 2283151516U, // <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS + 2224972114U, // <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5> + 2224683162U, // <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS + 2726302520U, // <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6> + 2283154760U, // <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS + 2283151521U, // <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS + 1652560896U, // <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u> + 2333590225U, // <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1> + 2765412628U, // <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3> + 2291785884U, // <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS + 2781042984U, // <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5> + 3365527953U, // <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5> + 2791954748U, // <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7> + 2291789128U, // <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS + 1657869960U, // <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u> + 471081121U, // <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 269271142U, // <1,u,u,1>: Cost 1 vspltisw1 LHS + 1544824424U, // <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2> + 1544824982U, // <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2> + 471084342U, // <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 1613412506U, // <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS + 1683118477U, // <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS + 1210756424U, // <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS + 471086894U, // <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS + 2226757632U, // <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0> + 
2226757734U, // <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS + 3826622483U, // <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1> + 3843211292U, // <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1> + 3300499794U, // <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5> + 3356256724U, // <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5> + 3825664056U, // <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2> + 3762889289U, // <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0> + 2226758301U, // <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS + 2227429386U, // <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1> + 2227429478U, // <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS + 1691156582U, // <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2666358997U, // <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2> + 2227462482U, // <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5> + 3722186464U, // <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1> + 3867099278U, // <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7> + 3366881912U, // <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7> + 1691156636U, // <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2228027392U, // <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0> + 1154285670U, // <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 2228027565U, // <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2> + 3301769468U, // <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0> + 2228027730U, // <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5> + 3301769635U, // <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5> + 3780806586U, // <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7> + 3368880760U, // <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7> + 1154286237U, // <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS + 1213440000U, // <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1213441702U, // <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2228535470U, // <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3> + 2636515632U, // <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3> + 2287182962U, // <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4> + 2660405346U, // <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0> + 2228535798U, // <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660406420U, // <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3> + 1213441709U, // <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3368894464U, // <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0> + 2764898642U, // <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5> + 3826622811U, // <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5> + 3843211620U, // <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5> + 3838640493U, // <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5> + 2732944694U, // <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS + 3797396857U, // <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2> + 3867099528U, // <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5> + 2764898705U, // <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5> + 3364257792U, // <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0> + 2230124646U, // <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 3304235184U, // <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5> + 3364260144U, // <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3> + 3303817554U, // <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5> + 3364260146U, // <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5> + 3867099602U, // <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7> + 3364260472U, // <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7> + 2230125213U, // <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS + 2230796288U, // <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0> + 1157054566U, // <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 2230796465U, // <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6> + 3304538364U, // 
<2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0> + 2230796626U, // <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5> + 3797398205U, // <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0> + 3304538614U, // <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7> + 3798725471U, // <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0> + 1157055133U, // <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 3371573248U, // <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0> + 2231189606U, // <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS + 3801380003U, // <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0> + 3802043636U, // <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0> + 3806688614U, // <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6> + 3356317308U, // <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5> + 3804034535U, // <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0> + 3806688876U, // <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7> + 2231190173U, // <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS + 1208836096U, // <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1208837798U, // <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 1691157149U, // <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS + 2636556597U, // <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u> + 2282579625U, // <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 2660446306U, // <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0> + 2228535798U, // <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7> + 2660447385U, // <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u> + 1208837805U, // <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 3692388523U, // <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0> + 2757526244U, // <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2> + 2330290974U, // <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2> + 3843212020U, // <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0> + 3692391734U, // <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS + 3300533362U, // <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4> + 3794084337U, // <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2> + 3374170614U, // <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7> + 2758042403U, // <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2> + 2690482924U, // <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1> + 2764899124U, // <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1> + 2695791510U, // <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0> + 3362235271U, // <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3> + 3692399926U, // <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS + 3832226649U, // <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2> + 3301205235U, // <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7> + 3768870179U, // <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1> + 2695791988U, // <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1> + 2618663085U, // <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2> + 2228028212U, // <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1> + 2618664552U, // <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2> + 2759000984U, // <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2> + 2618666294U, // <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS + 2295136594U, // <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5> + 3769534376U, // <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7> + 2793358266U, // <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0> + 2618668846U, // <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS + 2282536969U, // <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208795146U, // <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1213442198U, // <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2287181998U, // <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2618674486U, // <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS + 1208795474U, // <2,1,3,5>: Cost 2 vmrglw LHS, 
<0,4,1,5> + 2287182001U, // <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287183055U, // <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208795153U, // <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 3692421295U, // <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4> + 3838641195U, // <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5> + 2330323742U, // <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2> + 3692423318U, // <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2> + 3692424502U, // <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS + 2695793974U, // <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3799395705U, // <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2> + 3368895695U, // <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7> + 2695794217U, // <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS + 3692429488U, // <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5> + 3364257802U, // <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1> + 3692431253U, // <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6> + 3692431874U, // <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6> + 3692432694U, // <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS + 3364258130U, // <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5> + 3303875827U, // <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7> + 3867100333U, // <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0> + 3692435246U, // <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS + 2618695857U, // <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6> + 2230797108U, // <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1> + 2618697658U, // <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7> + 3692439702U, // <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2> + 2618699062U, // <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS + 3364929874U, // <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5> + 3692442424U, // <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6> + 3798733664U, // <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1> + 2618701614U, // <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS + 3799397370U, // <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2> + 3371573258U, // <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1> + 2330351234U, // <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 3799397658U, // <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2> + 3799397734U, // <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6> + 3371573586U, // <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5> + 3799397870U, // <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7> + 3799397956U, // <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3> + 2330351234U, // <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2> + 2282577929U, // <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1208836106U, // <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1208838294U, // <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282578094U, // <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282577933U, // <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1208836434U, // <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282578097U, // <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287224015U, // <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1208836113U, // <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 2226759117U, // <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0> + 1624047718U, // <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 2697789613U, // <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2> + 2226767526U, // <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1> + 2697789778U, // <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5> + 3300657000U, // <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6> + 2226988986U, // <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7> + 3734271139U, // <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0> + 1624048285U, // <2,2,0,u>: Cost 2 vsldoi8 
<2,2,2,2>, LHS + 3831268868U, // <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1> + 2293138804U, // <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1> + 2697790358U, // <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0> + 2293137510U, // <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 3771532331U, // <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5> + 3767551106U, // <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2> + 3301173178U, // <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7> + 3372853169U, // <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7> + 2293137515U, // <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS + 1556938854U, // <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2295137733U, // <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1> + 336380006U, // <2,2,2,2>: Cost 1 vspltisw2 LHS + 1221394534U, // <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS + 1556942134U, // <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2228029370U, // <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7> + 2660545701U, // <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2> + 336380006U, // <2,2,2,u>: Cost 1 vspltisw2 LHS + 2697791638U, // <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2> + 2765489840U, // <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2> + 1213441640U, // <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135053414U, // <2,2,3,3>: Cost 1 vmrglw LHS, LHS + 2697792002U, // <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6> + 2330313780U, // <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5> + 2287183549U, // <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6> + 2660553894U, // <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3> + 135053419U, // <2,2,3,u>: Cost 1 vmrglw LHS, LHS + 2630697062U, // <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS + 3771534282U, // <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3> + 2764900109U, // <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5> + 2295152742U, // <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS + 2295154282U, // <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4> + 1624050998U, // <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 2229675962U, // <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7> + 3368896433U, // <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7> + 1624051241U, // <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 3771534920U, // <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2> + 3364258540U, // <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1> + 2296489576U, // <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2> + 2290516070U, // <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 3771535284U, // <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6> + 2290517044U, // <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5> + 2697793634U, // <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0> + 3370231729U, // <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7> + 2290516075U, // <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS + 2230797801U, // <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1> + 3304539679U, // <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1> + 2764900273U, // <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7> + 2764900282U, // <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7> + 2230798129U, // <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5> + 3304540008U, // <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6> + 1157056442U, // <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725000033U, // <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2> + 1157056442U, // <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2793359338U, // <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1> + 3371574725U, // <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1> + 2297833064U, // <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2> + 
2297831526U, // <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 2697794918U, // <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6> + 3371575053U, // <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5> + 3304933297U, // <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7> + 2297833393U, // <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7> + 2297831531U, // <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS + 1556938854U, // <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1624053550U, // <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS + 336380006U, // <2,2,u,2>: Cost 1 vspltisw2 LHS + 135094374U, // <2,2,u,3>: Cost 1 vmrglw LHS, LHS + 1556942134U, // <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1624053914U, // <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS + 1157056442U, // <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2660594859U, // <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u> + 135094379U, // <2,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611448320U, // <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537706598U, // <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2689835181U, // <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2689835260U, // <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611448658U, // <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2732966354U, // <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2732966390U, // <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660603052U, // <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0> + 537707165U, // <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689835748U, // <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611449140U, // <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611449238U, // <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 3763577805U, // <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1> + 2689836112U, // <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689836143U, // <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689836239U, // <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 3366881210U, // <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7> + 1616094588U, // <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2689836493U, // <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0> + 2685191711U, // <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611449960U, // <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611450022U, // <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2689836822U, // <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5> + 2689836904U, // <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6> + 1611450298U, // <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2295138234U, // <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7> + 1611450456U, // <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3> + 1213440918U, // <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282538527U, // <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1> + 1557022322U, // <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3> + 1208796786U, // <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1213440922U, // <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282538531U, // <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2287188094U, // <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1213441978U, // <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 1208796791U, // <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u> + 1551056998U, // <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS + 1551057818U, // <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4> + 2624800360U, // <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2> + 2624800918U, // <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2> + 1551060278U, // <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS + 537709878U, // <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2732969337U, // <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2> + 2660635824U, // <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4> + 537710121U, // <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 
2689838664U, // <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2732969615U, // <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2732969707U, // <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3> + 3763580721U, // <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1> + 2689839028U, // <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659228164U, // <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659228258U, // <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 3364259770U, // <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7> + 1659228420U, // <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2230798486U, // <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2> + 2732970407U, // <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1659228666U, // <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2230798748U, // <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3> + 2230798850U, // <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6> + 2732970731U, // <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659228984U, // <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659229006U, // <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1659229087U, // <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1> + 1659229178U, // <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2726999125U, // <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3> + 2727662758U, // <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3> + 2732971235U, // <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1> + 1659229542U, // <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2732971446U, // <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5> + 2732971484U, // <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7> + 1659229804U, // <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659229826U, // <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1208837014U, // <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 537712430U, // <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1616099205U, // <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0> + 1208837746U, // <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3> + 1208837018U, // <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 537712794U, // <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1616099536U, // <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1208838074U, // <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 537712997U, // <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 3771547648U, // <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0> + 2697805926U, // <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS + 3770884269U, // <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2> + 3806716164U, // <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u> + 3771547986U, // <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5> + 2226761014U, // <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 3853462427U, // <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1> + 3867102116U, // <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1> + 2226761257U, // <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS + 3849186231U, // <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2> + 3301207010U, // <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0> + 3766240150U, // <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0> + 3766240226U, // <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4> + 3301207248U, // <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4> + 2227432758U, // <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS + 3758941400U, // <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7> + 3768894758U, // <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4> + 2227433001U, // <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS + 2228030354U, // <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1> + 3770885657U, // <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4> + 2697807466U, // <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4> + 3368880468U, // <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3> + 2228030672U, // <2,4,2,4>: 
Cost 3 vmrghw <2,2,2,2>, <4,4,4,4> + 1154288950U, // <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 3771549617U, // <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7> + 3368880796U, // <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7> + 1154289193U, // <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS + 2636808294U, // <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS + 2287181861U, // <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1> + 2228866102U, // <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3> + 2636810580U, // <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3> + 1256574160U, // <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1213441742U, // <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2228866430U, // <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7> + 2660701368U, // <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3> + 1213441745U, // <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 3704586342U, // <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS + 3782831051U, // <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4> + 3704587900U, // <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4> + 3368896123U, // <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3> + 2793360592U, // <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4> + 2697809206U, // <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 3303198078U, // <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7> + 3867102444U, // <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5> + 2697809449U, // <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS + 2630852710U, // <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS + 2624881572U, // <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5> + 2630854269U, // <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5> + 2666686677U, // <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2> + 2630855990U, // <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS + 2230127926U, // <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS + 1691159862U, // <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 3867102520U, // <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0> + 1691159880U, // <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230799250U, // <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1> + 3304541130U, // <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3> + 2230799417U, // <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6> + 3304541323U, // <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7> + 2230799568U, // <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4> + 1157057846U, // <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 3304541566U, // <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7> + 3798758243U, // <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4> + 1157058089U, // <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS + 3806721018U, // <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2> + 3853831590U, // <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2> + 3801412775U, // <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4> + 3802076408U, // <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4> + 3401436368U, // <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4> + 2793360840U, // <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0> + 3804067307U, // <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4> + 3867102682U, // <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0> + 2793360867U, // <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0> + 2630877286U, // <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS + 2282580144U, // <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2630878848U, // <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u> + 2636851545U, // <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u> + 1256615120U, // <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1208837838U, // <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 1691160105U, // <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2660742333U, // 
<2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u> + 1208837841U, // <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 3766910976U, // <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0> + 2693169254U, // <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3760939181U, // <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2> + 3843214936U, // <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0> + 3760939355U, // <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5> + 3867102827U, // <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1> + 3867102836U, // <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1> + 3867102844U, // <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0> + 2693169821U, // <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS + 3766911724U, // <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1> + 3766911796U, // <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1> + 2693170070U, // <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0> + 3384798262U, // <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3> + 2693170228U, // <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5> + 3301208068U, // <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5> + 3366879607U, // <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6> + 3867102925U, // <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0> + 2695824760U, // <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5> + 2642845798U, // <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS + 2295139218U, // <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1> + 2699142760U, // <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2> + 3766912678U, // <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1> + 2699142925U, // <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5> + 2228031492U, // <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5> + 2295138818U, // <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6> + 3368879347U, // <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7> + 2295138820U, // <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u> + 2287184866U, // <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256573842U, // <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642855630U, // <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5> + 2287182763U, // <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287184870U, // <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256574170U, // <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1213442562U, // <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287183091U, // <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1213442564U, // <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3716604006U, // <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS + 3716604822U, // <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0> + 3766914099U, // <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0> + 3368895403U, // <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3> + 3716607031U, // <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4> + 2693172534U, // <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3363588610U, // <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6> + 3368895731U, // <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7> + 2693172777U, // <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS + 3704668262U, // <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS + 3704669078U, // <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0> + 3704669830U, // <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5> + 3364259460U, // <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3> + 3704671542U, // <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS + 2793361412U, // <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 3364258167U, // <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6> + 3867103249U, // <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0> + 2793361412U, // <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5> + 2642878566U, // <2,5,6,0>: Cost 3 vsldoi4 
<4,2,5,6>, LHS + 3386166810U, // <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1> + 2723033594U, // <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3> + 3848523842U, // <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4> + 2723033713U, // <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5> + 2230800388U, // <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5> + 2230800482U, // <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0> + 2785841252U, // <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2> + 2785914989U, // <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2> + 3796775930U, // <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2> + 3800757335U, // <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5> + 3853463689U, // <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3> + 3796776218U, // <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2> + 3796776294U, // <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6> + 3803411867U, // <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5> + 3371575081U, // <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6> + 3796776516U, // <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3> + 3371575083U, // <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u> + 2287225826U, // <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1256614802U, // <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2642896590U, // <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5> + 2287223723U, // <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287225830U, // <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1256615130U, // <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1208838658U, // <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287224051U, // <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1208838660U, // <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 3772227584U, // <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0> + 2698485862U, // <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 3759620282U, // <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6> + 3710675299U, // <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0> + 3767583058U, // <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5> + 3378153265U, // <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5> + 3865186637U, // <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1> + 2330291510U, // <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS + 2698486429U, // <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 3734569062U, // <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS + 3764929346U, // <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6> + 3772228502U, // <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0> + 3734571158U, // <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2> + 3734572342U, // <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS + 3767583878U, // <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6> + 3768247511U, // <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6> + 2293140790U, // <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 2293140791U, // <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS + 3704717414U, // <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS + 3395424589U, // <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1> + 2228031993U, // <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2> + 2698487485U, // <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6> + 3704720694U, // <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS + 3773556575U, // <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6> + 2698487738U, // <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7> + 1221397814U, // <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 1221397815U, // <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS + 2636955750U, // <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS + 2330314217U, // <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2636957626U, // <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7> + 2287184230U, // <2,6,3,3>: Cost 3 vmrglw 
LHS, <3,2,6,3> + 2636959030U, // <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS + 2648903448U, // <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3> + 1256575800U, // <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135056694U, // <2,6,3,7>: Cost 1 vmrglw LHS, RHS + 135056695U, // <2,6,3,u>: Cost 1 vmrglw LHS, RHS + 3710705766U, // <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS + 3698762677U, // <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4> + 3710707389U, // <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6> + 3710708071U, // <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4> + 3710709046U, // <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS + 2698489142U, // <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 3796782457U, // <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2> + 2295156022U, // <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 2295156023U, // <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS + 3303870753U, // <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2> + 3788820134U, // <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6> + 3779530520U, // <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3> + 3303871026U, // <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5> + 3303871117U, // <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6> + 3791474666U, // <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6> + 3792138299U, // <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6> + 2290519350U, // <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 2290519351U, // <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS + 2631008358U, // <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS + 3372893673U, // <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1> + 2791445264U, // <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2> + 2230800968U, // <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0> + 2631011638U, // <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS + 3372894001U, // <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5> + 2793362232U, // <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6> + 2295835958U, // <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS + 2295835959U, // <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS + 2793362254U, // <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1> + 2792035160U, // <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2> + 2792108897U, // <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2> + 2769474408U, // <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0> + 2793362294U, // <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5> + 3371575089U, // <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5> + 2792403845U, // <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2> + 2297834806U, // <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS + 2297834807U, // <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS + 2636996710U, // <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS + 2698491694U, // <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS + 2636998631U, // <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7> + 2282580326U, // <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 2636999990U, // <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS + 2698492058U, // <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS + 1256616760U, // <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135097654U, // <2,6,u,7>: Cost 1 vmrglw LHS, RHS + 135097655U, // <2,6,u,u>: Cost 1 vmrglw LHS, RHS + 2666864742U, // <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS + 1719620602U, // <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2> + 3768254637U, // <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2> + 3393417722U, // <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3> + 2666868022U, // <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS + 3867104290U, // <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6> + 3728667127U, // <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0> + 2666869817U, // <2,7,0,7>: Cost 3 
vsldoi4 <u,2,7,0>, <7,0,u,2> + 1720136761U, // <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2> + 3728670822U, // <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS + 3774227252U, // <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1> + 3774227350U, // <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0> + 2323001850U, // <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3> + 3728674102U, // <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS + 3774227567U, // <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1> + 2694513880U, // <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7> + 3396744002U, // <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7> + 2323001850U, // <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3> + 2654937190U, // <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS + 3728679732U, // <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1> + 2700486248U, // <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2> + 2321682938U, // <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3> + 2654940470U, // <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS + 3859584196U, // <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6> + 2700486577U, // <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7> + 2228033132U, // <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7> + 2701813843U, // <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7> + 1581203558U, // <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 2654946100U, // <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1> + 2637031354U, // <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7> + 1256575482U, // <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 1581206838U, // <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS + 2654949380U, // <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5> + 1581208058U, // <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3> + 1256575810U, // <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581209390U, // <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS + 3728695398U, // <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS + 3869758782U, // <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2> + 3728696936U, // <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2> + 3393450490U, // <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3> + 3728698678U, // <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS + 2700487990U, // <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3728699899U, // <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4> + 3867104626U, // <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0> + 2700488233U, // <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 3855160709U, // <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1> + 3728704406U, // <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0> + 3370233956U, // <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2> + 2320380410U, // <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 3728706870U, // <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS + 3867104694U, // <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5> + 3792146492U, // <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7> + 3394122562U, // <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7> + 2320380410U, // <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3> + 2230801402U, // <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2> + 3768258984U, // <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2> + 2730349050U, // <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 3372894575U, // <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3> + 2230801766U, // <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6> + 3304543670U, // <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5> + 3728716285U, // <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6> + 2230802028U, // <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7> + 2730349050U, // <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3> + 2793362983U, // <2,7,7,0>: Cost 3 vsldoi12 
<7,0,1,2>, <7,7,0,1> + 3728721112U, // <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7> + 3371574933U, // <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2> + 2327695866U, // <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3> + 3728723254U, // <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS + 3371574855U, // <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5> + 2730350062U, // <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7> + 2793363052U, // <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7> + 2798671471U, // <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1> + 1581244518U, // <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1724929666U, // <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2> + 2637072314U, // <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7> + 1256616442U, // <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3> + 1581247798U, // <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS + 2700490906U, // <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS + 1581249023U, // <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u> + 1256616770U, // <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1581250350U, // <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS + 1611489280U, // <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 537747563U, // <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685231277U, // <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685231356U, // <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611489618U, // <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2226763930U, // <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS + 2733007350U, // <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7> + 2660971737U, // <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0> + 537748125U, // <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 2689876708U, // <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2> + 1611490100U, // <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611490198U, // <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 2293137564U, // <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS + 2689877072U, // <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6> + 2689877103U, // <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2689877199U, // <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2293140808U, // <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS + 1616135548U, // <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 1556938854U, // <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 1154291502U, // <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS + 336380006U, // <2,u,2,2>: Cost 1 vspltisw2 LHS + 1611490982U, // <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 1556942134U, // <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 1154291866U, // <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS + 1611491258U, // <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 1221397832U, // <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS + 336380006U, // <2,u,2,u>: Cost 1 vspltisw2 LHS + 1611491478U, // <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2> + 1213440073U, // <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1> + 1213442261U, // <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135053468U, // <2,u,3,3>: Cost 1 vmrglw LHS, LHS + 1611491842U, // <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6> + 1213440401U, // <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5> + 1213442589U, // <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135056712U, // <2,u,3,7>: Cost 1 vmrglw LHS, RHS + 135053473U, // <2,u,3,u>: Cost 1 vmrglw LHS, LHS + 1551425638U, // <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS + 1551426503U, // <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4> + 2625169000U, // <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2> + 2625169558U, // <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2> + 1551428918U, // <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS + 537750838U, // <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2733010297U, // <2,u,4,6>: Cost 3 vsldoi8 
LHS, <4,6,5,2> + 2295156040U, // <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS + 537751081U, // <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 2689879624U, // <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2> + 2230130478U, // <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS + 2631149217U, // <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5> + 2290516124U, // <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS + 2689879988U, // <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6> + 1659269124U, // <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1691162778U, // <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2290519368U, // <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS + 1691162796U, // <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS + 2230802131U, // <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2> + 1157060398U, // <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS + 1659269626U, // <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3> + 2764904656U, // <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7> + 2230802495U, // <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6> + 1157060762U, // <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS + 1659269944U, // <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659269966U, // <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1157060965U, // <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS + 1659270138U, // <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2> + 2727040090U, // <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u> + 2727703723U, // <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u> + 2297831580U, // <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS + 1659270502U, // <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6> + 2733012406U, // <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5> + 2730358255U, // <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u> + 1659270764U, // <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7> + 1659270786U, // <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2> + 1213481923U, // <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0> + 537753390U, // <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 336380006U, // <2,u,u,2>: Cost 1 vspltisw2 LHS + 135094428U, // <2,u,u,3>: Cost 1 vmrglw LHS, LHS + 1213481927U, // <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4> + 537753754U, // <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 1208838685U, // <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135097672U, // <2,u,u,7>: Cost 1 vmrglw LHS, RHS + 135094433U, // <2,u,u,u>: Cost 1 vmrglw LHS, LHS + 1678557184U, // <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1678557194U, // <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2631181989U, // <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0> + 2289223984U, // <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3> + 2756943909U, // <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1> + 3362965729U, // <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5> + 3362966054U, // <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6> + 2289224312U, // <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7> + 1683202121U, // <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1> + 1557446758U, // <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS + 2752741467U, // <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1> + 604815462U, // <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2631190676U, // <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0> + 1557450038U, // <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS + 2667024388U, // <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5> + 2800074894U, // <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7> + 2661053667U, // <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1> + 604815516U, // <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696521165U, // <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0> + 2752741549U, // <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2> + 2691876456U, // <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2> + 2691876518U, // <3,0,2,3>: 
Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1> + 3830685895U, // <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1> + 3765618536U, // <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6> + 2691876794U, // <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7> + 2701166596U, // <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0> + 2756944108U, // <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2> + 2691877014U, // <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2> + 1161003110U, // <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2691877168U, // <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3> + 2691877246U, // <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0> + 2691877378U, // <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6> + 3765619238U, // <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6> + 2691877496U, // <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7> + 3368962680U, // <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7> + 1161003677U, // <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS + 2289254400U, // <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0> + 1678557522U, // <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2631214761U, // <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4> + 2235580672U, // <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 2756944237U, // <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5> + 1618136374U, // <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 3309322742U, // <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7> + 3362998904U, // <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7> + 1683202449U, // <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5> + 3765620296U, // <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2> + 2752299427U, // <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5> + 3789508346U, // <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0> + 3403486842U, // <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3> + 3765620660U, // <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6> + 2733682692U, // <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5> + 2800075218U, // <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7> + 3873817044U, // <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0> + 2800075234U, // <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5> + 2752299501U, // <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7> + 2236547174U, // <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS + 2733683194U, // <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3> + 3844473352U, // <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7> + 3310289234U, // <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5> + 3873817114U, // <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7> + 2733683512U, // <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6> + 2725057384U, // <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0> + 2236547741U, // <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS + 2297905152U, // <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0> + 2297906854U, // <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1> + 2727711916U, // <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0> + 3371649328U, // <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3> + 2733684070U, // <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6> + 3734843490U, // <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0> + 3798799895U, // <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3> + 2733684332U, // <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7> + 2297906861U, // <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u> + 1557504102U, // <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS + 1678557842U, // <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1> + 604816029U, // <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2691880892U, // <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1> + 1557507382U, // <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS + 1618139290U, // <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS + 
2691881168U, // <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7> + 2661111018U, // <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u> + 604816083U, // <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 2619310332U, // <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0> + 2756944612U, // <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2> + 2289221724U, // <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2> + 2619312278U, // <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2> + 2619313462U, // <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS + 2289221970U, // <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5> + 2232599768U, // <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7> + 3362964687U, // <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7> + 2619316014U, // <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS + 2756944683U, // <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1> + 1678558004U, // <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2691883927U, // <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1> + 3826631496U, // <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3> + 2756944723U, // <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5> + 2756944732U, // <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 3830686561U, // <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1> + 3734869228U, // <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1> + 1678558004U, // <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2696529358U, // <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1> + 2756944775U, // <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 2294548630U, // <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2> + 1678558102U, // <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0> + 2631273782U, // <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS + 2756944811U, // <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 3830686644U, // <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3> + 2800075706U, // <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0> + 1679000515U, // <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0> + 2619334911U, // <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3> + 2295218186U, // <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1> + 2293229718U, // <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2> + 2619337116U, // <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3> + 2619338038U, // <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS + 2295218514U, // <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5> + 3830686729U, // <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7> + 3368961231U, // <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7> + 2619340590U, // <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS + 2619343104U, // <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4> + 2289254410U, // <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1> + 2289256598U, // <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2> + 2619345410U, // <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6> + 2619346230U, // <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS + 2756944976U, // <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6> + 3362996401U, // <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6> + 3362997455U, // <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7> + 2619348782U, // <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS + 2756945007U, // <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1> + 3830686840U, // <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1> + 3358361750U, // <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2> + 3830686857U, // <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0> + 2756945047U, // <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5> + 2294571346U, // <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5> + 3806105698U, // <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0> + 3873817774U, // <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1> + 2756945079U, // <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1> + 3830686912U, // 
<3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1> + 2756945103U, // <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2236547990U, // <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0> + 3826631905U, // <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7> + 3830686952U, // <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5> + 2756945139U, // <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 3830686972U, // <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7> + 2800076030U, // <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0> + 2756945166U, // <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7> + 3699081318U, // <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS + 2297905162U, // <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1> + 2297907350U, // <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2> + 3365675182U, // <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3> + 3699084598U, // <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS + 2297905490U, // <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5> + 2297905329U, // <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 3368330447U, // <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7> + 2297905169U, // <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u> + 2619375876U, // <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u> + 1678558004U, // <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 2289289366U, // <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2> + 1679000956U, // <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0> + 2619378998U, // <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS + 2756945297U, // <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3> + 2297905329U, // <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6> + 2800076192U, // <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0> + 1683203497U, // <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0> + 3362964203U, // <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0> + 2289222380U, // <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1> + 2289222462U, // <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2> + 1215479910U, // <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 3362964207U, // <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4> + 2289222708U, // <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2232600506U, // <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7> + 3396142296U, // <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7> + 1215479915U, // <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS + 3699105894U, // <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS + 3765633844U, // <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1> + 2691892120U, // <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2> + 2752300575U, // <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1> + 3699109174U, // <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS + 3830687280U, // <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0> + 3830687289U, // <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0> + 3874260548U, // <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2> + 2752742988U, // <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1> + 2631344230U, // <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS + 2697201184U, // <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2> + 1678558824U, // <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678558834U, // <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 2631347510U, // <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS + 3368953613U, // <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5> + 2234304442U, // <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7> + 3368953777U, // <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7> + 1679001247U, // <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3> + 1678558886U, // <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1> + 2752300719U, // <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1> + 2752300729U, // <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2> + 1221476454U, // <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS 
+ 1678558926U, // <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5> + 2800076503U, // <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5> + 2234746810U, // <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7> + 2800076516U, // <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0> + 1678558958U, // <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1> + 3699130470U, // <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS + 3362996972U, // <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1> + 2289256040U, // <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2> + 1215512678U, // <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 3362998676U, // <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4> + 2691894582U, // <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2235582394U, // <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7> + 3734967544U, // <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4> + 1215512683U, // <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS + 3705110630U, // <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS + 3368313985U, // <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1> + 3368314472U, // <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2> + 2756945768U, // <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6> + 3705113910U, // <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS + 3310061416U, // <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6> + 3310135226U, // <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7> + 3370305457U, // <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7> + 2752743317U, // <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6> + 2631376998U, // <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS + 3705119540U, // <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1> + 2631378621U, // <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6> + 1678559162U, // <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2631380278U, // <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS + 3370976956U, // <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5> + 2237065146U, // <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7> + 3798815594U, // <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2> + 1679001575U, // <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 2800076778U, // <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1> + 3371647724U, // <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1> + 2297906792U, // <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2> + 1224163430U, // <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 3705130294U, // <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS + 3371648052U, // <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5> + 2297906877U, // <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6> + 3371648702U, // <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7> + 1224163435U, // <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1679001659U, // <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1> + 2752743492U, // <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1> + 1678558824U, // <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2> + 1678559320U, // <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3> + 1679001699U, // <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5> + 2691897498U, // <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS + 2237908922U, // <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7> + 2800519289U, // <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0> + 1679001731U, // <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1> + 1215480726U, // <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1678559382U, // <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2> + 2631403200U, // <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0> + 2289223282U, // <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3> + 2752301232U, // <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1> + 3362965027U, // <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5> + 3362965352U, // <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6> + 2289223610U, // <3,3,0,7>: Cost 3 
vmrglw <1,2,3,0>, <2,6,3,7> + 1678559445U, // <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2> + 3830687964U, // <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0> + 2752301286U, // <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1> + 2752301297U, // <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3> + 2305157532U, // <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3> + 3830688000U, // <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0> + 3830688009U, // <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0> + 3830688019U, // <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1> + 3362973626U, // <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7> + 2752743719U, // <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3> + 2631417958U, // <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS + 3826043193U, // <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3> + 1624131186U, // <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3> + 2752301384U, // <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0> + 2631421238U, // <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS + 3826485602U, // <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u> + 2752301414U, // <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3> + 2771249519U, // <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3> + 1628112984U, // <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3> + 1563656294U, // <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS + 2301855911U, // <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1> + 2697873730U, // <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3> + 403488870U, // <3,3,3,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 2301856239U, // <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5> + 2697874067U, // <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7> + 2295220154U, // <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7> + 403488870U, // <3,3,3,u>: Cost 1 vspltisw3 LHS + 2289255318U, // <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0> + 2631435162U, // <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4> + 2631435972U, // <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4> + 2289256050U, // <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3> + 1215513498U, // <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679002114U, // <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6> + 3362998120U, // <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6> + 2289256378U, // <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7> + 1679002141U, // <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6> + 3831130657U, // <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1> + 3376277671U, // <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1> + 3771617012U, // <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3> + 2302536092U, // <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3> + 3831130697U, // <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5> + 2294572579U, // <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5> + 2800519773U, // <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7> + 3368314810U, // <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7> + 2800519791U, // <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7> + 2800077432U, // <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7> + 3310291185U, // <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3> + 2789165706U, // <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7> + 2764982931U, // <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7> + 2800077468U, // <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7> + 3873819301U, // <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7> + 2297235304U, // <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6> + 2725081963U, // <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3> + 2725745596U, // <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3> + 2631458918U, // <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS + 3705201460U, // <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1> + 
2631460551U, // <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7> + 2297906802U, // <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3> + 2631462198U, // <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS + 3371648547U, // <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5> + 3371648548U, // <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6> + 1224165306U, // <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1224165306U, // <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 1215480726U, // <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0> + 1679002398U, // <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2> + 1659967368U, // <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3> + 403488870U, // <3,3,u,3>: Cost 1 vspltisw3 LHS + 1563659574U, // <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS + 1679002438U, // <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6> + 2756946764U, // <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3> + 1224165306U, // <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7> + 403488870U, // <3,3,u,u>: Cost 1 vspltisw3 LHS + 2691907584U, // <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0> + 1618165862U, // <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631476937U, // <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0> + 2232601732U, // <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0> + 2691907922U, // <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5> + 1158860086U, // <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 3306343806U, // <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7> + 3366947484U, // <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7> + 1618166429U, // <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 2631483494U, // <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS + 2691908404U, // <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1> + 1618166682U, // <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4> + 3765650393U, // <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4> + 2631486774U, // <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS + 2756946914U, // <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0> + 3765650639U, // <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7> + 3735090439U, // <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1> + 1622148480U, // <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4> + 3765650893U, // <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0> + 3831131154U, // <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3> + 2691909224U, // <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2> + 2691909286U, // <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1> + 2699208469U, // <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4> + 2233863478U, // <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS + 2691909562U, // <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7> + 2701199368U, // <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4> + 2691909691U, // <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1> + 2691909782U, // <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2> + 3765651686U, // <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1> + 2691909972U, // <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3> + 2691910044U, // <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1> + 1161006390U, // <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691910300U, // <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 3368962716U, // <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7> + 1161006633U, // <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS + 2631508070U, // <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS + 2631508890U, // <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4> + 2631509709U, // <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4> + 2289256788U, // <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3> + 1726336208U, // <3,4,4,4>: Cost 2 vsldoi12 LHS, 
<4,4,4,4> + 1618169142U, // <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 3362998858U, // <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6> + 2289257116U, // <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7> + 1618169385U, // <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 1557774438U, // <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS + 2631516980U, // <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1> + 1557776078U, // <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5> + 2631518358U, // <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2> + 1557777718U, // <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS + 2296563406U, // <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5> + 604818742U, // <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2661381387U, // <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5> + 604818760U, // <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS + 3705266278U, // <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS + 3831131482U, // <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7> + 2733715962U, // <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3> + 3844771180U, // <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7> + 2800078197U, // <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7> + 2236550454U, // <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716280U, // <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6> + 2725090156U, // <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4> + 2236550697U, // <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS + 2733716474U, // <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2> + 3371647013U, // <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1> + 2727744688U, // <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4> + 3371649364U, // <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3> + 2733716838U, // <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6> + 2297906894U, // <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5> + 3371647180U, // <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6> + 2733717100U, // <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7> + 2297906897U, // <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u> + 1557799014U, // <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS + 1618171694U, // <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS + 1557800657U, // <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u> + 2691913660U, // <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1> + 1557802294U, // <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS + 1618172058U, // <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS + 604818985U, // <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS + 2661405966U, // <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u> + 604819003U, // <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2643492966U, // <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS + 2756947528U, // <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2> + 2331029019U, // <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2> + 2643495062U, // <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2> + 2756947554U, // <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1> + 2800078443U, // <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1> + 2289224194U, // <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6> + 3362964723U, // <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7> + 2756947590U, // <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1> + 2800078479U, // <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1> + 2333027218U, // <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1> + 2691916699U, // <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5> + 3832901294U, // <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5> + 2800078519U, // <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5> + 3830689467U, // <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0> + 3830689481U, // <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5> + 3873820365U, // <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0> + 2800078551U, // 
<3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1> + 3770967487U, // <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4> + 2697225763U, // <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5> + 3830689523U, // <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2> + 2699216590U, // <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5> + 2699216662U, // <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5> + 2783047439U, // <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3> + 2783121176U, // <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3> + 3856936737U, // <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3> + 2701871194U, // <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5> + 2643517542U, // <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS + 2331052946U, // <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1> + 3699345010U, // <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3> + 2705189276U, // <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3> + 2705189359U, // <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5> + 2331053274U, // <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5> + 2295220738U, // <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6> + 3368961267U, // <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7> + 2295220740U, // <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u> + 2643525734U, // <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS + 2331061138U, // <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1> + 2235584280U, // <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3> + 2643528194U, // <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6> + 2735713498U, // <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5> + 2756947892U, // <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6> + 2289256962U, // <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6> + 3362997491U, // <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7> + 2756947919U, // <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6> + 2800078803U, // <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1> + 2800078812U, // <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1> + 2631591639U, // <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5> + 3832901616U, // <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3> + 2800078843U, // <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5> + 1726337028U, // <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078862U, // <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6> + 3368314099U, // <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7> + 1726337028U, // <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2800078884U, // <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1> + 2800078899U, // <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7> + 2631599832U, // <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6> + 2800078914U, // <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4> + 2800078924U, // <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5> + 2800078935U, // <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7> + 2297235970U, // <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6> + 1726337122U, // <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0> + 1726337131U, // <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0> + 3699376230U, // <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS + 2333739922U, // <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1> + 3699378106U, // <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7> + 3371647915U, // <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3> + 3699379510U, // <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS + 2333740250U, // <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5> + 2297907714U, // <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6> + 3370984691U, // <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7> + 2297907716U, // <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u> + 2800079046U, // <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1> + 2756948176U, // <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2> 
+ 2331029019U, // <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2> + 2800079076U, // <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4> + 2800079085U, // <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4> + 1726337028U, // <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 2289289730U, // <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6> + 1726337284U, // <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0> + 1726337293U, // <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0> + 3773628416U, // <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0> + 2699886694U, // <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789167401U, // <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1> + 3362965862U, // <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3> + 3773628754U, // <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5> + 3723284326U, // <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0> + 2800079181U, // <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1> + 1215483190U, // <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1215483191U, // <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS + 3873821032U, // <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1> + 3773629236U, // <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1> + 2691924892U, // <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6> + 3830690184U, // <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6> + 3873821072U, // <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5> + 3873821082U, // <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6> + 3403453240U, // <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6> + 2289233206U, // <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 2289233207U, // <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS + 2661498982U, // <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS + 3770975780U, // <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6> + 2631640797U, // <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2> + 3771639485U, // <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6> + 2661502262U, // <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS + 2699888488U, // <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6> + 2661503482U, // <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3> + 1715425786U, // <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3> + 1715499523U, // <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3> + 3773630614U, // <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2> + 3372942825U, // <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1> + 2234749434U, // <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3> + 3368962406U, // <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3> + 2699889154U, // <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6> + 3773631068U, // <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6> + 2331054904U, // <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6> + 1221479734U, // <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 1221479735U, // <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS + 2235584801U, // <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2> + 3717342106U, // <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4> + 2789167729U, // <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5> + 2235585074U, // <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5> + 2235585165U, // <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6> + 2699889974U, // <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 2800079509U, // <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5> + 1215515958U, // <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1215515959U, // <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS + 3873821356U, // <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1> + 3372959209U, // <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1> + 3862909629U, // <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0> + 3773632358U, // <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0> + 3873821396U, // <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5> + 
3873821405U, // <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5> + 3862909672U, // <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7> + 2294574390U, // <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 2294574391U, // <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS + 2800079613U, // <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1> + 3873821446U, // <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1> + 2789167888U, // <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2> + 3844920090U, // <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3> + 2800079653U, // <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5> + 3723333484U, // <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6> + 1726337848U, // <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1726337858U, // <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7> + 1726337867U, // <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7> + 1726337870U, // <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1> + 2297906665U, // <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1> + 2792117090U, // <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3> + 2297907558U, // <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3> + 1726337910U, // <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5> + 2297906993U, // <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5> + 2297906832U, // <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6> + 1224166710U, // <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224166711U, // <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS + 1726337951U, // <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1> + 2699892526U, // <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS + 2789168049U, // <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1> + 2792854460U, // <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3> + 1726337991U, // <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5> + 2699892890U, // <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS + 1726337848U, // <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6> + 1215548726U, // <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 1215548727U, // <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS + 2700558336U, // <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0> + 1626816614U, // <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700558513U, // <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6> + 2331030010U, // <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3> + 2700558674U, // <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5> + 2800079906U, // <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6> + 2655588936U, // <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0> + 2800079919U, // <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1> + 1626817181U, // <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 3774300899U, // <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1> + 2700559156U, // <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1> + 2700559254U, // <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0> + 3774301148U, // <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7> + 3774301227U, // <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5> + 3774301295U, // <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1> + 3768329441U, // <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7> + 3403453250U, // <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7> + 2700559740U, // <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0> + 2700559849U, // <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1> + 3770983973U, // <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7> + 2700559976U, // <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2> + 2698569415U, // <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7> + 2700560177U, // <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5> + 3773638505U, // <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7> + 1626818490U, // <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7> + 2795140307U, // <3,7,2,7>: Cost 3 vsldoi12 
<7,2,7,3>, <7,2,7,3> + 1628145756U, // <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7> + 2700560534U, // <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2> + 3774302438U, // <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1> + 2700560742U, // <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3> + 2700560796U, // <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3> + 2700560898U, // <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6> + 3774302821U, // <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6> + 2700561079U, // <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7> + 2700561091U, // <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1> + 2700561182U, // <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2> + 2655617126U, // <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS + 3774303178U, // <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3> + 2655619002U, // <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7> + 2331062778U, // <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3> + 2655620406U, // <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS + 1626819894U, // <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 2655621708U, // <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4> + 2800080247U, // <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5> + 1626820137U, // <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 3774303816U, // <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2> + 3873822093U, // <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0> + 3774303998U, // <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4> + 3862910368U, // <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1> + 3774304180U, // <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6> + 2800080310U, // <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5> + 2800080321U, // <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7> + 3873822147U, // <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0> + 2800080339U, // <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7> + 2800080348U, // <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7> + 3873822181U, // <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7> + 2789168622U, // <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7> + 2700563016U, // <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0> + 2800080384U, // <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7> + 3862910472U, // <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6> + 2700563256U, // <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6> + 2800080404U, // <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0> + 2793149988U, // <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7> + 2637725798U, // <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS + 3371649227U, // <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1> + 2637727674U, // <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7> + 2297907567U, // <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3> + 2637729078U, // <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS + 3371649312U, // <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5> + 2655646287U, // <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7> + 1726338668U, // <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1726338668U, // <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 2700564179U, // <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2> + 1626822446U, // <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 2700564357U, // <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0> + 2700564412U, // <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1> + 2700564543U, // <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6> + 1626822810U, // <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS + 1662654672U, // <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7> + 1726338668U, // <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7> + 1626823013U, // <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS + 1678557184U, // <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0> + 1679005395U, // 
<3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2> + 2289221787U, // <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2> + 1215479964U, // <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS + 2752747245U, // <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1> + 1158863002U, // <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS + 2289224221U, // <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6> + 1215483208U, // <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS + 1679005458U, // <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2> + 1558036582U, // <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS + 1678558004U, // <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1> + 604821294U, // <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 2752747317U, // <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1> + 1558039862U, // <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS + 2756949830U, // <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0> + 2800080726U, // <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7> + 2289233224U, // <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS + 604821348U, // <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696586709U, // <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u> + 2757392246U, // <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3> + 1624172151U, // <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u> + 1679005576U, // <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3> + 2631789878U, // <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS + 2699904874U, // <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u> + 1626826683U, // <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u> + 1726338988U, // <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3> + 1683208117U, // <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3> + 1679005628U, // <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1> + 1161008942U, // <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS + 2752747471U, // <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2> + 403488870U, // <3,u,3,3>: Cost 1 vspltisw3 LHS + 1679005668U, // <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5> + 1161009306U, // <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS + 2691943104U, // <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7> + 1221479752U, // <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS + 403488870U, // <3,u,3,u>: Cost 1 vspltisw3 LHS + 2289255363U, // <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0> + 1161844526U, // <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS + 2289256661U, // <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2> + 1215512732U, // <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS + 1215513498U, // <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4> + 1679005759U, // <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6> + 2289256989U, // <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6> + 1215515976U, // <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS + 1679005786U, // <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6> + 1558069350U, // <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS + 2631811892U, // <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1> + 1558071026U, // <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5> + 2752747646U, // <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6> + 1558072630U, // <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS + 1726337028U, // <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5> + 604821658U, // <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 2294574408U, // <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS + 604821676U, // <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 2631819366U, // <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS + 2757392574U, // <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7> + 2631821043U, // <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6> + 1679005904U, // <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 2631822646U, // <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS + 2236553370U, // <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS + 1726337848U, // <3,u,6,6>: Cost 2 vsldoi12 LHS, 
<6,6,6,6> + 1726339309U, // <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0> + 1683208445U, // <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7> + 1726339328U, // <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1> + 2297905225U, // <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1> + 2631829236U, // <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7> + 1224163484U, // <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS + 1726339368U, // <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5> + 2297905553U, // <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5> + 2297905392U, // <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6> + 1224166728U, // <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS + 1224163489U, // <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS + 1683208529U, // <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1> + 1679006043U, // <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2> + 604821861U, // <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 403488870U, // <3,u,u,3>: Cost 1 vspltisw3 LHS + 1683208569U, // <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5> + 1679006083U, // <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6> + 604821901U, // <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 1215548744U, // <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS + 604821915U, // <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS + 2759016448U, // <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0> + 1165115494U, // <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS + 3717531337U, // <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0> + 3369675785U, // <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3> + 2751791144U, // <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4> + 2238857630U, // <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0> + 3312591341U, // <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7> + 3369676113U, // <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7> + 1165116061U, // <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS + 2637824102U, // <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS + 2637824922U, // <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4> + 1685274726U, // <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637826512U, // <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1> + 2637827382U, // <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS + 2661716070U, // <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4> + 3729486427U, // <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1> + 2661717300U, // <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1> + 1685274780U, // <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 3711574118U, // <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS + 2240200806U, // <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 3771663992U, // <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0> + 2698585801U, // <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0> + 3373672105U, // <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4> + 3810813795U, // <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1> + 3772327866U, // <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7> + 3386280568U, // <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7> + 2701903966U, // <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0> + 3699638374U, // <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS + 2753560832U, // <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 3772328276U, // <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3> + 3827302674U, // <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4> + 3699641654U, // <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS + 3779627588U, // <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0> + 3772328604U, // <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7> + 3780954854U, // <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0> + 2753560832U, // <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4> + 2725129106U, // <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1> + 1167720550U, // 
<4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 3839172953U, // <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3> + 3772329051U, // <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4> + 2241462610U, // <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5> + 2698587446U, // <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 3772329297U, // <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7> + 3735483703U, // <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4> + 1167721117U, // <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS + 1168556032U, // <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 94814310U, // <4,0,5,1>: Cost 1 vmrghw RHS, LHS + 2242298029U, // <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2637859284U, // <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5> + 1168556370U, // <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2242306530U, // <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5> + 2242298358U, // <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661750072U, // <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5> + 94814877U, // <4,0,5,u>: Cost 1 vmrghw RHS, LHS + 3316580362U, // <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1> + 2242846822U, // <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS + 3798872570U, // <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3> + 3796218413U, // <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0> + 3834528273U, // <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7> + 3798872811U, // <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1> + 3316621876U, // <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6> + 2725131121U, // <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0> + 2242847389U, // <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS + 3377692672U, // <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0> + 2243493990U, // <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 3775648970U, // <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3> + 3802191110U, // <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0> + 3317236050U, // <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5> + 3803518376U, // <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0> + 3317236214U, // <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7> + 3798873708U, // <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7> + 2243494557U, // <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS + 1170546688U, // <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 96804966U, // <4,0,u,1>: Cost 1 vmrghw RHS, LHS + 1685275293U, // <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2637883863U, // <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u> + 1170547026U, // <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2698590362U, // <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS + 2244289014U, // <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7> + 2661774651U, // <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u> + 96805533U, // <4,0,u,u>: Cost 1 vmrghw RHS, LHS + 2667749478U, // <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS + 2689966182U, // <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS + 2238571418U, // <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4> + 3711633880U, // <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0> + 2689966418U, // <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5> + 3361046866U, // <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5> + 3741495802U, // <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3> + 3741496314U, // <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2> + 2689966765U, // <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1> + 3764372222U, // <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1> + 2758206263U, // <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4> + 2698593178U, // <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4> + 3361057810U, // <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3> + 3827303250U, // <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, 
<1,1,4,4> + 2287313234U, // <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5> + 3763709171U, // <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7> + 3361058138U, // <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7> + 2239759744U, // <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4> + 2637906022U, // <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS + 2637906842U, // <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4> + 3763709544U, // <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2> + 1685275546U, // <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4> + 2637909302U, // <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS + 3361063250U, // <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5> + 3763709882U, // <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7> + 3735541054U, // <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2> + 1685644231U, // <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4> + 2702575792U, // <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1> + 3832759257U, // <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4> + 3833349090U, // <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4> + 3763710364U, // <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3> + 2707884546U, // <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6> + 3361071442U, // <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5> + 3772336796U, // <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7> + 3775654595U, // <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1> + 2707884856U, // <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1> + 2667782246U, // <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS + 2241463092U, // <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1> + 2241553306U, // <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4> + 3827303484U, // <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4> + 2667785424U, // <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4> + 2689969462U, // <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 3763711322U, // <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7> + 3867116636U, // <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0> + 2689969705U, // <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS + 1546273106U, // <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5> + 1168556852U, // <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1168556950U, // <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 2620016790U, // <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2> + 1546276150U, // <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS + 2620018692U, // <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5> + 2242299087U, // <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667795450U, // <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2> + 1546278702U, // <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS + 3781628193U, // <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2> + 3832759503U, // <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7> + 3316261786U, // <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4> + 3781628466U, // <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5> + 3827303658U, // <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7> + 3361096018U, // <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5> + 3788264248U, // <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6> + 3788264270U, // <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1> + 3832759566U, // <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7> + 2726466580U, // <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1> + 3377692682U, // <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1> + 3377694870U, // <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2> + 3802199303U, // <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1> + 2731775334U, // <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6> + 3377693010U, // <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5> + 3365749804U, // <4,1,7,6>: Cost 5 
vmrglw <1,6,4,7>, <1,4,1,6> + 3788265068U, // <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7> + 2731775644U, // <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1> + 1546297685U, // <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u> + 1170547508U, // <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1170547606U, // <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 1689257344U, // <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4> + 1546300726U, // <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS + 2284716370U, // <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5> + 2244289743U, // <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7> + 2667820026U, // <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2> + 1546303278U, // <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS + 3729621094U, // <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS + 3763716198U, // <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS + 2238858856U, // <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2> + 2295930982U, // <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 3763716434U, // <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5> + 2238859107U, // <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1> + 2238859194U, // <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7> + 3312601066U, // <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1> + 2295930987U, // <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS + 3699769446U, // <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS + 3313255971U, // <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5> + 3361056360U, // <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2> + 2287312998U, // <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 3788932148U, // <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5> + 3313256290U, // <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0> + 3838289469U, // <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4> + 3369682865U, // <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7> + 2287313003U, // <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS + 3838658133U, // <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1> + 3711722394U, // <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4> + 2759018088U, // <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2> + 2759018098U, // <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3> + 3838658168U, // <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0> + 3369027341U, // <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5> + 2240227258U, // <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7> + 3735614791U, // <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2> + 2759018143U, // <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3> + 2759018150U, // <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1> + 3831948975U, // <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1> + 3832759993U, // <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2> + 2759018180U, // <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4> + 2759018185U, // <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0> + 3839542998U, // <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4> + 3314640826U, // <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7> + 2765948648U, // <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4> + 2759018222U, // <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1> + 3838658295U, // <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1> + 3315205667U, // <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5> + 2241463912U, // <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2> + 1234829414U, // <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 2241464085U, // <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4> + 2241546087U, // <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5> + 2241464250U, // <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7> + 3741602873U, // <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2> + 1234829419U, // <4,2,4,u>: Cost 2 
vmrglw <4,4,4,4>, LHS + 2626060390U, // <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS + 2626061364U, // <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5> + 1168557672U, // <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222230118U, // <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 2626063670U, // <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS + 2242299752U, // <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1168558010U, // <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2242299882U, // <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1222230123U, // <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS + 3711754342U, // <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS + 3711755162U, // <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4> + 3838658481U, // <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7> + 2759018426U, // <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7> + 3838658499U, // <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7> + 3735646310U, // <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4> + 3316590522U, // <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7> + 3798889331U, // <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2> + 2759018471U, // <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7> + 3874564074U, // <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1> + 3800880230U, // <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2> + 3371722344U, // <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2> + 2303950950U, // <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 3371722346U, // <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4> + 3371722509U, // <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5> + 3317237690U, // <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7> + 3317237738U, // <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1> + 2303950955U, // <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 2759018555U, // <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1> + 2626085943U, // <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u> + 1170548328U, // <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1222254694U, // <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 2759018595U, // <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5> + 2244290408U, // <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6> + 1170548666U, // <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2769266813U, // <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4> + 1222254699U, // <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS + 2238859414U, // <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2> + 2759018646U, // <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2> + 3312314708U, // <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3> + 2238859676U, // <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3> + 2295931802U, // <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4> + 3735670886U, // <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4> + 3312315036U, // <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7> + 3369674682U, // <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7> + 2759018709U, // <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2> + 3361055638U, // <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0> + 3831949542U, // <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1> + 2703917978U, // <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3361056370U, // <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3> + 2295939994U, // <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4> + 3361056291U, // <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5> + 3378972520U, // <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6> + 3361056698U, // <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7> + 2703917978U, // <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4> + 3832760624U, // <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3> + 3711796122U, // <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4> + 
3832760641U, // <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2> + 2770962764U, // <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4> + 2759018836U, // <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3> + 3827304802U, // <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u> + 3832760678U, // <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3> + 3859597679U, // <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3> + 2771331449U, // <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4> + 2240841878U, // <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2> + 3776997635U, // <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3> + 2703919444U, // <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3> + 2759018908U, // <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3> + 2759018918U, // <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4> + 3386951446U, // <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5> + 3777661596U, // <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7> + 3375007674U, // <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7> + 2707901242U, // <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3> + 2759018960U, // <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1> + 2759018970U, // <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2> + 2632099605U, // <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4> + 2241464732U, // <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3> + 2759019000U, // <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5> + 2753563138U, // <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6> + 3777662316U, // <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7> + 2308573114U, // <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7> + 2759019032U, // <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1> + 1168558230U, // <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2242300134U, // <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1> + 2632107798U, // <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5> + 1168558492U, // <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1168558594U, // <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2295973654U, // <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5> + 2242300536U, // <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295973818U, // <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7> + 1168558878U, // <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 3832760952U, // <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7> + 3711828890U, // <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4> + 3316484436U, // <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3> + 3711830512U, // <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6> + 2759019164U, // <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3361097251U, // <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5> + 3316624045U, // <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6> + 2773912244U, // <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4> + 2759019164U, // <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7> + 3377693590U, // <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0> + 3365751680U, // <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1> + 2727810232U, // <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3> + 3377694322U, // <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3> + 2303951770U, // <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4> + 3741700198U, // <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4> + 3377695216U, // <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6> + 3375703994U, // <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7> + 2731792030U, // <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3> + 1170548886U, // <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2759019294U, // <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2> + 2632132377U, // <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u> + 1170549148U, // 
<4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1170549250U, // <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 2759019334U, // <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6> + 2244291192U, // <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7> + 2295998394U, // <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7> + 1170549534U, // <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2> + 1165118354U, // <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1637482598U, // <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 3711854285U, // <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4> + 3827305344U, // <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1> + 2711224658U, // <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5> + 1165118774U, // <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 3312602489U, // <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2> + 3369675420U, // <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7> + 1165119017U, // <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS + 3369682633U, // <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0> + 2287313581U, // <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1> + 2759019466U, // <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3> + 3369683284U, // <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3> + 2311204048U, // <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4> + 2239319350U, // <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS + 3784967411U, // <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7> + 3369683612U, // <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7> + 2763000832U, // <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3> + 3711869030U, // <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS + 3711869850U, // <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4> + 2240203830U, // <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3> + 2698618573U, // <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4> + 2711226133U, // <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4> + 2240204086U, // <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2711226298U, // <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7> + 3832761416U, // <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3> + 2701936738U, // <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4> + 2711226518U, // <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2> + 3777005828U, // <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4> + 3832761453U, // <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4> + 2301266260U, // <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 2705254903U, // <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4> + 2240843062U, // <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 3832761489U, // <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4> + 3375008412U, // <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7> + 2301266260U, // <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3> + 1570373734U, // <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS + 2308574089U, // <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1> + 2644117096U, // <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2> + 2638146039U, // <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4> + 229035318U, // <4,4,4,4>: Cost 1 vspltisw0 RHS + 1167723830U, // <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS + 2644120058U, // <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3> + 2662036827U, // <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4> + 229035318U, // <4,4,4,u>: Cost 1 vspltisw0 RHS + 1168558994U, // <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 2638152602U, // <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4> + 2242300981U, // <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638154232U, // <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5> + 1168559322U, // <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5> + 94817590U, // <4,4,5,5>: Cost 1 vmrghw RHS, RHS + 1685278006U, // <4,4,5,6>: 
Cost 2 vsldoi12 <1,2,3,4>, RHS + 2242309576U, // <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 94817833U, // <4,4,5,u>: Cost 1 vmrghw RHS, RHS + 3316591506U, // <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1> + 3758428587U, // <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5> + 2711228922U, // <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3> + 3796251185U, // <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4> + 2711229085U, // <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4> + 2242850102U, // <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2242850169U, // <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2> + 2725163893U, // <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4> + 2242850345U, // <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS + 2711229434U, // <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2> + 3377694410U, // <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1> + 3868593584U, // <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3> + 3377695060U, // <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3> + 2729145691U, // <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4> + 2243497270U, // <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 3871542744U, // <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7> + 2303953564U, // <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7> + 2243497513U, // <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS + 1170549650U, // <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1> + 1637488430U, // <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS + 2244291637U, // <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2> + 2638178811U, // <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u> + 229035318U, // <4,4,u,4>: Cost 1 vspltisw0 RHS + 96808246U, // <4,4,u,5>: Cost 1 vmrghw RHS, RHS + 1685278249U, // <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 2244292040U, // <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0> + 96808489U, // <4,4,u,u>: Cost 1 vmrghw RHS, RHS + 2698625024U, // <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0> + 1624883302U, // <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2638186190U, // <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5> + 2638187004U, // <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0> + 2687345005U, // <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5> + 2238861316U, // <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5> + 2662077302U, // <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5> + 2662077792U, // <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0> + 1624883869U, // <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 3361057762U, // <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0> + 2691326803U, // <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5> + 2698625942U, // <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0> + 3361055659U, // <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3> + 3761087567U, // <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5> + 2693981335U, // <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5> + 2305231362U, // <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 3361055987U, // <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7> + 2695972234U, // <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5> + 2638200934U, // <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS + 3761088035U, // <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5> + 2697963133U, // <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5> + 1624884942U, // <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5> + 2698626838U, // <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5> + 3772368744U, // <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6> + 2698627002U, // <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7> + 3775023122U, // <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5> + 1628203107U, // <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5> + 2698627222U, // <4,5,3,0>: Cost 3 
vsldoi8 <2,3,4,5>, <3,0,1,2> + 3765070057U, // <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4> + 2698627404U, // <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4> + 2698627484U, // <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3> + 2698627580U, // <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0> + 3779668553U, // <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5> + 2725169844U, // <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4> + 2707253995U, // <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5> + 2698627870U, // <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2> + 2638217318U, // <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS + 2308574098U, // <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1> + 2698628150U, // <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3> + 2638219776U, // <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4> + 2698628314U, // <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5> + 1624886582U, // <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 2698628478U, // <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7> + 2662110564U, // <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4> + 1624886825U, // <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1570455654U, // <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS + 2312564250U, // <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1> + 2644199118U, // <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5> + 2295974966U, // <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3> + 1570458842U, // <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5> + 1168568324U, // <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5> + 1168568418U, // <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2295975294U, // <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7> + 1168716036U, // <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0> + 1564491878U, // <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS + 2626290768U, // <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6> + 2632263465U, // <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6> + 1564494338U, // <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6> + 1564495158U, // <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS + 2638237464U, // <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3> + 2656154253U, // <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2725172218U, // <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2> + 3859599489U, // <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4> + 2698630320U, // <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4> + 2728490251U, // <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5> + 2725172576U, // <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0> + 3317239812U, // <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5> + 2725172760U, // <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4> + 2725172844U, // <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7> + 2725172866U, // <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2> + 1564508262U, // <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS + 1624889134U, // <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS + 2698631045U, // <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0> + 1564510724U, // <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u> + 1564511542U, // <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS + 1624889498U, // <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS + 1170550882U, // <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 3312595285U, // <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0> + 3763748966U, // <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS + 2238861818U, // <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3> + 3767730432U, // <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4> + 
3763749202U, // <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5> + 2238862059U, // <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1> + 2238862136U, // <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6> + 2295934262U, // <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 2295934263U, // <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS + 3378973999U, // <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0> + 3378974648U, // <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1> + 3779675034U, // <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4> + 3378974002U, // <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3> + 3378974003U, // <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4> + 3767731352U, // <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6> + 3378974734U, // <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6> + 2287316278U, // <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 2287316279U, // <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS + 3735904358U, // <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS + 3763750435U, // <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5> + 3313938937U, // <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2> + 3772376782U, // <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5> + 3852890591U, // <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3> + 3735908454U, // <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4> + 3801573306U, // <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7> + 2785858042U, // <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3> + 2785858051U, // <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3> + 3863065101U, // <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4> + 3314586024U, // <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2> + 3863212575U, // <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4> + 3863286312U, // <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4> + 3767732738U, // <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6> + 3779676746U, // <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6> + 3398898488U, // <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6> + 2301267254U, // <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2301267255U, // <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS + 3852890715U, // <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1> + 3315208615U, // <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1> + 2241466874U, // <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3> + 3852890745U, // <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4> + 2241467037U, // <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4> + 2241549039U, // <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5> + 2241467192U, // <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6> + 1234832694U, // <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 1234832695U, // <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS + 2242302241U, // <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2> + 2242310567U, // <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1168568826U, // <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2242302514U, // <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2242302605U, // <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2242310891U, // <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1168569144U, // <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222233398U, // <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 1222233399U, // <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS + 3316576545U, // <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2> + 3316584871U, // <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1> + 2242851322U, // <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3> + 3316601394U, // <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5> + 3852890916U, // <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4> + 3316617963U, // <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1> + 2242884408U, // <4,6,6,6>: Cost 3 vmrghw 
<4,6,5,6>, <6,6,6,6> + 2785858370U, // <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7> + 2785858379U, // <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7> + 2785858382U, // <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1> + 3859600215U, // <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1> + 3317240314U, // <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3> + 2792199020U, // <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4> + 2785858422U, // <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5> + 3856651132U, // <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2> + 3317240632U, // <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6> + 2303954230U, // <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303954231U, // <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS + 2244292897U, // <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2> + 2244293031U, // <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1170551290U, // <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2244293170U, // <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2244293261U, // <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6> + 2244293355U, // <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1> + 1170551608U, // <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1222257974U, // <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS + 1222257975U, // <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS + 2238862330U, // <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2> + 2706604134U, // <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3312604308U, // <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3> + 3768402176U, // <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4> + 2238862648U, // <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5> + 3859600418U, // <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6> + 3729994393U, // <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0> + 2238862956U, // <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7> + 2706604701U, // <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 3385610338U, // <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0> + 3780346676U, // <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1> + 2706604954U, // <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3385610746U, // <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3> + 3385610342U, // <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4> + 3385610667U, // <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5> + 3768403178U, // <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7> + 3385611074U, // <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7> + 2706604954U, // <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4> + 3859600532U, // <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3> + 3712091034U, // <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4> + 3774375528U, // <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2> + 2794853552U, // <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4> + 2785858744U, // <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3> + 3735982182U, // <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4> + 3774375875U, // <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7> + 3735983476U, // <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2> + 2795222237U, // <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4> + 3780348054U, // <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2> + 3730015130U, // <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4> + 3780348244U, // <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3> + 3778357673U, // <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7> + 2325155942U, // <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4> + 3779684939U, // <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7> + 2706606748U, // <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7> + 3398898498U, // <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7> + 2707934014U, // <4,7,3,u>: Cost 3 vsldoi8 
<3,u,4,7>, <3,u,4,7> + 2785858868U, // <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1> + 3780348874U, // <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3> + 3780349000U, // <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3> + 2308575738U, // <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3> + 2656283856U, // <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4> + 2706607414U, // <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2656285341U, // <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4> + 2241468012U, // <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7> + 2706607657U, // <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 1168569338U, // <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2242311242U, // <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1> + 2242303178U, // <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3> + 2242311395U, // <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1168569702U, // <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2242311606U, // <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5> + 2242311662U, // <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1168569964U, // <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1168569986U, // <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 3316593658U, // <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2> + 3316593738U, // <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1> + 3316634800U, // <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4> + 3386978810U, // <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3> + 2785859072U, // <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7> + 3736014950U, // <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4> + 3316594158U, // <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7> + 2797803032U, // <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4> + 2797876769U, // <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4> + 2243499002U, // <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2> + 3718103962U, // <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4> + 3317257418U, // <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3> + 3377695816U, // <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3> + 2243532134U, // <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6> + 3317282230U, // <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5> + 2730497536U, // <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7> + 2243556972U, // <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7> + 2243565186U, // <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2> + 1170551802U, // <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2706609966U, // <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS + 2244293797U, // <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2> + 2244293859U, // <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1> + 1170552166U, // <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 2706610330U, // <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS + 2244294126U, // <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7> + 1170552428U, // <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1170552450U, // <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2> + 1165118354U, // <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1624907878U, // <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638407377U, // <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u> + 2295931036U, // <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS + 2687369584U, // <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u> + 1165121690U, // <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS + 2662298489U, // <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u> + 2295934280U, // <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS + 1624908445U, // <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS + 2638413926U, // <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS + 2691351382U, // <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u> + 1685280558U, // <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2287313052U, // 
<4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS + 2299257799U, // <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4> + 2694005914U, // <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u> + 2305231362U, // <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6> + 2287316296U, // <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS + 1685280612U, // <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 2638422118U, // <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS + 2240206638U, // <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS + 2697987712U, // <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u> + 1624909521U, // <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u> + 2759391121U, // <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3> + 2240207002U, // <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS + 2698651578U, // <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7> + 2785859500U, // <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3> + 1628227686U, // <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u> + 2759022524U, // <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1> + 2801342408U, // <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4> + 2703960409U, // <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u> + 2759022554U, // <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4> + 2759022564U, // <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5> + 2240845978U, // <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS + 2706614941U, // <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u> + 2301267272U, // <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS + 2759022596U, // <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1> + 1570668646U, // <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS + 1167726382U, // <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS + 2698652753U, // <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3> + 1234829468U, // <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS + 229035318U, // <4,u,4,4>: Cost 1 vspltisw0 RHS + 1624911158U, // <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS + 2698653081U, // <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7> + 1234832712U, // <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS + 229035318U, // <4,u,4,u>: Cost 1 vspltisw0 RHS + 1168561875U, // <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2> + 94820142U, // <4,u,5,1>: Cost 1 vmrghw RHS, LHS + 1168562053U, // <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0> + 1222230172U, // <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS + 1168562239U, // <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6> + 94820506U, // <4,u,5,5>: Cost 1 vmrghw RHS, RHS + 1685280922U, // <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 1222233416U, // <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS + 94820709U, // <4,u,5,u>: Cost 1 vmrghw RHS, LHS + 1564713062U, // <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS + 2626511979U, // <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6> + 2632484676U, // <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6> + 1564715549U, // <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6> + 1564716342U, // <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS + 2242853018U, // <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS + 2656375464U, // <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6> + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2785859840U, // <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1> + 2243499822U, // <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS + 2727851197U, // <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u> + 2303951004U, // <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS + 2785859880U, // <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5> + 2243500186U, // <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS + 2730505729U, // <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u> + 2303954248U, // <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS + 2303951009U, // 
<4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS + 1564729446U, // <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS + 96810798U, // <4,u,u,1>: Cost 1 vmrghw RHS, LHS + 1685281125U, // <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS + 1222254748U, // <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS + 229035318U, // <4,u,u,4>: Cost 1 vspltisw0 RHS + 96811162U, // <4,u,u,5>: Cost 1 vmrghw RHS, RHS + 1685281165U, // <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2754232320U, // <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0> + 2754232330U, // <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1> + 3718194894U, // <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5> + 3376385762U, // <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3> + 2754232357U, // <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1> + 3845816370U, // <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5> + 3782353389U, // <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7> + 3376386090U, // <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7> + 2757402697U, // <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1> + 2626543718U, // <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS + 2626544751U, // <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1> + 1680490598U, // <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3766428665U, // <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0> + 2626546998U, // <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS + 2650435539U, // <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1> + 3783017715U, // <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7> + 3385019000U, // <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7> + 1680490652U, // <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 3376398336U, // <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0> + 2245877862U, // <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS + 3773064808U, // <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2> + 2705295054U, // <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 3827974343U, // <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1> + 3845816530U, // <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3> + 3779037114U, // <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7> + 3810887658U, // <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1> + 2245878429U, // <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS + 2710603926U, // <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2> + 3827974396U, // <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0> + 3779037516U, // <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4> + 3779037596U, // <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3> + 2705295868U, // <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0> + 3379726804U, // <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5> + 3802925748U, // <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4> + 3363138168U, // <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7> + 2707950400U, // <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0> + 2626568294U, // <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS + 1680490834U, // <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 3828048219U, // <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5> + 2710604932U, // <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0> + 2754232685U, // <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5> + 2705296694U, // <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779038590U, // <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7> + 2713259464U, // <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1680490834U, // <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5> + 2311307264U, // <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0> + 1174437990U, // <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 3779038946U, // <5,0,5,2>: Cost 4 
vsldoi8 <3,4,5,0>, <5,2,0,3> + 3845816752U, // <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0> + 2248180050U, // <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5> + 2248180194U, // <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5> + 3779039274U, // <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7> + 3385051768U, // <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7> + 1174438557U, // <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS + 2302689280U, // <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0> + 1175208038U, // <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 3787002362U, // <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3> + 3376432160U, // <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3> + 2248950098U, // <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5> + 2248950180U, // <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 3376433702U, // <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6> + 2729186166U, // <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5> + 1175208605U, // <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS + 2713261050U, // <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2> + 3365823599U, // <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1> + 3808900317U, // <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4> + 3784348899U, // <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1> + 2729186656U, // <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0> + 3787003268U, // <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0> + 3802928664U, // <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4> + 3787003431U, // <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1> + 2731841188U, // <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0> + 2626601062U, // <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS + 1683145366U, // <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5> + 1680491165U, // <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2705295054U, // <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5> + 2754233005U, // <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1> + 2705299610U, // <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS + 3779041488U, // <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7> + 2737150252U, // <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0> + 1680491219U, // <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2713927680U, // <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0> + 1640185958U, // <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2310607866U, // <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 3787669756U, // <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0> + 2713928018U, // <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5> + 2306621778U, // <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5> + 3787670006U, // <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7> + 3736188301U, // <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0> + 1640186525U, // <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2650505318U, // <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS + 2754233140U, // <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1> + 2311276694U, // <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2> + 2311278315U, // <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3> + 2758435667U, // <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5> + 2754233180U, // <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5> + 3385016497U, // <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6> + 2311278643U, // <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7> + 2758730615U, // <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5> + 3700367462U, // <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS + 3830629255U, // <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3> + 2713929320U, // <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2> + 2754233238U, // <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0> + 2759099300U, // 
<5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5> + 2754233259U, // <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3> + 2713929658U, // <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7> + 3872359354U, // <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0> + 2754233283U, // <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0> + 2713929878U, // <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2> + 3363135498U, // <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1> + 3363137686U, // <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2> + 2713930140U, // <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3> + 2713930242U, // <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6> + 2289394002U, // <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5> + 3787672184U, // <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7> + 3787672259U, // <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1> + 2713930526U, // <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2> + 1634880402U, // <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1> + 2760205355U, // <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5> + 2760279092U, // <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5> + 3787672708U, // <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0> + 2713930960U, // <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4> + 1640189238U, // <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 3786345848U, // <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1> + 3787009481U, // <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1> + 1640189466U, // <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1> + 2754233455U, // <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1> + 2713931407U, // <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1> + 2713931499U, // <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3> + 3827975305U, // <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0> + 2754233495U, // <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5> + 2288746834U, // <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5> + 2713931827U, // <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7> + 3787673725U, // <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0> + 2754233527U, // <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1> + 2668462182U, // <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS + 2290746002U, // <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1> + 2302691478U, // <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2> + 3364488071U, // <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3> + 2302689536U, // <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4> + 2754233587U, // <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7> + 2713932600U, // <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6> + 2713932622U, // <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1> + 2302689297U, // <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u> + 2713932794U, // <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2> + 3365822474U, // <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1> + 3365824662U, // <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2> + 3787674851U, // <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1> + 2713933158U, // <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6> + 2292080978U, // <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5> + 3365823613U, // <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6> + 2713933420U, // <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7> + 2713933442U, // <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2> + 1658771190U, // <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1> + 1640191790U, // <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 2762933624U, // <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5> + 2754233724U, // <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0> + 2763081098U, // <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5> + 1640192154U, 
// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS + 2713934032U, // <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7> + 2713934080U, // <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1> + 1640192357U, // <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS + 3779051520U, // <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0> + 2705309798U, // <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 3838813637U, // <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1> + 2302640230U, // <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 3765117266U, // <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5> + 3381027892U, // <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5> + 3842794985U, // <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1> + 3408232554U, // <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7> + 2302640235U, // <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS + 3700432998U, // <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS + 3765117785U, // <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2> + 2311276136U, // <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2> + 1237532774U, // <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700436278U, // <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS + 3381036084U, // <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5> + 3385018045U, // <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6> + 3385017560U, // <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7> + 1237532779U, // <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS + 3700441190U, // <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS + 3700442242U, // <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2> + 2754233960U, // <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2> + 2754233970U, // <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3> + 2765071997U, // <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5> + 3834021508U, // <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3> + 3842795152U, // <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6> + 3376402492U, // <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7> + 2754234015U, // <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3> + 2754234022U, // <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1> + 3827975855U, // <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1> + 2644625102U, // <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393766U, // <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1691993806U, // <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5> + 2785052375U, // <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5> + 3854812897U, // <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6> + 3802942187U, // <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5> + 1692288754U, // <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5> + 3839846139U, // <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5> + 2709294052U, // <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2> + 2766251789U, // <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5> + 2765735702U, // <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5> + 3840141087U, // <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5> + 2705313078U, // <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2712612217U, // <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2> + 3787017674U, // <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2> + 2765735747U, // <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5> + 3834021704U, // <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1> + 3834021714U, // <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2> + 2311308904U, // <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2> + 1237565542U, // <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 3834021744U, // <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5> + 3369124916U, // <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5> + 2248181690U, // <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7> + 
3786354825U, // <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3> + 1237565547U, // <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS + 3700473958U, // <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS + 3700475014U, // <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6> + 2296718952U, // <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2> + 1228947558U, // <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 3700477238U, // <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS + 3834021836U, // <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7> + 2248951738U, // <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7> + 3370461105U, // <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7> + 1228947563U, // <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 3786355706U, // <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2> + 3783038037U, // <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3> + 3365824104U, // <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2> + 2292080742U, // <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS + 3842131986U, // <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5> + 3371795508U, // <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5> + 3786356206U, // <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7> + 3786356332U, // <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7> + 2292080747U, // <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS + 2754234427U, // <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1> + 2705315630U, // <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS + 2296735336U, // <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2> + 1228963942U, // <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 1695311971U, // <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5> + 2705315994U, // <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS + 2769201269U, // <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5> + 3370477489U, // <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7> + 1695606919U, // <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5> + 3827976331U, // <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0> + 2754234518U, // <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2> + 3706472290U, // <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0> + 3700500630U, // <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2> + 2754234544U, // <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1> + 3376383766U, // <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5> + 3769770513U, // <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7> + 3376383930U, // <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7> + 2754234581U, // <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2> + 2311275414U, // <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0> + 2305967971U, // <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1> + 2692047787U, // <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3> + 2311276146U, // <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3> + 2311275418U, // <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4> + 3765789807U, // <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1> + 3765789939U, // <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7> + 2311276474U, // <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7> + 2696029585U, // <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3> + 2311288709U, // <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 3765790243U, // <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5> + 3827976513U, // <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2> + 2765736268U, // <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4> + 2246248962U, // <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6> + 3765790563U, // <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1> + 3827976550U, // <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3> + 3842795887U, // <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3> + 2769054073U, // <5,3,2,u>: Cost 3 vsldoi12 
<2,u,4,5>, <3,2,u,4> + 3827976575U, // <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1> + 3765790963U, // <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5> + 3839478162U, // <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2> + 2754234780U, // <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3> + 2771708327U, // <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5> + 3363137059U, // <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5> + 3375081320U, // <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6> + 3363137466U, // <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7> + 2772003275U, // <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5> + 2772077012U, // <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5> + 3765791714U, // <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0> + 2709965878U, // <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3> + 2772298223U, // <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5> + 2772371960U, // <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5> + 2754234882U, // <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6> + 3839478282U, // <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5> + 3376416698U, // <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7> + 2754234909U, // <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6> + 2311308182U, // <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0> + 3765792421U, // <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5> + 2715938575U, // <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3> + 2311308914U, // <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3> + 2311308186U, // <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4> + 2248182354U, // <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5> + 3765792837U, // <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7> + 2311309242U, // <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7> + 2311308190U, // <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u> + 2632777830U, // <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3706520372U, // <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1> + 2632779624U, // <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6> + 2632780290U, // <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6> + 2632781110U, // <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS + 2248952413U, // <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7> + 2302691176U, // <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302691258U, // <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7> + 2632783662U, // <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS + 3365823382U, // <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0> + 3706529011U, // <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7> + 3706529641U, // <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7> + 3365824114U, // <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3> + 2774362859U, // <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5> + 3365824035U, // <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5> + 3383740183U, // <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6> + 3363833786U, // <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7> + 2774657807U, // <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5> + 2632794214U, // <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS + 2754235166U, // <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2> + 2632796010U, // <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u> + 2632796676U, // <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u> + 2632797494U, // <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS + 2754235206U, // <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6> + 2302691176U, // <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6> + 2302707642U, // <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7> + 2754235229U, // <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2> + 3765133325U, // <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, 
<0,0,1,4> + 2705326182U, // <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 3718489806U, // <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5> + 3718490624U, // <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4> + 2709307730U, // <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5> + 2302641870U, // <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5> + 3376383695U, // <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6> + 3384351018U, // <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7> + 2705326749U, // <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS + 2305971057U, // <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0> + 3765134171U, // <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4> + 3766461338U, // <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4> + 3766461437U, // <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4> + 2311277776U, // <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4> + 2754235362U, // <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0> + 3783050483U, // <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7> + 3385019036U, // <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7> + 2311276241U, // <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u> + 3718504550U, // <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS + 3783050787U, // <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5> + 3773097576U, // <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2> + 2705327822U, // <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 3773097767U, // <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4> + 2765737014U, // <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3> + 3779069882U, // <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7> + 3376401052U, // <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7> + 2245881370U, // <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1> + 3779070102U, // <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2> + 3363135525U, // <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1> + 3779070284U, // <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4> + 3779070364U, // <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3> + 2705328640U, // <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4> + 2307311310U, // <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5> + 3866021012U, // <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7> + 3363138204U, // <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7> + 2707983172U, // <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4> + 2708646805U, // <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4> + 2709310438U, // <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4> + 3779071030U, // <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3> + 2710637704U, // <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4> + 2754235600U, // <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4> + 1704676570U, // <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5> + 3779071358U, // <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7> + 2713292236U, // <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4> + 1704897781U, // <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5> + 2626871398U, // <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS + 2626872471U, // <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5> + 2765737230U, // <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3> + 3700615318U, // <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2> + 2626874678U, // <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS + 1174441270U, // <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS + 1680493878U, // <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 3385051804U, // <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7> + 1680493896U, // <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2248952722U, // <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1> + 2302692152U, // <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 3382406107U, // 
<5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2> + 3700623874U, // <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6> + 2248953040U, // <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4> + 1175211318U, // <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 3376432280U, // <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6> + 2729218934U, // <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5> + 1175211561U, // <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS + 3787035642U, // <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2> + 3365822501U, // <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1> + 3808933085U, // <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4> + 3784381707U, // <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5> + 2713294182U, // <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6> + 2309998286U, // <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5> + 3383740111U, // <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6> + 3787036239U, // <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5> + 2731873960U, // <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4> + 2626895974U, // <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS + 2626897050U, // <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u> + 2644813518U, // <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5> + 2705327822U, // <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5> + 2626899254U, // <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS + 1707331102U, // <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5> + 1680494121U, // <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2737183024U, // <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4> + 1680494139U, // <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 2302642684U, // <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0> + 1640218726U, // <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 3376384510U, // <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2> + 3376385078U, // <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3> + 2754236002U, // <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1> + 2717942242U, // <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5> + 2244907106U, // <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 3376385406U, // <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7> + 1640219293U, // <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 2305969365U, // <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0> + 1237536282U, // <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2713961366U, // <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0> + 3766469630U, // <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5> + 2782326455U, // <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5> + 2311277786U, // <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5> + 2311277058U, // <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6> + 3385017587U, // <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7> + 1237536282U, // <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 3376400892U, // <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0> + 3827977963U, // <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3> + 2302659070U, // <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2> + 2765737726U, // <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4> + 3839479558U, // <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3> + 2781073167U, // <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3> + 2713962426U, // <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7> + 3376401790U, // <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7> + 2769055531U, // <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4> + 2713962646U, // <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2> + 3765143786U, // <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5> + 3839479621U, // <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3> + 2289394603U, // <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, 
<1,2,5,3> + 2713963010U, // <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6> + 2313285150U, // <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5> + 3363138050U, // <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6> + 3363136755U, // <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7> + 2713963294U, // <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2> + 2713963410U, // <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1> + 3827978127U, // <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5> + 3839479704U, // <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5> + 3376417846U, // <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3> + 1637567706U, // <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5> + 1640222006U, // <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS + 2310640998U, // <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6> + 3376418174U, // <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7> + 1640222238U, // <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5> + 1577091174U, // <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 2311310226U, // <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1> + 2713964303U, // <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3> + 2311311119U, // <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3> + 1577094454U, // <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,5,5>: Cost 1 vspltisw1 RHS + 2311309826U, // <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6> + 2311311447U, // <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7> + 296144182U, // <5,5,5,u>: Cost 1 vspltisw1 RHS + 2248953460U, // <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1> + 2326580114U, // <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1> + 2713965050U, // <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3> + 3700697602U, // <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6> + 2785644620U, // <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5> + 2781073495U, // <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7> + 1228950018U, // <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965390U, // <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1> + 1228950018U, // <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713965562U, // <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2> + 3383741330U, // <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1> + 3718620878U, // <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5> + 3365823403U, // <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3> + 2713965926U, // <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6> + 2717947318U, // <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5> + 3365825026U, // <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6> + 2292081907U, // <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7> + 2713966210U, // <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2> + 1577091174U, // <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1640224558U, // <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS + 2713966469U, // <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0> + 2713966524U, // <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1> + 1577094454U, // <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,5,u,5>: Cost 1 vspltisw1 RHS + 1228950018U, // <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 2713966848U, // <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1> + 296144182U, // <5,5,u,u>: Cost 1 vspltisw1 RHS + 2705342464U, // <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0> + 1631600742U, // <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3773112493U, // <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2> + 2705342720U, // <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4> + 2705342802U, // <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5> + 3779084708U, // <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6> + 3779084790U, // 
<5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7> + 2302643510U, // <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631601309U, // <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 3767141092U, // <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2> + 2705343284U, // <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1> + 2705343382U, // <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0> + 3779085282U, // <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4> + 2693399632U, // <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6> + 3767805089U, // <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6> + 2311279416U, // <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6> + 1237536054U, // <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1237536055U, // <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS + 3773113789U, // <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2> + 3779085855U, // <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1> + 2699372136U, // <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2> + 2705344166U, // <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1> + 2699372329U, // <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6> + 2705344360U, // <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6> + 2705344442U, // <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7> + 2302659894U, // <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2702026861U, // <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6> + 2705344662U, // <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2> + 3767142661U, // <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5> + 3773114689U, // <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2> + 2705344924U, // <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3> + 1631603202U, // <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6> + 3842945597U, // <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7> + 3779086962U, // <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1> + 2289397046U, // <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634257734U, // <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6> + 2644926566U, // <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS + 3779087306U, // <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3> + 2790142577U, // <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5> + 2644929026U, // <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6> + 2711317723U, // <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6> + 1631604022U, // <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 2712644989U, // <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6> + 2302676278U, // <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631604265U, // <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 3842945708U, // <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1> + 3767144133U, // <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1> + 2705346328U, // <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3> + 3779088207U, // <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4> + 2717290420U, // <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6> + 2705346574U, // <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6> + 2705346596U, // <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1> + 1237568822U, // <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 1237568823U, // <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS + 2650914918U, // <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS + 3364490949U, // <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1> + 2248954362U, // <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3> + 2302693144U, // <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3> + 2650918198U, // <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS + 2650918926U, // <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6> + 2302693390U, // <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6> + 1228950838U, // <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228950839U, // <5,6,6,u>: Cost 
2 vmrglw <3,4,5,6>, RHS + 497467494U, // <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571210036U, // <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571210856U, // <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571211414U, // <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497470774U, // <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571213316U, // <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571213818U, // <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571214956U, // <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7> + 497473326U, // <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS + 497475686U, // <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631606574U, // <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS + 1571219048U, // <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571219606U, // <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497478967U, // <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631606938U, // <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS + 1571222010U, // <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1228967222U, // <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497481518U, // <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS + 3768475648U, // <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0> + 2694733926U, // <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 3718711395U, // <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5> + 3384349178U, // <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3> + 2694734162U, // <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5> + 3384347884U, // <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5> + 3730658026U, // <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0> + 3718714362U, // <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2> + 2694734493U, // <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2311278690U, // <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0> + 2305970923U, // <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1> + 3768476566U, // <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0> + 2311279098U, // <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3> + 2311278694U, // <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4> + 3768476783U, // <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1> + 2694735091U, // <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7> + 2311279426U, // <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7> + 2696062357U, // <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7> + 3383701602U, // <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0> + 3768477219U, // <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5> + 3768477288U, // <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2> + 2309960186U, // <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3383701606U, // <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4> + 3768477545U, // <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7> + 3766486970U, // <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7> + 3383702338U, // <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7> + 2309960186U, // <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3> + 3768477846U, // <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2> + 3768477975U, // <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5> + 3786393932U, // <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4> + 3768478108U, // <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3> + 2795599115U, // <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5> + 3385037470U, // <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5> + 3780422309U, // <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7> + 3848107301U, // <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4> + 2795894063U, // <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5> + 2795967800U, // <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5> + 3768478690U, // <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0> + 3718744163U, // <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, 
<2,u,4,5> + 3784404107U, // <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7> + 2796262748U, // <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5> + 2694737206U, // <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2712653182U, // <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7> + 2713316815U, // <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7> + 2694737449U, // <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2311311458U, // <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0> + 3768479433U, // <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5> + 3768479521U, // <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3> + 2311311866U, // <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3> + 2311311462U, // <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4> + 2248185270U, // <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5> + 2718625879U, // <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7> + 2311312194U, // <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7> + 2311311466U, // <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u> + 2248954874U, // <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2> + 3322696778U, // <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1> + 2248955028U, // <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2656963074U, // <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6> + 2248955238U, // <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6> + 2248955329U, // <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7> + 2656965360U, // <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6> + 2248955500U, // <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7> + 2248955522U, // <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2> + 3718766694U, // <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS + 3724739827U, // <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7> + 3718768739U, // <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5> + 3365826337U, // <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3> + 2798253647U, // <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5> + 3365826258U, // <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5> + 3730715377U, // <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7> + 2310665836U, // <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7> + 2798548595U, // <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5> + 2311336034U, // <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0> + 2694739758U, // <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2248955028U, // <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3> + 2311336442U, // <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3> + 2311336038U, // <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4> + 2694740122U, // <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS + 2656981746U, // <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u> + 2311336770U, // <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7> + 2694740325U, // <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS + 2705358848U, // <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0> + 1631617126U, // <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2310607866U, // <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2> + 2302640284U, // <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS + 2754238189U, // <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1> + 2305296114U, // <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5> + 2244907106U, // <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0> + 2302643528U, // <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS + 1631617693U, // <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 2627133542U, // <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS + 1237536282U, // <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 1680496430U, // <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1237532828U, // <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS + 2693416018U, // <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u> + 
2756892486U, // <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0> + 2694743284U, // <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u> + 1237536072U, // <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS + 1680496484U, // <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 2311288709U, // <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0> + 2245883694U, // <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS + 2699388520U, // <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2> + 2754238344U, // <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3> + 2699388715U, // <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u> + 2757408666U, // <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3> + 2705360826U, // <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7> + 2302659912U, // <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS + 2754238389U, // <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3> + 2754238396U, // <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1> + 3827980229U, // <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1> + 2644625102U, // <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5> + 2289393820U, // <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS + 1631619588U, // <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u> + 2785056749U, // <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5> + 3363138077U, // <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6> + 2289397064U, // <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS + 1634274120U, // <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u> + 1634937753U, // <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u> + 1728272410U, // <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5> + 2710006843U, // <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u> + 2765740076U, // <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5> + 1637592285U, // <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u> + 1631620406U, // <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 2712661375U, // <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u> + 2302676296U, // <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS + 1631620649U, // <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS + 1577091174U, // <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS + 1174443822U, // <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS + 2766035058U, // <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3> + 1237565596U, // <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS + 1577094454U, // <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS + 296144182U, // <5,u,5,5>: Cost 1 vspltisw1 RHS + 1680496794U, // <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 1237568840U, // <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS + 296144182U, // <5,u,5,u>: Cost 1 vspltisw1 RHS + 2633146470U, // <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS + 1175213870U, // <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS + 2633148309U, // <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6> + 1228947612U, // <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS + 2633149750U, // <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS + 1175214234U, // <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS + 1228950018U, // <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 1228950856U, // <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS + 1228947617U, // <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS + 497614950U, // <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1571357492U, // <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1571358312U, // <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1571358870U, // <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 497618248U, // <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1571360772U, // <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1571361274U, // <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3> + 1571361786U, // <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2> + 497620782U, // <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS + 497623142U, // <5,u,u,0>: 
Cost 1 vsldoi4 RHS, LHS + 1631622958U, // <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS + 1680496997U, // <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS + 1228963996U, // <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS + 497626441U, // <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS + 296144182U, // <5,u,u,5>: Cost 1 vspltisw1 RHS + 1680497037U, // <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS + 1228967240U, // <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS + 497628974U, // <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS + 2772451328U, // <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0> + 2772451338U, // <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1> + 3771146417U, // <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6> + 3383095739U, // <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3> + 3846193189U, // <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1> + 3724832803U, // <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0> + 3383095985U, // <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6> + 3383096067U, // <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7> + 2772451401U, // <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1> + 2651095142U, // <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS + 2251612262U, // <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS + 1698709606U, // <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2651097602U, // <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6> + 2651098422U, // <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS + 2651099172U, // <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1> + 2657071869U, // <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1> + 3724841978U, // <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2> + 1698709660U, // <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2252292096U, // <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0> + 1178550374U, // <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 3826655418U, // <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6> + 3777783485U, // <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6> + 2252292434U, // <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5> + 3785746280U, // <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6> + 2252292593U, // <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2> + 3736794583U, // <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2> + 1178550941U, // <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS + 3375153152U, // <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0> + 2772451584U, // <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4> + 3777784163U, // <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0> + 3846193426U, // <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4> + 2712005122U, // <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6> + 3724857382U, // <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3> + 3802335864U, // <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7> + 3801672410U, // <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6> + 2772451647U, // <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4> + 3383123968U, // <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0> + 2772451666U, // <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5> + 3773803577U, // <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6> + 3724864002U, // <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6> + 3846193517U, // <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5> + 2712005935U, // <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0> + 3327009265U, // <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2> + 3383126648U, // <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7> + 2772451729U, // <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5> + 3373178880U, // <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0> + 2254266470U, // <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS + 3785748248U, // <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3> + 
3790393190U, // <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0> + 3328000338U, // <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5> + 3785748494U, // <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6> + 3785748516U, // <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1> + 3379153528U, // <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7> + 2254267037U, // <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS + 2254897152U, // <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0> + 1181155430U, // <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 3785748923U, // <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3> + 3785749042U, // <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5> + 2254897490U, // <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5> + 3785749169U, // <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6> + 2724614962U, // <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0> + 3787739982U, // <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1> + 1181155997U, // <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS + 1235664896U, // <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235666598U, // <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 3712943720U, // <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2> + 2639202936U, // <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7> + 2639203638U, // <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS + 2309409236U, // <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5> + 3712946517U, // <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0> + 2309409400U, // <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235666605U, // <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 1235673088U, // <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235674790U, // <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 1698710173U, // <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2639211129U, // <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u> + 2639211830U, // <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS + 2712008858U, // <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS + 2657129220U, // <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u> + 2309417592U, // <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1698710227U, // <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 3775799296U, // <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0> + 2702057574U, // <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3373143763U, // <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2> + 3695045122U, // <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6> + 3775799634U, // <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5> + 3383091538U, // <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5> + 3368493233U, // <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6> + 3362522319U, // <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7> + 2702058141U, // <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS + 3834250027U, // <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1> + 2772452148U, // <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 3832038210U, // <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6> + 3373150660U, // <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3> + 3834250067U, // <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5> + 3373146450U, // <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5> + 3826656102U, // <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6> + 3362530511U, // <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7> + 2772452148U, // <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1> + 2669092966U, // <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS + 2252292916U, // <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1> + 2252293014U, // <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0> + 2772452246U, // <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0> + 2669096246U, // <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS + 3846194091U, // <6,1,2,5>: Cost 4 vsldoi12 
<3,4,5,6>, <1,2,5,3> + 2702059450U, // <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7> + 3870081978U, // <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0> + 2702059633U, // <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1> + 3775801494U, // <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2> + 3777128723U, // <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1> + 3775801702U, // <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3> + 3775801756U, // <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3> + 3775801858U, // <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6> + 3375153490U, // <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5> + 3826656265U, // <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7> + 3775802051U, // <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1> + 3775802142U, // <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2> + 3846194206U, // <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1> + 3846194219U, // <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5> + 3846194228U, // <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5> + 3846194236U, // <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4> + 3846194246U, // <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5> + 2760508496U, // <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6> + 3368526001U, // <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6> + 3870082144U, // <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4> + 2760729707U, // <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6> + 2714668660U, // <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1> + 3834619005U, // <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6> + 3834692742U, // <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6> + 3846194317U, // <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4> + 3834840216U, // <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6> + 3834913953U, // <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6> + 2719977570U, // <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0> + 3367208143U, // <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7> + 2719977724U, // <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1> + 2669125734U, // <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS + 2254897972U, // <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1> + 2254898070U, // <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0> + 3775803929U, // <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7> + 2669129014U, // <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS + 2322006354U, // <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5> + 2725950264U, // <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6> + 3793720142U, // <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1> + 2254898556U, // <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0> + 2627330150U, // <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS + 1235664906U, // <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235667094U, // <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309406894U, // <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2627333430U, // <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS + 1235665234U, // <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309406897U, // <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309407222U, // <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235664913U, // <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 2627338342U, // <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS + 1235673098U, // <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235675286U, // <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2772452732U, // <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0> + 2627341622U, // <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS + 1235673426U, // <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309415089U, // <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309415414U, // <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 
1235673105U, // <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 3324683725U, // <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0> + 2725290086U, // <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS + 3771162801U, // <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6> + 2309349478U, // <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 3730951478U, // <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS + 3840738784U, // <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1> + 3842655721U, // <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1> + 3736925671U, // <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0> + 2309349483U, // <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS + 3367840468U, // <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0> + 3325355551U, // <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1> + 3373147752U, // <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2> + 2299404390U, // <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 3701099830U, // <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS + 3767846054U, // <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2> + 3826656825U, // <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0> + 3373147838U, // <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7> + 2299404395U, // <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS + 2657222758U, // <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS + 3771164219U, // <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2> + 2766481000U, // <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2> + 2772452978U, // <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3> + 2657226038U, // <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS + 3790407528U, // <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6> + 2252294074U, // <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7> + 2252294148U, // <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0> + 2772453023U, // <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3> + 2772453030U, // <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1> + 3834250930U, // <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4> + 2765596349U, // <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6> + 2301411430U, // <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS + 2772453070U, // <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5> + 2765817560U, // <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6> + 2252933050U, // <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7> + 2796340968U, // <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4> + 2766038771U, // <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6> + 3725008998U, // <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS + 3368530217U, // <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1> + 3840222989U, // <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5> + 2309382246U, // <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 3725012278U, // <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS + 2766481193U, // <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6> + 3842656049U, // <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5> + 3327010820U, // <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0> + 2766702404U, // <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6> + 3713073254U, // <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS + 3789082310U, // <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2> + 3840665439U, // <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6> + 2766997352U, // <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6> + 3713076534U, // <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS + 3791736842U, // <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2> + 3373180605U, // <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6> + 3793064108U, // <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2> + 2767366037U, // <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6> + 3701137510U, // <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS + 3701138647U, 
// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6> + 2254898792U, // <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2> + 1248264294U, // <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 3701140790U, // <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS + 3725029435U, // <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6> + 2254899130U, // <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7> + 2725294981U, // <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2> + 1248264299U, // <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS + 2633375846U, // <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS + 2309407468U, // <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235666536U, // <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161923174U, // <6,2,7,3>: Cost 1 vmrglw RHS, LHS + 2633379126U, // <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS + 2309407796U, // <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309408445U, // <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309407960U, // <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161923179U, // <6,2,7,u>: Cost 1 vmrglw RHS, LHS + 2633384038U, // <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS + 2309415660U, // <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235674728U, // <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 161931366U, // <6,2,u,3>: Cost 1 vmrglw RHS, LHS + 2633387318U, // <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS + 2769135725U, // <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6> + 2309416637U, // <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309416152U, // <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 161931371U, // <6,2,u,u>: Cost 1 vmrglw RHS, LHS + 3777806336U, // <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0> + 2704064614U, // <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 3765862577U, // <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6> + 3843393708U, // <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6> + 2250516994U, // <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6> + 3725054014U, // <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0> + 3383093096U, // <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6> + 3368495034U, // <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7> + 2704065181U, // <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS + 2251622550U, // <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 3777807156U, // <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1> + 3765863348U, // <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3> + 3373147762U, // <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3> + 3834251525U, // <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5> + 3373147683U, // <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5> + 3391727545U, // <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6> + 2299406266U, // <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7> + 2251622550U, // <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2> + 2252294294U, // <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2> + 3326036198U, // <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1> + 3771836045U, // <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3> + 2252294556U, // <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3> + 2252294658U, // <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6> + 3840739677U, // <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3> + 2704066490U, // <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7> + 3368511418U, // <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7> + 2252294942U, // <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2> + 3707158630U, // <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS + 3765864692U, // <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6> + 2704066918U, // <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3> + 2772453788U, // <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3> + 2772453799U, // <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, 
<3,3,4,5> + 3789752888U, // <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6> + 3840739770U, // <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6> + 2301413306U, // <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7> + 2775108043U, // <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5> + 2651340902U, // <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS + 3846195674U, // <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2> + 3845974503U, // <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6> + 2651343362U, // <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6> + 2651344182U, // <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS + 1698712066U, // <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6> + 3383125864U, // <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6> + 3368527802U, // <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7> + 1698933277U, // <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6> + 3373179798U, // <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0> + 3707176179U, // <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7> + 2716012312U, // <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3> + 3373180530U, // <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3> + 2254309890U, // <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6> + 3785773070U, // <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6> + 3840739932U, // <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6> + 2299439034U, // <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7> + 2719994110U, // <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3> + 2254899350U, // <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2> + 3328641254U, // <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1> + 2633443257U, // <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6> + 2254899612U, // <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3> + 2254899714U, // <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6> + 3785773772U, // <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6> + 2725966648U, // <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6> + 2322007994U, // <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7> + 2254899998U, // <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2> + 1559707750U, // <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 2633450292U, // <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1> + 1559709626U, // <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7> + 1235666546U, // <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559711030U, // <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS + 2309408291U, // <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2633454152U, // <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0> + 1235666874U, // <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559713582U, // <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS + 1559715942U, // <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 2633458484U, // <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1> + 1559717819U, // <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u> + 1235674738U, // <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1559719222U, // <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS + 1701366598U, // <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6> + 2633462353U, // <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0> + 1235675066U, // <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1559721774U, // <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS + 3785777152U, // <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0> + 2712035430U, // <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3771179185U, // <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6> + 3846196096U, // <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1> + 3785777490U, // <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5> + 2250517814U, // <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS + 3324259703U, // <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0> + 
3383092458U, // <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7> + 2712035997U, // <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS + 3325356946U, // <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1> + 3785777972U, // <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1> + 3846196170U, // <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3> + 3325365380U, // <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0> + 3852168155U, // <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2> + 2251615542U, // <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS + 3325357432U, // <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1> + 3870084088U, // <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4> + 2251615785U, // <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS + 2252295058U, // <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1> + 3771180605U, // <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4> + 3785778792U, // <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2> + 3777816253U, // <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6> + 2252295376U, // <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4> + 1178553654U, // <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 2252295545U, // <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2> + 3326037448U, // <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0> + 1178553897U, // <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS + 3785779350U, // <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2> + 3383118648U, // <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1> + 3777816935U, // <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4> + 3785779612U, // <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3> + 2712037890U, // <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6> + 2252754230U, // <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS + 3784452764U, // <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7> + 3801705178U, // <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6> + 2252754473U, // <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS + 3787770770U, // <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1> + 3383126840U, // <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1> + 3327380534U, // <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3> + 3784453265U, // <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4> + 2253630672U, // <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4> + 2778426587U, // <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6> + 3383128789U, // <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6> + 3381799580U, // <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7> + 2778647798U, // <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6> + 2651422822U, // <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS + 3701277928U, // <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5> + 3701278650U, // <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7> + 2651425282U, // <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6> + 2651426102U, // <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS + 2651426892U, // <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5> + 1698712886U, // <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3725169658U, // <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2> + 1698712904U, // <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2254900114U, // <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1> + 3389115192U, // <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1> + 3785781727U, // <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3> + 3785781810U, // <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5> + 2254900432U, // <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4> + 1181158710U, // <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 2254900605U, // <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6> + 3787772750U, // <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1> + 1181158953U, // <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS + 
2639495270U, // <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS + 2639496090U, // <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4> + 3707267011U, // <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7> + 2639497884U, // <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7> + 1237658832U, // <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235666638U, // <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 3713241753U, // <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0> + 2309409436U, // <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235666641U, // <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 2639503462U, // <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS + 2639504282U, // <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4> + 3701303226U, // <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7> + 2639506077U, // <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u> + 1235676368U, // <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235674830U, // <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 1698713129U, // <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2309417628U, // <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1698713147U, // <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 3775832064U, // <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0> + 2702090342U, // <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3775832241U, // <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6> + 3719227906U, // <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6> + 3775832402U, // <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5> + 3385085146U, // <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5> + 2309351938U, // <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6> + 3376459134U, // <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7> + 2702090909U, // <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS + 3719233546U, // <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1> + 3775832884U, // <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1> + 3775832982U, // <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0> + 3846196909U, // <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4> + 3719236984U, // <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1> + 3856150209U, // <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6> + 3834252997U, // <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1> + 3870084817U, // <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4> + 3769861532U, // <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5> + 2645500006U, // <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS + 3719242548U, // <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1> + 3775833704U, // <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2> + 3775833766U, // <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1> + 2645503353U, // <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2> + 2252296196U, // <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5> + 2702092218U, // <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7> + 3719246842U, // <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2> + 2702092405U, // <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5> + 3775834262U, // <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2> + 3777161495U, // <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5> + 3775834470U, // <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3> + 3775834524U, // <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3> + 3775834626U, // <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6> + 3385109722U, // <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5> + 2309376514U, // <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3775834819U, // <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1> + 2309376514U, // <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6> + 3719258214U, // <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS + 3385117586U, // <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1> + 3327242008U, 
// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3> + 3719260674U, // <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6> + 3719261563U, // <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4> + 2702093622U, // <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 2309384706U, // <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6> + 3870085060U, // <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4> + 2702093865U, // <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS + 3719266406U, // <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS + 3789106889U, // <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5> + 3785789208U, // <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3> + 3373183950U, // <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3> + 2717355964U, // <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5> + 2791772164U, // <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5> + 2772455438U, // <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6> + 3373183549U, // <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7> + 2720010496U, // <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5> + 2772455460U, // <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1> + 2322008978U, // <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1> + 3840225335U, // <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2> + 2772455490U, // <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4> + 2772455500U, // <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5> + 2254901252U, // <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5> + 2772455520U, // <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7> + 2785874024U, // <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6> + 2772455532U, // <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1> + 2627625062U, // <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS + 1235667858U, // <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309409278U, // <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309407659U, // <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627628342U, // <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS + 1235668186U, // <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235667458U, // <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309407987U, // <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235667460U, // <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2627633254U, // <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS + 1235676050U, // <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309417470U, // <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309415851U, // <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2627636534U, // <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS + 1235676378U, // <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235675650U, // <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309416179U, // <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235675652U, // <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 2309352751U, // <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0> + 1650917478U, // <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 2250584570U, // <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3> + 3846197554U, // <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1> + 2724659538U, // <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5> + 3725275225U, // <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0> + 2791772493U, // <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1> + 2309352758U, // <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 1650918045U, // <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 3325358368U, // <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1> + 2299406449U, // <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1> + 2724660118U, // <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0> + 3373148518U, // <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3> + 3834253712U, // <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5> + 3373147953U, // 
<6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5> + 2323297080U, // <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6> + 2299407670U, // <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 2299407671U, // <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS + 2252296489U, // <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1> + 3326038394U, // <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1> + 1178554874U, // <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2724660902U, // <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1> + 2252296817U, // <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5> + 3840741864U, // <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3> + 2252296976U, // <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2> + 2785874426U, // <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3> + 1178554874U, // <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2724661398U, // <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2> + 3375154665U, // <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1> + 3375154909U, // <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2> + 2301413734U, // <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3> + 2772455986U, // <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5> + 3375154993U, // <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5> + 2323313464U, // <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6> + 2301414710U, // <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 2301414711U, // <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS + 2724662162U, // <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1> + 3326939559U, // <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1> + 2253271546U, // <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3> + 3383127346U, // <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3> + 2309385523U, // <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4> + 1650920758U, // <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 2724662653U, // <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6> + 2309385526U, // <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 1650921001U, // <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 3725312102U, // <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS + 3373180393U, // <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1> + 3791769368U, // <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3> + 3373181286U, // <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3> + 3725315382U, // <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS + 2299439221U, // <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5> + 2724663394U, // <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0> + 2299440438U, // <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 2299440439U, // <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS + 1583808614U, // <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2254574074U, // <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3> + 2322010609U, // <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3> + 1583811894U, // <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2322010773U, // <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5> + 363253046U, // <6,6,6,6>: Cost 1 vspltisw2 RHS + 1248267574U, // <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS + 363253046U, // <6,6,6,u>: Cost 1 vspltisw2 RHS + 2309410095U, // <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0> + 2309408233U, // <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1> + 2311402373U, // <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2> + 2309409126U, // <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 2309410099U, // <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4> + 2309408561U, // <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5> + 1237660472U, // <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 161926454U, // <6,6,7,7>: Cost 1 vmrglw RHS, RHS + 161926455U, // <6,6,7,u>: Cost 1 vmrglw RHS, RHS + 1583808614U, // 
<6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1650923310U, // <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS + 1178554874U, // <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2309417318U, // <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3> + 1583811894U, // <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1650923674U, // <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS + 363253046U, // <6,6,u,6>: Cost 1 vspltisw2 RHS + 161934646U, // <6,6,u,7>: Cost 1 vmrglw RHS, RHS + 161934647U, // <6,6,u,u>: Cost 1 vmrglw RHS, RHS + 1638318080U, // <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564576358U, // <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712060077U, // <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2712060156U, // <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638318418U, // <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1577865314U, // <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0> + 2712060406U, // <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2651608058U, // <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2> + 564576925U, // <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712060643U, // <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638318900U, // <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638318998U, // <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 3766559753U, // <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7> + 2712060971U, // <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712061039U, // <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712061135U, // <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 3373148612U, // <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7> + 1638319484U, // <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712061373U, // <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712061471U, // <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638319720U, // <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638319782U, // <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712061709U, // <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712061800U, // <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1638320058U, // <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252297836U, // <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7> + 1638320187U, // <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638320278U, // <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712062182U, // <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2712062256U, // <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3> + 1638320540U, // <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638320642U, // <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712062546U, // <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 2712062584U, // <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2712062659U, // <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1> + 1638320926U, // <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638321042U, // <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712062922U, // <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712063029U, // <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712063108U, // <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638321360U, // <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564579638U, // <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712063357U, // <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6> + 2712063439U, // <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7> + 564579881U, // <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 2712063560U, // <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 2714054287U, // <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712063742U, // <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 3373181295U, // <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3> + 2712063924U, // <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638322180U, // <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638322274U, // <6,7,5,6>: Cost 2 vsldoi8 RHS, 
<5,6,7,0> + 3373181380U, // <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7> + 1640313092U, // <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712064289U, // <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712064423U, // <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638322682U, // <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712064562U, // <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712064653U, // <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6> + 2712064747U, // <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638323000U, // <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638323022U, // <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638323168U, // <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3> + 1237659746U, // <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309411158U, // <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1> + 2639718330U, // <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7> + 1235669498U, // <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1237659750U, // <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309411243U, // <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5> + 1583895362U, // <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7> + 1235669826U, // <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7> + 1235669503U, // <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u> + 1638323923U, // <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564582190U, // <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638324101U, // <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638324156U, // <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638324287U, // <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564582554U, // <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638324432U, // <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 1235678018U, // <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7> + 564582757U, // <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 1638326272U, // <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564584550U, // <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712068269U, // <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2309349532U, // <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS + 1638326610U, // <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1577939051U, // <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0> + 2712068598U, // <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2309352776U, // <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS + 564585117U, // <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS + 2712068835U, // <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1> + 1638327092U, // <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1698715438U, // <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2299404444U, // <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS + 2712069163U, // <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5> + 2712069231U, // <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2712069327U, // <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7> + 2299407688U, // <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS + 1698715492U, // <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS + 2712069565U, // <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 1178556206U, // <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS + 1638327912U, // <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638327974U, // <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712069901U, // <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 1178556570U, // <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS + 1638328250U, // <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7> + 2252298496U, // <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1> + 1638328379U, // <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1> + 1638328470U, // <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712070374U, // <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2704107883U, // <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u> + 1638328732U, // <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638328834U, // 
<6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712070738U, // <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 2712070776U, // <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7> + 2301414728U, // <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS + 1638329118U, // <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638329234U, // <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712071114U, // <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712071221U, // <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2309382300U, // <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS + 1638329552U, // <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564587831U, // <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712071545U, // <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2309385544U, // <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS + 564588073U, // <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS + 2712071752U, // <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2> + 2714062479U, // <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712071934U, // <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2299437212U, // <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS + 2712072116U, // <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6> + 1638330372U, // <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1698715802U, // <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 2299440456U, // <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS + 1698715820U, // <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS + 1583808614U, // <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 1181161262U, // <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS + 1638330874U, // <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1248264348U, // <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS + 1583811894U, // <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 1181161626U, // <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS + 363253046U, // <6,u,6,6>: Cost 1 vspltisw2 RHS + 1638331214U, // <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 363253046U, // <6,u,6,u>: Cost 1 vspltisw2 RHS + 1560076390U, // <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS + 1235664969U, // <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1560078311U, // <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7> + 161923228U, // <6,u,7,3>: Cost 1 vmrglw RHS, LHS + 1560079670U, // <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS + 1235665297U, // <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235667485U, // <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 161926472U, // <6,u,7,7>: Cost 1 vmrglw RHS, RHS + 161923233U, // <6,u,7,u>: Cost 1 vmrglw RHS, LHS + 1560084582U, // <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS + 564590382U, // <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS + 1560086504U, // <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u> + 161931420U, // <6,u,u,3>: Cost 1 vmrglw RHS, LHS + 1560087862U, // <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS + 564590746U, // <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS + 363253046U, // <6,u,u,6>: Cost 1 vspltisw2 RHS + 161934664U, // <6,u,u,7>: Cost 1 vmrglw RHS, RHS + 161931425U, // <6,u,u,u>: Cost 1 vmrglw RHS, LHS + 1705426944U, // <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705426954U, // <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1> + 3713550266U, // <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7> + 2316063892U, // <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3> + 2779168805U, // <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1> + 2663698530U, // <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2657727309U, // <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0> + 2316064220U, // <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7> + 1705427017U, // <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1> + 1583988838U, // <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS + 2779168859U, // <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1> + 631685222U, // <7,0,1,2>: Cost 1 vsldoi12 
RHS, LHS + 2639817411U, // <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1> + 1583992118U, // <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS + 2657734660U, // <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5> + 1583993678U, // <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1> + 2657735672U, // <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0> + 631685276U, // <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS + 2779168933U, // <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3> + 2767667377U, // <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6> + 2718713448U, // <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2> + 2718713510U, // <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1> + 3841409228U, // <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6> + 3852910802U, // <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3> + 2718713786U, // <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7> + 3847160036U, // <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3> + 2767667440U, // <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6> + 2718714006U, // <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2> + 2779169020U, // <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0> + 3852910853U, // <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0> + 2718714268U, // <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3> + 2718714370U, // <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6> + 2718714461U, // <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7> + 2706770608U, // <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0> + 3847160114U, // <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0> + 2779169083U, // <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0> + 2718714770U, // <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1> + 1705427282U, // <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5> + 3713583034U, // <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7> + 3713583814U, // <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4> + 2779169133U, // <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5> + 1644973366U, // <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 2657760081U, // <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4> + 2259468868U, // <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4> + 1705427345U, // <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5> + 2718715508U, // <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1> + 2260123750U, // <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS + 3792457451U, // <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3> + 3852911024U, // <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0> + 2718715836U, // <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5> + 2718715908U, // <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5> + 1644974178U, // <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0> + 3792457853U, // <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0> + 1646301444U, // <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0> + 2720706901U, // <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0> + 2779169270U, // <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7> + 2718716410U, // <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3> + 2722697800U, // <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0> + 3852911121U, // <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7> + 3852911130U, // <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7> + 2718716728U, // <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6> + 2718716750U, // <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1> + 2779169333U, // <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7> + 2718716922U, // <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2> + 1187872870U, // <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2718717076U, // <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3> + 3847160408U, // <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6> + 2718717286U, // <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6> + 
2718717377U, // <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7> + 2718717404U, // <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7> + 2718717478U, // <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0> + 1187873437U, // <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS + 1584046182U, // <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS + 1705427602U, // <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1> + 631685789U, // <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS + 2639874762U, // <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u> + 1584049462U, // <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS + 1644976282U, // <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS + 1584051029U, // <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u> + 2718718208U, // <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1> + 631685843U, // <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS + 2721374218U, // <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1> + 2779169507U, // <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1> + 2779169516U, // <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1> + 3852911348U, // <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0> + 2669743414U, // <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS + 2316058962U, // <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5> + 2316059044U, // <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6> + 2669745146U, // <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2> + 2779169570U, // <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1> + 2779169579U, // <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1> + 1705427764U, // <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169598U, // <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2> + 3713632972U, // <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1> + 2779169619U, // <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5> + 2779169628U, // <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5> + 2657809239U, // <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1> + 3835290474U, // <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1> + 1705427764U, // <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779169660U, // <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1> + 2779169671U, // <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3> + 2779169680U, // <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3> + 1705427862U, // <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0> + 2779169700U, // <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5> + 2779169707U, // <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3> + 2657817432U, // <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2> + 2803057594U, // <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0> + 1705427907U, // <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0> + 3776538827U, // <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1> + 2319400970U, // <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1> + 2316085398U, // <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2> + 3852911591U, // <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0> + 3852911600U, // <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0> + 2319401298U, // <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5> + 3833668617U, // <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7> + 3367265487U, // <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7> + 2319400977U, // <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u> + 2724031378U, // <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1> + 2779169835U, // <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5> + 2779169844U, // <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5> + 3852911672U, // <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0> + 2669776182U, // <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS + 2779169872U, // <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6> + 3835290712U, // <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5> + 2669778278U, // <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6> + 2779169898U, // <7,1,4,u>: Cost 3 
vsldoi12 RHS, <1,4,u,5> + 2779169903U, // <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1> + 3835585661U, // <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6> + 3841410182U, // <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6> + 3852911753U, // <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0> + 2779169943U, // <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5> + 2318754130U, // <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5> + 2718724195U, // <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1> + 3859178670U, // <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1> + 2779169975U, // <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1> + 2720715094U, // <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1> + 2761549007U, // <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7> + 2779170008U, // <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7> + 3835438305U, // <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7> + 3835512042U, // <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7> + 2761843955U, // <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7> + 3835659516U, // <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7> + 2803057918U, // <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0> + 2762065166U, // <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7> + 2669797478U, // <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS + 2322087946U, // <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1> + 2317448186U, // <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2> + 3395829934U, // <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3> + 2669800758U, // <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS + 2322088274U, // <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5> + 3375923377U, // <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6> + 2731996780U, // <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7> + 2322087953U, // <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u> + 2779170146U, // <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1> + 1705427764U, // <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 2779170164U, // <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1> + 1705428348U, // <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0> + 2779170186U, // <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5> + 2763171221U, // <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7> + 2657866590U, // <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u> + 2803058080U, // <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0> + 1705428393U, // <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0> + 3713695846U, // <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS + 2779170237U, // <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2> + 2779170245U, // <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1> + 1242316902U, // <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 3713699126U, // <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS + 3852912096U, // <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1> + 2767668713U, // <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1> + 2256488426U, // <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1> + 1242316907U, // <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS + 3852912132U, // <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1> + 3852912141U, // <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1> + 3852912149U, // <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0> + 2779170335U, // <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1> + 3852912172U, // <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5> + 3840747062U, // <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6> + 3841410617U, // <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0> + 3795125538U, // <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0> + 2779170380U, // <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1> + 2779170389U, // <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1> + 3852912222U, // <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1> + 1705428584U, // <7,2,2,2>: Cost 2 
vsldoi12 RHS, <2,2,2,2> + 1705428594U, // <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3> + 2779170429U, // <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5> + 3852912259U, // <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2> + 2767668880U, // <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6> + 3841336981U, // <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2> + 1705428639U, // <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3> + 1705428646U, // <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1> + 2779170479U, // <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1> + 2767668925U, // <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6> + 1245659238U, // <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705428686U, // <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5> + 2779170519U, // <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5> + 2657899362U, // <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3> + 2319406574U, // <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7> + 1705428718U, // <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1> + 3713728614U, // <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS + 3852912388U, // <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5> + 2779170573U, // <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5> + 1242349670U, // <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 3713731894U, // <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS + 2779170601U, // <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6> + 2767669041U, // <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5> + 3389834456U, // <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7> + 1242349675U, // <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS + 3852912456U, // <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1> + 3852912466U, // <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2> + 3852912475U, // <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2> + 2779170664U, // <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6> + 3852912496U, // <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5> + 3792474116U, // <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5> + 2718732388U, // <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2> + 3841337228U, // <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6> + 2779170709U, // <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6> + 2640003174U, // <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS + 2721386920U, // <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2> + 2767595441U, // <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7> + 1693927354U, // <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7> + 2640006454U, // <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS + 3841558476U, // <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7> + 2657923941U, // <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6> + 3841337310U, // <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7> + 1694296039U, // <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7> + 2803058666U, // <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1> + 3852912632U, // <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6> + 2322089576U, // <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2> + 1248346214U, // <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS + 3841337362U, // <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5> + 3395830836U, // <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5> + 2261616570U, // <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7> + 3371943857U, // <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7> + 1248346219U, // <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS + 1705429051U, // <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1> + 2779170884U, // <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1> + 1705428584U, // <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1695254620U, // <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7> + 1705429091U, // <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5> + 2779170924U, // <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5> + 
2767669361U, // <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1> + 2803058809U, // <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0> + 1695623305U, // <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7> + 2779170955U, // <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0> + 1705429142U, // <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2> + 2634057732U, // <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0> + 2779170983U, // <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1> + 2779170992U, // <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1> + 3852912829U, // <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5> + 2657948520U, // <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0> + 2316060602U, // <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7> + 1705429205U, // <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2> + 3852912860U, // <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0> + 2779171046U, // <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1> + 2779171057U, // <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3> + 3852912887U, // <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0> + 3852912896U, // <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0> + 3852912905U, // <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0> + 3835291923U, // <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1> + 3841411356U, // <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1> + 2779171111U, // <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3> + 2779171120U, // <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3> + 3852912952U, // <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2> + 2779171137U, // <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2> + 2779171144U, // <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0> + 2779171156U, // <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3> + 3852912989U, // <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3> + 2767669606U, // <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3> + 2767669615U, // <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3> + 2779171189U, // <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0> + 2779171198U, // <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0> + 3852913032U, // <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1> + 2704140655U, // <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3> + 1705429404U, // <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171238U, // <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4> + 3852913070U, // <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3> + 2657973099U, // <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3> + 2767669700U, // <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7> + 1705429404U, // <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2779171280U, // <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1> + 2779171290U, // <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2> + 2634090504U, // <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4> + 2779171311U, // <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5> + 2779171319U, // <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4> + 1705429506U, // <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6> + 2722057593U, // <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2> + 2316093370U, // <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7> + 1705429533U, // <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6> + 3852913185U, // <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1> + 3795799695U, // <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1> + 3852913203U, // <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1> + 3852913214U, // <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3> + 3852913225U, // <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5> + 2779171410U, // <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5> + 2718740581U, // <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3> + 3841411685U, // <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6> + 2720067847U, // <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3> + 2773420664U, // <7,3,6,0>: Cost 3 
vsldoi12 <3,6,0,7>, <3,6,0,7> + 3847236225U, // <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7> + 1648316922U, // <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3> + 2773641875U, // <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7> + 2773715612U, // <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7> + 3847531173U, // <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7> + 2722059024U, // <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2> + 2767669943U, // <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7> + 1652298720U, // <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3> + 2767669955U, // <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1> + 3841411788U, // <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1> + 2767669978U, // <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6> + 2722059546U, // <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2> + 2767669995U, // <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5> + 3852913396U, // <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5> + 2722059758U, // <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7> + 2302183354U, // <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7> + 2767670027U, // <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1> + 2774747930U, // <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7> + 1705429790U, // <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2> + 1660262316U, // <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3> + 1705429404U, // <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3> + 2775042878U, // <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7> + 1705429830U, // <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6> + 2779171660U, // <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3> + 2767670101U, // <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3> + 1705429853U, // <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2> + 2718744576U, // <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0> + 1645002854U, // <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 3852913527U, // <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1> + 3852913536U, // <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1> + 2316061904U, // <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4> + 1705429906U, // <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1> + 2658022257U, // <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0> + 2256489928U, // <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1707420589U, // <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1> + 3852913590U, // <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1> + 2718745396U, // <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1> + 2779171786U, // <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3> + 3852913616U, // <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0> + 3852913627U, // <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2> + 2779171810U, // <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0> + 3792487631U, // <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7> + 3394456220U, // <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7> + 2779171837U, // <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0> + 3852913673U, // <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3> + 3852913682U, // <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3> + 2718746216U, // <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2> + 2718746278U, // <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1> + 2779171885U, // <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3> + 2779171893U, // <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2> + 2718746554U, // <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7> + 3847457864U, // <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3> + 2779171921U, // <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3> + 2718746774U, // <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2> + 3852913762U, // <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2> + 3852913772U, // <7,4,3,2>: Cost 4 vsldoi12 RHS, 
<4,3,2,3> + 2718747036U, // <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3> + 2718747138U, // <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6> + 2779171972U, // <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0> + 2706803380U, // <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4> + 3847457946U, // <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4> + 2781162655U, // <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0> + 2718747538U, // <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1> + 3852913842U, // <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1> + 3852913852U, // <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2> + 2316096696U, // <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3> + 1705430224U, // <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705430234U, // <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5> + 2658055029U, // <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4> + 2316097024U, // <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7> + 1707420917U, // <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5> + 1584316518U, // <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS + 2658059060U, // <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1> + 2640144314U, // <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7> + 2640145131U, // <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5> + 1584319798U, // <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS + 2779172134U, // <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0> + 631688502U, // <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS + 2658063354U, // <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2> + 631688520U, // <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS + 3852914001U, // <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7> + 3852914010U, // <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7> + 2718749178U, // <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3> + 2722730572U, // <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4> + 2723394205U, // <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4> + 2779172221U, // <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6> + 2718749496U, // <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6> + 2718749518U, // <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1> + 2779172249U, // <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7> + 2718749690U, // <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2> + 3847458214U, // <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2> + 2718749880U, // <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3> + 3847458236U, // <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6> + 2718750004U, // <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1> + 1187876150U, // <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2718750208U, // <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7> + 2718750286U, // <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4> + 1187876393U, // <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS + 1584341094U, // <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS + 1645008686U, // <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS + 2640168890U, // <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7> + 2640169710U, // <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u> + 1584344374U, // <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS + 1705430554U, // <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1> + 631688745U, // <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS + 2718750976U, // <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1> + 631688763U, // <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS + 2646147174U, // <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS + 2779172424U, // <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2> + 3852914258U, // <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3> + 3852914268U, // <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4> + 2779172450U, // <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1> + 2316061914U, // <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5> + 2316061186U, // 
<7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6> + 2646152186U, // <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2> + 2779172486U, // <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1> + 2781163151U, // <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1> + 2321378194U, // <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1> + 3852914339U, // <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3> + 3852914350U, // <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5> + 2781163191U, // <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5> + 3852914363U, // <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0> + 3835588297U, // <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5> + 3835588306U, // <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5> + 2781163223U, // <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1> + 3852914400U, // <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1> + 2781163243U, // <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3> + 3852914419U, // <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2> + 2779172606U, // <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4> + 3780552497U, // <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5> + 2781163279U, // <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2779172632U, // <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3> + 3835588385U, // <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3> + 2779172650U, // <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3> + 3852914481U, // <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1> + 2319403922U, // <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1> + 2319404409U, // <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 3852914510U, // <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3> + 3779226131U, // <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5> + 2319404250U, // <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5> + 2319403522U, // <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6> + 3852914547U, // <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4> + 2319403524U, // <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u> + 2646179942U, // <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS + 2316094354U, // <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1> + 3852914582U, // <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3> + 3852914592U, // <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4> + 2646183372U, // <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4> + 2779172788U, // <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6> + 2316093954U, // <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6> + 2646185318U, // <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6> + 2779172815U, // <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6> + 2781163475U, // <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1> + 2781163484U, // <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1> + 3852914662U, // <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2> + 3852914672U, // <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3> + 2781163515U, // <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5> + 1705431044U, // <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172878U, // <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6> + 3835588632U, // <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7> + 1705431044U, // <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779172900U, // <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1> + 2781163571U, // <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7> + 3852914743U, // <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2> + 2779172930U, // <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4> + 2779172940U, // <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5> + 2781163607U, // <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 2779172960U, // <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7> + 1705431138U, // <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0> + 1705578603U, // <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0> + 2646204518U, // <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 
2322090898U, // <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1> + 3719947880U, // <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2> + 3719948438U, // <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2> + 2646207951U, // <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7> + 2322091226U, // <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5> + 2322090498U, // <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6> + 2646210156U, // <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7> + 2646210350U, // <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS + 2779173062U, // <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1> + 2779173072U, // <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2> + 2319404409U, // <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2> + 2779173092U, // <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4> + 2779173101U, // <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4> + 1705431044U, // <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 2779173118U, // <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3> + 1705578756U, // <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0> + 1707421965U, // <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0> + 3852914966U, // <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0> + 2779173153U, // <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2> + 2256491002U, // <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3> + 3852914994U, // <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1> + 3852915003U, // <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1> + 2316062652U, // <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316063544U, // <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6> + 1242320182U, // <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS + 1242320183U, // <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS + 3852915048U, // <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1> + 3377866217U, // <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1> + 3852915068U, // <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3> + 3833672072U, // <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6> + 3852915088U, // <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5> + 3395122056U, // <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5> + 3389813560U, // <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6> + 2779173287U, // <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1> + 2779320752U, // <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1> + 2658181222U, // <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS + 3852915140U, // <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3> + 2257973754U, // <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3> + 3841413589U, // <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2> + 2658184502U, // <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS + 3852915176U, // <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3> + 2658186117U, // <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2> + 1705431546U, // <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3> + 1705579011U, // <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3> + 3714015334U, // <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS + 3777243425U, // <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6> + 2319405957U, // <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2> + 3375229286U, // <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3> + 2779173426U, // <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5> + 3375228721U, // <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5> + 2319405880U, // <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6> + 1245662518U, // <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1245662519U, // <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS + 3852915291U, // <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1> + 3389834729U, // <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1> + 2259472890U, // <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3> + 3852915321U, // <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4> + 3852915330U, // <7,6,4,4>: Cost 4 
vsldoi12 RHS, <6,4,4,4> + 2779173517U, // <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6> + 2316096312U, // <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6> + 1242352950U, // <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1242352951U, // <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS + 3852915372U, // <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1> + 3835294392U, // <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4> + 3852915395U, // <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6> + 3852915404U, // <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6> + 3852915412U, // <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5> + 3377899313U, // <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5> + 2718765160U, // <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6> + 2779173611U, // <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1> + 2779321076U, // <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1> + 2658213990U, // <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS + 3852915462U, // <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1> + 2718765562U, // <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3> + 3714042622U, // <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6> + 2658217270U, // <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS + 2724074224U, // <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6> + 1705431864U, // <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705431874U, // <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7> + 1705579339U, // <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7> + 1705431886U, // <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1> + 2779173719U, // <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1> + 2779173729U, // <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2> + 2779173736U, // <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0> + 1705431926U, // <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5> + 2779173759U, // <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5> + 2779173765U, // <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2> + 1248349494U, // <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS + 1705431958U, // <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1> + 1705579423U, // <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1> + 2779173801U, // <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2> + 2779321266U, // <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2> + 2779321273U, // <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0> + 1705579463U, // <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5> + 2779173841U, // <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6> + 1705431864U, // <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6> + 1705432032U, // <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3> + 1705579495U, // <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1> + 1242320994U, // <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0> + 1705432058U, // <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2> + 3841414146U, // <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1> + 2316063226U, // <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3> + 2779173908U, // <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1> + 2658242658U, // <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0> + 2658243468U, // <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0> + 2316063554U, // <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7> + 1705432121U, // <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2> + 3852915777U, // <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1> + 2779173962U, // <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1> + 2779173973U, // <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3> + 3389813242U, // <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3> + 3852915813U, // <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1> + 3852915821U, // <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0> + 3835294839U, // <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1> + 2329343596U, // <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7> + 2779174027U, // <7,7,1,u>: Cost 3 
vsldoi12 RHS, <7,1,u,3> + 2803061908U, // <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3> + 3852915869U, // <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3> + 2779174053U, // <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2> + 2779174060U, // <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0> + 2803061944U, // <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3> + 3852915905U, // <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3> + 2767672522U, // <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3> + 2791855315U, // <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3> + 2768999644U, // <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3> + 2779174115U, // <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1> + 3852915948U, // <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1> + 3841414394U, // <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6> + 1245663738U, // <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174155U, // <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5> + 3852915988U, // <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5> + 2706827959U, // <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7> + 2319405890U, // <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7> + 1245663738U, // <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 2779174200U, // <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5> + 3852916030U, // <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2> + 3714099130U, // <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7> + 2316095994U, // <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3> + 1242353766U, // <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705432422U, // <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6> + 2658276240U, // <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4> + 2316096322U, // <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7> + 1705432449U, // <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6> + 3852916101U, // <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1> + 3854906765U, // <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0> + 3852916121U, // <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3> + 3389846010U, // <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3> + 3852916141U, // <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5> + 2779174326U, // <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5> + 2779174337U, // <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7> + 2329376364U, // <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7> + 2779321811U, // <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7> + 2658287718U, // <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS + 3852916197U, // <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7> + 2779174382U, // <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7> + 2316112378U, // <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3> + 2658290998U, // <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS + 3852916233U, // <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7> + 1651004226U, // <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7> + 2779174420U, // <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0> + 1652331492U, // <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7> + 1590526054U, // <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS + 2328728623U, // <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1> + 2724746451U, // <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3> + 2322092538U, // <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3> + 1590529334U, // <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS + 2328728951U, // <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5> + 2724746770U, // <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7> + 430361910U, // <7,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,7,u>: Cost 1 vspltisw3 RHS + 1242320994U, // <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0> + 1705580162U, // <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2> + 2779321996U, // <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3> + 1245663738U, // 
<7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3> + 1242353766U, // <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4> + 1705580202U, // <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6> + 1662949620U, // <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7> + 430361910U, // <7,7,u,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,7,u,u>: Cost 1 vspltisw3 RHS + 1705426944U, // <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0> + 1705432787U, // <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2> + 2316060885U, // <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2> + 1242316956U, // <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS + 2779174637U, // <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1> + 1182750874U, // <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS + 2316061213U, // <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6> + 1242320200U, // <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS + 1705432850U, // <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2> + 1584578662U, // <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS + 1705427764U, // <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1> + 631691054U, // <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS + 2640407307U, // <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1> + 1584581942U, // <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS + 2779174726U, // <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0> + 1584583574U, // <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1> + 2779322201U, // <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1> + 631691108U, // <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS + 2779174763U, // <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1> + 2779174774U, // <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3> + 1705428584U, // <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2> + 1705432965U, // <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0> + 2779174801U, // <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3> + 2779174810U, // <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3> + 2767673251U, // <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3> + 1705580460U, // <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3> + 1705433010U, // <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0> + 1705433020U, // <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1> + 2779174853U, // <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1> + 2767673299U, // <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6> + 1245659292U, // <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS + 1705433060U, // <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5> + 2779174893U, // <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5> + 2706836152U, // <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u> + 1245662536U, // <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS + 1705433092U, // <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1> + 2779174925U, // <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1> + 1185732398U, // <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS + 2316093653U, // <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2> + 1242349724U, // <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS + 1705430224U, // <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4> + 1705433151U, // <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6> + 2316093981U, // <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6> + 1242352968U, // <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS + 1705433178U, // <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6> + 1584611430U, // <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS + 2781165670U, // <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0> + 2640439226U, // <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7> + 2640440079U, // <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5> + 1584614710U, // <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS + 1705431044U, // <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5> + 631691418U, // <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS + 2779322525U, // <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1> + 631691436U, // <7,u,5,u>: 
Cost 1 vsldoi12 RHS, RHS + 2779175087U, // <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1> + 2779175102U, // <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7> + 1648357887U, // <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u> + 1705433296U, // <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7> + 2779175127U, // <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5> + 2779175138U, // <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7> + 1651012419U, // <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u> + 1705580788U, // <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7> + 1705433341U, // <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7> + 1705580800U, // <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1> + 1187878702U, // <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS + 2768042263U, // <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6> + 1248346268U, // <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS + 1705580840U, // <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5> + 1187879066U, // <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS + 2779322679U, // <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2> + 430361910U, // <7,u,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <7,u,7,u>: Cost 1 vspltisw3 RHS + 1705433425U, // <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1> + 1705433435U, // <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2> + 631691621U, // <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS + 1705433451U, // <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0> + 1705433465U, // <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5> + 1705433475U, // <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6> + 631691661U, // <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS + 430361910U, // <7,u,u,7>: Cost 1 vspltisw3 RHS + 631691675U, // <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS + 202162278U, // <u,0,0,0>: Cost 1 vspltisw0 LHS + 1678598154U, // <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1> + 2634500154U, // <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0> + 2289596269U, // <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3> + 1548815670U, // <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS + 2663698530U, // <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0> + 2658390942U, // <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0> + 2289596597U, // <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7> + 202162278U, // <u,0,0,u>: Cost 1 vspltisw0 LHS + 1560764518U, // <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS + 115720294U, // <u,0,1,1>: Cost 1 vmrghw LHS, LHS + 604856427U, // <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS + 2634508438U, // <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2> + 1560767798U, // <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS + 2652426438U, // <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1> + 1584657311U, // <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1> + 2658399226U, // <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2> + 604856476U, // <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS + 2696889850U, // <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0> + 1190174822U, // <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 2692245096U, // <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2> + 2692245158U, // <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1> + 2263916882U, // <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5> + 2299709908U, // <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5> + 2692245434U, // <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7> + 2701535281U, // <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0> + 1190175389U, // <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS + 1209237504U, // <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0> + 1209239206U, // <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1> + 2704189813U, // <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0> + 2692245916U, // <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3> + 2282981033U, // <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4> + 
2664386658U, // <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0> + 2691877496U, // <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7> + 2664388218U, // <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3> + 1209239213U, // <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u> + 2289623040U, // <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0> + 1678598482U, // <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5> + 2634532926U, // <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4> + 2235580672U, // <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4> + 1143619922U, // <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5> + 1618505014U, // <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 2658423714U, // <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4> + 2713259464U, // <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0> + 1683243409U, // <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5> + 1192443904U, // <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0> + 118702182U, // <u,0,5,1>: Cost 1 vmrghw RHS, LHS + 2266185901U, // <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2> + 2640513816U, // <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5> + 1192444242U, // <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5> + 2718789636U, // <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5> + 1645047915U, // <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0> + 2664404604U, // <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5> + 118702749U, // <u,0,5,u>: Cost 1 vmrghw RHS, LHS + 2302910464U, // <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0> + 1192886374U, // <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 2718790138U, // <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3> + 2722771537U, // <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0> + 2266628434U, // <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5> + 2248950180U, // <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6> + 2718790456U, // <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6> + 2718790478U, // <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1> + 1192886941U, // <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS + 1235812352U, // <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0> + 1235814054U, // <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1> + 2728080601U, // <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0> + 2640530202U, // <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7> + 2640530742U, // <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS + 2309556692U, // <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5> + 2730735133U, // <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0> + 2309556856U, // <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 1235814061U, // <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u> + 202162278U, // <u,0,u,0>: Cost 1 vspltisw0 LHS + 120365158U, // <u,0,u,1>: Cost 1 vmrghw LHS, LHS + 604856989U, // <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS + 2692249532U, // <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1> + 1560825142U, // <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS + 1618507930U, // <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS + 1584714662U, // <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u> + 2309565048U, // <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7> + 604857043U, // <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS + 1611210825U, // <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1> + 1616519270U, // <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS + 2287605459U, // <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2> + 2640546588U, // <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0> + 2622631222U, // <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS + 2289590610U, // <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5> + 2664436630U, // <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1> + 2664437376U, // <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0> + 1616519889U, // <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1> + 
1548894866U, // <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1> + 269271142U, // <u,1,1,1>: Cost 1 vspltisw1 LHS + 1189462934U, // <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0> + 2622638230U, // <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2> + 1548897590U, // <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS + 2756985692U, // <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5> + 2658472872U, // <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1> + 2287614142U, // <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7> + 269271142U, // <u,1,1,u>: Cost 1 vspltisw1 LHS + 1566818406U, // <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS + 2756985735U, // <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3> + 1148371862U, // <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0> + 835584U, // <u,1,2,3>: Cost 0 copy LHS + 1566821686U, // <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS + 2756985771U, // <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3> + 2690262970U, // <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7> + 1590711938U, // <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2> + 835584U, // <u,1,2,u>: Cost 0 copy LHS + 2282979337U, // <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0> + 1209237514U, // <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1> + 1209239702U, // <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 2282979502U, // <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3> + 2282979341U, // <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4> + 1209237842U, // <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2282979505U, // <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 2287625423U, // <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7> + 1209237521U, // <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u> + 1635101613U, // <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1> + 2289623050U, // <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1> + 2289625238U, // <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2> + 2640579360U, // <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4> + 2622663990U, // <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS + 1616522550U, // <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 2664469398U, // <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1> + 2664470148U, // <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4> + 1616522793U, // <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS + 1548927638U, // <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5> + 1192444724U, // <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1> + 1192444822U, // <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0> + 2622670998U, // <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2> + 1548930358U, // <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS + 1210728786U, // <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5> + 2714153058U, // <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0> + 2670449658U, // <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2> + 1548932910U, // <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS + 2622677655U, // <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6> + 2756986063U, // <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7> + 2302912662U, // <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2> + 3696421014U, // <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2> + 2622680374U, // <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS + 2756986099U, // <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7> + 2714153784U, // <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6> + 1651692438U, // <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1> + 1652356071U, // <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1> + 2628657254U, // <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS + 1235812362U, // <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1> + 1235814550U, // <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2> + 2309554350U, // <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3> + 2628660534U, // <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS + 
1235812690U, // <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5> + 2309554353U, // <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6> + 2309554678U, // <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7> + 1235812369U, // <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u> + 1548952217U, // <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u> + 269271142U, // <u,1,u,1>: Cost 1 vspltisw1 LHS + 1209280662U, // <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2> + 835584U, // <u,1,u,3>: Cost 0 copy LHS + 1548954934U, // <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS + 1209278802U, // <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5> + 2283020465U, // <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6> + 1590761096U, // <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u> + 835584U, // <u,1,u,u>: Cost 0 copy LHS + 2702876672U, // <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0> + 1629134950U, // <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS + 2289591912U, // <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2> + 1215848550U, // <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 2702877010U, // <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5> + 2289222708U, // <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5> + 2779178473U, // <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1> + 2726249024U, // <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0> + 1215848555U, // <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS + 2690933539U, // <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2> + 2628683124U, // <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1> + 1189463656U, // <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2> + 1213866086U, // <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 2628685110U, // <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS + 2263205736U, // <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6> + 1189463994U, // <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 2263205866U, // <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1> + 1213866091U, // <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS + 1556938854U, // <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS + 2697569869U, // <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2> + 336380006U, // <u,2,2,2>: Cost 1 vspltisw2 LHS + 1678599794U, // <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3> + 1556942134U, // <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS + 2295138061U, // <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5> + 2702878650U, // <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7> + 2300229831U, // <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7> + 336380006U, // <u,2,2,u>: Cost 1 vspltisw2 LHS + 475243165U, // <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS + 1548985140U, // <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 1209239144U, // <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2> + 135495782U, // <u,2,3,3>: Cost 1 vmrglw LHS, LHS + 475245878U, // <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS + 1596764164U, // <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5> + 1596764666U, // <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3> + 1596765178U, // <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135495787U, // <u,2,3,u>: Cost 1 vmrglw LHS, LHS + 2708851630U, // <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2> + 2217362979U, // <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5> + 2289624680U, // <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2> + 1215881318U, // <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 2726767824U, // <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4> + 1629138230U, // <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 2779178801U, // <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5> + 2726251976U, // <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0> + 1215881323U, // <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS + 2628714598U, // <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS + 2628715896U, // <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5> + 1192445544U, // 
<u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2> + 1213898854U, // <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 2628717878U, // <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS + 2726768644U, // <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5> + 1192445882U, // <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7> + 2266187754U, // <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1> + 1213898859U, // <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS + 2634694758U, // <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS + 2721460657U, // <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2> + 2296940136U, // <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2> + 1678600122U, // <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7> + 2634698038U, // <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS + 3370682125U, // <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5> + 1157056442U, // <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7> + 2725442455U, // <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2> + 1678600167U, // <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7> + 1653027897U, // <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2> + 2309554924U, // <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1> + 1235813992U, // <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2> + 162070630U, // <u,2,7,3>: Cost 1 vmrglw RHS, LHS + 2634706230U, // <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS + 2309555252U, // <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5> + 2309555901U, // <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6> + 2309555416U, // <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7> + 162070635U, // <u,2,7,u>: Cost 1 vmrglw RHS, LHS + 475284130U, // <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS + 1549026100U, // <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1> + 336380006U, // <u,2,u,2>: Cost 1 vspltisw2 LHS + 135536742U, // <u,2,u,3>: Cost 1 vmrglw LHS, LHS + 475286838U, // <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS + 1629141146U, // <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS + 1194108858U, // <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7> + 1596806138U, // <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2> + 135536747U, // <u,2,u,u>: Cost 1 vmrglw LHS, LHS + 1611890688U, // <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0> + 538149020U, // <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685632685U, // <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 2685632764U, // <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0> + 1611891026U, // <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 2733408722U, // <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7> + 2658612153U, // <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0> + 2289592250U, // <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7> + 538149533U, // <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS + 1189464214U, // <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2> + 1611891508U, // <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1> + 1611891606U, // <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0> + 1189464476U, // <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3> + 1189464578U, // <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6> + 2690278511U, // <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1> + 2690278607U, // <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7> + 2287609786U, // <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7> + 1611892092U, // <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0> + 2685634042U, // <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0> + 2685634079U, // <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1> + 1611892328U, // <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2> + 1611892390U, // <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1> + 2685634371U, // <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5> + 2685634453U, // <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6> + 1611892666U, // <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 2300225466U, // <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7> + 1611892795U, // <u,3,2,u>: Cost 2 vsldoi8 LHS, 
<2,u,0,1> + 1209238422U, // <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0> + 2282980247U, // <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1> + 1561004120U, // <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3> + 403488870U, // <u,3,3,3>: Cost 1 vspltisw3 LHS + 1209238426U, // <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4> + 2282980899U, // <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5> + 2282985598U, // <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6> + 1209239482U, // <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 403488870U, // <u,3,3,u>: Cost 1 vspltisw3 LHS + 1555038310U, // <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS + 1555039616U, // <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4> + 2628781672U, // <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2> + 2289624690U, // <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3> + 1555041590U, // <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS + 538152246U, // <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS + 2658644925U, // <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4> + 2289625018U, // <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7> + 538152489U, // <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS + 1192446102U, // <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2> + 2733411983U, // <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1> + 2634762330U, // <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5> + 1192446364U, // <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3> + 1192446466U, // <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6> + 1659670532U, // <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5> + 1659670626U, // <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0> + 2287642554U, // <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7> + 1659670788U, // <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0> + 2634768486U, // <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS + 2733412775U, // <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1> + 1648390659U, // <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3> + 2634770973U, // <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6> + 2634771766U, // <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS + 2733413099U, // <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1> + 1659671352U, // <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6> + 1659671374U, // <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1> + 1652372457U, // <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3> + 1561034854U, // <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 2634777396U, // <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1> + 1561036892U, // <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7> + 1235814002U, // <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3> + 1561038134U, // <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS + 2309555747U, // <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5> + 2309556072U, // <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6> + 1235814330U, // <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7> + 1561040686U, // <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS + 1611896531U, // <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2> + 538154798U, // <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS + 1611896712U, // <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3> + 403488870U, // <u,3,u,3>: Cost 1 vspltisw3 LHS + 1611896895U, // <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6> + 538155162U, // <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS + 1611897040U, // <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7> + 1209280442U, // <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7> + 538155365U, // <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS + 1165118354U, // <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1> + 1618534502U, // <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 2634795102U, // <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0> + 2686451968U, // <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4> + 2692276562U, // <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5> + 1705438098U, // <u,4,0,5>: Cost 2 vsldoi12 
RHS, <4,0,5,1> + 2658685890U, // <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0> + 2256489928U, // <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0> + 1618535069U, // <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1189464978U, // <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1> + 2692277044U, // <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1> + 1618535367U, // <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4> + 2640775992U, // <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1> + 1189465296U, // <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4> + 115723574U, // <u,4,1,5>: Cost 1 vmrghw LHS, RHS + 2263207289U, // <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2> + 2664666780U, // <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1> + 115723817U, // <u,4,1,u>: Cost 1 vmrghw LHS, RHS + 2263919506U, // <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1> + 2222115812U, // <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2> + 2692277864U, // <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2> + 2692277926U, // <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1> + 2324114640U, // <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4> + 1190178102U, // <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278202U, // <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7> + 2701568053U, // <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4> + 1190178345U, // <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS + 2692278422U, // <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2> + 2282981552U, // <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1> + 2704222585U, // <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4> + 2692278684U, // <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3> + 1257016528U, // <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4> + 1209239246U, // <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5> + 2691910300U, // <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7> + 2664683166U, // <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3> + 1209239249U, // <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u> + 1573027942U, // <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS + 2634826695U, // <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4> + 2634827874U, // <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4> + 2289629073U, // <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3> + 229035318U, // <u,4,4,4>: Cost 1 vspltisw0 RHS + 1618537782U, // <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS + 2658718662U, // <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4> + 2289629401U, // <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7> + 229035318U, // <u,4,4,u>: Cost 1 vspltisw0 RHS + 1561092198U, // <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS + 2628863370U, // <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5> + 1561094243U, // <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5> + 2634836118U, // <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2> + 1561095478U, // <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS + 118705462U, // <u,4,5,5>: Cost 1 vmrghw RHS, RHS + 604859702U, // <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS + 2658726906U, // <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2> + 604859720U, // <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS + 2266631058U, // <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1> + 2302692152U, // <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1> + 2718822906U, // <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3> + 2722804309U, // <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4> + 2723467942U, // <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4> + 1192889654U, // <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 2718823224U, // <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6> + 2718823246U, // <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1> + 1192889897U, // <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS + 2640822374U, // <u,4,7,0>: Cost 3 vsldoi4 
<3,u,4,7>, LHS + 2640823194U, // <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4> + 2728113373U, // <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4> + 2640825150U, // <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7> + 1235815632U, // <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4> + 1235814094U, // <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5> + 2730767905U, // <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4> + 2309556892U, // <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 1235814097U, // <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u> + 1561116774U, // <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS + 1618540334U, // <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS + 1561118822U, // <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u> + 2692282300U, // <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1> + 229035318U, // <u,4,u,4>: Cost 1 vspltisw0 RHS + 120368438U, // <u,4,u,5>: Cost 1 vmrghw LHS, RHS + 604859945U, // <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS + 2309565084U, // <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7> + 604859963U, // <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS + 2690293760U, // <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0> + 1616552038U, // <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2640840434U, // <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5> + 2640841536U, // <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0> + 1613381970U, // <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5> + 2316135642U, // <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5> + 2289592834U, // <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6> + 2664732324U, // <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0> + 1616552661U, // <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5> + 1573077094U, // <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 1237536282U, // <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1> + 2690294678U, // <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0> + 2646821014U, // <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2> + 1573080602U, // <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1> + 1189466116U, // <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5> + 1189466210U, // <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0> + 2646823930U, // <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2> + 1573082926U, // <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS + 2640855142U, // <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS + 2697594448U, // <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5> + 2690295400U, // <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2> + 1625179890U, // <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5> + 2699585347U, // <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5> + 2781171471U, // <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3> + 2690295738U, // <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7> + 3775318070U, // <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5> + 1628498055U, // <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5> + 2287627234U, // <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0> + 1257016210U, // <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1> + 2646836942U, // <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5> + 2287625131U, // <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3> + 2287627238U, // <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4> + 1257016538U, // <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5> + 1209240066U, // <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 2287625459U, // <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7> + 1209240068U, // <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u> + 2640871526U, // <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS + 2316168082U, // <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1> + 2640873202U, // <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5> + 2640874308U, // <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4> + 1637788917U, // <u,5,4,4>: Cost 2 vsldoi8 
<4,4,u,5>, <4,4,u,5> + 1616555318U, // <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 2287638591U, // <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6> + 2664765096U, // <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4> + 1616555561U, // <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS + 1573109862U, // <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS + 2646852404U, // <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1> + 2646853224U, // <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2> + 2287646618U, // <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3> + 1573113374U, // <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5> + 296144182U, // <u,5,5,5>: Cost 1 vspltisw1 RHS + 1192448098U, // <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0> + 2287646946U, // <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7> + 296144182U, // <u,5,5,u>: Cost 1 vspltisw1 RHS + 1567146086U, // <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS + 2628945300U, // <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6> + 2634917997U, // <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6> + 1567148870U, // <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6> + 1567149366U, // <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS + 2781171799U, // <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7> + 1228950018U, // <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6> + 27705344U, // <u,5,6,7>: Cost 0 copy RHS + 27705344U, // <u,5,6,u>: Cost 0 copy RHS + 2628952166U, // <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS + 1235815314U, // <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1> + 2309556734U, // <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2> + 2309555115U, // <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3> + 2628955446U, // <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS + 1235815642U, // <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5> + 1235814914U, // <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6> + 2309555443U, // <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7> + 1235814916U, // <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u> + 1567162470U, // <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS + 1616557870U, // <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS + 2690299781U, // <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0> + 1567165256U, // <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u> + 1567165750U, // <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS + 296144182U, // <u,5,u,5>: Cost 1 vspltisw1 RHS + 1209281026U, // <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6> + 27705344U, // <u,5,u,7>: Cost 0 copy RHS + 27705344U, // <u,5,u,u>: Cost 0 copy RHS + 2705563648U, // <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0> + 1631821926U, // <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 2262462970U, // <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3> + 2646886941U, // <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6> + 2705563986U, // <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5> + 2316062652U, // <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5> + 2316137272U, // <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6> + 1215851830U, // <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 1215851831U, // <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS + 2634948710U, // <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS + 2705564468U, // <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1> + 1189466618U, // <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3> + 2263208498U, // <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5> + 2693620843U, // <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6> + 2652868860U, // <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1> + 1189466936U, // <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6> + 1213869366U, // <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 1213869367U, // <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS + 2658844774U, // <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS + 3771344465U, // <u,6,2,1>: Cost 4 
vsldoi8 <2,1,u,6>, <2,1,u,6> + 1178554874U, // <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3> + 2698929907U, // <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6> + 2699593540U, // <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6> + 2700257173U, // <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6> + 2705565626U, // <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7> + 1226485046U, // <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 1226485047U, // <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS + 2705565846U, // <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2> + 2330756585U, // <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1> + 2330756829U, // <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2> + 2282981734U, // <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3> + 1631824413U, // <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6> + 2652885246U, // <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3> + 1257018168U, // <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6> + 135499062U, // <u,6,3,7>: Cost 1 vmrglw LHS, RHS + 135499063U, // <u,6,3,u>: Cost 1 vmrglw LHS, RHS + 2646917222U, // <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS + 2217365931U, // <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5> + 2790167156U, // <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u> + 2646919709U, // <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6> + 2711538934U, // <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6> + 1631825206U, // <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 2316170040U, // <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6> + 1215884598U, // <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 1215884599U, // <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS + 2634981478U, // <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS + 2266190247U, // <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1> + 1192448506U, // <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3> + 2266190386U, // <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5> + 2634984758U, // <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS + 2652901632U, // <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5> + 1192448824U, // <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6> + 1213902134U, // <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 1213902135U, // <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS + 1583808614U, // <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS + 2322010445U, // <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1> + 2718839290U, // <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3> + 2670823965U, // <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6> + 1583811894U, // <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS + 2724147961U, // <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6> + 363253046U, // <u,6,6,6>: Cost 1 vspltisw2 RHS + 1229172022U, // <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS + 363253046U, // <u,6,6,u>: Cost 1 vspltisw2 RHS + 499458150U, // <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS + 1573200692U, // <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1> + 1573201512U, // <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573202070U, // <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499461673U, // <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS + 1573203972U, // <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5> + 1235817272U, // <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6> + 162073910U, // <u,6,7,7>: Cost 1 vmrglw RHS, RHS + 162073911U, // <u,6,7,u>: Cost 1 vmrglw RHS, RHS + 499466342U, // <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS + 1631827758U, // <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS + 1573209704U, // <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2> + 1573210262U, // <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2> + 499469866U, // <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS + 1631828122U, // <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS + 363253046U, // <u,6,u,6>: Cost 1 vspltisw2 RHS + 135540022U, // <u,6,u,7>: Cost 1 vmrglw 
LHS, RHS + 135540023U, // <u,6,u,u>: Cost 1 vmrglw LHS, RHS + 1638465536U, // <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0> + 564723814U, // <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS + 2712207533U, // <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2> + 2712207612U, // <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0> + 1638465874U, // <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5> + 1579192580U, // <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0> + 2712207862U, // <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7> + 2316137282U, // <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7> + 564724381U, // <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS + 1189467130U, // <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2> + 1638466356U, // <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1> + 1638466454U, // <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0> + 2311500282U, // <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3> + 1189467494U, // <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6> + 2712208495U, // <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1> + 2694956302U, // <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7> + 1189467756U, // <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7> + 1638466940U, // <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0> + 2712208829U, // <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2> + 2712208927U, // <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1> + 1638467176U, // <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2> + 1638467238U, // <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1> + 2712209165U, // <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5> + 2712209256U, // <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6> + 1627187175U, // <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7> + 2324116290U, // <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7> + 1628514441U, // <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7> + 1638467734U, // <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2> + 2712209638U, // <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1> + 2700929387U, // <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u> + 1638467996U, // <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3> + 1638468098U, // <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6> + 2712210002U, // <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5> + 1585189856U, // <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3> + 1257018178U, // <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7> + 1638468382U, // <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2> + 1638468498U, // <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1> + 2712210378U, // <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3> + 2712210485U, // <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2> + 2712210564U, // <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0> + 1638468816U, // <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4> + 564727112U, // <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS + 2712210809U, // <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2> + 2712210888U, // <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0> + 564727337U, // <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS + 1192449018U, // <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2> + 2714201743U, // <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1> + 2712211198U, // <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4> + 2311533050U, // <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3> + 1192449382U, // <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6> + 1638469636U, // <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5> + 1638469730U, // <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0> + 1192449644U, // <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7> + 1638469892U, // <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0> + 2712211745U, // <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2> + 2712211879U, // <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1> + 1638470138U, // <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 2712212018U, // <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5> + 2712212109U, // <u,7,6,4>: Cost 3 vsldoi8 RHS, 
<6,4,5,6> + 2712212203U, // <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1> + 1638470456U, // <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6> + 1638470478U, // <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1> + 1638470559U, // <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1> + 1235816546U, // <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0> + 2309558371U, // <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1> + 2641045434U, // <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7> + 1235816954U, // <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3> + 1235816550U, // <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4> + 2309558375U, // <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5> + 1585222628U, // <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7> + 430361910U, // <u,7,7,7>: Cost 1 vspltisw3 RHS + 430361910U, // <u,7,7,u>: Cost 1 vspltisw3 RHS + 1638471379U, // <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2> + 564729646U, // <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS + 1638471557U, // <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0> + 1638471612U, // <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1> + 1638471743U, // <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6> + 564730010U, // <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS + 1638471888U, // <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7> + 430361910U, // <u,7,u,7>: Cost 1 vspltisw3 RHS + 564730213U, // <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS + 202162278U, // <u,u,0,0>: Cost 1 vspltisw0 LHS + 538189985U, // <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS + 2685673645U, // <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2> + 1215848604U, // <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS + 1611931986U, // <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5> + 1579266317U, // <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0> + 2289592861U, // <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6> + 1215851848U, // <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS + 538190493U, // <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS + 1549411025U, // <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1> + 115726126U, // <u,u,1,1>: Cost 1 vmrghw LHS, LHS + 604862254U, // <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS + 1213866140U, // <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS + 1549413686U, // <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS + 115726490U, // <u,u,1,5>: Cost 1 vmrghw LHS, RHS + 1585247207U, // <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1> + 1213869384U, // <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS + 604862308U, // <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS + 1567334502U, // <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS + 1190180654U, // <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS + 336380006U, // <u,u,2,2>: Cost 1 vspltisw2 LHS + 835584U, // <u,u,2,3>: Cost 0 copy LHS + 1567337782U, // <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS + 1190181018U, // <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS + 1611933626U, // <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7> + 1226485064U, // <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS + 835584U, // <u,u,2,u>: Cost 0 copy LHS + 475685587U, // <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS + 1209239278U, // <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1> + 1209239765U, // <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2> + 135495836U, // <u,u,3,3>: Cost 1 vmrglw LHS, LHS + 475688246U, // <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS + 1209239282U, // <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5> + 1209240093U, // <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6> + 135499080U, // <u,u,3,7>: Cost 1 vmrglw LHS, RHS + 135495841U, // <u,u,3,u>: Cost 1 vmrglw LHS, LHS + 1555406950U, // <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS + 1555408301U, // <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4> + 2289625301U, // <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2> + 1215881372U, // <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS + 229035318U, // <u,u,4,4>: Cost 1 vspltisw0 
RHS + 538193206U, // <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS + 2289625629U, // <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6> + 1215884616U, // <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS + 538193449U, // <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS + 1549443797U, // <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5> + 118708014U, // <u,u,5,1>: Cost 1 vmrghw RHS, LHS + 1561389191U, // <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5> + 1213898908U, // <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS + 1549446454U, // <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS + 118708378U, // <u,u,5,5>: Cost 1 vmrghw RHS, RHS + 604862618U, // <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS + 1213902152U, // <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS + 604862636U, // <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS + 1567367270U, // <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS + 1192892206U, // <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS + 1638478330U, // <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3> + 1679046864U, // <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7> + 1567370550U, // <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS + 1192892570U, // <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS + 363253046U, // <u,u,6,6>: Cost 1 vspltisw2 RHS + 27705344U, // <u,u,6,7>: Cost 0 copy RHS + 27705344U, // <u,u,6,u>: Cost 0 copy RHS + 499605606U, // <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS + 1235812425U, // <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1> + 1561405577U, // <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7> + 162070684U, // <u,u,7,3>: Cost 1 vmrglw RHS, LHS + 499609147U, // <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS + 1235812753U, // <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5> + 1235814941U, // <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6> + 162073928U, // <u,u,7,7>: Cost 1 vmrglw RHS, RHS + 162070689U, // <u,u,7,u>: Cost 1 vmrglw RHS, LHS + 475726552U, // <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS + 538195758U, // <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS + 604862821U, // <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS + 835584U, // <u,u,u,3>: Cost 0 copy LHS + 475729206U, // <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS + 538196122U, // <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS + 604862861U, // <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS + 27705344U, // <u,u,u,7>: Cost 0 copy RHS + 835584U, // <u,u,u,u>: Cost 0 copy LHS + 0 +}; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp new file mode 100644 index 000000000000..9895ee6267aa --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -0,0 +1,1007 @@ +//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCRegisterInfo.h" +#include "PPC.h" +#include "PPCFrameLowering.h" +#include "PPCInstrBuilder.h" +#include "PPCMachineFunctionInfo.h" +#include "PPCSubtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include <cstdlib> + +using namespace llvm; + +#define DEBUG_TYPE "reginfo" + +#define GET_REGINFO_TARGET_DESC +#include "PPCGenRegisterInfo.inc" + +static cl::opt<bool> +EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true), + cl::desc("Enable use of a base pointer for complex stack frames")); + +static cl::opt<bool> +AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false), + cl::desc("Force the use of a base pointer in every function")); + +PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST) + : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR, + ST.isPPC64() ? 0 : 1, + ST.isPPC64() ? 0 : 1), + Subtarget(ST) { + ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX; + ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX; + ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX; + ImmToIdxMap[PPC::LWZ] = PPC::LWZX; ImmToIdxMap[PPC::LWA] = PPC::LWAX; + ImmToIdxMap[PPC::LFS] = PPC::LFSX; ImmToIdxMap[PPC::LFD] = PPC::LFDX; + ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX; + ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX; + ImmToIdxMap[PPC::ADDI] = PPC::ADD4; + ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32; + + // 64-bit + ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8; + ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8; + ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8; + ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX; + ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; +} + +/// getPointerRegClass - Return the register class to use to hold pointers. +/// This is used for addressing modes. +const TargetRegisterClass * +PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) + const { + // Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value + // when it checks for ZERO folding. + if (Kind == 1) { + if (Subtarget.isPPC64()) + return &PPC::G8RC_NOX0RegClass; + return &PPC::GPRC_NOR0RegClass; + } + + if (Subtarget.isPPC64()) + return &PPC::G8RCRegClass; + return &PPC::GPRCRegClass; +} + +const MCPhysReg* +PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + if (Subtarget.isDarwinABI()) + return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? + CSR_Darwin64_Altivec_SaveList : + CSR_Darwin64_SaveList) : + (Subtarget.hasAltivec() ? 
+ CSR_Darwin32_Altivec_SaveList : + CSR_Darwin32_SaveList); + + return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? + CSR_SVR464_Altivec_SaveList : + CSR_SVR464_SaveList) : + (Subtarget.hasAltivec() ? + CSR_SVR432_Altivec_SaveList : + CSR_SVR432_SaveList); +} + +const uint32_t* +PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (Subtarget.isDarwinABI()) + return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? + CSR_Darwin64_Altivec_RegMask : + CSR_Darwin64_RegMask) : + (Subtarget.hasAltivec() ? + CSR_Darwin32_Altivec_RegMask : + CSR_Darwin32_RegMask); + + return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ? + CSR_SVR464_Altivec_RegMask : + CSR_SVR464_RegMask) : + (Subtarget.hasAltivec() ? + CSR_SVR432_Altivec_RegMask : + CSR_SVR432_RegMask); +} + +const uint32_t* +PPCRegisterInfo::getNoPreservedMask() const { + return CSR_NoRegs_RegMask; +} + +BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + const PPCFrameLowering *PPCFI = + static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering()); + + // The ZERO register is not really a register, but the representation of r0 + // when used in instructions that treat r0 as the constant 0. + Reserved.set(PPC::ZERO); + Reserved.set(PPC::ZERO8); + + // The FP register is also not really a register, but is the representation + // of the frame pointer register used by ISD::FRAMEADDR. + Reserved.set(PPC::FP); + Reserved.set(PPC::FP8); + + // The BP register is also not really a register, but is the representation + // of the base pointer register used by setjmp. + Reserved.set(PPC::BP); + Reserved.set(PPC::BP8); + + // The counter registers must be reserved so that counter-based loops can + // be correctly formed (and the mtctr instructions are not DCE'd). + Reserved.set(PPC::CTR); + Reserved.set(PPC::CTR8); + + Reserved.set(PPC::R1); + Reserved.set(PPC::LR); + Reserved.set(PPC::LR8); + Reserved.set(PPC::RM); + + if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec()) + Reserved.set(PPC::VRSAVE); + + // The SVR4 ABI reserves r2 and r13 + if (Subtarget.isSVR4ABI()) { + Reserved.set(PPC::R2); // System-reserved register + Reserved.set(PPC::R13); // Small Data Area pointer register + } + + // On PPC64, r13 is the thread pointer. Never allocate this register. + if (Subtarget.isPPC64()) { + Reserved.set(PPC::R13); + + Reserved.set(PPC::X1); + Reserved.set(PPC::X13); + + if (PPCFI->needsFP(MF)) + Reserved.set(PPC::X31); + + if (hasBasePointer(MF)) + Reserved.set(PPC::X30); + + // The 64-bit SVR4 ABI reserves r2 for the TOC pointer. + if (Subtarget.isSVR4ABI()) { + Reserved.set(PPC::X2); + } + } + + if (PPCFI->needsFP(MF)) + Reserved.set(PPC::R31); + + if (hasBasePointer(MF)) { + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) + Reserved.set(PPC::R29); + else + Reserved.set(PPC::R30); + } + + if (Subtarget.isSVR4ABI() && !Subtarget.isPPC64() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) + Reserved.set(PPC::R30); + + // Reserve Altivec registers when Altivec is unavailable. 
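+  // With Altivec disabled, none of v0-v31 may be handed out by the
+  // allocator, so walk the whole VRRC register class below and mark each
+  // of its registers reserved.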
+ if (!Subtarget.hasAltivec()) + for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(), + IE = PPC::VRRCRegClass.end(); I != IE; ++I) + Reserved.set(*I); + + return Reserved; +} + +unsigned +PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const unsigned DefaultSafety = 1; + + switch (RC->getID()) { + default: + return 0; + case PPC::G8RC_NOX0RegClassID: + case PPC::GPRC_NOR0RegClassID: + case PPC::G8RCRegClassID: + case PPC::GPRCRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 32 - FP - DefaultSafety; + } + case PPC::F8RCRegClassID: + case PPC::F4RCRegClassID: + case PPC::VRRCRegClassID: + case PPC::VFRCRegClassID: + case PPC::VSLRCRegClassID: + case PPC::VSHRCRegClassID: + return 32 - DefaultSafety; + case PPC::VSRCRegClassID: + case PPC::VSFRCRegClassID: + return 64 - DefaultSafety; + case PPC::CRRCRegClassID: + return 8 - DefaultSafety; + } +} + +const TargetRegisterClass* +PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)const { + if (Subtarget.hasVSX()) { + // With VSX, we can inflate various sub-register classes to the full VSX + // register set. + + if (RC == &PPC::F8RCRegClass) + return &PPC::VSFRCRegClass; + else if (RC == &PPC::VRRCRegClass) + return &PPC::VSRCRegClass; + } + + return TargetRegisterInfo::getLargestLegalSuperClass(RC); +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +/// lowerDynamicAlloc - Generate the code for allocating an object in the +/// current frame. The sequence of code will be in the general form +/// +/// addi R0, SP, \#frameSize ; get the address of the previous frame +/// stwux R0, SP, Rnegsize ; add and update the SP with the negated size +/// addi Rnew, SP, \#maxCallFrameSize ; get the top of the allocation +/// +void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + // Get the instruction info. + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + // Determine whether 64-bit pointers are used. + bool LP64 = Subtarget.isPPC64(); + DebugLoc dl = MI.getDebugLoc(); + + // Get the maximum call stack size. + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + // Get the total frame size. + unsigned FrameSize = MFI->getStackSize(); + + // Get stack alignments. + unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned MaxAlign = MFI->getMaxAlignment(); + assert((maxCallFrameSize & (MaxAlign-1)) == 0 && + "Maximum call-frame size not sufficiently aligned"); + + // Determine the previous frame's address. If FrameSize can't be + // represented as 16 bits or we need special alignment, then we load the + // previous frame's address from 0(SP). Why not do an addis of the hi? + // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. + // Constructing the constant and adding would take 3 instructions. + // Fortunately, a frame greater than 32K is rare.
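+  // Concretely, the small-frame case below recomputes the old SP with
+  //   addi rA, r31, frameSize
+  // while the large or over-aligned case reloads it from the back-chain word:
+  //   ld rA, 0(r1)          (lwz rA, 0(r1) on 32-bit)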
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) + .addReg(PPC::R31) + .addImm(FrameSize); + } else if (LP64) { + BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg) + .addImm(0) + .addReg(PPC::X1); + } else { + BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg) + .addImm(0) + .addReg(PPC::R1); + } + + bool KillNegSizeReg = MI.getOperand(1).isKill(); + unsigned NegSizeReg = MI.getOperand(1).getReg(); + + // Grow the stack and update the stack pointer link, then determine the + // address of the newly allocated space. + if (LP64) { + if (MaxAlign > TargetAlign) { + unsigned UnalNegSizeReg = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC); + + // Unfortunately, there is no andi, only andi., and we can't insert that + // here because we might clobber cr0 while it is live. + BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg) + .addImm(~(MaxAlign-1)); + + unsigned NegSizeReg1 = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC); + BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg) + .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg)) + .addReg(NegSizeReg1, RegState::Kill); + KillNegSizeReg = true; + } + + BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1) + .addReg(Reg, RegState::Kill) + .addReg(PPC::X1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg()) + .addReg(PPC::X1) + .addImm(maxCallFrameSize); + } else { + if (MaxAlign > TargetAlign) { + unsigned UnalNegSizeReg = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); + + // Unfortunately, there is no andi, only andi., and we can't insert that + // here because we might clobber cr0 while it is live. + BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg) + .addImm(~(MaxAlign-1)); + + unsigned NegSizeReg1 = NegSizeReg; + NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); + BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg) + .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg)) + .addReg(NegSizeReg1, RegState::Kill); + KillNegSizeReg = true; + } + + BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1) + .addReg(Reg, RegState::Kill) + .addReg(PPC::R1) + .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg()) + .addReg(PPC::R1) + .addImm(maxCallFrameSize); + } + + // Discard the DYNALLOC instruction. + MBB.erase(II); +} + +/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of +/// reserving a whole register (R0), we scrounge for one here. This generates +/// code like this: +/// +/// mfcr rA ; Move the condition register into GPR rA. +/// rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot. +/// stw rA, FI ; Store rA to the frame. +/// +void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset> + // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = Subtarget.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + // We need to store the CR in the low 4-bits of the saved value. First, issue + // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + // If the saved register wasn't CR0, shift the bits left so that they are in + // CR0's slot. + if (SrcReg != PPC::CR0) { + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + // rlwinm rA, rA, ShiftBits, 0, 31. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill) + .addImm(getEncodingValue(SrcReg) * 4) + .addImm(0) + .addImm(31); + } + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) + .addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CR <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = Subtarget.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_CR does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ), + Reg), FrameIndex); + + // If the reloaded register isn't CR0, shift the bits right so that they are + // in the right CR's slot. + if (DestReg != PPC::CR0) { + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + unsigned ShiftBits = getEncodingValue(DestReg)*4; + // rlwinm r11, r11, 32-ShiftBits, 0, 31. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0) + .addImm(31); + } + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg) + .addReg(Reg, RegState::Kill); + + // Discard the pseudo instruction. 
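+  // The load/shift/mtocrf sequence emitted above performs the actual
+  // restore, so the RESTORE_CR pseudo itself is no longer needed.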
+ MBB.erase(II); +} + +static unsigned getCRFromCRBit(unsigned SrcReg) { + unsigned Reg = 0; + if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT || + SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN) + Reg = PPC::CR0; + else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT || + SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN) + Reg = PPC::CR1; + else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT || + SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN) + Reg = PPC::CR2; + else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT || + SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN) + Reg = PPC::CR3; + else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT || + SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN) + Reg = PPC::CR4; + else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT || + SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN) + Reg = PPC::CR5; + else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT || + SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN) + Reg = PPC::CR6; + else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT || + SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN) + Reg = PPC::CR7; + + assert(Reg != 0 && "Invalid CR bit register"); + return Reg; +} + +void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_CRBIT <SrcReg>, <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = Subtarget.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL), + getCRFromCRBit(SrcReg)) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) + .addReg(getCRFromCRBit(SrcReg)); + + // If the saved register wasn't CR0LT, shift the bits left so that the bit to + // store is the first one. Mask all but that bit. + unsigned Reg1 = Reg; + Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + + // rlwinm rA, rA, ShiftBits, 0, 0. + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg) + .addReg(Reg1, RegState::Kill) + .addImm(getEncodingValue(SrcReg)) + .addImm(0).addImm(0); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW)) + .addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CRBIT <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + bool LP64 = Subtarget.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + + unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? 
G8RC : GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_CRBIT does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ), + Reg), FrameIndex); + + BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg); + + unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO) + .addReg(getCRFromCRBit(DestReg)); + + unsigned ShiftBits = getEncodingValue(DestReg); + // rlwimi r11, r10, 32-ShiftBits, ..., ... + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO) + .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill) + .addImm(ShiftBits ? 32-ShiftBits : 0) + .addImm(ShiftBits).addImm(ShiftBits); + + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), + getCRFromCRBit(DestReg)) + .addReg(RegO, RegState::Kill) + // Make sure we have a use dependency all the way through this + // sequence of instructions. We can't have the other bits in the CR + // modified in between the mfocrf and the mtocrf. + .addReg(getCRFromCRBit(DestReg), RegState::Implicit); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); + unsigned SrcReg = MI.getOperand(0).getReg(); + + BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW)) + .addReg(Reg, RegState::Kill), + FrameIndex); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + // Get the instruction. + MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset> + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc dl = MI.getDebugLoc(); + + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(MI.definesRegister(DestReg) && + "RESTORE_VRSAVE does not define its destination"); + + addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ), + Reg), FrameIndex); + + BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg) + .addReg(Reg, RegState::Kill); + + // Discard the pseudo instruction. + MBB.erase(II); +} + +bool +PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + + // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 + // ABI, return true to prevent allocating an additional frame slot. + // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0 + // is arbitrary and will be subsequently ignored. For 32-bit, we have + // previously created the stack slot if needed, so return its FrameIdx. 
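+  // For example, a 64-bit SVR4 spill of CR3 takes this path with
+  // FrameIdx = 0; the save code later addresses the CR save word at SP+8
+  // rather than allocating a fresh slot.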
+ if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) { + if (Subtarget.isPPC64()) + FrameIdx = 0; + else { + const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + FrameIdx = FI->getCRSpillFrameIndex(); + } + return true; + } + return false; +} + +// Figure out if the offset in the instruction must be a multiple of 4. +// This is true for instructions like "STD". +static bool usesIXAddr(const MachineInstr &MI) { + unsigned OpC = MI.getOpcode(); + + switch (OpC) { + default: + return false; + case PPC::LWA: + case PPC::LWA_32: + case PPC::LD: + case PPC::STD: + return true; + } +} + +// Return the OffsetOperandNo given the FIOperandNum (and the instruction). +static unsigned getOffsetONFromFION(const MachineInstr &MI, + unsigned FIOperandNum) { + // Take into account whether it's an add or mem instruction. + unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2; + if (MI.isInlineAsm()) + OffsetOperandNo = FIOperandNum-1; + + return OffsetOperandNo; +} + +void +PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the instruction info. + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + DebugLoc dl = MI.getDebugLoc(); + + unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum); + + // Get the frame index. + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + + // Get the frame pointer save index. Users of this index are primarily + // DYNALLOC instructions. + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + int FPSI = FI->getFramePointerSaveIndex(); + // Get the instruction opcode. + unsigned OpC = MI.getOpcode(); + + // Special case for dynamic alloca. + if (FPSI && FrameIndex == FPSI && + (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { + lowerDynamicAlloc(II); + return; + } + + // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc. + if (OpC == PPC::SPILL_CR) { + lowerCRSpilling(II, FrameIndex); + return; + } else if (OpC == PPC::RESTORE_CR) { + lowerCRRestore(II, FrameIndex); + return; + } else if (OpC == PPC::SPILL_CRBIT) { + lowerCRBitSpilling(II, FrameIndex); + return; + } else if (OpC == PPC::RESTORE_CRBIT) { + lowerCRBitRestore(II, FrameIndex); + return; + } else if (OpC == PPC::SPILL_VRSAVE) { + lowerVRSAVESpilling(II, FrameIndex); + return; + } else if (OpC == PPC::RESTORE_VRSAVE) { + lowerVRSAVERestore(II, FrameIndex); + return; + } + + // Replace the FrameIndex with the appropriate base register: the base + // pointer for fixed (negative) indices, otherwise GPR1 (SP) or GPR31 (FP). + MI.getOperand(FIOperandNum).ChangeToRegister( + FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); + + // Figure out if the offset in the instruction is shifted right two bits. + bool isIXAddr = usesIXAddr(MI); + + // If the instruction is not present in ImmToIdxMap, then it has no immediate + // form (and must be r+r). + bool noImmForm = !MI.isInlineAsm() && !ImmToIdxMap.count(OpC); + + // Now add the frame object offset to the offset from r1.
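+  // For example, an object at frame offset -8 accessed with an immediate
+  // displacement of 4 in a 64-byte frame resolves below to
+  // -8 + 4 + 64 = 60 bytes above r1.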
+ int Offset = MFI->getObjectOffset(FrameIndex); + Offset += MI.getOperand(OffsetOperandNo).getImm(); + + // If we're not using a Frame Pointer that has been set to the value of the + // SP before having the stack size subtracted from it, then add the stack size + // to Offset to get the correct offset. + // Naked functions have stack size 0, although getStackSize may not reflect that + // because we didn't call all the pieces that compute it for naked functions. + if (!MF.getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked)) { + if (!(hasBasePointer(MF) && FrameIndex < 0)) + Offset += MFI->getStackSize(); + } + + // If we can, encode the offset directly into the instruction. If this is a + // normal PPC "ri" instruction, any 16-bit value can be safely encoded. If + // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits + // clear can be encoded. This is extremely uncommon, because normally you + // only "std" to a stack slot that is at least 4-byte aligned, but it can + // happen in invalid code. + assert(OpC != PPC::DBG_VALUE && + "This should be handled in a target-independent way"); + if (!noImmForm && isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) { + MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); + return; + } + + // The offset doesn't fit in the instruction's immediate field, so scavenge + // a register and build the offset in it. + + bool is64Bit = Subtarget.isPPC64(); + const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; + const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; + const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC; + unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC), + SReg = MF.getRegInfo().createVirtualRegister(RC); + + // Insert a set of rA with the full offset value before the ld, st, or add. + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi) + .addImm(Offset >> 16); + BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg) + .addReg(SRegHi, RegState::Kill) + .addImm(Offset); + + // Convert into indexed form of the instruction: + // + // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0 + // addi 0:rA, 1:rB, 2:imm ==> add 0:rA, 1:rB, 2:r0 + unsigned OperandBase; + + if (noImmForm) + OperandBase = 1; + else if (OpC != TargetOpcode::INLINEASM) { + assert(ImmToIdxMap.count(OpC) && + "No indexed form of load or store available!"); + unsigned NewOpcode = ImmToIdxMap.find(OpC)->second; + MI.setDesc(TII.get(NewOpcode)); + OperandBase = 1; + } else { + OperandBase = OffsetOperandNo; + } + + unsigned StackReg = MI.getOperand(FIOperandNum).getReg(); + MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); + MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); +} + +unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (!Subtarget.isPPC64()) + return TFI->hasFP(MF) ? PPC::R31 : PPC::R1; + else + return TFI->hasFP(MF) ?
PPC::X31 : PPC::X1; +} + +unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const { + if (!hasBasePointer(MF)) + return getFrameRegister(MF); + + if (Subtarget.isPPC64()) + return PPC::X30; + + if (Subtarget.isSVR4ABI() && + MF.getTarget().getRelocationModel() == Reloc::PIC_) + return PPC::R29; + + return PPC::R30; +} + +bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const { + if (!EnableBasePointer) + return false; + if (AlwaysBasePointer) + return true; + + // If we need to realign the stack, then the stack pointer can no longer + // serve as an offset into the caller's stack space. As a result, we need a + // base pointer. + return needsStackRealignment(MF); +} + +bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + return false; + + return true; +} + +bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); + unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + bool requiresRealignment = + ((MFI->getMaxAlignment() > StackAlign) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackAlignment)); + + return requiresRealignment && canRealignStack(MF); +} + +/// Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool PPCRegisterInfo:: +needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + assert(Offset < 0 && "Local offset must be negative"); + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores that have + // an r+i form. Return false for everything else. + unsigned OpC = MI->getOpcode(); + if (!ImmToIdxMap.count(OpC)) + return false; + + // Don't generate a new virtual base register just to add zero to it. + if ((OpC == PPC::ADDI || OpC == PPC::ADDI8) && + MI->getOperand(2).getImm() == 0) + return false; + + MachineBasicBlock &MBB = *MI->getParent(); + MachineFunction &MF = *MBB.getParent(); + + const PPCFrameLowering *PPCFI = + static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering()); + unsigned StackEst = + PPCFI->determineFrameLayout(MF, false, true); + + // If we likely don't need a stack frame, then we probably don't need a + // virtual base register either. + if (!StackEst) + return false; + + // Estimate an offset from the stack pointer. + // The incoming offset is relative to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset += StackEst; + + // The frame pointer will point to the end of the stack, so estimate the + // offset as the difference between the object offset and the FP location.
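+  // For instance, a local at incoming offset -70000 with an estimated
+  // 34000-byte frame lands at -36000, outside the signed 16-bit
+  // displacement range, so a new virtual base register pays for itself.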
+ return !isFrameOffsetLegal(MI, Offset); +} + +/// Insert defining instruction(s) for BaseReg to +/// be a pointer to FrameIdx at the beginning of the basic block. +void PPCRegisterInfo:: +materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const { + unsigned ADDriOpc = Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI; + + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + const MachineFunction &MF = *MBB->getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const MCInstrDesc &MCID = TII.get(ADDriOpc); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); + + BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx).addImm(Offset); +} + +void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + unsigned FIOperandNum = 0; + while (!MI.getOperand(FIOperandNum).isFI()) { + ++FIOperandNum; + assert(FIOperandNum < MI.getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false); + unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum); + Offset += MI.getOperand(OffsetOperandNo).getImm(); + MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const MCInstrDesc &MCID = MI.getDesc(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.constrainRegClass(BaseReg, + TII.getRegClass(MCID, FIOperandNum, this, MF)); +} + +bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + unsigned FIOperandNum = 0; + while (!MI->getOperand(FIOperandNum).isFI()) { + ++FIOperandNum; + assert(FIOperandNum < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum); + Offset += MI->getOperand(OffsetOperandNo).getImm(); + + return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm + (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h new file mode 100644 index 000000000000..13a35f6309d4 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -0,0 +1,113 @@ +//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the PowerPC implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef POWERPC32_REGISTERINFO_H +#define POWERPC32_REGISTERINFO_H + +#include "PPC.h" +#include "llvm/ADT/DenseMap.h" + +#define GET_REGINFO_HEADER +#include "PPCGenRegisterInfo.inc" + +namespace llvm { +class PPCSubtarget; +class TargetInstrInfo; +class Type; + +class PPCRegisterInfo : public PPCGenRegisterInfo { + DenseMap<unsigned, unsigned> ImmToIdxMap; + const PPCSubtarget &Subtarget; +public: + PPCRegisterInfo(const PPCSubtarget &SubTarget); + + /// getPointerRegClass - Return the register class to use to hold pointers. + /// This is used for addressing modes. + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; + + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const override; + + /// Code Generation virtual methods... + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction* MF =nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override; + const uint32_t *getNoPreservedMask() const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + + /// We require the register scavenger. + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } + + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override { + return true; + } + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override { + return true; + } + + void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerCRSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRBitSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerCRBitRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerVRSAVESpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerVRSAVERestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + + // Support for virtual base registers. + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const override; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + // Base pointer (stack realignment) support. 
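+  // Editorial note: as implemented in PPCRegisterInfo.cpp, getBaseRegister
+  // returns r30 (x30 in 64-bit mode), except under the 32-bit SVR4 ABI with
+  // PIC relocation, where r29 is returned instead -- presumably because r30
+  // is already reserved as the PIC base there (an assumption; the source
+  // does not state the reason).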
+ unsigned getBaseRegister(const MachineFunction &MF) const; + bool hasBasePointer(const MachineFunction &MF) const; + bool canRealignStack(const MachineFunction &MF) const; + bool needsStackRealignment(const MachineFunction &MF) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td new file mode 100644 index 000000000000..b3d145b2cc49 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -0,0 +1,314 @@ +//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +let Namespace = "PPC" in { +def sub_lt : SubRegIndex<1>; +def sub_gt : SubRegIndex<1, 1>; +def sub_eq : SubRegIndex<1, 2>; +def sub_un : SubRegIndex<1, 3>; +def sub_32 : SubRegIndex<32>; +def sub_64 : SubRegIndex<64>; +def sub_128 : SubRegIndex<128>; +} + + +class PPCReg<string n> : Register<n> { + let Namespace = "PPC"; +} + +// We identify all our registers with a 5-bit ID, for consistency's sake. + +// GPR - One of the 32 32-bit general-purpose registers +class GPR<bits<5> num, string n> : PPCReg<n> { + let HWEncoding{4-0} = num; +} + +// GP8 - One of the 32 64-bit general-purpose registers +class GP8<GPR SubReg, string n> : PPCReg<n> { + let HWEncoding = SubReg.HWEncoding; + let SubRegs = [SubReg]; + let SubRegIndices = [sub_32]; +} + +// SPR - One of the 32-bit special-purpose registers +class SPR<bits<10> num, string n> : PPCReg<n> { + let HWEncoding{9-0} = num; +} + +// FPR - One of the 32 64-bit floating-point registers +class FPR<bits<5> num, string n> : PPCReg<n> { + let HWEncoding{4-0} = num; +} + +// VF - One of the 32 64-bit floating-point subregisters of the vector +// registers (used by VSX). +class VF<bits<5> num, string n> : PPCReg<n> { + let HWEncoding{4-0} = num; + let HWEncoding{5} = 1; +} + +// VR - One of the 32 128-bit vector registers +class VR<VF SubReg, string n> : PPCReg<n> { + let HWEncoding{4-0} = SubReg.HWEncoding{4-0}; + let HWEncoding{5} = 0; + let SubRegs = [SubReg]; + let SubRegIndices = [sub_64]; +} + +// VSRL - One of the 32 128-bit VSX registers that overlap with the scalar +// floating-point registers. +class VSRL<FPR SubReg, string n> : PPCReg<n> { + let HWEncoding = SubReg.HWEncoding; + let SubRegs = [SubReg]; + let SubRegIndices = [sub_64]; +} + +// VSRH - One of the 32 128-bit VSX registers that overlap with the vector +// registers. 
+class VSRH<VR SubReg, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = SubReg.HWEncoding{4-0};
+  let HWEncoding{5} = 1;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_128];
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// General-purpose registers
+foreach Index = 0-31 in {
+  def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
+}
+
+// 64-bit General-purpose registers
+foreach Index = 0-31 in {
+  def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>,
+                DwarfRegNum<[Index, -2]>;
+}
+
+// Floating-point registers
+foreach Index = 0-31 in {
+  def F#Index : FPR<Index, "f"#Index>,
+                DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// Floating-point vector subregisters (for VSX)
+foreach Index = 0-31 in {
+  def VF#Index : VF<Index, "vs" # !add(Index, 32)>;
+}
+
+// Vector registers
+foreach Index = 0-31 in {
+  def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>,
+                DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// VSX registers
+foreach Index = 0-31 in {
+  def VSL#Index : VSRL<!cast<FPR>("F"#Index), "vs"#Index>,
+                  DwarfRegAlias<!cast<FPR>("F"#Index)>;
+}
+foreach Index = 0-31 in {
+  def VSH#Index : VSRH<!cast<VR>("V"#Index), "vs" # !add(Index, 32)>,
+                  DwarfRegAlias<!cast<VR>("V"#Index)>;
+}
+
+// The representation of r0 when treated as the constant 0.
+def ZERO  : GPR<0, "0">;
+def ZERO8 : GP8<ZERO, "0">;
+
+// Representations of the frame pointer used by ISD::FRAMEADDR.
+def FP  : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8 : GP8<FP, "**FRAME POINTER**">;
+
+// Representations of the base pointer used by setjmp.
+def BP  : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8 : GP8<BP, "**BASE POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// The full condition-code register. This is not modeled fully, but defined
+// here primarily for compatibility with gcc, to allow the inline asm "cc"
+// clobber specification to work.
+def CC : PPCReg<"cc">, DwarfRegAlias<CR0> {
+  let Aliases = [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7];
+}
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
+
+// Carry bit. In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1); this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">;
+
+// FP rounding mode: bits 30 and 31 of the FP status and control register.
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs. The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers; it has to be done in code. To make this work all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+// Also, in the architecture it is not really an SPR; 512 is arbitrary.
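+// (Editorial example, not in the original: an instruction such as
+// "mtfsb1 31", which only sets a rounding-mode bit, defines RM but no
+// ordinary register; without the artificial Uses of RM on calls and
+// returns it would appear dead and could be deleted.)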
+def RM: SPR<512, "**ROUNDING MODE**">; + +/// Register classes +// Allocate volatiles first +// then nonvolatiles in reverse order since stmw/lmw save from rN to r31 +def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12), + (sequence "R%u", 30, 13), + R31, R0, R1, FP, BP)>; + +def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12), + (sequence "X%u", 30, 14), + X31, X13, X0, X1, FP8, BP8)>; + +// For some instructions r0 is special (representing the value 0 instead of +// the value in the r0 register), and we use these register subclasses to +// prevent r0 from being allocated for use by those instructions. +def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)>; +def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)>; + +// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4 +// ABI the size of the Floating-point register save area is determined by the +// allocated non-volatile register with the lowest register number, as FP +// register N is spilled to offset 8 * (32 - N) below the back chain word of the +// previous stack frame. By allocating non-volatiles in reverse order we make +// sure that the Floating-point register save area is always as small as +// possible because there aren't any unused spill slots. +def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13), + (sequence "F%u", 31, 14))>; +def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>; + +def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128, + (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11, + V12, V13, V14, V15, V16, V17, V18, V19, V31, V30, + V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>; + +// VSX register classes (the allocation order mirrors that of the corresponding +// subregister classes). +def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add (sequence "VSL%u", 0, 13), + (sequence "VSL%u", 31, 14))>; +def VSHRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add VSH2, VSH3, VSH4, VSH5, VSH0, VSH1, VSH6, VSH7, + VSH8, VSH9, VSH10, VSH11, VSH12, VSH13, VSH14, + VSH15, VSH16, VSH17, VSH18, VSH19, VSH31, VSH30, + VSH29, VSH28, VSH27, VSH26, VSH25, VSH24, VSH23, + VSH22, VSH21, VSH20)>; +def VSRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128, + (add VSLRC, VSHRC)>; + +// Register classes for the 64-bit "scalar" VSX subregisters. +def VFRC : RegisterClass<"PPC", [f64], 64, + (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7, + VF8, VF9, VF10, VF11, VF12, VF13, VF14, + VF15, VF16, VF17, VF18, VF19, VF31, VF30, + VF29, VF28, VF27, VF26, VF25, VF24, VF23, + VF22, VF21, VF20)>; +def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>; + +def CRBITRC : RegisterClass<"PPC", [i1], 32, + (add CR2LT, CR2GT, CR2EQ, CR2UN, + CR3LT, CR3GT, CR3EQ, CR3UN, + CR4LT, CR4GT, CR4EQ, CR4UN, + CR5LT, CR5GT, CR5EQ, CR5UN, + CR6LT, CR6GT, CR6EQ, CR6UN, + CR7LT, CR7GT, CR7EQ, CR7UN, + CR1LT, CR1GT, CR1EQ, CR1UN, + CR0LT, CR0GT, CR0EQ, CR0UN)> { + let Size = 32; +} + +def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6, + CR7, CR2, CR3, CR4)>; + +// The CTR registers are not allocatable because they're used by the +// decrement-and-branch instructions, and thus need to stay live across +// multiple basic blocks. 
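+// (Editorial example: a counted loop is closed by bdnz, which decrements
+// CTR and branches while it is nonzero, so CTR must remain live across
+// every block of the loop body rather than being locally reallocated.)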
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+  let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+  let isAllocatable = 0;
+}
+
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+  let CopyCost = -1;
+}
+
+def CCRC : RegisterClass<"PPC", [i32], 32, (add CC)> {
+  let isAllocatable = 0;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 000000000000..0b392f99b6d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===-- PPCRelocations.h - PPC Code Relocations -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCRELOCATIONS_H
+#define PPCRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+  namespace PPC {
+    enum RelocationType {
+      // reloc_vanilla - A standard relocation, where the address of the
+      // relocated object completely overwrites the address of the relocation.
+      reloc_vanilla,
+
+      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+      reloc_pcrel_bx,
+
+      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+      // and other bcx instructions.
+      reloc_pcrel_bcx,
+
+      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+      // (which is really addis). Add the high 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_high,
+
+      // reloc_absolute_low - Absolute relocation, for the la instruction (which
+      // is really an addi). Add the low 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_low,
+
+      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+      // instructions, which have two implicit zero bits.
+      reloc_absolute_low_ix
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 000000000000..1221d4149996
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,520 @@
+//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction Itinerary classes used for PowerPC +// +def IIC_IntSimple : InstrItinClass; +def IIC_IntGeneral : InstrItinClass; +def IIC_IntCompare : InstrItinClass; +def IIC_IntDivD : InstrItinClass; +def IIC_IntDivW : InstrItinClass; +def IIC_IntMFFS : InstrItinClass; +def IIC_IntMFVSCR : InstrItinClass; +def IIC_IntMTFSB0 : InstrItinClass; +def IIC_IntMTSRD : InstrItinClass; +def IIC_IntMulHD : InstrItinClass; +def IIC_IntMulHW : InstrItinClass; +def IIC_IntMulHWU : InstrItinClass; +def IIC_IntMulLI : InstrItinClass; +def IIC_IntRFID : InstrItinClass; +def IIC_IntRotateD : InstrItinClass; +def IIC_IntRotateDI : InstrItinClass; +def IIC_IntRotate : InstrItinClass; +def IIC_IntShift : InstrItinClass; +def IIC_IntTrapD : InstrItinClass; +def IIC_IntTrapW : InstrItinClass; +def IIC_BrB : InstrItinClass; +def IIC_BrCR : InstrItinClass; +def IIC_BrMCR : InstrItinClass; +def IIC_BrMCRX : InstrItinClass; +def IIC_LdStDCBA : InstrItinClass; +def IIC_LdStDCBF : InstrItinClass; +def IIC_LdStDCBI : InstrItinClass; +def IIC_LdStLoad : InstrItinClass; +def IIC_LdStLoadUpd : InstrItinClass; +def IIC_LdStLoadUpdX : InstrItinClass; +def IIC_LdStStore : InstrItinClass; +def IIC_LdStStoreUpd : InstrItinClass; +def IIC_LdStDSS : InstrItinClass; +def IIC_LdStICBI : InstrItinClass; +def IIC_LdStLD : InstrItinClass; +def IIC_LdStLDU : InstrItinClass; +def IIC_LdStLDUX : InstrItinClass; +def IIC_LdStLDARX : InstrItinClass; +def IIC_LdStLFD : InstrItinClass; +def IIC_LdStLFDU : InstrItinClass; +def IIC_LdStLFDUX : InstrItinClass; +def IIC_LdStLHA : InstrItinClass; +def IIC_LdStLHAU : InstrItinClass; +def IIC_LdStLHAUX : InstrItinClass; +def IIC_LdStLMW : InstrItinClass; +def IIC_LdStLVecX : InstrItinClass; +def IIC_LdStLWA : InstrItinClass; +def IIC_LdStLWARX : InstrItinClass; +def IIC_LdStSLBIA : InstrItinClass; +def IIC_LdStSLBIE : InstrItinClass; +def IIC_LdStSTD : InstrItinClass; +def IIC_LdStSTDCX : InstrItinClass; +def IIC_LdStSTDU : InstrItinClass; +def IIC_LdStSTDUX : InstrItinClass; +def IIC_LdStSTFD : InstrItinClass; +def IIC_LdStSTFDU : InstrItinClass; +def IIC_LdStSTVEBX : InstrItinClass; +def IIC_LdStSTWCX : InstrItinClass; +def IIC_LdStSync : InstrItinClass; +def IIC_SprISYNC : InstrItinClass; +def IIC_SprMFSR : InstrItinClass; +def IIC_SprMTMSR : InstrItinClass; +def IIC_SprMTSR : InstrItinClass; +def IIC_SprTLBSYNC : InstrItinClass; +def IIC_SprMFCR : InstrItinClass; +def IIC_SprMFCRF : InstrItinClass; +def IIC_SprMFMSR : InstrItinClass; +def IIC_SprMFSPR : InstrItinClass; +def IIC_SprMFTB : InstrItinClass; +def IIC_SprMTSPR : InstrItinClass; +def IIC_SprMTSRIN : InstrItinClass; +def IIC_SprRFI : InstrItinClass; +def IIC_SprSC : InstrItinClass; +def IIC_FPGeneral : InstrItinClass; +def IIC_FPAddSub : InstrItinClass; +def IIC_FPCompare : InstrItinClass; +def IIC_FPDivD : InstrItinClass; +def IIC_FPDivS : InstrItinClass; +def IIC_FPFused : InstrItinClass; +def IIC_FPRes : InstrItinClass; +def IIC_FPSqrtD : InstrItinClass; +def IIC_FPSqrtS : InstrItinClass; +def IIC_VecGeneral : InstrItinClass; +def IIC_VecFP : InstrItinClass; +def IIC_VecFPCompare : InstrItinClass; +def IIC_VecComplex : InstrItinClass; +def IIC_VecPerm : InstrItinClass; +def IIC_VecFPRound : InstrItinClass; +def IIC_VecVSL : InstrItinClass; +def IIC_VecVSR : InstrItinClass; +def IIC_SprMTMSRD : InstrItinClass; +def IIC_SprSLIE : InstrItinClass; +def IIC_SprSLBIE 
: InstrItinClass; +def IIC_SprSLBMTE : InstrItinClass; +def IIC_SprSLBMFEE : InstrItinClass; +def IIC_SprSLBIA : InstrItinClass; +def IIC_SprTLBIEL : InstrItinClass; +def IIC_SprTLBIE : InstrItinClass; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +include "PPCScheduleG3.td" +include "PPCSchedule440.td" +include "PPCScheduleG4.td" +include "PPCScheduleG4Plus.td" +include "PPCScheduleG5.td" +include "PPCScheduleP7.td" +include "PPCScheduleA2.td" +include "PPCScheduleE500mc.td" +include "PPCScheduleE5500.td" + +//===----------------------------------------------------------------------===// +// Instruction to itinerary class map - When add new opcodes to the supported +// set, refer to the following table to determine which itinerary class the +// opcode belongs. +// +// opcode itinerary class +// ====== =============== +// add IIC_IntSimple +// addc IIC_IntGeneral +// adde IIC_IntGeneral +// addi IIC_IntSimple +// addic IIC_IntGeneral +// addic. IIC_IntGeneral +// addis IIC_IntSimple +// addme IIC_IntGeneral +// addze IIC_IntGeneral +// and IIC_IntSimple +// andc IIC_IntSimple +// andi. IIC_IntGeneral +// andis. IIC_IntGeneral +// b IIC_BrB +// bc IIC_BrB +// bcctr IIC_BrB +// bclr IIC_BrB +// cmp IIC_IntCompare +// cmpi IIC_IntCompare +// cmpl IIC_IntCompare +// cmpli IIC_IntCompare +// cntlzd IIC_IntRotateD +// cntlzw IIC_IntGeneral +// crand IIC_BrCR +// crandc IIC_BrCR +// creqv IIC_BrCR +// crnand IIC_BrCR +// crnor IIC_BrCR +// cror IIC_BrCR +// crorc IIC_BrCR +// crxor IIC_BrCR +// dcba IIC_LdStDCBA +// dcbf IIC_LdStDCBF +// dcbi IIC_LdStDCBI +// dcbst IIC_LdStDCBF +// dcbt IIC_LdStLoad +// dcbtst IIC_LdStLoad +// dcbz IIC_LdStDCBF +// divd IIC_IntDivD +// divdu IIC_IntDivD +// divw IIC_IntDivW +// divwu IIC_IntDivW +// dss IIC_LdStDSS +// dst IIC_LdStDSS +// dstst IIC_LdStDSS +// eciwx IIC_LdStLoad +// ecowx IIC_LdStLoad +// eieio IIC_LdStLoad +// eqv IIC_IntSimple +// extsb IIC_IntSimple +// extsh IIC_IntSimple +// extsw IIC_IntSimple +// fabs IIC_FPGeneral +// fadd IIC_FPAddSub +// fadds IIC_FPGeneral +// fcfid IIC_FPGeneral +// fcmpo IIC_FPCompare +// fcmpu IIC_FPCompare +// fctid IIC_FPGeneral +// fctidz IIC_FPGeneral +// fctiw IIC_FPGeneral +// fctiwz IIC_FPGeneral +// fdiv IIC_FPDivD +// fdivs IIC_FPDivS +// fmadd IIC_FPFused +// fmadds IIC_FPGeneral +// fmr IIC_FPGeneral +// fmsub IIC_FPFused +// fmsubs IIC_FPGeneral +// fmul IIC_FPFused +// fmuls IIC_FPGeneral +// fnabs IIC_FPGeneral +// fneg IIC_FPGeneral +// fnmadd IIC_FPFused +// fnmadds IIC_FPGeneral +// fnmsub IIC_FPFused +// fnmsubs IIC_FPGeneral +// fres IIC_FPRes +// frsp IIC_FPGeneral +// frsqrte IIC_FPGeneral +// fsel IIC_FPGeneral +// fsqrt IIC_FPSqrtD +// fsqrts IIC_FPSqrtS +// fsub IIC_FPAddSub +// fsubs IIC_FPGeneral +// icbi IIC_LdStICBI +// isync IIC_SprISYNC +// lbz IIC_LdStLoad +// lbzu IIC_LdStLoadUpd +// lbzux IIC_LdStLoadUpdX +// lbzx IIC_LdStLoad +// ld IIC_LdStLD +// ldarx IIC_LdStLDARX +// ldu IIC_LdStLDU +// ldux IIC_LdStLDUX +// ldx IIC_LdStLD +// lfd IIC_LdStLFD +// lfdu IIC_LdStLFDU +// lfdux IIC_LdStLFDUX +// lfdx IIC_LdStLFD +// lfs IIC_LdStLFD +// lfsu IIC_LdStLFDU +// lfsux IIC_LdStLFDUX +// lfsx IIC_LdStLFD +// lha IIC_LdStLHA +// lhau IIC_LdStLHAU +// lhaux IIC_LdStLHAUX +// lhax IIC_LdStLHA +// lhbrx IIC_LdStLoad +// lhz IIC_LdStLoad +// lhzu IIC_LdStLoadUpd +// lhzux IIC_LdStLoadUpdX +// lhzx IIC_LdStLoad +// lmw IIC_LdStLMW +// lswi IIC_LdStLMW +// lswx IIC_LdStLMW +// lvebx IIC_LdStLVecX +// lvehx 
IIC_LdStLVecX +// lvewx IIC_LdStLVecX +// lvsl IIC_LdStLVecX +// lvsr IIC_LdStLVecX +// lvx IIC_LdStLVecX +// lvxl IIC_LdStLVecX +// lwa IIC_LdStLWA +// lwarx IIC_LdStLWARX +// lwaux IIC_LdStLHAUX +// lwax IIC_LdStLHA +// lwbrx IIC_LdStLoad +// lwz IIC_LdStLoad +// lwzu IIC_LdStLoadUpd +// lwzux IIC_LdStLoadUpdX +// lwzx IIC_LdStLoad +// mcrf IIC_BrMCR +// mcrfs IIC_FPGeneral +// mcrxr IIC_BrMCRX +// mfcr IIC_SprMFCR +// mffs IIC_IntMFFS +// mfmsr IIC_SprMFMSR +// mfspr IIC_SprMFSPR +// mfsr IIC_SprMFSR +// mfsrin IIC_SprMFSR +// mftb IIC_SprMFTB +// mfvscr IIC_IntMFVSCR +// mtcrf IIC_BrMCRX +// mtfsb0 IIC_IntMTFSB0 +// mtfsb1 IIC_IntMTFSB0 +// mtfsf IIC_IntMTFSB0 +// mtfsfi IIC_IntMTFSB0 +// mtmsr IIC_SprMTMSR +// mtmsrd IIC_LdStLD +// mtspr IIC_SprMTSPR +// mtsr IIC_SprMTSR +// mtsrd IIC_IntMTSRD +// mtsrdin IIC_IntMTSRD +// mtsrin IIC_SprMTSRIN +// mtvscr IIC_IntMFVSCR +// mulhd IIC_IntMulHD +// mulhdu IIC_IntMulHD +// mulhw IIC_IntMulHW +// mulhwu IIC_IntMulHWU +// mulld IIC_IntMulHD +// mulli IIC_IntMulLI +// mullw IIC_IntMulHW +// nand IIC_IntSimple +// neg IIC_IntSimple +// nor IIC_IntSimple +// or IIC_IntSimple +// orc IIC_IntSimple +// ori IIC_IntSimple +// oris IIC_IntSimple +// rfi IIC_SprRFI +// rfid IIC_IntRFID +// rldcl IIC_IntRotateD +// rldcr IIC_IntRotateD +// rldic IIC_IntRotateDI +// rldicl IIC_IntRotateDI +// rldicr IIC_IntRotateDI +// rldimi IIC_IntRotateDI +// rlwimi IIC_IntRotate +// rlwinm IIC_IntGeneral +// rlwnm IIC_IntGeneral +// sc IIC_SprSC +// slbia IIC_LdStSLBIA +// slbie IIC_LdStSLBIE +// sld IIC_IntRotateD +// slw IIC_IntGeneral +// srad IIC_IntRotateD +// sradi IIC_IntRotateDI +// sraw IIC_IntShift +// srawi IIC_IntShift +// srd IIC_IntRotateD +// srw IIC_IntGeneral +// stb IIC_LdStStore +// stbu IIC_LdStStoreUpd +// stbux IIC_LdStStoreUpd +// stbx IIC_LdStStore +// std IIC_LdStSTD +// stdcx. IIC_LdStSTDCX +// stdu IIC_LdStSTDU +// stdux IIC_LdStSTDUX +// stdx IIC_LdStSTD +// stfd IIC_LdStSTFD +// stfdu IIC_LdStSTFDU +// stfdux IIC_LdStSTFDU +// stfdx IIC_LdStSTFD +// stfiwx IIC_LdStSTFD +// stfs IIC_LdStSTFD +// stfsu IIC_LdStSTFDU +// stfsux IIC_LdStSTFDU +// stfsx IIC_LdStSTFD +// sth IIC_LdStStore +// sthbrx IIC_LdStStore +// sthu IIC_LdStStoreUpd +// sthux IIC_LdStStoreUpd +// sthx IIC_LdStStore +// stmw IIC_LdStLMW +// stswi IIC_LdStLMW +// stswx IIC_LdStLMW +// stvebx IIC_LdStSTVEBX +// stvehx IIC_LdStSTVEBX +// stvewx IIC_LdStSTVEBX +// stvx IIC_LdStSTVEBX +// stvxl IIC_LdStSTVEBX +// stw IIC_LdStStore +// stwbrx IIC_LdStStore +// stwcx. 
IIC_LdStSTWCX +// stwu IIC_LdStStoreUpd +// stwux IIC_LdStStoreUpd +// stwx IIC_LdStStore +// subf IIC_IntGeneral +// subfc IIC_IntGeneral +// subfe IIC_IntGeneral +// subfic IIC_IntGeneral +// subfme IIC_IntGeneral +// subfze IIC_IntGeneral +// sync IIC_LdStSync +// td IIC_IntTrapD +// tdi IIC_IntTrapD +// tlbia IIC_LdStSLBIA +// tlbie IIC_LdStDCBF +// tlbsync IIC_SprTLBSYNC +// tw IIC_IntTrapW +// twi IIC_IntTrapW +// vaddcuw IIC_VecGeneral +// vaddfp IIC_VecFP +// vaddsbs IIC_VecGeneral +// vaddshs IIC_VecGeneral +// vaddsws IIC_VecGeneral +// vaddubm IIC_VecGeneral +// vaddubs IIC_VecGeneral +// vadduhm IIC_VecGeneral +// vadduhs IIC_VecGeneral +// vadduwm IIC_VecGeneral +// vadduws IIC_VecGeneral +// vand IIC_VecGeneral +// vandc IIC_VecGeneral +// vavgsb IIC_VecGeneral +// vavgsh IIC_VecGeneral +// vavgsw IIC_VecGeneral +// vavgub IIC_VecGeneral +// vavguh IIC_VecGeneral +// vavguw IIC_VecGeneral +// vcfsx IIC_VecFP +// vcfux IIC_VecFP +// vcmpbfp IIC_VecFPCompare +// vcmpeqfp IIC_VecFPCompare +// vcmpequb IIC_VecGeneral +// vcmpequh IIC_VecGeneral +// vcmpequw IIC_VecGeneral +// vcmpgefp IIC_VecFPCompare +// vcmpgtfp IIC_VecFPCompare +// vcmpgtsb IIC_VecGeneral +// vcmpgtsh IIC_VecGeneral +// vcmpgtsw IIC_VecGeneral +// vcmpgtub IIC_VecGeneral +// vcmpgtuh IIC_VecGeneral +// vcmpgtuw IIC_VecGeneral +// vctsxs IIC_VecFP +// vctuxs IIC_VecFP +// vexptefp IIC_VecFP +// vlogefp IIC_VecFP +// vmaddfp IIC_VecFP +// vmaxfp IIC_VecFPCompare +// vmaxsb IIC_VecGeneral +// vmaxsh IIC_VecGeneral +// vmaxsw IIC_VecGeneral +// vmaxub IIC_VecGeneral +// vmaxuh IIC_VecGeneral +// vmaxuw IIC_VecGeneral +// vmhaddshs IIC_VecComplex +// vmhraddshs IIC_VecComplex +// vminfp IIC_VecFPCompare +// vminsb IIC_VecGeneral +// vminsh IIC_VecGeneral +// vminsw IIC_VecGeneral +// vminub IIC_VecGeneral +// vminuh IIC_VecGeneral +// vminuw IIC_VecGeneral +// vmladduhm IIC_VecComplex +// vmrghb IIC_VecPerm +// vmrghh IIC_VecPerm +// vmrghw IIC_VecPerm +// vmrglb IIC_VecPerm +// vmrglh IIC_VecPerm +// vmrglw IIC_VecPerm +// vmsubfp IIC_VecFP +// vmsummbm IIC_VecComplex +// vmsumshm IIC_VecComplex +// vmsumshs IIC_VecComplex +// vmsumubm IIC_VecComplex +// vmsumuhm IIC_VecComplex +// vmsumuhs IIC_VecComplex +// vmulesb IIC_VecComplex +// vmulesh IIC_VecComplex +// vmuleub IIC_VecComplex +// vmuleuh IIC_VecComplex +// vmulosb IIC_VecComplex +// vmulosh IIC_VecComplex +// vmuloub IIC_VecComplex +// vmulouh IIC_VecComplex +// vnor IIC_VecGeneral +// vor IIC_VecGeneral +// vperm IIC_VecPerm +// vpkpx IIC_VecPerm +// vpkshss IIC_VecPerm +// vpkshus IIC_VecPerm +// vpkswss IIC_VecPerm +// vpkswus IIC_VecPerm +// vpkuhum IIC_VecPerm +// vpkuhus IIC_VecPerm +// vpkuwum IIC_VecPerm +// vpkuwus IIC_VecPerm +// vrefp IIC_VecFPRound +// vrfim IIC_VecFPRound +// vrfin IIC_VecFPRound +// vrfip IIC_VecFPRound +// vrfiz IIC_VecFPRound +// vrlb IIC_VecGeneral +// vrlh IIC_VecGeneral +// vrlw IIC_VecGeneral +// vrsqrtefp IIC_VecFP +// vsel IIC_VecGeneral +// vsl IIC_VecVSL +// vslb IIC_VecGeneral +// vsldoi IIC_VecPerm +// vslh IIC_VecGeneral +// vslo IIC_VecPerm +// vslw IIC_VecGeneral +// vspltb IIC_VecPerm +// vsplth IIC_VecPerm +// vspltisb IIC_VecPerm +// vspltish IIC_VecPerm +// vspltisw IIC_VecPerm +// vspltw IIC_VecPerm +// vsr IIC_VecVSR +// vsrab IIC_VecGeneral +// vsrah IIC_VecGeneral +// vsraw IIC_VecGeneral +// vsrb IIC_VecGeneral +// vsrh IIC_VecGeneral +// vsro IIC_VecPerm +// vsrw IIC_VecGeneral +// vsubcuw IIC_VecGeneral +// vsubfp IIC_VecFP +// vsubsbs IIC_VecGeneral +// vsubshs IIC_VecGeneral +// vsubsws 
IIC_VecGeneral +// vsububm IIC_VecGeneral +// vsububs IIC_VecGeneral +// vsubuhm IIC_VecGeneral +// vsubuhs IIC_VecGeneral +// vsubuwm IIC_VecGeneral +// vsubuws IIC_VecGeneral +// vsum2sws IIC_VecComplex +// vsum4sbs IIC_VecComplex +// vsum4shs IIC_VecComplex +// vsum4ubs IIC_VecComplex +// vsumsws IIC_VecComplex +// vupkhpx IIC_VecPerm +// vupkhsb IIC_VecPerm +// vupkhsh IIC_VecPerm +// vupklpx IIC_VecPerm +// vupklsb IIC_VecPerm +// vupklsh IIC_VecPerm +// vxor IIC_VecGeneral +// xor IIC_IntSimple +// xori IIC_IntSimple +// xoris IIC_IntSimple +// diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td new file mode 100644 index 000000000000..218fed248a31 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td @@ -0,0 +1,599 @@ +//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Primary reference: +// PowerPC 440x6 Embedded Processor Core User's Manual. +// IBM (as updated in) 2010. + +// The basic PPC 440 does not include a floating-point unit; the pipeline +// timings here are constructed to match the FP2 unit shipped with the +// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers. +// References: +// S. Chatterjee, et al. Design and exploitation of a high-performance +// SIMD floating-point unit for Blue Gene/L. +// IBM J. Res. & Dev. 49 (2/3) March/May 2005. +// also: +// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution: +// Blue Gene/P Application Development. +// IBM (as updated in) 2009. 
+ +//===----------------------------------------------------------------------===// +// Functional units on the PowerPC 440/450 chip sets +// +def P440_DISS1 : FuncUnit; // Issue unit 1 +def P440_DISS2 : FuncUnit; // Issue unit 2 +def P440_LRACC : FuncUnit; // Register access and dispatch for + // the simple integer (J-pipe) and + // load/store (L-pipe) pipelines +def P440_IRACC : FuncUnit; // Register access and dispatch for + // the complex integer (I-pipe) pipeline +def P440_FRACC : FuncUnit; // Register access and dispatch for + // the floating-point execution (F-pipe) pipeline +def P440_IEXE1 : FuncUnit; // Execution stage 1 for the I pipeline +def P440_IEXE2 : FuncUnit; // Execution stage 2 for the I pipeline +def P440_IWB : FuncUnit; // Write-back unit for the I pipeline +def P440_JEXE1 : FuncUnit; // Execution stage 1 for the J pipeline +def P440_JEXE2 : FuncUnit; // Execution stage 2 for the J pipeline +def P440_JWB : FuncUnit; // Write-back unit for the J pipeline +def P440_AGEN : FuncUnit; // Address generation for the L pipeline +def P440_CRD : FuncUnit; // D-cache access for the L pipeline +def P440_LWB : FuncUnit; // Write-back unit for the L pipeline +def P440_FEXE1 : FuncUnit; // Execution stage 1 for the F pipeline +def P440_FEXE2 : FuncUnit; // Execution stage 2 for the F pipeline +def P440_FEXE3 : FuncUnit; // Execution stage 3 for the F pipeline +def P440_FEXE4 : FuncUnit; // Execution stage 4 for the F pipeline +def P440_FEXE5 : FuncUnit; // Execution stage 5 for the F pipeline +def P440_FEXE6 : FuncUnit; // Execution stage 6 for the F pipeline +def P440_FWB : FuncUnit; // Write-back unit for the F pipeline + +def P440_LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used + // to make sure that no lwarx/stwcx. + // instructions are issued while another + // lwarx/stwcx. is in the L pipe. + +def P440_GPR_Bypass : Bypass; // The bypass for general-purpose regs. +def P440_FPR_Bypass : Bypass; // The bypass for floating-point regs. + +// Notes: +// Instructions are held in the FRACC, LRACC and IRACC pipeline +// stages until their source operands become ready. Exceptions: +// - Store instructions will hold in the AGEN stage +// - The integer multiply-accumulate instruction will hold in +// the IEXE1 stage +// +// For most I-pipe operations, the result is available at the end of +// the IEXE1 stage. Operations such as multiply and divide must +// continue to execute in IEXE2 and IWB. Divide resides in IWB for +// 33 cycles (multiply also calculates its result in IWB). For all +// J-pipe instructions, the result is available +// at the end of the JEXE1 stage. Loads have a 3-cycle latency +// (data is not available until after the LWB stage). +// +// The L1 cache hit latency is four cycles for floating point loads +// and three cycles for integer loads. +// +// The stwcx. instruction requires both the LRACC and the IRACC +// dispatch stages. It must be issued from DISS0. +// +// All lwarx/stwcx. instructions hold in LRACC if another +// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB. +// +// msync (a.k.a. sync) and mbar will hold in LWB until all load/store +// resources are empty. AGEN and CRD are held empty until the msync/mbar +// commits. +// +// Most floating-point instructions, computational and move, +// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that +// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above, +// loads take 4 cycles (for L1 hit). 
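+//
+// (Editorial notes: "DISS0" above refers to the first dispatch slot, which
+// this file models as P440_DISS1. As a worked example of the notes, the
+// IIC_LdStLoad itinerary below spends one cycle each in DISS, LRACC, AGEN
+// and CRD and two in LWB, with the loaded result available at operand
+// cycle 5 and the address operands read at cycle 1.)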
+ +// +// This file defines the itinerary class data for the PPC 440 processor. +// +//===----------------------------------------------------------------------===// + + +def PPC440Itineraries : ProcessorItineraries< + [P440_DISS1, P440_DISS2, P440_FRACC, P440_IRACC, P440_IEXE1, P440_IEXE2, + P440_IWB, P440_LRACC, P440_JEXE1, P440_JEXE2, P440_JWB, P440_AGEN, P440_CRD, + P440_LWB, P440_FEXE1, P440_FEXE2, P440_FEXE3, P440_FEXE4, P440_FEXE5, + P440_FEXE6, P440_FWB, P440_LWARX_Hold], + [P440_GPR_Bypass, P440_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<33, [P440_IWB]>], + [36, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, [P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC, P440_LRACC]>, + InstrStage<1, [P440_IEXE1, P440_JEXE1]>, + InstrStage<1, [P440_IEXE2, P440_JEXE2]>, + InstrStage<1, 
[P440_IWB, P440_JWB]>], + [2, 0, 0], + [P440_GPR_Bypass, + P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0, 0], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 2, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 2, 1, 1], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStStore, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStICBI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [P440_DISS1, 
P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [1, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [5, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [5, 2, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [5, 2, 1, 1], + [NoBypass, P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHA, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLMW, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P440_DISS1]>, + InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDU, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<2, [P440_LWB]>], + [2, 1, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [P440_DISS1]>, + InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [P440_DISS1]>, + 
InstrStage<1, [P440_IRACC], 0>, + InstrStage<4, [P440_LWARX_Hold], 0>, + InstrStage<1, [P440_LRACC]>, + InstrStage<1, [P440_AGEN]>, + InstrStage<1, [P440_CRD]>, + InstrStage<1, [P440_LWB]>], + [4, 1, 1], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_LRACC]>, + InstrStage<3, [P440_AGEN], 1>, + InstrStage<2, [P440_CRD], 1>, + InstrStage<1, [P440_LWB]>]>, + InstrItinData<IIC_SprISYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC], 0>, + InstrStage<1, [P440_LRACC], 0>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_FEXE1], 0>, + InstrStage<1, [P440_AGEN], 0>, + InstrStage<1, [P440_JEXE1], 0>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_FEXE2], 0>, + InstrStage<1, [P440_CRD], 0>, + InstrStage<1, [P440_JEXE2], 0>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<6, [P440_FEXE3], 0>, + InstrStage<6, [P440_LWB], 0>, + InstrStage<6, [P440_JWB], 0>, + InstrStage<6, [P440_IWB]>]>, + InstrItinData<IIC_SprMFSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [2, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [5, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [3, 0], + [P440_GPR_Bypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<3, [P440_IWB]>], + [6, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprRFI, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + 
[NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_SprSC, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_IRACC]>, + InstrStage<1, [P440_IEXE1]>, + InstrStage<1, [P440_IEXE2]>, + InstrStage<1, [P440_IWB]>], + [4, 0], + [NoBypass, P440_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0], + [P440_FPR_Bypass, P440_FPR_Bypass, + P440_FPR_Bypass]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<25, [P440_FWB]>], + [31, 0, 0], + [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<13, [P440_FWB]>], + [19, 0, 0], + [NoBypass, P440_FPR_Bypass, P440_FPR_Bypass]>, + InstrItinData<IIC_FPFused, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0, 0, 0], + [P440_FPR_Bypass, + P440_FPR_Bypass, P440_FPR_Bypass, + P440_FPR_Bypass]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [P440_DISS1, P440_DISS2]>, + InstrStage<1, [P440_FRACC]>, + InstrStage<1, [P440_FEXE1]>, + InstrStage<1, [P440_FEXE2]>, + InstrStage<1, [P440_FEXE3]>, + InstrStage<1, [P440_FEXE4]>, + InstrStage<1, [P440_FEXE5]>, + InstrStage<1, [P440_FEXE6]>, + InstrStage<1, [P440_FWB]>], + [6, 0], + [P440_FPR_Bypass, P440_FPR_Bypass]> +]>; + +// ===---------------------------------------------------------------------===// +// PPC440 machine model for scheduling and other instruction cost heuristics. + +def PPC440Model : SchedMachineModel { + let IssueWidth = 2; // 2 instructions are dispatched per cycle. + let MinLatency = -1; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 5; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. 
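+  // (Editorial note: with MinLatency = -1, the per-operand cycle lists in the
+  // itineraries above -- e.g. the [5, 1, 1] on IIC_LdStLoad -- supply the
+  // scheduler's latencies; LoadLatency = 5 is only the fallback used when the
+  // itineraries are not queried.)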
+
+  let Itineraries = PPC440Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
new file mode 100644
index 000000000000..14476963bad0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
@@ -0,0 +1,169 @@
+//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// A2 Processor User's Manual.
+// IBM, as updated in 2010.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC A2 chip sets
+//
+def A2_XU : FuncUnit; // A2_XU pipeline
+def A2_FU : FuncUnit; // FP pipeline
+
+//
+// This file defines the itinerary class data for the PPC A2 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPCA2Itineraries : ProcessorItineraries<
+  [A2_XU, A2_FU], [], [
+  InstrItinData<IIC_IntSimple,   [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_IntGeneral,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntCompare,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntDivW,     [InstrStage<1, [A2_XU]>],
+                                 [39, 0, 0]>,
+  InstrItinData<IIC_IntDivD,     [InstrStage<1, [A2_XU]>],
+                                 [71, 0, 0]>,
+  InstrItinData<IIC_IntMulHW,    [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulHWU,   [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_IntMulLI,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_IntRotate,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateD,  [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntRotateDI, [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntShift,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0]>,
+  InstrItinData<IIC_IntTrapW,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_IntTrapD,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0]>,
+  InstrItinData<IIC_BrB,         [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_BrCR,        [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_BrMCR,       [InstrStage<1, [A2_XU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_BrMCRX,      [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBA,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStDCBI,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0, 0]>,
+  InstrItinData<IIC_LdStLoad,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLDU,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLDUX,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStStore,   [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStICBI,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0, 0]>,
+  InstrItinData<IIC_LdStSTFD,    [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTFDU,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStLFD,     [InstrStage<1, [A2_XU]>],
+                                 [7, 0, 0]>,
+  InstrItinData<IIC_LdStLFDU,    [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLFDUX,   [InstrStage<1, [A2_XU]>],
+                                 [7, 9, 0, 0]>,
+  InstrItinData<IIC_LdStLHA,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_LdStLHAU,    [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLHAUX,   [InstrStage<1, [A2_XU]>],
+                                 [6, 8, 0, 0]>,
+  InstrItinData<IIC_LdStLWARX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTD,     [InstrStage<1, [A2_XU]>],
+                                 [0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDU,    [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDUX,   [InstrStage<1, [A2_XU]>],
+                                 [2, 0, 0, 0]>,
+  InstrItinData<IIC_LdStSTDCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSTWCX,   [InstrStage<1, [A2_XU]>],
+                                 [82, 0, 0]>, // L2 latency
+  InstrItinData<IIC_LdStSync,    [InstrStage<1, [A2_XU]>],
+                                 [6]>,
+  InstrItinData<IIC_SprISYNC,    [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprMTMSR,    [InstrStage<1, [A2_XU]>],
+                                 [16, 0]>,
+  InstrItinData<IIC_SprMFCR,     [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFCRF,    [InstrStage<1, [A2_XU]>],
+                                 [1, 0]>,
+  InstrItinData<IIC_SprMFMSR,    [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMFSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprMFTB,     [InstrStage<1, [A2_XU]>],
+                                 [4, 0]>,
+  InstrItinData<IIC_SprMTSPR,    [InstrStage<1, [A2_XU]>],
+                                 [6, 0]>,
+  InstrItinData<IIC_SprRFI,      [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_SprSC,       [InstrStage<1, [A2_XU]>],
+                                 [16]>,
+  InstrItinData<IIC_FPGeneral,   [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPAddSub,    [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0]>,
+  InstrItinData<IIC_FPCompare,   [InstrStage<1, [A2_FU]>],
+                                 [5, 0, 0]>,
+  InstrItinData<IIC_FPDivD,      [InstrStage<1, [A2_FU]>],
+                                 [72, 0, 0]>,
+  InstrItinData<IIC_FPDivS,      [InstrStage<1, [A2_FU]>],
+                                 [59, 0, 0]>,
+  InstrItinData<IIC_FPSqrtD,     [InstrStage<1, [A2_FU]>],
+                                 [69, 0, 0]>,
+  InstrItinData<IIC_FPSqrtS,     [InstrStage<1, [A2_FU]>],
+                                 [65, 0, 0]>,
+  InstrItinData<IIC_FPFused,     [InstrStage<1, [A2_FU]>],
+                                 [6, 0, 0, 0]>,
+  InstrItinData<IIC_FPRes,       [InstrStage<1, [A2_FU]>],
+                                 [6, 0]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// A2 machine model for scheduling and other instruction cost heuristics.
+
+def PPCA2Model : SchedMachineModel {
+  let IssueWidth = 1;  // 1 instruction is dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13;
+
+  let Itineraries = PPCA2Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 000000000000..dab89e3db353
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,314 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// Power processor.
+//
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+// +//===----------------------------------------------------------------------===// +// Relevant functional units in the Freescale e500mc core: +// +// * Decode & Dispatch +// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue +// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ). +def E500_DIS0 : FuncUnit; // Dispatch stage - insn 1 +def E500_DIS1 : FuncUnit; // Dispatch stage - insn 2 + +// * Execute +// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX. +// Some instructions can only execute in SFX0 but not SFX1. +// The CFX has a bypass path, allowing non-divide instructions to execute +// while a divide instruction is executed. +def E500_SFX0 : FuncUnit; // Simple unit 0 +def E500_SFX1 : FuncUnit; // Simple unit 1 +def E500_BU : FuncUnit; // Branch unit +def E500_CFX_DivBypass + : FuncUnit; // CFX divide bypass path +def E500_CFX_0 : FuncUnit; // CFX pipeline +def E500_LSU_0 : FuncUnit; // LSU pipeline +def E500_FPU_0 : FuncUnit; // FPU pipeline + +def E500_GPR_Bypass : Bypass; +def E500_FPR_Bypass : Bypass; +def E500_CR_Bypass : Bypass; + +def PPCE500mcItineraries : ProcessorItineraries< + [E500_DIS0, E500_DIS1, E500_SFX0, E500_SFX1, E500_BU, E500_CFX_DivBypass, + E500_CFX_0, E500_LSU_0, E500_FPU_0], + [E500_CR_Bypass, E500_GPR_Bypass, E500_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [5, 1, 1], // Latency = 1 or 2 + [E500_CR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0], 0>, + InstrStage<14, [E500_CFX_DivBypass]>], + [17, 1, 1], // Latency=4..35, Repeat= 4..35 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<8, [E500_FPU_0]>], + [11], // Latency = 8 + [E500_FPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<8, [E500_FPU_0]>], + [11, 1, 1], // Latency = 8 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_CFX_0]>], + [7, 1, 1], // Latency = 4, Repeat rate = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + 
InstrItinData<IIC_IntTrapW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_SFX0]>], + [5, 1], // Latency = 2, Repeat rate = 2 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1], // Latency = 1 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1, 1], // Latency = 1 + [E500_CR_Bypass, + E500_CR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_BU]>], + [4, 1], // Latency = 1 + [E500_CR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1, 1], // Latency = 1 + [E500_CR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3, Repeat rate = 1 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStStore, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStICBI, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFD, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + 
InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1, 1], // Latency = 4 + [E500_FPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHA, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLMW, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [7, 1], // Latency = r+3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<3, [E500_LSU_0]>], + [6, 1, 1], // Latency = 3, Repeat rate = 3 + [E500_GPR_Bypass, + E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>], + [6, 1], // Latency = 3 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0]>]>, + InstrItinData<IIC_SprMFSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_SFX0, E500_SFX1]>], + [5, 1], // Latency = 2, Repeat rate = 4 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0]>], + [5, 1], + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_LSU_0], 0>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<5, [E500_SFX0]>], + [8, 1], + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<5, [E500_SFX0]>], + [8, 1], + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], // Latency = 4, Repeat rate = 4 + [E500_GPR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1], // Latency = 1, Repeat rate = 1 + [E500_GPR_Bypass, E500_CR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_SFX0]>], + [7, 1], // Latency = 4, Repeat rate = 4 + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0, E500_SFX1]>], + [4, 1], // Latency = 1, Repeat rate = 1 + [E500_CR_Bypass, E500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSRIN, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<1, [E500_SFX0]>], + [4, 1], + [NoBypass, E500_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<2, [E500_FPU_0]>], + [11, 1, 1], // Latency = 8, Repeat rate = 2 + [E500_FPR_Bypass, + E500_FPR_Bypass, E500_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>, + InstrStage<4, [E500_FPU_0]>], + [13, 1, 1], // 
Latency = 10, Repeat rate = 4
+                              [E500_FPR_Bypass,
+                               E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPCompare, [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                InstrStage<2, [E500_FPU_0]>],
+                               [11, 1, 1], // Latency = 8, Repeat rate = 2
+                               [E500_CR_Bypass,
+                                E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivD,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                InstrStage<68, [E500_FPU_0]>],
+                               [71, 1, 1], // Latency = 68, Repeat rate = 68
+                               [E500_FPR_Bypass,
+                                E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPDivS,    [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                InstrStage<38, [E500_FPU_0]>],
+                               [41, 1, 1], // Latency = 38, Repeat rate = 38
+                               [E500_FPR_Bypass,
+                                E500_FPR_Bypass, E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPFused,   [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                InstrStage<4, [E500_FPU_0]>],
+                               [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+                               [E500_FPR_Bypass,
+                                E500_FPR_Bypass, E500_FPR_Bypass,
+                                E500_FPR_Bypass]>,
+  InstrItinData<IIC_FPRes,     [InstrStage<1, [E500_DIS0, E500_DIS1], 0>,
+                                InstrStage<38, [E500_FPU_0]>],
+                               [41, 1], // Latency = 38, Repeat rate = 38
+                               [E500_FPR_Bypass, E500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE500mcItineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 000000000000..de097d9d8cf5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,374 @@
+//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit
+// Power processor.
+//
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+// * Decode & Dispatch
+//   Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//   queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def E5500_DIS0 : FuncUnit;
+def E5500_DIS1 : FuncUnit;
+
+// * Execute
+//   6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//   The CFX has a bypass path, allowing non-divide instructions to execute
+//   while a divide instruction is being executed.
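+//
+//   For illustration, this bypass is modeled below by holding the CFX issue
+//   stage for one cycle with a zero-cycle advance to the next stage, which
+//   then ties up only the E5500_CFX_DivBypass unit; independent non-divide
+//   CFX instructions can therefore still issue while a divide completes.
+//   A minimal sketch of the pattern (IIC_Example is a hypothetical class
+//   name; the real IIC_IntDivW entry below has this shape):
+//
+//     InstrItinData<IIC_Example, [InstrStage<1, [E5500_CFX_0], 0>,
+//                                 InstrStage<16, [E5500_CFX_DivBypass]>],
+//                   [20, 2, 2]>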
+def E5500_SFX0 : FuncUnit; // Simple unit 0 +def E5500_SFX1 : FuncUnit; // Simple unit 1 +def E5500_BU : FuncUnit; // Branch unit +def E5500_CFX_DivBypass + : FuncUnit; // CFX divide bypass path +def E5500_CFX_0 : FuncUnit; // CFX pipeline stage 0 + +def E5500_CFX_1 : FuncUnit; // CFX pipeline stage 1 + +def E5500_LSU_0 : FuncUnit; // LSU pipeline +def E5500_FPU_0 : FuncUnit; // FPU pipeline + +def E5500_GPR_Bypass : Bypass; +def E5500_FPR_Bypass : Bypass; +def E5500_CR_Bypass : Bypass; + +def PPCE5500Itineraries : ProcessorItineraries< + [E5500_DIS0, E5500_DIS1, E5500_SFX0, E5500_SFX1, E5500_BU, + E5500_CFX_DivBypass, E5500_CFX_0, E5500_CFX_1, + E5500_LSU_0, E5500_FPU_0], + [E5500_CR_Bypass, E5500_GPR_Bypass, E5500_FPR_Bypass], [ + InstrItinData<IIC_IntSimple, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 1 or 2 + [E5500_CR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<26, [E5500_CFX_DivBypass]>], + [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntDivW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<16, [E5500_CFX_DivBypass]>], + [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMFFS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass]>, + InstrItinData<IIC_IntMTFSB0, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<7, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 7 + [NoBypass, NoBypass, NoBypass]>, + InstrItinData<IIC_IntMulHD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<2, [E5500_CFX_1]>], + [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<1, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulHWU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<1, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntMulLI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0], 0>, + InstrStage<2, [E5500_CFX_1]>], + [8, 2, 2], // Latency = 4 or 5, Repeat = 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotate, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotateD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + 
InstrStage<2, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntRotateDI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5, 2, 2], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntShift, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_SFX0, E5500_SFX1]>], + [6, 2, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_IntTrapW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_SFX0]>], + [6, 2], // Latency = 2, Repeat rate = 2 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_BrB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2], // Latency = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_BrCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2, 2], // Latency = 1 + [E5500_CR_Bypass, + E5500_CR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_BrMCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_BU]>], + [5, 2], // Latency = 1 + [E5500_CR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_BrMCRX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0]>], + [5, 2, 2], // Latency = 1 + [E5500_CR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStDCBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoad, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLoadUpd, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLoadUpdX,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLDARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<3, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency 
= 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStStore, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStStoreUpd,[InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStICBI, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLFDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [8, 2, 2], // Latency = 4, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHA, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLHAU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLHAUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [E5500_GPR_Bypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStLMW, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_LSU_0]>], + [8, 2], // Latency = r+3, Repeat rate = r+3 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<3, [E5500_LSU_0]>], + [7, 2, 2], // Latency = 3, Repeat rate = 3 + [E5500_GPR_Bypass, + E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSTDU, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, 
E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStSTDUX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass], + 2>, // 2 micro-ops + InstrItinData<IIC_LdStSTWCX, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>], + [7, 2], // Latency = 3, Repeat rate = 1 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_LdStSync, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0]>]>, + InstrItinData<IIC_SprMTMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_CFX_0]>], + [6, 2], // Latency = 2, Repeat rate = 4 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprTLBSYNC, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_LSU_0], 0>]>, + InstrItinData<IIC_SprMFCR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<5, [E5500_CFX_0]>], + [9, 2], // Latency = 5, Repeat rate = 5 + [E5500_GPR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_SprMFCRF, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<5, [E5500_CFX_0]>], + [9, 2], // Latency = 5, Repeat rate = 5 + [E5500_GPR_Bypass, E5500_CR_Bypass]>, + InstrItinData<IIC_SprMFMSR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_SFX0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [E5500_GPR_Bypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMFSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_CFX_0]>], + [5], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMFTB, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<4, [E5500_CFX_0]>], + [8, 2], // Latency = 4, Repeat rate = 4 + [NoBypass, E5500_GPR_Bypass]>, + InstrItinData<IIC_SprMTSPR, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_SFX0, E5500_SFX1]>], + [5], // Latency = 1, Repeat rate = 1 + [E5500_GPR_Bypass]>, + InstrItinData<IIC_FPGeneral, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPAddSub, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPCompare, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_CR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPDivD, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<31, [E5500_FPU_0]>], + [39, 2, 2], // Latency = 35, Repeat rate = 31 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPDivS, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<16, [E5500_FPU_0]>], + [24, 2, 2], // Latency = 20, Repeat rate = 16 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass]>, + InstrItinData<IIC_FPFused, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<1, [E5500_FPU_0]>], + [11, 2, 2, 2], // Latency = 7, Repeat rate = 1 + [E5500_FPR_Bypass, + E5500_FPR_Bypass, E5500_FPR_Bypass, + E5500_FPR_Bypass]>, + InstrItinData<IIC_FPRes, [InstrStage<1, [E5500_DIS0, E5500_DIS1], 0>, + InstrStage<2, [E5500_FPU_0]>], + [12, 2], // Latency = 8, Repeat rate 
= 2
+                              [E5500_FPR_Bypass, E5500_FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE5500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 000000000000..21efd8f8f6c9
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,80 @@
+//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G3_BPU  : FuncUnit; // Branch unit
+def G3_SLU  : FuncUnit; // Store/load unit
+def G3_SRU  : FuncUnit; // special register unit
+def G3_IU1  : FuncUnit; // integer unit 1 (simple)
+def G3_IU2  : FuncUnit; // integer unit 2 (complex)
+def G3_FPU1 : FuncUnit; // floating point unit 1
+
+def G3Itineraries : ProcessorItineraries<
+  [G3_IU1, G3_IU2, G3_FPU1, G3_BPU, G3_SRU, G3_SLU], [], [
+  InstrItinData<IIC_IntSimple   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntGeneral  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntCompare  , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntDivW     , [InstrStage<19, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMFFS     , [InstrStage<1, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMTFSB0   , [InstrStage<3, [G3_FPU1]>]>,
+  InstrItinData<IIC_IntMulHW    , [InstrStage<5, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulHWU   , [InstrStage<6, [G3_IU1]>]>,
+  InstrItinData<IIC_IntMulLI    , [InstrStage<3, [G3_IU1]>]>,
+  InstrItinData<IIC_IntRotate   , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntShift    , [InstrStage<1, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_IntTrapW    , [InstrStage<2, [G3_IU1, G3_IU2]>]>,
+  InstrItinData<IIC_BrB         , [InstrStage<1, [G3_BPU]>]>,
+  InstrItinData<IIC_BrCR        , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCR       , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_BrMCRX      , [InstrStage<1, [G3_SRU]>]>,
+  InstrItinData<IIC_LdStDCBA    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBF    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStDCBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoad    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStStore   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStICBI    , [InstrStage<3, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFD    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStSTFDU   , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFD     , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDU    , [InstrStage<2, [G3_SLU]>]>,
+  InstrItinData<IIC_LdStLFDUX   , [InstrStage<2, [G3_SLU]>]>,
+
InstrItinData<IIC_LdStLHA , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G3_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<34, [G3_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<8, [G3_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<3, [G3_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMFMSR , [InstrStage<1, [G3_SRU]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<3, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_SprSC , [InstrStage<2, [G3_SRU]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [G3_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<31, [G3_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<17, [G3_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<2, [G3_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<10, [G3_FPU1]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td new file mode 100644 index 000000000000..340773ef7876 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td @@ -0,0 +1,96 @@ +//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4 (7400) processor. 
+// +//===----------------------------------------------------------------------===// + +def G4_BPU : FuncUnit; // Branch unit +def G4_SLU : FuncUnit; // Store/load unit +def G4_SRU : FuncUnit; // special register unit +def G4_IU1 : FuncUnit; // integer unit 1 (simple) +def G4_IU2 : FuncUnit; // integer unit 2 (complex) +def G4_FPU1 : FuncUnit; // floating point unit 1 +def G4_VPU : FuncUnit; // vector permutation unit +def G4_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G4_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G4_VFPU : FuncUnit; // vector floating point unit + +def G4Itineraries : ProcessorItineraries< + [G4_IU1, G4_IU2, G4_SLU, G4_SRU, G4_BPU, G4_FPU1, + G4_VIU1, G4_VIU2, G4_VPU, G4_VFPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<19, [G4_IU1]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<3, [G4_FPU1]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<3, [G4_FPU1]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<5, [G4_IU1]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<6, [G4_IU1]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4_IU1]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4_IU1, G4_IU2]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G4_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStDCBI , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<34, [G4_SLU]>]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<2, [G4_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<5, [G4_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<8, [G4_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<3, [G4_SRU]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<8, [G4_SRU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMFMSR , 
[InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<3, [G4_SRU]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<1, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_SprSC , [InstrStage<2, [G4_SRU]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<31, [G4_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<17, [G4_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<1, [G4_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<10, [G4_FPU1]>]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecFP , [InstrStage<4, [G4_VFPU]>]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecComplex , [InstrStage<3, [G4_VIU2]>]>, + InstrItinData<IIC_VecPerm , [InstrStage<1, [G4_VPU]>]>, + InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4_VFPU]>]>, + InstrItinData<IIC_VecVSL , [InstrStage<1, [G4_VIU1]>]>, + InstrItinData<IIC_VecVSR , [InstrStage<1, [G4_VIU1]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td new file mode 100644 index 000000000000..1d9f13fcb850 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td @@ -0,0 +1,112 @@ +//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G4+ (7450) processor. 
+// +//===----------------------------------------------------------------------===// + +def G4P_BPU : FuncUnit; // Branch unit +def G4P_SLU : FuncUnit; // Store/load unit +def G4P_SRU : FuncUnit; // special register unit +def G4P_IU1 : FuncUnit; // integer unit 1 (simple) +def G4P_IU2 : FuncUnit; // integer unit 2 (complex) +def G4P_IU3 : FuncUnit; // integer unit 3 (simple) +def G4P_IU4 : FuncUnit; // integer unit 4 (simple) +def G4P_FPU1 : FuncUnit; // floating point unit 1 +def G4P_VPU : FuncUnit; // vector permutation unit +def G4P_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G4P_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G4P_VFPU : FuncUnit; // vector floating point unit + +def G4PlusItineraries : ProcessorItineraries< + [G4P_IU1, G4P_IU2, G4P_IU3, G4P_IU4, G4P_BPU, G4P_SLU, G4P_FPU1, + G4P_VFPU, G4P_VIU1, G4P_VIU2, G4P_VPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<23, [G4P_IU2]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<2, [G4P_VFPU]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntShift , [InstrStage<2, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<2, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G4P_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStDCBI , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<4, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<37, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLWA , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStLWARX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTD , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDCX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTDUX , 
[InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_LdStSync , [InstrStage<35, [G4P_SLU]>]>, + InstrItinData<IIC_SprISYNC , [InstrStage<0, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_SprMFSR , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTMSR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprTLBSYNC , [InstrStage<3, [G4P_SLU]>]>, + InstrItinData<IIC_SprMFCR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFMSR , [InstrStage<3, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFSPR , [InstrStage<4, [G4P_IU2]>]>, + InstrItinData<IIC_SprMFTB , [InstrStage<5, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprMTSRIN , [InstrStage<2, [G4P_IU2]>]>, + InstrItinData<IIC_SprRFI , [InstrStage<1, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_SprSC , [InstrStage<0, [G4P_IU1, G4P_IU2, + G4P_IU3, G4P_IU4]>]>, + InstrItinData<IIC_FPGeneral , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPAddSub , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPCompare , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPDivD , [InstrStage<35, [G4P_FPU1]>]>, + InstrItinData<IIC_FPDivS , [InstrStage<21, [G4P_FPU1]>]>, + InstrItinData<IIC_FPFused , [InstrStage<5, [G4P_FPU1]>]>, + InstrItinData<IIC_FPRes , [InstrStage<14, [G4P_FPU1]>]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [G4P_VIU1]>]>, + InstrItinData<IIC_VecFP , [InstrStage<4, [G4P_VFPU]>]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G4P_VFPU]>]>, + InstrItinData<IIC_VecComplex , [InstrStage<4, [G4P_VIU2]>]>, + InstrItinData<IIC_VecPerm , [InstrStage<2, [G4P_VPU]>]>, + InstrItinData<IIC_VecFPRound , [InstrStage<4, [G4P_VIU1]>]>, + InstrItinData<IIC_VecVSL , [InstrStage<2, [G4P_VPU]>]>, + InstrItinData<IIC_VecVSR , [InstrStage<2, [G4P_VPU]>]> +]>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td new file mode 100644 index 000000000000..a3b73ab4454f --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td @@ -0,0 +1,129 @@ +//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the G5 (970) processor. 
+// +//===----------------------------------------------------------------------===// + +def G5_BPU : FuncUnit; // Branch unit +def G5_SLU : FuncUnit; // Store/load unit +def G5_SRU : FuncUnit; // special register unit +def G5_IU1 : FuncUnit; // integer unit 1 (simple) +def G5_IU2 : FuncUnit; // integer unit 2 (complex) +def G5_FPU1 : FuncUnit; // floating point unit 1 +def G5_FPU2 : FuncUnit; // floating point unit 2 +def G5_VPU : FuncUnit; // vector permutation unit +def G5_VIU1 : FuncUnit; // vector integer unit 1 (simple) +def G5_VIU2 : FuncUnit; // vector integer unit 2 (complex) +def G5_VFPU : FuncUnit; // vector floating point unit + +def G5Itineraries : ProcessorItineraries< + [G5_IU1, G5_IU2, G5_SLU, G5_BPU, G5_FPU1, G5_FPU2, + G5_VFPU, G5_VIU1, G5_VIU2, G5_VPU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntGeneral , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntCompare , [InstrStage<3, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntDivD , [InstrStage<68, [G5_IU1]>]>, + InstrItinData<IIC_IntDivW , [InstrStage<36, [G5_IU1]>]>, + InstrItinData<IIC_IntMFFS , [InstrStage<6, [G5_IU2]>]>, + InstrItinData<IIC_IntMFVSCR , [InstrStage<1, [G5_VFPU]>]>, + InstrItinData<IIC_IntMTFSB0 , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>, + InstrItinData<IIC_IntMulHD , [InstrStage<7, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulHW , [InstrStage<5, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<5, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntMulLI , [InstrStage<4, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRFID , [InstrStage<1, [G5_IU2]>]>, + InstrItinData<IIC_IntRotateD , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRotateDI , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntRotate , [InstrStage<4, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntShift , [InstrStage<2, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntTrapD , [InstrStage<1, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_IntTrapW , [InstrStage<1, [G5_IU1, G5_IU2]>]>, + InstrItinData<IIC_BrB , [InstrStage<1, [G5_BPU]>]>, + InstrItinData<IIC_BrCR , [InstrStage<4, [G5_BPU]>]>, + InstrItinData<IIC_BrMCR , [InstrStage<2, [G5_BPU]>]>, + InstrItinData<IIC_BrMCRX , [InstrStage<3, [G5_BPU]>]>, + InstrItinData<IIC_LdStDCBF , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoad , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStStore , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStStoreUpd, [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStDSS , [InstrStage<10, [G5_SLU]>]>, + InstrItinData<IIC_LdStICBI , [InstrStage<40, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<4, [G5_SLU]>]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<4, [G5_SLU]>]>, + InstrItinData<IIC_LdStLD , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDU , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDUX , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLDARX , [InstrStage<11, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFD , [InstrStage<3, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHA , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<5, [G5_SLU]>]>, + InstrItinData<IIC_LdStLMW , [InstrStage<64, [G5_SLU]>]>, + InstrItinData<IIC_LdStLVecX , 
[InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWA     , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStLWARX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSLBIA   , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_LdStSLBIE   , [InstrStage<2, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTD     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDU    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDUX   , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTDCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTVEBX  , [InstrStage<5, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSTWCX   , [InstrStage<11, [G5_SLU]>]>,
+  InstrItinData<IIC_LdStSync    , [InstrStage<35, [G5_SLU]>]>,
+  InstrItinData<IIC_SprISYNC    , [InstrStage<40, [G5_SLU]>]>, // needs work
+  InstrItinData<IIC_SprMFSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTMSR    , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMTSR     , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprTLBSYNC  , [InstrStage<3, [G5_SLU]>]>,
+  InstrItinData<IIC_SprMFCR     , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFCRF    , [InstrStage<2, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFMSR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFSPR    , [InstrStage<3, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMFTB     , [InstrStage<10, [G5_IU2]>]>,
+  InstrItinData<IIC_SprMTSPR    , [InstrStage<8, [G5_IU2]>]>,
+  InstrItinData<IIC_SprSC       , [InstrStage<1, [G5_IU2]>]>,
+  InstrItinData<IIC_FPGeneral   , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPAddSub    , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPCompare   , [InstrStage<8, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivD      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPDivS      , [InstrStage<33, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPFused     , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPRes       , [InstrStage<6, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtD     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_FPSqrtS     , [InstrStage<40, [G5_FPU1, G5_FPU2]>]>,
+  InstrItinData<IIC_VecGeneral  , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecFP       , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecFPCompare, [InstrStage<2, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecComplex  , [InstrStage<5, [G5_VIU2]>]>,
+  InstrItinData<IIC_VecPerm     , [InstrStage<3, [G5_VPU]>]>,
+  InstrItinData<IIC_VecFPRound  , [InstrStage<8, [G5_VFPU]>]>,
+  InstrItinData<IIC_VecVSL      , [InstrStage<2, [G5_VIU1]>]>,
+  InstrItinData<IIC_VecVSR      , [InstrStage<3, [G5_VPU]>]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// G5 machine model for scheduling and other instruction cost heuristics.
+
+def G5Model : SchedMachineModel {
+  let IssueWidth = 4;  // 4 (non-branch) instructions are dispatched per cycle.
+  let MinLatency = 0;  // Out-of-order dispatch.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overridden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  let Itineraries = G5Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
new file mode 100644
index 000000000000..d3e426975ec0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP7.td
@@ -0,0 +1,385 @@
+//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the POWER7 processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// IBM POWER7 multicore server processor
+// B. Sinharoy, et al.
+// IBM J. Res. & Dev. (55) 3. May/June 2011.
+
+// Scheduling for the P7 involves tracking two types of resources:
+// 1. The dispatch bundle slots
+// 2. The functional unit resources
+
+// Dispatch units:
+def P7_DU1 : FuncUnit;
+def P7_DU2 : FuncUnit;
+def P7_DU3 : FuncUnit;
+def P7_DU4 : FuncUnit;
+def P7_DU5 : FuncUnit;
+def P7_DU6 : FuncUnit;
+
+def P7_LS1 : FuncUnit; // Load/Store pipeline 1
+def P7_LS2 : FuncUnit; // Load/Store pipeline 2
+
+def P7_FX1 : FuncUnit; // FX pipeline 1
+def P7_FX2 : FuncUnit; // FX pipeline 2
+
+// VS pipeline 1 (vector integer ops. always here)
+def P7_VS1 : FuncUnit; // VS pipeline 1
+// VS pipeline 2 (128-bit stores and perms. here)
+def P7_VS2 : FuncUnit; // VS pipeline 2
+
+def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
+def P7_BRU : FuncUnit; // BR unit
+
+// Notes:
+// Each LSU pipeline can also execute FX add and logical instructions.
+// Each LSU pipeline can complete a load or store in one cycle.
+//
+// Each store is broken into two parts: AGEN goes to the LSU while a
+// "data steering" op. goes to the FXU or VSU.
+//
+// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
+// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
+//
+// Frequent FX ops. take only one cycle and results can be used again in the
+// next cycle (there is a self-bypass). Getting results from the other FX
+// pipeline takes an additional cycle.
+//
+// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
+// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
+// Dispatch of an instruction to VS1 that uses four single prec. inputs
+// (either to a float or XC op) prevents dispatch in that cycle to VS2 of any
+// floating point instruction.
+//
+// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
+// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
+// (unlike on the POWER6).
+//
+// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
+// share the same write-back, and have a 5-cycle latency difference, so the
+// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
+// op. has been dispatched to VS1.
+//
+// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
+//
+// Instruction dispatch groups have (at most) four non-branch instructions, and
+// two branches. Unlike on the POWER4/5, a branch does not automatically
+// end the dispatch group, but a second branch must be the last in the group.
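+//
+// For illustration, the dispatch bundle slots are tracked as FuncUnits: an
+// instruction that consumes several slots reserves the corresponding P7_DUx
+// units through zero-cycle-advance InstrStages ahead of its execution stage.
+// A minimal sketch of the two-slot pattern (IIC_Example is a hypothetical
+// class name; the real IIC_IntDivW entry below has this shape):
+//
+//   InstrItinData<IIC_Example, [InstrStage<1, [P7_DU1], 0>,
+//                               InstrStage<1, [P7_DU2], 0>,
+//                               InstrStage<36, [P7_FX1, P7_FX2]>],
+//                 [36, 1, 1]>
+//
+// BR-unit itineraries (e.g. IIC_BrB) use only P7_DU5/P7_DU6, matching the
+// note above that a second branch must be the last instruction in a group.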
+ +def P7Itineraries : ProcessorItineraries< + [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6, + P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [ + InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2, + P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + // FIXME: Add record-form itinerary data. + InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<36, [P7_FX1, P7_FX2]>], + [36, 1, 1]>, + InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<68, [P7_FX1, P7_FX2]>], + [68, 1, 1]>, + InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 1, 1]>, + InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1]>, + InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>, + InstrStage<1, [P7_BRU]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 2, 1, 1]>, + InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, 
P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 3, 1, 1]>, + InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [4, 4, 1, 1]>, + InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [3, 1, 1]>, + InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [2, 1, 1]>, + InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_FX1, P7_FX2], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [2, 1, 1, 1]>, + InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2], 0>, + InstrStage<1, [P7_VS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTDCX , 
[InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_LS1, P7_LS2]>], + [1, 1, 1]>, + InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_DU2], 0>, + InstrStage<1, [P7_DU3], 0>, + InstrStage<1, [P7_DU4], 0>, + InstrStage<1, [P7_CRU]>, + InstrStage<1, [P7_FX1, P7_FX2]>], + [3, 1]>, // mtcr + InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [6, 1]>, + InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_CRU]>], + [3, 1]>, + InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_FX1]>], + [4, 1]>, // mtctr + InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [8, 1, 1]>, + InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [33, 1, 1]>, + InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [27, 1, 1]>, + InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [44, 1, 1]>, + InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [32, 1, 1]>, + InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1, 1]>, + InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2, + P7_DU3, P7_DU4], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [5, 1, 1]>, + InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [2, 1, 1]>, + InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1, P7_VS2]>], + [6, 1, 1]>, + InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>, + InstrStage<1, [P7_VS1]>], + [7, 1, 1]>, + InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>, + InstrStage<1, [P7_VS2]>], + [3, 1, 1]> +]>; + +// ===---------------------------------------------------------------------===// +// P7 machine model for scheduling and other instruction cost heuristics. + +def P7Model : SchedMachineModel { + let IssueWidth = 6; // 6 instructions (at most 4 non-branch) are dispatched + // per cycle. Note that the dispatch bundle size is 6 + // (including branches), but the total internal issue + // bandwidth per cycle (from all queues) is 8. + + let MinLatency = 0; // Out-of-order dispatch. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overridden by OperandCycles if the + // Itineraries are queried instead.
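+ // For example, the [3, 1, 1] operand-cycle list on IIC_LdStLFD above + // encodes this same three-cycle load-to-use latency for FP loads when + // the itineraries are used.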
+ let MispredictPenalty = 16; + + let Itineraries = P7Itineraries; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp new file mode 100644 index 000000000000..dc1674214769 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -0,0 +1,22 @@ +//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPCSelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "PPCTargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "powerpc-selectiondag-info" + +PPCSelectionDAGInfo::PPCSelectionDAGInfo(const DataLayout *DL) + : TargetSelectionDAGInfo(DL) {} + +PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h new file mode 100644 index 000000000000..b2e7f3b5f2ac --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -0,0 +1,31 @@ +//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the PowerPC subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPCCSELECTIONDAGINFO_H +#define POWERPCCSELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class PPCTargetMachine; + +class PPCSelectionDAGInfo : public TargetSelectionDAGInfo { +public: + explicit PPCSelectionDAGInfo(const DataLayout *DL); + ~PPCSelectionDAGInfo(); +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp new file mode 100644 index 000000000000..b51512d335fc --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -0,0 +1,277 @@ +//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the PPC specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#include "PPCSubtarget.h" +#include "PPC.h" +#include "PPCRegisterInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include <cstdlib> + +using namespace llvm; + +#define DEBUG_TYPE "ppc-subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "PPCGenSubtargetInfo.inc" + +/// Return the datalayout string of a subtarget. 
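+/// For a 64-bit little-endian SVR4 subtarget, for instance, the branches +/// below are expected to yield something like "e-m:e-i64:64-n32:64" (an +/// illustrative value derived by following the logic; not normative).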
+static std::string getDataLayoutString(const PPCSubtarget &ST) { + const Triple &T = ST.getTargetTriple(); + + std::string Ret; + + // Most PPC* platforms are big endian, PPC64LE is little endian. + if (ST.isLittleEndian()) + Ret = "e"; + else + Ret = "E"; + + Ret += DataLayout::getManglingComponent(T); + + // PPC32 has 32 bit pointers. The PS3 (OS Lv2) is a PPC64 machine with 32 bit + // pointers. + if (!ST.isPPC64() || T.getOS() == Triple::Lv2) + Ret += "-p:32:32"; + + // Note, the alignment values for f64 and i64 on ppc64 in Darwin + // documentation are wrong; these are correct (i.e. "what gcc does"). + if (ST.isPPC64() || ST.isSVR4ABI()) + Ret += "-i64:64"; + else + Ret += "-f64:32:64"; + + // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. + if (ST.isPPC64()) + Ret += "-n32:64"; + else + Ret += "-n32"; + + return Ret; +} + +PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, + StringRef FS) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + return *this; +} + +PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, PPCTargetMachine &TM, + bool is64Bit, CodeGenOpt::Level OptLevel) + : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT), + OptLevel(OptLevel), + FrameLowering(initializeSubtargetDependencies(CPU, FS)), + DL(getDataLayoutString(*this)), InstrInfo(*this), JITInfo(*this), + TLInfo(TM), TSInfo(&DL) {} + +/// SetJITMode - This is called to inform the subtarget info that we are +/// producing code for the JIT. +void PPCSubtarget::SetJITMode() { + // JIT mode doesn't want lazy resolver stubs, it knows exactly where + // everything is. This matters for PPC64, which codegens in PIC mode without + // stubs. + HasLazyResolverStubs = false; + + // Calls to external functions need to use indirect calls + IsJITCodeModel = true; +} + +void PPCSubtarget::resetSubtargetFeatures(const MachineFunction *MF) { + AttributeSet FnAttrs = MF->getFunction()->getAttributes(); + Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-cpu"); + Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-features"); + std::string CPU = + !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : ""; + std::string FS = + !FSAttr.hasAttribute(Attribute::None) ? 
FSAttr.getValueAsString() : ""; + if (!FS.empty()) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + } +} + +void PPCSubtarget::initializeEnvironment() { + StackAlignment = 16; + DarwinDirective = PPC::DIR_NONE; + HasMFOCRF = false; + Has64BitSupport = false; + Use64BitRegs = false; + UseCRBits = false; + HasAltivec = false; + HasQPX = false; + HasVSX = false; + HasFCPSGN = false; + HasFSQRT = false; + HasFRE = false; + HasFRES = false; + HasFRSQRTE = false; + HasFRSQRTES = false; + HasRecipPrec = false; + HasSTFIWX = false; + HasLFIWAX = false; + HasFPRND = false; + HasFPCVT = false; + HasISEL = false; + HasPOPCNTD = false; + HasLDBRX = false; + IsBookE = false; + DeprecatedMFTB = false; + DeprecatedDST = false; + HasLazyResolverStubs = false; + IsJITCodeModel = false; +} + +void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { + // Determine default and user specified characteristics + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; +#if (defined(__APPLE__) || defined(__linux__)) && \ + (defined(__ppc__) || defined(__powerpc__)) + if (CPUName == "generic") + CPUName = sys::getHostCPUName(); +#endif + + // Initialize scheduling itinerary for the specified CPU. + InstrItins = getInstrItineraryForCPU(CPUName); + + // Make sure 64-bit features are available when CPUname is generic + std::string FullFS = FS; + + // If we are generating code for ppc64, verify that options make sense. + if (IsPPC64) { + Has64BitSupport = true; + // Silently force 64-bit register use on ppc64. + Use64BitRegs = true; + if (!FullFS.empty()) + FullFS = "+64bit," + FullFS; + else + FullFS = "+64bit"; + } + + // At -O2 and above, track CR bits as individual registers. + if (OptLevel >= CodeGenOpt::Default) { + if (!FullFS.empty()) + FullFS = "+crbits," + FullFS; + else + FullFS = "+crbits"; + } + + // Parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + + // If the user requested use of 64-bit regs, but the cpu selected doesn't + // support it, ignore. + if (use64BitRegs() && !has64BitSupport()) + Use64BitRegs = false; + + // Set up darwin-specific properties. + if (isDarwin()) + HasLazyResolverStubs = true; + + // QPX requires a 32-byte aligned stack. Note that we need to do this if + // we're compiling for a BG/Q system regardless of whether or not QPX + // is enabled because external functions will assume this alignment. + if (hasQPX() || isBGQ()) + StackAlignment = 32; + + // Determine endianness. + IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le); + + // FIXME: For now, we disable VSX in little-endian mode until endian + // issues in those instructions can be addressed. + if (IsLittleEndian) + HasVSX = false; +} + +/// hasLazyResolverStub - Return true if accesses to the specified global have +/// to go through a dyld lazy resolution stub. This means that an extra load +/// is required to get the address of the global. +bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV, + const TargetMachine &TM) const { + // We never have stubs if HasLazyResolverStubs=false or if in static mode. + if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static) + return false; + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. 
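+ // Put differently (a summary of the logic below): a stub is required + // exactly when the symbol may be bound outside this translation unit, + // i.e. weak, linkonce, or common linkage, or a bare declaration.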
+ bool isDecl = GV->isDeclaration() && !GV->isMaterializable(); + if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage()) + return false; + return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() || + GV->hasCommonLinkage() || isDecl; +} + +// Embedded cores need aggressive scheduling (and some others also benefit). +static bool needsAggressiveScheduling(unsigned Directive) { + switch (Directive) { + default: return false; + case PPC::DIR_440: + case PPC::DIR_A2: + case PPC::DIR_E500mc: + case PPC::DIR_E5500: + case PPC::DIR_PWR7: + case PPC::DIR_PWR8: + return true; + } +} + +bool PPCSubtarget::enableMachineScheduler() const { + // Enable MI scheduling for the embedded cores. + // FIXME: Enable this for all cores (some additional modeling + // may be necessary). + return needsAggressiveScheduling(DarwinDirective); +} + +// This overrides the PostRAScheduler bit in the SchedModel for each CPU. +bool PPCSubtarget::enablePostMachineScheduler() const { return true; } + +PPCGenSubtargetInfo::AntiDepBreakMode PPCSubtarget::getAntiDepBreakMode() const { + return TargetSubtargetInfo::ANTIDEP_ALL; +} + +void PPCSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const { + CriticalPathRCs.clear(); + CriticalPathRCs.push_back(isPPC64() ? + &PPC::G8RCRegClass : &PPC::GPRCRegClass); +} + +void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const { + if (needsAggressiveScheduling(DarwinDirective)) { + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + } + + // Spilling is generally expensive on all PPC cores, so always enable + // register-pressure tracking. + Policy.ShouldTrackPressure = true; +} + +bool PPCSubtarget::useAA() const { + // Use AA during code generation for the embedded cores. + return needsAggressiveScheduling(DarwinDirective); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h new file mode 100644 index 000000000000..a3cedafb5ef2 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -0,0 +1,251 @@ +//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef POWERPCSUBTARGET_H +#define POWERPCSUBTARGET_H + +#include "PPCFrameLowering.h" +#include "PPCInstrInfo.h" +#include "PPCISelLowering.h" +#include "PPCJITInfo.h" +#include "PPCSelectionDAGInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "PPCGenSubtargetInfo.inc" + +// GCC #defines PPC on Linux but we use it as our namespace name +#undef PPC + +namespace llvm { +class StringRef; + +namespace PPC { + // -m directive values. 
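+ // (One value per CPU directive; the tblgen-generated subtarget code is + // assumed to map a CPU name such as "a2" to the matching DIR_A2 value.)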
+ enum { + DIR_NONE, + DIR_32, + DIR_440, + DIR_601, + DIR_602, + DIR_603, + DIR_7400, + DIR_750, + DIR_970, + DIR_A2, + DIR_E500mc, + DIR_E5500, + DIR_PWR3, + DIR_PWR4, + DIR_PWR5, + DIR_PWR5X, + DIR_PWR6, + DIR_PWR6X, + DIR_PWR7, + DIR_PWR8, + DIR_64 + }; +} + +class GlobalValue; +class TargetMachine; + +class PPCSubtarget : public PPCGenSubtargetInfo { +protected: + /// StackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned StackAlignment; + + /// Selected instruction itineraries (one entry per itinerary class). + InstrItineraryData InstrItins; + + /// Which cpu directive was used. + unsigned DarwinDirective; + + /// Used by the ISel to turn on optimizations for POWER4-derived architectures. + bool HasMFOCRF; + bool Has64BitSupport; + bool Use64BitRegs; + bool UseCRBits; + bool IsPPC64; + bool HasAltivec; + bool HasQPX; + bool HasVSX; + bool HasFCPSGN; + bool HasFSQRT; + bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; + bool HasRecipPrec; + bool HasSTFIWX; + bool HasLFIWAX; + bool HasFPRND; + bool HasFPCVT; + bool HasISEL; + bool HasPOPCNTD; + bool HasLDBRX; + bool IsBookE; + bool DeprecatedMFTB; + bool DeprecatedDST; + bool HasLazyResolverStubs; + bool IsJITCodeModel; + bool IsLittleEndian; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// OptLevel - What default optimization level we're emitting code for. + CodeGenOpt::Level OptLevel; + + PPCFrameLowering FrameLowering; + const DataLayout DL; + PPCInstrInfo InstrInfo; + PPCJITInfo JITInfo; + PPCTargetLowering TLInfo; + PPCSelectionDAGInfo TSInfo; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + PPCSubtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, PPCTargetMachine &TM, bool is64Bit, + CodeGenOpt::Level OptLevel); + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto-generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// SetJITMode - This is called to inform the subtarget info that we are + /// producing code for the JIT. + void SetJITMode(); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return StackAlignment; } + + /// getDarwinDirective - Returns the -m directive specified for the cpu. + /// + unsigned getDarwinDirective() const { return DarwinDirective; } + + /// getInstrItineraryData - Return the instruction itineraries based on + /// subtarget selection. + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + + const PPCFrameLowering *getFrameLowering() const { return &FrameLowering; } + const DataLayout *getDataLayout() const { return &DL; } + const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } + PPCJITInfo *getJITInfo() { return &JITInfo; } + const PPCTargetLowering *getTargetLowering() const { return &TLInfo; } + const PPCSelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } + + /// initializeSubtargetDependencies - Initializes using a CPU and feature string + /// so that we can use initializer lists for subtarget initialization.
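+ /// For example, the constructor initializes FrameLowering as + /// FrameLowering(initializeSubtargetDependencies(CPU, FS)), resolving the + /// feature bits before any member that depends on them is constructed.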
+ PPCSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); + + /// \brief Reset the features for the PowerPC target. + void resetSubtargetFeatures(const MachineFunction *MF) override; +private: + void initializeEnvironment(); + void resetSubtargetFeatures(StringRef CPU, StringRef FS); + +public: + /// isPPC64 - Return true if we are generating code for 64-bit pointer mode. + /// + bool isPPC64() const { return IsPPC64; } + + /// has64BitSupport - Return true if the selected CPU supports 64-bit + /// instructions, regardless of whether we are in 32-bit or 64-bit mode. + bool has64BitSupport() const { return Has64BitSupport; } + + /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit + /// registers in 32-bit mode when possible. This can only be true if + /// has64BitSupport() returns true. + bool use64BitRegs() const { return Use64BitRegs; } + + /// useCRBits - Return true if we should store and manipulate i1 values in + /// the individual condition register bits. + bool useCRBits() const { return UseCRBits; } + + /// hasLazyResolverStub - Return true if accesses to the specified global have + /// to go through a dyld lazy resolution stub. This means that an extra load + /// is required to get the address of the global. + bool hasLazyResolverStub(const GlobalValue *GV, + const TargetMachine &TM) const; + + // isJITCodeModel - True if we're generating code for the JIT. + bool isJITCodeModel() const { return IsJITCodeModel; } + + // isLittleEndian - True if generating little-endian code. + bool isLittleEndian() const { return IsLittleEndian; } + + // Specific obvious features. + bool hasFCPSGN() const { return HasFCPSGN; } + bool hasFSQRT() const { return HasFSQRT; } + bool hasFRE() const { return HasFRE; } + bool hasFRES() const { return HasFRES; } + bool hasFRSQRTE() const { return HasFRSQRTE; } + bool hasFRSQRTES() const { return HasFRSQRTES; } + bool hasRecipPrec() const { return HasRecipPrec; } + bool hasSTFIWX() const { return HasSTFIWX; } + bool hasLFIWAX() const { return HasLFIWAX; } + bool hasFPRND() const { return HasFPRND; } + bool hasFPCVT() const { return HasFPCVT; } + bool hasAltivec() const { return HasAltivec; } + bool hasQPX() const { return HasQPX; } + bool hasVSX() const { return HasVSX; } + bool hasMFOCRF() const { return HasMFOCRF; } + bool hasISEL() const { return HasISEL; } + bool hasPOPCNTD() const { return HasPOPCNTD; } + bool hasLDBRX() const { return HasLDBRX; } + bool isBookE() const { return IsBookE; } + bool isDeprecatedMFTB() const { return DeprecatedMFTB; } + bool isDeprecatedDST() const { return DeprecatedDST; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + /// isDarwin - True if this is any Darwin platform. + bool isDarwin() const { return TargetTriple.isMacOSX(); } + /// isBGQ - True if this is a BG/Q platform. + bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isDarwinABI() const { return isDarwin(); } + bool isSVR4ABI() const { return !isDarwin(); } + /// FIXME: Should use a command-line option. + bool isELFv2ABI() const { return isPPC64() && isSVR4ABI() && + isLittleEndian(); } + + bool enableEarlyIfConversion() const override { return hasISEL(); } + + // Scheduling customization. + bool enableMachineScheduler() const override; + // This overrides the PostRAScheduler bit in the SchedModel for each CPU.
+ bool enablePostMachineScheduler() const override; + AntiDepBreakMode getAntiDepBreakMode() const override; + void getCriticalPathRCs(RegClassVector &CriticalPathRCs) const override; + + void overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; + bool useAA() const override; +}; +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp new file mode 100644 index 000000000000..9563b9045c39 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -0,0 +1,170 @@ +//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Top-level implementation for the PowerPC target. +// +//===----------------------------------------------------------------------===// + +#include "PPCTargetMachine.h" +#include "PPC.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/PassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +static cl:: +opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden, + cl::desc("Disable CTR loops for PPC")); + +static cl::opt<bool> +VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early", + cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early")); + +extern "C" void LLVMInitializePowerPCTarget() { + // Register the targets + RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); + RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target); + RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget); +} + +PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, *this, is64Bit, OL) { + initAsmInfo(); +} + +void PPC32TargetMachine::anchor() { } + +PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) { +} + +void PPC64TargetMachine::anchor() { } + +PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) { +} + + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +namespace { +/// PPC Code Generator Pass Configuration Options. 
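+/// The overrides below customize the generic codegen pipeline, from +/// pre-instruction-selection (addPreISel) through final lowering +/// (addPreEmitPass).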
+class PPCPassConfig : public TargetPassConfig { +public: + PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + PPCTargetMachine &getPPCTargetMachine() const { + return getTM<PPCTargetMachine>(); + } + + const PPCSubtarget &getPPCSubtarget() const { + return *getPPCTargetMachine().getSubtargetImpl(); + } + + bool addPreISel() override; + bool addILPOpts() override; + bool addInstSelector() override; + bool addPreRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; +}; +} // namespace + +TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { + return new PPCPassConfig(this, PM); +} + +bool PPCPassConfig::addPreISel() { + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoops(getPPCTargetMachine())); + + return false; +} + +bool PPCPassConfig::addILPOpts() { + addPass(&EarlyIfConverterID); + return true; +} + +bool PPCPassConfig::addInstSelector() { + // Install an instruction selector. + addPass(createPPCISelDag(getPPCTargetMachine())); + +#ifndef NDEBUG + if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) + addPass(createPPCCTRLoopsVerify()); +#endif + + addPass(createPPCVSXCopyPass()); + return false; +} + +bool PPCPassConfig::addPreRegAlloc() { + initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry()); + insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID, + &PPCVSXFMAMutateID); + return false; +} + +bool PPCPassConfig::addPreSched2() { + addPass(createPPCVSXCopyCleanupPass()); + + if (getOptLevel() != CodeGenOpt::None) + addPass(&IfConverterID); + + return true; +} + +bool PPCPassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createPPCEarlyReturnPass()); + // Must run branch selection immediately preceding the asm printer. + addPass(createPPCBranchSelectionPass()); + return false; +} + +bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, + JITCodeEmitter &JCE) { + // Inform the subtarget that we are in JIT mode. FIXME: does this break macho + // writing? + Subtarget.SetJITMode(); + + // Machine code emitter pass for PowerPC. + PM.add(createPPCJITCodeEmitterPass(*this, JCE)); + + return false; +} + +void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our PPC pass. This + // allows the PPC pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createPPCTargetTransformInfoPass(this)); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h new file mode 100644 index 000000000000..4c7029ca7a36 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -0,0 +1,93 @@ +//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the PowerPC specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef PPC_TARGETMACHINE_H +#define PPC_TARGETMACHINE_H + +#include "PPCInstrInfo.h" +#include "PPCSubtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets. +/// +class PPCTargetMachine : public LLVMTargetMachine { + PPCSubtarget Subtarget; + +public: + PPCTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit); + + const PPCInstrInfo *getInstrInfo() const override { + return getSubtargetImpl()->getInstrInfo(); + } + const PPCFrameLowering *getFrameLowering() const override { + return getSubtargetImpl()->getFrameLowering(); + } + PPCJITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } + const PPCTargetLowering *getTargetLowering() const override { + return getSubtargetImpl()->getTargetLowering(); + } + const PPCSelectionDAGInfo* getSelectionDAGInfo() const override { + return getSubtargetImpl()->getSelectionDAGInfo(); + } + const PPCRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } + + const DataLayout *getDataLayout() const override { + return getSubtargetImpl()->getDataLayout(); + } + const PPCSubtarget *getSubtargetImpl() const override { return &Subtarget; } + const InstrItineraryData *getInstrItineraryData() const override { + return &getSubtargetImpl()->getInstrItineraryData(); + } + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + bool addCodeEmitter(PassManagerBase &PM, + JITCodeEmitter &JCE) override; + + /// \brief Register PPC analysis passes with a pass manager. + void addAnalysisPasses(PassManagerBase &PM) override; +}; + +/// PPC32TargetMachine - PowerPC 32-bit target machine. +/// +class PPC32TargetMachine : public PPCTargetMachine { + virtual void anchor(); +public: + PPC32TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +/// PPC64TargetMachine - PowerPC 64-bit target machine. +/// +class PPC64TargetMachine : public PPCTargetMachine { + virtual void anchor(); +public: + PPC64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp new file mode 100644 index 000000000000..2903cc192aa8 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -0,0 +1,63 @@ +//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "PPCTargetObjectFile.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" + +using namespace llvm; + +void +PPC64LinuxTargetObjectFile:: +Initialize(MCContext &Ctx, const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +const MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI + // when we have a constant that contains global relocations. This is + // necessary because of this ABI's handling of pointers to functions in + // a shared library. The address of a function is actually the address + // of a function descriptor, which resides in the .opd section. Generated + // code uses the descriptor directly rather than going via the GOT as some + // other ABIs do, which means that initialized function pointers must + // reference the descriptor. The linker must convert copy relocs of + // pointers to functions in shared libraries into dynamic relocations, + // because of an ordering problem with initialization of copy relocs and + // PLT entries. The dynamic relocation will be initialized by the dynamic + // linker, so we must use DataRelROSection instead of ReadOnlySection. + // For more information, see the description of ELIMINATE_COPY_RELOCS in + // GNU ld. + if (Kind.isReadOnly()) { + const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); + + if (GVar && GVar->isConstant() && + (GVar->getInitializer()->getRelocationInfo() == + Constant::GlobalRelocations)) + Kind = SectionKind::getReadOnlyWithRel(); + } + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, + Mang, TM); +} + +const MCExpr *PPC64LinuxTargetObjectFile:: +getDebugThreadLocalSymbol(const MCSymbol *Sym) const { + const MCExpr *Expr = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext()); + return MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(0x8000, getContext()), + getContext()); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h new file mode 100644 index 000000000000..3e71bbc67379 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_PPC_TARGETOBJECTFILE_H +#define LLVM_TARGET_PPC_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + + /// PPC64LinuxTargetObjectFile - This implementation is used for + /// 64-bit PowerPC Linux. 
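+ /// It redirects constants containing global relocations into relocatable + /// read-only data (see SelectSectionForGlobal above) and biases TLS + /// addresses in debug info by 0x8000 (see getDebugThreadLocalSymbol).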
+ class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF { + + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + const MCSection *SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const override; + + /// \brief Describe a TLS variable address within debug info. + const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; + }; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h new file mode 100644 index 000000000000..73fb69101353 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h @@ -0,0 +1,27 @@ +//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef PPCTARGETSTREAMER_H +#define PPCTARGETSTREAMER_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { +class PPCTargetStreamer : public MCTargetStreamer { +public: + PPCTargetStreamer(MCStreamer &S); + virtual ~PPCTargetStreamer(); + virtual void emitTCEntry(const MCSymbol &S) = 0; + virtual void emitMachine(StringRef CPU) = 0; + virtual void emitAbiVersion(int AbiVersion) = 0; + virtual void emitLocalEntry(MCSymbol *S, const MCExpr *LocalOffset) = 0; +}; +} + +#endif diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp new file mode 100644 index 000000000000..007901b23e0c --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -0,0 +1,420 @@ +//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// PPC target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "ppctti" + +static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", +cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); + +// Declare the pass initialization routine locally as target-specific passes +// don't have a target-wide initialization entry point, and so we rely on the +// pass constructor initialization. 
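+// (This is why the PPCTTI constructor below calls +// initializePPCTTIPass(*PassRegistry::getPassRegistry()) itself.)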
+namespace llvm { +void initializePPCTTIPass(PassRegistry &); +} + +namespace { + +class PPCTTI final : public ImmutablePass, public TargetTransformInfo { + const PPCSubtarget *ST; + const PPCTargetLowering *TLI; + +public: + PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + PPCTTI(const PPCTargetMachine *TM) + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializePPCTTIPass(*PassRegistry::getPassRegistry()); + } + + virtual void initializePass() override { + pushTTIStack(this); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. + virtual void *getAdjustedAnalysisPointer(const void *ID) override { + if (ID == &TargetTransformInfo::ID) + return (TargetTransformInfo*)this; + return this; + } + + /// \name Scalar TTI Implementations + /// @{ + unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; + + unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) const override; + + virtual PopcntSupportKind + getPopcntSupport(unsigned TyWidth) const override; + virtual void getUnrollingPreferences( + Loop *L, UnrollingPreferences &UP) const override; + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + virtual unsigned getNumberOfRegisters(bool Vector) const override; + virtual unsigned getRegisterBitWidth(bool Vector) const override; + virtual unsigned getMaximumUnrollFactor() const override; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind, + OperandValueKind) const override; + virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, + int Index, Type *SubTp) const override; + virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const override; + virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const override; + virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const override; + virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const override; + + /// @} +}; + +} // end anonymous namespace + +INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti", + "PPC Target Transform Info", true, true, false) +char PPCTTI::ID = 0; + +ImmutablePass * +llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) { + return new PPCTTI(TM); +} + + +//===----------------------------------------------------------------------===// +// +// PPC cost model. 
+// +//===----------------------------------------------------------------------===// + +PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + if (ST->hasPOPCNTD() && TyWidth <= 64) + return PSK_FastHardware; + return PSK_Software; +} + +unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + if (Imm == 0) + return TCC_Free; + + if (Imm.getBitWidth() <= 64) { + if (isInt<16>(Imm.getSExtValue())) + return TCC_Basic; + + if (isInt<32>(Imm.getSExtValue())) { + // A constant that can be materialized using lis. + if ((Imm.getZExtValue() & 0xFFFF) == 0) + return TCC_Basic; + + return 2 * TCC_Basic; + } + } + + return 4 * TCC_Basic; +} + +unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + switch (IID) { + default: return TCC_Free; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue())) + return TCC_Free; + break; + } + return PPCTTI::getIntImmCost(Imm, Ty); +} + +unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) const { + if (DisablePPCConstHoist) + return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty); + + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return ~0U; + + unsigned ImmIdx = ~0U; + bool ShiftedFree = false, RunFree = false, UnsignedFree = false, + ZeroFree = false; + switch (Opcode) { + default: return TCC_Free; + case Instruction::GetElementPtr: + // Always hoist the base address of a GetElementPtr. This prevents the + // creation of new constants for every base constant that gets constant + // folded with the offset. + if (Idx == 0) + return 2 * TCC_Basic; + return TCC_Free; + case Instruction::And: + RunFree = true; // (for the rotate-and-mask instructions) + // Fallthrough... + case Instruction::Add: + case Instruction::Or: + case Instruction::Xor: + ShiftedFree = true; + // Fallthrough... + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + ImmIdx = 1; + break; + case Instruction::ICmp: + UnsignedFree = true; + ImmIdx = 1; + // Fallthrough... 
(zero comparisons can use record-form instructions) + case Instruction::Select: + ZeroFree = true; + break; + case Instruction::PHI: + case Instruction::Call: + case Instruction::Ret: + case Instruction::Load: + case Instruction::Store: + break; + } + + if (ZeroFree && Imm == 0) + return TCC_Free; + + if (Idx == ImmIdx && Imm.getBitWidth() <= 64) { + if (isInt<16>(Imm.getSExtValue())) + return TCC_Free; + + if (RunFree) { + if (Imm.getBitWidth() <= 32 && + (isShiftedMask_32(Imm.getZExtValue()) || + isShiftedMask_32(~Imm.getZExtValue()))) + return TCC_Free; + + + if (ST->isPPC64() && + (isShiftedMask_64(Imm.getZExtValue()) || + isShiftedMask_64(~Imm.getZExtValue()))) + return TCC_Free; + } + + if (UnsignedFree && isUInt<16>(Imm.getZExtValue())) + return TCC_Free; + + if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0) + return TCC_Free; + } + + return PPCTTI::getIntImmCost(Imm, Ty); +} + +void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const { + if (ST->getDarwinDirective() == PPC::DIR_A2) { + // The A2 is in-order with a deep pipeline, and concatenation unrolling + // helps expose latency-hiding opportunities to the instruction scheduler. + UP.Partial = UP.Runtime = true; + } +} + +unsigned PPCTTI::getNumberOfRegisters(bool Vector) const { + if (Vector && !ST->hasAltivec()) + return 0; + return ST->hasVSX() ? 64 : 32; +} + +unsigned PPCTTI::getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasAltivec()) return 128; + return 0; + } + + if (ST->isPPC64()) + return 64; + return 32; + +} + +unsigned PPCTTI::getMaximumUnrollFactor() const { + unsigned Directive = ST->getDarwinDirective(); + // The 440 has no SIMD support, but floating-point instructions + // have a 5-cycle latency, so unroll by 5x for latency hiding. + if (Directive == PPC::DIR_440) + return 5; + + // The A2 has no SIMD support, but floating-point instructions + // have a 6-cycle latency, so unroll by 6x for latency hiding. + if (Directive == PPC::DIR_A2) + return 6; + + // FIXME: For lack of any better information, do no harm... + if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) + return 1; + + // For most things, modern systems have two execution units (and + // out-of-order execution). + return 2; +} + +unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Op1Info, + OperandValueKind Op2Info) const { + assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + // Fallback to the default implementation. + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, + Op2Info); +} + +unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) const { + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); +} + +unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { + assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) { + // Double-precision scalars are already located in index #0. 
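+ // (Inserting or extracting element 0 of a v2f64 therefore needs no + // permutation; the early return below models it as free.)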
+ if (Index == 0) + return 0; + + return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); + } + + // Estimated cost of a load-hit-store delay. This was obtained + // experimentally as a minimum needed to prevent unprofitable + // vectorization for the paq8p benchmark. It may need to be + // raised further if other unprofitable cases remain. + unsigned LHSPenalty = 2; + if (ISD == ISD::INSERT_VECTOR_ELT) + LHSPenalty += 7; + + // Vector element insert/extract with Altivec is very expensive, + // because they require store and reload with the attendant + // processor stall for load-hit-store. Until VSX is available, + // these need to be estimated as very costly. + if (ISD == ISD::EXTRACT_VECTOR_ELT || + ISD == ISD::INSERT_VECTOR_ELT) + return LHSPenalty + + TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); + + return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); +} + +unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + unsigned Cost = + TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); + + // VSX loads/stores support unaligned access. + if (ST->hasVSX()) { + if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) + return Cost; + } + + bool UnalignedAltivec = + Src->isVectorTy() && + Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && + LT.second.getSizeInBits() == 128 && + Opcode == Instruction::Load; + + // PPC in general does not support unaligned loads and stores. They'll need + // to be decomposed based on the alignment factor. + unsigned SrcBytes = LT.second.getStoreSize(); + if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { + Cost += LT.first*(SrcBytes/Alignment-1); + + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). + if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + } + + return Cost; +} + diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp new file mode 100644 index 000000000000..5727dbc4170d --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -0,0 +1,26 @@ +//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::ThePPC32Target, llvm::ThePPC64Target, llvm::ThePPC64LETarget; + +extern "C" void LLVMInitializePowerPCTargetInfo() { + RegisterTarget<Triple::ppc, /*HasJIT=*/true> + X(ThePPC32Target, "ppc32", "PowerPC 32"); + + RegisterTarget<Triple::ppc64, /*HasJIT=*/true> + Y(ThePPC64Target, "ppc64", "PowerPC 64"); + + RegisterTarget<Triple::ppc64le, /*HasJIT=*/true> + Z(ThePPC64LETarget, "ppc64le", "PowerPC 64 LE"); +}
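+// A minimal usage sketch (illustrative only; the triple string and error +// handling are assumptions, not part of this file): +// +//   LLVMInitializePowerPCTargetInfo(); +//   std::string Error; +//   const Target *T = +//       TargetRegistry::lookupTarget("powerpc64le-unknown-linux-gnu", Error); +//   if (!T) report_fatal_error(Error);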