diff options
Diffstat (limited to 'llvm/lib/Target/PowerPC')
75 files changed, 8653 insertions, 5915 deletions
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 13fd7d05ab9f..197fd3c7aa74 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -11,7 +11,6 @@ #include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -99,12 +98,10 @@ struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { bool IsPPC64; - bool IsDarwin; void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); } bool isPPC64() const { return IsPPC64; } - bool isDarwin() const { return IsDarwin; } bool MatchRegisterName(unsigned &RegNo, int64_t &IntVal); @@ -116,14 +113,12 @@ class PPCAsmParser : public MCTargetAsmParser { PPCMCExpr::VariantKind &Variant); const MCExpr *FixupVariantKind(const MCExpr *E); bool ParseExpression(const MCExpr *&EVal); - bool ParseDarwinExpression(const MCExpr *&EVal); bool ParseOperand(OperandVector &Operands); bool ParseDirectiveWord(unsigned Size, AsmToken ID); bool ParseDirectiveTC(unsigned Size, AsmToken ID); bool ParseDirectiveMachine(SMLoc L); - bool ParseDarwinDirectiveMachine(SMLoc L); bool ParseDirectiveAbiVersion(SMLoc L); bool ParseDirectiveLocalEntry(SMLoc L); @@ -150,7 +145,6 @@ public: // Check for 64-bit vs. 32-bit pointer mode. const Triple &TheTriple = STI.getTargetTriple(); IsPPC64 = TheTriple.isPPC64(); - IsDarwin = TheTriple.isMacOSX(); // Initialize the set of available features. 
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); } @@ -290,6 +284,16 @@ public: return (unsigned) Imm.Val; } + unsigned getACCReg() const { + assert(isACCRegNumber() && "Invalid access!"); + return (unsigned) Imm.Val; + } + + unsigned getVSRpEvenReg() const { + assert(isVSRpEvenRegNumber() && "Invalid access!"); + return (unsigned) Imm.Val >> 1; + } + unsigned getCCReg() const { assert(isCCRegNumber() && "Invalid access!"); return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal); @@ -402,6 +406,12 @@ public: (getImm() & 3) == 0); } bool isImmZero() const { return Kind == Immediate && getImm() == 0; } bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); } + bool isACCRegNumber() const { + return Kind == Immediate && isUInt<3>(getImm()); + } + bool isVSRpEvenRegNumber() const { + return Kind == Immediate && isUInt<6>(getImm()) && ((getImm() & 1) == 0); + } bool isVSRegNumber() const { return Kind == Immediate && isUInt<6>(getImm()); } @@ -492,29 +502,29 @@ public: Inst.addOperand(MCOperand::createReg(VSSRegs[getVSReg()])); } - void addRegQFRCOperands(MCInst &Inst, unsigned N) const { + void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); } - void addRegQSRCOperands(MCInst &Inst, unsigned N) const { + void addRegSPERCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + Inst.addOperand(MCOperand::createReg(SPERegs[getReg()])); } - void addRegQBRCOperands(MCInst &Inst, unsigned N) const { + void addRegACCRCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(QFRegs[getReg()])); + Inst.addOperand(MCOperand::createReg(ACCRegs[getACCReg()])); } - void addRegSPE4RCOperands(MCInst 
&Inst, unsigned N) const { + void addRegVSRpRCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); + Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()])); } - void addRegSPERCOperands(MCInst &Inst, unsigned N) const { + void addRegVSRpEvenRCOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(SPERegs[getReg()])); + Inst.addOperand(MCOperand::createReg(VSRpRegs[getVSRpEvenReg()])); } void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const { @@ -666,7 +676,8 @@ public: return CreateImm(CE->getValue(), S, E, IsPPC64); if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val)) - if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS) + if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS || + SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS_PCREL) return CreateTLSReg(SRE, S, E, IsPPC64); if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) { @@ -762,12 +773,18 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, } case PPC::DCBFx: case PPC::DCBFL: - case PPC::DCBFLP: { + case PPC::DCBFLP: + case PPC::DCBFPS: + case PPC::DCBSTPS: { int L = 0; if (Opcode == PPC::DCBFL) L = 1; else if (Opcode == PPC::DCBFLP) L = 3; + else if (Opcode == PPC::DCBFPS) + L = 4; + else if (Opcode == PPC::DCBSTPS) + L = 6; MCInst TmpInst; TmpInst.setOpcode(PPC::DCBF); @@ -1184,41 +1201,41 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) { - if (getParser().getTok().is(AsmToken::Identifier)) { - StringRef Name = getParser().getTok().getString(); - if (Name.equals_lower("lr")) { - RegNo = isPPC64()? PPC::LR8 : PPC::LR; - IntVal = 8; - } else if (Name.equals_lower("ctr")) { - RegNo = isPPC64()? 
PPC::CTR8 : PPC::CTR; - IntVal = 9; - } else if (Name.equals_lower("vrsave")) { - RegNo = PPC::VRSAVE; - IntVal = 256; - } else if (Name.startswith_lower("r") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal]; - } else if (Name.startswith_lower("f") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = FRegs[IntVal]; - } else if (Name.startswith_lower("vs") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) { - RegNo = VSRegs[IntVal]; - } else if (Name.startswith_lower("v") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = VRegs[IntVal]; - } else if (Name.startswith_lower("q") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = QFRegs[IntVal]; - } else if (Name.startswith_lower("cr") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = CRRegs[IntVal]; - } else - return true; - getParser().Lex(); - return false; - } - return true; + if (getParser().getTok().is(AsmToken::Percent)) + getParser().Lex(); // Eat the '%'. + + if (!getParser().getTok().is(AsmToken::Identifier)) + return true; + + StringRef Name = getParser().getTok().getString(); + if (Name.equals_lower("lr")) { + RegNo = isPPC64() ? PPC::LR8 : PPC::LR; + IntVal = 8; + } else if (Name.equals_lower("ctr")) { + RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR; + IntVal = 9; + } else if (Name.equals_lower("vrsave")) { + RegNo = PPC::VRSAVE; + IntVal = 256; + } else if (Name.startswith_lower("r") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = isPPC64() ? 
XRegs[IntVal] : RRegs[IntVal]; + } else if (Name.startswith_lower("f") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = FRegs[IntVal]; + } else if (Name.startswith_lower("vs") && + !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) { + RegNo = VSRegs[IntVal]; + } else if (Name.startswith_lower("v") && + !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + RegNo = VRegs[IntVal]; + } else if (Name.startswith_lower("cr") && + !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { + RegNo = CRRegs[IntVal]; + } else + return true; + getParser().Lex(); + return false; } bool PPCAsmParser:: @@ -1387,10 +1404,6 @@ FixupVariantKind(const MCExpr *E) { /// it handles modifiers. bool PPCAsmParser:: ParseExpression(const MCExpr *&EVal) { - - if (isDarwin()) - return ParseDarwinExpression(EVal); - // (ELF Platforms) // Handle \code @l/@ha \endcode if (getParser().parseExpression(EVal)) @@ -1406,53 +1419,6 @@ ParseExpression(const MCExpr *&EVal) { return false; } -/// ParseDarwinExpression. (MachO Platforms) -/// This differs from the default "parseExpression" in that it handles detection -/// of the \code hi16(), ha16() and lo16() \endcode modifiers. At present, -/// parseExpression() doesn't recognise the modifiers when in the Darwin/MachO -/// syntax form so it is done here. TODO: Determine if there is merit in -/// arranging for this to be done at a higher level. -bool PPCAsmParser:: -ParseDarwinExpression(const MCExpr *&EVal) { - MCAsmParser &Parser = getParser(); - PPCMCExpr::VariantKind Variant = PPCMCExpr::VK_PPC_None; - switch (getLexer().getKind()) { - default: - break; - case AsmToken::Identifier: - // Compiler-generated Darwin identifiers begin with L,l,_ or "; thus - // something starting with any other char should be part of the - // asm syntax. If handwritten asm includes an identifier like lo16, - // then all bets are off - but no-one would do that, right? 
- StringRef poss = Parser.getTok().getString(); - if (poss.equals_lower("lo16")) { - Variant = PPCMCExpr::VK_PPC_LO; - } else if (poss.equals_lower("hi16")) { - Variant = PPCMCExpr::VK_PPC_HI; - } else if (poss.equals_lower("ha16")) { - Variant = PPCMCExpr::VK_PPC_HA; - } - if (Variant != PPCMCExpr::VK_PPC_None) { - Parser.Lex(); // Eat the xx16 - if (getLexer().isNot(AsmToken::LParen)) - return Error(Parser.getTok().getLoc(), "expected '('"); - Parser.Lex(); // Eat the '(' - } - break; - } - - if (getParser().parseExpression(EVal)) - return true; - - if (Variant != PPCMCExpr::VK_PPC_None) { - if (getLexer().isNot(AsmToken::RParen)) - return Error(Parser.getTok().getLoc(), "expected ')'"); - Parser.Lex(); // Eat the ')' - EVal = PPCMCExpr::create(Variant, EVal, getParser().getContext()); - } - return false; -} - /// ParseOperand /// This handles registers in the form 'NN', '%rNN' for ELF platforms and /// rNN for MachO. @@ -1466,8 +1432,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { switch (getLexer().getKind()) { // Special handling for register names. These are interpreted // as immediates corresponding to the register number. - case AsmToken::Percent: - Parser.Lex(); // Eat the '%'. + case AsmToken::Percent: { unsigned RegNo; int64_t IntVal; if (MatchRegisterName(RegNo, IntVal)) @@ -1475,7 +1440,7 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); return false; - + } case AsmToken::Identifier: case AsmToken::LParen: case AsmToken::Plus: @@ -1485,20 +1450,6 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { case AsmToken::Dollar: case AsmToken::Exclaim: case AsmToken::Tilde: - // Note that non-register-name identifiers from the compiler will begin - // with '_', 'L'/'l' or '"'. Of course, handwritten asm could include - // identifiers like r31foo - so we fall through in the event that parsing - // a register name fails. 
- if (isDarwin()) { - unsigned RegNo; - int64_t IntVal; - if (!MatchRegisterName(RegNo, IntVal)) { - Operands.push_back(PPCOperand::CreateImm(IntVal, S, E, isPPC64())); - return false; - } - } - // All other expressions - if (!ParseExpression(EVal)) break; // Fall-through @@ -1537,29 +1488,18 @@ bool PPCAsmParser::ParseOperand(OperandVector &Operands) { int64_t IntVal; switch (getLexer().getKind()) { - case AsmToken::Percent: - Parser.Lex(); // Eat the '%'. + case AsmToken::Percent: { unsigned RegNo; if (MatchRegisterName(RegNo, IntVal)) return Error(S, "invalid register name"); break; - + } case AsmToken::Integer: - if (isDarwin()) - return Error(S, "unexpected integer value"); - else if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 || - IntVal > 31) + if (getParser().parseAbsoluteExpression(IntVal) || IntVal < 0 || + IntVal > 31) return Error(S, "invalid register number"); break; - case AsmToken::Identifier: - if (isDarwin()) { - unsigned RegNo; - if (!MatchRegisterName(RegNo, IntVal)) { - break; - } - } - LLVM_FALLTHROUGH; - + case AsmToken::Identifier: default: return Error(S, "invalid memory operand"); } @@ -1643,12 +1583,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, /// ParseDirective parses the PPC specific directives bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); - if (isDarwin()) { - if (IDVal == ".machine") - ParseDarwinDirectiveMachine(DirectiveID.getLoc()); - else - return true; - } else if (IDVal == ".word") + if (IDVal == ".word") ParseDirectiveWord(2, DirectiveID); else if (IDVal == ".llong") ParseDirectiveWord(8, DirectiveID); @@ -1720,11 +1655,7 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) { // FIXME: Right now, the parser always allows any available // instruction, so the .machine directive is not useful. - // Implement ".machine any" (by doing nothing) for the benefit - // of existing assembler code. 
Likewise, we can then implement - // ".machine push" and ".machine pop" as no-op. - if (CPU != "any" && CPU != "push" && CPU != "pop") - return TokError("unrecognized machine type"); + // In the wild, any/push/pop/ppc64/altivec/power[4-9] are seen. Parser.Lex(); @@ -1739,31 +1670,6 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) { return false; } -/// ParseDarwinDirectiveMachine (Mach-o platforms) -/// ::= .machine cpu-identifier -bool PPCAsmParser::ParseDarwinDirectiveMachine(SMLoc L) { - MCAsmParser &Parser = getParser(); - if (Parser.getTok().isNot(AsmToken::Identifier) && - Parser.getTok().isNot(AsmToken::String)) - return Error(L, "unexpected token in directive"); - - StringRef CPU = Parser.getTok().getIdentifier(); - Parser.Lex(); - - // FIXME: this is only the 'default' set of cpu variants. - // However we don't act on this information at present, this is simply - // allowing parsing to proceed with minimal sanity checking. - if (check(CPU != "ppc7400" && CPU != "ppc" && CPU != "ppc64", L, - "unrecognized cpu type") || - check(isPPC64() && (CPU == "ppc7400" || CPU == "ppc"), L, - "wrong cpu type specified for 64bit") || - check(!isPPC64() && CPU == "ppc64", L, - "wrong cpu type specified for 32bit") || - parseToken(AsmToken::EndOfStatement)) - return addErrorSuffix(" in '.machine' directive"); - return false; -} - /// ParseDirectiveAbiVersion /// ::= .abiversion constant-expression bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) { @@ -1809,8 +1715,9 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { /// Force static initialization. 
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser<PPCAsmParser> A(getThePPC32Target()); - RegisterMCAsmParser<PPCAsmParser> B(getThePPC64Target()); - RegisterMCAsmParser<PPCAsmParser> C(getThePPC64LETarget()); + RegisterMCAsmParser<PPCAsmParser> B(getThePPC32LETarget()); + RegisterMCAsmParser<PPCAsmParser> C(getThePPC64Target()); + RegisterMCAsmParser<PPCAsmParser> D(getThePPC64LETarget()); } #define GET_REGISTER_MATCHER diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 74c6fd3733f0..3e9286fb0b30 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -54,6 +54,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() { // Register the disassembler for each target. TargetRegistry::RegisterMCDisassembler(getThePPC32Target(), createPPCDisassembler); + TargetRegistry::RegisterMCDisassembler(getThePPC32LETarget(), + createPPCLEDisassembler); TargetRegistry::RegisterMCDisassembler(getThePPC64Target(), createPPCDisassembler); TargetRegistry::RegisterMCDisassembler(getThePPC64LETarget(), @@ -167,18 +169,24 @@ static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass -static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo, - uint64_t Address, - const void *Decoder) { - return decodeRegisterClass(Inst, RegNo, QFRegs); -} - static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo, uint64_t Address, const void *Decoder) { return decodeRegisterClass(Inst, RegNo, SPERegs); } +static DecodeStatus DecodeACCRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, ACCRegs); +} + +static DecodeStatus 
DecodeVSRpRCRegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return decodeRegisterClass(Inst, RegNo, VSRpRegs); +} + #define DecodeQSRCRegisterClass DecodeQFRCRegisterClass #define DecodeQBRCRegisterClass DecodeQFRCRegisterClass @@ -206,6 +214,15 @@ static DecodeStatus decodeImmZeroOperand(MCInst &Inst, uint64_t Imm, return MCDisassembler::Success; } +static DecodeStatus decodeVSRpEvenOperands(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo & 1) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(VSRpRegs[RegNo >> 1])); + return MCDisassembler::Success; +} + static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm, int64_t Address, const void *Decoder) { // Decode the memri field (imm, reg), which has the low 16-bits as the @@ -401,14 +418,9 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Read the instruction in the proper endianness. uint64_t Inst = ReadFunc(Bytes.data()); - if (STI.getFeatureBits()[PPC::FeatureQPX]) { - DecodeStatus result = - decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); - if (result != MCDisassembler::Fail) - return result; - } else if (STI.getFeatureBits()[PPC::FeatureSPE]) { + if (STI.getFeatureBits()[PPC::FeatureSPE]) { DecodeStatus result = - decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); + decodeInstruction(DecoderTableSPE32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; } diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp new file mode 100644 index 000000000000..e8f8cbfee6ee --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.cpp @@ -0,0 +1,53 @@ +//===-- PPCCallLowering.h - Call lowering for GlobalISel -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "PPCCallLowering.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ppc-call-lowering" + +using namespace llvm; + +PPCCallLowering::PPCCallLowering(const PPCTargetLowering &TLI) + : CallLowering(&TLI) {} + +bool PPCCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, ArrayRef<Register> VRegs, + FunctionLoweringInfo &FLI, + Register SwiftErrorVReg) const { + assert(((Val && !VRegs.empty()) || (!Val && VRegs.empty())) && + "Return value without a vreg"); + if (VRegs.size() > 0) + return false; + + MIRBuilder.buildInstr(PPC::BLR8); + return true; +} + +bool PPCCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function &F, + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const { + + // If VRegs is empty, then there are no formal arguments to lower and thus can + // always return true. If there are formal arguments, we currently do not + // handle them and thus return false. + return VRegs.empty(); +} + +bool PPCCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const { + return false; +} diff --git a/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h new file mode 100644 index 000000000000..5a449f4cab1b --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCCallLowering.h @@ -0,0 +1,40 @@ +//===-- PPCCallLowering.h - Call lowering for GlobalISel -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H +#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCCALLLOWERING_H + +#include "PPCISelLowering.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/IR/CallingConv.h" + +namespace llvm { + +class PPCTargetLowering; + +class PPCCallLowering : public CallLowering { +public: + PPCCallLowering(const PPCTargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + ArrayRef<Register> VRegs, FunctionLoweringInfo &FLI, + Register SwiftErrorVReg) const override; + bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, + ArrayRef<ArrayRef<Register>> VRegs, + FunctionLoweringInfo &FLI) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, + CallLoweringInfo &Info) const override; +}; +} // end namespace llvm + +#endif diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp new file mode 100644 index 000000000000..7d64816ed6c7 --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp @@ -0,0 +1,92 @@ +//===- PPCInstructionSelector.cpp --------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the InstructionSelector class for +/// PowerPC. 
+//===----------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPCRegisterBankInfo.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/IntrinsicsPowerPC.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ppc-gisel" + +using namespace llvm; + +namespace { + +#define GET_GLOBALISEL_PREDICATE_BITSET +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET + +class PPCInstructionSelector : public InstructionSelector { +public: + PPCInstructionSelector(const PPCTargetMachine &TM, const PPCSubtarget &STI, + const PPCRegisterBankInfo &RBI); + + bool select(MachineInstr &I) override; + static const char *getName() { return DEBUG_TYPE; } + +private: + /// tblgen generated 'select' implementation that is used as the initial + /// selector for the patterns that do not require complex C++. 
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + + const PPCInstrInfo &TII; + const PPCRegisterInfo &TRI; + const PPCRegisterBankInfo &RBI; + +#define GET_GLOBALISEL_PREDICATES_DECL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL +}; + +} // end anonymous namespace + +#define GET_GLOBALISEL_IMPL +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL + +PPCInstructionSelector::PPCInstructionSelector(const PPCTargetMachine &TM, + const PPCSubtarget &STI, + const PPCRegisterBankInfo &RBI) + : InstructionSelector(), TII(*STI.getInstrInfo()), + TRI(*STI.getRegisterInfo()), RBI(RBI), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "PPCGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT +{ +} + +bool PPCInstructionSelector::select(MachineInstr &I) { + if (selectImpl(I, *CoverageInfo)) + return true; + return false; +} + +namespace llvm { +InstructionSelector * +createPPCInstructionSelector(const PPCTargetMachine &TM, + const PPCSubtarget &Subtarget, + const PPCRegisterBankInfo &RBI) { + return new PPCInstructionSelector(TM, Subtarget, RBI); +} +} // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp new file mode 100644 index 000000000000..c16bcaea592b --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.cpp @@ -0,0 +1,20 @@ +//===- PPCLegalizerInfo.h ----------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the Machinelegalizer class for PowerPC +//===----------------------------------------------------------------------===// + +#include "PPCLegalizerInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ppc-legalinfo" + +using namespace llvm; +using namespace LegalizeActions; + +PPCLegalizerInfo::PPCLegalizerInfo(const PPCSubtarget &ST) { computeTables(); } diff --git a/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h new file mode 100644 index 000000000000..c73186d3d0c1 --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCLegalizerInfo.h @@ -0,0 +1,28 @@ +//===- PPCLegalizerInfo.h ----------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the Machinelegalizer class for PowerPC +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H +#define LLVM_LIB_TARGET_POWERPC_GISEL_PPCMACHINELEGALIZER_H + +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" + +namespace llvm { + +class PPCSubtarget; + +/// This class provides the information for the PowerPC target legalizer for +/// GlobalISel. 
+class PPCLegalizerInfo : public LegalizerInfo { +public: + PPCLegalizerInfo(const PPCSubtarget &ST); +}; +} // namespace llvm +#endif diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp new file mode 100644 index 000000000000..6af79324919c --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp @@ -0,0 +1,27 @@ +//===- PPCRegisterBankInfo.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// PowerPC. +//===----------------------------------------------------------------------===// + +#include "PPCRegisterBankInfo.h" +#include "PPCRegisterInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "ppc-reg-bank-info" + +#define GET_TARGET_REGBANK_IMPL +#include "PPCGenRegisterBank.inc" + +using namespace llvm; + +PPCRegisterBankInfo::PPCRegisterBankInfo(const TargetRegisterInfo &TRI) + : PPCGenRegisterBankInfo() {} diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h new file mode 100644 index 000000000000..358d5ed3cf14 --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.h @@ -0,0 +1,39 @@ +//===-- PPCRegisterBankInfo.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for PowerPC. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H +#define LLVM_LIB_TARGET_PPC_GISEL_PPCREGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +#define GET_REGBANK_DECLARATIONS +#include "PPCGenRegisterBank.inc" + +namespace llvm { +class TargetRegisterInfo; + +class PPCGenRegisterBankInfo : public RegisterBankInfo { +protected: +#define GET_TARGET_REGBANK_CLASS +#include "PPCGenRegisterBank.inc" +}; + +class PPCRegisterBankInfo final : public PPCGenRegisterBankInfo { +public: + PPCRegisterBankInfo(const TargetRegisterInfo &TRI); +}; +} // namespace llvm + +#endif diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td new file mode 100644 index 000000000000..0e8a4b7061c5 --- /dev/null +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBanks.td @@ -0,0 +1,15 @@ +//===-- PPCRegisterBanks.td - Describe the PPC Banks -------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Define the PPC register banks used for GlobalISel. 
+/// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers +def GPRRegBank : RegisterBank<"GPR", [G8RC]>; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index dbaf221db9fc..72401668c8d0 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -46,6 +46,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_half16ds: return Value & 0xfffc; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: return Value & 0x3ffffffff; } } @@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case PPC::fixup_ppc_br24_notoc: return 4; case PPC::fixup_ppc_pcrel34: + case PPC::fixup_ppc_imm34: case FK_Data_8: return 8; case PPC::fixup_ppc_nofixup: @@ -100,6 +102,7 @@ public: { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 0, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = { @@ -112,6 +115,7 @@ public: { "fixup_ppc_half16", 0, 16, 0 }, { "fixup_ppc_half16ds", 2, 14, 0 }, { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel }, + { "fixup_ppc_imm34", 0, 34, 0 }, { "fixup_ppc_nofixup", 0, 0, 0 } }; @@ -178,12 +182,6 @@ public: } } - bool mayNeedRelaxation(const MCInst &Inst, - const MCSubtargetInfo &STI) const override { - // FIXME. 
- return false; - } - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index d8b3301e97f1..94ef7b45434f 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -138,6 +138,15 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case MCSymbolRefExpr::VK_PPC_GOT_PCREL: Type = ELF::R_PPC64_GOT_PCREL34; break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL: + Type = ELF::R_PPC64_GOT_TLSGD_PCREL34; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL: + Type = ELF::R_PPC64_GOT_TLSLD_PCREL34; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL: + Type = ELF::R_PPC64_GOT_TPREL_PCREL34; + break; } break; case FK_Data_4: @@ -407,6 +416,21 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, else Type = ELF::R_PPC_TLS; break; + case MCSymbolRefExpr::VK_PPC_TLS_PCREL: + Type = ELF::R_PPC64_TLS; + break; + } + break; + case PPC::fixup_ppc_imm34: + switch (Modifier) { + default: + report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + case MCSymbolRefExpr::VK_DTPREL: + Type = ELF::R_PPC64_DTPREL34; + break; + case MCSymbolRefExpr::VK_TPREL: + Type = ELF::R_PPC64_TPREL34; + break; } break; case FK_Data_8: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index 4373778cc96c..386d59266096 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -20,6 +20,7 @@ #include "PPCELFStreamer.h" +#include "PPCFixupKinds.h" #include "PPCInstrInfo.h" #include "PPCMCCodeEmitter.h" #include "llvm/BinaryFormat/ELF.h" @@ -89,12 +90,33 @@ void PPCELFStreamer::emitInstruction(const MCInst &Inst, 
PPCMCCodeEmitter *Emitter = static_cast<PPCMCCodeEmitter*>(getAssembler().getEmitterPtr()); + // If the instruction is a part of the GOT to PC-Rel link time optimization + // instruction pair, return a value, otherwise return None. A true returned + // value means the instruction is the PLDpc and a false value means it is + // the user instruction. + Optional<bool> IsPartOfGOTToPCRelPair = isPartOfGOTToPCRelPair(Inst, STI); + + // User of the GOT-indirect address. + // For example, the load that will get the relocation as follows: + // .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8) + // lwa 3, 4(3) + if (IsPartOfGOTToPCRelPair.hasValue() && !IsPartOfGOTToPCRelPair.getValue()) + emitGOTToPCRelReloc(Inst); + // Special handling is only for prefixed instructions. if (!Emitter->isPrefixedInstruction(Inst)) { MCELFStreamer::emitInstruction(Inst, STI); return; } emitPrefixedInstruction(Inst, STI); + + // Producer of the GOT-indirect address. + // For example, the prefixed load from the got that will get the label as + // follows: + // pld 3, vec@got@pcrel(0), 1 + // .Lpcrel1: + if (IsPartOfGOTToPCRelPair.hasValue() && IsPartOfGOTToPCRelPair.getValue()) + emitGOTToPCRelLabel(Inst); } void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { @@ -103,6 +125,102 @@ void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { MCELFStreamer::emitLabel(Symbol); } +// This linker time GOT PC Relative optimization relocation will look like this: +// pld <reg> symbol@got@pcrel +// <Label###>: +// .reloc Label###-8,R_PPC64_PCREL_OPT,.-(Label###-8) +// load <loadedreg>, 0(<reg>) +// The reason we place the label after the PLDpc instruction is that there +// may be an alignment nop before it since prefixed instructions must not +// cross a 64-byte boundary (please see +// PPCELFStreamer::emitPrefixedInstruction()). When referring to the +// label, we subtract the width of a prefixed instruction (8 bytes) to ensure +// we refer to the PLDpc. 
+void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) { + // Get the last operand which contains the symbol. + const MCOperand &Operand = Inst.getOperand(Inst.getNumOperands() - 1); + assert(Operand.isExpr() && "Expecting an MCExpr."); + // Cast the last operand to MCSymbolRefExpr to get the symbol. + const MCExpr *Expr = Operand.getExpr(); + const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr); + assert(SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT && + "Expecting a symbol of type VK_PPC_PCREL_OPT"); + MCSymbol *LabelSym = + getContext().getOrCreateSymbol(SymExpr->getSymbol().getName()); + const MCExpr *LabelExpr = MCSymbolRefExpr::create(LabelSym, getContext()); + const MCExpr *Eight = MCConstantExpr::create(8, getContext()); + // SubExpr is just Label###-8 + const MCExpr *SubExpr = + MCBinaryExpr::createSub(LabelExpr, Eight, getContext()); + MCSymbol *CurrentLocation = getContext().createTempSymbol(); + const MCExpr *CurrentLocationExpr = + MCSymbolRefExpr::create(CurrentLocation, getContext()); + // SubExpr2 is .-(Label###-8) + const MCExpr *SubExpr2 = + MCBinaryExpr::createSub(CurrentLocationExpr, SubExpr, getContext()); + + MCDataFragment *DF = static_cast<MCDataFragment *>(LabelSym->getFragment()); + assert(DF && "Expecting a valid data fragment."); + MCFixupKind FixupKind = static_cast<MCFixupKind>(FirstLiteralRelocationKind + + ELF::R_PPC64_PCREL_OPT); + DF->getFixups().push_back( + MCFixup::create(LabelSym->getOffset() - 8, SubExpr2, + FixupKind, Inst.getLoc())); + emitLabel(CurrentLocation, Inst.getLoc()); +} + +// Emit the label that immediately follows the PLDpc for a link time GOT PC Rel +// optimization. +void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) { + // Get the last operand which contains the symbol. + const MCOperand &Operand = Inst.getOperand(Inst.getNumOperands() - 1); + assert(Operand.isExpr() && "Expecting an MCExpr."); + // Cast the last operand to MCSymbolRefExpr to get the symbol. 
+ const MCExpr *Expr = Operand.getExpr(); + const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr); + assert(SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT && + "Expecting a symbol of type VK_PPC_PCREL_OPT"); + MCSymbol *LabelSym = + getContext().getOrCreateSymbol(SymExpr->getSymbol().getName()); + emitLabel(LabelSym, Inst.getLoc()); +} + +// This funciton checks if the parameter Inst is part of the setup for a link +// time GOT PC Relative optimization. For example in this situation: +// <MCInst PLDpc <MCOperand Reg:282> <MCOperand Expr:(glob_double@got@pcrel)> +// <MCOperand Imm:0> <MCOperand Expr:(.Lpcrel@<<invalid>>)>> +// <MCInst SOME_LOAD <MCOperand Reg:22> <MCOperand Imm:0> <MCOperand Reg:282> +// <MCOperand Expr:(.Lpcrel@<<invalid>>)>> +// The above is a pair of such instructions and this function will not return +// None for either one of them. In both cases we are looking for the last +// operand <MCOperand Expr:(.Lpcrel@<<invalid>>)> which needs to be an MCExpr +// and has the flag MCSymbolRefExpr::VK_PPC_PCREL_OPT. After that we just look +// at the opcode and in the case of PLDpc we will return true. For the load +// (or store) this function will return false indicating it has found the second +// instruciton in the pair. +Optional<bool> llvm::isPartOfGOTToPCRelPair(const MCInst &Inst, + const MCSubtargetInfo &STI) { + // Need at least two operands. + if (Inst.getNumOperands() < 2) + return None; + + unsigned LastOp = Inst.getNumOperands() - 1; + // The last operand needs to be an MCExpr and it needs to have a variant kind + // of VK_PPC_PCREL_OPT. If it does not satisfy these conditions it is not a + // link time GOT PC Rel opt instruction and we can ignore it and return None. + const MCOperand &Operand = Inst.getOperand(LastOp); + if (!Operand.isExpr()) + return None; + + // Check for the variant kind VK_PPC_PCREL_OPT in this expression. 
+ const MCExpr *Expr = Operand.getExpr(); + const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr); + if (!SymExpr || SymExpr->getKind() != MCSymbolRefExpr::VK_PPC_PCREL_OPT) + return None; + + return (Inst.getOpcode() == PPC::PLDpc); +} + MCELFStreamer *llvm::createPPCELFStreamer( MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, std::unique_ptr<MCObjectWriter> OW, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h index 51863232d071..f44200104f32 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h @@ -43,8 +43,15 @@ public: void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override; private: void emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI); + void emitGOTToPCRelReloc(const MCInst &Inst); + void emitGOTToPCRelLabel(const MCInst &Inst); }; +// Check if the instruction Inst is part of a pair of instructions that make up +// a link time GOT PC Rel optimization. +Optional<bool> isPartOfGOTToPCRelPair(const MCInst &Inst, + const MCSubtargetInfo &STI); + MCELFStreamer *createPPCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, std::unique_ptr<MCObjectWriter> OW, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 2fb8947fd4e0..73292f7b7938 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -43,6 +43,9 @@ enum Fixups { // A 34-bit fixup corresponding to PC-relative paddi. fixup_ppc_pcrel34, + // A 34-bit fixup corresponding to Non-PC-relative paddi. + fixup_ppc_imm34, + /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the /// TLS general and local dynamic models, or inserts the thread-pointer /// register number. 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 16da62a74b8c..a291a34d4c52 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -49,18 +49,6 @@ FullRegNamesWithPercent("ppc-reg-with-percent-prefix", cl::Hidden, void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { const char *RegName = getRegisterName(RegNo); - if (RegName[0] == 'q' /* QPX */) { - // The system toolchain on the BG/Q does not understand QPX register names - // in .cfi_* directives, so print the name of the floating-point - // subregister instead. - std::string RN(RegName); - - RN[0] = 'f'; - OS << RN; - - return; - } - OS << RegName; } @@ -83,15 +71,45 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, "reference expression if it is an expression at all."); O << "\taddis "; - printOperand(MI, 0, O); + printOperand(MI, 0, STI, O); O << ", "; - printOperand(MI, 2, O); + printOperand(MI, 2, STI, O); O << "("; - printOperand(MI, 1, O); + printOperand(MI, 1, STI, O); O << ")"; return; } + // Check if the last operand is an expression with the variant kind + // VK_PPC_PCREL_OPT. If this is the case then this is a linker optimization + // relocation and the .reloc directive needs to be added. 
+ unsigned LastOp = MI->getNumOperands() - 1; + if (MI->getNumOperands() > 1) { + const MCOperand &Operand = MI->getOperand(LastOp); + if (Operand.isExpr()) { + const MCExpr *Expr = Operand.getExpr(); + const MCSymbolRefExpr *SymExpr = + static_cast<const MCSymbolRefExpr *>(Expr); + + if (SymExpr && SymExpr->getKind() == MCSymbolRefExpr::VK_PPC_PCREL_OPT) { + const MCSymbol &Symbol = SymExpr->getSymbol(); + if (MI->getOpcode() == PPC::PLDpc) { + printInstruction(MI, Address, STI, O); + O << "\n"; + Symbol.print(O, &MAI); + O << ":"; + return; + } else { + O << "\t.reloc "; + Symbol.print(O, &MAI); + O << "-8,R_PPC64_PCREL_OPT,.-("; + Symbol.print(O, &MAI); + O << "-8)\n"; + } + } + } + } + // Check for slwi/srwi mnemonics. if (MI->getOpcode() == PPC::RLWINM) { unsigned char SH = MI->getOperand(2).getImm(); @@ -106,9 +124,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, SH = 32-SH; } if (useSubstituteMnemonic) { - printOperand(MI, 0, O); + printOperand(MI, 0, STI, O); O << ", "; - printOperand(MI, 1, O); + printOperand(MI, 1, STI, O); O << ", " << (unsigned int)SH; printAnnotation(O, Annot); @@ -123,9 +141,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH if (63-SH == ME) { O << "\tsldi "; - printOperand(MI, 0, O); + printOperand(MI, 0, STI, O); O << ", "; - printOperand(MI, 1, O); + printOperand(MI, 1, STI, O); O << ", " << (unsigned int)SH; printAnnotation(O, Annot); return; @@ -153,9 +171,9 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (IsBookE && TH != 0 && TH != 16) O << (unsigned int) TH << ", "; - printOperand(MI, 1, O); + printOperand(MI, 1, STI, O); O << ", "; - printOperand(MI, 2, O); + printOperand(MI, 2, STI, O); if (!IsBookE && TH != 0 && TH != 16) O << ", " << (unsigned int) TH; @@ -166,29 +184,36 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (MI->getOpcode() == PPC::DCBF) { unsigned char L = 
MI->getOperand(0).getImm(); - if (!L || L == 1 || L == 3) { - O << "\tdcbf"; - if (L == 1 || L == 3) + if (!L || L == 1 || L == 3 || L == 4 || L == 6) { + O << "\tdcb"; + if (L != 6) + O << "f"; + if (L == 1) O << "l"; if (L == 3) - O << "p"; + O << "lp"; + if (L == 4) + O << "ps"; + if (L == 6) + O << "stps"; O << " "; - printOperand(MI, 1, O); + printOperand(MI, 1, STI, O); O << ", "; - printOperand(MI, 2, O); + printOperand(MI, 2, STI, O); printAnnotation(O, Annot); return; } } - if (!printAliasInstr(MI, Address, O)) - printInstruction(MI, Address, O); + if (!printAliasInstr(MI, Address, STI, O)) + printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); } void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O, const char *Modifier) { unsigned Code = MI->getOperand(OpNo).getImm(); @@ -282,10 +307,11 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, assert(StringRef(Modifier) == "reg" && "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!"); - printOperand(MI, OpNo+1, O); + printOperand(MI, OpNo + 1, STI, O); } void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Code = MI->getOperand(OpNo).getImm(); if (Code == 2) @@ -295,6 +321,7 @@ void PPCInstPrinter::printATBitsAsHint(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 1 && "Invalid u1imm argument!"); @@ -302,6 +329,7 @@ void PPCInstPrinter::printU1ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 3 && "Invalid u2imm argument!"); @@ -309,6 +337,7 @@ void 
PPCInstPrinter::printU2ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 8 && "Invalid u3imm argument!"); @@ -316,6 +345,7 @@ void PPCInstPrinter::printU3ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 15 && "Invalid u4imm argument!"); @@ -323,6 +353,7 @@ void PPCInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { int Value = MI->getOperand(OpNo).getImm(); Value = SignExtend32<5>(Value); @@ -330,6 +361,7 @@ void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printImmZeroOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value == 0 && "Operand must be zero"); @@ -337,6 +369,7 @@ void PPCInstPrinter::printImmZeroOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 31 && "Invalid u5imm argument!"); @@ -344,6 +377,7 @@ void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 63 && "Invalid u6imm argument!"); @@ -351,6 +385,7 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, 
unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned int Value = MI->getOperand(OpNo).getImm(); assert(Value <= 127 && "Invalid u7imm argument!"); @@ -361,12 +396,14 @@ void PPCInstPrinter::printU7ImmOperand(const MCInst *MI, unsigned OpNo, // of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and // print as unsigned. void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned char Value = MI->getOperand(OpNo).getImm(); O << (unsigned int)Value; } void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned short Value = MI->getOperand(OpNo).getImm(); assert(Value <= 1023 && "Invalid u10imm argument!"); @@ -374,6 +411,7 @@ void PPCInstPrinter::printU10ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned short Value = MI->getOperand(OpNo).getImm(); assert(Value <= 4095 && "Invalid u12imm argument!"); @@ -381,14 +419,16 @@ void PPCInstPrinter::printU12ImmOperand(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).isImm()) O << (short)MI->getOperand(OpNo).getImm(); else - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); } void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).isImm()) { long long Value = MI->getOperand(OpNo).getImm(); @@ -396,21 +436,24 @@ void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo, O << (long long)Value; } else - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); } void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { if 
(MI->getOperand(OpNo).isImm()) O << (unsigned short)MI->getOperand(OpNo).getImm(); else - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); } void PPCInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address, - unsigned OpNo, raw_ostream &O) { + unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (!MI->getOperand(OpNo).isImm()) - return printOperand(MI, OpNo, O); + return printOperand(MI, OpNo, STI, O); int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2); if (PrintBranchImmAsAddress) { uint64_t Target = Address + Imm; @@ -433,16 +476,16 @@ void PPCInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address, } void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { if (!MI->getOperand(OpNo).isImm()) - return printOperand(MI, OpNo, O); + return printOperand(MI, OpNo, STI, O); O << SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2); } - void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned CCReg = MI->getOperand(OpNo).getReg(); unsigned RegNo; switch (CCReg) { @@ -460,33 +503,37 @@ void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo, } void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - printS16ImmOperand(MI, OpNo, O); + printS16ImmOperand(MI, OpNo, STI, O); O << '('; if (MI->getOperand(OpNo+1).getReg() == PPC::R0) O << "0"; else - printOperand(MI, OpNo+1, O); + printOperand(MI, OpNo + 1, STI, O); O << ')'; } void PPCInstPrinter::printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { - printS34ImmOperand(MI, OpNo, O); + printS34ImmOperand(MI, OpNo, STI, O); O << '('; - printImmZeroOperand(MI, OpNo + 1, O); + printImmZeroOperand(MI, OpNo + 1, STI, O); O << ')'; } void PPCInstPrinter::printMemRegImm34(const 
MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printS34ImmOperand(MI, OpNo, O); + const MCSubtargetInfo &STI, + raw_ostream &O) { + printS34ImmOperand(MI, OpNo, STI, O); O << '('; - printOperand(MI, OpNo + 1, O); + printOperand(MI, OpNo + 1, STI, O); O << ')'; } void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { // When used as the base register, r0 reads constant zero rather than // the value contained in the register. For this reason, the darwin @@ -494,13 +541,13 @@ void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo, if (MI->getOperand(OpNo).getReg() == PPC::R0) O << "0"; else - printOperand(MI, OpNo, O); + printOperand(MI, OpNo, STI, O); O << ", "; - printOperand(MI, OpNo+1, O); + printOperand(MI, OpNo + 1, STI, O); } void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { // On PPC64, VariantKind is VK_None, but on PPC32, it's VK_PLT, and it must // come at the _end_ of the expression. const MCOperand &Op = MI->getOperand(OpNo); @@ -513,10 +560,17 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, RefExp = cast<MCSymbolRefExpr>(Op.getExpr()); O << RefExp->getSymbol().getName(); + // The variant kind VK_PPC_NOTOC needs to be handled as a special case + // because we do not want the assembly to print out the @notoc at the + // end like __tls_get_addr(x@tlsgd)@notoc. Instead we want it to look + // like __tls_get_addr@notoc(x@tlsgd). 
+ if (RefExp->getKind() == MCSymbolRefExpr::VK_PPC_NOTOC) + O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind()); O << '('; - printOperand(MI, OpNo+1, O); + printOperand(MI, OpNo + 1, STI, O); O << ')'; - if (RefExp->getKind() != MCSymbolRefExpr::VK_None) + if (RefExp->getKind() != MCSymbolRefExpr::VK_None && + RefExp->getKind() != MCSymbolRefExpr::VK_PPC_NOTOC) O << '@' << MCSymbolRefExpr::getVariantKindName(RefExp->getKind()); if (ConstExp != nullptr) O << '+' << ConstExp->getValue(); @@ -525,7 +579,7 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo, /// showRegistersWithPercentPrefix - Check if this register name should be /// printed with a percentage symbol as prefix. bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { - if (!FullRegNamesWithPercent || TT.isOSDarwin() || TT.getOS() == Triple::AIX) + if (!FullRegNamesWithPercent || TT.getOS() == Triple::AIX) return false; switch (RegName[0]) { @@ -545,7 +599,7 @@ bool PPCInstPrinter::showRegistersWithPercentPrefix(const char *RegName) const { const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, unsigned RegEncoding) const { - if (!TT.isOSDarwin() && !FullRegNames) + if (!FullRegNames) return nullptr; if (RegNum < PPC::CR0EQ || RegNum > PPC::CR7UN) return nullptr; @@ -567,11 +621,11 @@ const char *PPCInstPrinter::getVerboseConditionRegName(unsigned RegNum, bool PPCInstPrinter::showRegistersWithPrefix() const { if (TT.getOS() == Triple::AIX) return false; - return TT.isOSDarwin() || FullRegNamesWithPercent || FullRegNames; + return FullRegNamesWithPercent || FullRegNames; } void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { + const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { unsigned Reg = Op.getReg(); @@ -600,4 +654,3 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, assert(Op.isExpr() && "unknown operand kind in 
printOperand"); Op.getExpr()->print(O, &MAI); } - diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h index 9763aeceef94..5e9b01494416 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -36,45 +36,73 @@ public: const MCSubtargetInfo &STI, raw_ostream &O) override; // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); + std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; + void printInstruction(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); - bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &OS); void printCustomAliasOperand(const MCInst *MI, uint64_t Address, unsigned OpIdx, unsigned PrintMethodIdx, - raw_ostream &OS); + const MCSubtargetInfo &STI, raw_ostream &OS); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O, const char *Modifier = nullptr); - void printATBitsAsHint(const MCInst *MI, unsigned OpNo, raw_ostream &O); + const MCSubtargetInfo &STI, raw_ostream &O, + const char *Modifier = nullptr); + void printATBitsAsHint(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); - void printU1ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU3ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS5ImmOperand(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); - void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU7ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS34ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printImmZeroOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU1ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU2ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU3ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU4ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printS5ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU5ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU6ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU7ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU8ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU10ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU12ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printS16ImmOperand(const MCInst *MI, unsigned 
OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printS34ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printU16ImmOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printImmZeroOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printBranchOperand(const MCInst *MI, uint64_t Address, unsigned OpNo, - raw_ostream &O); - void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O); + const MCSubtargetInfo &STI, raw_ostream &O); + void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printTLSCall(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); - void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printcrbitm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); - void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemRegImm34(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemRegImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMemRegImm34(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printMemRegReg(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 593dc2843c3d..2b76af279ce6 100644 --- 
a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -26,7 +26,8 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { if (is64Bit) { CodePointerSize = CalleeSaveStackSlotSize = 8; } - IsLittleEndian = T.getArch() == Triple::ppc64le; + IsLittleEndian = + T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle; // ".comm align is in bytes but .align is pow-2." AlignmentIsInBytes = false; @@ -56,7 +57,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { void PPCXCOFFMCAsmInfo::anchor() {} PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { - if (T.getArch() == Triple::ppc64le) + if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle) report_fatal_error("XCOFF is not supported for little-endian targets"); CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 27c687686641..48806051f581 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H #define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCASMINFO_H -#include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" #include "llvm/MC/MCAsmInfoXCOFF.h" diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index fb65e7320f2b..5f0769fd21f9 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -44,11 +44,13 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { const MCOperand &MO = MI.getOperand(OpNo); - if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); + if 
(MO.isReg() || MO.isImm()) + return getMachineOpValue(MI, MO, Fixups, STI); // Add a fixup for the branch target. Fixups.push_back(MCFixup::create(0, MO.getExpr(), - ((MI.getOpcode() == PPC::BL8_NOTOC) + ((MI.getOpcode() == PPC::BL8_NOTOC || + MI.getOpcode() == PPC::BL8_NOTOC_TLS) ? (MCFixupKind)PPC::fixup_ppc_br24_notoc : (MCFixupKind)PPC::fixup_ppc_br24))); return 0; @@ -92,6 +94,16 @@ getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo, return 0; } +unsigned +PPCMCCodeEmitter::getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + assert(MI.getOperand(OpNo).isReg() && "Operand should be a register"); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) + << 1; + return RegBits; +} + unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -104,20 +116,36 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo, return 0; } -uint64_t -PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +uint64_t PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const { const MCOperand &MO = MI.getOperand(OpNo); - if (MO.isReg() || MO.isImm()) + assert(!MO.isReg() && "Not expecting a register for this operand."); + if (MO.isImm()) return getMachineOpValue(MI, MO, Fixups, STI); // Add a fixup for the immediate field. 
- Fixups.push_back(MCFixup::create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_pcrel34)); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Fixup)); return 0; } +uint64_t +PPCMCCodeEmitter::getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_imm34); +} + +uint64_t +PPCMCCodeEmitter::getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + return getImm34Encoding(MI, OpNo, Fixups, STI, + (MCFixupKind)PPC::fixup_ppc_pcrel34); +} + unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -213,8 +241,13 @@ PPCMCCodeEmitter::getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo, (void)SRE; // Currently these are the only valid PCRelative Relocations. assert((SRE->getKind() == MCSymbolRefExpr::VK_PCREL || - SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL) && - "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL"); + SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL || + SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL || + SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL || + SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL) && + "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL or " + "VK_PPC_GOT_TLSGD_PCREL or VK_PPC_GOT_TLSLD_PCREL or " + "VK_PPC_GOT_TPREL_PCREL."); // Generate the fixup for the relocation. Fixups.push_back( MCFixup::create(0, Expr, @@ -326,8 +359,12 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, // Add a fixup for the TLS register, which simply provides a relocation // hint to the linker that this statement is part of a relocation sequence. - // Return the thread-pointer register's encoding. 
- Fixups.push_back(MCFixup::create(0, MO.getExpr(), + // Return the thread-pointer register's encoding. Add a one byte displacement + // if using PC relative memops. + const MCExpr *Expr = MO.getExpr(); + const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr); + bool IsPCRel = SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS_PCREL; + Fixups.push_back(MCFixup::create(IsPCRel ? 1 : 0, Expr, (MCFixupKind)PPC::fixup_ppc_nofixup)); const Triple &TT = STI.getTargetTriple(); bool isPPC64 = TT.isPPC64(); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h index 588aa76bd806..347e163c9515 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h @@ -52,7 +52,14 @@ public: const MCSubtargetInfo &STI) const; uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI, + MCFixupKind Fixup) const; + uint64_t getImm34EncodingNoPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t getImm34EncodingPCRel(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; @@ -86,6 +93,9 @@ public: unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getVSRpEvenEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; /// getMachineOpValue - Return binary encoding of operand. If the machine /// operand requires relocation, record the relocation and return zero. 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 3092d56da1c5..bf9c6feb541e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -20,8 +20,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" @@ -30,6 +30,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -77,7 +78,17 @@ static MCRegisterInfo *createPPCMCRegisterInfo(const Triple &TT) { static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { - return createPPCMCSubtargetInfoImpl(TT, CPU, FS); + // Set some default feature to MC layer. 
+ std::string FullFS = std::string(FS); + + if (TT.isOSAIX()) { + if (!FullFS.empty()) + FullFS = "+aix," + FullFS; + else + FullFS = "+aix"; + } + + return createPPCMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FullFS); } static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, @@ -122,11 +133,12 @@ public: void emitTCEntry(const MCSymbol &S) override { if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) { MCSymbolXCOFF *TCSym = - cast<MCSymbolXCOFF>(Streamer.getContext().getOrCreateSymbol( - XSym->getSymbolTableName() + "[TC]")); + cast<MCSectionXCOFF>(Streamer.getCurrentSectionOnly()) + ->getQualNameSymbol(); + OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n'; + if (TCSym->hasRename()) Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName()); - OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n'; return; } @@ -334,8 +346,8 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, } extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() { - for (Target *T : - {&getThePPC32Target(), &getThePPC64Target(), &getThePPC64LETarget()}) { + for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(), + &getThePPC64Target(), &getThePPC64LETarget()}) { // Register the MC asm info. 
RegisterMCAsmInfoFn C(*T, createPPCMCAsmInfo); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 719e005d9813..03b316341717 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -124,6 +124,11 @@ static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) { #define GET_SUBTARGETINFO_ENUM #include "PPCGenSubtargetInfo.inc" +#define PPC_REGS0_7(X) \ + { \ + X##0, X##1, X##2, X##3, X##4, X##5, X##6, X##7 \ + } + #define PPC_REGS0_31(X) \ { \ X##0, X##1, X##2, X##3, X##4, X##5, X##6, X##7, X##8, X##9, X##10, X##11, \ @@ -156,10 +161,10 @@ using llvm::MCPhysReg; static const MCPhysReg RRegs[32] = PPC_REGS0_31(PPC::R); \ static const MCPhysReg XRegs[32] = PPC_REGS0_31(PPC::X); \ static const MCPhysReg FRegs[32] = PPC_REGS0_31(PPC::F); \ + static const MCPhysReg VSRpRegs[32] = PPC_REGS0_31(PPC::VSRp); \ static const MCPhysReg SPERegs[32] = PPC_REGS0_31(PPC::S); \ static const MCPhysReg VFRegs[32] = PPC_REGS0_31(PPC::VF); \ static const MCPhysReg VRegs[32] = PPC_REGS0_31(PPC::V); \ - static const MCPhysReg QFRegs[32] = PPC_REGS0_31(PPC::QF); \ static const MCPhysReg RRegsNoR0[32] = \ PPC_REGS_NO0_31(PPC::ZERO, PPC::R); \ static const MCPhysReg XRegsNoX0[32] = \ @@ -179,8 +184,6 @@ using llvm::MCPhysReg; PPC::CR5LT, PPC::CR5GT, PPC::CR5EQ, PPC::CR5UN, \ PPC::CR6LT, PPC::CR6GT, PPC::CR6EQ, PPC::CR6UN, \ PPC::CR7LT, PPC::CR7GT, PPC::CR7EQ, PPC::CR7UN}; \ - static const MCPhysReg CRRegs[8] = { \ - PPC::CR0, PPC::CR1, PPC::CR2, PPC::CR3, \ - PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7} - + static const MCPhysReg CRRegs[8] = PPC_REGS0_7(PPC::CR); \ + static const MCPhysReg ACCRegs[8] = PPC_REGS0_7(PPC::ACC) #endif // LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 
d672d54772e0..77b0331bb14c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -58,14 +58,19 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize( switch ((unsigned)Fixup.getKind()) { default: report_fatal_error("Unimplemented fixup kind."); - case PPC::fixup_ppc_half16: + case PPC::fixup_ppc_half16: { + const uint8_t SignAndSizeForHalf16 = EncodedSignednessIndicator | 15; switch (Modifier) { default: report_fatal_error("Unsupported modifier for half16 fixup."); case MCSymbolRefExpr::VK_None: - return {XCOFF::RelocationType::R_TOC, EncodedSignednessIndicator | 15}; + return {XCOFF::RelocationType::R_TOC, SignAndSizeForHalf16}; + case MCSymbolRefExpr::VK_PPC_U: + return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16}; + case MCSymbolRefExpr::VK_PPC_L: + return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16}; } - break; + } break; case PPC::fixup_ppc_br24: // Branches are 4 byte aligned, so the 24 bits we encode in // the instruction actually represents a 26 bit offset. 
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index d7e3519d5539..63531f72adfb 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -94,7 +94,7 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_3SLOTS_1C], (instregex "CMPRB(8)?$"), (instregex "TD(I)?$"), (instregex "TW(I)?$"), - (instregex "FCMPU(S|D)$"), + (instregex "FCMP(O|U)(S|D)$"), (instregex "XSTSTDC(S|D)P$"), FTDIV, FTSQRT, diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 7e0aa2c6061d..264582b244a7 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -20,17 +20,20 @@ #undef PPC namespace llvm { - class PPCTargetMachine; - class PassRegistry; - class FunctionPass; - class MachineInstr; - class MachineOperand; - class AsmPrinter; - class MCInst; - class MCOperand; - class ModulePass; - - FunctionPass *createPPCCTRLoops(); +class PPCRegisterBankInfo; +class PPCSubtarget; +class PPCTargetMachine; +class PassRegistry; +class FunctionPass; +class InstructionSelector; +class MachineInstr; +class MachineOperand; +class AsmPrinter; +class MCInst; +class MCOperand; +class ModulePass; + +FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif @@ -44,7 +47,6 @@ namespace llvm { FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCBranchCoalescingPass(); - FunctionPass *createPPCQPXLoadSplatPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL); FunctionPass *createPPCTLSDynamicCallPass(); FunctionPass *createPPCBoolRetToIntPass(); @@ -68,7 +70,6 @@ namespace llvm { void initializePPCReduceCRLogicalsPass(PassRegistry&); void initializePPCBSelPass(PassRegistry&); void initializePPCBranchCoalescingPass(PassRegistry&); - void initializePPCQPXLoadSplatPass(PassRegistry&); void initializePPCBoolRetToIntPass(PassRegistry&); void 
initializePPCExpandISELPass(PassRegistry &); void initializePPCPreEmitPeepholePass(PassRegistry &); @@ -80,7 +81,10 @@ namespace llvm { ModulePass *createPPCLowerMASSVEntriesPass(); void initializePPCLowerMASSVEntriesPass(PassRegistry &); extern char &PPCLowerMASSVEntriesID; - + + InstructionSelector * + createPPCInstructionSelector(const PPCTargetMachine &, const PPCSubtarget &, + const PPCRegisterBankInfo &); namespace PPCII { /// Target Operand Flag enum. @@ -107,6 +111,37 @@ namespace llvm { /// produce the relocation @got@pcrel. Fixup is VK_PPC_GOT_PCREL. MO_GOT_FLAG = 8, + // MO_PCREL_OPT_FLAG - If this bit is set the operand is part of a + // PC Relative linker optimization. + MO_PCREL_OPT_FLAG = 16, + + /// MO_TLSGD_FLAG - If this bit is set the symbol reference is relative to + /// TLS General Dynamic model. + MO_TLSGD_FLAG = 32, + + /// MO_TPREL_FLAG - If this bit is set the symbol reference is relative to + /// TLS Initial Exec model. + MO_TPREL_FLAG = 64, + + /// MO_TLSLD_FLAG - If this bit is set the symbol reference is relative to + /// TLS Local Dynamic model. + MO_TLSLD_FLAG = 128, + + /// MO_GOT_TLSGD_PCREL_FLAG - A combintaion of flags, if these bits are set + /// they should produce the relocation @got@tlsgd@pcrel. + /// Fix up is VK_PPC_GOT_TLSGD_PCREL + MO_GOT_TLSGD_PCREL_FLAG = MO_PCREL_FLAG | MO_GOT_FLAG | MO_TLSGD_FLAG, + + /// MO_GOT_TLSLD_PCREL_FLAG - A combintaion of flags, if these bits are set + /// they should produce the relocation @got@tlsld@pcrel. + /// Fix up is VK_PPC_GOT_TLSLD_PCREL + MO_GOT_TLSLD_PCREL_FLAG = MO_PCREL_FLAG | MO_GOT_FLAG | MO_TLSLD_FLAG, + + /// MO_GOT_TPREL_PCREL_FLAG - A combintaion of flags, if these bits are set + /// they should produce the relocation @got@tprel@pcrel. + /// Fix up is VK_PPC_GOT_TPREL_PCREL + MO_GOT_TPREL_PCREL_FLAG = MO_GOT_FLAG | MO_TPREL_FLAG | MO_PCREL_FLAG, + /// The next are not flags but distinct values. 
MO_ACCESS_MASK = 0xf00, diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 9ad78bf67fe6..1e6ded231585 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -57,6 +57,10 @@ def DirectivePwrFuture def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; +def AIXOS: SubtargetFeature<"aix", "IsAIX", "true", "AIX OS">; +def FeatureModernAIXAs + : SubtargetFeature<"modern-aix-as", "HasModernAIXAs", "true", + "AIX system assembler is modern enough to support new mnes">; def FeatureHardFloat : SubtargetFeature<"hard-float", "HasHardFloat", "true", "Enable floating-point instructions">; def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", @@ -72,6 +76,9 @@ def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true", def FeatureSPE : SubtargetFeature<"spe","HasSPE", "true", "Enable SPE instructions", [FeatureHardFloat]>; +def FeatureEFPU2 : SubtargetFeature<"efpu2", "HasEFPU2", "true", + "Enable Embedded Floating-Point APU 2 instructions", + [FeatureSPE]>; def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true", "Enable the MFOCRF instruction">; def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true", @@ -132,9 +139,6 @@ def FeaturePPC4xx : SubtargetFeature<"ppc4xx", "IsPPC4xx", "true", "Enable PPC 4xx instructions">; def FeaturePPC6xx : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true", "Enable PPC 6xx instructions">; -def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true", - "Enable QPX instructions", - [FeatureFPU]>; def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true", "Enable VSX instructions", [FeatureAltivec]>; @@ -177,6 +181,9 @@ def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load", "HasAddisLoadFusion", "true", "Power8 Addis-Load fusion", [FeatureFusion]>; +def FeatureStoreFusion : SubtargetFeature<"fuse-store", "HasStoreFusion", "true", + "Target supports store clustering", + [FeatureFusion]>; def 
FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; @@ -193,7 +200,7 @@ def FeatureFloat128 : def FeaturePOPCNTD : SubtargetFeature<"popcntd","HasPOPCNTD", "POPCNTD_Fast", "Enable the popcnt[dw] instructions">; -// Note that for the a2/a2q processor models we should not use popcnt[dw] by +// Note that for the a2 processor models we should not use popcnt[dw] by // default. These processors do support the instructions, but they're // microcoded, and the software emulation is about twice as fast. def FeatureSlowPOPCNTD : SubtargetFeature<"slow-popcntd","HasPOPCNTD", @@ -236,7 +243,15 @@ def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs", def FeaturePCRelativeMemops : SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true", "Enable PC relative Memory Ops", + [FeatureISA3_0, FeaturePrefixInstrs]>; +def FeaturePairedVectorMemops: + SubtargetFeature<"paired-vector-memops", "PairedVectorMemops", "true", + "32Byte load and store instructions", [FeatureISA3_0]>; +def FeatureMMA : SubtargetFeature<"mma", "HasMMA", "true", + "Enable MMA instructions", + [FeatureP8Vector, FeatureP9Altivec, + FeaturePairedVectorMemops]>; def FeaturePredictableSelectIsExpensive : SubtargetFeature<"predictable-select-expensive", @@ -320,6 +335,8 @@ def ProcessorFeatures { [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, + FeaturePPCPreRASched, + FeaturePPCPostRASched, FeatureISA3_0, FeaturePredictableSelectIsExpensive ]; @@ -329,9 +346,7 @@ def ProcessorFeatures { // dispatch for vector operations than scalar ones. For the time being, // this list also includes scheduling-related features since we do not have // enough info to create custom scheduling strategies for future CPUs. 
- list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, - FeaturePPCPreRASched, - FeaturePPCPostRASched]; + list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits]; list<SubtargetFeature> P9InheritableFeatures = !listconcat(P8InheritableFeatures, P9AdditionalFeatures); list<SubtargetFeature> P9Features = @@ -340,9 +355,12 @@ def ProcessorFeatures { // Power10 // For P10 CPU we assume that all of the existing features from Power9 // still exist with the exception of those we know are Power9 specific. + list<SubtargetFeature> FusionFeatures = [FeatureStoreFusion]; list<SubtargetFeature> P10AdditionalFeatures = - [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, - FeaturePCRelativeMemops, FeatureP10Vector]; + !listconcat(FusionFeatures, [ + DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs, + FeaturePCRelativeMemops, FeatureP10Vector, FeatureMMA, + FeaturePairedVectorMemops]); list<SubtargetFeature> P10SpecificFeatures = []; list<SubtargetFeature> P10InheritableFeatures = !listconcat(P9InheritableFeatures, P10AdditionalFeatures); @@ -427,6 +445,7 @@ def getAltVSXFMAOpcode : InstrMapping { include "PPCRegisterInfo.td" include "PPCSchedule.td" +include "GISel/PPCRegisterBanks.td" //===----------------------------------------------------------------------===// // PowerPC processors supported. 
@@ -514,15 +533,6 @@ def : ProcessorModel<"a2", PPCA2Model, FeatureFPRND, FeatureFPCVT, FeatureISEL, FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"a2q", PPCA2Model, - [DirectiveA2, FeatureICBT, FeatureBookE, FeatureMFOCRF, - FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES, - FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec, - FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeatureSlowPOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, FeatureQPX, - FeatureMFTB]>; def : ProcessorModel<"pwr3", G5Model, [DirectivePwr3, FeatureAltivec, FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF, @@ -561,7 +571,7 @@ def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>; def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>; def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>; // No scheduler model yet. -def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>; +def : ProcessorModel<"pwr10", P9Model, ProcessorFeatures.P10Features>; // No scheduler model for future CPU. def : ProcessorModel<"future", NoSchedModel, ProcessorFeatures.FutureFeatures>; @@ -592,6 +602,13 @@ def PPCInstrInfo : InstrInfo { let noNamedPositionallyEncodedOperands = 1; } +def PPCAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int PassSubtarget = 1; + int Variant = 0; + bit isMCAsmWriter = 1; +} + def PPCAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; } @@ -610,6 +627,7 @@ def PPC : Target { // Information about the instructions. 
let InstructionSet = PPCInstrInfo; + let AssemblyWriters = [PPCAsmWriter]; let AssemblyParsers = [PPCAsmParser]; let AssemblyParserVariants = [PPCAsmParserVariant]; let AllowRegisterRenaming = 1; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index bf5fe741bac8..cce21f32414a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -27,11 +27,11 @@ #include "PPCTargetStreamer.h" #include "TargetInfo/PowerPCTargetInfo.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/BinaryFormat/MachO.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -47,11 +47,11 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -62,9 +62,11 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Process.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -72,6 +74,7 @@ #include <new> using namespace llvm; +using namespace llvm::XCOFF; #define DEBUG_TYPE "asmprinter" @@ -147,7 +150,21 @@ public: class PPCAIXAsmPrinter : public PPCAsmPrinter { private: + /// Symbols lowered from ExternalSymbolSDNodes, we will need to emit extern + /// linkage for them in AIX. 
+ SmallPtrSet<MCSymbol *, 8> ExtSymSDNodeSymbols; + + /// A format indicator and unique trailing identifier to form part of the + /// sinit/sterm function names. + std::string FormatIndicatorAndUniqueModId; + static void ValidateGV(const GlobalVariable *GV); + // Record a list of GlobalAlias associated with a GlobalObject. + // This is used for AIX's extra-label-at-definition aliasing strategy. + DenseMap<const GlobalObject *, SmallVector<const GlobalAlias *, 1>> + GOAliasMap; + + void emitTracebackTable(); public: PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -161,15 +178,28 @@ public: bool doInitialization(Module &M) override; + void emitXXStructorList(const DataLayout &DL, const Constant *List, + bool IsCtor) override; + void SetupMachineFunction(MachineFunction &MF) override; void emitGlobalVariable(const GlobalVariable *GV) override; void emitFunctionDescriptor() override; + void emitFunctionEntryLabel() override; + + void emitFunctionBodyEnd() override; + void emitEndOfAsmFile(Module &) override; void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override; + + void emitInstruction(const MachineInstr *MI) override; + + bool doFinalization(Module &M) override; + + void emitTTypeReference(const GlobalValue *GV, unsigned Encoding) override; }; } // end anonymous namespace @@ -463,6 +493,14 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, StringRef Name = "__tls_get_addr"; MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol(Name); MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + unsigned Opcode = PPC::BL8_NOP_TLS; + + assert(MI->getNumOperands() >= 3 && "Expecting at least 3 operands from MI"); + if (MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG || + MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG) { + Kind = MCSymbolRefExpr::VK_PPC_NOTOC; + Opcode = PPC::BL8_NOTOC_TLS; + } const Module *M = MF->getFunction().getParent(); assert(MI->getOperand(0).isReg() 
&& @@ -490,10 +528,10 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, VK, OutContext); EmitToStreamer(*OutStreamer, - MCInstBuilder(Subtarget->isPPC64() ? - PPC::BL8_NOP_TLS : PPC::BL_TLS) - .addExpr(TlsRef) - .addExpr(SymVar)); + MCInstBuilder(Subtarget->isPPC64() ? Opcode + : (unsigned)PPC::BL_TLS) + .addExpr(TlsRef) + .addExpr(SymVar)); } /// Map a machine operand for a TOC pseudo-machine instruction to its @@ -533,9 +571,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { if (Subtarget->hasSPE()) { if (PPC::F4RCRegClass.contains(Reg) || PPC::F8RCRegClass.contains(Reg) || - PPC::QBRCRegClass.contains(Reg) || - PPC::QFRCRegClass.contains(Reg) || - PPC::QSRCRegClass.contains(Reg) || PPC::VFRCRegClass.contains(Reg) || PPC::VRRCRegClass.contains(Reg) || PPC::VSFRCRegClass.contains(Reg) || @@ -550,6 +585,38 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { } } #endif + + auto getTOCRelocAdjustedExprForXCOFF = [this](const MCExpr *Expr, + ptrdiff_t OriginalOffset) { + // Apply an offset to the TOC-based expression such that the adjusted + // notional offset from the TOC base (to be encoded into the instruction's D + // or DS field) is the signed 16-bit truncation of the original notional + // offset from the TOC base. + // This is consistent with the treatment used both by XL C/C++ and + // by AIX ld -r. + ptrdiff_t Adjustment = + OriginalOffset - llvm::SignExtend32<16>(OriginalOffset); + return MCBinaryExpr::createAdd( + Expr, MCConstantExpr::create(-Adjustment, OutContext), OutContext); + }; + + auto getTOCEntryLoadingExprForXCOFF = + [IsPPC64, getTOCRelocAdjustedExprForXCOFF, + this](const MCSymbol *MOSymbol, const MCExpr *Expr) -> const MCExpr * { + const unsigned EntryByteSize = IsPPC64 ? 
8 : 4; + const auto TOCEntryIter = TOC.find(MOSymbol); + assert(TOCEntryIter != TOC.end() && + "Could not find the TOC entry for this symbol."); + const ptrdiff_t EntryDistanceFromTOCBase = + (TOCEntryIter - TOC.begin()) * EntryByteSize; + constexpr int16_t PositiveTOCRange = INT16_MAX; + + if (EntryDistanceFromTOCBase > PositiveTOCRange) + return getTOCRelocAdjustedExprForXCOFF(Expr, EntryDistanceFromTOCBase); + + return Expr; + }; + // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; @@ -696,6 +763,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert( TM.getCodeModel() == CodeModel::Small && "This pseudo should only be selected for 32-bit small code model."); + Exp = getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp); TmpInst.getOperand(1) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -724,17 +792,20 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand!"); + // Map the operand to its corresponding MCSymbol. + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + // Map the machine operand to its corresponding MCSymbol, then map the // global address operand to be a reference to the TOC entry we will // synthesize later. - MCSymbol *TOCEntry = - lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol); const MCSymbolRefExpr::VariantKind VK = IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, VK, OutContext); - TmpInst.getOperand(1) = MCOperand::createExpr(Exp); + TmpInst.getOperand(1) = MCOperand::createExpr( + IsAIX ? 
getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp) : Exp); EmitToStreamer(*OutStreamer, TmpInst); return; } @@ -1010,6 +1081,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::GETtlsADDR: // Transform: %x3 = GETtlsADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsgd) + case PPC::GETtlsADDRPCREL: case PPC::GETtlsADDR32: { // Transform: %r3 = GETtlsADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT @@ -1055,6 +1127,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::GETtlsldADDR: // Transform: %x3 = GETtlsldADDR %x3, @sym // Into: BL8_NOP_TLS __tls_get_addr(sym at tlsld) + case PPC::GETtlsldADDRPCREL: case PPC::GETtlsldADDR32: { // Transform: %r3 = GETtlsldADDR32 %r3, @sym // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT @@ -1081,6 +1154,21 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { .addExpr(SymDtprel)); return; } + case PPC::PADDIdtprel: { + // Transform: %rd = PADDIdtprel %rs, @sym + // Into: %rd = PADDI8 %rs, sym@dtprel + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = getSymbol(GValue); + const MCExpr *SymDtprel = MCSymbolRefExpr::create( + MOSymbol, MCSymbolRefExpr::VK_DTPREL, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::PADDI8) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymDtprel)); + return; + } + case PPC::ADDIdtprelL: // Transform: %xd = ADDIdtprelL %xs, @sym // Into: %xd = ADDI8 %xs, sym@dtprel@l @@ -1137,10 +1225,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { case PPC::LWA: { // Verify alignment is legal, so we don't create relocations // that can't be supported. - // FIXME: This test is currently disabled for Darwin. The test - // suite shows a handful of test cases that fail this check for - // Darwin. Those need to be investigated before this sanity test - // can be enabled for those subtargets. 
unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1; const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isGlobal()) { @@ -1621,17 +1705,19 @@ void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV, assert(LinkageAttr != MCSA_Invalid && "LinkageAttr should not MCSA_Invalid."); MCSymbolAttr VisibilityAttr = MCSA_Invalid; - switch (GV->getVisibility()) { + if (!TM.getIgnoreXCOFFVisibility()) { + switch (GV->getVisibility()) { - // TODO: "exported" and "internal" Visibility needs to go here. - case GlobalValue::DefaultVisibility: - break; - case GlobalValue::HiddenVisibility: - VisibilityAttr = MAI->getHiddenVisibilityAttr(); - break; - case GlobalValue::ProtectedVisibility: - VisibilityAttr = MAI->getProtectedVisibilityAttr(); - break; + // TODO: "exported" and "internal" Visibility needs to go here. + case GlobalValue::DefaultVisibility: + break; + case GlobalValue::HiddenVisibility: + VisibilityAttr = MAI->getHiddenVisibilityAttr(); + break; + case GlobalValue::ProtectedVisibility: + VisibilityAttr = MAI->getProtectedVisibilityAttr(); + break; + } } OutStreamer->emitXCOFFSymbolLinkageWithVisibility(GVSym, LinkageAttr, @@ -1650,18 +1736,305 @@ void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { return AsmPrinter::SetupMachineFunction(MF); } +void PPCAIXAsmPrinter::emitFunctionBodyEnd() { + + if (!TM.getXCOFFTracebackTable()) + return; + + emitTracebackTable(); +} + +void PPCAIXAsmPrinter::emitTracebackTable() { + + // Create a symbol for the end of function. + MCSymbol *FuncEnd = createTempSymbol(MF->getName()); + OutStreamer->emitLabel(FuncEnd); + + OutStreamer->AddComment("Traceback table begin"); + // Begin with a fullword of zero. 
+ OutStreamer->emitIntValueInHexWithPadding(0, 4 /*size*/); + + SmallString<128> CommentString; + raw_svector_ostream CommentOS(CommentString); + + auto EmitComment = [&]() { + OutStreamer->AddComment(CommentOS.str()); + CommentString.clear(); + }; + + auto EmitCommentAndValue = [&](uint64_t Value, int Size) { + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding(Value, Size); + }; + + unsigned int Version = 0; + CommentOS << "Version = " << Version; + EmitCommentAndValue(Version, 1); + + // There is a lack of information in the IR to assist with determining the + // source language. AIX exception handling mechanism would only search for + // personality routine and LSDA area when such language supports exception + // handling. So to be conservatively correct and allow runtime to do its job, + // we need to set it to C++ for now. + TracebackTable::LanguageID LanguageIdentifier = + TracebackTable::CPlusPlus; // C++ + + CommentOS << "Language = " + << getNameForTracebackTableLanguageId(LanguageIdentifier); + EmitCommentAndValue(LanguageIdentifier, 1); + + // This is only populated for the third and fourth bytes. + uint32_t FirstHalfOfMandatoryField = 0; + + // Emit the 3rd byte of the mandatory field. + + // We always set traceback offset bit to true. + FirstHalfOfMandatoryField |= TracebackTable::HasTraceBackTableOffsetMask; + + const PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Check the function uses floating-point processor instructions or not + for (unsigned Reg = PPC::F0; Reg <= PPC::F31; ++Reg) { + if (MRI.isPhysRegUsed(Reg)) { + FirstHalfOfMandatoryField |= TracebackTable::IsFloatingPointPresentMask; + break; + } + } + +#define GENBOOLCOMMENT(Prefix, V, Field) \ + CommentOS << (Prefix) << ((V) & (TracebackTable::Field##Mask) ? 
"+" : "-") \ + << #Field + +#define GENVALUECOMMENT(PrefixAndName, V, Field) \ + CommentOS << (PrefixAndName) << " = " \ + << static_cast<unsigned>(((V) & (TracebackTable::Field##Mask)) >> \ + (TracebackTable::Field##Shift)) + + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsGlobaLinkage); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsOutOfLineEpilogOrPrologue); + EmitComment(); + + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, HasTraceBackTableOffset); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsInternalProcedure); + EmitComment(); + + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, HasControlledStorage); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsTOCless); + EmitComment(); + + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsFloatingPointPresent); + EmitComment(); + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, + IsFloatingPointOperationLogOrAbortEnabled); + EmitComment(); + + OutStreamer->emitIntValueInHexWithPadding( + (FirstHalfOfMandatoryField & 0x0000ff00) >> 8, 1); + + // Set the 4th byte of the mandatory field. + FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask; + + static_assert(XCOFF::AllocRegNo == 31, "Unexpected register usage!"); + if (MRI.isPhysRegUsed(Subtarget->isPPC64() ? 
PPC::X31 : PPC::R31)) + FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask; + + const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs(); + if (!MustSaveCRs.empty()) + FirstHalfOfMandatoryField |= TracebackTable::IsCRSavedMask; + + if (FI->mustSaveLR()) + FirstHalfOfMandatoryField |= TracebackTable::IsLRSavedMask; + + GENBOOLCOMMENT("", FirstHalfOfMandatoryField, IsInterruptHandler); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsFunctionNamePresent); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsAllocaUsed); + EmitComment(); + GENVALUECOMMENT("OnConditionDirective", FirstHalfOfMandatoryField, + OnConditionDirective); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsCRSaved); + GENBOOLCOMMENT(", ", FirstHalfOfMandatoryField, IsLRSaved); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding((FirstHalfOfMandatoryField & 0xff), + 1); + + // Set the 5th byte of mandatory field. + uint32_t SecondHalfOfMandatoryField = 0; + + // Always store back chain. + SecondHalfOfMandatoryField |= TracebackTable::IsBackChainStoredMask; + + uint32_t FPRSaved = 0; + for (unsigned Reg = PPC::F14; Reg <= PPC::F31; ++Reg) { + if (MRI.isPhysRegModified(Reg)) { + FPRSaved = PPC::F31 - Reg + 1; + break; + } + } + SecondHalfOfMandatoryField |= (FPRSaved << TracebackTable::FPRSavedShift) & + TracebackTable::FPRSavedMask; + GENBOOLCOMMENT("", SecondHalfOfMandatoryField, IsBackChainStored); + GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, IsFixup); + GENVALUECOMMENT(", NumOfFPRsSaved", SecondHalfOfMandatoryField, FPRSaved); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding( + (SecondHalfOfMandatoryField & 0xff000000) >> 24, 1); + + // Set the 6th byte of mandatory field. + bool ShouldEmitEHBlock = TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF); + if (ShouldEmitEHBlock) + SecondHalfOfMandatoryField |= TracebackTable::HasExtensionTableMask; + + uint32_t GPRSaved = 0; + + // X13 is reserved under 64-bit environment. 
+ unsigned GPRBegin = Subtarget->isPPC64() ? PPC::X14 : PPC::R13; + unsigned GPREnd = Subtarget->isPPC64() ? PPC::X31 : PPC::R31; + + for (unsigned Reg = GPRBegin; Reg <= GPREnd; ++Reg) { + if (MRI.isPhysRegModified(Reg)) { + GPRSaved = GPREnd - Reg + 1; + break; + } + } + + SecondHalfOfMandatoryField |= (GPRSaved << TracebackTable::GPRSavedShift) & + TracebackTable::GPRSavedMask; + + GENBOOLCOMMENT("", SecondHalfOfMandatoryField, HasVectorInfo); + GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasExtensionTable); + GENVALUECOMMENT(", NumOfGPRsSaved", SecondHalfOfMandatoryField, GPRSaved); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding( + (SecondHalfOfMandatoryField & 0x00ff0000) >> 16, 1); + + // Set the 7th byte of mandatory field. + uint32_t NumberOfFixedPara = FI->getFixedParamNum(); + SecondHalfOfMandatoryField |= + (NumberOfFixedPara << TracebackTable::NumberOfFixedParmsShift) & + TracebackTable::NumberOfFixedParmsMask; + GENVALUECOMMENT("NumberOfFixedParms", SecondHalfOfMandatoryField, + NumberOfFixedParms); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding( + (SecondHalfOfMandatoryField & 0x0000ff00) >> 8, 1); + + // Set the 8th byte of mandatory field. + + // Always set parameter on stack. + SecondHalfOfMandatoryField |= TracebackTable::HasParmsOnStackMask; + + uint32_t NumberOfFPPara = FI->getFloatingPointParamNum(); + SecondHalfOfMandatoryField |= + (NumberOfFPPara << TracebackTable::NumberOfFloatingPointParmsShift) & + TracebackTable::NumberOfFloatingPointParmsMask; + + GENVALUECOMMENT("NumberOfFPParms", SecondHalfOfMandatoryField, + NumberOfFloatingPointParms); + GENBOOLCOMMENT(", ", SecondHalfOfMandatoryField, HasParmsOnStack); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding(SecondHalfOfMandatoryField & 0xff, + 1); + + // Generate the optional fields of traceback table. + + // Parameter type. 
+ if (NumberOfFixedPara || NumberOfFPPara) { + assert((SecondHalfOfMandatoryField & TracebackTable::HasVectorInfoMask) == + 0 && + "VectorInfo has not been implemented."); + uint32_t ParaType = FI->getParameterType(); + CommentOS << "Parameter type = " + << XCOFF::parseParmsType(ParaType, + NumberOfFixedPara + NumberOfFPPara); + EmitComment(); + OutStreamer->emitIntValueInHexWithPadding(ParaType, sizeof(ParaType)); + } + + // Traceback table offset. + OutStreamer->AddComment("Function size"); + if (FirstHalfOfMandatoryField & TracebackTable::HasTraceBackTableOffsetMask) { + MCSymbol *FuncSectSym = getObjFileLowering().getFunctionEntryPointSymbol( + &(MF->getFunction()), TM); + OutStreamer->emitAbsoluteSymbolDiff(FuncEnd, FuncSectSym, 4); + } + + // Since we unset the Int_Handler. + if (FirstHalfOfMandatoryField & TracebackTable::IsInterruptHandlerMask) + report_fatal_error("Hand_Mask not implement yet"); + + if (FirstHalfOfMandatoryField & TracebackTable::HasControlledStorageMask) + report_fatal_error("Ctl_Info not implement yet"); + + if (FirstHalfOfMandatoryField & TracebackTable::IsFunctionNamePresentMask) { + StringRef Name = MF->getName().substr(0, INT16_MAX); + int16_t NameLength = Name.size(); + CommentOS << "Function name len = " + << static_cast<unsigned int>(NameLength); + EmitCommentAndValue(NameLength, 2); + OutStreamer->AddComment("Function Name"); + OutStreamer->emitBytes(Name); + } + + if (FirstHalfOfMandatoryField & TracebackTable::IsAllocaUsedMask) { + uint8_t AllocReg = XCOFF::AllocRegNo; + OutStreamer->AddComment("AllocaUsed"); + OutStreamer->emitIntValueInHex(AllocReg, sizeof(AllocReg)); + } + + uint8_t ExtensionTableFlag = 0; + if (SecondHalfOfMandatoryField & TracebackTable::HasExtensionTableMask) { + if (ShouldEmitEHBlock) + ExtensionTableFlag |= ExtendedTBTableFlag::TB_EH_INFO; + + CommentOS << "ExtensionTableFlag = " + << getExtendedTBTableFlagString(ExtensionTableFlag); + EmitCommentAndValue(ExtensionTableFlag, sizeof(ExtensionTableFlag)); 
+ } + + if (ExtensionTableFlag & ExtendedTBTableFlag::TB_EH_INFO) { + auto &Ctx = OutStreamer->getContext(); + MCSymbol *EHInfoSym = + TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(MF); + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(EHInfoSym); + const MCSymbol *TOCBaseSym = + cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); + const MCExpr *Exp = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), + MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx); + + const DataLayout &DL = getDataLayout(); + OutStreamer->emitValueToAlignment(4); + OutStreamer->AddComment("EHInfo Table"); + OutStreamer->emitValue(Exp, DL.getPointerSize()); + } + +#undef GENBOOLCOMMENT +#undef GENVALUECOMMENT +} + void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) { // Early error checking limiting what is supported. if (GV->isThreadLocal()) report_fatal_error("Thread local not yet supported on AIX."); - if (GV->hasSection()) - report_fatal_error("Custom section for Data not yet supported."); - if (GV->hasComdat()) report_fatal_error("COMDAT not yet supported by AIX."); } +static bool isSpecialLLVMGlobalArrayToSkip(const GlobalVariable *GV) { + return GV->hasAppendingLinkage() && + StringSwitch<bool>(GV->getName()) + // TODO: Linker could still eliminate the GV if we just skip + // handling llvm.used array. Skipping them for now until we or the + // AIX OS team come up with a good solution. + .Case("llvm.used", true) + // It's correct to just skip llvm.compiler.used array here. 
+ .Case("llvm.compiler.used", true) + .Default(false); +} + static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { return StringSwitch<bool>(GV->getName()) .Cases("llvm.global_ctors", "llvm.global_dtors", true) @@ -1669,19 +2042,15 @@ static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) { } void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { - ValidateGV(GV); - - // TODO: Update the handling of global arrays for static init when we support - // the ".ref" directive. - // Otherwise, we can skip these arrays, because the AIX linker collects - // static init functions simply based on their name. - if (isSpecialLLVMGlobalArrayForStaticInit(GV)) + // Special LLVM global arrays have been handled at the initialization. + if (isSpecialLLVMGlobalArrayToSkip(GV) || isSpecialLLVMGlobalArrayForStaticInit(GV)) return; - // Create the symbol, set its storage class. + assert(!GV->getName().startswith("llvm.") && + "Unhandled intrinsic global variable."); + ValidateGV(GV); + MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV)); - GVSym->setStorageClass( - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); if (GV->isDeclarationForLinker()) { emitLinkage(GV, GVSym); @@ -1705,10 +2074,12 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (GVKind.isCommon() || GVKind.isBSSLocal()) { Align Alignment = GV->getAlign().getValueOr(DL.getPreferredAlign(GV)); uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + GVSym->setStorageClass( + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); if (GVKind.isBSSLocal()) OutStreamer->emitXCOFFLocalCommonSymbol( - OutContext.getOrCreateSymbol(GVSym->getUnqualifiedName()), Size, + OutContext.getOrCreateSymbol(GVSym->getSymbolTableName()), Size, GVSym, Alignment.value()); else OutStreamer->emitCommonSymbol(GVSym, Size, Alignment.value()); @@ -1718,7 +2089,18 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const 
GlobalVariable *GV) { MCSymbol *EmittedInitSym = GVSym; emitLinkage(GV, EmittedInitSym); emitAlignment(getGVAlignment(GV, DL), GV); - OutStreamer->emitLabel(EmittedInitSym); + + // When -fdata-sections is enabled, every GlobalVariable will + // be put into its own csect; therefore, label is not necessary here. + if (!TM.getDataSections() || GV->hasSection()) { + OutStreamer->emitLabel(EmittedInitSym); + } + + // Emit aliasing label for global variable. + llvm::for_each(GOAliasMap[GV], [this](const GlobalAlias *Alias) { + OutStreamer->emitLabel(getSymbol(Alias)); + }); + emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer()); } @@ -1730,6 +2112,13 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { // Emit function descriptor. OutStreamer->SwitchSection( cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect()); + + // Emit aliasing label for function descriptor csect. + llvm::for_each(GOAliasMap[&MF->getFunction()], + [this](const GlobalAlias *Alias) { + OutStreamer->emitLabel(getSymbol(Alias)); + }); + // Emit function entry point address. OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), PointerSize); @@ -1745,6 +2134,20 @@ void PPCAIXAsmPrinter::emitFunctionDescriptor() { OutStreamer->SwitchSection(Current.first, Current.second); } +void PPCAIXAsmPrinter::emitFunctionEntryLabel() { + // It's not necessary to emit the label when we have individual + // function in its own csect. + if (!TM.getFunctionSections()) + PPCAsmPrinter::emitFunctionEntryLabel(); + + // Emit aliasing label for function entry point label. + llvm::for_each( + GOAliasMap[&MF->getFunction()], [this](const GlobalAlias *Alias) { + OutStreamer->emitLabel( + getObjFileLowering().getFunctionEntryPointSymbol(Alias, TM)); + }); +} + void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { // If there are no functions in this module, we will never need to reference // the TOC base. 
@@ -1757,20 +2160,10 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { PPCTargetStreamer *TS = static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer()); - const unsigned EntryByteSize = Subtarget->isPPC64() ? 8 : 4; - const unsigned TOCEntriesByteSize = TOC.size() * EntryByteSize; - // TODO: If TOC entries' size is larger than 32768, then we run out of - // positive displacement to reach the TOC entry. We need to decide how to - // handle entries' size larger than that later. - if (TOCEntriesByteSize > 32767) { - report_fatal_error("Handling of TOC entry displacement larger than 32767 " - "is not yet implemented."); - } - for (auto &I : TOC) { // Setup the csect for the current TC entry. MCSectionXCOFF *TCEntry = cast<MCSectionXCOFF>( - getObjFileLowering().getSectionForTOCEntry(I.first)); + getObjFileLowering().getSectionForTOCEntry(I.first, TM)); OutStreamer->SwitchSection(TCEntry); OutStreamer->emitLabel(I.second); @@ -1780,10 +2173,6 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { } bool PPCAIXAsmPrinter::doInitialization(Module &M) { - if (M.alias_size() > 0u) - report_fatal_error( - "module has aliases, which LLVM does not yet support for AIX"); - const bool Result = PPCAsmPrinter::doInitialization(M); auto setCsectAlignment = [this](const GlobalObject *GO) { @@ -1803,19 +2192,174 @@ bool PPCAIXAsmPrinter::doInitialization(Module &M) { // We need to know, up front, the alignment of csects for the assembly path, // because once a .csect directive gets emitted, we could not change the // alignment value on it. - for (const auto &G : M.globals()) + for (const auto &G : M.globals()) { + if (isSpecialLLVMGlobalArrayToSkip(&G)) + continue; + + if (isSpecialLLVMGlobalArrayForStaticInit(&G)) { + // Generate a format indicator and a unique module id to be a part of + // the sinit and sterm function names. 
+ if (FormatIndicatorAndUniqueModId.empty()) { + std::string UniqueModuleId = getUniqueModuleId(&M); + if (UniqueModuleId != "") + // TODO: Use source file full path to generate the unique module id + // and add a format indicator as a part of function name in case we + // will support more than one format. + FormatIndicatorAndUniqueModId = "clang_" + UniqueModuleId.substr(1); + else + // Use the Pid and current time as the unique module id when we cannot + // generate one based on a module's strong external symbols. + // FIXME: Adjust the comment accordingly after we use source file full + // path instead. + FormatIndicatorAndUniqueModId = + "clangPidTime_" + llvm::itostr(sys::Process::getProcessId()) + + "_" + llvm::itostr(time(nullptr)); + } + + emitSpecialLLVMGlobal(&G); + continue; + } + setCsectAlignment(&G); + } for (const auto &F : M) setCsectAlignment(&F); + // Construct an aliasing list for each GlobalObject. + for (const auto &Alias : M.aliases()) { + const GlobalObject *Base = Alias.getBaseObject(); + if (!Base) + report_fatal_error( + "alias without a base object is not yet supported on AIX"); + GOAliasMap[Base].push_back(&Alias); + } + return Result; } -/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code -/// for a MachineFunction to the given output stream, in a format that the -/// Darwin assembler can deal with. 
-/// +void PPCAIXAsmPrinter::emitInstruction(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + break; + case PPC::BL8: + case PPC::BL: + case PPC::BL8_NOP: + case PPC::BL_NOP: { + const MachineOperand &MO = MI->getOperand(0); + if (MO.isSymbol()) { + MCSymbolXCOFF *S = + cast<MCSymbolXCOFF>(OutContext.getOrCreateSymbol(MO.getSymbolName())); + ExtSymSDNodeSymbols.insert(S); + } + } break; + case PPC::BL_TLS: + case PPC::BL8_TLS: + case PPC::BL8_TLS_: + case PPC::BL8_NOP_TLS: + report_fatal_error("TLS call not yet implemented"); + case PPC::TAILB: + case PPC::TAILB8: + case PPC::TAILBA: + case PPC::TAILBA8: + case PPC::TAILBCTR: + case PPC::TAILBCTR8: + if (MI->getOperand(0).isSymbol()) + report_fatal_error("Tail call for extern symbol not yet supported."); + break; + } + return PPCAsmPrinter::emitInstruction(MI); +} + +bool PPCAIXAsmPrinter::doFinalization(Module &M) { + for (MCSymbol *Sym : ExtSymSDNodeSymbols) + OutStreamer->emitSymbolAttribute(Sym, MCSA_Extern); + return PPCAsmPrinter::doFinalization(M); +} + +static unsigned mapToSinitPriority(int P) { + if (P < 0 || P > 65535) + report_fatal_error("invalid init priority"); + + if (P <= 20) + return P; + + if (P < 81) + return 20 + (P - 20) * 16; + + if (P <= 1124) + return 1004 + (P - 81); + + if (P < 64512) + return 2047 + (P - 1124) * 33878; + + return 2147482625u + (P - 64512); +} + +static std::string convertToSinitPriority(int Priority) { + // This helper function converts clang init priority to values used in sinit + // and sterm functions. + // + // The conversion strategies are: + // We map the reserved clang/gnu priority range [0, 100] into the sinit/sterm + // reserved priority range [0, 1023] by + // - directly mapping the first 21 and the last 20 elements of the ranges + // - linear interpolating the intermediate values with a step size of 16. 
+ // + // We map the non reserved clang/gnu priority range of [101, 65535] into the + // sinit/sterm priority range [1024, 2147483648] by: + // - directly mapping the first and the last 1024 elements of the ranges + // - linear interpolating the intermediate values with a step size of 33878. + unsigned int P = mapToSinitPriority(Priority); + + std::string PrioritySuffix; + llvm::raw_string_ostream os(PrioritySuffix); + os << llvm::format_hex_no_prefix(P, 8); + os.flush(); + return PrioritySuffix; +} + +void PPCAIXAsmPrinter::emitXXStructorList(const DataLayout &DL, + const Constant *List, bool IsCtor) { + SmallVector<Structor, 8> Structors; + preprocessXXStructorList(DL, List, Structors); + if (Structors.empty()) + return; + + unsigned Index = 0; + for (Structor &S : Structors) { + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(S.Func)) + S.Func = CE->getOperand(0); + + llvm::GlobalAlias::create( + GlobalValue::ExternalLinkage, + (IsCtor ? llvm::Twine("__sinit") : llvm::Twine("__sterm")) + + llvm::Twine(convertToSinitPriority(S.Priority)) + + llvm::Twine("_", FormatIndicatorAndUniqueModId) + + llvm::Twine("_", llvm::utostr(Index++)), + cast<Function>(S.Func)); + } +} + +void PPCAIXAsmPrinter::emitTTypeReference(const GlobalValue *GV, + unsigned Encoding) { + if (GV) { + MCSymbol *TypeInfoSym = TM.getSymbol(GV); + MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(TypeInfoSym); + const MCSymbol *TOCBaseSym = + cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection()) + ->getQualNameSymbol(); + auto &Ctx = OutStreamer->getContext(); + const MCExpr *Exp = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCEntry, Ctx), + MCSymbolRefExpr::create(TOCBaseSym, Ctx), Ctx); + OutStreamer->emitValue(Exp, GetSizeOfEncodedValue(Encoding)); + } else + OutStreamer->emitIntValue(0, GetSizeOfEncodedValue(Encoding)); +} + +// Return a pass that prints the PPC assembly code for a MachineFunction to the +// given output stream. 
static AsmPrinter * createPPCAsmPrinterPass(TargetMachine &tm, std::unique_ptr<MCStreamer> &&Streamer) { @@ -1829,6 +2373,8 @@ createPPCAsmPrinterPass(TargetMachine &tm, extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getThePPC32Target(), createPPCAsmPrinterPass); + TargetRegistry::RegisterAsmPrinter(getThePPC32LETarget(), + createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getThePPC64Target(), createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getThePPC64LETarget(), diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp index 2259a29f838a..3c6b1f84b821 100644 --- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp +++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -59,7 +59,7 @@ using namespace llvm; namespace { -#define DEBUG_TYPE "bool-ret-to-int" +#define DEBUG_TYPE "ppc-bool-ret-to-int" STATISTIC(NumBoolRetPromotion, "Number of times a bool feeding a RetInst was promoted to an int"); @@ -75,12 +75,11 @@ class PPCBoolRetToInt : public FunctionPass { WorkList.push_back(V); Defs.insert(V); while (!WorkList.empty()) { - Value *Curr = WorkList.back(); - WorkList.pop_back(); + Value *Curr = WorkList.pop_back_val(); auto *CurrUser = dyn_cast<User>(Curr); - // Operands of CallInst are skipped because they may not be Bool type, - // and their positions are defined by ABI. - if (CurrUser && !isa<CallInst>(Curr)) + // Operands of CallInst/Constant are skipped because they may not be Bool + // type. For CallInst, their positions are defined by ABI. 
+ if (CurrUser && !isa<CallInst>(Curr) && !isa<Constant>(Curr)) for (auto &Op : CurrUser->operands()) if (Defs.insert(Op).second) WorkList.push_back(Op); @@ -90,6 +89,9 @@ class PPCBoolRetToInt : public FunctionPass { // Translate a i1 value to an equivalent i32/i64 value: Value *translate(Value *V) { + assert(V->getType() == Type::getInt1Ty(V->getContext()) && + "Expect an i1 value"); + Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext()) : Type::getInt32Ty(V->getContext()); @@ -252,9 +254,9 @@ class PPCBoolRetToInt : public FunctionPass { auto *First = dyn_cast<User>(Pair.first); auto *Second = dyn_cast<User>(Pair.second); assert((!First || Second) && "translated from user to non-user!?"); - // Operands of CallInst are skipped because they may not be Bool type, - // and their positions are defined by ABI. - if (First && !isa<CallInst>(First)) + // Operands of CallInst/Constant are skipped because they may not be Bool + // type. For CallInst, their positions are defined by ABI. + if (First && !isa<CallInst>(First) && !isa<Constant>(First)) for (unsigned i = 0; i < First->getNumOperands(); ++i) Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); } @@ -280,8 +282,8 @@ private: } // end anonymous namespace char PPCBoolRetToInt::ID = 0; -INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", - "Convert i1 constants to i32/i64 if they are returned", - false, false) +INITIALIZE_PASS(PPCBoolRetToInt, "ppc-bool-ret-to-int", + "Convert i1 constants to i32/i64 if they are returned", false, + false) FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/llvm/lib/Target/PowerPC/PPCCCState.cpp b/llvm/lib/Target/PowerPC/PPCCCState.cpp index 5116f0d121f4..79ffc6627a61 100644 --- a/llvm/lib/Target/PowerPC/PPCCCState.cpp +++ b/llvm/lib/Target/PowerPC/PPCCCState.cpp @@ -32,4 +32,4 @@ void PPCCCState::PreAnalyzeFormalArguments( OriginalArgWasPPCF128.push_back(false); } } -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index bb12e05173a6..b9518d6d7064 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -1,4 +1,4 @@ -//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===// +//===-- PPCCTRLoops.cpp - Verify CTR loops -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,74 +6,48 @@ // //===----------------------------------------------------------------------===// // -// This pass identifies loops where we can generate the PPC branch instructions -// that decrement and test the count register (CTR) (bdnz and friends). -// -// The pattern that defines the induction variable can changed depending on -// prior optimizations. For example, the IndVarSimplify phase run by 'opt' -// normalizes induction variables, and the Loop Strength Reduction pass -// run by 'llc' may also make changes to the induction variable. -// -// Criteria for CTR loops: -// - Countable loops (w/ ind. var for a trip count) -// - Try inner-most loops first -// - No nested CTR loops. -// - No function calls in loops. +// This pass verifies that all bdnz/bdz instructions are dominated by a loop +// mtctr before any other instructions that might clobber the ctr register. // //===----------------------------------------------------------------------===// +// CTR loops are produced by the HardwareLoops pass and this pass is simply a +// verification that no invalid CTR loops are produced. As such, it isn't +// something that needs to be run (or even defined) for Release builds so the +// entire file is guarded by NDEBUG. 
+#ifndef NDEBUG +#include <vector> + +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "PPC.h" -#include "PPCSubtarget.h" -#include "PPCTargetMachine.h" -#include "PPCTargetTransformInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ValueHandle.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/ilist_iterator.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundleIterator.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/Register.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GenericDomTreeConstruction.h" +#include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/LoopUtils.h" - -#ifndef NDEBUG -#include 
"llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#endif using namespace llvm; -#define DEBUG_TYPE "ctrloops" - -#ifndef NDEBUG -static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); -#endif +#define DEBUG_TYPE "ppc-ctrloops-verify" namespace { -#ifndef NDEBUG struct PPCCTRLoopsVerify : public MachineFunctionPass { public: static char ID; @@ -94,10 +68,8 @@ namespace { }; char PPCCTRLoopsVerify::ID = 0; -#endif // NDEBUG } // end anonymous namespace -#ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) @@ -107,9 +79,7 @@ INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", FunctionPass *llvm::createPPCCTRLoopsVerify() { return new PPCCTRLoopsVerify(); } -#endif // NDEBUG -#ifndef NDEBUG static bool clobbersCTR(const MachineInstr &MI) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); @@ -178,9 +148,7 @@ queue_preds: return false; } - for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), - PIE = MBB->pred_end(); PI != PIE; ++PI) - Preds.push_back(*PI); + append_range(Preds, MBB->predecessors()); } do { diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index 1eaa7f7a44b3..cc3486718179 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -59,10 +59,7 @@ def RetCC_PPC_Cold : CallingConv<[ CCIfType<[f32], CCAssignToReg<[F1]>>, CCIfType<[f64], CCAssignToReg<[F1]>>, - CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2]>>>, - - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>, + CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, 
v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", @@ -95,13 +92,9 @@ def RetCC_PPC : CallingConv<[ // For P9, f128 are passed in vector registers. CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - // QPX vectors are returned in QF1 and QF2. - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, - // Vector types returned as "direct" go into V2 .. V9; note that only the // ELFv2 ABI fully utilizes all these registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], @@ -156,10 +149,8 @@ def RetCC_PPC64_ELF_FIS : CallingConv<[ CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>, CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>>, - CCIfType<[v4f64, v4f32, v4i1], - CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1, QF2]>>>, CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9]>>> @@ -223,12 +214,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>, - // QPX vectors that are stored in double precision need 32-byte alignment. - CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>, - // Vectors and float128 get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v4f32, v2f64, v2i64], CCAssignToStack<16, 16>>, - CCIfType<[f128], CCIfSubtarget<"hasP9Vector()", CCAssignToStack<16, 16>>> + CCIfType<[f128], CCIfSubtarget<"hasAltivec()", CCAssignToStack<16, 16>>> ]>; // This calling convention puts vector arguments always on the stack. 
It is used @@ -243,10 +231,6 @@ def CC_PPC32_SVR4_VarArg : CallingConv<[ // put vector arguments in vector registers before putting them on the stack. let Entry = 1 in def CC_PPC32_SVR4 : CallingConv<[ - // QPX vectors mirror the scalar FP convention. - CCIfType<[v4f64, v4f32, v4i1], CCIfSubtarget<"hasQPX()", - CCAssignToReg<[QF1, QF2, QF3, QF4, QF5, QF6, QF7, QF8]>>>, - // The first 12 Vector arguments are passed in AltiVec registers. CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64], CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, @@ -254,7 +238,7 @@ def CC_PPC32_SVR4 : CallingConv<[ // Float128 types treated as vector arguments. CCIfType<[f128], - CCIfSubtarget<"hasP9Vector()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, + CCIfSubtarget<"hasAltivec()", CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>>, CCDelegateTo<CC_PPC32_SVR4_Common> @@ -307,6 +291,8 @@ def CSR_AIX32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20, F27, F28, F29, F30, F31, CR2, CR3, CR4 )>; +def CSR_AIX32_Altivec : CalleeSavedRegs<(add CSR_AIX32, CSR_Altivec)>; + // Common CalleeSavedRegs for SVR4 and AIX. def CSR_PPC64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, X21, X22, X23, X24, X25, X26, X27, X28, diff --git a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index c9f74bbf861c..08b7bdb3ac1e 100644 --- a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -77,8 +77,9 @@ protected: if (J->getOperand(0).getMBB() == &ReturnMBB) { // This is an unconditional branch to the return. Replace the // branch with a blr. 
- BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())) - .copyImplicitOps(*I); + MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I); + (*PI)->insert(J, MI); + MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -89,10 +90,13 @@ protected: if (J->getOperand(2).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. - BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) + MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I); + MI->setDesc(TII->get(PPC::BCCLR)); + MachineInstrBuilder(*ReturnMBB.getParent(), MI) .add(J->getOperand(0)) - .add(J->getOperand(1)) - .copyImplicitOps(*I); + .add(J->getOperand(1)); + (*PI)->insert(J, MI); + MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -103,11 +107,13 @@ protected: if (J->getOperand(1).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. - BuildMI( - **PI, J, J->getDebugLoc(), - TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn)) - .add(J->getOperand(0)) - .copyImplicitOps(*I); + MachineInstr *MI = ReturnMBB.getParent()->CloneMachineInstr(&*I); + MI->setDesc( + TII->get(J->getOpcode() == PPC::BC ? 
PPC::BCLR : PPC::BCLRn)); + MachineInstrBuilder(*ReturnMBB.getParent(), MI) + .add(J->getOperand(0)); + (*PI)->insert(J, MI); + MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 39790ac9a8aa..c181816e31c6 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -86,7 +86,6 @@ typedef struct Address { class PPCFastISel final : public FastISel { const TargetMachine &TM; - const PPCSubtarget *PPCSubTarget; const PPCSubtarget *Subtarget; PPCFunctionInfo *PPCFuncInfo; const TargetInstrInfo &TII; @@ -97,7 +96,6 @@ class PPCFastISel final : public FastISel { explicit PPCFastISel(FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) : FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()), - PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()), Subtarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()), PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()), TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()), @@ -1567,6 +1565,10 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsVarArg) return false; + // If this is a PC-Rel function, let SDISel handle the call. + if (Subtarget->isUsingPCRelativeCalls()) + return false; + // Handle simple calls for now, with legal return types and // those that can be extended. Type *RetTy = CLI.RetTy; @@ -1622,7 +1624,10 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) { if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8) return false; - if (ArgVT.isVector()) + // FIXME: FastISel cannot handle non-simple types yet, including 128-bit FP + // types, which is passed through vector register. Skip these types and + // fallback to default SelectionDAG based selection. 
+ if (ArgVT.isVector() || ArgVT == MVT::f128) return false; unsigned Arg = getRegForValue(ArgValue); @@ -1991,6 +1996,10 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) { // Materialize a floating-point constant into a register, and return // the register number (or zero if we failed to handle it). unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { + // If this is a PC-Rel function, let SDISel handle constant pool. + if (Subtarget->isUsingPCRelativeCalls()) + return false; + // No plans to handle long double here. if (VT != MVT::f32 && VT != MVT::f64) return 0; @@ -2055,6 +2064,10 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { // Materialize the address of a global value into a register, and return // the register number (or zero if we failed to handle it). unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { + // If this is a PC-Rel function, let SDISel handle GV materialization. + if (Subtarget->isUsingPCRelativeCalls()) + return false; + assert(VT == MVT::i64 && "Non-address!"); const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass; unsigned DestReg = createResultReg(RC); diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index bd9174c1973d..50ce11b8374f 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -39,15 +39,6 @@ EnablePEVectorSpills("ppc-enable-pe-vector-spills", cl::desc("Enable spills in prologue to vector registers."), cl::init(false), cl::Hidden); -/// VRRegNo - Map from a numbered VR register to its enum value. 
-/// -static const MCPhysReg VRRegNo[] = { - PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , - PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, - PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, - PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31 -}; - static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) { if (STI.isAIXABI()) return STI.isPPC64() ? 16 : 8; @@ -227,19 +218,14 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( CALLEE_SAVED_VRS }; - static const SpillSlot AIXOffsets32[] = { - CALLEE_SAVED_FPRS, - CALLEE_SAVED_GPRS32, - // Add AIX's extra CSR. - {PPC::R13, -76}, - // TODO: Update when we add vector support for AIX. - }; + static const SpillSlot AIXOffsets32[] = {CALLEE_SAVED_FPRS, + CALLEE_SAVED_GPRS32, + // Add AIX's extra CSR. + {PPC::R13, -76}, + CALLEE_SAVED_VRS}; static const SpillSlot AIXOffsets64[] = { - CALLEE_SAVED_FPRS, - CALLEE_SAVED_GPRS64, - // TODO: Update when we add vector support for AIX. - }; + CALLEE_SAVED_FPRS, CALLEE_SAVED_GPRS64, CALLEE_SAVED_VRS}; if (Subtarget.is64BitELFABI()) { NumEntries = array_lengthof(ELFOffsets64); @@ -262,153 +248,11 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots( return AIXOffsets32; } -/// RemoveVRSaveCode - We have found that this function does not need any code -/// to manipulate the VRSAVE register, even though it uses vector registers. -/// This can happen when the only registers used are known to be live in or out -/// of the function. Remove all of the VRSAVE related code from the function. -/// FIXME: The removal of the code results in a compile failure at -O0 when the -/// function contains a function call, as the GPR containing original VRSAVE -/// contents is spilled and reloaded around the call. Without the prolog code, -/// the spill instruction refers to an undefined register. 
This code needs -/// to account for all uses of that GPR. -static void RemoveVRSaveCode(MachineInstr &MI) { - MachineBasicBlock *Entry = MI.getParent(); - MachineFunction *MF = Entry->getParent(); - - // We know that the MTVRSAVE instruction immediately follows MI. Remove it. - MachineBasicBlock::iterator MBBI = MI; - ++MBBI; - assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE); - MBBI->eraseFromParent(); - - bool RemovedAllMTVRSAVEs = true; - // See if we can find and remove the MTVRSAVE instruction from all of the - // epilog blocks. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { - // If last instruction is a return instruction, add an epilogue - if (I->isReturnBlock()) { - bool FoundIt = false; - for (MBBI = I->end(); MBBI != I->begin(); ) { - --MBBI; - if (MBBI->getOpcode() == PPC::MTVRSAVE) { - MBBI->eraseFromParent(); // remove it. - FoundIt = true; - break; - } - } - RemovedAllMTVRSAVEs &= FoundIt; - } - } - - // If we found and removed all MTVRSAVE instructions, remove the read of - // VRSAVE as well. - if (RemovedAllMTVRSAVEs) { - MBBI = MI; - assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?"); - --MBBI; - assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?"); - MBBI->eraseFromParent(); - } - - // Finally, nuke the UPDATE_VRSAVE. - MI.eraseFromParent(); -} - -// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the -// instruction selector. Based on the vector registers that have been used, -// transform this into the appropriate ORI instruction. 
-static void HandleVRSaveUpdate(MachineInstr &MI, const TargetInstrInfo &TII) { - MachineFunction *MF = MI.getParent()->getParent(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - DebugLoc dl = MI.getDebugLoc(); - - const MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned UsedRegMask = 0; - for (unsigned i = 0; i != 32; ++i) - if (MRI.isPhysRegModified(VRRegNo[i])) - UsedRegMask |= 1 << (31-i); - - // Live in and live out values already must be in the mask, so don't bother - // marking them. - for (std::pair<unsigned, unsigned> LI : MF->getRegInfo().liveins()) { - unsigned RegNo = TRI->getEncodingValue(LI.first); - if (VRRegNo[RegNo] == LI.first) // If this really is a vector reg. - UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked. - } - - // Live out registers appear as use operands on return instructions. - for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); - UsedRegMask != 0 && BI != BE; ++BI) { - const MachineBasicBlock &MBB = *BI; - if (!MBB.isReturnBlock()) - continue; - const MachineInstr &Ret = MBB.back(); - for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { - const MachineOperand &MO = Ret.getOperand(I); - if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg())) - continue; - unsigned RegNo = TRI->getEncodingValue(MO.getReg()); - UsedRegMask &= ~(1 << (31-RegNo)); - } - } - - // If no registers are used, turn this into a copy. - if (UsedRegMask == 0) { - // Remove all VRSAVE code. 
- RemoveVRSaveCode(MI); - return; - } - - Register SrcReg = MI.getOperand(1).getReg(); - Register DstReg = MI.getOperand(0).getReg(); - - if ((UsedRegMask & 0xFFFF) == UsedRegMask) { - if (DstReg != SrcReg) - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg) - .addReg(SrcReg) - .addImm(UsedRegMask); - else - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(UsedRegMask); - } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) { - if (DstReg != SrcReg) - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) - .addReg(SrcReg) - .addImm(UsedRegMask >> 16); - else - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(UsedRegMask >> 16); - } else { - if (DstReg != SrcReg) - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) - .addReg(SrcReg) - .addImm(UsedRegMask >> 16); - else - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORIS), DstReg) - .addReg(SrcReg, RegState::Kill) - .addImm(UsedRegMask >> 16); - - BuildMI(*MI.getParent(), MI, dl, TII.get(PPC::ORI), DstReg) - .addReg(DstReg, RegState::Kill) - .addImm(UsedRegMask & 0xFFFF); - } - - // Remove the old UPDATE_VRSAVE instruction. - MI.eraseFromParent(); -} - static bool spillsCR(const MachineFunction &MF) { const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); return FuncInfo->isCRSpilled(); } -static bool spillsVRSAVE(const MachineFunction &MF) { - const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - return FuncInfo->isVRSAVESpilled(); -} - static bool hasSpills(const MachineFunction &MF) { const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); return FuncInfo->hasSpills(); @@ -474,7 +318,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF, !FI->mustSaveTOC() && // No need to save TOC. !RegInfo->hasBasePointer(MF); // No special alignment. 
- // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless + // Note: for PPC32 SVR4ABI, we can still generate stackless // code if all local vars are reg-allocated. bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize(); @@ -531,9 +375,10 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const { return false; return MF.getTarget().Options.DisableFramePointerElim(MF) || - MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint() || - (MF.getTarget().Options.GuaranteedTailCallOpt && - MF.getInfo<PPCFunctionInfo>()->hasFastCall()); + MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint() || + MF.exposesReturnsTwice() || + (MF.getTarget().Options.GuaranteedTailCallOpt && + MF.getInfo<PPCFunctionInfo>()->hasFastCall()); } void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { @@ -681,6 +526,8 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, // register is available, we can adjust for that by not overlapping the spill // code. However, if we need to realign the stack (i.e. have a base pointer) // and the stack frame is large, we need two scratch registers. +// Also, stack probe requires two scratch registers, one for old sp, one for +// large frame and large probe size. 
bool PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const { const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -692,8 +539,10 @@ PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const { MachineFrameInfo &MFI = MF.getFrameInfo(); Align MaxAlign = MFI.getMaxAlign(); bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI(); + const PPCTargetLowering &TLI = *Subtarget.getTargetLowering(); - return (IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1; + return ((IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1) || + TLI.hasInlineStackProbe(MF); } bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { @@ -736,8 +585,8 @@ bool PPCFrameLowering::stackUpdateCanBeMoved(MachineFunction &MF) const { // Frame pointers and base pointers complicate matters so don't do anything // if we have them. For example having a frame pointer will sometimes require // a copy of r1 into r31 and that makes keeping track of updates to r1 more - // difficult. - if (hasFP(MF) || RegInfo->hasBasePointer(MF)) + // difficult. Similar situation exists with setjmp. + if (hasFP(MF) || RegInfo->hasBasePointer(MF) || MF.exposesReturnsTwice()) return false; // Calls to fast_cc functions use different rules for passing parameters on @@ -771,24 +620,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, bool isPPC64 = Subtarget.isPPC64(); // Get the ABI. bool isSVR4ABI = Subtarget.isSVR4ABI(); - bool isAIXABI = Subtarget.isAIXABI(); bool isELFv2ABI = Subtarget.isELFv2ABI(); - assert((isSVR4ABI || isAIXABI) && "Unsupported PPC ABI."); - - // Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it, - // process it. 
- if (!isSVR4ABI) - for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) { - if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) { - if (isAIXABI) - report_fatal_error("UPDATE_VRSAVE is unexpected on AIX."); - HandleVRSaveUpdate(*MBBI, TII); - break; - } - } - - // Move MBBI back to the beginning of the prologue block. - MBBI = MBB.begin(); + assert((isSVR4ABI || Subtarget.isAIXABI()) && "Unsupported PPC ABI."); // Work out frame sizes. unsigned FrameSize = determineFrameLayoutAndUpdate(MF); @@ -848,12 +681,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); // Using the same bool variable as below to suppress compiler warnings. - // Stack probe requires two scratch registers, one for old sp, one for large - // frame and large probe size. bool SingleScratchReg = findScratchRegister( - &MBB, false, - twoUniqueScratchRegsRequired(&MBB) || TLI.hasInlineStackProbe(MF), - &ScratchReg, &TempReg); + &MBB, false, twoUniqueScratchRegsRequired(&MBB), &ScratchReg, &TempReg); assert(SingleScratchReg && "Required number of registers not available in this block"); @@ -863,26 +692,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, int FPOffset = 0; if (HasFP) { - if (isSVR4ABI) { - MachineFrameInfo &MFI = MF.getFrameInfo(); - int FPIndex = FI->getFramePointerSaveIndex(); - assert(FPIndex && "No Frame Pointer Save Slot!"); - FPOffset = MFI.getObjectOffset(FPIndex); - } else { - FPOffset = getFramePointerSaveOffset(); - } + MachineFrameInfo &MFI = MF.getFrameInfo(); + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = MFI.getObjectOffset(FPIndex); } int BPOffset = 0; if (HasBP) { - if (isSVR4ABI) { - MachineFrameInfo &MFI = MF.getFrameInfo(); - int BPIndex = FI->getBasePointerSaveIndex(); - assert(BPIndex && "No Base Pointer Save Slot!"); - BPOffset = MFI.getObjectOffset(BPIndex); - } else { - BPOffset = getBasePointerSaveOffset(); - } + 
MachineFrameInfo &MFI = MF.getFrameInfo(); + int BPIndex = FI->getBasePointerSaveIndex(); + assert(BPIndex && "No Base Pointer Save Slot!"); + BPOffset = MFI.getObjectOffset(BPIndex); } int PBPOffset = 0; @@ -1382,10 +1203,12 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, if (StackAllocMIPos == PrologMBB.end()) return; const BasicBlock *ProbedBB = PrologMBB.getBasicBlock(); + MachineBasicBlock *CurrentMBB = &PrologMBB; DebugLoc DL = PrologMBB.findDebugLoc(StackAllocMIPos); MachineInstr &MI = *StackAllocMIPos; int64_t NegFrameSize = MI.getOperand(2).getImm(); - int64_t NegProbeSize = -(int64_t)TLI.getStackProbeSize(MF); + unsigned ProbeSize = TLI.getStackProbeSize(MF); + int64_t NegProbeSize = -(int64_t)ProbeSize; assert(isInt<32>(NegProbeSize) && "Unhandled probe size"); int64_t NumBlocks = NegFrameSize / NegProbeSize; int64_t NegResidualSize = NegFrameSize % NegProbeSize; @@ -1394,10 +1217,9 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, Register FPReg = MI.getOperand(1).getReg(); const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); bool HasBP = RegInfo->hasBasePointer(MF); + Register BPReg = RegInfo->getBaseRegister(MF); Align MaxAlign = MFI.getMaxAlign(); - // Initialize current frame pointer. const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR); - BuildMI(PrologMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg); // Subroutines to generate .cfi_* directives. auto buildDefCFAReg = [&](MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register Reg) { @@ -1437,90 +1259,218 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, // Subroutine to store frame pointer and decrease stack pointer by probe size. auto allocateAndProbe = [&](MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int64_t NegSize, - Register NegSizeReg, bool UseDForm) { + Register NegSizeReg, bool UseDForm, + Register StoreReg) { if (UseDForm) BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? 
PPC::STDU : PPC::STWU), SPReg) - .addReg(FPReg) + .addReg(StoreReg) .addImm(NegSize) .addReg(SPReg); else BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) - .addReg(FPReg) + .addReg(StoreReg) .addReg(SPReg) .addReg(NegSizeReg); }; - // Use FPReg to calculate CFA. - if (needsCFI) - buildDefCFA(PrologMBB, {MI}, FPReg, 0); - // For case HasBP && MaxAlign > 1, we have to align the SP by performing + // Used to probe realignment gap [stackptr - (stackptr % align), stackptr) + // when HasBP && isPPC64. In such scenario, normally we have r0, r1, r12, r30 + // available and r1 is already copied to r30 which is BPReg. So BPReg stores + // the value of stackptr. + // First we have to probe tail interval whose size is less than probesize, + // i.e., [stackptr - (stackptr % align) % probesize, stackptr). At this stage, + // ScratchReg stores the value of ((stackptr % align) % probesize). Then we + // probe each block sized probesize until stackptr meets + // (stackptr - (stackptr % align)). At this stage, ScratchReg is materialized + // as negprobesize. At both stages, TempReg stores the value of + // (stackptr - (stackptr % align)). 
+ auto dynamicProbe = [&](MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, Register ScratchReg, + Register TempReg) { + assert(HasBP && isPPC64 && "Probe alignment part not available"); + assert(isPowerOf2_64(ProbeSize) && "Probe size should be power of 2"); + // ScratchReg = stackptr % align + BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg) + .addReg(BPReg) + .addImm(0) + .addImm(64 - Log2(MaxAlign)); + // TempReg = stackptr - (stackptr % align) + BuildMI(MBB, MBBI, DL, TII.get(PPC::SUBFC8), TempReg) + .addReg(ScratchReg) + .addReg(BPReg); + // ScratchReg = (stackptr % align) % probesize + BuildMI(MBB, MBBI, DL, TII.get(PPC::RLDICL), ScratchReg) + .addReg(ScratchReg) + .addImm(0) + .addImm(64 - Log2(ProbeSize)); + Register CRReg = PPC::CR0; + // If (stackptr % align) % probesize == 0, we should not generate probe + // code. Layout of output assembly kinda like: + // bb.0: + // ... + // cmpldi $scratchreg, 0 + // beq bb.2 + // bb.1: # Probe tail interval + // neg $scratchreg, $scratchreg + // stdux $bpreg, r1, $scratchreg + // bb.2: + // <materialize negprobesize into $scratchreg> + // cmpd r1, $tempreg + // beq bb.4 + // bb.3: # Loop to probe each block + // stdux $bpreg, r1, $scratchreg + // cmpd r1, $tempreg + // bne bb.3 + // bb.4: + // ... 
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *ProbeResidualMBB = MF.CreateMachineBasicBlock(ProbedBB); + MF.insert(MBBInsertPoint, ProbeResidualMBB); + MachineBasicBlock *ProbeLoopPreHeaderMBB = + MF.CreateMachineBasicBlock(ProbedBB); + MF.insert(MBBInsertPoint, ProbeLoopPreHeaderMBB); + MachineBasicBlock *ProbeLoopBodyMBB = MF.CreateMachineBasicBlock(ProbedBB); + MF.insert(MBBInsertPoint, ProbeLoopBodyMBB); + MachineBasicBlock *ProbeExitMBB = MF.CreateMachineBasicBlock(ProbedBB); + MF.insert(MBBInsertPoint, ProbeExitMBB); + // bb.4 + ProbeExitMBB->splice(ProbeExitMBB->end(), &MBB, MBBI, MBB.end()); + ProbeExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + // bb.0 + BuildMI(&MBB, DL, TII.get(PPC::CMPDI), CRReg).addReg(ScratchReg).addImm(0); + BuildMI(&MBB, DL, TII.get(PPC::BCC)) + .addImm(PPC::PRED_EQ) + .addReg(CRReg) + .addMBB(ProbeLoopPreHeaderMBB); + MBB.addSuccessor(ProbeResidualMBB); + MBB.addSuccessor(ProbeLoopPreHeaderMBB); + // bb.1 + BuildMI(ProbeResidualMBB, DL, TII.get(PPC::NEG8), ScratchReg) + .addReg(ScratchReg); + allocateAndProbe(*ProbeResidualMBB, ProbeResidualMBB->end(), 0, ScratchReg, + false, BPReg); + ProbeResidualMBB->addSuccessor(ProbeLoopPreHeaderMBB); + // bb.2 + MaterializeImm(*ProbeLoopPreHeaderMBB, ProbeLoopPreHeaderMBB->end(), + NegProbeSize, ScratchReg); + BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::CMPD), CRReg) + .addReg(SPReg) + .addReg(TempReg); + BuildMI(ProbeLoopPreHeaderMBB, DL, TII.get(PPC::BCC)) + .addImm(PPC::PRED_EQ) + .addReg(CRReg) + .addMBB(ProbeExitMBB); + ProbeLoopPreHeaderMBB->addSuccessor(ProbeLoopBodyMBB); + ProbeLoopPreHeaderMBB->addSuccessor(ProbeExitMBB); + // bb.3 + allocateAndProbe(*ProbeLoopBodyMBB, ProbeLoopBodyMBB->end(), 0, ScratchReg, + false, BPReg); + BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::CMPD), CRReg) + .addReg(SPReg) + .addReg(TempReg); + BuildMI(ProbeLoopBodyMBB, DL, TII.get(PPC::BCC)) + .addImm(PPC::PRED_NE) + .addReg(CRReg) + 
.addMBB(ProbeLoopBodyMBB); + ProbeLoopBodyMBB->addSuccessor(ProbeExitMBB); + ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB); + // Update liveins. + recomputeLiveIns(*ProbeResidualMBB); + recomputeLiveIns(*ProbeLoopPreHeaderMBB); + recomputeLiveIns(*ProbeLoopBodyMBB); + recomputeLiveIns(*ProbeExitMBB); + return ProbeExitMBB; + }; + // For case HasBP && MaxAlign > 1, we have to realign the SP by performing // SP = SP - SP % MaxAlign. if (HasBP && MaxAlign > 1) { - if (isPPC64) - BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg) - .addReg(FPReg) - .addImm(0) - .addImm(64 - Log2(MaxAlign)); - else - BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg) + // FIXME: Currently only probe the gap [stackptr & alignmask, stackptr) in + // 64-bit mode. + if (isPPC64) { + // Use BPReg to calculate CFA. + if (needsCFI) + buildDefCFA(*CurrentMBB, {MI}, BPReg, 0); + // Since we have SPReg copied to BPReg at the moment, FPReg can be used as + // TempReg. + Register TempReg = FPReg; + CurrentMBB = dynamicProbe(*CurrentMBB, {MI}, ScratchReg, TempReg); + // Copy BPReg to FPReg to meet the definition of PROBED_STACKALLOC_64. + BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg) + .addReg(BPReg) + .addReg(BPReg); + } else { + // Initialize current frame pointer. + BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg) + .addReg(SPReg) + .addReg(SPReg); + // Use FPReg to calculate CFA. + if (needsCFI) + buildDefCFA(*CurrentMBB, {MI}, FPReg, 0); + BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg) .addReg(FPReg) .addImm(0) .addImm(32 - Log2(MaxAlign)) .addImm(31); - BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), - SPReg) - .addReg(FPReg) - .addReg(SPReg) - .addReg(ScratchReg); + BuildMI(*CurrentMBB, {MI}, DL, TII.get(PPC::SUBFC), SPReg) + .addReg(ScratchReg) + .addReg(SPReg); + } + } else { + // Initialize current frame pointer. 
+ BuildMI(*CurrentMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg); + // Use FPReg to calculate CFA. + if (needsCFI) + buildDefCFA(*CurrentMBB, {MI}, FPReg, 0); } // Probe residual part. if (NegResidualSize) { bool ResidualUseDForm = CanUseDForm(NegResidualSize); if (!ResidualUseDForm) - MaterializeImm(PrologMBB, {MI}, NegResidualSize, ScratchReg); - allocateAndProbe(PrologMBB, {MI}, NegResidualSize, ScratchReg, - ResidualUseDForm); + MaterializeImm(*CurrentMBB, {MI}, NegResidualSize, ScratchReg); + allocateAndProbe(*CurrentMBB, {MI}, NegResidualSize, ScratchReg, + ResidualUseDForm, FPReg); } bool UseDForm = CanUseDForm(NegProbeSize); // If number of blocks is small, just probe them directly. if (NumBlocks < 3) { if (!UseDForm) - MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg); + MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg); for (int i = 0; i < NumBlocks; ++i) - allocateAndProbe(PrologMBB, {MI}, NegProbeSize, ScratchReg, UseDForm); + allocateAndProbe(*CurrentMBB, {MI}, NegProbeSize, ScratchReg, UseDForm, + FPReg); if (needsCFI) { // Restore using SPReg to calculate CFA. - buildDefCFAReg(PrologMBB, {MI}, SPReg); + buildDefCFAReg(*CurrentMBB, {MI}, SPReg); } } else { // Since CTR is a volatile register and current shrinkwrap implementation // won't choose an MBB in a loop as the PrologMBB, it's safe to synthesize a // CTR loop to probe. // Calculate trip count and stores it in CTRReg. - MaterializeImm(PrologMBB, {MI}, NumBlocks, ScratchReg); - BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)) + MaterializeImm(*CurrentMBB, {MI}, NumBlocks, ScratchReg); + BuildMI(*CurrentMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)) .addReg(ScratchReg, RegState::Kill); if (!UseDForm) - MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg); + MaterializeImm(*CurrentMBB, {MI}, NegProbeSize, ScratchReg); // Create MBBs of the loop. 
MachineFunction::iterator MBBInsertPoint = - std::next(PrologMBB.getIterator()); + std::next(CurrentMBB->getIterator()); MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB); MF.insert(MBBInsertPoint, LoopMBB); MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB); MF.insert(MBBInsertPoint, ExitMBB); // Synthesize the loop body. allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg, - UseDForm); + UseDForm, FPReg); BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ)) .addMBB(LoopMBB); LoopMBB->addSuccessor(ExitMBB); LoopMBB->addSuccessor(LoopMBB); // Synthesize the exit MBB. - ExitMBB->splice(ExitMBB->end(), &PrologMBB, + ExitMBB->splice(ExitMBB->end(), CurrentMBB, std::next(MachineBasicBlock::iterator(MI)), - PrologMBB.end()); - ExitMBB->transferSuccessorsAndUpdatePHIs(&PrologMBB); - PrologMBB.addSuccessor(LoopMBB); + CurrentMBB->end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(CurrentMBB); + CurrentMBB->addSuccessor(LoopMBB); if (needsCFI) { // Restore using SPReg to calculate CFA. buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg); @@ -1552,8 +1502,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Get processor type. bool isPPC64 = Subtarget.isPPC64(); - // Get the ABI. - bool isSVR4ABI = Subtarget.isSVR4ABI(); // Check if the link register (LR) has been saved. 
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); @@ -1601,24 +1549,16 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, SingleScratchReg = ScratchReg == TempReg; if (HasFP) { - if (isSVR4ABI) { - int FPIndex = FI->getFramePointerSaveIndex(); - assert(FPIndex && "No Frame Pointer Save Slot!"); - FPOffset = MFI.getObjectOffset(FPIndex); - } else { - FPOffset = getFramePointerSaveOffset(); - } + int FPIndex = FI->getFramePointerSaveIndex(); + assert(FPIndex && "No Frame Pointer Save Slot!"); + FPOffset = MFI.getObjectOffset(FPIndex); } int BPOffset = 0; if (HasBP) { - if (isSVR4ABI) { int BPIndex = FI->getBasePointerSaveIndex(); assert(BPIndex && "No Base Pointer Save Slot!"); BPOffset = MFI.getObjectOffset(BPIndex); - } else { - BPOffset = getBasePointerSaveOffset(); - } } int PBPOffset = 0; @@ -1704,11 +1644,18 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // offset by the STDU/STDUX/STWU/STWUX instruction. For targets with red // zone add this offset back now. + // If the function has a base pointer, the stack pointer has been copied + // to it so we can restore it by copying in the other direction. + if (HasRedZone && HasBP) { + BuildMI(MBB, MBBI, dl, OrInst, RBReg). + addReg(BPReg). + addReg(BPReg); + } // If this function contained a fastcc call and GuaranteedTailCallOpt is // enabled (=> hasFastCall()==true) the fastcc call might contain a tail // call which invalidates the stack pointer value in SP(0). So we use the - // value of R31 in this case. - if (FI->hasFastCall()) { + // value of R31 in this case. Similar situation exists with setjmp. 
+ else if (FI->hasFastCall() || MF.exposesReturnsTwice()) { assert(HasFP && "Expecting a valid frame pointer."); if (!HasRedZone) RBReg = FPReg; @@ -2054,7 +2001,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, bool HasGPSaveArea = false; bool HasG8SaveArea = false; bool HasFPSaveArea = false; - bool HasVRSAVESaveArea = false; bool HasVRSaveArea = false; SmallVector<CalleeSavedInfo, 18> GPRegs; @@ -2094,8 +2040,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } else if (PPC::CRBITRCRegClass.contains(Reg) || PPC::CRRCRegClass.contains(Reg)) { ; // do nothing, as we already know whether CRs are spilled - } else if (PPC::VRSAVERCRegClass.contains(Reg)) { - HasVRSAVESaveArea = true; } else if (PPC::VRRCRegClass.contains(Reg) || PPC::SPERCRegClass.contains(Reg)) { // Altivec and SPE are mutually exclusive, but have the same stack @@ -2218,23 +2162,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, LowerBound -= 4; // The CR save area is always 4 bytes long. } - if (HasVRSAVESaveArea) { - // FIXME SVR4: Is it actually possible to have multiple elements in CSI - // which have the VRSAVE register class? - // Adjust the frame index of the VRSAVE spill slot. - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - - if (PPC::VRSAVERCRegClass.contains(Reg)) { - int FI = CSI[i].getFrameIdx(); - - MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI)); - } - } - - LowerBound -= 4; // The VRSAVE save area is always 4 bytes long. - } - // Both Altivec and SPE have the same alignment and padding requirements // within the stack frame. if (HasVRSaveArea) { @@ -2274,8 +2201,8 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, // needed alignment padding. 
unsigned StackSize = determineFrameLayout(MF, true); MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) || - hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) { + if (MFI.hasVarSizedObjects() || spillsCR(MF) || hasNonRISpills(MF) || + (hasSpills(MF) && !isInt<16>(StackSize))) { const TargetRegisterClass &GPRC = PPC::GPRCRegClass; const TargetRegisterClass &G8RC = PPC::G8RCRegClass; const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC; @@ -2289,7 +2216,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF, MFI.hasVarSizedObjects() && MFI.getMaxAlign() > getStackAlign(); // These kinds of spills might need two registers. - if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars) + if (spillsCR(MF) || HasAlVars) RS->addScavengingFrameIndex( MFI.CreateStackObject(Size, Alignment, false)); } @@ -2366,9 +2293,6 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used. - if (Reg == PPC::VRSAVE) - continue; // CR2 through CR4 are the nonvolatile CR fields. bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4; @@ -2533,10 +2457,6 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); - // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used. 
- if (Reg == PPC::VRSAVE) - continue; - if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC) continue; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 8ffd89ef5ccd..693b0adaede4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -43,6 +43,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/Module.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" @@ -138,7 +139,6 @@ namespace { /// class PPCDAGToDAGISel : public SelectionDAGISel { const PPCTargetMachine &TM; - const PPCSubtarget *PPCSubTarget = nullptr; const PPCSubtarget *Subtarget = nullptr; const PPCTargetLowering *PPCLowering = nullptr; unsigned GlobalBaseReg = 0; @@ -150,14 +150,10 @@ namespace { bool runOnMachineFunction(MachineFunction &MF) override { // Make sure we re-emit a set of the global base reg if necessary GlobalBaseReg = 0; - PPCSubTarget = &MF.getSubtarget<PPCSubtarget>(); Subtarget = &MF.getSubtarget<PPCSubtarget>(); PPCLowering = Subtarget->getTargetLowering(); SelectionDAGISel::runOnMachineFunction(MF); - if (!Subtarget->isSVR4ABI()) - InsertVRSaveCode(MF); - return true; } @@ -218,7 +214,7 @@ namespace { /// SelectCC - Select a comparison of the specified values with the /// specified condition code, returning the CR# of the expression. SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - const SDLoc &dl); + const SDLoc &dl, SDValue Chain = SDValue()); /// SelectAddrImmOffs - Return true if the operand is valid for a preinc /// immediate field. Note that the operand at this point is already the @@ -295,6 +291,13 @@ namespace { Align(16)); } + /// SelectAddrImmX34 - Returns true if the address N can be represented by + /// a base register plus a signed 34-bit displacement. Suitable for use by + /// PSTXVP and friends. 
+ bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG); + } + // Select an address into a single register. bool SelectAddr(SDValue N, SDValue &Base) { Base = N; @@ -340,8 +343,6 @@ namespace { return true; } - void InsertVRSaveCode(MachineFunction &MF); - StringRef getPassName() const override { return "PowerPC DAG->DAG Pattern Instruction Selection"; } @@ -351,6 +352,7 @@ namespace { private: bool trySETCC(SDNode *N); + bool tryFoldSWTestBRCC(SDNode *N); bool tryAsSingleRLDICL(SDNode *N); bool tryAsSingleRLDICR(SDNode *N); bool tryAsSingleRLWINM(SDNode *N); @@ -375,70 +377,6 @@ private: } // end anonymous namespace -/// InsertVRSaveCode - Once the entire function has been instruction selected, -/// all virtual registers are created and all machine instructions are built, -/// check to see if we need to save/restore VRSAVE. If so, do it. -void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { - // Check to see if this function uses vector registers, which means we have to - // save and restore the VRSAVE register and update it with the regs we use. - // - // In this case, there will be virtual registers of vector type created - // by the scheduler. Detect them now. - bool HasVectorVReg = false; - for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = Register::index2VirtReg(i); - if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) { - HasVectorVReg = true; - break; - } - } - if (!HasVectorVReg) return; // nothing to do. - - // If we have a vector register, we want to emit code into the entry and exit - // blocks to save and restore the VRSAVE register. We do this here (instead - // of marking all vector instructions as clobbering VRSAVE) for two reasons: - // - // 1. This (trivially) reduces the load on the register allocator, by not - // having to represent the live range of the VRSAVE register. - // 2. 
This (more significantly) allows us to create a temporary virtual - // register to hold the saved VRSAVE value, allowing this temporary to be - // register allocated, instead of forcing it to be spilled to the stack. - - // Create two vregs - one to hold the VRSAVE register that is live-in to the - // function and one for the value after having bits or'd into it. - Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); - Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass); - - const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - MachineBasicBlock &EntryBB = *Fn.begin(); - DebugLoc dl; - // Emit the following code into the entry block: - // InVRSAVE = MFVRSAVE - // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE - // MTVRSAVE UpdatedVRSAVE - MachineBasicBlock::iterator IP = EntryBB.begin(); // Insert Point - BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE); - BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE), - UpdatedVRSAVE).addReg(InVRSAVE); - BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE); - - // Find all return blocks, outputting a restore in each epilog. - for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (BB->isReturnBlock()) { - IP = BB->end(); --IP; - - // Skip over all terminator instructions, which are part of the return - // sequence. - MachineBasicBlock::iterator I2 = IP; - while (I2 != BB->begin() && (--I2)->isTerminator()) - IP = I2; - - // Emit: MTVRSAVE InVRSave - BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE); - } - } -} - /// getGlobalBaseReg - Output the instructions required to put the /// base address to use for accessing globals into a register. 
/// @@ -648,6 +586,8 @@ bool PPCDAGToDAGISel::tryTLSXFormStore(StoreSDNode *ST) { SDValue Offset = ST->getOffset(); if (!Offset.isUndef()) return false; + if (Base.getOperand(1).getOpcode() == PPCISD::TLS_LOCAL_EXEC_MAT_ADDR) + return false; SDLoc dl(ST); EVT MemVT = ST->getMemoryVT(); @@ -691,6 +631,8 @@ bool PPCDAGToDAGISel::tryTLSXFormLoad(LoadSDNode *LD) { SDValue Offset = LD->getOffset(); if (!Offset.isUndef()) return false; + if (Base.getOperand(1).getOpcode() == PPCISD::TLS_LOCAL_EXEC_MAT_ADDR) + return false; SDLoc dl(LD); EVT MemVT = LD->getMemoryVT(); @@ -800,251 +742,6 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) { return false; } -// Predict the number of instructions that would be generated by calling -// selectI64Imm(N). -static unsigned selectI64ImmInstrCountDirect(int64_t Imm) { - // Assume no remaining bits. - unsigned Remainder = 0; - // Assume no shift required. - unsigned Shift = 0; - - // If it can't be represented as a 32 bit value. - if (!isInt<32>(Imm)) { - Shift = countTrailingZeros<uint64_t>(Imm); - int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; - - // If the shifted value fits 32 bits. - if (isInt<32>(ImmSh)) { - // Go with the shifted value. - Imm = ImmSh; - } else { - // Still stuck with a 64 bit value. - Remainder = Imm; - Shift = 32; - Imm >>= 32; - } - } - - // Intermediate operand. - unsigned Result = 0; - - // Handle first 32 bits. - unsigned Lo = Imm & 0xFFFF; - - // Simple value. - if (isInt<16>(Imm)) { - // Just the Lo bits. - ++Result; - } else if (Lo) { - // Handle the Hi bits and Lo bits. - Result += 2; - } else { - // Just the Hi bits. - ++Result; - } - - // If no shift, we're done. - if (!Shift) return Result; - - // If Hi word == Lo word, - // we can use rldimi to insert the Lo word into Hi word. - if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) { - ++Result; - return Result; - } - - // Shift for next step if the upper 32-bits were not zero. - if (Imm) - ++Result; - - // Add in the last bits as required. 
- if ((Remainder >> 16) & 0xFFFF) - ++Result; - if (Remainder & 0xFFFF) - ++Result; - - return Result; -} - -static uint64_t Rot64(uint64_t Imm, unsigned R) { - return (Imm << R) | (Imm >> (64 - R)); -} - -static unsigned selectI64ImmInstrCount(int64_t Imm) { - unsigned Count = selectI64ImmInstrCountDirect(Imm); - - // If the instruction count is 1 or 2, we do not need further analysis - // since rotate + load constant requires at least 2 instructions. - if (Count <= 2) - return Count; - - for (unsigned r = 1; r < 63; ++r) { - uint64_t RImm = Rot64(Imm, r); - unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; - Count = std::min(Count, RCount); - - // See comments in selectI64Imm for an explanation of the logic below. - unsigned LS = findLastSet(RImm); - if (LS != r-1) - continue; - - uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); - uint64_t RImmWithOnes = RImm | OnesMask; - - RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; - Count = std::min(Count, RCount); - } - - return Count; -} - -// Select a 64-bit constant. For cost-modeling purposes, selectI64ImmInstrCount -// (above) needs to be kept in sync with this function. -static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl, - int64_t Imm) { - // Assume no remaining bits. - unsigned Remainder = 0; - // Assume no shift required. - unsigned Shift = 0; - - // If it can't be represented as a 32 bit value. - if (!isInt<32>(Imm)) { - Shift = countTrailingZeros<uint64_t>(Imm); - int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift; - - // If the shifted value fits 32 bits. - if (isInt<32>(ImmSh)) { - // Go with the shifted value. - Imm = ImmSh; - } else { - // Still stuck with a 64 bit value. - Remainder = Imm; - Shift = 32; - Imm >>= 32; - } - } - - // Intermediate operand. - SDNode *Result; - - // Handle first 32 bits. 
- unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; - - auto getI32Imm = [CurDAG, dl](unsigned Imm) { - return CurDAG->getTargetConstant(Imm, dl, MVT::i32); - }; - - // Simple value. - if (isInt<16>(Imm)) { - uint64_t SextImm = SignExtend64(Lo, 16); - SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); - // Just the Lo bits. - Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); - } else if (Lo) { - // Handle the Hi bits. - unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8; - Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi)); - // And Lo bits. - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } else { - // Just the Hi bits. - Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi)); - } - - // If no shift, we're done. - if (!Shift) return Result; - - // If Hi word == Lo word, - // we can use rldimi to insert the Lo word into Hi word. - if ((unsigned)(Imm & 0xFFFFFFFF) == Remainder) { - SDValue Ops[] = - { SDValue(Result, 0), SDValue(Result, 0), getI32Imm(Shift), getI32Imm(0)}; - return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops); - } - - // Shift for next step if the upper 32-bits were not zero. - if (Imm) { - Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, - SDValue(Result, 0), - getI32Imm(Shift), - getI32Imm(63 - Shift)); - } - - // Add in the last bits as required. 
- if ((Hi = (Remainder >> 16) & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Hi)); - } - if ((Lo = Remainder & 0xFFFF)) { - Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, - SDValue(Result, 0), getI32Imm(Lo)); - } - - return Result; -} - -static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, - int64_t Imm) { - unsigned Count = selectI64ImmInstrCountDirect(Imm); - - // If the instruction count is 1 or 2, we do not need further analysis - // since rotate + load constant requires at least 2 instructions. - if (Count <= 2) - return selectI64ImmDirect(CurDAG, dl, Imm); - - unsigned RMin = 0; - - int64_t MatImm; - unsigned MaskEnd; - - for (unsigned r = 1; r < 63; ++r) { - uint64_t RImm = Rot64(Imm, r); - unsigned RCount = selectI64ImmInstrCountDirect(RImm) + 1; - if (RCount < Count) { - Count = RCount; - RMin = r; - MatImm = RImm; - MaskEnd = 63; - } - - // If the immediate to generate has many trailing zeros, it might be - // worthwhile to generate a rotated value with too many leading ones - // (because that's free with li/lis's sign-extension semantics), and then - // mask them off after rotation. - - unsigned LS = findLastSet(RImm); - // We're adding (63-LS) higher-order ones, and we expect to mask them off - // after performing the inverse rotation by (64-r). 
So we need that: - // 63-LS == 64-r => LS == r-1 - if (LS != r-1) - continue; - - uint64_t OnesMask = -(int64_t) (UINT64_C(1) << (LS+1)); - uint64_t RImmWithOnes = RImm | OnesMask; - - RCount = selectI64ImmInstrCountDirect(RImmWithOnes) + 1; - if (RCount < Count) { - Count = RCount; - RMin = r; - MatImm = RImmWithOnes; - MaskEnd = LS; - } - } - - if (!RMin) - return selectI64ImmDirect(CurDAG, dl, Imm); - - auto getI32Imm = [CurDAG, dl](unsigned Imm) { - return CurDAG->getTargetConstant(Imm, dl, MVT::i32); - }; - - SDValue Val = SDValue(selectI64ImmDirect(CurDAG, dl, MatImm), 0); - return CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64, Val, - getI32Imm(64 - RMin), getI32Imm(MaskEnd)); -} - static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) { unsigned MaxTruncation = 0; // Cannot use range-based for loop here as we need the actual use (i.e. we @@ -1101,6 +798,274 @@ static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) { return MaxTruncation; } +// For any 32 < Num < 64, check if the Imm contains at least Num consecutive +// zeros and return the number of bits by the left of these consecutive zeros. +static int findContiguousZerosAtLeast(uint64_t Imm, unsigned Num) { + unsigned HiTZ = countTrailingZeros<uint32_t>(Hi_32(Imm)); + unsigned LoLZ = countLeadingZeros<uint32_t>(Lo_32(Imm)); + if ((HiTZ + LoLZ) >= Num) + return (32 + HiTZ); + return 0; +} + +// Direct materialization of 64-bit constants by enumerated patterns. 
+static SDNode *selectI64ImmDirect(SelectionDAG *CurDAG, const SDLoc &dl, + uint64_t Imm, unsigned &InstCnt) { + unsigned TZ = countTrailingZeros<uint64_t>(Imm); + unsigned LZ = countLeadingZeros<uint64_t>(Imm); + unsigned TO = countTrailingOnes<uint64_t>(Imm); + unsigned LO = countLeadingOnes<uint64_t>(Imm); + unsigned Hi32 = Hi_32(Imm); + unsigned Lo32 = Lo_32(Imm); + SDNode *Result = nullptr; + unsigned Shift = 0; + + auto getI32Imm = [CurDAG, dl](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + }; + + // Following patterns use 1 instructions to materialize the Imm. + InstCnt = 1; + // 1-1) Patterns : {zeros}{15-bit valve} + // {ones}{15-bit valve} + if (isInt<16>(Imm)) { + SDValue SDImm = CurDAG->getTargetConstant(Imm, dl, MVT::i64); + return CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, SDImm); + } + // 1-2) Patterns : {zeros}{15-bit valve}{16 zeros} + // {ones}{15-bit valve}{16 zeros} + if (TZ > 15 && (LZ > 32 || LO > 32)) + return CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, + getI32Imm((Imm >> 16) & 0xffff)); + + // Following patterns use 2 instructions to materialize the Imm. + InstCnt = 2; + assert(LZ < 64 && "Unexpected leading zeros here."); + // Count of ones follwing the leading zeros. + unsigned FO = countLeadingOnes<uint64_t>(Imm << LZ); + // 2-1) Patterns : {zeros}{31-bit value} + // {ones}{31-bit value} + if (isInt<32>(Imm)) { + uint64_t ImmHi16 = (Imm >> 16) & 0xffff; + unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16)); + return CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Imm & 0xffff)); + } + // 2-2) Patterns : {zeros}{ones}{15-bit value}{zeros} + // {zeros}{15-bit value}{zeros} + // {zeros}{ones}{15-bit value} + // {ones}{15-bit value}{zeros} + // We can take advantage of LI's sign-extension semantics to generate leading + // ones, and then use RLDIC to mask off the ones in both sides after rotation. 
+ if ((LZ + FO + TZ) > 48) { + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, + getI32Imm((Imm >> TZ) & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(TZ), getI32Imm(LZ)); + } + // 2-3) Pattern : {zeros}{15-bit value}{ones} + // Shift right the Imm by (48 - LZ) bits to construct a negtive 16 bits value, + // therefore we can take advantage of LI's sign-extension semantics, and then + // mask them off after rotation. + // + // +--LZ--||-15-bit-||--TO--+ +-------------|--16-bit--+ + // |00000001bbbbbbbbb1111111| -> |00000000000001bbbbbbbbb1| + // +------------------------+ +------------------------+ + // 63 0 63 0 + // Imm (Imm >> (48 - LZ) & 0xffff) + // +----sext-----|--16-bit--+ +clear-|-----------------+ + // |11111111111111bbbbbbbbb1| -> |00000001bbbbbbbbb1111111| + // +------------------------+ +------------------------+ + // 63 0 63 0 + // LI8: sext many leading zeros RLDICL: rotate left (48 - LZ), clear left LZ + if ((LZ + TO) > 48) { + // Since the immediates with (LZ > 32) have been handled by previous + // patterns, here we have (LZ <= 32) to make sure we will not shift right + // the Imm by a negative value. + assert(LZ <= 32 && "Unexpected shift value."); + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, + getI32Imm((Imm >> (48 - LZ) & 0xffff))); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(48 - LZ), getI32Imm(LZ)); + } + // 2-4) Patterns : {zeros}{ones}{15-bit value}{ones} + // {ones}{15-bit value}{ones} + // We can take advantage of LI's sign-extension semantics to generate leading + // ones, and then use RLDICL to mask off the ones in left sides (if required) + // after rotation. 
+ // + // +-LZ-FO||-15-bit-||--TO--+ +-------------|--16-bit--+ + // |00011110bbbbbbbbb1111111| -> |000000000011110bbbbbbbbb| + // +------------------------+ +------------------------+ + // 63 0 63 0 + // Imm (Imm >> TO) & 0xffff + // +----sext-----|--16-bit--+ +LZ|---------------------+ + // |111111111111110bbbbbbbbb| -> |00011110bbbbbbbbb1111111| + // +------------------------+ +------------------------+ + // 63 0 63 0 + // LI8: sext many leading zeros RLDICL: rotate left TO, clear left LZ + if ((LZ + FO + TO) > 48) { + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, + getI32Imm((Imm >> TO) & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(TO), getI32Imm(LZ)); + } + // 2-5) Pattern : {32 zeros}{****}{0}{15-bit value} + // If Hi32 is zero and the Lo16(in Lo32) can be presented as a positive 16 bit + // value, we can use LI for Lo16 without generating leading ones then add the + // Hi16(in Lo32). + if (LZ == 32 && ((Lo32 & 0x8000) == 0)) { + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, + getI32Imm(Lo32 & 0xffff)); + return CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Lo32 >> 16)); + } + // 2-6) Patterns : {******}{49 zeros}{******} + // {******}{49 ones}{******} + // If the Imm contains 49 consecutive zeros/ones, it means that a total of 15 + // bits remain on both sides. Rotate right the Imm to construct an int<16> + // value, use LI for int<16> value and then use RLDICL without mask to rotate + // it back. 
+ // + // 1) findContiguousZerosAtLeast(Imm, 49) + // +------|--zeros-|------+ +---ones--||---15 bit--+ + // |bbbbbb0000000000aaaaaa| -> |0000000000aaaaaabbbbbb| + // +----------------------+ +----------------------+ + // 63 0 63 0 + // + // 2) findContiguousZerosAtLeast(~Imm, 49) + // +------|--ones--|------+ +---ones--||---15 bit--+ + // |bbbbbb1111111111aaaaaa| -> |1111111111aaaaaabbbbbb| + // +----------------------+ +----------------------+ + // 63 0 63 0 + if ((Shift = findContiguousZerosAtLeast(Imm, 49)) || + (Shift = findContiguousZerosAtLeast(~Imm, 49))) { + uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift)); + Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, + getI32Imm(RotImm & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Shift), getI32Imm(0)); + } + + // Following patterns use 3 instructions to materialize the Imm. + InstCnt = 3; + // 3-1) Patterns : {zeros}{ones}{31-bit value}{zeros} + // {zeros}{31-bit value}{zeros} + // {zeros}{ones}{31-bit value} + // {ones}{31-bit value}{zeros} + // We can take advantage of LIS's sign-extension semantics to generate leading + // ones, add the remaining bits with ORI, and then use RLDIC to mask off the + // ones in both sides after rotation. + if ((LZ + FO + TZ) > 32) { + uint64_t ImmHi16 = (Imm >> (TZ + 16)) & 0xffff; + unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16)); + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm((Imm >> TZ) & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDIC, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(TZ), getI32Imm(LZ)); + } + // 3-2) Pattern : {zeros}{31-bit value}{ones} + // Shift right the Imm by (32 - LZ) bits to construct a negative 32 bits value, + // therefore we can take advantage of LIS's sign-extension semantics, add + // the remaining bits with ORI, and then mask them off after rotation.
+ // This is similar to Pattern 2-3, please refer to the diagram there. + if ((LZ + TO) > 32) { + // Since the immediates with (LZ > 32) have been handled by previous + // patterns, here we have (LZ <= 32) to make sure we will not shift right + // the Imm by a negative value. + assert(LZ <= 32 && "Unexpected shift value."); + Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, + getI32Imm((Imm >> (48 - LZ)) & 0xffff)); + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm((Imm >> (32 - LZ)) & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(32 - LZ), getI32Imm(LZ)); + } + // 3-3) Patterns : {zeros}{ones}{31-bit value}{ones} + // {ones}{31-bit value}{ones} + // We can take advantage of LIS's sign-extension semantics to generate leading + // ones, add the remaining bits with ORI, and then use RLDICL to mask off the + // ones in left sides (if required) after rotation. + // This is similar to Pattern 2-4, please refer to the diagram there. + if ((LZ + FO + TO) > 32) { + Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, + getI32Imm((Imm >> (TO + 16)) & 0xffff)); + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm((Imm >> TO) & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(TO), getI32Imm(LZ)); + } + // 3-4) Patterns : High word == Low word + if (Hi32 == Lo32) { + // Handle the first 32 bits. + uint64_t ImmHi16 = (Lo32 >> 16) & 0xffff; + unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16)); + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Lo32 & 0xffff)); + // Use rldimi to insert the Low word into High word. 
+ SDValue Ops[] = {SDValue(Result, 0), SDValue(Result, 0), getI32Imm(32), + getI32Imm(0)}; + return CurDAG->getMachineNode(PPC::RLDIMI, dl, MVT::i64, Ops); + } + // 3-5) Patterns : {******}{33 zeros}{******} + // {******}{33 ones}{******} + // If the Imm contains 33 consecutive zeros/ones, it means that a total of 31 + // bits remain on both sides. Rotate right the Imm to construct an int<32> + // value, use LIS + ORI for int<32> value and then use RLDICL without mask to + // rotate it back. + // This is similar to Pattern 2-6, please refer to the diagram there. + if ((Shift = findContiguousZerosAtLeast(Imm, 33)) || + (Shift = findContiguousZerosAtLeast(~Imm, 33))) { + uint64_t RotImm = (Imm >> Shift) | (Imm << (64 - Shift)); + uint64_t ImmHi16 = (RotImm >> 16) & 0xffff; + unsigned Opcode = ImmHi16 ? PPC::LIS8 : PPC::LI8; + Result = CurDAG->getMachineNode(Opcode, dl, MVT::i64, getI32Imm(ImmHi16)); + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(RotImm & 0xffff)); + return CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Shift), getI32Imm(0)); + } + + InstCnt = 0; + return nullptr; +} + +static SDNode *selectI64Imm(SelectionDAG *CurDAG, const SDLoc &dl, uint64_t Imm, + unsigned *InstCnt = nullptr) { + unsigned InstCntDirect = 0; + // No more than 3 instructions is used if we can select the i64 immediate + // directly. + SDNode *Result = selectI64ImmDirect(CurDAG, dl, Imm, InstCntDirect); + if (Result) { + if (InstCnt) + *InstCnt = InstCntDirect; + return Result; + } + auto getI32Imm = [CurDAG, dl](unsigned Imm) { + return CurDAG->getTargetConstant(Imm, dl, MVT::i32); + }; + // Handle the upper 32 bit value. + Result = + selectI64ImmDirect(CurDAG, dl, Imm & 0xffffffff00000000, InstCntDirect); + // Add in the last bits as required. 
+ if (uint32_t Hi16 = (Lo_32(Imm) >> 16) & 0xffff) { + Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64, + SDValue(Result, 0), getI32Imm(Hi16)); + ++InstCntDirect; + } + if (uint32_t Lo16 = Lo_32(Imm) & 0xffff) { + Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64, SDValue(Result, 0), + getI32Imm(Lo16)); + ++InstCntDirect; + } + if (InstCnt) + *InstCnt = InstCntDirect; + return Result; +} + // Select a 64-bit constant. static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) { SDLoc dl(N); @@ -1253,6 +1218,7 @@ class BitPermutationSelector { } break; case ISD::SHL: + case PPCISD::SHL: if (isa<ConstantSDNode>(V.getOperand(1))) { unsigned ShiftAmt = V.getConstantOperandVal(1); @@ -1268,6 +1234,7 @@ class BitPermutationSelector { } break; case ISD::SRL: + case PPCISD::SRL: if (isa<ConstantSDNode>(V.getOperand(1))) { unsigned ShiftAmt = V.getConstantOperandVal(1); @@ -2147,11 +2114,14 @@ class BitPermutationSelector { unsigned NumAndInsts = (unsigned) NeedsRotate + (unsigned) (bool) Res; + unsigned NumOfSelectInsts = 0; + selectI64Imm(CurDAG, dl, Mask, &NumOfSelectInsts); + assert(NumOfSelectInsts > 0 && "Failed to select an i64 constant."); if (Use32BitInsts) NumAndInsts += (unsigned) (ANDIMask != 0) + (unsigned) (ANDISMask != 0) + (unsigned) (ANDIMask != 0 && ANDISMask != 0); else - NumAndInsts += selectI64ImmInstrCount(Mask) + /* and */ 1; + NumAndInsts += NumOfSelectInsts + /* and */ 1; unsigned NumRLInsts = 0; bool FirstBG = true; @@ -2375,12 +2345,14 @@ class BitPermutationSelector { Res = SDValue(CurDAG->getMachineNode(PPC::OR8, dl, MVT::i64, ExtendToInt64(ANDIVal, dl), ANDISVal), 0); } else { - if (InstCnt) *InstCnt += selectI64ImmInstrCount(Mask) + /* and */ 1; - - SDValue MaskVal = SDValue(selectI64Imm(CurDAG, dl, Mask), 0); - Res = - SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, - ExtendToInt64(Res, dl), MaskVal), 0); + unsigned NumOfSelectInsts = 0; + SDValue MaskVal = + SDValue(selectI64Imm(CurDAG, dl, Mask, &NumOfSelectInsts), 
0); + Res = SDValue(CurDAG->getMachineNode(PPC::AND8, dl, MVT::i64, + ExtendToInt64(Res, dl), MaskVal), + 0); + if (InstCnt) + *InstCnt += NumOfSelectInsts + /* and */ 1; } } @@ -2411,7 +2383,7 @@ class BitPermutationSelector { } void eraseMatchingBitGroups(function_ref<bool(const BitGroup &)> F) { - BitGroups.erase(remove_if(BitGroups, F), BitGroups.end()); + erase_if(BitGroups, F); } SmallVector<ValueBit, 64> Bits; @@ -3661,6 +3633,12 @@ bool PPCDAGToDAGISel::tryIntCompareInGPR(SDNode *N) { if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64()) return false; + // For POWER10, it is more profitable to use the set boolean extension + // instructions rather than the integer compare elimination codegen. + // Users can override this via the command line option, `--ppc-gpr-icmps`. + if (!(CmpInGPR.getNumOccurrences() > 0) && Subtarget->isISA3_1()) + return false; + switch (N->getOpcode()) { default: break; case ISD::ZERO_EXTEND: @@ -3708,7 +3686,7 @@ bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { /// SelectCC - Select a comparison of the specified values with the specified /// condition code, returning the CR# of the expression. SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, - const SDLoc &dl) { + const SDLoc &dl, SDValue Chain) { // Always select the LHS. 
unsigned Opc; @@ -3861,7 +3839,12 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, assert(Subtarget->hasVSX() && "__float128 requires VSX"); Opc = PPC::XSCMPUQP; } - return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); + if (Chain) + return SDValue( + CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::Other, LHS, RHS, Chain), + 0); + else + return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC, const EVT &VT, @@ -3936,7 +3919,8 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) { // getVCmpInst: return the vector compare instruction for the specified // vector type and condition code. Since this is for altivec specific code, -// only support the altivec types (v16i8, v8i16, v4i32, v2i64, and v4f32). +// only support the altivec types (v16i8, v8i16, v4i32, v2i64, v1i128, +// and v4f32). static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, bool HasVSX, bool &Swap, bool &Negate) { Swap = false; @@ -4017,6 +4001,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, return PPC::VCMPEQUW; else if (VecVT == MVT::v2i64) return PPC::VCMPEQUD; + else if (VecVT == MVT::v1i128) + return PPC::VCMPEQUQ; break; case ISD::SETGT: if (VecVT == MVT::v16i8) @@ -4027,6 +4013,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, return PPC::VCMPGTSW; else if (VecVT == MVT::v2i64) return PPC::VCMPGTSD; + else if (VecVT == MVT::v1i128) + return PPC::VCMPGTSQ; break; case ISD::SETUGT: if (VecVT == MVT::v16i8) @@ -4037,6 +4025,8 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, return PPC::VCMPGTUW; else if (VecVT == MVT::v2i64) return PPC::VCMPGTUD; + else if (VecVT == MVT::v1i128) + return PPC::VCMPGTUQ; break; default: break; @@ -4048,17 +4038,23 @@ static unsigned int getVCmpInst(MVT VecVT, ISD::CondCode CC, bool PPCDAGToDAGISel::trySETCC(SDNode *N) { SDLoc dl(N); unsigned Imm; - ISD::CondCode CC = 
cast<CondCodeSDNode>(N->getOperand(2))->get(); + bool IsStrict = N->isStrictFPOpcode(); + ISD::CondCode CC = + cast<CondCodeSDNode>(N->getOperand(IsStrict ? 3 : 2))->get(); EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout()); bool isPPC64 = (PtrVT == MVT::i64); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + + SDValue LHS = N->getOperand(IsStrict ? 1 : 0); + SDValue RHS = N->getOperand(IsStrict ? 2 : 1); - if (!Subtarget->useCRBits() && isInt32Immediate(N->getOperand(1), Imm)) { + if (!IsStrict && !Subtarget->useCRBits() && isInt32Immediate(RHS, Imm)) { // We can codegen setcc op, imm very efficiently compared to a brcond. // Check for those cases here. // setcc op, 0 if (Imm == 0) { - SDValue Op = N->getOperand(0); + SDValue Op = LHS; switch (CC) { default: break; case ISD::SETEQ: { @@ -4093,7 +4089,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { } } } else if (Imm == ~0U) { // setcc op, -1 - SDValue Op = N->getOperand(0); + SDValue Op = LHS; switch (CC) { default: break; case ISD::SETEQ: @@ -4136,13 +4132,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { } } - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - // Altivec Vector compare instructions do not set any CR register by default and // vector compare operations return the same type as the operands. 
- if (LHS.getValueType().isVector()) { - if (Subtarget->hasQPX() || Subtarget->hasSPE()) + if (!IsStrict && LHS.getValueType().isVector()) { + if (Subtarget->hasSPE()) return false; EVT VecVT = LHS.getValueType(); @@ -4169,7 +4162,9 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { bool Inv; unsigned Idx = getCRIdxForSetCC(CC, Inv); - SDValue CCReg = SelectCC(LHS, RHS, CC, dl); + SDValue CCReg = SelectCC(LHS, RHS, CC, dl, Chain); + if (IsStrict) + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), CCReg.getValue(1)); SDValue IntCR; // SPE e*cmp* instructions only set the 'gt' bit, so hard-code that @@ -4272,8 +4267,10 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, (FalseRes.getOpcode() != ISD::SELECT_CC || CC != ISD::SETEQ))) return false; - bool InnerIsSel = FalseRes.getOpcode() == ISD::SELECT_CC; - SDValue SetOrSelCC = InnerIsSel ? FalseRes : FalseRes.getOperand(0); + SDValue SetOrSelCC = FalseRes.getOpcode() == ISD::SELECT_CC + ? FalseRes + : FalseRes.getOperand(0); + bool InnerIsSel = SetOrSelCC.getOpcode() == ISD::SELECT_CC; if (SetOrSelCC.getOpcode() != ISD::SETCC && SetOrSelCC.getOpcode() != ISD::SELECT_CC) return false; @@ -4382,6 +4379,81 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, return true; } +// Return true if it's a software square-root/divide operand. 
+static bool isSWTestOp(SDValue N) { + if (N.getOpcode() == PPCISD::FTSQRT) + return true; + if (N.getNumOperands() < 1 || !isa<ConstantSDNode>(N.getOperand(0))) + return false; + switch (N.getConstantOperandVal(0)) { + case Intrinsic::ppc_vsx_xvtdivdp: + case Intrinsic::ppc_vsx_xvtdivsp: + case Intrinsic::ppc_vsx_xvtsqrtdp: + case Intrinsic::ppc_vsx_xvtsqrtsp: + return true; + } + return false; +} + +bool PPCDAGToDAGISel::tryFoldSWTestBRCC(SDNode *N) { + assert(N->getOpcode() == ISD::BR_CC && "ISD::BR_CC is expected."); + // We are looking for following patterns, where `truncate to i1` actually has + // the same semantic with `and 1`. + // (br_cc seteq, (truncateToi1 SWTestOp), 0) -> (BCC PRED_NU, SWTestOp) + // (br_cc seteq, (and SWTestOp, 2), 0) -> (BCC PRED_NE, SWTestOp) + // (br_cc seteq, (and SWTestOp, 4), 0) -> (BCC PRED_LE, SWTestOp) + // (br_cc seteq, (and SWTestOp, 8), 0) -> (BCC PRED_GE, SWTestOp) + // (br_cc setne, (truncateToi1 SWTestOp), 0) -> (BCC PRED_UN, SWTestOp) + // (br_cc setne, (and SWTestOp, 2), 0) -> (BCC PRED_EQ, SWTestOp) + // (br_cc setne, (and SWTestOp, 4), 0) -> (BCC PRED_GT, SWTestOp) + // (br_cc setne, (and SWTestOp, 8), 0) -> (BCC PRED_LT, SWTestOp) + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return false; + + SDValue CmpRHS = N->getOperand(3); + if (!isa<ConstantSDNode>(CmpRHS) || + cast<ConstantSDNode>(CmpRHS)->getSExtValue() != 0) + return false; + + SDValue CmpLHS = N->getOperand(2); + if (CmpLHS.getNumOperands() < 1 || !isSWTestOp(CmpLHS.getOperand(0))) + return false; + + unsigned PCC = 0; + bool IsCCNE = CC == ISD::SETNE; + if (CmpLHS.getOpcode() == ISD::AND && + isa<ConstantSDNode>(CmpLHS.getOperand(1))) + switch (CmpLHS.getConstantOperandVal(1)) { + case 1: + PCC = IsCCNE ? PPC::PRED_UN : PPC::PRED_NU; + break; + case 2: + PCC = IsCCNE ? PPC::PRED_EQ : PPC::PRED_NE; + break; + case 4: + PCC = IsCCNE ? 
PPC::PRED_GT : PPC::PRED_LE; + break; + case 8: + PCC = IsCCNE ? PPC::PRED_LT : PPC::PRED_GE; + break; + default: + return false; + } + else if (CmpLHS.getOpcode() == ISD::TRUNCATE && + CmpLHS.getValueType() == MVT::i1) + PCC = IsCCNE ? PPC::PRED_UN : PPC::PRED_NU; + + if (PCC) { + SDLoc dl(N); + SDValue Ops[] = {getI32Imm(PCC, dl), CmpLHS.getOperand(0), N->getOperand(4), + N->getOperand(0)}; + CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); + return true; + } + return false; +} + bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) { assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected"); unsigned Imm; @@ -4661,7 +4733,48 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } break; + case ISD::INTRINSIC_WO_CHAIN: { + if (!Subtarget->isISA3_1()) + break; + unsigned Opcode = 0; + switch (N->getConstantOperandVal(0)) { + default: + break; + case Intrinsic::ppc_altivec_vstribr_p: + Opcode = PPC::VSTRIBR_rec; + break; + case Intrinsic::ppc_altivec_vstribl_p: + Opcode = PPC::VSTRIBL_rec; + break; + case Intrinsic::ppc_altivec_vstrihr_p: + Opcode = PPC::VSTRIHR_rec; + break; + case Intrinsic::ppc_altivec_vstrihl_p: + Opcode = PPC::VSTRIHL_rec; + break; + } + if (!Opcode) + break; + + // Generate the appropriate vector string isolate intrinsic to match. + EVT VTs[] = {MVT::v16i8, MVT::Glue}; + SDValue VecStrOp = + SDValue(CurDAG->getMachineNode(Opcode, dl, VTs, N->getOperand(2)), 0); + // Vector string isolate instructions update the EQ bit of CR6. + // Generate a SETBC instruction to extract the bit and place it in a GPR. 
+ SDValue SubRegIdx = CurDAG->getTargetConstant(PPC::sub_eq, dl, MVT::i32); + SDValue CR6Reg = CurDAG->getRegister(PPC::CR6, MVT::i32); + SDValue CRBit = SDValue( + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i1, + CR6Reg, SubRegIdx, VecStrOp.getValue(1)), + 0); + CurDAG->SelectNodeTo(N, PPC::SETBC, MVT::i32, CRBit); + return; + } + case ISD::SETCC: + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: if (trySETCC(N)) return; break; @@ -4813,8 +4926,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load"); switch (LoadedVT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid PPC load type!"); - case MVT::v4f64: Opcode = PPC::QVLFDUX; break; // QPX - case MVT::v4f32: Opcode = PPC::QVLFSUX; break; // QPX case MVT::f64: Opcode = PPC::LFDUX; break; case MVT::f32: Opcode = PPC::LFSUX; break; case MVT::i32: Opcode = PPC::LWZUX; break; @@ -4961,6 +5072,32 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::MUL: { + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() != ISD::Constant || Op1.getValueType() != MVT::i64) + break; + + // If the multiplier fits int16, we can handle it with mulli. + int64_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue(); + unsigned Shift = countTrailingZeros<uint64_t>(Imm); + if (isInt<16>(Imm) || !Shift) + break; + + // If the shifted value fits int16, we can do this transformation: + // (mul X, c1 << c2) -> (rldicr (mulli X, c1) c2). We do this in ISEL due to + // DAGCombiner prefers (shl (mul X, c1), c2) -> (mul X, c1 << c2). 
+ uint64_t ImmSh = Imm >> Shift; + if (isInt<16>(ImmSh)) { + uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16); + SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); + SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI8, dl, MVT::i64, + N->getOperand(0), SDImm); + CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, SDValue(MulNode, 0), + getI32Imm(Shift, dl), getI32Imm(63 - Shift, dl)); + return; + } + break; + } // FIXME: Remove this once the ANDI glue bug is fixed: case PPCISD::ANDI_rec_1_EQ_BIT: case PPCISD::ANDI_rec_1_GT_BIT: { @@ -5095,12 +5232,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SelectCCOp = PPC::SELECT_CC_F16; else if (Subtarget->hasSPE()) SelectCCOp = PPC::SELECT_CC_SPE; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64) - SelectCCOp = PPC::SELECT_CC_QFRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32) - SelectCCOp = PPC::SELECT_CC_QSRC; - else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1) - SelectCCOp = PPC::SELECT_CC_QBRC; else if (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64) SelectCCOp = PPC::SELECT_CC_VSRC; @@ -5192,6 +5323,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } case ISD::BR_CC: { + if (tryFoldSWTestBRCC(N)) + return; ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); unsigned PCC = getPredicateForSetCC(CC, N->getOperand(2).getValueType(), Subtarget); @@ -5856,9 +5989,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: @@ -6177,9 +6307,6 @@ void PPCDAGToDAGISel::PeepholeCROps() { case PPC::SELECT_I8: case PPC::SELECT_F4: case PPC::SELECT_F8: - case PPC::SELECT_QFRC: - case PPC::SELECT_QSRC: - case PPC::SELECT_QBRC: case PPC::SELECT_SPE: case PPC::SELECT_SPE4: case PPC::SELECT_VRRC: diff --git 
a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ddfbd04e1ebc..9215c17cb94b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -74,6 +74,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSectionXCOFF.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" @@ -120,6 +121,11 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden); static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables", cl::desc("use absolute jump tables on ppc"), cl::Hidden); +// TODO - Remove this option if soft fp128 has been fully supported . +static cl::opt<bool> + EnableSoftFP128("enable-soft-fp128", + cl::desc("temp option to enable soft fp128"), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM"); @@ -145,7 +151,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (!useSoftFloat()) { if (hasSPE()) { addRegisterClass(MVT::f32, &PPC::GPRCRegClass); - addRegisterClass(MVT::f64, &PPC::SPERCRegClass); + // EFPU2 APU only supports f32 + if (!Subtarget.hasEFPU2()) + addRegisterClass(MVT::f64, &PPC::SPERCRegClass); } else { addRegisterClass(MVT::f32, &PPC::F4RCRegClass); addRegisterClass(MVT::f64, &PPC::F8RCRegClass); @@ -215,13 +223,36 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (isPPC64 || Subtarget.hasFPCVT()) { + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote); + AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1, + isPPC64 ? 
MVT::i64 : MVT::i32); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote); + AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); + + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote); + AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote); + AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + + setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); + AddPromotedToType(ISD::FP_TO_SINT, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); + AddPromotedToType(ISD::FP_TO_UINT, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); } else { + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); } @@ -247,6 +278,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // PPC (the libcall is not available). setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom); // We do not currently implement these libm ops for PowerPC. 
setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand); @@ -299,8 +332,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); - if (Subtarget.hasVSX()) - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal); + if (Subtarget.hasVSX()) { + setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal); + } if (Subtarget.hasFSQRT()) { setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -338,6 +373,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FMA , MVT::f32, Legal); } + if (Subtarget.hasSPE()) + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); // If we're enabling GP optimizations, use hardware square root @@ -415,6 +453,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (!Subtarget.useCRBits()) setOperationAction(ISD::SETCC, MVT::i32, Custom); + if (Subtarget.hasFPU()) { + setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal); + + setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal); + } + // PowerPC does not have BRCOND which requires SetCC if (!Subtarget.useCRBits()) setOperationAction(ISD::BRCOND, MVT::Other, Expand); @@ -431,9 +479,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); } else { // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores. 
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); // PowerPC does not have [U|S]INT_TO_FP + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); } @@ -561,36 +612,56 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); // This is just the low 32 bits of a (signed) fp->i64 conversion. // We cannot do this with Promote because i64 is not a legal type. + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) + if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) { setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); + } } else { // PowerPC does not have FP_TO_UINT on 32-bit implementations. 
if (Subtarget.hasSPE()) { setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - } else + } else { + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); + } } // With the instructions enabled under FPCVT, we can do everything. if (Subtarget.hasFPCVT()) { if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); } + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); @@ -613,6 +684,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } + // PowerPC has better expansions for funnel shifts than the generic + // TargetLowering::expandFunnelShift. 
+ if (Subtarget.has64BitSupport()) { + setOperationAction(ISD::FSHL, MVT::i64, Custom); + setOperationAction(ISD::FSHR, MVT::i64, Custom); + } + setOperationAction(ISD::FSHL, MVT::i32, Custom); + setOperationAction(ISD::FSHR, MVT::i32, Custom); + if (Subtarget.hasVSX()) { setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); @@ -745,9 +825,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v2i64, Expand); } - for (auto VT : {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}) - setOperationAction(ISD::ABS, VT, Custom); - // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle // with merges, splats, etc. setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom); @@ -767,6 +844,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v4i32, Subtarget.useCRBits() ? Legal : Expand); setOperationAction(ISD::STORE , MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); @@ -776,11 +857,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); - // Without hasP8Altivec set, v2i64 SMAX isn't available. - // But ABS custom lowering requires SMAX support. - if (!Subtarget.hasP8Altivec()) - setOperationAction(ISD::ABS, MVT::v2i64, Expand); - // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8. 
setOperationAction(ISD::ROTL, MVT::v1i128, Custom); // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w). @@ -799,7 +875,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); - if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) { + if (Subtarget.hasVSX()) { setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } @@ -809,6 +885,27 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, else setOperationAction(ISD::MUL, MVT::v4i32, Custom); + if (Subtarget.isISA3_1()) { + setOperationAction(ISD::MUL, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v2i64, Legal); + setOperationAction(ISD::MULHU, MVT::v2i64, Legal); + setOperationAction(ISD::MULHS, MVT::v4i32, Legal); + setOperationAction(ISD::MULHU, MVT::v4i32, Legal); + setOperationAction(ISD::UDIV, MVT::v2i64, Legal); + setOperationAction(ISD::SDIV, MVT::v2i64, Legal); + setOperationAction(ISD::UDIV, MVT::v4i32, Legal); + setOperationAction(ISD::SDIV, MVT::v4i32, Legal); + setOperationAction(ISD::UREM, MVT::v2i64, Legal); + setOperationAction(ISD::SREM, MVT::v2i64, Legal); + setOperationAction(ISD::UREM, MVT::v4i32, Legal); + setOperationAction(ISD::SREM, MVT::v4i32, Legal); + setOperationAction(ISD::UREM, MVT::v1i128, Legal); + setOperationAction(ISD::SREM, MVT::v1i128, Legal); + setOperationAction(ISD::UDIV, MVT::v1i128, Legal); + setOperationAction(ISD::SDIV, MVT::v1i128, Legal); + setOperationAction(ISD::ROTL, MVT::v1i128, Legal); + } + setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -920,6 +1017,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SUB, MVT::v2i64, Expand); } + if (Subtarget.isISA3_1()) + setOperationAction(ISD::SETCC, MVT::v1i128, Legal); + else + setOperationAction(ISD::SETCC, MVT::v1i128, Expand); + 
setOperationAction(ISD::LOAD, MVT::v2i64, Promote); AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64); setOperationAction(ISD::STORE, MVT::v2i64, Promote); @@ -927,6 +1029,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); @@ -935,6 +1041,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Custom handling for partial vectors of integers converted to // floating point. We already have optimal handling for v2i32 through // the DAG combine, so those aren't necessary. 
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i8, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i16, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i8, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i16, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom); @@ -966,7 +1080,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal); @@ -980,7 +1094,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal); - setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal); + setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal); @@ -1063,6 +1177,48 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BSWAP, MVT::v4i32, Legal); setOperationAction(ISD::BSWAP, 
MVT::v2i64, Legal); setOperationAction(ISD::BSWAP, MVT::v1i128, Legal); + } else if (Subtarget.hasAltivec() && EnableSoftFP128) { + addRegisterClass(MVT::f128, &PPC::VRRCRegClass); + + for (MVT FPT : MVT::fp_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand); + + setOperationAction(ISD::LOAD, MVT::f128, Promote); + setOperationAction(ISD::STORE, MVT::f128, Promote); + + AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32); + AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32); + + // Set FADD/FSUB as libcall to avoid the legalizer to expand the + // fp_to_uint and int_to_fp. + setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); + + setOperationAction(ISD::FMUL, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Expand); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); + setOperationAction(ISD::FPOWI, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + + // Expand the fp_extend if the target type is fp128. + setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand); + + // Expand the fp_round if the source type is fp128. 
+ for (MVT VT : {MVT::f32, MVT::f64}) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + } } if (Subtarget.hasP9Altivec()) { @@ -1079,164 +1235,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } - if (Subtarget.hasQPX()) { - setOperationAction(ISD::FADD, MVT::v4f64, Legal); - setOperationAction(ISD::FSUB, MVT::v4f64, Legal); - setOperationAction(ISD::FMUL, MVT::v4f64, Legal); - setOperationAction(ISD::FREM, MVT::v4f64, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f64, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f64, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f64, Custom); - setOperationAction(ISD::STORE , MVT::v4f64, Custom); - - setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f64, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f64, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f64, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f64, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f64, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f64, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f64, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f64, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f64, Expand); - - setOperationAction(ISD::FP_ROUND , MVT::v4f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); - - setOperationAction(ISD::FNEG , MVT::v4f64, Legal); - setOperationAction(ISD::FABS , MVT::v4f64, Legal); - setOperationAction(ISD::FSIN , MVT::v4f64, Expand); - setOperationAction(ISD::FCOS , MVT::v4f64, Expand); - setOperationAction(ISD::FPOW , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG 
, MVT::v4f64, Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP , MVT::v4f64, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f64, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f64, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f64, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f64, Legal); - - addRegisterClass(MVT::v4f64, &PPC::QFRCRegClass); - - setOperationAction(ISD::FADD, MVT::v4f32, Legal); - setOperationAction(ISD::FSUB, MVT::v4f32, Legal); - setOperationAction(ISD::FMUL, MVT::v4f32, Legal); - setOperationAction(ISD::FREM, MVT::v4f32, Expand); - - setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal); - setOperationAction(ISD::FGETSIGN, MVT::v4f32, Expand); - - setOperationAction(ISD::LOAD , MVT::v4f32, Custom); - setOperationAction(ISD::STORE , MVT::v4f32, Custom); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4f32, Expand); - setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4f32, Legal); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4f32, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4f32, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4f32, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - - setOperationAction(ISD::FP_TO_SINT , MVT::v4f32, Legal); - setOperationAction(ISD::FP_TO_UINT , MVT::v4f32, Expand); - - setOperationAction(ISD::FNEG , MVT::v4f32, Legal); - setOperationAction(ISD::FABS , MVT::v4f32, Legal); - setOperationAction(ISD::FSIN , MVT::v4f32, Expand); - setOperationAction(ISD::FCOS , MVT::v4f32, Expand); - setOperationAction(ISD::FPOW , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG , MVT::v4f32, 
Expand); - setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand); - setOperationAction(ISD::FLOG10 , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP , MVT::v4f32, Expand); - setOperationAction(ISD::FEXP2 , MVT::v4f32, Expand); - - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); - - setIndexedLoadAction(ISD::PRE_INC, MVT::v4f32, Legal); - setIndexedStoreAction(ISD::PRE_INC, MVT::v4f32, Legal); - - addRegisterClass(MVT::v4f32, &PPC::QSRCRegClass); - - setOperationAction(ISD::AND , MVT::v4i1, Legal); - setOperationAction(ISD::OR , MVT::v4i1, Legal); - setOperationAction(ISD::XOR , MVT::v4i1, Legal); - - if (!Subtarget.useCRBits()) - setOperationAction(ISD::SELECT, MVT::v4i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v4i1, Legal); - - setOperationAction(ISD::LOAD , MVT::v4i1, Custom); - setOperationAction(ISD::STORE , MVT::v4i1, Custom); - - setOperationAction(ISD::EXTRACT_VECTOR_ELT , MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT , MVT::v4i1, Expand); - setOperationAction(ISD::CONCAT_VECTORS , MVT::v4i1, Expand); - setOperationAction(ISD::EXTRACT_SUBVECTOR , MVT::v4i1, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE , MVT::v4i1, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i1, Expand); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); - - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); - - addRegisterClass(MVT::v4i1, &PPC::QBRCRegClass); - - setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); - setOperationAction(ISD::FROUND, MVT::v4f64, Legal); - - setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); - setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); - setOperationAction(ISD::FROUND, MVT::v4f32, Legal); - - 
setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); - - // These need to set FE_INEXACT, and so cannot be vectorized here. - setOperationAction(ISD::FRINT, MVT::v4f64, Expand); - setOperationAction(ISD::FRINT, MVT::v4f32, Expand); - - if (TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FDIV, MVT::v4f64, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); - - setOperationAction(ISD::FDIV, MVT::v4f32, Legal); - setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); - } else { - setOperationAction(ISD::FDIV, MVT::v4f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f64, Expand); - - setOperationAction(ISD::FDIV, MVT::v4f32, Expand); - setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); - } - - // TODO: Handle constrained floating-point operations of v4f64 + if (Subtarget.pairedVectorMemops()) { + addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass); + setOperationAction(ISD::LOAD, MVT::v256i1, Custom); + setOperationAction(ISD::STORE, MVT::v256i1, Custom); + } + if (Subtarget.hasMMA()) { + addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass); + setOperationAction(ISD::LOAD, MVT::v512i1, Custom); + setOperationAction(ISD::STORE, MVT::v512i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom); } if (Subtarget.has64BitSupport()) setOperationAction(ISD::PREFETCH, MVT::Other, Legal); + if (Subtarget.isISA3_1()) + setOperationAction(ISD::SRA, MVT::v1i128, Legal); + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom); if (!isPPC64) { @@ -1258,6 +1274,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } + if (!isPPC64) + setMaxAtomicSizeInBitsSupported(32); + setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: @@ -1295,12 +1314,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setTargetDAGCombine(ISD::SELECT_CC); } - // Use reciprocal estimates. - if (TM.Options.UnsafeFPMath) { - setTargetDAGCombine(ISD::FDIV); - setTargetDAGCombine(ISD::FSQRT); - } - if (Subtarget.hasP9Altivec()) { setTargetDAGCombine(ISD::ABS); setTargetDAGCombine(ISD::VSELECT); @@ -1316,8 +1329,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::POW_F128, "powf128"); setLibcallName(RTLIB::FMIN_F128, "fminf128"); setLibcallName(RTLIB::FMAX_F128, "fmaxf128"); - setLibcallName(RTLIB::POWI_F128, "__powikf2"); setLibcallName(RTLIB::REM_F128, "fmodf128"); + setLibcallName(RTLIB::SQRT_F128, "sqrtf128"); + setLibcallName(RTLIB::CEIL_F128, "ceilf128"); + setLibcallName(RTLIB::FLOOR_F128, "floorf128"); + setLibcallName(RTLIB::TRUNC_F128, "truncf128"); + setLibcallName(RTLIB::ROUND_F128, "roundf128"); + setLibcallName(RTLIB::LROUND_F128, "lroundf128"); + setLibcallName(RTLIB::LLROUND_F128, "llroundf128"); + setLibcallName(RTLIB::RINT_F128, "rintf128"); + setLibcallName(RTLIB::LRINT_F128, "lrintf128"); + setLibcallName(RTLIB::LLRINT_F128, "llrintf128"); + setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128"); + setLibcallName(RTLIB::FMA_F128, "fmaf128"); // With 32 condition bits, we don't need to sink (and duplicate) compares // aggressively in CodeGenPrep. @@ -1380,6 +1404,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, MaxLoadsPerMemcmpOptSize = 4; } + IsStrictFPEnabled = true; + // Let the subtarget (CPU) decide if a predictable select is more expensive // than the corresponding branch. This information is used in CGP to decide // when to convert selects into branches. @@ -1422,8 +1448,8 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, // 16byte and wider vectors are passed on 16byte boundary. 
// The rest is 8 on PPC64 and 4 on PPC32 boundary. Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); - if (Subtarget.hasAltivec() || Subtarget.hasQPX()) - getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16)); + if (Subtarget.hasAltivec()) + getMaxByValAlign(Ty, Alignment, Align(16)); return Alignment.value(); } @@ -1439,16 +1465,6 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { return VT.isScalarInteger(); } -/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific -/// type is cheaper than a multiply followed by a shift. -/// This is true for words and doublewords on 64-bit PowerPC. -bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const { - if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) || - isOperationLegal(ISD::MULHU, Type))) - return true; - return TargetLowering::isMulhCheaperThanMulShift(Type); -} - const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -1469,6 +1485,10 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::FP_TO_SINT_IN_VSR"; case PPCISD::FRE: return "PPCISD::FRE"; case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; + case PPCISD::FTSQRT: + return "PPCISD::FTSQRT"; + case PPCISD::FSQRT: + return "PPCISD::FSQRT"; case PPCISD::STFIWX: return "PPCISD::STFIWX"; case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; @@ -1516,7 +1536,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ANDI_rec_1_GT_BIT: return "PPCISD::ANDI_rec_1_GT_BIT"; case PPCISD::VCMP: return "PPCISD::VCMP"; - case PPCISD::VCMPo: return "PPCISD::VCMPo"; + case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec"; case PPCISD::LBRX: return "PPCISD::LBRX"; case PPCISD::STBRX: return "PPCISD::STBRX"; case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; @@ -1553,6 +1573,8 @@ const char 
*PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; + case PPCISD::PADDI_DTPREL: + return "PPCISD::PADDI_DTPREL"; case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; case PPCISD::SC: return "PPCISD::SC"; case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; @@ -1561,12 +1583,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case PPCISD::VABSD: return "PPCISD::VABSD"; - case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; - case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; - case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; - case PPCISD::QVESPLATI: return "PPCISD::QVESPLATI"; - case PPCISD::QBFLT: return "PPCISD::QBFLT"; - case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1574,8 +1590,35 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR"; + case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR: + return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR"; + case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR: + return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR"; + case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD"; + case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD"; + case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG"; + case PPCISD::XXMFACC: return "PPCISD::XXMFACC"; case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; case PPCISD::FNMSUB: return "PPCISD::FNMSUB"; + case PPCISD::STRICT_FADDRTZ: + return 
"PPCISD::STRICT_FADDRTZ"; + case PPCISD::STRICT_FCTIDZ: + return "PPCISD::STRICT_FCTIDZ"; + case PPCISD::STRICT_FCTIWZ: + return "PPCISD::STRICT_FCTIWZ"; + case PPCISD::STRICT_FCTIDUZ: + return "PPCISD::STRICT_FCTIDUZ"; + case PPCISD::STRICT_FCTIWUZ: + return "PPCISD::STRICT_FCTIWUZ"; + case PPCISD::STRICT_FCFID: + return "PPCISD::STRICT_FCFID"; + case PPCISD::STRICT_FCFIDU: + return "PPCISD::STRICT_FCFIDU"; + case PPCISD::STRICT_FCFIDS: + return "PPCISD::STRICT_FCFIDS"; + case PPCISD::STRICT_FCFIDUS: + return "PPCISD::STRICT_FCFIDUS"; + case PPCISD::LXVRZX: return "PPCISD::LXVRZX"; } return nullptr; } @@ -1585,9 +1628,6 @@ EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, if (!VT.isVector()) return Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - if (Subtarget.hasQPX()) - return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements()); - return VT.changeVectorElementTypeToInteger(); } @@ -2361,36 +2401,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { return SDValue(); } -/// isQVALIGNIShuffleMask - If this is a qvaligni shuffle mask, return the shift -/// amount, otherwise return -1. -int PPC::isQVALIGNIShuffleMask(SDNode *N) { - EVT VT = N->getValueType(0); - if (VT != MVT::v4f64 && VT != MVT::v4f32 && VT != MVT::v4i1) - return -1; - - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); - - // Find the first non-undef value in the shuffle mask. - unsigned i; - for (i = 0; i != 4 && SVOp->getMaskElt(i) < 0; ++i) - /*search*/; - - if (i == 4) return -1; // all undef. - - // Otherwise, check to see if the rest of the elements are consecutively - // numbered from this value. - unsigned ShiftAmt = SVOp->getMaskElt(i); - if (ShiftAmt < i) return -1; - ShiftAmt -= i; - - // Check the rest of the elements to see if they are consecutive. 
- for (++i; i != 4; ++i) - if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i)) - return -1; - - return ShiftAmt; -} - //===----------------------------------------------------------------------===// // Addressing Mode Selection //===----------------------------------------------------------------------===// @@ -2432,6 +2442,20 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base, return false; } +/// isIntS34Immediate - This method tests if value of node given can be +/// accurately represented as a sign extension from a 34-bit value. If so, +/// this returns true and the immediate. +bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); + return isInt<34>(Imm); +} +bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { + return isIntS34Immediate(Op.getNode(), Imm); +} + /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is @@ -2632,6 +2656,55 @@ bool PPCTargetLowering::SelectAddressRegImm( return true; // [r+0] } +/// Similar to the 16-bit case but for instructions that take a 34-bit +/// displacement field (prefixed loads/stores). +bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp, + SDValue &Base, + SelectionDAG &DAG) const { + // Only on 64-bit targets. 
+ if (N.getValueType() != MVT::i64) + return false; + + SDLoc dl(N); + int64_t Imm = 0; + + if (N.getOpcode() == ISD::ADD) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + return true; + } + + if (N.getOpcode() == ISD::OR) { + if (!isIntS34Immediate(N.getOperand(1), Imm)) + return false; + // If this is an or of disjoint bitfields, we can codegen this as an add + // (for better address arithmetic) if the LHS and RHS of the OR are + // provably disjoint. + KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0)); + if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL) + return false; + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) + Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); + else + Base = N.getOperand(0); + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + return true; + } + + if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const. + Disp = DAG.getTargetConstant(Imm, dl, N.getValueType()); + Base = DAG.getRegister(PPC::ZERO8, N.getValueType()); + return true; + } + + return false; +} + /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, @@ -2761,16 +2834,9 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, return false; } - // PowerPC doesn't have preinc load/store instructions for vectors (except - // for QPX, which does have preinc r+r forms). 
- if (VT.isVector()) { - if (!Subtarget.hasQPX() || (VT != MVT::v4f64 && VT != MVT::v4f32)) { - return false; - } else if (SelectAddressRegRegOnly(Ptr, Offset, Base, DAG)) { - AM = ISD::PRE_INC; - return true; - } - } + // PowerPC doesn't have preinc load/store instructions for vectors + if (VT.isVector()) + return false; if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) { // Common code will reject creating a pre-inc form if the base pointer @@ -3065,6 +3131,15 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, TLSModel::Model Model = TM.getTLSModel(GV); if (Model == TLSModel::LocalExec) { + if (Subtarget.isUsingPCRelativeCalls()) { + SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64); + SDValue TGA = DAG.getTargetGlobalAddress( + GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG)); + SDValue MatAddr = + DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr); + } + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_HA); SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, @@ -3077,29 +3152,44 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, } if (Model == TLSModel::InitialExec) { - SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); - SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TLS); - SDValue GOTPtr; - if (is64bit) { - setUsesTOCBasePtr(DAG); - SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); - GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, - PtrVT, GOTReg, TGA); + bool IsPCRel = Subtarget.isUsingPCRelativeCalls(); + SDValue TGA = DAG.getTargetGlobalAddress( + GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0); + SDValue TGATLS = DAG.getTargetGlobalAddress( + GV, dl, PtrVT, 0, + IsPCRel ? 
(PPCII::MO_TLS | PPCII::MO_PCREL_FLAG) : PPCII::MO_TLS); + SDValue TPOffset; + if (IsPCRel) { + SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA); + TPOffset = DAG.getLoad(MVT::i64, dl, DAG.getEntryNode(), MatPCRel, + MachinePointerInfo()); } else { - if (!TM.isPositionIndependent()) - GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); - else if (picLevel == PICLevel::SmallPIC) - GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); - else - GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + SDValue GOTPtr; + if (is64bit) { + setUsesTOCBasePtr(DAG); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + GOTPtr = + DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA); + } else { + if (!TM.isPositionIndependent()) + GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT); + else if (picLevel == PICLevel::SmallPIC) + GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT); + else + GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT); + } + TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr); } - SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, - PtrVT, TGA, GOTPtr); return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS); } if (Model == TLSModel::GeneralDynamic) { + if (Subtarget.isUsingPCRelativeCalls()) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_GOT_TLSGD_PCREL_FLAG); + return DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA); + } + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { @@ -3118,6 +3208,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, } if (Model == TLSModel::LocalDynamic) { + if (Subtarget.isUsingPCRelativeCalls()) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + PPCII::MO_GOT_TLSLD_PCREL_FLAG); + SDValue MatPCRel = + DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA); + return DAG.getNode(PPCISD::PADDI_DTPREL, dl, PtrVT, MatPCRel, TGA); + } + 
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); SDValue GOTPtr; if (is64bit) { @@ -3492,11 +3590,6 @@ static const MCPhysReg FPR[] = {PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13}; -/// QFPR - The set of QPX registers that should be allocated for arguments. -static const MCPhysReg QFPR[] = { - PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, - PPC::QF8, PPC::QF9, PPC::QF10, PPC::QF11, PPC::QF12, PPC::QF13}; - /// CalculateStackSlotSize - Calculates the size reserved for this argument on /// the stack. static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags, @@ -3526,10 +3619,6 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 || ArgVT == MVT::v1i128 || ArgVT == MVT::f128) Alignment = Align(16); - // QPX vector types stored in double-precision are padded to a 32 byte - // boundary. - else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1) - Alignment = Align(32); // ByVal parameters are aligned as requested. if (Flags.isByVal()) { @@ -3561,14 +3650,11 @@ static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT, /// stack slot (instead of being passed in registers). ArgOffset, /// AvailableFPRs, and AvailableVRs must hold the current argument /// position, and will be updated to account for this argument. -static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, - ISD::ArgFlagsTy Flags, - unsigned PtrByteSize, - unsigned LinkageSize, - unsigned ParamAreaSize, - unsigned &ArgOffset, +static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags, + unsigned PtrByteSize, unsigned LinkageSize, + unsigned ParamAreaSize, unsigned &ArgOffset, unsigned &AvailableFPRs, - unsigned &AvailableVRs, bool HasQPX) { + unsigned &AvailableVRs) { bool UseMemory = false; // Respect alignment of argument on the stack. 
@@ -3592,11 +3678,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, // However, if the argument is actually passed in an FPR or a VR, // we don't use memory after all. if (!Flags.isByVal()) { - if (ArgVT == MVT::f32 || ArgVT == MVT::f64 || - // QPX registers overlap with the scalar FP registers. - (HasQPX && (ArgVT == MVT::v4f32 || - ArgVT == MVT::v4f64 || - ArgVT == MVT::v4i1))) + if (ArgVT == MVT::f32 || ArgVT == MVT::f64) if (AvailableFPRs > 0) { --AvailableFPRs; return false; @@ -3631,11 +3713,8 @@ SDValue PPCTargetLowering::LowerFormalArguments( if (Subtarget.is64BitELFABI()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); - if (Subtarget.is32BitELFABI()) - return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, - InVals); - - return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, + assert(Subtarget.is32BitELFABI()); + return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); } @@ -3735,18 +3814,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( RC = &PPC::VRRCRegClass; break; case MVT::v4f32: - RC = Subtarget.hasQPX() ? &PPC::QSRCRegClass : &PPC::VRRCRegClass; + RC = &PPC::VRRCRegClass; break; case MVT::v2f64: case MVT::v2i64: RC = &PPC::VRRCRegClass; break; - case MVT::v4f64: - RC = &PPC::QFRCRegClass; - break; - case MVT::v4i1: - RC = &PPC::QBRCRegClass; - break; } SDValue ArgValue; @@ -3945,7 +4018,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( const unsigned Num_GPR_Regs = array_lengthof(GPR); const unsigned Num_FPR_Regs = useSoftFloat() ? 
0 : 13; const unsigned Num_VR_Regs = array_lengthof(VR); - const unsigned Num_QFPR_Regs = Num_FPR_Regs; // Do a first pass over the arguments to determine whether the ABI // guarantees that our caller has allocated the parameter save area @@ -3964,8 +4036,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags, PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + NumBytes, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } @@ -3975,7 +4046,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( unsigned ArgOffset = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; SmallVector<SDValue, 8> MemOps; Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; @@ -4218,51 +4288,20 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { - // These can be scalar arguments or elements of a vector array type - // passed directly. The latter are used to implement ELFv2 homogenous - // vector aggregates. - if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++VR_idx; - } else { - if (CallConv == CallingConv::Fast) - ComputeArgOffset(); - needsLoad = true; - } - if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += 16; - break; - } // not QPX - - assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - LLVM_FALLTHROUGH; - - case MVT::v4f64: - case MVT::v4i1: - // QPX vectors are treated like their scalar floating-point subregisters - // (except that they're larger). - unsigned Sz = ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 ? 
16 : 32; - if (QFPR_idx != Num_QFPR_Regs) { - const TargetRegisterClass *RC; - switch (ObjectVT.getSimpleVT().SimpleTy) { - case MVT::v4f64: RC = &PPC::QFRCRegClass; break; - case MVT::v4f32: RC = &PPC::QSRCRegClass; break; - default: RC = &PPC::QBRCRegClass; break; - } - - unsigned VReg = MF.addLiveIn(QFPR[QFPR_idx], RC); + // These can be scalar arguments or elements of a vector array type + // passed directly. The latter are used to implement ELFv2 homogenous + // vector aggregates. + if (VR_idx != Num_VR_Regs) { + unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++QFPR_idx; + ++VR_idx; } else { if (CallConv == CallingConv::Fast) ComputeArgOffset(); needsLoad = true; } if (CallConv != CallingConv::Fast || needsLoad) - ArgOffset += Sz; + ArgOffset += 16; break; } @@ -4329,366 +4368,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4( return Chain; } -SDValue PPCTargetLowering::LowerFormalArguments_Darwin( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { - // TODO: add description of PPC stack frame format, or at least some docs. - // - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); - - EVT PtrVT = getPointerTy(MF.getDataLayout()); - bool isPPC64 = PtrVT == MVT::i64; - // Potential tail calls could cause overwriting of argument stack slots. - bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt && - (CallConv == CallingConv::Fast)); - unsigned PtrByteSize = isPPC64 ? 8 : 4; - unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); - unsigned ArgOffset = LinkageSize; - // Area that is at least reserved in caller of this function. 
- unsigned MinReservedArea = ArgOffset; - - static const MCPhysReg GPR_32[] = { // 32-bit registers. - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - static const MCPhysReg GPR_64[] = { // 64-bit registers. - PPC::X3, PPC::X4, PPC::X5, PPC::X6, - PPC::X7, PPC::X8, PPC::X9, PPC::X10, - }; - static const MCPhysReg VR[] = { - PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, - PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 - }; - - const unsigned Num_GPR_Regs = array_lengthof(GPR_32); - const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13; - const unsigned Num_VR_Regs = array_lengthof( VR); - - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - - const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; - - // In 32-bit non-varargs functions, the stack space for vectors is after the - // stack space for non-vectors. We do not use this space unless we have - // too many vectors to fit in registers, something that only occurs in - // constructed examples:), but we have to walk the arglist to figure - // that out...for the pathological case, compute VecArgOffset as the - // start of the vector parameter area. Computing VecArgOffset is the - // entire point of the following loop. - unsigned VecArgOffset = ArgOffset; - if (!isVarArg && !isPPC64) { - for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; - ++ArgNo) { - EVT ObjectVT = Ins[ArgNo].VT; - ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; - - if (Flags.isByVal()) { - // ObjSize is the true size, ArgSize rounded up to multiple of regs. - unsigned ObjSize = Flags.getByValSize(); - unsigned ArgSize = - ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; - VecArgOffset += ArgSize; - continue; - } - - switch(ObjectVT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unhandled argument type!"); - case MVT::i1: - case MVT::i32: - case MVT::f32: - VecArgOffset += 4; - break; - case MVT::i64: // PPC64 - case MVT::f64: - // FIXME: We are guaranteed to be !isPPC64 at this point. 
- // Does MVT::i64 apply? - VecArgOffset += 8; - break; - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - // Nothing to do, we're only looking at Nonvector args here. - break; - } - } - } - // We've found where the vector parameter area in memory is. Skip the - // first 12 parameters; these don't use that memory. - VecArgOffset = ((VecArgOffset+15)/16)*16; - VecArgOffset += 12*16; - - // Add DAG nodes to load the arguments or copy them out of registers. On - // entry to a function on PPC, the arguments start after the linkage area, - // although the first ones are often in registers. - - SmallVector<SDValue, 8> MemOps; - unsigned nAltivecParamsAtEnd = 0; - Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin(); - unsigned CurArgIdx = 0; - for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { - SDValue ArgVal; - bool needsLoad = false; - EVT ObjectVT = Ins[ArgNo].VT; - unsigned ObjSize = ObjectVT.getSizeInBits()/8; - unsigned ArgSize = ObjSize; - ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; - if (Ins[ArgNo].isOrigArg()) { - std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx); - CurArgIdx = Ins[ArgNo].getOrigArgIndex(); - } - unsigned CurArgOffset = ArgOffset; - - // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary. - if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 || - ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) { - if (isVarArg || isPPC64) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += CalculateStackSlotSize(ObjectVT, - Flags, - PtrByteSize); - } else nAltivecParamsAtEnd++; - } else - // Calculate min reserved area. - MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT, - Flags, - PtrByteSize); - - // FIXME the codegen can be much improved in some cases. - // We do not have to keep everything in memory. 
- if (Flags.isByVal()) { - assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit"); - - // ObjSize is the true size, ArgSize rounded up to multiple of registers. - ObjSize = Flags.getByValSize(); - ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize; - // Objects of size 1 and 2 are right justified, everything else is - // left justified. This means the memory address is adjusted forwards. - if (ObjSize==1 || ObjSize==2) { - CurArgOffset = CurArgOffset + (4 - ObjSize); - } - // The value of the object is its address. - int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, false, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(FIN); - if (ObjSize==1 || ObjSize==2) { - if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - if (isPPC64) - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - else - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; - SDValue Store = - DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(&*FuncArg), ObjType); - MemOps.push_back(Store); - ++GPR_idx; - } - - ArgOffset += PtrByteSize; - - continue; - } - for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { - // Store whatever pieces of the object are in registers - // to memory. ArgOffset will be the address of the beginning - // of the object. 
- if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - if (isPPC64) - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - else - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); - int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(&*FuncArg, j)); - MemOps.push_back(Store); - ++GPR_idx; - ArgOffset += PtrByteSize; - } else { - ArgOffset += ArgSize - (ArgOffset-CurArgOffset); - break; - } - } - continue; - } - - switch (ObjectVT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unhandled argument type!"); - case MVT::i1: - case MVT::i32: - if (!isPPC64) { - if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); - - if (ObjectVT == MVT::i1) - ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal); - - ++GPR_idx; - } else { - needsLoad = true; - ArgSize = PtrByteSize; - } - // All int arguments reserve stack space in the Darwin ABI. - ArgOffset += PtrByteSize; - break; - } - LLVM_FALLTHROUGH; - case MVT::i64: // PPC64 - if (GPR_idx != Num_GPR_Regs) { - unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); - - if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1) - // PPC64 passes i8, i16, and i32 values in i64 registers. Promote - // value to MVT::i64 and then truncate to the correct register size. - ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl); - - ++GPR_idx; - } else { - needsLoad = true; - ArgSize = PtrByteSize; - } - // All int arguments reserve stack space in the Darwin ABI. - ArgOffset += 8; - break; - - case MVT::f32: - case MVT::f64: - // Every 4 bytes of argument space consumes one of the GPRs available for - // argument passing. 
- if (GPR_idx != Num_GPR_Regs) { - ++GPR_idx; - if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64) - ++GPR_idx; - } - if (FPR_idx != Num_FPR_Regs) { - unsigned VReg; - - if (ObjectVT == MVT::f32) - VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass); - else - VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass); - - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - ++FPR_idx; - } else { - needsLoad = true; - } - - // All FP arguments reserve stack space in the Darwin ABI. - ArgOffset += isPPC64 ? 8 : ObjSize; - break; - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - // Note that vector arguments in registers don't reserve stack space, - // except in varargs functions. - if (VR_idx != Num_VR_Regs) { - unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass); - ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT); - if (isVarArg) { - while ((ArgOffset % 16) != 0) { - ArgOffset += PtrByteSize; - if (GPR_idx != Num_GPR_Regs) - GPR_idx++; - } - ArgOffset += 16; - GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64? - } - ++VR_idx; - } else { - if (!isVarArg && !isPPC64) { - // Vectors go after all the nonvectors. - CurArgOffset = VecArgOffset; - VecArgOffset += 16; - } else { - // Vectors are aligned. - ArgOffset = ((ArgOffset+15)/16)*16; - CurArgOffset = ArgOffset; - ArgOffset += 16; - } - needsLoad = true; - } - break; - } - - // We need to load the argument to a virtual register if we determined above - // that we ran out of physical registers of the appropriate type. - if (needsLoad) { - int FI = MFI.CreateFixedObject(ObjSize, - CurArgOffset + (ArgSize - ObjSize), - isImmutable); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo()); - } - - InVals.push_back(ArgVal); - } - - // Allow for Altivec parameters at the end, if needed. 
- if (nAltivecParamsAtEnd) { - MinReservedArea = ((MinReservedArea+15)/16)*16; - MinReservedArea += 16*nAltivecParamsAtEnd; - } - - // Area that is at least reserved in the caller of this function. - MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize); - - // Set the size that is at least reserved in caller of this function. Tail - // call optimized functions' reserved stack space needs to be aligned so that - // taking the difference between two stack areas will result in an aligned - // stack. - MinReservedArea = - EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); - FuncInfo->setMinReservedArea(MinReservedArea); - - // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. - if (isVarArg) { - int Depth = ArgOffset; - - FuncInfo->setVarArgsFrameIndex( - MFI.CreateFixedObject(PtrVT.getSizeInBits()/8, - Depth, true)); - SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); - - // If this function is vararg, store any remaining integer argument regs - // to their spots on the stack so that they may be loaded by dereferencing - // the result of va_next. 
- for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) { - unsigned VReg; - - if (isPPC64) - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); - else - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass); - - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo()); - MemOps.push_back(Store); - // Increment the address by four for the next argument to store - SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT); - FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff); - } - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - - return Chain; -} - /// CalculateTailCallSPDiff - Get the amount the stack pointer has to be /// adjusted to accommodate the arguments for the tailcall. static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall, @@ -4759,6 +4438,13 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee, if (STICallee->isUsingPCRelativeCalls()) return false; + // If the GV is not a strong definition then we need to assume it can be + // replaced by another function at link time. The function that replaces + // it may not share the same TOC as the caller since the callee may be + // replaced by a PC Relative version of the same function. + if (!GV->isStrongDefinitionForLinker()) + return false; + // The medium and large code models are expected to provide a sufficiently // large TOC to provide all data addressing needs of a module with a // single TOC. @@ -4766,12 +4452,6 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee, CodeModel::Large == TM.getCodeModel()) return true; - // Otherwise we need to ensure callee and caller are in the same section, - // since the linker may allocate multiple TOCs, and we don't know which - // sections will belong to the same TOC base. 
- if (!GV->isStrongDefinitionForLinker()) - return false; - // Any explicitly-specified sections and section prefixes must also match. // Also, if we're using -ffunction-sections, then each function is always in // a different section (the same is true for COMDAT functions). @@ -4815,10 +4495,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget, for (const ISD::OutputArg& Param : Outs) { if (Param.Flags.isNest()) continue; - if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytes, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize, + LinkageSize, ParamAreaSize, NumBytes, + AvailableFPRs, AvailableVRs)) return true; } return false; @@ -5332,66 +5011,53 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, Subtarget.is32BitELFABI() && !isLocalCallee() && Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_; - // On AIX, direct function calls reference the symbol for the function's - // entry point, which is named by prepending a "." before the function's - // C-linkage name. - const auto getAIXFuncEntryPointSymbolSDNode = - [&](StringRef FuncName, bool IsDeclaration, - const XCOFF::StorageClass &SC) { - auto &Context = DAG.getMachineFunction().getMMI().getContext(); - - MCSymbolXCOFF *S = cast<MCSymbolXCOFF>( - Context.getOrCreateSymbol(Twine(".") + Twine(FuncName))); - - if (IsDeclaration && !S->hasRepresentedCsectSet()) { - // On AIX, an undefined symbol needs to be associated with a - // MCSectionXCOFF to get the correct storage mapping class. - // In this case, XCOFF::XMC_PR. 
- MCSectionXCOFF *Sec = Context.getXCOFFSection( - S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC, - SectionKind::getMetadata()); - S->setRepresentedCsect(Sec); - } + const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) { + const TargetMachine &TM = Subtarget.getTargetMachine(); + const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering(); + MCSymbolXCOFF *S = + cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM)); - MVT PtrVT = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - return DAG.getMCSymbol(S, PtrVT); - }; + MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + return DAG.getMCSymbol(S, PtrVT); + }; if (isFunctionGlobalAddress(Callee)) { - const GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Callee); - const GlobalValue *GV = G->getGlobal(); + const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - if (!Subtarget.isAIXABI()) - return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, - UsePlt ? PPCII::MO_PLT : 0); - - assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX."); - const GlobalObject *GO = cast<GlobalObject>(GV); - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO); - return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(), - SC); + if (Subtarget.isAIXABI()) { + assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX."); + return getAIXFuncEntryPointSymbolSDNode(GV); + } + return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, + UsePlt ? PPCII::MO_PLT : 0); } if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *SymName = S->getSymbol(); - if (!Subtarget.isAIXABI()) - return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(), - UsePlt ? 
PPCII::MO_PLT : 0); + if (Subtarget.isAIXABI()) { + // If there exists a user-declared function whose name is the same as the + // ExternalSymbol's, then we pick up the user-declared version. + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); + if (const Function *F = + dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) + return getAIXFuncEntryPointSymbolSDNode(F); + + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by prepending a "." before the function's + // C-linkage name. A Qualname is returned here because an external + // function entry point is a csect with XTY_ER property. + const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) { + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + MCSectionXCOFF *Sec = Context.getXCOFFSection( + (Twine(".") + Twine(SymName)).str(), XCOFF::XMC_PR, XCOFF::XTY_ER, + SectionKind::getMetadata()); + return Sec->getQualNameSymbol(); + }; - // If there exists a user-declared function whose name is the same as the - // ExternalSymbol's, then we pick up the user-declared version. - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - if (const Function *F = - dyn_cast_or_null<Function>(Mod->getNamedValue(SymName))) { - const XCOFF::StorageClass SC = - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F); - return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(), - SC); + SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data(); } - - return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT); + return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(), + UsePlt ? PPCII::MO_PLT : 0); } // No transformation needed. 
@@ -5736,19 +5402,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }), CLI.NoMerge); - if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) - return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, - InVals, CB); - - if (Subtarget.isSVR4ABI()) - return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, - InVals, CB); - if (Subtarget.isAIXABI()) return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, InVals, CB); - return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + assert(Subtarget.isSVR4ABI()); + if (Subtarget.isPPC64()) + return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, + InVals, CB); + return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG, InVals, CB); } @@ -6045,7 +5707,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); unsigned NumBytes = LinkageSize; unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - unsigned &QFPR_idx = FPR_idx; static const MCPhysReg GPR[] = { PPC::X3, PPC::X4, PPC::X5, PPC::X6, @@ -6059,7 +5720,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( const unsigned NumGPRs = array_lengthof(GPR); const unsigned NumFPRs = useSoftFloat() ? 0 : 13; const unsigned NumVRs = array_lengthof(VR); - const unsigned NumQFPRs = NumFPRs; // On ELFv2, we can avoid allocating the parameter area if all the arguments // can be passed to the callee in registers. 
@@ -6074,9 +5734,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( for (unsigned i = 0; i != NumOps; ++i) { if (Outs[i].Flags.isNest()) continue; if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags, - PtrByteSize, LinkageSize, ParamAreaSize, - NumBytesTmp, AvailableFPRs, AvailableVRs, - Subtarget.hasQPX())) + PtrByteSize, LinkageSize, ParamAreaSize, + NumBytesTmp, AvailableFPRs, AvailableVRs)) HasParameterArea = true; } } @@ -6124,20 +5783,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. - if (Subtarget.hasQPX()) { - if (++NumFPRsUsed <= NumFPRs) - continue; - } else { - if (++NumVRsUsed <= NumVRs) - continue; - } + if (++NumVRsUsed <= NumVRs) + continue; break; case MVT::f32: case MVT::f64: - case MVT::v4f64: // QPX - case MVT::v4i1: // QPX if (++NumFPRsUsed <= NumFPRs) continue; break; @@ -6499,7 +6149,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( case MVT::v2i64: case MVT::v1i128: case MVT::f128: - if (!Subtarget.hasQPX()) { // These can be scalar arguments or elements of a vector array type // passed directly. The latter are used to implement ELFv2 homogenous // vector aggregates. @@ -6555,63 +6204,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( if (!IsFastCall) ArgOffset += 16; break; - } // not QPX - - assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 && - "Invalid QPX parameter type"); - - LLVM_FALLTHROUGH; - case MVT::v4f64: - case MVT::v4i1: { - bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32; - if (CFlags.IsVarArg) { - assert(HasParameterArea && - "Parameter area must exist if we have a varargs call."); - // We could elide this store in the case where the object fits - // entirely in R registers. Maybe later. 
- SDValue Store = - DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Store); - if (QFPR_idx != NumQFPRs) { - SDValue Load = DAG.getLoad(IsF32 ? MVT::v4f32 : MVT::v4f64, dl, Store, - PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Load)); - } - ArgOffset += (IsF32 ? 16 : 32); - for (unsigned i = 0; i < (IsF32 ? 16U : 32U); i += PtrByteSize) { - if (GPR_idx == NumGPRs) - break; - SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, - DAG.getConstant(i, dl, PtrVT)); - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - break; - } - - // Non-varargs QPX params go into registers or on the stack. - if (QFPR_idx != NumQFPRs) { - RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg)); - } else { - if (IsFastCall) - ComputePtrOff(); - - assert(HasParameterArea && - "Parameter area must exist to pass an argument in memory."); - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - true, CFlags.IsTailCall, true, MemOpChains, - TailCallArguments, dl); - if (IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - } - - if (!IsFastCall) - ArgOffset += (IsF32 ? 16 : 32); - break; - } } } @@ -6665,384 +6257,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( Callee, SPDiff, NumBytes, Ins, InVals, CB); } -SDValue PPCTargetLowering::LowerCall_Darwin( - SDValue Chain, SDValue Callee, CallFlags CFlags, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - const CallBase *CB) const { - unsigned NumOps = Outs.size(); - - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - bool isPPC64 = PtrVT == MVT::i64; - unsigned PtrByteSize = isPPC64 ? 
8 : 4; - - MachineFunction &MF = DAG.getMachineFunction(); - - // Mark this function as potentially containing a function that contains a - // tail call. As a consequence the frame pointer will be used for dynamicalloc - // and restoring the callers stack pointer in this functions epilog. This is - // done because by tail calling the called function might overwrite the value - // in this function's (MF) stack pointer stack slot 0(SP). - if (getTargetMachine().Options.GuaranteedTailCallOpt && - CFlags.CallConv == CallingConv::Fast) - MF.getInfo<PPCFunctionInfo>()->setHasFastCall(); - - // Count how many bytes are to be pushed on the stack, including the linkage - // area, and parameter passing area. We start with 24/48 bytes, which is - // prereserved space for [SP][CR][LR][3 x unused]. - unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); - unsigned NumBytes = LinkageSize; - - // Add up all the space actually used. - // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually - // they all go in registers, but we must reserve stack space for them for - // possible use by the caller. In varargs or 64-bit calls, parameters are - // assigned stack space in order, with padding so Altivec parameters are - // 16-byte aligned. - unsigned nAltivecParamsAtEnd = 0; - for (unsigned i = 0; i != NumOps; ++i) { - ISD::ArgFlagsTy Flags = Outs[i].Flags; - EVT ArgVT = Outs[i].VT; - // Varargs Altivec parameters are padded to a 16 byte boundary. - if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 || - ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 || - ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) { - if (!CFlags.IsVarArg && !isPPC64) { - // Non-varargs Altivec parameters go after all the non-Altivec - // parameters; handle those later so we know how much padding we need. - nAltivecParamsAtEnd++; - continue; - } - // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary. 
- NumBytes = ((NumBytes+15)/16)*16; - } - NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize); - } - - // Allow for Altivec parameters at the end, if needed. - if (nAltivecParamsAtEnd) { - NumBytes = ((NumBytes+15)/16)*16; - NumBytes += 16*nAltivecParamsAtEnd; - } - - // The prolog code of the callee may store up to 8 GPR argument registers to - // the stack, allowing va_start to index over them in memory if its varargs. - // Because we cannot tell if this is needed on the caller side, we have to - // conservatively assume that it is needed. As such, make sure we have at - // least enough stack space for the caller to store the 8 GPRs. - NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize); - - // Tail call needs the stack to be aligned. - if (getTargetMachine().Options.GuaranteedTailCallOpt && - CFlags.CallConv == CallingConv::Fast) - NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes); - - // Calculate by how many bytes the stack has to be adjusted in case of tail - // call optimization. - int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes); - - // To protect arguments on the stack from being clobbered in a tail call, - // force all the loads to happen before doing any other lowering. - if (CFlags.IsTailCall) - Chain = DAG.getStackArgumentTokenFactor(Chain); - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); - SDValue CallSeqStart = Chain; - - // Load the return address and frame pointer so it can be move somewhere else - // later. - SDValue LROp, FPOp; - Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl); - - // Set up a copy of the stack pointer for use loading and storing any - // arguments that may not fit in the registers available for argument - // passing. 
- SDValue StackPtr; - if (isPPC64) - StackPtr = DAG.getRegister(PPC::X1, MVT::i64); - else - StackPtr = DAG.getRegister(PPC::R1, MVT::i32); - - // Figure out which arguments are going to go in registers, and which in - // memory. Also, if this is a vararg function, floating point operations - // must be stored to our stack, and loaded into integer regs as well, if - // any integer regs are available for argument passing. - unsigned ArgOffset = LinkageSize; - unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0; - - static const MCPhysReg GPR_32[] = { // 32-bit registers. - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10, - }; - static const MCPhysReg GPR_64[] = { // 64-bit registers. - PPC::X3, PPC::X4, PPC::X5, PPC::X6, - PPC::X7, PPC::X8, PPC::X9, PPC::X10, - }; - static const MCPhysReg VR[] = { - PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8, - PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13 - }; - const unsigned NumGPRs = array_lengthof(GPR_32); - const unsigned NumFPRs = 13; - const unsigned NumVRs = array_lengthof(VR); - - const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; - - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; - SmallVector<TailCallArgumentInfo, 8> TailCallArguments; - - SmallVector<SDValue, 8> MemOpChains; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; - - // PtrOff will be used to store the current argument to the stack if a - // register cannot be found for it. - SDValue PtrOff; - - PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType()); - - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - - // On PPC64, promote integers to 64-bit values. - if (isPPC64 && Arg.getValueType() == MVT::i32) { - // FIXME: Should this use ANY_EXTEND if neither sext nor zext? - unsigned ExtOp = Flags.isSExt() ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg); - } - - // FIXME memcpy is used way more than necessary. Correctness first. - // Note: "by value" is code for passing a structure by value, not - // basic types. - if (Flags.isByVal()) { - unsigned Size = Flags.getByValSize(); - // Very small objects are passed right-justified. Everything else is - // passed left-justified. - if (Size==1 || Size==2) { - EVT VT = (Size==1) ? MVT::i8 : MVT::i16; - if (GPR_idx != NumGPRs) { - SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg, - MachinePointerInfo(), VT); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - - ArgOffset += PtrByteSize; - } else { - SDValue Const = DAG.getConstant(PtrByteSize - Size, dl, - PtrOff.getValueType()); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const); - Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr, - CallSeqStart, - Flags, DAG, dl); - ArgOffset += PtrByteSize; - } - continue; - } - // Copy entire object into memory. There are cases where gcc-generated - // code assumes it is there, even if it could be put entirely into - // registers. (This is not what the doc says.) - Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, - CallSeqStart, - Flags, DAG, dl); - - // For small aggregates (Darwin only) and aggregates >= PtrByteSize, - // copy the pieces of the object that fit into registers from the - // parameter save area. 
- for (unsigned j=0; j<Size; j+=PtrByteSize) { - SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType()); - SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - if (GPR_idx != NumGPRs) { - SDValue Load = - DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - ArgOffset += PtrByteSize; - } else { - ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize; - break; - } - } - continue; - } - - switch (Arg.getSimpleValueType().SimpleTy) { - default: llvm_unreachable("Unexpected ValueType for argument!"); - case MVT::i1: - case MVT::i32: - case MVT::i64: - if (GPR_idx != NumGPRs) { - if (Arg.getValueType() == MVT::i1) - Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg); - - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); - } else { - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, CFlags.IsTailCall, false, MemOpChains, - TailCallArguments, dl); - } - ArgOffset += PtrByteSize; - break; - case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { - RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); - - if (CFlags.IsVarArg) { - SDValue Store = - DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Store); - - // Float varargs are always shadowed in available integer registers - if (GPR_idx != NumGPRs) { - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){ - SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType()); - PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour); - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - 
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - } else { - // If we have any FPRs remaining, we may also have GPRs remaining. - // Args passed in FPRs consume either 1 (f32) or 2 (f64) available - // GPRs. - if (GPR_idx != NumGPRs) - ++GPR_idx; - if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && - !isPPC64) // PPC64 has 64-bit GPR's obviously :) - ++GPR_idx; - } - } else - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, CFlags.IsTailCall, false, MemOpChains, - TailCallArguments, dl); - if (isPPC64) - ArgOffset += 8; - else - ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8; - break; - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - if (CFlags.IsVarArg) { - // These go aligned on the stack, or in the corresponding R registers - // when within range. The Darwin PPC ABI doc claims they also go in - // V registers; in fact gcc does this only for arguments that are - // prototyped, not for those that match the ... We do it for all - // arguments, seems to work. - while (ArgOffset % 16 !=0) { - ArgOffset += PtrByteSize; - if (GPR_idx != NumGPRs) - GPR_idx++; - } - // We could elide this store in the case where the object fits - // entirely in R registers. Maybe later. 
- PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, - DAG.getConstant(ArgOffset, dl, PtrVT)); - SDValue Store = - DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Store); - if (VR_idx != NumVRs) { - SDValue Load = - DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load)); - } - ArgOffset += 16; - for (unsigned i=0; i<16; i+=PtrByteSize) { - if (GPR_idx == NumGPRs) - break; - SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, - DAG.getConstant(i, dl, PtrVT)); - SDValue Load = - DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo()); - MemOpChains.push_back(Load.getValue(1)); - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load)); - } - break; - } - - // Non-varargs Altivec params generally go in registers, but have - // stack space allocated at the end. - if (VR_idx != NumVRs) { - // Doesn't have GPR space allocated. - RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg)); - } else if (nAltivecParamsAtEnd==0) { - // We are emitting Altivec params in order. - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, CFlags.IsTailCall, true, MemOpChains, - TailCallArguments, dl); - ArgOffset += 16; - } - break; - } - } - // If all Altivec parameters fit in registers, as they usually do, - // they get stack space following the non-Altivec parameters. We - // don't track this here because nobody below needs it. - // If there are more Altivec parameters than fit in registers emit - // the stores here. - if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) { - unsigned j = 0; - // Offset is aligned; skip 1st 12 params which go in V registers. 
- ArgOffset = ((ArgOffset+15)/16)*16; - ArgOffset += 12*16; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = OutVals[i]; - EVT ArgType = Outs[i].VT; - if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 || - ArgType==MVT::v8i16 || ArgType==MVT::v16i8) { - if (++j > NumVRs) { - SDValue PtrOff; - // We are emitting Altivec params in order. - LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset, - isPPC64, CFlags.IsTailCall, true, MemOpChains, - TailCallArguments, dl); - ArgOffset += 16; - } - } - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); - - // On Darwin, R12 must contain the address of an indirect callee. This does - // not mean the MTCTR instruction must use R12; it's easier to model this as - // an extra parameter, so do that. - if (CFlags.IsIndirect) { - assert(!CFlags.IsTailCall && "Indirect tail-calls not supported."); - RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 : - PPC::R12), Callee)); - } - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into the appropriate regs. - SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - if (CFlags.IsTailCall) - PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp, - TailCallArguments); - - return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, - Callee, SPDiff, NumBytes, Ins, InVals, CB); -} - static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -7053,9 +6267,10 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, const Align PtrAlign = IsPPC64 ? Align(8) : Align(4); const MVT RegVT = IsPPC64 ? 
MVT::i64 : MVT::i32; - assert((!ValVT.isInteger() || - (ValVT.getSizeInBits() <= RegVT.getSizeInBits())) && - "Integer argument exceeds register size: should have been legalized"); + if (ValVT.isVector() && !State.getMachineFunction() + .getTarget() + .Options.EnableAIXExtendedAltivecABI) + report_fatal_error("the default Altivec AIX ABI is not yet supported"); if (ValVT == MVT::f128) report_fatal_error("f128 is unimplemented on AIX."); @@ -7063,9 +6278,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, if (ArgFlags.isNest()) report_fatal_error("Nest arguments are unimplemented."); - if (ValVT.isVector() || LocVT.isVector()) - report_fatal_error("Vector arguments are unimplemented on AIX."); - static const MCPhysReg GPR_32[] = {// 32-bit registers. PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10}; @@ -7073,6 +6285,11 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, PPC::X3, PPC::X4, PPC::X5, PPC::X6, PPC::X7, PPC::X8, PPC::X9, PPC::X10}; + static const MCPhysReg VR[] = {// Vector registers. + PPC::V2, PPC::V3, PPC::V4, PPC::V5, + PPC::V6, PPC::V7, PPC::V8, PPC::V9, + PPC::V10, PPC::V11, PPC::V12, PPC::V13}; + if (ArgFlags.isByVal()) { if (ArgFlags.getNonZeroByValAlign() > PtrAlign) report_fatal_error("Pass-by-value arguments with alignment greater than " @@ -7117,7 +6334,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i32: { const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign); // AIX integer arguments are always passed in register width. - if (ValVT.getSizeInBits() < RegVT.getSizeInBits()) + if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits()) LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt : CCValAssign::LocInfo::ZExt; if (unsigned Reg = State.AllocateReg(IsPPC64 ? 
GPR_64 : GPR_32)) @@ -7168,6 +6385,25 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, return false; } + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2i64: + case MVT::v2f64: + case MVT::v1i128: { + if (State.isVarArg()) + report_fatal_error( + "variadic arguments for vector types are unimplemented for AIX"); + + if (unsigned VReg = State.AllocateReg(VR)) + State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo)); + else { + report_fatal_error( + "passing vector parameters to the stack is unimplemented for AIX"); + } + return false; + } } return true; } @@ -7188,6 +6424,14 @@ static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT, return &PPC::F4RCRegClass; case MVT::f64: return &PPC::F8RCRegClass; + case MVT::v4f32: + case MVT::v4i32: + case MVT::v8i16: + case MVT::v16i8: + case MVT::v2i64: + case MVT::v2f64: + case MVT::v1i128: + return &PPC::VRRCRegClass; } } @@ -7195,7 +6439,7 @@ static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT, SelectionDAG &DAG, SDValue ArgValue, MVT LocVT, const SDLoc &dl) { assert(ValVT.isScalarInteger() && LocVT.isScalarInteger()); - assert(ValVT.getSizeInBits() < LocVT.getSizeInBits()); + assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits()); if (Flags.isSExt()) ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue, @@ -7282,8 +6526,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX support is not supported on AIX."); const bool IsPPC64 = Subtarget.isPPC64(); const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; @@ -7292,6 +6534,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( SmallVector<CCValAssign, 16> ArgLocs; MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); const EVT PtrVT = getPointerTy(MF.getDataLayout()); @@ -7306,6 +6549,9 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( CCValAssign &VA = ArgLocs[I++]; MVT LocVT = VA.getLocVT(); ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags; + if (VA.isMemLoc() && VA.getValVT().isVector()) + report_fatal_error( + "passing vector parameters to the stack is unimplemented for AIX"); // For compatibility with the AIX XL compiler, the float args in the // parameter save area are initialized even if the argument is available @@ -7316,6 +6562,15 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( if (VA.isMemLoc() && VA.needsCustom()) continue; + if (VA.isRegLoc()) { + if (VA.getValVT().isScalarInteger()) + FuncInfo->appendParameterType(PPCFunctionInfo::FixedType); + else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) + FuncInfo->appendParameterType(VA.getValVT().SimpleTy == MVT::f32 + ? PPCFunctionInfo::ShortFloatPoint + : PPCFunctionInfo::LongFloatPoint); + } + if (Flags.isByVal() && VA.isMemLoc()) { const unsigned Size = alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize, @@ -7361,10 +6616,10 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( // to extracting the value from the register directly, and elide the // stores when the arguments address is not taken, but that will need to // be future work. 
- SDValue Store = - DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom, - DAG.getObjectPtrOffset(dl, FIN, Offset), - MachinePointerInfo::getFixedStack(MF, FI, Offset)); + SDValue Store = DAG.getStore( + CopyFrom.getValue(1), dl, CopyFrom, + DAG.getObjectPtrOffset(dl, FIN, TypeSize::Fixed(Offset)), + MachinePointerInfo::getFixedStack(MF, FI, Offset)); MemOps.push_back(Store); }; @@ -7379,6 +6634,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( const CCValAssign RL = ArgLocs[I++]; HandleRegLoc(RL.getLocReg(), Offset); + FuncInfo->appendParameterType(PPCFunctionInfo::FixedType); } if (Offset != StackSize) { @@ -7400,7 +6656,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64)); SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT); if (ValVT.isScalarInteger() && - (ValVT.getSizeInBits() < LocVT.getSizeInBits())) { + (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) { ArgValue = truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl); } @@ -7441,7 +6697,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX( // aligned stack. CallerReservedArea = EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea); - PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>(); FuncInfo->setMinReservedArea(CallerReservedArea); if (isVarArg) { @@ -7503,10 +6758,6 @@ SDValue PPCTargetLowering::LowerCall_AIX( const PPCSubtarget& Subtarget = static_cast<const PPCSubtarget&>(DAG.getSubtarget()); - if (Subtarget.hasQPX()) - report_fatal_error("QPX is not supported on AIX."); - if (Subtarget.hasAltivec()) - report_fatal_error("Altivec support is unimplemented on AIX."); MachineFunction &MF = DAG.getMachineFunction(); SmallVector<CCValAssign, 16> ArgLocs; @@ -7563,11 +6814,12 @@ SDValue PPCTargetLowering::LowerCall_AIX( } auto GetLoad = [&](EVT VT, unsigned LoadOffset) { - return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, - (LoadOffset != 0) - ? 
DAG.getObjectPtrOffset(dl, Arg, LoadOffset) - : Arg, - MachinePointerInfo(), VT); + return DAG.getExtLoad( + ISD::ZEXTLOAD, dl, PtrVT, Chain, + (LoadOffset != 0) + ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset)) + : Arg, + MachinePointerInfo(), VT); }; unsigned LoadOffset = 0; @@ -7597,9 +6849,11 @@ SDValue PPCTargetLowering::LowerCall_AIX( // Only memcpy the bytes that don't pass in register. MemcpyFlags.setByValSize(ByValSize - LoadOffset); Chain = CallSeqStart = createMemcpyOutsideCallSeq( - (LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset) - : Arg, - DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()), + (LoadOffset != 0) + ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset)) + : Arg, + DAG.getObjectPtrOffset(dl, StackPtr, + TypeSize::Fixed(ByValVA.getLocMemOffset())), CallSeqStart, MemcpyFlags, DAG, dl); continue; } @@ -7649,6 +6903,10 @@ SDValue PPCTargetLowering::LowerCall_AIX( const MVT LocVT = VA.getLocVT(); const MVT ValVT = VA.getValVT(); + if (VA.isMemLoc() && VA.getValVT().isVector()) + report_fatal_error( + "passing vector parameters to the stack is unimplemented for AIX"); + switch (VA.getLocInfo()) { default: report_fatal_error("Unexpected argument extension type."); @@ -7690,7 +6948,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( // f32 in 32-bit GPR // f64 in 64-bit GPR RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt)); - else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits()) + else if (Arg.getValueType().getFixedSizeInBits() < + LocVT.getFixedSizeInBits()) // f32 in 64-bit GPR. 
RegsToPass.push_back(std::make_pair( VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT))); @@ -8049,20 +7308,45 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, // <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to // <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1> - assert(Op.getValueType().isVector() && "Vector type expected."); - - SDLoc DL(Op); - SDValue N1 = Op.getOperand(0); - unsigned SrcSize = N1.getValueType().getSizeInBits(); - assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector"); - SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL); - EVT TrgVT = Op.getValueType(); + assert(TrgVT.isVector() && "Vector type expected."); unsigned TrgNumElts = TrgVT.getVectorNumElements(); EVT EltVT = TrgVT.getVectorElementType(); + if (!isOperationCustom(Op.getOpcode(), TrgVT) || + TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) || + !isPowerOf2_32(EltVT.getSizeInBits())) + return SDValue(); + + SDValue N1 = Op.getOperand(0); + EVT SrcVT = N1.getValueType(); + unsigned SrcSize = SrcVT.getSizeInBits(); + if (SrcSize > 256 || + !isPowerOf2_32(SrcVT.getVectorNumElements()) || + !isPowerOf2_32(SrcVT.getVectorElementType().getSizeInBits())) + return SDValue(); + if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2) + return SDValue(); + unsigned WideNumElts = 128 / EltVT.getSizeInBits(); EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts); + SDLoc DL(Op); + SDValue Op1, Op2; + if (SrcSize == 256) { + EVT VecIdxTy = getVectorIdxTy(DAG.getDataLayout()); + EVT SplitVT = + N1.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned SplitNumElts = SplitVT.getVectorNumElements(); + Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1, + DAG.getConstant(0, DL, VecIdxTy)); + Op2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1, + DAG.getConstant(SplitNumElts, DL, VecIdxTy)); + } + else { + Op1 = SrcSize == 128 ? 
N1 : widenVec(DAG, N1, DL); + Op2 = DAG.getUNDEF(WideVT); + } + // First list the elements we want to keep. unsigned SizeMult = SrcSize / TrgVT.getSizeInBits(); SmallVector<int, 16> ShuffV; @@ -8078,16 +7362,17 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, // ShuffV.push_back(i + WideNumElts); ShuffV.push_back(WideNumElts + 1); - SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc); - return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV); + Op1 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op1); + Op2 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op2); + return DAG.getVectorShuffle(WideVT, DL, Op1, Op2, ShuffV); } /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when /// possible. SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { - // Not FP? Not a fsel. + // Not FP, or using SPE? Not a fsel. if (!Op.getOperand(0).getValueType().isFloatingPoint() || - !Op.getOperand(2).getValueType().isFloatingPoint()) + !Op.getOperand(2).getValueType().isFloatingPoint() || Subtarget.hasSPE()) return Op; ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); @@ -8203,54 +7488,105 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { return Op; } -void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, - SelectionDAG &DAG, - const SDLoc &dl) const { - assert(Op.getOperand(0).getValueType().isFloatingPoint()); - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::f32) - Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); - - SDValue Tmp; +static unsigned getPPCStrictOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("No strict version of this opcode!"); + case PPCISD::FCTIDZ: + return PPCISD::STRICT_FCTIDZ; + case PPCISD::FCTIWZ: + return PPCISD::STRICT_FCTIWZ; + case PPCISD::FCTIDUZ: + return PPCISD::STRICT_FCTIDUZ; + case PPCISD::FCTIWUZ: + return PPCISD::STRICT_FCTIWUZ; + case PPCISD::FCFID: + 
return PPCISD::STRICT_FCFID; + case PPCISD::FCFIDU: + return PPCISD::STRICT_FCFIDU; + case PPCISD::FCFIDS: + return PPCISD::STRICT_FCFIDS; + case PPCISD::FCFIDUS: + return PPCISD::STRICT_FCFIDUS; + } +} + +static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + SDLoc dl(Op); + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT; + + // TODO: Any other flags to propagate? + SDNodeFlags Flags; + Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); + + // For strict nodes, source is the second operand. + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + assert(Src.getValueType().isFloatingPoint()); + if (Src.getValueType() == MVT::f32) { + if (IsStrict) { + Src = + DAG.getNode(ISD::STRICT_FP_EXTEND, dl, + DAG.getVTList(MVT::f64, MVT::Other), {Chain, Src}, Flags); + Chain = Src.getValue(1); + } else + Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); + } + SDValue Conv; + unsigned Opc = ISD::DELETED_NODE; switch (Op.getSimpleValueType().SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: - Tmp = DAG.getNode( - Op.getOpcode() == ISD::FP_TO_SINT - ? PPCISD::FCTIWZ - : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), - dl, MVT::f64, Src); + Opc = IsSigned ? PPCISD::FCTIWZ + : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ); break; case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && + assert((IsSigned || Subtarget.hasFPCVT()) && "i64 FP_TO_UINT is supported only with FPCVT"); - Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : - PPCISD::FCTIDUZ, - dl, MVT::f64, Src); - break; + Opc = IsSigned ? 
PPCISD::FCTIDZ : PPCISD::FCTIDUZ; } + if (IsStrict) { + Opc = getPPCStrictOpcode(Opc); + Conv = DAG.getNode(Opc, dl, DAG.getVTList(MVT::f64, MVT::Other), + {Chain, Src}, Flags); + } else { + Conv = DAG.getNode(Opc, dl, MVT::f64, Src); + } + return Conv; +} + +void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, + SelectionDAG &DAG, + const SDLoc &dl) const { + SDValue Tmp = convertFPToInt(Op, DAG, Subtarget); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT; + bool IsStrict = Op->isStrictFPOpcode(); // Convert the FP value to an int value through memory. bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() && - (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); + (IsSigned || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. - SDValue Chain; + SDValue Chain = IsStrict ? Tmp.getValue(1) : DAG.getEntryNode(); Align Alignment(DAG.getEVTAlign(Tmp.getValueType())); if (i32Stack) { MachineFunction &MF = DAG.getMachineFunction(); Alignment = Align(4); MachineMemOperand *MMO = MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment); - SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr }; + SDValue Ops[] = { Chain, Tmp, FIPtr }; Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl, DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO); } else - Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment); + Chain = DAG.getStore(Chain, dl, Tmp, FIPtr, MPI, Alignment); // Result is a load from the stack slot. If loading 4 bytes, make sure to // add in a bias on big endian. 
@@ -8272,76 +7608,100 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { - assert(Op.getOperand(0).getValueType().isFloatingPoint()); - SDValue Src = Op.getOperand(0); - - if (Src.getValueType() == MVT::f32) - Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); - - SDValue Tmp; - switch (Op.getSimpleValueType().SimpleTy) { - default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); - case MVT::i32: - Tmp = DAG.getNode( - Op.getOpcode() == ISD::FP_TO_SINT - ? PPCISD::FCTIWZ - : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ), - dl, MVT::f64, Src); - Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i32, Tmp); - break; - case MVT::i64: - assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) && - "i64 FP_TO_UINT is supported only with FPCVT"); - Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ : - PPCISD::FCTIDUZ, - dl, MVT::f64, Src); - Tmp = DAG.getNode(PPCISD::MFVSR, dl, MVT::i64, Tmp); - break; - } - return Tmp; + SDValue Conv = convertFPToInt(Op, DAG, Subtarget); + SDValue Mov = DAG.getNode(PPCISD::MFVSR, dl, Op.getValueType(), Conv); + if (Op->isStrictFPOpcode()) + return DAG.getMergeValues({Mov, Conv.getValue(1)}, dl); + else + return Mov; } SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { + bool IsStrict = Op->isStrictFPOpcode(); + bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || + Op.getOpcode() == ISD::STRICT_FP_TO_SINT; + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Op.getValueType(); // FP to INT conversions are legal for f128. - if (Op->getOperand(0).getValueType() == MVT::f128) - return Op; + if (SrcVT == MVT::f128) + return Subtarget.hasP9Vector() ? 
Op : SDValue(); // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). - if (Op.getOperand(0).getValueType() == MVT::ppcf128) { - if (Op.getValueType() == MVT::i32) { - if (Op.getOpcode() == ISD::FP_TO_SINT) { - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - MVT::f64, Op.getOperand(0), + if (SrcVT == MVT::ppcf128) { + if (DstVT == MVT::i32) { + // TODO: Conservatively pass only nofpexcept flag here. Need to check and + // set other fast-math flags to FP operations in both strict and + // non-strict cases. (FP_TO_SINT, FSUB) + SDNodeFlags Flags; + Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); + + if (IsSigned) { + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, DAG.getIntPtrConstant(0, dl)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, - MVT::f64, Op.getOperand(0), + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, DAG.getIntPtrConstant(1, dl)); - // Add the two halves of the long double in round-to-zero mode. - SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); - - // Now use a smaller FP_TO_SINT. - return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); - } - if (Op.getOpcode() == ISD::FP_TO_UINT) { + // Add the two halves of the long double in round-to-zero mode, and use + // a smaller FP_TO_SINT. + if (IsStrict) { + SDValue Res = DAG.getNode(PPCISD::STRICT_FADDRTZ, dl, + DAG.getVTList(MVT::f64, MVT::Other), + {Op.getOperand(0), Lo, Hi}, Flags); + return DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, + DAG.getVTList(MVT::i32, MVT::Other), + {Res.getValue(1), Res}, Flags); + } else { + SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi); + return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res); + } + } else { const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31)); - SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128); - // X>=2^31 ? 
(int)(X-2^31)+0x80000000 : (int)X - // FIXME: generated code sucks. - // TODO: Are there fast-math-flags to propagate to this FSUB? - SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, - Op.getOperand(0), Tmp); - True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); - True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, - DAG.getConstant(0x80000000, dl, MVT::i32)); - SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, - Op.getOperand(0)); - return DAG.getSelectCC(dl, Op.getOperand(0), Tmp, True, False, - ISD::SETGE); + SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT); + SDValue SignMask = DAG.getConstant(0x80000000, dl, DstVT); + if (IsStrict) { + // Sel = Src < 0x80000000 + // FltOfs = select Sel, 0.0, 0x80000000 + // IntOfs = select Sel, 0, 0x80000000 + // Result = fp_to_sint(Src - FltOfs) ^ IntOfs + SDValue Chain = Op.getOperand(0); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); + EVT DstSetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT); + SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT, + Chain, true); + Chain = Sel.getValue(1); + + SDValue FltOfs = DAG.getSelect( + dl, SrcVT, Sel, DAG.getConstantFP(0.0, dl, SrcVT), Cst); + Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT); + + SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl, + DAG.getVTList(SrcVT, MVT::Other), + {Chain, Src, FltOfs}, Flags); + Chain = Val.getValue(1); + SDValue SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, + DAG.getVTList(DstVT, MVT::Other), + {Chain, Val}, Flags); + Chain = SInt.getValue(1); + SDValue IntOfs = DAG.getSelect( + dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), SignMask); + SDValue Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs); + return DAG.getMergeValues({Result, Chain}, dl); + } else { + // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X + // FIXME: generated code sucks. 
+ SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Cst); + True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True); + True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, SignMask); + SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); + return DAG.getSelectCC(dl, Src, Cst, True, False, ISD::SETGE); + } } } @@ -8370,6 +7730,10 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET) const { + // Conservatively skip reusing for constrained FP nodes. + if (Op->isStrictFPOpcode()) + return false; + SDLoc dl(Op); bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT && (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32); @@ -8389,6 +7753,13 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT, if (LD->getMemoryVT() != MemVT) return false; + // If the result of the load is an illegal type, then we can't build a + // valid chain for reuse since the legalised loads and token factor node that + // ties the legalised loads together uses a different output chain then the + // illegal load. + if (!isTypeLegal(LD->getValueType(0))) + return false; + RLI.Ptr = LD->getBasePtr(); if (LD->isIndexed() && !LD->getOffset().isUndef()) { assert(LD->getAddressingMode() == ISD::PRE_INC && @@ -8453,13 +7824,41 @@ bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { continue; if (UI->getOpcode() != ISD::SINT_TO_FP && - UI->getOpcode() != ISD::UINT_TO_FP) + UI->getOpcode() != ISD::UINT_TO_FP && + UI->getOpcode() != ISD::STRICT_SINT_TO_FP && + UI->getOpcode() != ISD::STRICT_UINT_TO_FP) return true; } return false; } +static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG, + const PPCSubtarget &Subtarget, + SDValue Chain = SDValue()) { + bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP; + SDLoc dl(Op); + + // TODO: Any other flags to propagate? 
+ SDNodeFlags Flags; + Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); + + // If we have FCFIDS, then use it when converting to single-precision. + // Otherwise, convert to double-precision and then round. + bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT(); + unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS) + : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU); + EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64; + if (Op->isStrictFPOpcode()) { + if (!Chain) + Chain = Op.getOperand(0); + return DAG.getNode(getPPCStrictOpcode(ConvOpc), dl, + DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, Flags); + } else + return DAG.getNode(ConvOpc, dl, ConvTy, Src); +} + /// Custom lowers integer to floating point conversions to use /// the direct move instructions available in ISA 2.07 to avoid the /// need for load/store combinations. @@ -8471,25 +7870,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op, "Invalid floating point type as target of conversion"); assert(Subtarget.hasFPCVT() && "Int to FP conversions with direct moves require FPCVT"); - SDValue FP; - SDValue Src = Op.getOperand(0); - bool SinglePrec = Op.getValueType() == MVT::f32; + SDValue Src = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0); bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32; - bool Signed = Op.getOpcode() == ISD::SINT_TO_FP; - unsigned ConvOp = Signed ? (SinglePrec ? PPCISD::FCFIDS : PPCISD::FCFID) : - (SinglePrec ? PPCISD::FCFIDUS : PPCISD::FCFIDU); - - if (WordInt) { - FP = DAG.getNode(Signed ? PPCISD::MTVSRA : PPCISD::MTVSRZ, - dl, MVT::f64, Src); - FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); - } - else { - FP = DAG.getNode(PPCISD::MTVSRA, dl, MVT::f64, Src); - FP = DAG.getNode(ConvOp, dl, SinglePrec ? MVT::f32 : MVT::f64, FP); - } - - return FP; + bool Signed = Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP; + unsigned MovOpc = (WordInt && !Signed) ? 
PPCISD::MTVSRZ : PPCISD::MTVSRA; + SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src); + return convertIntToFP(Op, Mov, DAG, Subtarget); } static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { @@ -8514,17 +7901,23 @@ static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) { SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const { - + bool IsStrict = Op->isStrictFPOpcode(); unsigned Opc = Op.getOpcode(); - assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) && + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP || + Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) && "Unexpected conversion type"); assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) && "Supports conversions to v2f64/v4f32 only."); - bool SignedConv = Opc == ISD::SINT_TO_FP; + // TODO: Any other flags to propagate? + SDNodeFlags Flags; + Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); + + bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; bool FourEltRes = Op.getValueType() == MVT::v4f32; - SDValue Wide = widenVec(DAG, Op.getOperand(0), dl); + SDValue Wide = widenVec(DAG, Src, dl); EVT WideVT = Wide.getValueType(); unsigned WideNumElts = WideVT.getVectorNumElements(); MVT IntermediateVT = FourEltRes ? 
MVT::v4i32 : MVT::v2i64; @@ -8549,7 +7942,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, SDValue Extend; if (SignedConv) { Arrange = DAG.getBitcast(IntermediateVT, Arrange); - EVT ExtVT = Op.getOperand(0).getValueType(); + EVT ExtVT = Src.getValueType(); if (Subtarget.hasP9Altivec()) ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(), IntermediateVT.getVectorNumElements()); @@ -8559,14 +7952,27 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, } else Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange); + if (IsStrict) + return DAG.getNode(Opc, dl, DAG.getVTList(Op.getValueType(), MVT::Other), + {Op.getOperand(0), Extend}, Flags); + return DAG.getNode(Opc, dl, Op.getValueType(), Extend); } SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP || + Op.getOpcode() == ISD::STRICT_SINT_TO_FP; + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); + + // TODO: Any other flags to propagate? + SDNodeFlags Flags; + Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); - EVT InVT = Op.getOperand(0).getValueType(); + EVT InVT = Src.getValueType(); EVT OutVT = Op.getValueType(); if (OutVT.isVector() && OutVT.isFloatingPoint() && isOperationCustom(Op.getOpcode(), InVT)) @@ -8574,37 +7980,21 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // Conversions to f128 are legal. if (Op.getValueType() == MVT::f128) - return Op; - - if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) { - if (Op.getValueType() != MVT::v4f32 && Op.getValueType() != MVT::v4f64) - return SDValue(); - - SDValue Value = Op.getOperand(0); - // The values are now known to be -1 (false) or 1 (true). 
To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - if (Op.getValueType() != MVT::v4f64) - Value = DAG.getNode(ISD::FP_ROUND, dl, - Op.getValueType(), Value, - DAG.getIntPtrConstant(1, dl)); - return Value; - } + return Subtarget.hasP9Vector() ? Op : SDValue(); // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); - if (Op.getOperand(0).getValueType() == MVT::i1) - return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0), - DAG.getConstantFP(1.0, dl, Op.getValueType()), - DAG.getConstantFP(0.0, dl, Op.getValueType())); + if (Src.getValueType() == MVT::i1) { + SDValue Sel = DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Src, + DAG.getConstantFP(1.0, dl, Op.getValueType()), + DAG.getConstantFP(0.0, dl, Op.getValueType())); + if (IsStrict) + return DAG.getMergeValues({Sel, Chain}, dl); + else + return Sel; + } // If we have direct moves, we can do all the conversion, skip the store/load // however, without FPCVT we can't do most conversions. @@ -8612,22 +8002,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Subtarget.isPPC64() && Subtarget.hasFPCVT()) return LowerINT_TO_FPDirectMove(Op, DAG, dl); - assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) && + assert((IsSigned || Subtarget.hasFPCVT()) && "UINT_TO_FP is supported only with FPCVT"); - // If we have FCFIDS, then use it when converting to single-precision. - // Otherwise, convert to double-precision and then round. - unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) - ? (Op.getOpcode() == ISD::UINT_TO_FP ? 
PPCISD::FCFIDUS - : PPCISD::FCFIDS) - : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU - : PPCISD::FCFID); - MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) - ? MVT::f32 - : MVT::f64; - - if (Op.getOperand(0).getValueType() == MVT::i64) { - SDValue SINT = Op.getOperand(0); + if (Src.getValueType() == MVT::i64) { + SDValue SINT = Src; // When converting to single-precision, we actually need to convert // to double-precision first and then round to single-precision. // To avoid double-rounding effects during that operation, we have @@ -8715,16 +8094,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = MFI.CreateStackObject(4, Align(4), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FrameIdx)); + SDValue Store = DAG.getStore(Chain, dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FrameIdx)); + Chain = Store; assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; - RLI.Chain = Store; + RLI.Chain = Chain; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = Align(4); @@ -8737,18 +8116,27 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, PPCISD::LFIWZX : PPCISD::LFIWAX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i32, MMO); + Chain = Bits.getValue(1); } else Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT); - SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits); + SDValue FP = convertIntToFP(Op, Bits, DAG, Subtarget, Chain); + if (IsStrict) + Chain = FP.getValue(1); - if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) - FP = DAG.getNode(ISD::FP_ROUND, dl, - MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { + if (IsStrict) + FP = 
DAG.getNode(ISD::STRICT_FP_ROUND, dl, + DAG.getVTList(MVT::f32, MVT::Other), + {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags); + else + FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, + DAG.getIntPtrConstant(0, dl)); + } return FP; } - assert(Op.getOperand(0).getValueType() == MVT::i32 && + assert(Src.getValueType() == MVT::i32 && "Unhandled INT_TO_FP type in custom expander!"); // Since we only generate this in 64-bit mode, we can take advantage of // 64-bit registers. In particular, sign extend the input value into the @@ -8762,21 +8150,20 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) { ReuseLoadInfo RLI; bool ReusingLoad; - if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI, - DAG))) { + if (!(ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG))) { int FrameIdx = MFI.CreateStackObject(4, Align(4), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FrameIdx)); + SDValue Store = DAG.getStore(Chain, dl, Src, FIdx, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), FrameIdx)); + Chain = Store; assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; - RLI.Chain = Store; + RLI.Chain = Chain; RLI.MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = Align(4); @@ -8786,10 +8173,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4, RLI.Alignment, RLI.AAInfo, RLI.Ranges); SDValue Ops[] = { RLI.Chain, RLI.Ptr }; - Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ? - PPCISD::LFIWZX : PPCISD::LFIWAX, - dl, DAG.getVTList(MVT::f64, MVT::Other), - Ops, MVT::i32, MMO); + Ld = DAG.getMemIntrinsicNode(IsSigned ? 
PPCISD::LFIWAX : PPCISD::LFIWZX, dl, + DAG.getVTList(MVT::f64, MVT::Other), Ops, + MVT::i32, MMO); + Chain = Ld.getValue(1); if (ReusingLoad) spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG); } else { @@ -8799,25 +8186,34 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = MFI.CreateStackObject(8, Align(8), false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, - Op.getOperand(0)); + SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src); // STD the extended value into the stack slot. SDValue Store = DAG.getStore( - DAG.getEntryNode(), dl, Ext64, FIdx, + Chain, dl, Ext64, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); + Chain = Store; // Load the value as a double. Ld = DAG.getLoad( - MVT::f64, dl, Store, FIdx, + MVT::f64, dl, Chain, FIdx, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx)); + Chain = Ld.getValue(1); } // FCFID it and return it. 
- SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld); - if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) - FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, - DAG.getIntPtrConstant(0, dl)); + SDValue FP = convertIntToFP(Op, Ld, DAG, Subtarget, Chain); + if (IsStrict) + Chain = FP.getValue(1); + if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { + if (IsStrict) + FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl, + DAG.getVTList(MVT::f32, MVT::Other), + {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags); + else + FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, + DAG.getIntPtrConstant(0, dl)); + } return FP; } @@ -8852,16 +8248,24 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain); Chain = MFFS.getValue(1); - // Save FP register to stack slot - int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false); - SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo()); + SDValue CWD; + if (isTypeLegal(MVT::i64)) { + CWD = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS)); + } else { + // Save FP register to stack slot + int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); + Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo()); - // Load FP Control Word from low 32 bits of stack slot. - SDValue Four = DAG.getConstant(4, dl, PtrVT); - SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); - SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo()); - Chain = CWD.getValue(1); + // Load FP Control Word from low 32 bits of stack slot. 
+ assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) && + "Stack slot adjustment is valid only on big endian subtargets!"); + SDValue Four = DAG.getConstant(4, dl, PtrVT); + SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four); + CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo()); + Chain = CWD.getValue(1); + } // Transform as necessary SDValue CWD1 = @@ -8972,6 +8376,31 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(OutOps, dl); } +SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + + bool IsFSHL = Op.getOpcode() == ISD::FSHL; + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + SDValue Z = Op.getOperand(2); + EVT AmtVT = Z.getValueType(); + + // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) + // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + // This is simpler than TargetLowering::expandFunnelShift because we can rely + // on PowerPC shift by BW being well defined. + Z = DAG.getNode(ISD::AND, dl, AmtVT, Z, + DAG.getConstant(BitWidth - 1, dl, AmtVT)); + SDValue SubZ = + DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z); + X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? Z : SubZ); + Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z); + return DAG.getNode(ISD::OR, dl, VT, X, Y); +} + //===----------------------------------------------------------------------===// // Vector related lowering. // @@ -8987,7 +8416,7 @@ static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT, EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1]; // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize. 
- if (Val == ((1LU << (SplatSize * 8)) - 1)) { + if (Val == ((1LLU << (SplatSize * 8)) - 1)) { SplatSize = 1; Val = 0xFF; } @@ -9111,13 +8540,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { Op0.getOperand(1)); } -static const SDValue *getNormalLoadInput(const SDValue &Op) { +static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; if (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || - InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) + InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { + IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED; InputLoad = &InputLoad->getOperand(0); + } if (InputLoad->getOpcode() != ISD::LOAD) return nullptr; LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); @@ -9163,110 +8594,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR"); - if (Subtarget.hasQPX() && Op.getValueType() == MVT::v4i1) { - // We first build an i32 vector, load it into a QPX register, - // then convert it to a floating-point vector and compare it - // to a zero vector to get the boolean result. 
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - assert(BVN->getNumOperands() == 4 && - "BUILD_VECTOR for v4i1 does not have 4 operands"); - - bool IsConst = true; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - if (!isa<ConstantSDNode>(BVN->getOperand(i))) { - IsConst = false; - break; - } - } - - if (IsConst) { - Constant *One = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), 1.0); - Constant *NegOne = - ConstantFP::get(Type::getFloatTy(*DAG.getContext()), -1.0); - - Constant *CV[4]; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) - CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (isNullConstant(BVN->getOperand(i))) - CV[i] = NegOne; - else - CV[i] = One; - } - - Constant *CP = ConstantVector::get(CV); - SDValue CPIdx = - DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16)); - - SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other}); - return DAG.getMemIntrinsicNode( - PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); - } - - SmallVector<SDValue, 4> Stores; - for (unsigned i = 0; i < 4; ++i) { - if (BVN->getOperand(i).isUndef()) continue; - - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - unsigned StoreSize = BVN->getOperand(i).getValueType().getStoreSize(); - if (StoreSize > 4) { - Stores.push_back( - DAG.getTruncStore(DAG.getEntryNode(), dl, BVN->getOperand(i), Idx, - PtrInfo.getWithOffset(Offset), MVT::i32)); - } else { - SDValue StoreValue = 
BVN->getOperand(i); - if (StoreSize < 4) - StoreValue = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, StoreValue); - - Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, StoreValue, Idx, - PtrInfo.getWithOffset(Offset))); - } - } - - SDValue StoreChain; - if (!Stores.empty()) - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - else - StoreChain = DAG.getEntryNode(); - - // Now load from v4i32 into the QPX register; this will extend it to - // v4i64 but not yet convert it to a floating point. Nevertheless, this - // is typed as v4f64 because the QPX register integer states are not - // explicitly represented. - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvlfiwz, dl, MVT::i32), - FIdx}; - SDVTList VTs = DAG.getVTList({MVT::v4f64, /*chain*/ MVT::Other}); - - SDValue LoadedVect = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - LoadedVect = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfcfidu, dl, MVT::i32), - LoadedVect); - - SDValue FPZeros = DAG.getConstantFP(0.0, dl, MVT::v4f64); - - return DAG.getSetCC(dl, MVT::v4i1, LoadedVect, FPZeros, ISD::SETEQ); - } - - // All other QPX vectors are handled by generic code. - if (Subtarget.hasQPX()) - return SDValue(); - // Check if this is a splat of a constant value. APInt APSplatBits, APSplatUndef; unsigned SplatBitSize; @@ -9277,19 +8604,48 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // If it is a splat of a double, check if we can shrink it to a 32 bit // non-denormal float which when converted back to double gives us the same - // double. This is to exploit the XXSPLTIDP instruction. 
- if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() && - (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) && - convertToNonDenormSingle(APSplatBits)) { - SDValue SplatNode = DAG.getNode( - PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64, - DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32)); - return DAG.getBitcast(Op.getValueType(), SplatNode); + // double. This is to exploit the XXSPLTIDP instruction.+ // If we lose precision, we use XXSPLTI32DX. + if (BVNIsConstantSplat && (SplatBitSize == 64) && + Subtarget.hasPrefixInstrs()) { + if (convertToNonDenormSingle(APSplatBits) && + (Op->getValueType(0) == MVT::v2f64)) { + SDValue SplatNode = DAG.getNode( + PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64, + DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32)); + return DAG.getBitcast(Op.getValueType(), SplatNode); + } else { // We may lose precision, so we have to use XXSPLTI32DX. + + uint32_t Hi = + (uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32); + uint32_t Lo = + (uint32_t)(APSplatBits.getZExtValue() & 0xFFFFFFFF); + SDValue SplatNode = DAG.getUNDEF(MVT::v2i64); + + if (!Hi || !Lo) + // If either load is 0, then we should generate XXLXOR to set to 0. 
+ SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64); + + if (Hi) + SplatNode = DAG.getNode( + PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode, + DAG.getTargetConstant(0, dl, MVT::i32), + DAG.getTargetConstant(Hi, dl, MVT::i32)); + + if (Lo) + SplatNode = + DAG.getNode(PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode, + DAG.getTargetConstant(1, dl, MVT::i32), + DAG.getTargetConstant(Lo, dl, MVT::i32)); + + return DAG.getBitcast(Op.getValueType(), SplatNode); + } } if (!BVNIsConstantSplat || SplatBitSize > 32) { - const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0)); + bool IsPermutedLoad = false; + const SDValue *InputLoad = + getNormalLoadInput(Op.getOperand(0), IsPermutedLoad); // Handle load-and-splat patterns as we have instructions that will do this // in one go. if (InputLoad && DAG.isSplatValue(Op, true)) { @@ -9301,7 +8657,12 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Checking for a single use of this load, we have to check for vector // width (128 bits) / ElementSize uses (since each operand of the // BUILD_VECTOR is a separate use of the value. 
- if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) && + unsigned NumUsesOfInputLD = 128 / ElementSize; + for (SDValue BVInOp : Op->ops()) + if (BVInOp.isUndef()) + NumUsesOfInputLD--; + assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?"); + if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) && ((Subtarget.hasVSX() && ElementSize == 64) || (Subtarget.hasP9Vector() && ElementSize == 32))) { SDValue Ops[] = { @@ -9309,17 +8670,21 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, LD->getBasePtr(), // Ptr DAG.getValueType(Op.getValueType()) // VT }; - return - DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, - DAG.getVTList(Op.getValueType(), MVT::Other), - Ops, LD->getMemoryVT(), LD->getMemOperand()); + SDValue LdSplt = DAG.getMemIntrinsicNode( + PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other), + Ops, LD->getMemoryVT(), LD->getMemOperand()); + // Replace all uses of the output chain of the original load with the + // output chain of the new load. + DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), + LdSplt.getValue(1)); + return LdSplt; } } - // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be - // lowered to VSX instructions under certain conditions. + // In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to + // 32-bits can be lowered to VSX instructions under certain conditions. // Without VSX, there is no pattern more efficient than expanding the node. - if (Subtarget.hasVSX() && + if (Subtarget.hasVSX() && Subtarget.isPPC64() && haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(), Subtarget.hasP8Vector())) return Op; @@ -9348,7 +8713,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be // turned into a 4-byte splat of 0xABABABAB. 
if (Subtarget.hasPrefixInstrs() && SplatSize == 2) - return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2, + return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2, Op.getValueType(), DAG, dl); if (Subtarget.hasPrefixInstrs() && SplatSize == 4) @@ -9444,17 +8809,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); } - // vsplti + sra self. - if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) { - SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl); - static const unsigned IIDs[] = { // Intrinsic to use for each size. - Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0, - Intrinsic::ppc_altivec_vsraw - }; - Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); - } - // vsplti + rol self. if (SextVal == (int)(((unsigned)i << TypeShiftAmt) | ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) { @@ -9912,7 +9266,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If this is a load-and-splat, we can do that with a single instruction // in some cases. However if the load has multiple uses, we don't want to // combine it because that will just produce multiple loads. - const SDValue *InputLoad = getNormalLoadInput(V1); + bool IsPermutedLoad = false; + const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad); if (InputLoad && Subtarget.hasVSX() && V2.isUndef() && (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) && InputLoad->hasOneUse()) { @@ -9920,6 +9275,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG); + // The splat index for permuted loads will be in the left half of the vector + // which is strictly wider than the loaded value by 8 bytes. So we need to + // adjust the splat index to point to the correct address in memory. 
+ if (IsPermutedLoad) { + assert(isLittleEndian && "Unexpected permuted load on big endian target"); + SplatIdx += IsFourByte ? 2 : 1; + assert((SplatIdx < (IsFourByte ? 4 : 2)) && + "Splat of a value outside of the loaded memory"); + } + LoadSDNode *LD = cast<LoadSDNode>(*InputLoad); // For 4-byte load-and-splat, we need Power9. if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) { @@ -9929,10 +9294,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8; - // If we are loading a partial vector, it does not make sense to adjust - // the base pointer. This happens with (splat (s_to_v_permuted (ld))). - if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64)) - Offset = 0; SDValue BasePtr = LD->getBasePtr(); if (Offset != 0) BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), @@ -9947,6 +9308,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue LdSplt = DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL, Ops, LD->getMemoryVT(), LD->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), LdSplt.getValue(1)); if (LdSplt.getValueType() != SVOp->getValueType(0)) LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt); return LdSplt; @@ -10050,42 +9412,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } } - if (Subtarget.hasQPX()) { - if (VT.getVectorNumElements() != 4) - return SDValue(); - - if (V2.isUndef()) V2 = V1; - - int AlignIdx = PPC::isQVALIGNIShuffleMask(SVOp); - if (AlignIdx != -1) { - return DAG.getNode(PPCISD::QVALIGNI, dl, VT, V1, V2, - DAG.getConstant(AlignIdx, dl, MVT::i32)); - } else if (SVOp->isSplat()) { - int SplatIdx = SVOp->getSplatIndex(); - if (SplatIdx >= 4) { - std::swap(V1, V2); - SplatIdx -= 4; - } - - return DAG.getNode(PPCISD::QVESPLATI, dl, VT, V1, - DAG.getConstant(SplatIdx, dl, MVT::i32)); - } - - // Lower this into a qvgpci/qvfperm pair. 
- - // Compute the qvgpci literal - unsigned idx = 0; - for (unsigned i = 0; i < 4; ++i) { - int m = SVOp->getMaskElt(i); - unsigned mm = m >= 0 ? (unsigned) m : i; - idx |= mm << (3-i)*3; - } - - SDValue V3 = DAG.getNode(PPCISD::QVGPCI, dl, MVT::v4f64, - DAG.getConstant(idx, dl, MVT::i32)); - return DAG.getNode(PPCISD::QVFPERM, dl, VT, V1, V2, V3); - } - // Cases that are handled by instructions that take permute immediates // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be // selected by the instruction selector. @@ -10347,6 +9673,26 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, return false; break; + case Intrinsic::ppc_altivec_vcmpequq: + case Intrinsic::ppc_altivec_vcmpgtsq: + case Intrinsic::ppc_altivec_vcmpgtuq: + if (!Subtarget.isISA3_1()) + return false; + switch (IntrinsicID) { + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpequq: + CompareOpc = 455; + break; + case Intrinsic::ppc_altivec_vcmpgtsq: + CompareOpc = 903; + break; + case Intrinsic::ppc_altivec_vcmpgtuq: + CompareOpc = 647; + break; + } + break; + // VSX predicate comparisons use the same infrastructure case Intrinsic::ppc_vsx_xvcmpeqdp_p: case Intrinsic::ppc_vsx_xvcmpgedp_p: @@ -10470,6 +9816,26 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, else return false; break; + case Intrinsic::ppc_altivec_vcmpequq_p: + case Intrinsic::ppc_altivec_vcmpgtsq_p: + case Intrinsic::ppc_altivec_vcmpgtuq_p: + if (!Subtarget.isISA3_1()) + return false; + switch (IntrinsicID) { + default: + llvm_unreachable("Unknown comparison intrinsic."); + case Intrinsic::ppc_altivec_vcmpequq_p: + CompareOpc = 455; + break; + case Intrinsic::ppc_altivec_vcmpgtsq_p: + CompareOpc = 903; + break; + case Intrinsic::ppc_altivec_vcmpgtuq_p: + CompareOpc = 647; + break; + } + isDot = true; + break; } return true; } @@ -10483,11 +9849,32 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); 
- if (IntrinsicID == Intrinsic::thread_pointer) { + switch (IntrinsicID) { + case Intrinsic::thread_pointer: // Reads the thread pointer register, used for __builtin_thread_pointer. if (Subtarget.isPPC64()) return DAG.getRegister(PPC::X13, MVT::i64); return DAG.getRegister(PPC::R2, MVT::i32); + + case Intrinsic::ppc_mma_disassemble_acc: + case Intrinsic::ppc_vsx_disassemble_pair: { + int NumVecs = 2; + SDValue WideVec = Op.getOperand(1); + if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) { + NumVecs = 4; + WideVec = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, WideVec); + } + SmallVector<SDValue, 4> RetOps; + for (int VecNo = 0; VecNo < NumVecs; VecNo++) { + SDValue Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec, + DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo + : VecNo, + dl, MVT::i64)); + RetOps.push_back(Extract); + } + return DAG.getMergeValues(RetOps, dl); + } } // If this is a lowered altivec predicate compare, CompareOpc is set to the @@ -10512,7 +9899,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue }; - SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops); // Now that we have the comparison, emit a copy from the CR to a GPR. // This is flagged to the above dot comparison. @@ -10673,154 +10060,51 @@ SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return Op; } -SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - SDNode *N = Op.getNode(); - - assert(N->getOperand(0).getValueType() == MVT::v4i1 && - "Unknown extract_vector_elt type"); - - SDValue Value = N->getOperand(0); - - // The first part of this is like the store lowering except that we don't - // need to track the chain. - - // The values are now known to be -1 (false) or 1 (true). 
To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). - // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue StoreChain = DAG.getEntryNode(); - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Extract the value requested. 
- unsigned Offset = 4*cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - SDValue IntVal = - DAG.getLoad(MVT::i32, dl, StoreChain, Idx, PtrInfo.getWithOffset(Offset)); - - if (!Subtarget.useCRBits()) - return IntVal; - - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, IntVal); -} - -/// Lowering for QPX v4i1 loads SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); LoadSDNode *LN = cast<LoadSDNode>(Op.getNode()); SDValue LoadChain = LN->getChain(); SDValue BasePtr = LN->getBasePtr(); + EVT VT = Op.getValueType(); - if (Op.getValueType() == MVT::v4f64 || - Op.getValueType() == MVT::v4f32) { - EVT MemVT = LN->getMemoryVT(); - unsigned Alignment = LN->getAlignment(); - - // If this load is properly aligned, then it is legal. - if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Op.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Vals[4], LoadChains[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Load; - if (ScalarVT != ScalarMemVT) - Load = DAG.getExtLoad(LN->getExtensionType(), dl, ScalarVT, LoadChain, - BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - else - Load = DAG.getLoad(ScalarVT, dl, LoadChain, BasePtr, - LN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - LN->getMemOperand()->getFlags(), LN->getAAInfo()); - - if (Idx == 0 && LN->isIndexed()) { - assert(LN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector load"); - Load = DAG.getIndexedLoad(Load, dl, BasePtr, LN->getOffset(), - LN->getAddressingMode()); - } - - Vals[Idx] = Load; - LoadChains[Idx] = Load.getValue(1); - - BasePtr = 
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Value = DAG.getBuildVector(Op.getValueType(), dl, Vals); - - if (LN->isIndexed()) { - SDValue RetOps[] = { Value, Vals[0].getValue(1), TF }; - return DAG.getMergeValues(RetOps, dl); - } - - SDValue RetOps[] = { Value, TF }; - return DAG.getMergeValues(RetOps, dl); - } - - assert(Op.getValueType() == MVT::v4i1 && "Unknown load to lower"); - assert(LN->isUnindexed() && "Indexed v4i1 loads are not supported"); - - // To lower v4i1 from a byte array, we load the byte elements of the - // vector and then reuse the BUILD_VECTOR logic. - - SDValue VectElmts[4], VectElmtChains[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - VectElmts[i] = DAG.getExtLoad( - ISD::EXTLOAD, dl, MVT::i32, LoadChain, Idx, - LN->getPointerInfo().getWithOffset(i), MVT::i8, - /* Alignment = */ 1, LN->getMemOperand()->getFlags(), LN->getAAInfo()); - VectElmtChains[i] = VectElmts[i].getValue(1); - } - - LoadChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, VectElmtChains); - SDValue Value = DAG.getBuildVector(MVT::v4i1, dl, VectElmts); + if (VT != MVT::v256i1 && VT != MVT::v512i1) + return Op; - SDValue RVals[] = { Value, LoadChain }; - return DAG.getMergeValues(RVals, dl); + // Type v256i1 is used for pairs and v512i1 is used for accumulators. + // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in + // 2 or 4 vsx registers. 
+ assert((VT != MVT::v512i1 || Subtarget.hasMMA()) && + "Type unsupported without MMA"); + assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) && + "Type unsupported without paired vector support"); + Align Alignment = LN->getAlign(); + SmallVector<SDValue, 4> Loads; + SmallVector<SDValue, 4> LoadChains; + unsigned NumVecs = VT.getSizeInBits() / 128; + for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { + SDValue Load = + DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr, + LN->getPointerInfo().getWithOffset(Idx * 16), + commonAlignment(Alignment, Idx * 16), + LN->getMemOperand()->getFlags(), LN->getAAInfo()); + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(16, dl, BasePtr.getValueType())); + Loads.push_back(Load); + LoadChains.push_back(Load.getValue(1)); + } + if (Subtarget.isLittleEndian()) { + std::reverse(Loads.begin(), Loads.end()); + std::reverse(LoadChains.begin(), LoadChains.end()); + } + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); + SDValue Value = + DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD, + dl, VT, Loads); + SDValue RetOps[] = {Value, TF}; + return DAG.getMergeValues(RetOps, dl); } -/// Lowering for QPX v4i1 stores SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -10828,122 +10112,40 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); + EVT StoreVT = Value.getValueType(); - if (Value.getValueType() == MVT::v4f64 || - Value.getValueType() == MVT::v4f32) { - EVT MemVT = SN->getMemoryVT(); - unsigned Alignment = SN->getAlignment(); - - // If this store is properly aligned, then it is legal. 
- if (Alignment >= MemVT.getStoreSize()) - return Op; - - EVT ScalarVT = Value.getValueType().getScalarType(), - ScalarMemVT = MemVT.getScalarType(); - unsigned Stride = ScalarMemVT.getStoreSize(); - - SDValue Stores[4]; - for (unsigned Idx = 0; Idx < 4; ++Idx) { - SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value, - DAG.getVectorIdxConstant(Idx, dl)); - SDValue Store; - if (ScalarVT != ScalarMemVT) - Store = - DAG.getTruncStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - ScalarMemVT, MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - else - Store = DAG.getStore(StoreChain, dl, Ex, BasePtr, - SN->getPointerInfo().getWithOffset(Idx * Stride), - MinAlign(Alignment, Idx * Stride), - SN->getMemOperand()->getFlags(), SN->getAAInfo()); - - if (Idx == 0 && SN->isIndexed()) { - assert(SN->getAddressingMode() == ISD::PRE_INC && - "Unknown addressing mode on vector store"); - Store = DAG.getIndexedStore(Store, dl, BasePtr, SN->getOffset(), - SN->getAddressingMode()); - } - - BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, - DAG.getConstant(Stride, dl, - BasePtr.getValueType())); - Stores[Idx] = Store; - } - - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); - - if (SN->isIndexed()) { - SDValue RetOps[] = { TF, Stores[0].getValue(1) }; - return DAG.getMergeValues(RetOps, dl); - } - - return TF; - } - - assert(SN->isUnindexed() && "Indexed v4i1 stores are not supported"); - assert(Value.getValueType() == MVT::v4i1 && "Unknown store to lower"); - - // The values are now known to be -1 (false) or 1 (true). To convert this - // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
- // This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 - Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - - // FIXME: We can make this an f32 vector, but the BUILD_VECTOR code needs to - // understand how to form the extending load. - SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::v4f64); - - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); - - // Now convert to an integer and store. - Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, - DAG.getConstant(Intrinsic::ppc_qpx_qvfctiwu, dl, MVT::i32), - Value); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - int FrameIdx = MFI.CreateStackObject(16, Align(16), false); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - - SDValue Ops[] = {StoreChain, - DAG.getConstant(Intrinsic::ppc_qpx_qvstfiw, dl, MVT::i32), - Value, FIdx}; - SDVTList VTs = DAG.getVTList(/*chain*/ MVT::Other); - - StoreChain = DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, - dl, VTs, Ops, MVT::v4i32, PtrInfo); - - // Move data into the byte array. 
- SDValue Loads[4], LoadChains[4]; - for (unsigned i = 0; i < 4; ++i) { - unsigned Offset = 4*i; - SDValue Idx = DAG.getConstant(Offset, dl, FIdx.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, FIdx.getValueType(), FIdx, Idx); - - Loads[i] = DAG.getLoad(MVT::i32, dl, StoreChain, Idx, - PtrInfo.getWithOffset(Offset)); - LoadChains[i] = Loads[i].getValue(1); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - - SDValue Stores[4]; - for (unsigned i = 0; i < 4; ++i) { - SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); - Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - - Stores[i] = DAG.getTruncStore( - StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), - MVT::i8, /* Alignment = */ 1, SN->getMemOperand()->getFlags(), - SN->getAAInfo()); - } - - StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); + if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1) + return Op; - return StoreChain; + // Type v256i1 is used for pairs and v512i1 is used for accumulators. + // Here we create 2 or 4 v16i8 stores to store the pair or accumulator + // underlying registers individually. + assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) && + "Type unsupported without MMA"); + assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) && + "Type unsupported without paired vector support"); + Align Alignment = SN->getAlign(); + SmallVector<SDValue, 4> Stores; + unsigned NumVecs = 2; + if (StoreVT == MVT::v512i1) { + Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); + NumVecs = 4; + } + for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { + unsigned VecNum = Subtarget.isLittleEndian() ? 
NumVecs - 1 - Idx : Idx; + SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, + DAG.getConstant(VecNum, dl, MVT::i64)); + SDValue Store = + DAG.getStore(StoreChain, dl, Elt, BasePtr, + SN->getPointerInfo().getWithOffset(Idx * 16), + commonAlignment(Alignment, Idx * 16), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, + DAG.getConstant(16, dl, BasePtr.getValueType())); + Stores.push_back(Store); + } + SDValue TF = DAG.getTokenFactor(dl, Stores); + return TF; } SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { @@ -11010,42 +10212,13 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { } } -SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { - - assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS"); - - EVT VT = Op.getValueType(); - assert(VT.isVector() && - "Only set vector abs as custom, scalar abs shouldn't reach here!"); - assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || - VT == MVT::v16i8) && - "Unexpected vector element type!"); - assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) && - "Current subtarget doesn't support smax v2i64!"); - - // For vector abs, it can be lowered to: - // abs x - // ==> - // y = -x - // smax(x, y) - - SDLoc dl(Op); - SDValue X = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, dl, VT); - SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X); - - // SMAX patch https://reviews.llvm.org/D47332 - // hasn't landed yet, so use intrinsic first here. 
- // TODO: Should use SMAX directly once SMAX patch landed - Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw; - if (VT == MVT::v2i64) - BifID = Intrinsic::ppc_altivec_vmaxsd; - else if (VT == MVT::v8i16) - BifID = Intrinsic::ppc_altivec_vmaxsh; - else if (VT == MVT::v16i8) - BifID = Intrinsic::ppc_altivec_vmaxsb; +SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 && + !Subtarget.hasP9Vector()) + return SDValue(); - return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT); + return Op; } // Custom lowering for fpext vf32 to v2f64 @@ -11158,8 +10331,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, SDLoc(Op)); + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); @@ -11169,16 +10346,20 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG); case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG); + case ISD::FSHL: return LowerFunnelShift(Op, DAG); + case ISD::FSHR: return LowerFunnelShift(Op, DAG); + // Vector-related lowering. 
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); - case ISD::ABS: return LowerABS(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::STRICT_FP_ROUND: + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); case ISD::ROTL: return LowerROTL(Op, DAG); // For counter-based loop handling. @@ -11246,23 +10427,28 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: // LowerFP_TO_INT() can only handle f32 and f64. - if (N->getOperand(0).getValueType() == MVT::ppcf128) + if (N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType() == + MVT::ppcf128) return; Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl)); return; case ISD::TRUNCATE: { - EVT TrgVT = N->getValueType(0); - EVT OpVT = N->getOperand(0).getValueType(); - if (TrgVT.isVector() && - isOperationCustom(N->getOpcode(), TrgVT) && - OpVT.getSizeInBits() <= 128 && - isPowerOf2_32(OpVT.getVectorElementType().getSizeInBits())) - Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG)); + if (!N->getValueType(0).isVector()) + return; + SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG); + if (Lowered) + Results.push_back(Lowered); return; } + case ISD::FSHL: + case ISD::FSHR: + // Don't handle funnel shifts here. + return; case ISD::BITCAST: // Don't handle bitcast here. 
return; @@ -11434,17 +10620,88 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, return BB; } +static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) { + switch(MI.getOpcode()) { + default: + return false; + case PPC::COPY: + return TII->isSignExtended(MI); + case PPC::LHA: + case PPC::LHA8: + case PPC::LHAU: + case PPC::LHAU8: + case PPC::LHAUX: + case PPC::LHAUX8: + case PPC::LHAX: + case PPC::LHAX8: + case PPC::LWA: + case PPC::LWAUX: + case PPC::LWAX: + case PPC::LWAX_32: + case PPC::LWA_32: + case PPC::PLHA: + case PPC::PLHA8: + case PPC::PLHA8pc: + case PPC::PLHApc: + case PPC::PLWA: + case PPC::PLWA8: + case PPC::PLWA8pc: + case PPC::PLWApc: + case PPC::EXTSB: + case PPC::EXTSB8: + case PPC::EXTSB8_32_64: + case PPC::EXTSB8_rec: + case PPC::EXTSB_rec: + case PPC::EXTSH: + case PPC::EXTSH8: + case PPC::EXTSH8_32_64: + case PPC::EXTSH8_rec: + case PPC::EXTSH_rec: + case PPC::EXTSW: + case PPC::EXTSWSLI: + case PPC::EXTSWSLI_32_64: + case PPC::EXTSWSLI_32_64_rec: + case PPC::EXTSWSLI_rec: + case PPC::EXTSW_32: + case PPC::EXTSW_32_64: + case PPC::EXTSW_32_64_rec: + case PPC::EXTSW_rec: + case PPC::SRAW: + case PPC::SRAWI: + case PPC::SRAWI_rec: + case PPC::SRAW_rec: + return true; + } + return false; +} + MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( MachineInstr &MI, MachineBasicBlock *BB, bool is8bit, // operation unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. + const PPCInstrInfo *TII = Subtarget.getInstrInfo(); + + // If this is a signed comparison and the value being compared is not known + // to be sign extended, sign extend it here. 
+ DebugLoc dl = MI.getDebugLoc(); + MachineFunction *F = BB->getParent(); + MachineRegisterInfo &RegInfo = F->getRegInfo(); + Register incr = MI.getOperand(3).getReg(); + bool IsSignExtended = Register::isVirtualRegister(incr) && + isSignExtended(*RegInfo.getVRegDef(incr), TII); + + if (CmpOpcode == PPC::CMPW && !IsSignExtended) { + Register ValueReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); + BuildMI(*BB, MI, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueReg) + .addReg(MI.getOperand(3).getReg()); + MI.getOperand(3).setReg(ValueReg); + } // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode, CmpPred); - // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // In 64 bit mode we have to use 64 bits for addresses, even though the // lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address // registers without caring whether they're 32 or 64, but here we're @@ -11454,14 +10711,11 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction *F = BB->getParent(); MachineFunction::iterator It = ++BB->getIterator(); Register dest = MI.getOperand(0).getReg(); Register ptrA = MI.getOperand(1).getReg(); Register ptrB = MI.getOperand(2).getReg(); - Register incr = MI.getOperand(3).getReg(); - DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = @@ -11475,7 +10729,6 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); - MachineRegisterInfo &RegInfo = F->getRegInfo(); const TargetRegisterClass *RC = is64bit ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; @@ -11950,18 +11203,34 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, Register SPReg = isPPC64 ? PPC::X1 : PPC::R1; Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); - - // Get the canonical FinalStackPtr like what - // PPCRegisterInfo::lowerDynamicAlloc does. - BuildMI(*MBB, {MI}, DL, - TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 - : PPC::PREPARE_PROBED_ALLOCA_32), - FramePointer) - .addDef(FinalStackPtr) + Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); + + // Since value of NegSizeReg might be realigned in prologepilog, insert a + // PREPARE_PROBED_ALLOCA pseudo instruction to get actual FramePointer and + // NegSize. + unsigned ProbeOpc; + if (!MRI.hasOneNonDBGUse(NegSizeReg)) + ProbeOpc = + isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32; + else + // By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg + // and NegSizeReg will be allocated in the same phyreg to avoid + // redundant copy when NegSizeReg has only one use which is current MI and + // will be replaced by PREPARE_PROBED_ALLOCA then. + ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 + : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32; + BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer) + .addDef(ActualNegSizeReg) .addReg(NegSizeReg) .add(MI.getOperand(2)) .add(MI.getOperand(3)); + // Calculate final stack pointer, which equals to SP + ActualNegSize. + BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), + FinalStackPtr) + .addReg(SPReg) + .addReg(ActualNegSizeReg); + // Materialize a scratch register for update. 
int64_t NegProbeSize = -(int64_t)ProbeSize; assert(isInt<32>(NegProbeSize) && "Unhandled probe size!"); @@ -11982,7 +11251,7 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, // Probing leading residual part. Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div) - .addReg(NegSizeReg) + .addReg(ActualNegSizeReg) .addReg(ScratchReg); Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul) @@ -11991,7 +11260,7 @@ PPCTargetLowering::emitProbedAlloca(MachineInstr &MI, Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod) .addReg(Mul) - .addReg(NegSizeReg); + .addReg(ActualNegSizeReg); BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg) .addReg(FramePointer) .addReg(SPReg) @@ -12102,9 +11371,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || - MI.getOpcode() == PPC::SELECT_CC_QFRC || - MI.getOpcode() == PPC::SELECT_CC_QSRC || - MI.getOpcode() == PPC::SELECT_CC_QBRC || MI.getOpcode() == PPC::SELECT_CC_VRRC || MI.getOpcode() == PPC::SELECT_CC_VSFRC || MI.getOpcode() == PPC::SELECT_CC_VSSRC || @@ -12114,9 +11380,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_SPE || MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_VRRC || @@ -12154,9 +11417,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_F16 || 
MI.getOpcode() == PPC::SELECT_SPE4 || MI.getOpcode() == PPC::SELECT_SPE || - MI.getOpcode() == PPC::SELECT_QFRC || - MI.getOpcode() == PPC::SELECT_QSRC || - MI.getOpcode() == PPC::SELECT_QBRC || MI.getOpcode() == PPC::SELECT_VRRC || MI.getOpcode() == PPC::SELECT_VSFRC || MI.getOpcode() == PPC::SELECT_VSSRC || @@ -12639,11 +11899,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg); // Set rounding mode to round-to-zero. - BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31); - BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30); + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)) + .addImm(31) + .addReg(PPC::RM, RegState::ImplicitDefine); + + BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)) + .addImm(30) + .addReg(PPC::RM, RegState::ImplicitDefine); // Perform addition. - BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2); + auto MIB = BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest) + .addReg(Src1) + .addReg(Src2); + if (MI.getFlag(MachineInstr::NoFPExcept)) + MIB.setMIFlag(MachineInstr::NoFPExcept); // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); @@ -12702,10 +11971,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // the immediate to set the bits 62:63 of FPSCR. unsigned Mode = MI.getOperand(1).getImm(); BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0)) - .addImm(31); + .addImm(31) + .addReg(PPC::RM, RegState::ImplicitDefine); BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? 
PPC::MTFSB1 : PPC::MTFSB0)) - .addImm(30); + .addImm(30) + .addReg(PPC::RM, RegState::ImplicitDefine); } else if (MI.getOpcode() == PPC::SETRND) { DebugLoc dl = MI.getDebugLoc(); @@ -12815,6 +12086,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewFPSCRReg) .addImm(0) .addImm(0); + } else if (MI.getOpcode() == PPC::SETFLM) { + DebugLoc Dl = MI.getDebugLoc(); + + // Result of setflm is previous FPSCR content, so we need to save it first. + Register OldFPSCRReg = MI.getOperand(0).getReg(); + BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg); + + // Put bits in 32:63 to FPSCR. + Register NewFPSCRReg = MI.getOperand(1).getReg(); + BuildMI(*BB, MI, Dl, TII->get(PPC::MTFSF)) + .addImm(255) + .addReg(NewFPSCRReg) + .addImm(0) + .addImm(0); } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 || MI.getOpcode() == PPC::PROBED_ALLOCA_64) { return emitProbedAlloca(MI, BB); @@ -12841,6 +12126,47 @@ static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) { return RefinementSteps; } +SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, + const DenormalMode &Mode) const { + // We only have VSX Vector Test for software Square Root. + EVT VT = Op.getValueType(); + if (!isTypeLegal(MVT::i1) || + (VT != MVT::f64 && + ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))) + return TargetLowering::getSqrtInputTest(Op, DAG, Mode); + + SDLoc DL(Op); + // The output register of FTSQRT is CR field. + SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op); + // ftsqrt BF,FRB + // Let e_b be the unbiased exponent of the double-precision + // floating-point operand in register FRB. + // fe_flag is set to 1 if either of the following conditions occurs. + // - The double-precision floating-point operand in register FRB is a zero, + // a NaN, or an infinity, or a negative value. + // - e_b is less than or equal to -970. + // Otherwise fe_flag is set to 0. 
+ // Both VSX and non-VSX versions would set EQ bit in the CR if the number is + // not eligible for iteration. (zero/negative/infinity/nan or unbiased + // exponent is less than -970) + SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1, + FTSQRT, SRIdxVal), + 0); +} + +SDValue +PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op, + SelectionDAG &DAG) const { + // We only have VSX Vector Square Root. + EVT VT = Op.getValueType(); + if (VT != MVT::f64 && + ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())) + return TargetLowering::getSqrtResultForDenormInput(Op, DAG); + + return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op); +} + SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, @@ -12849,9 +12175,7 @@ SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) || (VT == MVT::f64 && Subtarget.hasFRSQRTE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); @@ -12870,9 +12194,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, if ((VT == MVT::f32 && Subtarget.hasFRES()) || (VT == MVT::f64 && Subtarget.hasFRE()) || (VT == MVT::v4f32 && Subtarget.hasAltivec()) || - (VT == MVT::v2f64 && Subtarget.hasVSX()) || - (VT == MVT::v4f32 && Subtarget.hasQPX()) || - (VT == MVT::v4f64 && Subtarget.hasQPX())) { + (VT == MVT::v2f64 && Subtarget.hasVSX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); return 
DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand); @@ -12970,24 +12292,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_vsx_lxvw4x: @@ -13016,24 +12320,6 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, EVT VT; switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { default: return false; - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; - break; - case Intrinsic::ppc_qpx_qvstfiw: - case Intrinsic::ppc_qpx_qvstfiwa: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_vsx_stxvw4x: @@ -13261,11 +12547,13 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1)); // We don't really care about what is known about the first bit (if - // anything), so clear it in all masks prior to comparing them. 
- Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0); - Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0); + // anything), so pretend that it is known zero for both to ensure they can + // be compared as constants. + Op1Known.Zero.setBit(0); Op1Known.One.clearBit(0); + Op2Known.Zero.setBit(0); Op2Known.One.clearBit(0); - if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One) + if (!Op1Known.isConstant() || !Op2Known.isConstant() || + Op1Known.getConstant() != Op2Known.getConstant()) return SDValue(); } } @@ -13317,8 +12605,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by extensions. while (!BinOps.empty()) { - SDValue BinOp = BinOps.back(); - BinOps.pop_back(); + SDValue BinOp = BinOps.pop_back_val(); if (!Visited.insert(BinOp.getNode()).second) continue; @@ -13533,8 +12820,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, // Visit all inputs, collect all binary operations (and, or, xor and // select) that are all fed by truncations. while (!BinOps.empty()) { - SDValue BinOp = BinOps.back(); - BinOps.pop_back(); + SDValue BinOp = BinOps.pop_back_val(); if (!Visited.insert(BinOp.getNode()).second) continue; @@ -14131,6 +13417,46 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// Look for the pattern of a load from a narrow width to i128, feeding +// into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node +// (LXVRZX). This node represents a zero extending load that will be matched +// to the Load VSX Vector Rightmost instructions. +static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + + // This combine is only eligible for a BUILD_VECTOR of v1i128. 
+ if (N->getValueType(0) != MVT::v1i128) + return SDValue(); + + SDValue Operand = N->getOperand(0); + // Proceed with the transformation if the operand to the BUILD_VECTOR + // is a load instruction. + if (Operand.getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LD = dyn_cast<LoadSDNode>(Operand); + EVT MemoryType = LD->getMemoryVT(); + + // This transformation is only valid if the we are loading either a byte, + // halfword, word, or doubleword. + bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 || + MemoryType == MVT::i32 || MemoryType == MVT::i64; + + // Ensure that the load from the narrow width is being zero extended to i128. + if (!ValidLDType || + (LD->getExtensionType() != ISD::ZEXTLOAD && + LD->getExtensionType() != ISD::EXTLOAD)) + return SDValue(); + + SDValue LoadOps[] = { + LD->getChain(), LD->getBasePtr(), + DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), DL)}; + + return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, DL, + DAG.getVTList(MVT::v1i128, MVT::Other), + LoadOps, MemoryType, LD->getMemOperand()); +} + SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && @@ -14168,6 +13494,14 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, return Reduced; } + // On Power10, the Load VSX Vector Rightmost instructions can be utilized + // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR + // is a load from <valid narrow width> to i128. + if (Subtarget.isISA3_1()) { + SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG); + if (BVOfZLoad) + return BVOfZLoad; + } if (N->getValueType(0) != MVT::v2f64) return SDValue(); @@ -14231,6 +13565,8 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, // from the hardware. 
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) return SDValue(); + if (!Op.getOperand(0).getValueType().isSimple()) + return SDValue(); if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) || Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64)) return SDValue(); @@ -14467,8 +13803,7 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, EVT Op1VT = N->getOperand(1).getValueType(); EVT ResVT = Val.getValueType(); - // Floating point types smaller than 32 bits are not legal on Power. - if (ResVT.getScalarSizeInBits() < 32) + if (!isTypeLegal(ResVT)) return SDValue(); // Only perform combine for conversion to i64/i32 or power9 i16/i8. @@ -14562,7 +13897,6 @@ static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV, if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx)) ShuffV[i] += HalfVec; } - return; } // Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if @@ -15031,18 +14365,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT MemVT = LD->getMemoryVT(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty); - Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext()); - Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy); if (LD->isUnindexed() && VT.isVector() && ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) && // P8 and later hardware should just use LOAD. !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v4f32)) || - (Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) && - LD->getAlign() >= ScalarABIAlignment)) && + VT == MVT::v4f32))) && LD->getAlign() < ABIAlignment) { - // This is a type-legal unaligned Altivec or QPX load. + // This is a type-legal unaligned Altivec load. 
SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); bool isLittleEndian = Subtarget.isLittleEndian(); @@ -15073,24 +14403,13 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // optimization later. Intrinsic::ID Intr, IntrLD, IntrPerm; MVT PermCntlTy, PermTy, LDTy; - if (Subtarget.hasAltivec()) { - Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr : - Intrinsic::ppc_altivec_lvsl; - IntrLD = Intrinsic::ppc_altivec_lvx; - IntrPerm = Intrinsic::ppc_altivec_vperm; - PermCntlTy = MVT::v16i8; - PermTy = MVT::v4i32; - LDTy = MVT::v4i32; - } else { - Intr = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlpcld : - Intrinsic::ppc_qpx_qvlpcls; - IntrLD = MemVT == MVT::v4f64 ? Intrinsic::ppc_qpx_qvlfd : - Intrinsic::ppc_qpx_qvlfs; - IntrPerm = Intrinsic::ppc_qpx_qvfperm; - PermCntlTy = MVT::v4f64; - PermTy = MVT::v4f64; - LDTy = MemVT.getSimpleVT(); - } + Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr + : Intrinsic::ppc_altivec_lvsl; + IntrLD = Intrinsic::ppc_altivec_lvx; + IntrPerm = Intrinsic::ppc_altivec_vperm; + PermCntlTy = MVT::v16i8; + PermTy = MVT::v4i32; + LDTy = MVT::v4i32; SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy); @@ -15161,10 +14480,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, BaseLoad, ExtraLoad, PermCntl, DAG, dl); if (VT != PermTy) - Perm = Subtarget.hasAltivec() ? - DAG.getNode(ISD::BITCAST, dl, VT, Perm) : - DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, // QPX - DAG.getTargetConstant(1, dl, MVT::i64)); + Perm = Subtarget.hasAltivec() + ? DAG.getNode(ISD::BITCAST, dl, VT, Perm) + : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm, + DAG.getTargetConstant(1, dl, MVT::i64)); // second argument is 1 because this rounding // is always exact. @@ -15180,14 +14499,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); Intrinsic::ID Intr = (isLittleEndian ? 
Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); - if ((IID == Intr || - IID == Intrinsic::ppc_qpx_qvlpcld || - IID == Intrinsic::ppc_qpx_qvlpcls) && - N->getOperand(1)->getOpcode() == ISD::ADD) { + if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { SDValue Add = N->getOperand(1); - int Bits = IID == Intrinsic::ppc_qpx_qvlpcld ? - 5 /* 32 byte alignment */ : 4 /* 16 byte alignment */; + int Bits = 4 /* 16 byte alignment */; if (DAG.MaskedValueIsZero(Add->getOperand(1), APInt::getAllOnesValue(Bits /* alignment */) @@ -15197,7 +14512,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, UE = BasePtr->use_end(); UI != UE; ++UI) { if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == IID) { + cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() == + IID) { // We've found another LVSL/LVSR, and this address is an aligned // multiple of that one. The results will be the same, so use the // one we've just found instead. @@ -15329,43 +14645,43 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } break; case PPCISD::VCMP: - // If a VCMPo node already exists with exactly the same operands as this - // node, use its result instead of this node (VCMPo computes both a CR6 and - // a normal output). + // If a VCMP_rec node already exists with exactly the same operands as this + // node, use its result instead of this node (VCMP_rec computes both a CR6 + // and a normal output). // if (!N->getOperand(0).hasOneUse() && !N->getOperand(1).hasOneUse() && !N->getOperand(2).hasOneUse()) { - // Scan all of the users of the LHS, looking for VCMPo's that match. - SDNode *VCMPoNode = nullptr; + // Scan all of the users of the LHS, looking for VCMP_rec's that match. 
+ SDNode *VCMPrecNode = nullptr; SDNode *LHSN = N->getOperand(0).getNode(); for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end(); UI != E; ++UI) - if (UI->getOpcode() == PPCISD::VCMPo && + if (UI->getOpcode() == PPCISD::VCMP_rec && UI->getOperand(1) == N->getOperand(1) && UI->getOperand(2) == N->getOperand(2) && UI->getOperand(0) == N->getOperand(0)) { - VCMPoNode = *UI; + VCMPrecNode = *UI; break; } - // If there is no VCMPo node, or if the flag value has a single use, don't - // transform this. - if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1)) + // If there is no VCMP_rec node, or if the flag value has a single use, + // don't transform this. + if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(0, 1)) break; // Look at the (necessarily single) use of the flag value. If it has a // chain, this transformation is more complex. Note that multiple things // could use the value result, which we should ignore. SDNode *FlagUser = nullptr; - for (SDNode::use_iterator UI = VCMPoNode->use_begin(); + for (SDNode::use_iterator UI = VCMPrecNode->use_begin(); FlagUser == nullptr; ++UI) { - assert(UI != VCMPoNode->use_end() && "Didn't find user!"); + assert(UI != VCMPrecNode->use_end() && "Didn't find user!"); SDNode *User = *UI; for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) { - if (User->getOperand(i) == SDValue(VCMPoNode, 1)) { + if (User->getOperand(i) == SDValue(VCMPrecNode, 1)) { FlagUser = User; break; } @@ -15375,7 +14691,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // If the user is a MFOCRF instruction, we know this is safe. // Otherwise we give up for right now. 
if (FlagUser->getOpcode() == PPCISD::MFOCRF) - return SDValue(VCMPoNode, 0); + return SDValue(VCMPrecNode, 0); } break; case ISD::BRCOND: { @@ -15464,7 +14780,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAG.getConstant(CompareOpc, dl, MVT::i32) }; EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue }; - SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops); + SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops); // Unpack the result based on how the target uses it. PPC::Predicate CompOpc; @@ -15559,16 +14875,19 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::ppc_altivec_vcmpequh_p: case Intrinsic::ppc_altivec_vcmpequw_p: case Intrinsic::ppc_altivec_vcmpequd_p: + case Intrinsic::ppc_altivec_vcmpequq_p: case Intrinsic::ppc_altivec_vcmpgefp_p: case Intrinsic::ppc_altivec_vcmpgtfp_p: case Intrinsic::ppc_altivec_vcmpgtsb_p: case Intrinsic::ppc_altivec_vcmpgtsh_p: case Intrinsic::ppc_altivec_vcmpgtsw_p: case Intrinsic::ppc_altivec_vcmpgtsd_p: + case Intrinsic::ppc_altivec_vcmpgtsq_p: case Intrinsic::ppc_altivec_vcmpgtub_p: case Intrinsic::ppc_altivec_vcmpgtuh_p: case Intrinsic::ppc_altivec_vcmpgtuw_p: case Intrinsic::ppc_altivec_vcmpgtud_p: + case Intrinsic::ppc_altivec_vcmpgtuq_p: Known.Zero = ~1U; // All bits but the low one are known to be zero. 
break; } @@ -15746,17 +15065,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::F4RCRegClass); if (VT == MVT::f64 || VT == MVT::i64) return std::make_pair(0U, &PPC::F8RCRegClass); - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); } break; case 'v': - if (VT == MVT::v4f64 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QFRCRegClass); - if (VT == MVT::v4f32 && Subtarget.hasQPX()) - return std::make_pair(0U, &PPC::QSRCRegClass); if (Subtarget.hasAltivec()) return std::make_pair(0U, &PPC::VRRCRegClass); break; @@ -15892,9 +15203,15 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS, Instruction *I) const { - // PPC does not allow r+i addressing modes for vectors! - if (Ty->isVectorTy() && AM.BaseOffs != 0) + unsigned AS, + Instruction *I) const { + // Vector type r+i form is supported since power9 as DQ form. We don't check + // the offset matching DQ form requirement(off % 16 == 0), because on PowerPC, + // imm form is preferred and the offset can be adjusted to use imm form later + // in pass PPCLoopInstrFormPrep. Also in LSR, for one LSRUse, it uses min and + // max offset to check legal addressing mode, we should be a little aggressive + // to contain other offsets for that LSRUse. + if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector()) return false; // PPC allows a sign-extended 16-bit immediate field. 
@@ -16048,19 +15365,17 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfd: - case Intrinsic::ppc_qpx_qvlfs: - case Intrinsic::ppc_qpx_qvlfcd: - case Intrinsic::ppc_qpx_qvlfcs: - case Intrinsic::ppc_qpx_qvlfiwa: - case Intrinsic::ppc_qpx_qvlfiwz: case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: case Intrinsic::ppc_altivec_lvebx: case Intrinsic::ppc_altivec_lvehx: case Intrinsic::ppc_altivec_lvewx: case Intrinsic::ppc_vsx_lxvd2x: - case Intrinsic::ppc_vsx_lxvw4x: { + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x_be: + case Intrinsic::ppc_vsx_lxvw4x_be: + case Intrinsic::ppc_vsx_lxvl: + case Intrinsic::ppc_vsx_lxvll: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_lvebx: @@ -16073,20 +15388,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, VT = MVT::i32; break; case Intrinsic::ppc_vsx_lxvd2x: + case Intrinsic::ppc_vsx_lxvd2x_be: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvlfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16101,52 +15405,17 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOLoad; return true; } - case Intrinsic::ppc_qpx_qvlfda: - case Intrinsic::ppc_qpx_qvlfsa: - case Intrinsic::ppc_qpx_qvlfcda: - case Intrinsic::ppc_qpx_qvlfcsa: - case Intrinsic::ppc_qpx_qvlfiwaa: - case Intrinsic::ppc_qpx_qvlfiwza: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvlfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvlfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvlfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvlfcsa: - VT = MVT::v2f32; - break; - default: - VT = MVT::v4i32; - 
break; - } - - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOLoad; - return true; - } - case Intrinsic::ppc_qpx_qvstfd: - case Intrinsic::ppc_qpx_qvstfs: - case Intrinsic::ppc_qpx_qvstfcd: - case Intrinsic::ppc_qpx_qvstfcs: - case Intrinsic::ppc_qpx_qvstfiw: case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: case Intrinsic::ppc_altivec_stvebx: case Intrinsic::ppc_altivec_stvehx: case Intrinsic::ppc_altivec_stvewx: case Intrinsic::ppc_vsx_stxvd2x: - case Intrinsic::ppc_vsx_stxvw4x: { + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x_be: + case Intrinsic::ppc_vsx_stxvw4x_be: + case Intrinsic::ppc_vsx_stxvl: + case Intrinsic::ppc_vsx_stxvll: { EVT VT; switch (Intrinsic) { case Intrinsic::ppc_altivec_stvebx: @@ -16159,20 +15428,9 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, VT = MVT::i32; break; case Intrinsic::ppc_vsx_stxvd2x: + case Intrinsic::ppc_vsx_stxvd2x_be: VT = MVT::v2f64; break; - case Intrinsic::ppc_qpx_qvstfd: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfs: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcd: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcs: - VT = MVT::v2f32; - break; default: VT = MVT::v4i32; break; @@ -16187,39 +15445,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } - case Intrinsic::ppc_qpx_qvstfda: - case Intrinsic::ppc_qpx_qvstfsa: - case Intrinsic::ppc_qpx_qvstfcda: - case Intrinsic::ppc_qpx_qvstfcsa: - case Intrinsic::ppc_qpx_qvstfiwa: { - EVT VT; - switch (Intrinsic) { - case Intrinsic::ppc_qpx_qvstfda: - VT = MVT::v4f64; - break; - case Intrinsic::ppc_qpx_qvstfsa: - VT = MVT::v4f32; - break; - case Intrinsic::ppc_qpx_qvstfcda: - VT = MVT::v2f64; - break; - case Intrinsic::ppc_qpx_qvstfcsa: - VT = MVT::v2f32; 
- break; - default: - VT = MVT::v4i32; - break; - } - - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = VT; - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.size = VT.getStoreSize(); - Info.align = Align(1); - Info.flags = MachineMemOperand::MOStore; - return true; - } default: break; } @@ -16232,14 +15457,6 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { - // When expanding a memset, require at least two QPX instructions to cover - // the cost of loading the value to be stored from the constant pool. - if (Subtarget.hasQPX() && Op.size() >= 32 && - (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) && - !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - return MVT::v4f64; - } - // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. if (Subtarget.hasAltivec() && Op.size() >= 16 && @@ -16358,6 +15575,33 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return true; } +bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const { + // Check integral scalar types. + if (!VT.isScalarInteger()) + return false; + if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) { + if (!ConstNode->getAPIntValue().isSignedIntN(64)) + return false; + // This transformation will generate >= 2 operations. But the following + // cases will generate <= 2 instructions during ISEL. So exclude them. + // 1. If the constant multiplier fits 16 bits, it can be handled by one + // HW instruction, ie. MULLI + // 2. If the multiplier after shifted fits 16 bits, an extra shift + // instruction is needed than case 1, ie. 
MULLI and RLDICR + int64_t Imm = ConstNode->getSExtValue(); + unsigned Shift = countTrailingZeros<uint64_t>(Imm); + Imm >>= Shift; + if (isInt<16>(Imm)) + return false; + uint64_t UImm = static_cast<uint64_t>(Imm); + if (isPowerOf2_64(UImm + 1) || isPowerOf2_64(UImm - 1) || + isPowerOf2_64(1 - UImm) || isPowerOf2_64(-1 - UImm)) + return true; + } + return false; +} + bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { return isFMAFasterThanFMulAndFAdd( @@ -16377,31 +15621,56 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, } } -// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist. -// FIXME: add more patterns which are profitable to hoist. +// FIXME: add more patterns which are not profitable to hoist. bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { - if (I->getOpcode() != Instruction::FMul) - return true; - if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); assert(User && "A single use instruction with no uses."); - if (User->getOpcode() != Instruction::FSub && - User->getOpcode() != Instruction::FAdd) - return true; + switch (I->getOpcode()) { + case Instruction::FMul: { + // Don't break FMA, PowerPC prefers FMA. 
+ if (User->getOpcode() != Instruction::FSub && + User->getOpcode() != Instruction::FAdd) + return true; - const TargetOptions &Options = getTargetMachine().Options; - const Function *F = I->getFunction(); - const DataLayout &DL = F->getParent()->getDataLayout(); - Type *Ty = User->getOperand(0)->getType(); + const TargetOptions &Options = getTargetMachine().Options; + const Function *F = I->getFunction(); + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *Ty = User->getOperand(0)->getType(); + + return !( + isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + } + case Instruction::Load: { + // Don't break "store (load float*)" pattern, this pattern will be combined + // to "store (load int32)" in later InstCombine pass. See function + // combineLoadToOperationType. On PowerPC, loading a float point takes more + // cycles than loading a 32 bit integer. + LoadInst *LI = cast<LoadInst>(I); + // For the loads that combineLoadToOperationType does nothing, like + // ordered load, it should be profitable to hoist them. + // For swifterror load, it can only be used for pointer to pointer type, so + // later type check should get rid of this case. 
+ if (!LI->isUnordered()) + return true; + + if (User->getOpcode() != Instruction::Store) + return true; + + if (I->getType()->getTypeID() != Type::FloatTyID) + return true; - return !( - isFMAFasterThanFMulAndFAdd(*F, Ty) && - isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); + return false; + } + default: + return true; + } + return true; } const MCPhysReg * @@ -16433,7 +15702,7 @@ PPCTargetLowering::shouldExpandBuildVectorWithShuffles( if (VT == MVT::v2i64) return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves - if (Subtarget.hasVSX() || Subtarget.hasQPX()) + if (Subtarget.hasVSX()) return true; return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); @@ -16479,8 +15748,7 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, switch (Opc) { case PPCISD::FNMSUB: - // TODO: QPX subtarget is deprecated. No transformation here. - if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX()) + if (!Op.hasOneUse() || !isTypeLegal(VT)) break; const TargetOptions &Options = getTargetMachine().Options; @@ -16609,10 +15877,10 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { SDValue N0 = N->getOperand(0); ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!Subtarget.isISA3_0() || + if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() || N0.getOpcode() != ISD::SIGN_EXTEND || - N0.getOperand(0).getValueType() != MVT::i32 || - CN1 == nullptr || N->getValueType(0) != MVT::i64) + N0.getOperand(0).getValueType() != MVT::i32 || CN1 == nullptr || + N->getValueType(0) != MVT::i64) return SDValue(); // We can't save an operation here if the value is already extended, and @@ -16961,8 +16229,7 @@ SDValue PPCTargetLowering::combineFMALike(SDNode *N, bool LegalOps = !DCI.isBeforeLegalizeOps(); SDLoc Loc(N); - // TODO: QPX subtarget is deprecated. No transformation here. 
- if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT)) + if (!isOperationLegal(ISD::FMA, VT)) return SDValue(); // Allowing transformation to FNMSUB may change sign of zeroes when ab-c=0 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 768eaa43e013..477105bd03ac 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -89,6 +89,12 @@ namespace llvm { FRE, FRSQRTE, + /// Test instruction for software square root. + FTSQRT, + + /// Square root instruction. + FSQRT, + /// VPERM - The PPC VPERM Instruction. /// VPERM, @@ -146,8 +152,7 @@ namespace llvm { /// probed. PROBED_ALLOCA, - /// GlobalBaseReg - On Darwin, this node represents the result of the mflr - /// at function entry, used for PIC code. + /// The result of the mflr at function entry, used for PIC code. GlobalBaseReg, /// These nodes represent PPC shifts. @@ -265,11 +270,11 @@ namespace llvm { /// is VCMPGTSH. VCMP, - /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the - /// altivec VCMP*o instructions. For lack of better number, we use the + /// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the + /// altivec VCMP*_rec instructions. For lack of better number, we use the /// opcode number encoding for the OPC field to identify the compare. For /// example, 838 is VCMPGTSH. - VCMPo, + VCMP_rec, /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the @@ -381,6 +386,10 @@ namespace llvm { /// sym\@got\@dtprel\@l. ADDI_DTPREL_L, + /// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS + /// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. + PADDI_DTPREL, + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded /// during instruction selection to optimize a BUILD_VECTOR into /// operations on splats. 
This is necessary to avoid losing these @@ -427,22 +436,6 @@ namespace llvm { /// => VABSDUW((XVNEGSP a), (XVNEGSP b)) VABSD, - /// QVFPERM = This corresponds to the QPX qvfperm instruction. - QVFPERM, - - /// QVGPCI = This corresponds to the QPX qvgpci instruction. - QVGPCI, - - /// QVALIGNI = This corresponds to the QPX qvaligni instruction. - QVALIGNI, - - /// QVESPLATI = This corresponds to the QPX qvesplati instruction. - QVESPLATI, - - /// QBFLT = Access the underlying QPX floating-point boolean - /// representation. - QBFLT, - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or /// lower (IDX=1) half of v4f32 to v2f64. FP_EXTEND_HALF, @@ -452,6 +445,46 @@ namespace llvm { /// PLD. MAT_PCREL_ADDR, + /// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for + /// TLS global address when using dynamic access models. This can be done + /// through an add like PADDI. + TLS_DYNAMIC_MAT_PCREL_ADDR, + + /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address + /// when using local exec access models, and when prefixed instructions are + /// available. This is used with ADD_TLS to produce an add like PADDI. + TLS_LOCAL_EXEC_MAT_ADDR, + + /// ACC_BUILD = Build an accumulator register from 4 VSX registers. + ACC_BUILD, + + /// PAIR_BUILD = Build a vector pair register from 2 VSX registers. + PAIR_BUILD, + + /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of + /// an accumulator or pair register. This node is needed because + /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same + /// element type. + EXTRACT_VSX_REG, + + /// XXMFACC = This corresponds to the xxmfacc instruction. + XXMFACC, + + // Constrained conversion from floating point to int + STRICT_FCTIDZ = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCTIWZ, + STRICT_FCTIDUZ, + STRICT_FCTIWUZ, + + /// Constrained integer-to-floating-point conversion instructions. 
+ STRICT_FCFID, + STRICT_FCFIDU, + STRICT_FCFIDS, + STRICT_FCFIDUS, + + /// Constrained floating point add in round-to-zero mode. + STRICT_FADDRTZ, + /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a /// byte-swapping store instruction. It byte-swaps the low "Type" bits of /// the GPRC input, then stores it through Ptr. Type can be either i16 or @@ -493,6 +526,12 @@ namespace llvm { /// an xxswapd. LXVD2X, + /// LXVRZX - Load VSX Vector Rightmost and Zero Extend + /// This node represents v1i128 BUILD_VECTOR of a zero extending load + /// instruction from <byte, halfword, word, or doubleword> to i128. + /// Allows utilization of the Load VSX Vector Rightmost Instructions. + LXVRZX, + /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on /// the vector type to load vector in big-endian element order. @@ -519,10 +558,6 @@ namespace llvm { /// Store scalar integers from VSR. ST_VSR_SCAL_INT, - /// QBRC, CHAIN = QVLFSb CHAIN, Ptr - /// The 4xf32 load used for v4i1 constants. - QVLFSb, - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes /// except they ensure that the compare input is zero-extended for /// sub-word versions because the atomic loads zero-extend. @@ -627,10 +662,6 @@ namespace llvm { /// the number of bytes of each element [124] -> [bhw]. SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG); - /// If this is a qvaligni shuffle mask, return the shift - /// amount, otherwise return -1. 
- int isQVALIGNIShuffleMask(SDNode *N); - } // end namespace PPC class PPCTargetLowering : public TargetLowering { @@ -740,6 +771,8 @@ namespace llvm { bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, MaybeAlign EncodingAlignment) const; + bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base, + SelectionDAG &DAG) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. @@ -895,6 +928,9 @@ namespace llvm { return true; } + bool decomposeMulByConstant(LLVMContext &Context, EVT VT, + SDValue C) const override; + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override { // Only handle float load/store pair because float(fpr) load/store @@ -980,11 +1016,6 @@ namespace llvm { Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override; - /// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a - /// specific type is cheaper than a multiply followed by a shift. - /// This is true for words and doublewords on 64-bit PowerPC. - bool isMulhCheaperThanMulShift(EVT Type) const override; - /// Override to support customized stack guard loading. bool useLoadStackGuardNode() const override; void insertSSPDeclarations(Module &M) const override; @@ -1042,11 +1073,6 @@ namespace llvm { } }; - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. 
- return true; - } - bool canReuseLoadAddress(SDValue Op, EVT MemVT, ReuseLoadInfo &RLI, SelectionDAG &DAG, ISD::LoadExtType ET = ISD::NON_EXTLOAD) const; @@ -1117,19 +1143,18 @@ namespace llvm { SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; @@ -1176,10 +1201,6 @@ namespace llvm { SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; - SDValue LowerFormalArguments_Darwin( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const; SDValue LowerFormalArguments_64SVR4( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -1194,13 +1215,6 @@ namespace llvm { ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) const; - SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee, CallFlags CFlags, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - const SDLoc &dl, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals, - const CallBase *CB) const; SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallFlags CFlags, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, @@ -1257,6 +1271,10 @@ namespace llvm { bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; + SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG, + const DenormalMode &Mode) const override; + SDValue getSqrtResultForDenormInput(SDValue Operand, + SelectionDAG &DAG) const override; unsigned combineRepeatedFPDivisors() const override; SDValue @@ -1295,6 +1313,8 @@ namespace llvm { bool isIntS16Immediate(SDNode *N, int16_t &Imm); bool isIntS16Immediate(SDValue Op, int16_t &Imm); + bool isIntS34Immediate(SDNode *N, int64_t &Imm); + bool isIntS34Immediate(SDValue Op, int64_t &Imm); bool convertToNonDenormSingle(APInt &ArgAPInt); bool convertToNonDenormSingle(APFloat &ArgAPFloat); diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 1c457d4170d5..03e9d6970a30 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -19,12 +19,14 @@ def s16imm64 : Operand<i64> { let EncoderMethod = "getImm16Encoding"; let ParserMatchClass = PPCS16ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def u16imm64 : Operand<i64> { let PrintMethod = "printU16ImmOperand"; let EncoderMethod = "getImm16Encoding"; let 
ParserMatchClass = PPCU16ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def s17imm64 : Operand<i64> { // This operand type is used for addis/lis to allow the assembler parser @@ -34,6 +36,7 @@ def s17imm64 : Operand<i64> { let EncoderMethod = "getImm16Encoding"; let ParserMatchClass = PPCS17ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def tocentry : Operand<iPTR> { let MIOperandInfo = (ops i64imm:$imm); @@ -148,6 +151,9 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { def BL8_NOTOC : IForm<18, 0, 1, (outs), (ins calltarget:$func), "bl $func", IIC_BrB, []>; + def BL8_NOTOC_TLS : IForm<18, 0, 1, (outs), + (ins tlscall:$func), + "bl $func", IIC_BrB, []>; } } let Uses = [CTR8, RM] in { @@ -431,9 +437,14 @@ def PROBED_ALLOCA_64 : PPCCustomInserterPseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_64", [(set i64:$result, (PPCprobedalloca i64:$negsize, iaddr:$fpsi))]>; -def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp, - g8rc:$sp), +def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs + g8rc:$fp, g8rc:$actual_negsize), (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>; +def PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 : PPCEmitTimePseudo<(outs + g8rc:$fp, g8rc:$actual_negsize), + (ins g8rc:$negsize, memri:$fpsi), + "#PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64", []>, + RegConstraint<"$actual_negsize = $negsize">; def PROBED_STACKALLOC_64 : PPCEmitTimePseudo<(outs g8rc:$scratch, g8rc:$temp), (ins i64imm:$stacksize), "#PROBED_STACKALLOC_64", []>; @@ -835,7 +846,7 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in { def SETB8 : XForm_44<31, 128, (outs g8rc:$RT), (ins crrc:$BFA), "setb $RT, $BFA", IIC_IntGeneral>, isPPC64; } -def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins i32imm:$L), +def DARN : XForm_45<31, 755, (outs g8rc:$RT), (ins u2imm:$L), "darn $RT, $L", IIC_LdStLD>, isPPC64; def ADDPCIS : 
DXForm<19, 2, (outs g8rc:$RT), (ins i32imm:$D), "addpcis $RT, $D", IIC_BrB, []>, isPPC64; @@ -976,8 +987,11 @@ def : InstAlias<"cntlzw. $rA, $rS", (CNTLZW8_rec g8rc:$rA, g8rc:$rS)>; def : InstAlias<"mtxer $Rx", (MTSPR8 1, g8rc:$Rx)>; def : InstAlias<"mfxer $Rx", (MFSPR8 g8rc:$Rx, 1)>; -def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>; -def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>; +//Disable this alias on AIX for now because as does not support them. +let Predicates = [ModernAs] in { + def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>; + def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>; +} def : InstAlias<"mfrtcu $Rx", (MFSPR8 g8rc:$Rx, 4)>; def : InstAlias<"mfrtcl $Rx", (MFSPR8 g8rc:$Rx, 5)>; @@ -1021,8 +1035,8 @@ def : InstAlias<"mfamr $Rx", (MFSPR8 g8rc:$Rx, 29)>; foreach SPRG = 0-3 in { def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR8 g8rc:$RT, !add(SPRG, 272))>; def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR8 g8rc:$RT, !add(SPRG, 272))>; - def : InstAlias<"mfsprg "#SPRG#", $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>; - def : InstAlias<"mfsprg"#SPRG#" $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>; + def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>; + def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>; } def : InstAlias<"mfasr $RT", (MFSPR8 g8rc:$RT, 280)>; @@ -1051,7 +1065,7 @@ def LHA8: DForm_1<42, (outs g8rc:$rD), (ins memri:$src), def LWA : DSForm_1<58, 2, (outs g8rc:$rD), (ins memrix:$src), "lwa $rD, $src", IIC_LdStLWA, [(set i64:$rD, - (aligned4sextloadi32 iaddrX4:$src))]>, isPPC64, + (DSFormSextLoadi32 iaddrX4:$src))]>, isPPC64, PPC970_DGroup_Cracked; let Interpretation64Bit = 1, isCodeGenOnly = 1 in def LHAX8: XForm_1_memOp<31, 343, (outs g8rc:$rD), (ins memrr:$src), @@ -1162,7 +1176,7 @@ def LWZUX8 : XForm_1_memOp<31, 55, (outs g8rc:$rD, ptr_rc_nor0:$ea_result), let PPC970_Unit = 2 in { def LD : DSForm_1<58, 0, (outs g8rc:$rD), (ins memrix:$src), "ld $rD, $src", IIC_LdStLD, - [(set i64:$rD, 
(aligned4load iaddrX4:$src))]>, isPPC64; + [(set i64:$rD, (DSFormLoad iaddrX4:$src))]>, isPPC64; // The following four definitions are selected for small code model only. // Otherwise, we need to create two instructions to form a 32-bit offset, // so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select(). @@ -1257,17 +1271,36 @@ def ADDItlsgdL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm6 [(set i64:$rD, (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -// LR8 is a true define, while the rest of the Defs are clobbers. X3 is + +class GETtlsADDRPseudo <string asmstr> : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), + asmstr, + [(set i64:$rD, + (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; +class GETtlsldADDRPseudo <string asmstr> : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), + asmstr, + [(set i64:$rD, + (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; + +let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1 in { +// LR8 is a true define, while the rest of the Defs are clobbers. X3 is // explicitly defined when this op is created, so not mentioned here. // This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be // correct because the branch select pass is relying on it. 
-let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, - Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def GETtlsADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), - "#GETtlsADDR", - [(set i64:$rD, - (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>, - isPPC64; +let Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7], Size = 8 in +def GETtlsADDR : GETtlsADDRPseudo <"#GETtlsADDR">; +let Defs = [X0,X2,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7], Size = 8 in +def GETtlsADDRPCREL : GETtlsADDRPseudo <"#GETtlsADDRPCREL">; + +// LR8 is a true define, while the rest of the Defs are clobbers. X3 is +// explicitly defined when this op is created, so not mentioned here. +let Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in +def GETtlsldADDR : GETtlsldADDRPseudo <"#GETtlsldADDR">; +let Defs = [X0,X2,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in +def GETtlsldADDRPCREL : GETtlsldADDRPseudo <"#GETtlsldADDRPCREL">; +} + // Combined op for ADDItlsgdL and GETtlsADDR, late expanded. X3 and LR8 // are true defines while the rest of the Defs are clobbers. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, @@ -1291,15 +1324,6 @@ def ADDItlsldL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm6 [(set i64:$rD, (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; -// LR8 is a true define, while the rest of the Defs are clobbers. X3 is -// explicitly defined when this op is created, so not mentioned here. -let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, - Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in -def GETtlsldADDR : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), - "#GETtlsldADDR", - [(set i64:$rD, - (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>, - isPPC64; // Combined op for ADDItlsldL and GETtlsADDR, late expanded. 
X3 and LR8 // are true defines, while the rest of the Defs are clobbers. let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, @@ -1324,6 +1348,11 @@ def ADDIdtprelL : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm [(set i64:$rD, (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>, isPPC64; +def PADDIdtprel : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), + "#PADDIdtprel", + [(set i64:$rD, + (PPCpaddiDtprel i64:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; let PPC970_Unit = 2 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { @@ -1354,7 +1383,7 @@ def STWX8 : XForm_8_memOp<31, 151, (outs), (ins g8rc:$rS, memrr:$dst), // Normal 8-byte stores. def STD : DSForm_1<62, 0, (outs), (ins g8rc:$rS, memrix:$dst), "std $rS, $dst", IIC_LdStSTD, - [(aligned4store i64:$rS, iaddrX4:$dst)]>, isPPC64; + [(DSFormStore i64:$rS, iaddrX4:$dst)]>, isPPC64; def STDX : XForm_8_memOp<31, 149, (outs), (ins g8rc:$rS, memrr:$dst), "stdx $rS, $dst", IIC_LdStSTD, [(store i64:$rS, xaddrX4:$dst)]>, isPPC64, @@ -1421,7 +1450,7 @@ def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>; def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>; -def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), +def : Pat<(DSFormPreStore i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff), (STDU $rS, iaddroff:$ptroff, $ptrreg)>; def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), @@ -1439,11 +1468,11 @@ def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff), // -let PPC970_Unit = 3, hasSideEffects = 0, +let PPC970_Unit = 3, hasSideEffects = 0, mayRaiseFPException = 1, Uses = [RM] in { // FPU Operations. 
defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB), "fcfid", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64; + [(set f64:$frD, (PPCany_fcfid f64:$frB))]>, isPPC64; defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB), "fctid", "$frD, $frB", IIC_FPGeneral, []>, isPPC64; @@ -1452,23 +1481,23 @@ defm FCTIDU : XForm_26r<63, 942, (outs f8rc:$frD), (ins f8rc:$frB), []>, isPPC64; defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB), "fctidz", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64; + [(set f64:$frD, (PPCany_fctidz f64:$frB))]>, isPPC64; defm FCFIDU : XForm_26r<63, 974, (outs f8rc:$frD), (ins f8rc:$frB), "fcfidu", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64; + [(set f64:$frD, (PPCany_fcfidu f64:$frB))]>, isPPC64; defm FCFIDS : XForm_26r<59, 846, (outs f4rc:$frD), (ins f8rc:$frB), "fcfids", "$frD, $frB", IIC_FPGeneral, - [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64; + [(set f32:$frD, (PPCany_fcfids f64:$frB))]>, isPPC64; defm FCFIDUS : XForm_26r<59, 974, (outs f4rc:$frD), (ins f8rc:$frB), "fcfidus", "$frD, $frB", IIC_FPGeneral, - [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64; + [(set f32:$frD, (PPCany_fcfidus f64:$frB))]>, isPPC64; defm FCTIDUZ : XForm_26r<63, 943, (outs f8rc:$frD), (ins f8rc:$frB), "fctiduz", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64; + [(set f64:$frD, (PPCany_fctiduz f64:$frB))]>, isPPC64; defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB), "fctiwuz", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64; + [(set f64:$frD, (PPCany_fctiwuz f64:$frB))]>, isPPC64; } @@ -1565,11 +1594,11 @@ def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)), // Patterns to match r+r indexed loads and stores for // addresses without at least 4-byte alignment. 
-def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)), +def : Pat<(i64 (NonDSFormSextLoadi32 xoaddr:$src)), (LWAX xoaddr:$src)>; -def : Pat<(i64 (unaligned4load xoaddr:$src)), +def : Pat<(i64 (NonDSFormLoad xoaddr:$src)), (LDX xoaddr:$src)>; -def : Pat<(unaligned4store i64:$rS, xoaddr:$dst), +def : Pat<(NonDSFormStore i64:$rS, xoaddr:$dst), (STDX $rS, xoaddr:$dst)>; // 64-bits atomic loads and stores @@ -1580,6 +1609,11 @@ def : Pat<(atomic_store_64 iaddrX4:$ptr, i64:$val), (STD g8rc:$val, memrix:$ptr def : Pat<(atomic_store_64 xaddrX4:$ptr, i64:$val), (STDX g8rc:$val, memrr:$ptr)>; let Predicates = [IsISA3_0] in { +// DARN (deliver random number) +// L=0 for 32-bit, L=1 for conditioned random, L=2 for raw random +def : Pat<(int_ppc_darn32), (EXTRACT_SUBREG (DARN 0), sub_32)>; +def : Pat<(int_ppc_darn), (DARN 1)>; +def : Pat<(int_ppc_darnraw), (DARN 2)>; class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty, InstrItinClass itin, list<dag> pattern> diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 920eeed9d41f..1a34aa09315b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -404,12 +404,14 @@ let isCodeGenOnly = 1 in { Deprecated<DeprecatedDST>; } -def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), - "mfvscr $vD", IIC_LdStStore, - [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; -def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), - "mtvscr $vB", IIC_LdStLoad, - [(int_ppc_altivec_mtvscr v4i32:$vB)]>; +let hasSideEffects = 1 in { + def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins), + "mfvscr $vD", IIC_LdStStore, + [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; + def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB), + "mtvscr $vB", IIC_LdStLoad, + [(int_ppc_altivec_mtvscr v4i32:$vB)]>; +} let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads. 
def LVEBX: XForm_1_memOp<31, 7, (outs vrrc:$vD), (ins memrr:$src), @@ -469,10 +471,11 @@ def VNMSUBFP: VAForm_1<47, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vC, vrrc:$vB), "vnmsubfp $vD, $vA, $vC, $vB", IIC_VecFP, [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC, (fneg v4f32:$vB))))]>; - -def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>; -def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs, - v8i16>; +let hasSideEffects = 1 in { + def VMHADDSHS : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>; + def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs, + v8i16>; +} def VMLADDUHM : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>; } // isCommutable @@ -608,14 +611,16 @@ def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm, v4i32, v16i8, v4i32>; def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm, v4i32, v8i16, v4i32>; -def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs, - v4i32, v8i16, v4i32>; def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm, v4i32, v16i8, v4i32>; def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm, v4i32, v8i16, v4i32>; -def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs, - v4i32, v8i16, v4i32>; +let hasSideEffects = 1 in { + def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs, + v4i32, v8i16, v4i32>; + def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs, + v4i32, v8i16, v4i32>; +} let isCommutable = 1 in { def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb, @@ -665,15 +670,17 @@ def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>; def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>; def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>; -def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>; -def VSUM2SWS: 
VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>; +let hasSideEffects = 1 in { + def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>; + def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>; -def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs, - v4i32, v16i8, v4i32>; -def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs, - v4i32, v8i16, v4i32>; -def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs, - v4i32, v16i8, v4i32>; + def VSUM4SBS: VX1_Int_Ty3<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs, + v4i32, v16i8, v4i32>; + def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs, + v4i32, v8i16, v4i32>; + def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs, + v4i32, v16i8, v4i32>; +} def VNOR : VXForm_1<1284, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vnor $vD, $vA, $vB", IIC_VecFP, @@ -742,26 +749,28 @@ def VSPLTISW : VXForm_3<908, (outs vrrc:$vD), (ins s5imm:$SIMM), // Vector Pack. 
def VPKPX : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx, v8i16, v4i32>; -def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, - v16i8, v8i16>; -def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, - v16i8, v8i16>; -def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, - v8i16, v4i32>; -def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, - v8i16, v4i32>; +let hasSideEffects = 1 in { + def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss, + v16i8, v8i16>; + def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus, + v16i8, v8i16>; + def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss, + v8i16, v4i32>; + def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus, + v8i16, v4i32>; + def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus, + v16i8, v8i16>; + def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus, + v8i16, v4i32>; +} def VPKUHUM : VXForm_1<14, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpkuhum $vD, $vA, $vB", IIC_VecFP, [(set v16i8:$vD, (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>; -def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus, - v16i8, v8i16>; def VPKUWUM : VXForm_1<78, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpkuwum $vD, $vA, $vB", IIC_VecFP, [(set v16i8:$vD, (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>; -def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus, - v8i16, v4i32>; // Vector Unpack. 
def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx, @@ -784,47 +793,47 @@ class VCMP<bits<10> xo, string asmstr, ValueType Ty> : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr, IIC_VecFPCompare, [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>; -class VCMPo<bits<10> xo, string asmstr, ValueType Ty> +class VCMP_rec<bits<10> xo, string asmstr, ValueType Ty> : VXRForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), asmstr, IIC_VecFPCompare, - [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> { + [(set Ty:$vD, (Ty (PPCvcmp_rec Ty:$vA, Ty:$vB, xo)))]> { let Defs = [CR6]; let RC = 1; } // f32 element comparisons.0 def VCMPBFP : VCMP <966, "vcmpbfp $vD, $vA, $vB" , v4f32>; -def VCMPBFP_rec : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>; +def VCMPBFP_rec : VCMP_rec<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>; def VCMPEQFP : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>; -def VCMPEQFP_rec : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>; +def VCMPEQFP_rec : VCMP_rec<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>; def VCMPGEFP : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>; -def VCMPGEFP_rec : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>; +def VCMPGEFP_rec : VCMP_rec<454, "vcmpgefp. $vD, $vA, $vB", v4f32>; def VCMPGTFP : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>; -def VCMPGTFP_rec : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>; +def VCMPGTFP_rec : VCMP_rec<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>; // i8 element comparisons. def VCMPEQUB : VCMP < 6, "vcmpequb $vD, $vA, $vB" , v16i8>; -def VCMPEQUB_rec : VCMPo< 6, "vcmpequb. $vD, $vA, $vB", v16i8>; +def VCMPEQUB_rec : VCMP_rec< 6, "vcmpequb. $vD, $vA, $vB", v16i8>; def VCMPGTSB : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>; -def VCMPGTSB_rec : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>; +def VCMPGTSB_rec : VCMP_rec<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>; def VCMPGTUB : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>; -def VCMPGTUB_rec : VCMPo<518, "vcmpgtub. 
$vD, $vA, $vB", v16i8>; +def VCMPGTUB_rec : VCMP_rec<518, "vcmpgtub. $vD, $vA, $vB", v16i8>; // i16 element comparisons. def VCMPEQUH : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>; -def VCMPEQUH_rec : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>; +def VCMPEQUH_rec : VCMP_rec< 70, "vcmpequh. $vD, $vA, $vB", v8i16>; def VCMPGTSH : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>; -def VCMPGTSH_rec : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>; +def VCMPGTSH_rec : VCMP_rec<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>; def VCMPGTUH : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>; -def VCMPGTUH_rec : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>; +def VCMPGTUH_rec : VCMP_rec<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>; // i32 element comparisons. def VCMPEQUW : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>; -def VCMPEQUW_rec : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>; +def VCMPEQUW_rec : VCMP_rec<134, "vcmpequw. $vD, $vA, $vB", v4i32>; def VCMPGTSW : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>; -def VCMPGTSW_rec : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; +def VCMPGTSW_rec : VCMP_rec<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>; def VCMPGTUW : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>; -def VCMPGTUW_rec : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>; +def VCMPGTUW_rec : VCMP_rec<646, "vcmpgtuw. 
$vD, $vA, $vB", v4i32>; let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in { @@ -933,6 +942,18 @@ def : Pat<(v1i128 (bitconvert (v4i32 VRRC:$src))), (v1i128 VRRC:$src)>; def : Pat<(v1i128 (bitconvert (v4f32 VRRC:$src))), (v1i128 VRRC:$src)>; def : Pat<(v1i128 (bitconvert (v2i64 VRRC:$src))), (v1i128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v16i8 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VRRC:$src))), (f128 VRRC:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VRRC:$src))), (f128 VRRC:$src)>; + +def : Pat<(v16i8 (bitconvert (f128 VRRC:$src))), (v16i8 VRRC:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VRRC:$src))), (v8i16 VRRC:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VRRC:$src))), (v4i32 VRRC:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VRRC:$src))), (v4f32 VRRC:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VRRC:$src))), (v2f64 VRRC:$src)>; + // Max/Min def : Pat<(v16i8 (umax v16i8:$src1, v16i8:$src2)), (v16i8 (VMAXUB $src1, $src2))>; @@ -1291,11 +1312,11 @@ def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), // i64 element comparisons. def VCMPEQUD : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>; -def VCMPEQUD_rec : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>; +def VCMPEQUD_rec : VCMP_rec<199, "vcmpequd. $vD, $vA, $vB", v2i64>; def VCMPGTSD : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>; -def VCMPGTSD_rec : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>; +def VCMPGTSD_rec : VCMP_rec<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>; def VCMPGTUD : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>; -def VCMPGTUD_rec : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>; +def VCMPGTUD_rec : VCMP_rec<711, "vcmpgtud. 
$vD, $vA, $vB", v2i64>; // The cryptography instructions that do not require Category:Vector.Crypto def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb", @@ -1306,20 +1327,22 @@ def VPMSUMW : VX1_Int_Ty<1160, "vpmsumw", int_ppc_altivec_crypto_vpmsumw, v4i32>; def VPMSUMD : VX1_Int_Ty<1224, "vpmsumd", int_ppc_altivec_crypto_vpmsumd, v2i64>; -def VPERMXOR : VA1a_Int_Ty<45, "vpermxor", - int_ppc_altivec_crypto_vpermxor, v16i8>; +def VPERMXOR : VAForm_1<45, (outs vrrc:$VD), (ins vrrc:$VA, vrrc:$VB, vrrc:$VC), + "vpermxor $VD, $VA, $VB, $VC", IIC_VecFP, []>; // Vector doubleword integer pack and unpack. -def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss, - v4i32, v2i64>; -def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus, - v4i32, v2i64>; +let hasSideEffects = 1 in { + def VPKSDSS : VX1_Int_Ty2<1486, "vpksdss", int_ppc_altivec_vpksdss, + v4i32, v2i64>; + def VPKSDUS : VX1_Int_Ty2<1358, "vpksdus", int_ppc_altivec_vpksdus, + v4i32, v2i64>; + def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus, + v4i32, v2i64>; +} def VPKUDUM : VXForm_1<1102, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpkudum $vD, $vA, $vB", IIC_VecFP, [(set v16i8:$vD, (vpkudum_shuffle v16i8:$vA, v16i8:$vB))]>; -def VPKUDUS : VX1_Int_Ty2<1230, "vpkudus", int_ppc_altivec_vpkudus, - v4i32, v2i64>; def VUPKHSW : VX2_Int_Ty2<1614, "vupkhsw", int_ppc_altivec_vupkhsw, v2i64, v4i32>; def VUPKLSW : VX2_Int_Ty2<1742, "vupklsw", int_ppc_altivec_vupklsw, @@ -1363,21 +1386,21 @@ def VMSUMUDM : VA1a_Int_Ty3<35, "vmsumudm", int_ppc_altivec_vmsumudm, // i8 element comparisons. def VCMPNEB : VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>; -def VCMPNEB_rec : VCMPo < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>; +def VCMPNEB_rec : VCMP_rec < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>; def VCMPNEZB : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>; -def VCMPNEZB_rec : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>; +def VCMPNEZB_rec : VCMP_rec<263, "vcmpnezb. $vD, $vA, $vB", v16i8>; // i16 element comparisons. 
def VCMPNEH : VCMP < 71, "vcmpneh $vD, $vA, $vB" , v8i16>; -def VCMPNEH_rec : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>; +def VCMPNEH_rec : VCMP_rec< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>; def VCMPNEZH : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>; -def VCMPNEZH_rec : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>; +def VCMPNEZH_rec : VCMP_rec<327, "vcmpnezh. $vD, $vA, $vB", v8i16>; // i32 element comparisons. def VCMPNEW : VCMP <135, "vcmpnew $vD, $vA, $vB" , v4i32>; -def VCMPNEW_rec : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>; +def VCMPNEW_rec : VCMP_rec<135, "vcmpnew. $vD, $vA, $vB" , v4i32>; def VCMPNEZW : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>; -def VCMPNEZW_rec : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>; +def VCMPNEZW_rec : VCMP_rec<391, "vcmpnezw. $vD, $vA, $vB", v4i32>; // VX-Form: [PO VRT / UIM VRB XO]. // We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent @@ -1449,11 +1472,16 @@ def VCTZD : VX_VT5_EO5_VB5<1538, 31, "vctzd", [(set v2i64:$vD, (cttz v2i64:$vB))]>; // Vector Extend Sign -def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", []>; -def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", []>; -def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", []>; -def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", []>; -def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", []>; +def VEXTSB2W : VX_VT5_EO5_VB5<1538, 16, "vextsb2w", + [(set v4i32:$vD, (int_ppc_altivec_vextsb2w v16i8:$vB))]>; +def VEXTSH2W : VX_VT5_EO5_VB5<1538, 17, "vextsh2w", + [(set v4i32:$vD, (int_ppc_altivec_vextsh2w v8i16:$vB))]>; +def VEXTSB2D : VX_VT5_EO5_VB5<1538, 24, "vextsb2d", + [(set v2i64:$vD, (int_ppc_altivec_vextsb2d v16i8:$vB))]>; +def VEXTSH2D : VX_VT5_EO5_VB5<1538, 25, "vextsh2d", + [(set v2i64:$vD, (int_ppc_altivec_vextsh2d v8i16:$vB))]>; +def VEXTSW2D : VX_VT5_EO5_VB5<1538, 26, "vextsw2d", + [(set v2i64:$vD, (int_ppc_altivec_vextsw2d v4i32:$vB))]>; let isCodeGenOnly = 1 in { def VEXTSB2Ws : VX_VT5_EO5_VB5s<1538, 16, "vextsb2w", []>; def 
VEXTSH2Ws : VX_VT5_EO5_VB5s<1538, 17, "vextsh2w", []>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td index 632d4d9deb8a..646efe64a22c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td @@ -637,12 +637,12 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, } class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin> + InstrItinClass itin, list<dag> pattern> : XForm_17<opcode, xo, OOL, IOL, asmstr, itin > { let FRA = 0; + let Pattern = pattern; } -// Used for QPX class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, itin> { @@ -1781,14 +1781,6 @@ class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = 0; } -// Used for QPX -class AForm_4a<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> { - let FRA = 0; - let FRC = 0; -} - // 1.7.13 M-Form class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> @@ -2099,49 +2091,6 @@ class VX_RD5_RSp5_PS1_XO9<bits<9> xo, dag OOL, dag IOL, string asmstr, let Inst{23-31} = xo; } -// Z23-Form (used by QPX) -class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { - bits<5> FRT; - bits<5> FRA; - bits<5> FRB; - bits<2> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-15} = FRA; - let Inst{16-20} = FRB; - let Inst{21-22} = idx; - let Inst{23-30} = xo; - let Inst{31} = RC; -} - -class Z23Form_2<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : Z23Form_1<opcode, xo, 
OOL, IOL, asmstr, itin, pattern> { - let FRB = 0; -} - -class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, - InstrItinClass itin, list<dag> pattern> - : I<opcode, OOL, IOL, asmstr, itin> { - bits<5> FRT; - bits<12> idx; - - let Pattern = pattern; - - bit RC = 0; // set by isRecordForm - - let Inst{6-10} = FRT; - let Inst{11-22} = idx; - let Inst{23-30} = xo; - let Inst{31} = RC; -} - class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : I<opcode, OOL, IOL, asmstr, itin> { diff --git a/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/llvm/lib/Target/PowerPC/PPCInstrHTM.td index 992ad8216f3b..e59a08774dc5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrHTM.td +++ b/llvm/lib/Target/PowerPC/PPCInstrHTM.td @@ -164,9 +164,8 @@ def : Pat<(int_ppc_tsuspend), (TSR 0)>; def : Pat<(i64 (int_ppc_ttest)), - (RLDICL (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), - (TABORTWCI 0, (LI 0), 0), sub_32)), - 36, 28)>; + (i64 (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), (TABORTWCI 0, (LI 0), 0), sub_32))>; } // [HasHTM] diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 11c97210ead9..9e3c6c569bd7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -21,12 +21,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" @@ -73,6 +76,14 @@ static 
cl::opt<bool> UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden, cl::desc("Use the old (incorrect) instruction latency calculation")); +static cl::opt<float> + FMARPFactor("ppc-fma-rp-factor", cl::Hidden, cl::init(1.5), + cl::desc("register pressure factor for the transformations.")); + +static cl::opt<bool> EnableFMARegPressureReduction( + "ppc-fma-rp-reduction", cl::Hidden, cl::init(true), + cl::desc("enable register pressure reduce in machine combiner pass.")); + // Pin the vtable to this file. void PPCInstrInfo::anchor() {} @@ -259,14 +270,6 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case PPC::XVMULDP: case PPC::XVMULSP: case PPC::XSMULSP: - // QPX Add: - case PPC::QVFADD: - case PPC::QVFADDS: - case PPC::QVFADDSs: - // QPX Multiply: - case PPC::QVFMUL: - case PPC::QVFMULS: - case PPC::QVFMULSs: return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && Inst.getFlag(MachineInstr::MIFlag::FmNsz); // Fixed point: @@ -286,23 +289,23 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { #define InfoArrayIdxFMULInst 2 #define InfoArrayIdxAddOpIdx 3 #define InfoArrayIdxMULOpIdx 4 +#define InfoArrayIdxFSubInst 5 // Array keeps info for FMA instructions: // Index 0(InfoArrayIdxFMAInst): FMA instruction; -// Index 1(InfoArrayIdxFAddInst): ADD instruction assoaicted with FMA; -// Index 2(InfoArrayIdxFMULInst): MUL instruction assoaicted with FMA; +// Index 1(InfoArrayIdxFAddInst): ADD instruction associated with FMA; +// Index 2(InfoArrayIdxFMULInst): MUL instruction associated with FMA; // Index 3(InfoArrayIdxAddOpIdx): ADD operand index in FMA operands; // Index 4(InfoArrayIdxMULOpIdx): first MUL operand index in FMA operands; -// second MUL operand index is plus 1. -static const uint16_t FMAOpIdxInfo[][5] = { +// second MUL operand index is plus 1; +// Index 5(InfoArrayIdxFSubInst): SUB instruction associated with FMA. 
+static const uint16_t FMAOpIdxInfo[][6] = { // FIXME: Add more FMA instructions like XSNMADDADP and so on. - {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2}, - {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2}, - {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2}, - {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2}, - {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}, - {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}, - {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1}, - {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}}; + {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2, PPC::XSSUBDP}, + {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2, PPC::XSSUBSP}, + {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2, PPC::XVSUBDP}, + {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2, PPC::XVSUBSP}, + {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1, PPC::FSUB}, + {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1, PPC::FSUBS}}; // Check if an opcode is a FMA instruction. If it is, return the index in array // FMAOpIdxInfo. Otherwise, return -1. @@ -313,6 +316,8 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const { return -1; } +// On PowerPC target, we have two kinds of patterns related to FMA: +// 1: Improve ILP. // Try to reassociate FMA chains like below: // // Pattern 1: @@ -336,11 +341,35 @@ int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const { // // breaking the dependency between A and B, allowing FMA to be executed in // parallel (or back-to-back in a pipeline) instead of depending on each other. +// +// 2: Reduce register pressure. +// Try to reassociate FMA with FSUB and a constant like below: +// C is a floatint point const. +// +// Pattern 1: +// A = FSUB X, Y (Leaf) +// D = FMA B, C, A (Root) +// --> +// A = FMA B, Y, -C +// D = FMA A, X, C +// +// Pattern 2: +// A = FSUB X, Y (Leaf) +// D = FMA B, A, C (Root) +// --> +// A = FMA B, Y, -C +// D = FMA A, X, C +// +// Before the transformation, A must be assigned with different hardware +// register with D. 
After the transformation, A and D must be assigned with +// same hardware register due to TIE attricute of FMA instructions. +// bool PPCInstrInfo::getFMAPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const { MachineBasicBlock *MBB = Root.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); auto IsAllOpsVirtualReg = [](const MachineInstr &Instr) { for (const auto &MO : Instr.explicit_operands()) @@ -349,16 +378,35 @@ bool PPCInstrInfo::getFMAPatterns( return true; }; - auto IsReassociable = [&](const MachineInstr &Instr, int16_t &AddOpIdx, - bool IsLeaf, bool IsAdd) { - int16_t Idx = -1; - if (!IsAdd) { - Idx = getFMAOpIdxInfo(Instr.getOpcode()); - if (Idx < 0) - return false; - } else if (Instr.getOpcode() != - FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())] - [InfoArrayIdxFAddInst]) + auto IsReassociableAddOrSub = [&](const MachineInstr &Instr, + unsigned OpType) { + if (Instr.getOpcode() != + FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())][OpType]) + return false; + + // Instruction can be reassociated. + // fast math flags may prohibit reassociation. + if (!(Instr.getFlag(MachineInstr::MIFlag::FmReassoc) && + Instr.getFlag(MachineInstr::MIFlag::FmNsz))) + return false; + + // Instruction operands are virtual registers for reassociation. + if (!IsAllOpsVirtualReg(Instr)) + return false; + + // For register pressure reassociation, the FSub must have only one use as + // we want to delete the sub to save its def. 
+ if (OpType == InfoArrayIdxFSubInst && + !MRI->hasOneNonDBGUse(Instr.getOperand(0).getReg())) + return false; + + return true; + }; + + auto IsReassociableFMA = [&](const MachineInstr &Instr, int16_t &AddOpIdx, + int16_t &MulOpIdx, bool IsLeaf) { + int16_t Idx = getFMAOpIdxInfo(Instr.getOpcode()); + if (Idx < 0) return false; // Instruction can be reassociated. @@ -371,65 +419,356 @@ bool PPCInstrInfo::getFMAPatterns( if (!IsAllOpsVirtualReg(Instr)) return false; - if (IsAdd && IsLeaf) + MulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; + if (IsLeaf) return true; AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; const MachineOperand &OpAdd = Instr.getOperand(AddOpIdx); - MachineInstr *MIAdd = MRI.getUniqueVRegDef(OpAdd.getReg()); + MachineInstr *MIAdd = MRI->getUniqueVRegDef(OpAdd.getReg()); // If 'add' operand's def is not in current block, don't do ILP related opt. if (!MIAdd || MIAdd->getParent() != MBB) return false; // If this is not Leaf FMA Instr, its 'add' operand should only have one use // as this fma will be changed later. - return IsLeaf ? true : MRI.hasOneNonDBGUse(OpAdd.getReg()); + return IsLeaf ? true : MRI->hasOneNonDBGUse(OpAdd.getReg()); }; int16_t AddOpIdx = -1; + int16_t MulOpIdx = -1; + + bool IsUsedOnceL = false; + bool IsUsedOnceR = false; + MachineInstr *MULInstrL = nullptr; + MachineInstr *MULInstrR = nullptr; + + auto IsRPReductionCandidate = [&]() { + // Currently, we only support float and double. + // FIXME: add support for other types. + unsigned Opcode = Root.getOpcode(); + if (Opcode != PPC::XSMADDASP && Opcode != PPC::XSMADDADP) + return false; + + // Root must be a valid FMA like instruction. + // Treat it as leaf as we don't care its add operand. 
+ if (IsReassociableFMA(Root, AddOpIdx, MulOpIdx, true)) { + assert((MulOpIdx >= 0) && "mul operand index not right!"); + Register MULRegL = TRI->lookThruSingleUseCopyChain( + Root.getOperand(MulOpIdx).getReg(), MRI); + Register MULRegR = TRI->lookThruSingleUseCopyChain( + Root.getOperand(MulOpIdx + 1).getReg(), MRI); + if (!MULRegL && !MULRegR) + return false; + + if (MULRegL && !MULRegR) { + MULRegR = + TRI->lookThruCopyLike(Root.getOperand(MulOpIdx + 1).getReg(), MRI); + IsUsedOnceL = true; + } else if (!MULRegL && MULRegR) { + MULRegL = + TRI->lookThruCopyLike(Root.getOperand(MulOpIdx).getReg(), MRI); + IsUsedOnceR = true; + } else { + IsUsedOnceL = true; + IsUsedOnceR = true; + } + + if (!Register::isVirtualRegister(MULRegL) || + !Register::isVirtualRegister(MULRegR)) + return false; + + MULInstrL = MRI->getVRegDef(MULRegL); + MULInstrR = MRI->getVRegDef(MULRegR); + return true; + } + return false; + }; + + // Register pressure fma reassociation patterns. + if (DoRegPressureReduce && IsRPReductionCandidate()) { + assert((MULInstrL && MULInstrR) && "wrong register preduction candidate!"); + // Register pressure pattern 1 + if (isLoadFromConstantPool(MULInstrL) && IsUsedOnceR && + IsReassociableAddOrSub(*MULInstrR, InfoArrayIdxFSubInst)) { + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BCA\n"); + Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BCA); + return true; + } + + // Register pressure pattern 2 + if ((isLoadFromConstantPool(MULInstrR) && IsUsedOnceL && + IsReassociableAddOrSub(*MULInstrL, InfoArrayIdxFSubInst))) { + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_BAC\n"); + Patterns.push_back(MachineCombinerPattern::REASSOC_XY_BAC); + return true; + } + } + + // ILP fma reassociation patterns. // Root must be a valid FMA like instruction. 
- if (!IsReassociable(Root, AddOpIdx, false, false)) + AddOpIdx = -1; + if (!IsReassociableFMA(Root, AddOpIdx, MulOpIdx, false)) return false; assert((AddOpIdx >= 0) && "add operand index not right!"); Register RegB = Root.getOperand(AddOpIdx).getReg(); - MachineInstr *Prev = MRI.getUniqueVRegDef(RegB); + MachineInstr *Prev = MRI->getUniqueVRegDef(RegB); // Prev must be a valid FMA like instruction. AddOpIdx = -1; - if (!IsReassociable(*Prev, AddOpIdx, false, false)) + if (!IsReassociableFMA(*Prev, AddOpIdx, MulOpIdx, false)) return false; assert((AddOpIdx >= 0) && "add operand index not right!"); Register RegA = Prev->getOperand(AddOpIdx).getReg(); - MachineInstr *Leaf = MRI.getUniqueVRegDef(RegA); + MachineInstr *Leaf = MRI->getUniqueVRegDef(RegA); AddOpIdx = -1; - if (IsReassociable(*Leaf, AddOpIdx, true, false)) { + if (IsReassociableFMA(*Leaf, AddOpIdx, MulOpIdx, true)) { Patterns.push_back(MachineCombinerPattern::REASSOC_XMM_AMM_BMM); + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XMM_AMM_BMM\n"); return true; } - if (IsReassociable(*Leaf, AddOpIdx, true, true)) { + if (IsReassociableAddOrSub(*Leaf, InfoArrayIdxFAddInst)) { Patterns.push_back(MachineCombinerPattern::REASSOC_XY_AMM_BMM); + LLVM_DEBUG(dbgs() << "add pattern REASSOC_XY_AMM_BMM\n"); return true; } return false; } +void PPCInstrInfo::finalizeInsInstrs( + MachineInstr &Root, MachineCombinerPattern &P, + SmallVectorImpl<MachineInstr *> &InsInstrs) const { + assert(!InsInstrs.empty() && "Instructions set to be inserted is empty!"); + + MachineFunction *MF = Root.getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + + int16_t Idx = getFMAOpIdxInfo(Root.getOpcode()); + if (Idx < 0) + return; + + uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; + + // For now we only need to fix up placeholder for register pressure reduce + // patterns. 
+ Register ConstReg = 0; + switch (P) { + case MachineCombinerPattern::REASSOC_XY_BCA: + ConstReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), MRI); + break; + case MachineCombinerPattern::REASSOC_XY_BAC: + ConstReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx + 1).getReg(), MRI); + break; + default: + // Not register pressure reduce patterns. + return; + } + + MachineInstr *ConstDefInstr = MRI->getVRegDef(ConstReg); + // Get const value from const pool. + const Constant *C = getConstantFromConstantPool(ConstDefInstr); + assert(isa<llvm::ConstantFP>(C) && "not a valid constant!"); + + // Get negative fp const. + APFloat F1((dyn_cast<ConstantFP>(C))->getValueAPF()); + F1.changeSign(); + Constant *NegC = ConstantFP::get(dyn_cast<ConstantFP>(C)->getContext(), F1); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(C->getType()); + + // Put negative fp const into constant pool. + unsigned ConstPoolIdx = MCP->getConstantPoolIndex(NegC, Alignment); + + MachineOperand *Placeholder = nullptr; + // Record the placeholder PPC::ZERO8 we add in reassociateFMA. + for (auto *Inst : InsInstrs) { + for (MachineOperand &Operand : Inst->explicit_operands()) { + assert(Operand.isReg() && "Invalid instruction in InsInstrs!"); + if (Operand.getReg() == PPC::ZERO8) { + Placeholder = &Operand; + break; + } + } + } + + assert(Placeholder && "Placeholder does not exist!"); + + // Generate instructions to load the const fp from constant pool. + // We only support PPC64 and medium code model. + Register LoadNewConst = + generateLoadForNewConst(ConstPoolIdx, &Root, C->getType(), InsInstrs); + + // Fill the placeholder with the new load from constant pool. 
+ Placeholder->setReg(LoadNewConst); +} + +bool PPCInstrInfo::shouldReduceRegisterPressure( + MachineBasicBlock *MBB, RegisterClassInfo *RegClassInfo) const { + + if (!EnableFMARegPressureReduction) + return false; + + // Currently, we only enable register pressure reducing in machine combiner + // for: 1: PPC64; 2: Code Model is Medium; 3: Power9 which also has vector + // support. + // + // So we need following instructions to access a TOC entry: + // + // %6:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, %const.0 + // %7:vssrc = DFLOADf32 target-flags(ppc-toc-lo) %const.0, + // killed %6:g8rc_and_g8rc_nox0, implicit $x2 :: (load 4 from constant-pool) + // + // FIXME: add more supported targets, like Small and Large code model, PPC32, + // AIX. + if (!(Subtarget.isPPC64() && Subtarget.hasP9Vector() && + Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium)) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + auto GetMBBPressure = [&](MachineBasicBlock *MBB) -> std::vector<unsigned> { + RegionPressure Pressure; + RegPressureTracker RPTracker(Pressure); + + // Initialize the register pressure tracker. + RPTracker.init(MBB->getParent(), RegClassInfo, nullptr, MBB, MBB->end(), + /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true); + + for (MachineBasicBlock::iterator MII = MBB->instr_end(), + MIE = MBB->instr_begin(); + MII != MIE; --MII) { + MachineInstr &MI = *std::prev(MII); + if (MI.isDebugValue() || MI.isDebugLabel()) + continue; + RegisterOperands RegOpers; + RegOpers.collect(MI, *TRI, *MRI, false, false); + RPTracker.recedeSkipDebugValues(); + assert(&*RPTracker.getPos() == &MI && "RPTracker sync error!"); + RPTracker.recede(RegOpers); + } + + // Close the RPTracker to finalize live ins. + RPTracker.closeRegion(); + + return RPTracker.getPressure().MaxSetPressure; + }; + + // For now we only care about float and double type fma. 
+ unsigned VSSRCLimit = TRI->getRegPressureSetLimit( + *MBB->getParent(), PPC::RegisterPressureSets::VSSRC); + + // Only reduce register pressure when pressure is high. + return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] > + (float)VSSRCLimit * FMARPFactor; +} + +bool PPCInstrInfo::isLoadFromConstantPool(MachineInstr *I) const { + // I has only one memory operand which is load from constant pool. + if (!I->hasOneMemOperand()) + return false; + + MachineMemOperand *Op = I->memoperands()[0]; + return Op->isLoad() && Op->getPseudoValue() && + Op->getPseudoValue()->kind() == PseudoSourceValue::ConstantPool; +} + +Register PPCInstrInfo::generateLoadForNewConst( + unsigned Idx, MachineInstr *MI, Type *Ty, + SmallVectorImpl<MachineInstr *> &InsInstrs) const { + // Now we only support PPC64, Medium code model and P9 with vector. + // We have immutable pattern to access const pool. See function + // shouldReduceRegisterPressure. + assert((Subtarget.isPPC64() && Subtarget.hasP9Vector() && + Subtarget.getTargetMachine().getCodeModel() == CodeModel::Medium) && + "Target not supported!\n"); + + MachineFunction *MF = MI->getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + // Generate ADDIStocHA8 + Register VReg1 = MRI->createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass); + MachineInstrBuilder TOCOffset = + BuildMI(*MF, MI->getDebugLoc(), get(PPC::ADDIStocHA8), VReg1) + .addReg(PPC::X2) + .addConstantPoolIndex(Idx); + + assert((Ty->isFloatTy() || Ty->isDoubleTy()) && + "Only float and double are supported!"); + + unsigned LoadOpcode; + // Should be float type or double type. 
+ if (Ty->isFloatTy()) + LoadOpcode = PPC::DFLOADf32; + else + LoadOpcode = PPC::DFLOADf64; + + const TargetRegisterClass *RC = MRI->getRegClass(MI->getOperand(0).getReg()); + Register VReg2 = MRI->createVirtualRegister(RC); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, + Ty->getScalarSizeInBits() / 8, MF->getDataLayout().getPrefTypeAlign(Ty)); + + // Generate Load from constant pool. + MachineInstrBuilder Load = + BuildMI(*MF, MI->getDebugLoc(), get(LoadOpcode), VReg2) + .addConstantPoolIndex(Idx) + .addReg(VReg1, getKillRegState(true)) + .addMemOperand(MMO); + + Load->getOperand(1).setTargetFlags(PPCII::MO_TOC_LO); + + // Insert the toc load instructions into InsInstrs. + InsInstrs.insert(InsInstrs.begin(), Load); + InsInstrs.insert(InsInstrs.begin(), TOCOffset); + return VReg2; +} + +// This function returns the const value in constant pool if the \p I is a load +// from constant pool. +const Constant * +PPCInstrInfo::getConstantFromConstantPool(MachineInstr *I) const { + MachineFunction *MF = I->getMF(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + MachineConstantPool *MCP = MF->getConstantPool(); + assert(I->mayLoad() && "Should be a load instruction.\n"); + for (auto MO : I->uses()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (Reg == 0 || !Register::isVirtualRegister(Reg)) + continue; + // Find the toc address. + MachineInstr *DefMI = MRI->getVRegDef(Reg); + for (auto MO2 : DefMI->uses()) + if (MO2.isCPI()) + return (MCP->getConstants())[MO2.getIndex()].Val.ConstVal; + } + return nullptr; +} + bool PPCInstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, + bool DoRegPressureReduce) const { // Using the machine combiner in this way is potentially expensive, so // restrict to when aggressive optimizations are desired. 
if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) return false; - if (getFMAPatterns(Root, Patterns)) + if (getFMAPatterns(Root, Patterns, DoRegPressureReduce)) return true; - return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, + DoRegPressureReduce); } void PPCInstrInfo::genAlternativeCodeSequence( @@ -440,6 +779,8 @@ void PPCInstrInfo::genAlternativeCodeSequence( switch (Pattern) { case MachineCombinerPattern::REASSOC_XY_AMM_BMM: case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + case MachineCombinerPattern::REASSOC_XY_BCA: + case MachineCombinerPattern::REASSOC_XY_BAC: reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); break; default: @@ -450,8 +791,6 @@ void PPCInstrInfo::genAlternativeCodeSequence( } } -// Currently, only handle two patterns REASSOC_XY_AMM_BMM and -// REASSOC_XMM_AMM_BMM. See comments for getFMAPatterns. void PPCInstrInfo::reassociateFMA( MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -459,6 +798,7 @@ void PPCInstrInfo::reassociateFMA( DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); MachineOperand &OpC = Root.getOperand(0); Register RegC = OpC.getReg(); const TargetRegisterClass *RC = MRI.getRegClass(RegC); @@ -468,13 +808,42 @@ void PPCInstrInfo::reassociateFMA( int16_t Idx = getFMAOpIdxInfo(FmaOp); assert(Idx >= 0 && "Root must be a FMA instruction"); + bool IsILPReassociate = + (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) || + (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM); + uint16_t AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; - MachineInstr *Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg()); - MachineInstr 
*Leaf = - MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg()); - uint16_t IntersectedFlags = - Root.getFlags() & Prev->getFlags() & Leaf->getFlags(); + + MachineInstr *Prev = nullptr; + MachineInstr *Leaf = nullptr; + switch (Pattern) { + default: + llvm_unreachable("not recognized pattern!"); + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg()); + Leaf = MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg()); + break; + case MachineCombinerPattern::REASSOC_XY_BAC: { + Register MULReg = + TRI->lookThruCopyLike(Root.getOperand(FirstMulOpIdx).getReg(), &MRI); + Leaf = MRI.getVRegDef(MULReg); + break; + } + case MachineCombinerPattern::REASSOC_XY_BCA: { + Register MULReg = TRI->lookThruCopyLike( + Root.getOperand(FirstMulOpIdx + 1).getReg(), &MRI); + Leaf = MRI.getVRegDef(MULReg); + break; + } + } + + uint16_t IntersectedFlags = 0; + if (IsILPReassociate) + IntersectedFlags = Root.getFlags() & Prev->getFlags() & Leaf->getFlags(); + else + IntersectedFlags = Root.getFlags() & Leaf->getFlags(); auto GetOperandInfo = [&](const MachineOperand &Operand, Register &Reg, bool &KillFlag) { @@ -484,36 +853,51 @@ void PPCInstrInfo::reassociateFMA( }; auto GetFMAInstrInfo = [&](const MachineInstr &Instr, Register &MulOp1, - Register &MulOp2, bool &MulOp1KillFlag, - bool &MulOp2KillFlag) { + Register &MulOp2, Register &AddOp, + bool &MulOp1KillFlag, bool &MulOp2KillFlag, + bool &AddOpKillFlag) { GetOperandInfo(Instr.getOperand(FirstMulOpIdx), MulOp1, MulOp1KillFlag); GetOperandInfo(Instr.getOperand(FirstMulOpIdx + 1), MulOp2, MulOp2KillFlag); + GetOperandInfo(Instr.getOperand(AddOpIdx), AddOp, AddOpKillFlag); }; - Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32; + Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32, RegA11, + RegA21, RegB; bool KillX = false, KillY = false, KillM11 = false, KillM12 = false, - KillM21 = 
false, KillM22 = false, KillM31 = false, KillM32 = false; + KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false, + KillA11 = false, KillA21 = false, KillB = false; - GetFMAInstrInfo(Root, RegM31, RegM32, KillM31, KillM32); - GetFMAInstrInfo(*Prev, RegM21, RegM22, KillM21, KillM22); + GetFMAInstrInfo(Root, RegM31, RegM32, RegB, KillM31, KillM32, KillB); + + if (IsILPReassociate) + GetFMAInstrInfo(*Prev, RegM21, RegM22, RegA21, KillM21, KillM22, KillA21); if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { - GetFMAInstrInfo(*Leaf, RegM11, RegM12, KillM11, KillM12); + GetFMAInstrInfo(*Leaf, RegM11, RegM12, RegA11, KillM11, KillM12, KillA11); GetOperandInfo(Leaf->getOperand(AddOpIdx), RegX, KillX); } else if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { GetOperandInfo(Leaf->getOperand(1), RegX, KillX); GetOperandInfo(Leaf->getOperand(2), RegY, KillY); + } else { + // Get FSUB instruction info. + GetOperandInfo(Leaf->getOperand(1), RegX, KillX); + GetOperandInfo(Leaf->getOperand(2), RegY, KillY); } // Create new virtual registers for the new results instead of // recycling legacy ones because the MachineCombiner's computation of the // critical path requires a new register definition rather than an existing // one. + // For register pressure reassociation, we only need create one virtual + // register for the new fma. 
Register NewVRA = MRI.createVirtualRegister(RC); InstrIdxForVirtReg.insert(std::make_pair(NewVRA, 0)); - Register NewVRB = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1)); + Register NewVRB = 0; + if (IsILPReassociate) { + NewVRB = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1)); + } Register NewVRD = 0; if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { @@ -532,7 +916,11 @@ void PPCInstrInfo::reassociateFMA( MI->getOperand(FirstMulOpIdx + 1).setIsKill(KillRegMul2); }; - if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { + MachineInstrBuilder NewARegPressure, NewCRegPressure; + switch (Pattern) { + default: + llvm_unreachable("not recognized pattern!"); + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: { // Create new instructions for insertion. MachineInstrBuilder MINewB = BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB) @@ -565,7 +953,9 @@ void PPCInstrInfo::reassociateFMA( InsInstrs.push_back(MINewA); InsInstrs.push_back(MINewB); InsInstrs.push_back(MINewC); - } else if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { + break; + } + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: { assert(NewVRD && "new FMA register not created!"); // Create new instructions for insertion. MachineInstrBuilder MINewA = @@ -607,6 +997,47 @@ void PPCInstrInfo::reassociateFMA( InsInstrs.push_back(MINewB); InsInstrs.push_back(MINewD); InsInstrs.push_back(MINewC); + break; + } + case MachineCombinerPattern::REASSOC_XY_BAC: + case MachineCombinerPattern::REASSOC_XY_BCA: { + Register VarReg; + bool KillVarReg = false; + if (Pattern == MachineCombinerPattern::REASSOC_XY_BCA) { + VarReg = RegM31; + KillVarReg = KillM31; + } else { + VarReg = RegM32; + KillVarReg = KillM32; + } + // We don't want to get negative const from memory pool too early, as the + // created entry will not be deleted even if it has no users. 
Since all + // operand of Leaf and Root are virtual register, we use zero register + // here as a placeholder. When the InsInstrs is selected in + // MachineCombiner, we call finalizeInsInstrs to replace the zero register + // with a virtual register which is a load from constant pool. + NewARegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRA) + .addReg(RegB, getKillRegState(RegB)) + .addReg(RegY, getKillRegState(KillY)) + .addReg(PPC::ZERO8); + NewCRegPressure = BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), RegC) + .addReg(NewVRA, getKillRegState(true)) + .addReg(RegX, getKillRegState(KillX)) + .addReg(VarReg, getKillRegState(KillVarReg)); + // For now, we only support xsmaddadp/xsmaddasp, their add operand are + // both at index 1, no need to adjust. + // FIXME: when add more fma instructions support, like fma/fmas, adjust + // the operand index here. + break; + } + } + + if (!IsILPReassociate) { + setSpecialOperandAttr(*NewARegPressure, IntersectedFlags); + setSpecialOperandAttr(*NewCRegPressure, IntersectedFlags); + + InsInstrs.push_back(NewARegPressure); + InsInstrs.push_back(NewCRegPressure); } assert(!InsInstrs.empty() && @@ -614,7 +1045,8 @@ void PPCInstrInfo::reassociateFMA( // Record old instructions for deletion. 
DelInstrs.push_back(Leaf); - DelInstrs.push_back(Prev); + if (IsILPReassociate) + DelInstrs.push_back(Prev); DelInstrs.push_back(&Root); } @@ -666,7 +1098,6 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::LI8: case PPC::LIS: case PPC::LIS8: - case PPC::QVGPCI: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: case PPC::ADDItocL: @@ -683,6 +1114,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case PPC::V_SETALLONES: case PPC::CRSET: case PPC::CRUNSET: + case PPC::XXSETACCZ: return true; } return false; @@ -1283,14 +1715,22 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(31); return; } else if (PPC::CRRCRegClass.contains(SrcReg) && - PPC::G8RCRegClass.contains(DestReg)) { - BuildMI(MBB, I, DL, get(PPC::MFOCRF8), DestReg).addReg(SrcReg); - getKillRegState(KillSrc); - return; - } else if (PPC::CRRCRegClass.contains(SrcReg) && - PPC::GPRCRegClass.contains(DestReg)) { - BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(SrcReg); + (PPC::G8RCRegClass.contains(DestReg) || + PPC::GPRCRegClass.contains(DestReg))) { + bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); + unsigned MvCode = Is64Bit ? PPC::MFOCRF8 : PPC::MFOCRF; + unsigned ShCode = Is64Bit ? PPC::RLWINM8 : PPC::RLWINM; + unsigned CRNum = TRI->getEncodingValue(SrcReg); + BuildMI(MBB, I, DL, get(MvCode), DestReg).addReg(SrcReg); getKillRegState(KillSrc); + if (CRNum == 7) + return; + // Shift the CR bits to make the CR field in the lowest 4 bits of GRC. + BuildMI(MBB, I, DL, get(ShCode), DestReg) + .addReg(DestReg, RegState::Kill) + .addImm(CRNum * 4 + 4) + .addImm(28) + .addImm(31); return; } else if (PPC::G8RCRegClass.contains(SrcReg) && PPC::VSFRCRegClass.contains(DestReg)) { @@ -1343,17 +1783,53 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (PPC::VSFRCRegClass.contains(DestReg, SrcReg) || PPC::VSSRCRegClass.contains(DestReg, SrcReg)) Opc = (Subtarget.hasP9Vector()) ? 
PPC::XSCPSGNDP : PPC::XXLORf; - else if (PPC::QFRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMR; - else if (PPC::QSRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRs; - else if (PPC::QBRCRegClass.contains(DestReg, SrcReg)) - Opc = PPC::QVFMRb; + else if (Subtarget.pairedVectorMemops() && + PPC::VSRpRCRegClass.contains(DestReg, SrcReg)) { + if (SrcReg > PPC::VSRp15) + SrcReg = PPC::V0 + (SrcReg - PPC::VSRp16) * 2; + else + SrcReg = PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2; + if (DestReg > PPC::VSRp15) + DestReg = PPC::V0 + (DestReg - PPC::VSRp16) * 2; + else + DestReg = PPC::VSL0 + (DestReg - PPC::VSRp0) * 2; + BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg). + addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, I, DL, get(PPC::XXLOR), DestReg + 1). + addReg(SrcReg + 1).addReg(SrcReg + 1, getKillRegState(KillSrc)); + return; + } else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg)) Opc = PPC::CROR; else if (PPC::SPERCRegClass.contains(DestReg, SrcReg)) Opc = PPC::EVOR; - else + else if ((PPC::ACCRCRegClass.contains(DestReg) || + PPC::UACCRCRegClass.contains(DestReg)) && + (PPC::ACCRCRegClass.contains(SrcReg) || + PPC::UACCRCRegClass.contains(SrcReg))) { + // If primed, de-prime the source register, copy the individual registers + // and prime the destination if needed. The vector subregisters are + // vs[(u)acc * 4] - vs[(u)acc * 4 + 3]. If the copy is not a kill and the + // source is primed, we need to re-prime it after the copy as well. + PPCRegisterInfo::emitAccCopyInfo(MBB, DestReg, SrcReg); + bool DestPrimed = PPC::ACCRCRegClass.contains(DestReg); + bool SrcPrimed = PPC::ACCRCRegClass.contains(SrcReg); + MCRegister VSLSrcReg = + PPC::VSL0 + (SrcReg - (SrcPrimed ? PPC::ACC0 : PPC::UACC0)) * 4; + MCRegister VSLDestReg = + PPC::VSL0 + (DestReg - (DestPrimed ? 
PPC::ACC0 : PPC::UACC0)) * 4; + if (SrcPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMFACC), SrcReg).addReg(SrcReg); + for (unsigned Idx = 0; Idx < 4; Idx++) + BuildMI(MBB, I, DL, get(PPC::XXLOR), VSLDestReg + Idx) + .addReg(VSLSrcReg + Idx) + .addReg(VSLSrcReg + Idx, getKillRegState(KillSrc)); + if (DestPrimed) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), DestReg).addReg(DestReg); + if (SrcPrimed && !KillSrc) + BuildMI(MBB, I, DL, get(PPC::XXMTACC), SrcReg).addReg(SrcReg); + return; + } else llvm_unreachable("Impossible reg-to-reg copy"); const MCInstrDesc &MCID = get(Opc); @@ -1364,7 +1840,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc)); } -static unsigned getSpillIndex(const TargetRegisterClass *RC) { +unsigned PPCInstrInfo::getSpillIndex(const TargetRegisterClass *RC) const { int OpcodeIndex = 0; if (PPC::GPRCRegClass.hasSubClassEq(RC) || @@ -1391,16 +1867,20 @@ static unsigned getSpillIndex(const TargetRegisterClass *RC) { OpcodeIndex = SOK_VectorFloat8Spill; } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_VectorFloat4Spill; - } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_VRSaveSpill; - } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat8Spill; - } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadFloat4Spill; - } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) { - OpcodeIndex = SOK_QuadBitSpill; } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) { OpcodeIndex = SOK_SpillToVSR; + } else if (PPC::ACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_AccumulatorSpill; + } else if (PPC::UACCRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_UAccumulatorSpill; + } else if 
(PPC::VSRpRCRegClass.hasSubClassEq(RC)) { + assert(Subtarget.pairedVectorMemops() && + "Register unexpected when paired memops are disabled."); + OpcodeIndex = SOK_PairedVecSpill; } else { llvm_unreachable("Unknown regclass!"); } @@ -1437,9 +1917,6 @@ void PPCInstrInfo::StoreRegToStackSlot( PPC::CRBITRCRegClass.hasSubClassEq(RC)) FuncInfo->setSpillsCR(); - if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) - FuncInfo->setSpillsVRSAVE(); - if (isXFormMemOp(Opcode)) FuncInfo->setHasNonRISpills(); } @@ -1495,9 +1972,6 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, PPC::CRBITRCRegClass.hasSubClassEq(RC)) FuncInfo->setSpillsCR(); - if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) - FuncInfo->setSpillsVRSAVE(); - if (isXFormMemOp(Opcode)) FuncInfo->setHasNonRISpills(); } @@ -1667,6 +2141,17 @@ bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const { return false; } +bool PPCInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Set MFFS and MTFSF as scheduling boundary to avoid unexpected code motion + // across them, since some FP operations may change content of FPSCR. + // TODO: Model FPSCR in PPC instruction definitions and remove the workaround + if (MI.getOpcode() == PPC::MFFS || MI.getOpcode() == PPC::MTFSF) + return true; + return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); +} + bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { unsigned OpC = MI.getOpcode(); @@ -1675,6 +2160,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, bool isPPC64 = Subtarget.isPPC64(); MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZLR8 : PPC::BDNZLR) : (isPPC64 ? PPC::BDZLR8 : PPC::BDZLR))); + // Need add Def and Use for CTR implicit operand. 
+ MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(Pred[1].getReg(), RegState::Implicit) + .addReg(Pred[1].getReg(), RegState::ImplicitDefine); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MI.setDesc(get(PPC::BCLR)); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); @@ -1694,6 +2183,10 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, bool isPPC64 = Subtarget.isPPC64(); MI.setDesc(get(Pred[0].getImm() ? (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) : (isPPC64 ? PPC::BDZ8 : PPC::BDZ))); + // Need add Def and Use for CTR implicit operand. + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(Pred[1].getReg(), RegState::Implicit) + .addReg(Pred[1].getReg(), RegState::ImplicitDefine); } else if (Pred[0].getImm() == PPC::PRED_BIT_SET) { MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); MI.RemoveOperand(0); @@ -1734,19 +2227,24 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8 : PPC::BCCTR8) : (setLR ? PPC::BCCTRL : PPC::BCCTR))); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); - return true; } else if (Pred[0].getImm() == PPC::PRED_BIT_UNSET) { MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCTRL8n : PPC::BCCTR8n) : (setLR ? PPC::BCCTRLn : PPC::BCCTRn))); MachineInstrBuilder(*MI.getParent()->getParent(), MI).add(Pred[1]); - return true; + } else { + MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) + : (setLR ? PPC::BCCCTRL : PPC::BCCCTR))); + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addImm(Pred[0].getImm()) + .add(Pred[1]); } - MI.setDesc(get(isPPC64 ? (setLR ? PPC::BCCCTRL8 : PPC::BCCCTR8) - : (setLR ? PPC::BCCCTRL : PPC::BCCCTR))); - MachineInstrBuilder(*MI.getParent()->getParent(), MI) - .addImm(Pred[0].getImm()) - .add(Pred[1]); + // Need add Def and Use for LR implicit operand. + if (setLR) + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(isPPC64 ? 
PPC::LR8 : PPC::LR, RegState::Implicit) + .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine); + return true; } @@ -1784,8 +2282,9 @@ bool PPCInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, return false; } -bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI, - std::vector<MachineOperand> &Pred) const { +bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI, + std::vector<MachineOperand> &Pred, + bool SkipDead) const { // Note: At the present time, the contents of Pred from this function is // unused by IfConversion. This implementation follows ARM by pushing the // CR-defining operand. Because the 'DZ' and 'DNZ' count as types of @@ -2071,6 +2570,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (NewOpC == -1) return false; + // This transformation should not be performed if `nsw` is missing and is not + // `equalityOnly` comparison. Since if there is overflow, sub_lt, sub_gt in + // CRReg do not reflect correct order. If `equalityOnly` is true, sub_eq in + // CRReg can reflect if compared values are equal, this optz is still valid. + if (!equalityOnly && (NewOpC == PPC::SUBF_rec || NewOpC == PPC::SUBF8_rec) && + Sub && !Sub->getFlag(MachineInstr::NoSWrap)) + return false; + // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP // needs to be updated to be based on SUB. Push the condition code // operands to OperandsToUpdate. 
If it is safe to remove CmpInstr, the @@ -2221,6 +2728,112 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return true; } +bool PPCInstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { + const MachineOperand *BaseOp; + OffsetIsScalable = false; + if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI)) + return false; + BaseOps.push_back(BaseOp); + return true; +} + +static bool isLdStSafeToCluster(const MachineInstr &LdSt, + const TargetRegisterInfo *TRI) { + // If this is a volatile load/store, don't mess with it. + if (LdSt.hasOrderedMemoryRef() || LdSt.getNumExplicitOperands() != 3) + return false; + + if (LdSt.getOperand(2).isFI()) + return true; + + assert(LdSt.getOperand(2).isReg() && "Expected a reg operand."); + // Can't cluster if the instruction modifies the base register + // or it is update form. e.g. ld r2,3(r2) + if (LdSt.modifiesRegister(LdSt.getOperand(2).getReg(), TRI)) + return false; + + return true; +} + +// Only cluster instruction pair that have the same opcode, and they are +// clusterable according to PowerPC specification. +static bool isClusterableLdStOpcPair(unsigned FirstOpc, unsigned SecondOpc, + const PPCSubtarget &Subtarget) { + switch (FirstOpc) { + default: + return false; + case PPC::STD: + case PPC::STFD: + case PPC::STXSD: + case PPC::DFSTOREf64: + return FirstOpc == SecondOpc; + // PowerPC backend has opcode STW/STW8 for instruction "stw" to deal with + // 32bit and 64bit instruction selection. They are clusterable pair though + // they are different opcode. 
+ case PPC::STW: + case PPC::STW8: + return SecondOpc == PPC::STW || SecondOpc == PPC::STW8; + } +} + +bool PPCInstrInfo::shouldClusterMemOps( + ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, + unsigned NumBytes) const { + + assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); + const MachineOperand &BaseOp1 = *BaseOps1.front(); + const MachineOperand &BaseOp2 = *BaseOps2.front(); + assert((BaseOp1.isReg() || BaseOp1.isFI()) && + "Only base registers and frame indices are supported."); + + // The NumLoads means the number of loads that has been clustered. + // Don't cluster memory op if there are already two ops clustered at least. + if (NumLoads > 2) + return false; + + // Cluster the load/store only when they have the same base + // register or FI. + if ((BaseOp1.isReg() != BaseOp2.isReg()) || + (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) || + (BaseOp1.isFI() && BaseOp1.getIndex() != BaseOp2.getIndex())) + return false; + + // Check if the load/store are clusterable according to the PowerPC + // specification. + const MachineInstr &FirstLdSt = *BaseOp1.getParent(); + const MachineInstr &SecondLdSt = *BaseOp2.getParent(); + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + // Cluster the load/store only when they have the same opcode, and they are + // clusterable opcode according to PowerPC specification. + if (!isClusterableLdStOpcPair(FirstOpc, SecondOpc, Subtarget)) + return false; + + // Can't cluster load/store that have ordered or volatile memory reference. 
+ if (!isLdStSafeToCluster(FirstLdSt, TRI) || + !isLdStSafeToCluster(SecondLdSt, TRI)) + return false; + + int64_t Offset1 = 0, Offset2 = 0; + unsigned Width1 = 0, Width2 = 0; + const MachineOperand *Base1 = nullptr, *Base2 = nullptr; + if (!getMemOperandWithOffsetWidth(FirstLdSt, Base1, Offset1, Width1, TRI) || + !getMemOperandWithOffsetWidth(SecondLdSt, Base2, Offset2, Width2, TRI) || + Width1 != Width2) + return false; + + assert(Base1 == &BaseOp1 && Base2 == &BaseOp2 && + "getMemOperandWithOffsetWidth return incorrect base op"); + // The caller should already have ordered FirstMemOp/SecondMemOp by offset. + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + Width1 == Offset2; +} + /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. /// @@ -2270,7 +2883,14 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_PLT, "ppc-plt"}, {MO_PIC_FLAG, "ppc-pic"}, {MO_PCREL_FLAG, "ppc-pcrel"}, - {MO_GOT_FLAG, "ppc-got"}}; + {MO_GOT_FLAG, "ppc-got"}, + {MO_PCREL_OPT_FLAG, "ppc-opt-pcrel"}, + {MO_TLSGD_FLAG, "ppc-tlsgd"}, + {MO_TLSLD_FLAG, "ppc-tlsld"}, + {MO_TPREL_FLAG, "ppc-tprel"}, + {MO_GOT_TLSGD_PCREL_FLAG, "ppc-got-tlsgd-pcrel"}, + {MO_GOT_TLSLD_PCREL_FLAG, "ppc-got-tlsld-pcrel"}, + {MO_GOT_TPREL_PCREL_FLAG, "ppc-got-tprel-pcrel"}}; return makeArrayRef(TargetFlags); } @@ -2351,6 +2971,31 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { auto DL = MI.getDebugLoc(); switch (MI.getOpcode()) { + case PPC::BUILD_UACC: { + MCRegister ACC = MI.getOperand(0).getReg(); + MCRegister UACC = MI.getOperand(1).getReg(); + if (ACC - PPC::ACC0 != UACC - PPC::UACC0) { + MCRegister SrcVSR = PPC::VSL0 + (UACC - PPC::UACC0) * 4; + MCRegister DstVSR = PPC::VSL0 + (ACC - PPC::ACC0) * 4; + // FIXME: This can easily be improved to look up to the top of the MBB + // to see if the inputs are XXLOR's. 
If they are and SrcReg is killed, + // we can just re-target any such XXLOR's to DstVSR + offset. + for (int VecNo = 0; VecNo < 4; VecNo++) + BuildMI(MBB, MI, DL, get(PPC::XXLOR), DstVSR + VecNo) + .addReg(SrcVSR + VecNo) + .addReg(SrcVSR + VecNo); + } + // BUILD_UACC is expanded to 4 copies of the underlying vsx regisers. + // So after building the 4 copies, we can replace the BUILD_UACC instruction + // with a NOP. + LLVM_FALLTHROUGH; + } + case PPC::KILL_PAIR: { + MI.setDesc(get(PPC::UNENCODED_NOP)); + MI.RemoveOperand(1); + MI.RemoveOperand(0); + return true; + } case TargetOpcode::LOAD_STACK_GUARD: { assert(Subtarget.isTargetLinux() && "Only Linux target is expected to contain LOAD_STACK_GUARD"); @@ -2642,7 +3287,10 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( } unsigned PPCInstrInfo::getSpillTarget() const { - return Subtarget.hasP9Vector() ? 1 : 0; + // With P10, we may need to spill paired vector registers or accumulator + // registers. MMA implies paired vectors, so we can just check that. + bool IsP10Variant = Subtarget.isISA3_1() || Subtarget.pairedVectorMemops(); + return IsP10Variant ? 2 : Subtarget.hasP9Vector() ? 1 : 0; } const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const { @@ -2653,13 +3301,35 @@ const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const { return LoadSpillOpcodesArray[getSpillTarget()]; } -void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, +void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr *StartMI, MachineInstr *EndMI, unsigned RegNo) const { + // Conservatively clear kill flag for the register if the instructions are in + // different basic blocks and in SSA form, because the kill flag may no longer + // be right. There is no need to bother with dead flags since defs with no + // uses will be handled by DCE. 
+ MachineRegisterInfo &MRI = StartMI->getParent()->getParent()->getRegInfo(); + if (MRI.isSSA() && (StartMI->getParent() != EndMI->getParent())) { + MRI.clearKillFlags(RegNo); + return; + } // Instructions between [StartMI, EndMI] should be in same basic block. - assert((StartMI.getParent() == EndMI.getParent()) && + assert((StartMI->getParent() == EndMI->getParent()) && "Instructions are not in same basic block"); + // If before RA, StartMI may be def through COPY, we need to adjust it to the + // real def. See function getForwardingDefMI. + if (MRI.isSSA()) { + bool Reads, Writes; + std::tie(Reads, Writes) = StartMI->readsWritesVirtualRegister(RegNo); + if (!Reads && !Writes) { + assert(Register::isVirtualRegister(RegNo) && + "Must be a virtual register"); + // Get real def and ignore copies. + StartMI = MRI.getVRegDef(RegNo); + } + } + bool IsKillSet = false; auto clearOperandKillInfo = [=] (MachineInstr &MI, unsigned Index) { @@ -2672,21 +3342,21 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, // Set killed flag for EndMI. // No need to do anything if EndMI defines RegNo. int UseIndex = - EndMI.findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); + EndMI->findRegisterUseOperandIdx(RegNo, false, &getRegisterInfo()); if (UseIndex != -1) { - EndMI.getOperand(UseIndex).setIsKill(true); + EndMI->getOperand(UseIndex).setIsKill(true); IsKillSet = true; // Clear killed flag for other EndMI operands related to RegNo. In some // upexpected cases, killed may be set multiple times for same register // operand in same MI. - for (int i = 0, e = EndMI.getNumOperands(); i != e; ++i) + for (int i = 0, e = EndMI->getNumOperands(); i != e; ++i) if (i != UseIndex) - clearOperandKillInfo(EndMI, i); + clearOperandKillInfo(*EndMI, i); } // Walking the inst in reverse order (EndMI -> StartMI]. 
- MachineBasicBlock::reverse_iterator It = EndMI; - MachineBasicBlock::reverse_iterator E = EndMI.getParent()->rend(); + MachineBasicBlock::reverse_iterator It = *EndMI; + MachineBasicBlock::reverse_iterator E = EndMI->getParent()->rend(); // EndMI has been handled above, skip it here. It++; MachineOperand *MO = nullptr; @@ -2712,13 +3382,13 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, } else if ((MO = It->findRegisterDefOperand(RegNo, false, true, &getRegisterInfo()))) { // No use found, set dead for its def. - assert(&*It == &StartMI && "No new def between StartMI and EndMI."); + assert(&*It == StartMI && "No new def between StartMI and EndMI."); MO->setIsDead(true); break; } } - if ((&*It) == &StartMI) + if ((&*It) == StartMI) break; } // Ensure RegMo liveness is killed after EndMI. @@ -3011,6 +3681,143 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } +bool PPCInstrInfo::combineRLWINM(MachineInstr &MI, + MachineInstr **ToErase) const { + MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo(); + unsigned FoldingReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(FoldingReg)) + return false; + MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); + if (SrcMI->getOpcode() != PPC::RLWINM && + SrcMI->getOpcode() != PPC::RLWINM_rec && + SrcMI->getOpcode() != PPC::RLWINM8 && + SrcMI->getOpcode() != PPC::RLWINM8_rec) + return false; + assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && + MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && + SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && + "Invalid PPC::RLWINM Instruction!"); + uint64_t SHSrc = SrcMI->getOperand(2).getImm(); + uint64_t SHMI = MI.getOperand(2).getImm(); + uint64_t MBSrc = SrcMI->getOperand(3).getImm(); + uint64_t MBMI = MI.getOperand(3).getImm(); + uint64_t MESrc = SrcMI->getOperand(4).getImm(); + uint64_t MEMI = MI.getOperand(4).getImm(); + + assert((MEMI < 32 && MESrc < 
32 && MBMI < 32 && MBSrc < 32) && + "Invalid PPC::RLWINM Instruction!"); + // If MBMI is bigger than MEMI, we always can not get run of ones. + // RotatedSrcMask non-wrap: + // 0........31|32........63 + // RotatedSrcMask: B---E B---E + // MaskMI: -----------|--E B------ + // Result: ----- --- (Bad candidate) + // + // RotatedSrcMask wrap: + // 0........31|32........63 + // RotatedSrcMask: --E B----|--E B---- + // MaskMI: -----------|--E B------ + // Result: --- -----|--- ----- (Bad candidate) + // + // One special case is RotatedSrcMask is a full set mask. + // RotatedSrcMask full: + // 0........31|32........63 + // RotatedSrcMask: ------EB---|-------EB--- + // MaskMI: -----------|--E B------ + // Result: -----------|--- ------- (Good candidate) + + // Mark special case. + bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31); + + // For other MBMI > MEMI cases, just return. + if ((MBMI > MEMI) && !SrcMaskFull) + return false; + + // Handle MBMI <= MEMI cases. + APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI); + // In MI, we only need low 32 bits of SrcMI, just consider about low 32 + // bit of SrcMI mask. Note that in APInt, lowerest bit is at index 0, + // while in PowerPC ISA, lowerest bit is at index 63. + APInt MaskSrc = APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc); + + APInt RotatedSrcMask = MaskSrc.rotl(SHMI); + APInt FinalMask = RotatedSrcMask & MaskMI; + uint32_t NewMB, NewME; + bool Simplified = false; + + // If final mask is 0, MI result should be 0 too. + if (FinalMask.isNullValue()) { + bool Is64Bit = + (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec); + Simplified = true; + LLVM_DEBUG(dbgs() << "Replace Instr: "); + LLVM_DEBUG(MI.dump()); + + if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { + // Replace MI with "LI 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.RemoveOperand(2); + MI.getOperand(1).ChangeToImmediate(0); + MI.setDesc(get(Is64Bit ? 
PPC::LI8 : PPC::LI)); + } else { + // Replace MI with "ANDI_rec reg, 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.getOperand(2).setImm(0); + MI.setDesc(get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); + } + + LLVM_DEBUG(dbgs() << "With: "); + LLVM_DEBUG(MI.dump()); + + } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, NewME) && + NewMB <= NewME) || + SrcMaskFull) { + // Here we only handle MBMI <= MEMI case, so NewMB must be no bigger + // than NewME. Otherwise we get a 64 bit value after folding, but MI + // return a 32 bit value. + Simplified = true; + LLVM_DEBUG(dbgs() << "Converting Instr: "); + LLVM_DEBUG(MI.dump()); + + uint16_t NewSH = (SHSrc + SHMI) % 32; + MI.getOperand(2).setImm(NewSH); + // If SrcMI mask is full, no need to update MBMI and MEMI. + if (!SrcMaskFull) { + MI.getOperand(3).setImm(NewMB); + MI.getOperand(4).setImm(NewME); + } + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); + + LLVM_DEBUG(dbgs() << "To: "); + LLVM_DEBUG(MI.dump()); + } + if (Simplified & MRI->use_nodbg_empty(FoldingReg) && + !SrcMI->hasImplicitDef()) { + // If FoldingReg has no non-debug use and it has no implicit def (it + // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI. + // Otherwise keep it. 
+ *ToErase = SrcMI; + LLVM_DEBUG(dbgs() << "Delete dead instruction: "); + LLVM_DEBUG(SrcMI->dump()); + } + return Simplified; +} + bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, ImmInstrInfo &III, bool PostRA) const { // The vast majority of the instructions would need their operand 2 replaced @@ -3732,6 +4539,20 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, } return false; } + case PPC::SUBFIC: + case PPC::SUBFIC8: { + // Only transform this if the CARRY implicit operand is dead. + if (MI.getNumOperands() > 3 && !MI.getOperand(3).isDead()) + return false; + int64_t Minuend = MI.getOperand(2).getImm(); + if (isInt<16>(Minuend - SExtImm)) { + ReplaceWithLI = true; + Is64BitLI = Opc == PPC::SUBFIC8; + NewImm = Minuend - SExtImm; + break; + } + return false; + } case PPC::RLDICL: case PPC::RLDICL_rec: case PPC::RLDICL_32: @@ -3849,7 +4670,7 @@ bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI, // ForwardingOperandReg = LI imm1 // y = op2 imm2, ForwardingOperandReg(killed) if (IsForwardingOperandKilled) - fixupIsDeadOrKill(DefMI, MI, ForwardingOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardingOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -3941,9 +4762,9 @@ bool PPCInstrInfo::transformToNewImmFormFedByAdd( // Update kill flag if (RegMO->isKill() || IsKilledFor(RegMO->getReg())) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); } LLVM_DEBUG(dbgs() << "With:\n"); @@ -4054,12 +4875,12 @@ bool PPCInstrInfo::transformToImmFormFedByAdd( // x = ADD reg(killed), imm // y = XOP 0, x if (IsFwdFeederRegKilled || RegMO->isKill()) - fixupIsDeadOrKill(DefMI, MI, RegMO->getReg()); + fixupIsDeadOrKill(&DefMI, &MI, RegMO->getReg()); // Pattern 3: // ForwardKilledOperandReg = ADD reg, imm // y 
= XOP 0, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); LLVM_DEBUG(dbgs() << "With:\n"); LLVM_DEBUG(MI.dump()); @@ -4215,7 +5036,7 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, // ForwardKilledOperandReg = LI imm // y = XOP reg, ForwardKilledOperandReg(killed) if (ForwardKilledOperandReg != ~0U) - fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg); + fixupIsDeadOrKill(&DefMI, &MI, ForwardKilledOperandReg); return true; } @@ -4618,13 +5439,15 @@ MachineInstr *PPCInstrInfo::findLoopInstr( bool PPCInstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { - if (!LdSt.mayLoadOrStore()) + if (!LdSt.mayLoadOrStore() || LdSt.getNumExplicitOperands() != 3) return false; // Handle only loads/stores with base register followed by immediate offset. - if (LdSt.getNumExplicitOperands() != 3) + if (!LdSt.getOperand(1).isImm() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI())) return false; - if (!LdSt.getOperand(1).isImm() || !LdSt.getOperand(2).isReg()) + if (!LdSt.getOperand(1).isImm() || + (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI())) return false; if (!LdSt.hasOneMemOperand()) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d98597f48340..c6ef1742b722 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -122,61 +122,73 @@ enum SpillOpcodeKey { SOK_VSXVectorSpill, SOK_VectorFloat8Spill, SOK_VectorFloat4Spill, - SOK_VRSaveSpill, - SOK_QuadFloat8Spill, - SOK_QuadFloat4Spill, - SOK_QuadBitSpill, SOK_SpillToVSR, + SOK_PairedVecSpill, + SOK_AccumulatorSpill, + SOK_UAccumulatorSpill, SOK_SPESpill, SOK_LastOpcodeSpill // This must be last on the enum. 
}; // Define list of load and store spill opcodes. +#define NoInstr PPC::INSTRUCTION_LIST_END #define Pwr8LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \ - PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \ - PPC::SPILLTOVSR_LD, PPC::EVLDD \ + PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, PPC::EVLDD \ } #define Pwr9LoadOpcodes \ { \ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ - PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \ - PPC::QVLFDXb, PPC::SPILLTOVSR_LD \ + PPC::DFLOADf32, PPC::SPILLTOVSR_LD, NoInstr, NoInstr, NoInstr, NoInstr \ + } + +#define Pwr10LoadOpcodes \ + { \ + PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \ + PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \ + PPC::DFLOADf32, PPC::SPILLTOVSR_LD, PPC::LXVP, PPC::RESTORE_ACC, \ + PPC::RESTORE_UACC, NoInstr \ } #define Pwr8StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ - PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \ - PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \ - PPC::EVSTDD \ + PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, PPC::EVSTDD \ } #define Pwr9StoreOpcodes \ { \ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ - PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \ - PPC::SPILLTOVSR_ST \ + PPC::SPILLTOVSR_ST, NoInstr, NoInstr, NoInstr, NoInstr \ + } + +#define Pwr10StoreOpcodes \ + { \ + PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \ + PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \ + PPC::SPILLTOVSR_ST, PPC::STXVP, PPC::SPILL_ACC, PPC::SPILL_UACC, \ + NoInstr \ } // Initialize arrays for 
load and store spill opcodes on supported subtargets. #define StoreOpcodesForSpill \ - { Pwr8StoreOpcodes, Pwr9StoreOpcodes } + { Pwr8StoreOpcodes, Pwr9StoreOpcodes, Pwr10StoreOpcodes } #define LoadOpcodesForSpill \ - { Pwr8LoadOpcodes, Pwr9LoadOpcodes } + { Pwr8LoadOpcodes, Pwr9LoadOpcodes, Pwr10LoadOpcodes } class PPCSubtarget; class PPCInstrInfo : public PPCGenInstrInfo { PPCSubtarget &Subtarget; const PPCRegisterInfo RI; - const unsigned StoreSpillOpcodesArray[2][SOK_LastOpcodeSpill] = + const unsigned StoreSpillOpcodesArray[3][SOK_LastOpcodeSpill] = StoreOpcodesForSpill; - const unsigned LoadSpillOpcodesArray[2][SOK_LastOpcodeSpill] = + const unsigned LoadSpillOpcodesArray[3][SOK_LastOpcodeSpill] = LoadOpcodesForSpill; void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill, @@ -234,11 +246,17 @@ class PPCInstrInfo : public PPCGenInstrInfo { unsigned getSpillTarget() const; const unsigned *getStoreOpcodesForSpillArray() const; const unsigned *getLoadOpcodesForSpillArray() const; + unsigned getSpillIndex(const TargetRegisterClass *RC) const; int16_t getFMAOpIdxInfo(unsigned Opcode) const; void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const; + bool isLoadFromConstantPool(MachineInstr *I) const; + Register + generateLoadForNewConst(unsigned Idx, MachineInstr *MI, Type *Ty, + SmallVectorImpl<MachineInstr *> &InsInstrs) const; + const Constant *getConstantFromConstantPool(MachineInstr *I) const; virtual void anchor(); protected: @@ -273,10 +291,10 @@ public: } static bool isSameClassPhysRegCopy(unsigned Opcode) { - unsigned CopyOpcodes[] = - { PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf, - PPC::XSCPSGNDP, PPC::MCRF, PPC::QVFMR, PPC::QVFMRs, PPC::QVFMRb, - PPC::CROR, PPC::EVOR, -1U }; + unsigned CopyOpcodes[] = {PPC::OR, PPC::OR8, PPC::FMR, + PPC::VOR, PPC::XXLOR, 
PPC::XXLORf, + PPC::XSCPSGNDP, PPC::MCRF, PPC::CROR, + PPC::EVOR, -1U}; for (int i = 0; CopyOpcodes[i] != -1U; i++) if (Opcode == CopyOpcodes[i]) return true; @@ -330,14 +348,29 @@ public: /// chain ending in \p Root. All potential patterns are output in the \p /// P array. bool getFMAPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &P) const; + SmallVectorImpl<MachineCombinerPattern> &P, + bool DoRegPressureReduce) const; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are /// output in the <Pattern> array. - bool getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &P) const override; + bool getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &P, + bool DoRegPressureReduce) const override; + + /// On PowerPC, we leverage machine combiner pass to reduce register pressure + /// when the register pressure is high for one BB. + /// Return true if register pressure for \p MBB is high and ABI is supported + /// to reduce register pressure. Otherwise return false. + bool + shouldReduceRegisterPressure(MachineBasicBlock *MBB, + RegisterClassInfo *RegClassInfo) const override; + + /// Fixup the placeholders we put in genAlternativeCodeSequence() for + /// MachineCombiner. + void + finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P, + SmallVectorImpl<MachineInstr *> &InsInstrs) const override; bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; @@ -470,14 +503,18 @@ public: // Predication support. 
bool isPredicated(const MachineInstr &MI) const override; + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + bool PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const override; bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, ArrayRef<MachineOperand> Pred2) const override; - bool DefinesPredicate(MachineInstr &MI, - std::vector<MachineOperand> &Pred) const override; + bool ClobbersPredicate(MachineInstr &MI, std::vector<MachineOperand> &Pred, + bool SkipDead) const override; // Comparison optimization. @@ -497,6 +534,20 @@ public: int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; + /// Get the base operand and byte offset of an instruction that reads/writes + /// memory. + bool getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, + SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, + bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const override; + + /// Returns true if the two given memory operations should be scheduled + /// adjacent. 
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, + unsigned NumLoads, unsigned NumBytes) const override; + /// Return true if two MIs access different memory addresses and false /// otherwise bool @@ -554,6 +605,7 @@ public: bool convertToImmediateForm(MachineInstr &MI, MachineInstr **KilledDef = nullptr) const; bool foldFrameOffset(MachineInstr &MI) const; + bool combineRLWINM(MachineInstr &MI, MachineInstr **ToErase = nullptr) const; bool isADDIInstrEligibleForFolding(MachineInstr &ADDIMI, int64_t &Imm) const; bool isADDInstrEligibleForFolding(MachineInstr &ADDMI) const; bool isImmInstrEligibleForFolding(MachineInstr &MI, unsigned &BaseReg, @@ -565,15 +617,21 @@ public: int64_t OffsetImm) const; /// Fixup killed/dead flag for register \p RegNo between instructions [\p - /// StartMI, \p EndMI]. Some PostRA transformations may violate register - /// killed/dead flags semantics, this function can be called to fix up. Before - /// calling this function, + /// StartMI, \p EndMI]. Some pre-RA or post-RA transformations may violate + /// register killed/dead flags semantics, this function can be called to fix + /// up. Before calling this function, /// 1. Ensure that \p RegNo liveness is killed after instruction \p EndMI. /// 2. Ensure that there is no new definition between (\p StartMI, \p EndMI) - /// and possible definition for \p RegNo is \p StartMI or \p EndMI. - /// 3. Ensure that all instructions between [\p StartMI, \p EndMI] are in same - /// basic block. - void fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, + /// and possible definition for \p RegNo is \p StartMI or \p EndMI. For + /// pre-RA cases, definition may be \p StartMI through COPY, \p StartMI + /// will be adjust to true definition. + /// 3. We can do accurate fixup for the case when all instructions between + /// [\p StartMI, \p EndMI] are in same basic block. + /// 4. 
For the case when \p StartMI and \p EndMI are not in same basic block, + /// we conservatively clear kill flag for all uses of \p RegNo for pre-RA + /// and for post-RA, we give an assertion as without reaching definition + /// analysis post-RA, \p StartMI and \p EndMI are hard to keep right. + void fixupIsDeadOrKill(MachineInstr *StartMI, MachineInstr *EndMI, unsigned RegNo) const; void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const; void replaceInstrOperandWithImm(MachineInstr &MI, unsigned OpNo, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 673ab63039cf..724af23542d7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -74,6 +74,9 @@ def SDT_PPCcondbr : SDTypeProfile<0, 3, [ SDTCisVT<0, i32>, SDTCisVT<2, OtherVT> ]>; +def SDT_PPCFtsqrt : SDTypeProfile<1, 1, [ + SDTCisVT<0, i32>]>; + def SDT_PPClbrx : SDTypeProfile<1, 2, [ SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT> ]>; @@ -124,6 +127,8 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; +def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; +def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; @@ -134,6 +139,28 @@ def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", + SDTFPUnaryOp, [SDNPHasChain]>; +def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", + SDTFPUnaryOp, [SDNPHasChain]>; +def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", + SDTFPRoundOp, [SDNPHasChain]>; +def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", + SDTFPRoundOp, [SDNPHasChain]>; + 
+def PPCany_fcfid : PatFrags<(ops node:$op), + [(PPCfcfid node:$op), + (PPCstrict_fcfid node:$op)]>; +def PPCany_fcfidu : PatFrags<(ops node:$op), + [(PPCfcfidu node:$op), + (PPCstrict_fcfidu node:$op)]>; +def PPCany_fcfids : PatFrags<(ops node:$op), + [(PPCfcfids node:$op), + (PPCstrict_fcfids node:$op)]>; +def PPCany_fcfidus : PatFrags<(ops node:$op), + [(PPCfcfidus node:$op), + (PPCstrict_fcfidus node:$op)]>; + def PPCcv_fp_to_uint_in_vsr: SDNode<"PPCISD::FP_TO_UINT_IN_VSR", SDT_PPCcv_fp_to_int, []>; def PPCcv_fp_to_sint_in_vsr: @@ -160,7 +187,12 @@ def PPCmffs : SDNode<"PPCISD::MFFS", // Perform FADD in round-to-zero mode. def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; +def PPCstrict_faddrtz: SDNode<"PPCISD::STRICT_FADDRTZ", SDTFPBinOp, + [SDNPHasChain]>; +def PPCany_faddrtz: PatFrags<(ops node:$lhs, node:$rhs), + [(PPCfaddrtz node:$lhs, node:$rhs), + (PPCstrict_faddrtz node:$lhs, node:$rhs)]>; def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. @@ -195,6 +227,7 @@ def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR", SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>; def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; +def PPCpaddiDtprel : SDNode<"PPCISD::PADDI_DTPREL", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; @@ -203,16 +236,6 @@ def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; -def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>; -def PPCqvgpci : SDNode<"PPCISD::QVGPCI", SDT_PPCqvgpci, []>; -def PPCqvaligni : SDNode<"PPCISD::QVALIGNI", SDT_PPCqvaligni, []>; -def PPCqvesplati : SDNode<"PPCISD::QVESPLATI", SDT_PPCqvesplati, []>; - -def PPCqbflt : SDNode<"PPCISD::QBFLT", SDT_PPCqbflt, []>; - -def PPCqvlfsb 
: SDNode<"PPCISD::QVLFSb", SDT_PPCqvlfsb, - [SDNPHasChain, SDNPMayLoad]>; - def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift @@ -225,6 +248,28 @@ def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>; def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>; +def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ", + SDTFPUnaryOp, [SDNPHasChain]>; +def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ", + SDTFPUnaryOp, [SDNPHasChain]>; +def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ", + SDTFPUnaryOp, [SDNPHasChain]>; +def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ", + SDTFPUnaryOp, [SDNPHasChain]>; + +def PPCany_fctidz : PatFrags<(ops node:$op), + [(PPCstrict_fctidz node:$op), + (PPCfctidz node:$op)]>; +def PPCany_fctiwz : PatFrags<(ops node:$op), + [(PPCstrict_fctiwz node:$op), + (PPCfctiwz node:$op)]>; +def PPCany_fctiduz : PatFrags<(ops node:$op), + [(PPCstrict_fctiduz node:$op), + (PPCfctiduz node:$op)]>; +def PPCany_fctiwuz : PatFrags<(ops node:$op), + [(PPCstrict_fctiwuz node:$op), + (PPCfctiwuz node:$op)]>; + // Move 2 i64 values into a VSX register def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128", SDTypeProfile<1, 2, @@ -295,7 +340,7 @@ def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc, [SDNPHasChain, SDNPSideEffect]>; def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; -def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>; +def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, [SDNPHasChain, SDNPOptInGlue]>; @@ -327,6 +372,10 @@ def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>; // PC Relative Specific Nodes def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>; +def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", + SDTIntUnaryOp, []>; +def PPCtlslocalexecmataddr : 
SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", + SDTIntUnaryOp, []>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -446,37 +495,41 @@ def imm64ZExt32 : Operand<i64>, ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; -// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require +// This is a somewhat weaker condition than actually checking for 4-byte +// alignment. It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 4 (i.e. the requirements for DS-Form +// instructions). +// But some r+i load/store instructions (such as LD, STD, LDU, etc.) that require // restricted memrix (4-aligned) constants are alignment sensitive. If these // offsets are hidden behind TOC entries than the values of the lower-order // bits cannot be checked directly. As a result, we need to also incorporate // an alignment check into the relevant patterns. 
-def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 4; +def DSFormLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 4) || cast<LoadSDNode>(N)->getAlignment() >= 4; }]>; -def aligned4store : PatFrag<(ops node:$val, node:$ptr), +def DSFormStore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAlignment() >= 4; + return isOffsetMultipleOf(N, 4) || cast<StoreSDNode>(N)->getAlignment() >= 4; }]>; -def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 4; +def DSFormSextLoadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return isOffsetMultipleOf(N, 4) || cast<LoadSDNode>(N)->getAlignment() >= 4; }]>; -def aligned4pre_store : PatFrag< +def DSFormPreStore : PatFrag< (ops node:$val, node:$base, node:$offset), (pre_store node:$val, node:$base, node:$offset), [{ - return cast<StoreSDNode>(N)->getAlignment() >= 4; + return isOffsetMultipleOf(N, 4) || cast<StoreSDNode>(N)->getAlignment() >= 4; }]>; -def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() < 4; +def NonDSFormLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4); }]>; -def unaligned4store : PatFrag<(ops node:$val, node:$ptr), +def NonDSFormStore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAlignment() < 4; + return cast<StoreSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4); }]>; -def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() < 4; +def NonDSFormSextLoadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() < 4 && !isOffsetMultipleOf(N, 4); }]>; // This is 
a somewhat weaker condition than actually checking for 16-byte @@ -617,6 +670,7 @@ def PPCU1ImmAsmOperand : AsmOperandClass { def u1imm : Operand<i32> { let PrintMethod = "printU1ImmOperand"; let ParserMatchClass = PPCU1ImmAsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU2ImmAsmOperand : AsmOperandClass { @@ -626,6 +680,7 @@ def PPCU2ImmAsmOperand : AsmOperandClass { def u2imm : Operand<i32> { let PrintMethod = "printU2ImmOperand"; let ParserMatchClass = PPCU2ImmAsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCATBitsAsHintAsmOperand : AsmOperandClass { @@ -635,6 +690,7 @@ def PPCATBitsAsHintAsmOperand : AsmOperandClass { def atimm : Operand<i32> { let PrintMethod = "printATBitsAsHint"; let ParserMatchClass = PPCATBitsAsHintAsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU3ImmAsmOperand : AsmOperandClass { @@ -644,6 +700,7 @@ def PPCU3ImmAsmOperand : AsmOperandClass { def u3imm : Operand<i32> { let PrintMethod = "printU3ImmOperand"; let ParserMatchClass = PPCU3ImmAsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU4ImmAsmOperand : AsmOperandClass { @@ -653,6 +710,7 @@ def PPCU4ImmAsmOperand : AsmOperandClass { def u4imm : Operand<i32> { let PrintMethod = "printU4ImmOperand"; let ParserMatchClass = PPCU4ImmAsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCS5ImmAsmOperand : AsmOperandClass { let Name = "S5Imm"; let PredicateMethod = "isS5Imm"; @@ -662,6 +720,7 @@ def s5imm : Operand<i32> { let PrintMethod = "printS5ImmOperand"; let ParserMatchClass = PPCS5ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<5>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU5ImmAsmOperand : AsmOperandClass { let Name = "U5Imm"; let PredicateMethod = "isU5Imm"; @@ -671,6 +730,7 @@ def u5imm : Operand<i32> { let PrintMethod = "printU5ImmOperand"; let ParserMatchClass = PPCU5ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<5>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU6ImmAsmOperand : AsmOperandClass { 
let Name = "U6Imm"; let PredicateMethod = "isU6Imm"; @@ -680,6 +740,7 @@ def u6imm : Operand<i32> { let PrintMethod = "printU6ImmOperand"; let ParserMatchClass = PPCU6ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<6>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU7ImmAsmOperand : AsmOperandClass { let Name = "U7Imm"; let PredicateMethod = "isU7Imm"; @@ -689,6 +750,7 @@ def u7imm : Operand<i32> { let PrintMethod = "printU7ImmOperand"; let ParserMatchClass = PPCU7ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<7>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU8ImmAsmOperand : AsmOperandClass { let Name = "U8Imm"; let PredicateMethod = "isU8Imm"; @@ -698,6 +760,7 @@ def u8imm : Operand<i32> { let PrintMethod = "printU8ImmOperand"; let ParserMatchClass = PPCU8ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<8>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU10ImmAsmOperand : AsmOperandClass { let Name = "U10Imm"; let PredicateMethod = "isU10Imm"; @@ -707,6 +770,7 @@ def u10imm : Operand<i32> { let PrintMethod = "printU10ImmOperand"; let ParserMatchClass = PPCU10ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<10>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU12ImmAsmOperand : AsmOperandClass { let Name = "U12Imm"; let PredicateMethod = "isU12Imm"; @@ -716,6 +780,7 @@ def u12imm : Operand<i32> { let PrintMethod = "printU12ImmOperand"; let ParserMatchClass = PPCU12ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<12>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCS16ImmAsmOperand : AsmOperandClass { let Name = "S16Imm"; let PredicateMethod = "isS16Imm"; @@ -726,6 +791,7 @@ def s16imm : Operand<i32> { let EncoderMethod = "getImm16Encoding"; let ParserMatchClass = PPCS16ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCU16ImmAsmOperand : AsmOperandClass { let Name = "U16Imm"; let PredicateMethod = "isU16Imm"; @@ -736,6 +802,7 @@ def u16imm : Operand<i32> { 
let EncoderMethod = "getImm16Encoding"; let ParserMatchClass = PPCU16ImmAsmOperand; let DecoderMethod = "decodeUImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCS17ImmAsmOperand : AsmOperandClass { let Name = "S17Imm"; let PredicateMethod = "isS17Imm"; @@ -749,6 +816,7 @@ def s17imm : Operand<i32> { let EncoderMethod = "getImm16Encoding"; let ParserMatchClass = PPCS17ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<16>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCS34ImmAsmOperand : AsmOperandClass { let Name = "S34Imm"; @@ -757,9 +825,17 @@ def PPCS34ImmAsmOperand : AsmOperandClass { } def s34imm : Operand<i64> { let PrintMethod = "printS34ImmOperand"; - let EncoderMethod = "getImm34Encoding"; + let EncoderMethod = "getImm34EncodingNoPCRel"; let ParserMatchClass = PPCS34ImmAsmOperand; let DecoderMethod = "decodeSImmOperand<34>"; + let OperandType = "OPERAND_IMMEDIATE"; +} +def s34imm_pcrel : Operand<i64> { + let PrintMethod = "printS34ImmOperand"; + let EncoderMethod = "getImm34EncodingPCRel"; + let ParserMatchClass = PPCS34ImmAsmOperand; + let DecoderMethod = "decodeSImmOperand<34>"; + let OperandType = "OPERAND_IMMEDIATE"; } def PPCImmZeroAsmOperand : AsmOperandClass { let Name = "ImmZero"; @@ -770,6 +846,7 @@ def immZero : Operand<i32> { let PrintMethod = "printImmZeroOperand"; let ParserMatchClass = PPCImmZeroAsmOperand; let DecoderMethod = "decodeImmZeroOperand"; + let OperandType = "OPERAND_IMMEDIATE"; } def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; @@ -915,40 +992,47 @@ def memri : Operand<iPTR> { let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getMemRIEncoding"; let DecoderMethod = "decodeMemRIOperands"; + let OperandType = "OPERAND_MEMORY"; } def memrr : Operand<iPTR> { let PrintMethod = "printMemRegReg"; let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg); + let OperandType = "OPERAND_MEMORY"; } def memrix : Operand<iPTR> { // memri where the imm is 
4-aligned. let PrintMethod = "printMemRegImm"; let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getMemRIXEncoding"; let DecoderMethod = "decodeMemRIXOperands"; + let OperandType = "OPERAND_MEMORY"; } def memrix16 : Operand<iPTR> { // memri, imm is 16-aligned, 12-bit, Inst{16:27} let PrintMethod = "printMemRegImm"; let MIOperandInfo = (ops dispRIX16:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getMemRIX16Encoding"; let DecoderMethod = "decodeMemRIX16Operands"; + let OperandType = "OPERAND_MEMORY"; } def spe8dis : Operand<iPTR> { // SPE displacement where the imm is 8-aligned. let PrintMethod = "printMemRegImm"; let MIOperandInfo = (ops dispSPE8:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getSPE8DisEncoding"; let DecoderMethod = "decodeSPE8Operands"; + let OperandType = "OPERAND_MEMORY"; } def spe4dis : Operand<iPTR> { // SPE displacement where the imm is 4-aligned. let PrintMethod = "printMemRegImm"; let MIOperandInfo = (ops dispSPE4:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getSPE4DisEncoding"; let DecoderMethod = "decodeSPE4Operands"; + let OperandType = "OPERAND_MEMORY"; } def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned. let PrintMethod = "printMemRegImm"; let MIOperandInfo = (ops dispSPE2:$imm, ptr_rc_nor0:$reg); let EncoderMethod = "getSPE2DisEncoding"; let DecoderMethod = "decodeSPE2Operands"; + let OperandType = "OPERAND_MEMORY"; } // A single-register address. This is used with the SjLj @@ -956,6 +1040,7 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned. // G8RC_NOX0 registers. def memr : Operand<iPTR> { let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg); + let OperandType = "OPERAND_MEMORY"; } def PPCTLSRegOperand : AsmOperandClass { let Name = "TLSReg"; let PredicateMethod = "isTLSReg"; @@ -981,11 +1066,13 @@ def pred : Operand<OtherVT> { // Define PowerPC specific addressing mode. 
// d-form -def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" +def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; // "stb" // ds-form -def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" // dq-form -def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" +// 8LS:d-form +def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34", [], []>; // "pstxvp" // Below forms are all x-form addressing mode, use three different ones so we // can make a accurate check for x-form instructions in ISEL. @@ -1031,6 +1118,11 @@ def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">; def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">; def HasFPU : Predicate<"Subtarget->hasFPU()">; def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; +def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; + +// AIX assembler may not be modern enough to support some extended mne. +def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, + AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>; //===----------------------------------------------------------------------===// // PowerPC Multiclass Definitions. 
@@ -1389,10 +1481,7 @@ def ADJCALLSTACKUP : PPCEmitTimePseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2 "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } - -def UPDATE_VRSAVE : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$rS), - "UPDATE_VRSAVE $rD, $rS", []>; -} +} // hasCtrlDep let Defs = [R1], Uses = [R1] in def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", @@ -1406,9 +1495,14 @@ def PROBED_ALLOCA_32 : PPCCustomInserterPseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_32", [(set i32:$result, (PPCprobedalloca i32:$negsize, iaddr:$fpsi))]>; -def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp, - gprc:$sp), +def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs + gprc:$fp, gprc:$actual_negsize), (ins gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>; +def PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32 : PPCEmitTimePseudo<(outs + gprc:$fp, gprc:$actual_negsize), + (ins gprc:$negsize, memri:$fpsi), + "#PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32", []>, + RegConstraint<"$actual_negsize = $negsize">; def PROBED_STACKALLOC_32 : PPCEmitTimePseudo<(outs gprc:$scratch, gprc:$temp), (ins i64imm:$stacksize), "#PROBED_STACKALLOC_32", []>; @@ -1513,6 +1607,9 @@ def SETRNDi : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins u2imm:$RND), def SETRND : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins gprc:$in), "#SETRND", [(set f64:$FRT, (int_ppc_setrnd gprc :$in))]>; + +def SETFLM : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FLM), + "#SETFLM", [(set f64:$FRT, (int_ppc_setflm f8rc:$FLM))]>; } let Defs = [LR] in @@ -1562,11 +1659,12 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { def BCn : BForm_4<16, 4, 0, 0, (outs), (ins crbitrc:$bi, condbrtarget:$dst), "bc 4, $bi, $dst">; - let isReturn = 1, Uses = [LR, RM] in + let isReturn = 1, Uses = [LR, RM] in { def BCLR : XLForm_2_br2<19, 16, 12, 0, (outs), (ins crbitrc:$bi), "bclr 12, 
$bi, 0", IIC_BrB, []>; def BCLRn : XLForm_2_br2<19, 16, 4, 0, (outs), (ins crbitrc:$bi), "bclr 4, $bi, 0", IIC_BrB, []>; + } } let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in { @@ -1838,7 +1936,7 @@ def DCBZL : DCB_Form<1014, 1, (outs), (ins memrr:$dst), "dcbzl $dst", IIC_LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>, PPC970_DGroup_Single; -def DCBF : DCB_Form_hint<86, (outs), (ins u5imm:$TH, memrr:$dst), +def DCBF : DCB_Form_hint<86, (outs), (ins u3imm:$TH, memrr:$dst), "dcbf $dst, $TH", IIC_LdStDCBF, []>, PPC970_DGroup_Single; @@ -2373,7 +2471,7 @@ let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst), "stmw $rS, $dst", IIC_LdStLMW, []>; -def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L), +def SYNC : XForm_24_sync<31, 598, (outs), (ins u2imm:$L), "sync $L", IIC_LdStSync, []>; let isCodeGenOnly = 1 in { @@ -2568,37 +2666,26 @@ let isCompare = 1, hasSideEffects = 0 in { } } let PPC970_Unit = 3, Predicates = [HasFPU] in { // FPU Operations. 
-//def FCMPO : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB), -// "fcmpo $crD, $fA, $fB", IIC_FPCompare>; -let isCompare = 1, hasSideEffects = 0 in { +let isCompare = 1, mayRaiseFPException = 1, hasSideEffects = 0 in { def FCMPUS : XForm_17<63, 0, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), "fcmpu $crD, $fA, $fB", IIC_FPCompare>; - let Interpretation64Bit = 1, isCodeGenOnly = 1 in - def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), - "fcmpu $crD, $fA, $fB", IIC_FPCompare>; + def FCMPOS : XForm_17<63, 32, (outs crrc:$crD), (ins f4rc:$fA, f4rc:$fB), + "fcmpo $crD, $fA, $fB", IIC_FPCompare>; + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def FCMPUD : XForm_17<63, 0, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "fcmpu $crD, $fA, $fB", IIC_FPCompare>; + def FCMPOD : XForm_17<63, 32, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), + "fcmpo $crD, $fA, $fB", IIC_FPCompare>; + } } def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB), "ftdiv $crD, $fA, $fB", IIC_FPCompare>; def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB), - "ftsqrt $crD, $fB", IIC_FPCompare>; - -let Uses = [RM], mayRaiseFPException = 1 in { - let hasSideEffects = 0 in { - defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiw", "$frD, $frB", IIC_FPGeneral, - []>; - defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiwu", "$frD, $frB", IIC_FPGeneral, - []>; - defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), - "fctiwz", "$frD, $frB", IIC_FPGeneral, - [(set f64:$frD, (PPCfctiwz f64:$frB))]>; - - defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), - "frsp", "$frD, $frB", IIC_FPGeneral, - [(set f32:$frD, (any_fpround f64:$frB))]>; + "ftsqrt $crD, $fB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$fB))]>; +let mayRaiseFPException = 1, hasSideEffects = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins 
f8rc:$frB), "frin", "$frD, $frB", IIC_FPGeneral, @@ -2606,9 +2693,7 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB), "frin", "$frD, $frB", IIC_FPGeneral, [(set f32:$frD, (any_fround f32:$frB))]>; - } - let hasSideEffects = 0 in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB), "frip", "$frD, $frB", IIC_FPGeneral, @@ -2630,6 +2715,22 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB), "frim", "$frD, $frB", IIC_FPGeneral, [(set f32:$frD, (any_ffloor f32:$frB))]>; +} + +let Uses = [RM], mayRaiseFPException = 1, hasSideEffects = 0 in { + defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiw", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWU : XForm_26r<63, 142, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwu", "$frD, $frB", IIC_FPGeneral, + []>; + defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB), + "fctiwz", "$frD, $frB", IIC_FPGeneral, + [(set f64:$frD, (PPCany_fctiwz f64:$frB))]>; + + defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB), + "frsp", "$frD, $frB", IIC_FPGeneral, + [(set f32:$frD, (any_fpround f64:$frB))]>; defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB), "fsqrt", "$frD, $frB", IIC_FPSqrtD, @@ -2637,9 +2738,10 @@ let Uses = [RM], mayRaiseFPException = 1 in { defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB), "fsqrts", "$frD, $frB", IIC_FPSqrtS, [(set f32:$frD, (any_fsqrt f32:$frB))]>; - } - } } +} + +def : Pat<(PPCfsqrt f64:$frA), (FSQRT $frA)>; /// Note that FMR is defined as pseudo-ops on the PPC970 because they are /// often coalesced away and we don't want the dispatch group builder to think @@ -2684,6 +2786,7 @@ defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB), [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>; // Reciprocal estimates. 
+let mayRaiseFPException = 1 in { defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB), "fre", "$frD, $frB", IIC_FPGeneral, [(set f64:$frD, (PPCfre f64:$frB))]>; @@ -2697,6 +2800,7 @@ defm FRSQRTES : XForm_26r<59, 26, (outs f4rc:$frD), (ins f4rc:$frB), "frsqrtes", "$frD, $frB", IIC_FPGeneral, [(set f32:$frD, (PPCfrsqrte f32:$frB))]>; } +} // XL-Form instructions. condition register logical ops. // @@ -2857,18 +2961,6 @@ let isCodeGenOnly = 1 in { def : InstAlias<"mtvrsave $rS", (MTVRSAVE gprc:$rS)>; def : InstAlias<"mfvrsave $rS", (MFVRSAVE gprc:$rS)>; -// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register, -// so we'll need to scavenge a register for it. -let mayStore = 1 in -def SPILL_VRSAVE : PPCEmitTimePseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F), - "#SPILL_VRSAVE", []>; - -// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously -// spilled), so we'll need to scavenge a register for it. -let mayLoad = 1 in -def RESTORE_VRSAVE : PPCEmitTimePseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), - "#RESTORE_VRSAVE", []>; - let hasSideEffects = 0 in { // mtocrf's input needs to be prepared by shifting by an amount dependent // on the cr register selected. Thus, post-ra anti-dep breaking must not @@ -2908,20 +3000,24 @@ def : InstAlias<"mtcr $rA", (MTCRF 255, gprc:$rA)>; let Predicates = [HasFPU] in { // Custom inserter instruction to perform FADD in round-to-zero mode. -let Uses = [RM] in { +let Uses = [RM], mayRaiseFPException = 1 in { def FADDrtz: PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB), "", - [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>; + [(set f64:$FRT, (PPCany_faddrtz f64:$FRA, f64:$FRB))]>; } // The above pseudo gets expanded to make use of the following instructions // to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level. 
-let Uses = [RM], Defs = [RM] in { - def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), - "mtfsb0 $FM", IIC_IntMTFSB0, []>, - PPC970_DGroup_Single, PPC970_Unit_FPU; - def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), - "mtfsb1 $FM", IIC_IntMTFSB0, []>, - PPC970_DGroup_Single, PPC970_Unit_FPU; + +// When FM is 30/31, we are setting the 62/63 bit of FPSCR, the implicit-def +// RM should be set. +def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM), + "mtfsb0 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; +def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM), + "mtfsb1 $FM", IIC_IntMTFSB0, []>, + PPC970_DGroup_Single, PPC970_Unit_FPU; + +let Defs = [RM] in { let isCodeGenOnly = 1 in def MTFSFb : XFLForm<63, 711, (outs), (ins i32imm:$FM, f8rc:$rT), "mtfsf $FM, $rT", IIC_IntMTFSB0, []>, @@ -3060,7 +3156,7 @@ def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC_rec gprc:$rA, gprc:$rC, gprc:$rB)> // this type. // let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations. -let Uses = [RM] in { +let mayRaiseFPException = 1, Uses = [RM] in { let isCommutable = 1 in { defm FMADD : AForm_1r<63, 29, (outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB), @@ -3246,9 +3342,13 @@ def : Pat<(PPCcall (i32 texternalsym:$dst)), // Calls for AIX only def : Pat<(PPCcall (i32 mcsym:$dst)), (BL mcsym:$dst)>; + def : Pat<(PPCcall_nop (i32 mcsym:$dst)), (BL_NOP mcsym:$dst)>; +def : Pat<(PPCcall_nop (i32 texternalsym:$dst)), + (BL_NOP texternalsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; @@ -3258,7 +3358,7 @@ def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm), def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm), (TCRETURNri CTRRC:$dst, imm:$imm)>; - +def : Pat<(int_ppc_readflm), (MFFS)>; // Hi and Lo for Darwin Global Addresses. 
def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>; @@ -3412,7 +3512,7 @@ def : Pat<(f64 (extloadf32 iaddr:$src)), def : Pat<(f64 (extloadf32 xaddr:$src)), (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>; -def : Pat<(f64 (fpextend f32:$src)), +def : Pat<(f64 (any_fpextend f32:$src)), (COPY_TO_REGCLASS $src, F8RC)>; } @@ -3452,7 +3552,6 @@ include "PPCInstrAltivec.td" include "PPCInstrSPE.td" include "PPCInstr64Bit.td" include "PPCInstrVSX.td" -include "PPCInstrQPX.td" include "PPCInstrHTM.td" def crnot : OutPatFrag<(ops node:$in), @@ -3836,6 +3935,7 @@ def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETGT)), def : Pat<(i1 (setcc i64:$s1, i64:$s2, SETEQ)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; +let Predicates = [IsNotISA3_1] in { // Instantiations of CRNotPat for i32. defm : CRNotPat<(i1 (setcc i32:$s1, immZExt16:$imm, SETUGE)), (EXTRACT_SUBREG (CMPLWI $s1, imm:$imm), sub_lt)>; @@ -3893,106 +3993,62 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETLE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_gt)>; defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)), (EXTRACT_SUBREG (CMPD $s1, $s2), sub_eq)>; +} -let Predicates = [HasFPU] in { -// Instantiations of CRNotPat for f32. -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f32:$s1, f32:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; - -// Instantiations of CRNotPat for f64. 
-defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETGE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETULE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETLE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETUNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETNE)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f64:$s1, f64:$s2, SETO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; - -// Instantiations of CRNotPat for f128. -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETGE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETULE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETLE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETUNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETNE)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -defm : CRNotPat<(i1 (setcc f128:$s1, f128:$s2, SETO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +multiclass FSetCCPat<SDNode SetCC, ValueType Ty, PatLeaf FCmp> { + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : CRNotPat<(i1 (SetCC 
Ty:$s1, Ty:$s2, SETUNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; + + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; } -// SETCC for f32. let Predicates = [HasFPU] in { -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOLT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETLT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOGT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETGT)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETEQ)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f32:$s1, f32:$s2, SETUO)), - (EXTRACT_SUBREG (FCMPUS $s1, $s2), sub_un)>; +// FCMPU: If either of the operands is a Signaling NaN, then VXSNAN is set. +// SETCC for f32. +defm : FSetCCPat<any_fsetcc, f32, FCMPUS>; // SETCC for f64. 
-def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOLT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETLT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOGT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETGT)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETOEQ)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETEQ)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f64:$s1, f64:$s2, SETUO)), - (EXTRACT_SUBREG (FCMPUD $s1, $s2), sub_un)>; +defm : FSetCCPat<any_fsetcc, f64, FCMPUD>; // SETCC for f128. -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOLT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETLT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_lt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOGT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETGT)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_gt)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETOEQ)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETEQ)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_eq)>; -def : Pat<(i1 (setcc f128:$s1, f128:$s2, SETUO)), - (EXTRACT_SUBREG (XSCMPUQP $s1, $s2), sub_un)>; +defm : FSetCCPat<any_fsetcc, f128, XSCMPUQP>; + +// FCMPO: If either of the operands is a Signaling NaN, then VXSNAN is set and, +// if neither operand is a Signaling NaN but at least one operand is a Quiet NaN, +// then VXVC is set. +// SETCCS for f32. +defm : FSetCCPat<strict_fsetccs, f32, FCMPOS>; + +// SETCCS for f64. +defm : FSetCCPat<strict_fsetccs, f64, FCMPOD>; +// SETCCS for f128. 
+defm : FSetCCPat<strict_fsetccs, f128, XSCMPOQP>; } // This must be in this file because it relies on patterns defined in this file @@ -4261,7 +4317,7 @@ def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins), def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src), "icbi $src", IIC_LdStICBI, []>; -def WAIT : XForm_24_sync<31, 30, (outs), (ins i32imm:$L), +def WAIT : XForm_24_sync<31, 30, (outs), (ins u2imm:$L), "wait $L", IIC_LdStLoad, []>; def MBAR : XForm_mbar<31, 854, (outs), (ins u5imm:$MO), @@ -4279,7 +4335,7 @@ def MTSRIN: XForm_srin<31, 242, (outs), (ins gprc:$RS, gprc:$RB), def MFSRIN: XForm_srin<31, 659, (outs gprc:$RS), (ins gprc:$RB), "mfsrin $RS, $RB", IIC_SprMFSR>; -def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L), +def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, u1imm:$L), "mtmsr $RS, $L", IIC_SprMTMSR>; def WRTEE: XForm_mtmsr<31, 131, (outs), (ins gprc:$RS), @@ -4308,15 +4364,17 @@ def : InstAlias<"iccci", (ICCCI R0, R0)>, Requires<[IsPPC4xx]>; def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins), "mfmsr $RT", IIC_SprMFMSR, []>; -def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L), +def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, u1imm:$L), "mtmsrd $RS, $L", IIC_SprMTMSRD>; def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA), "mcrfs $BF, $BFA", IIC_BrMCR>; +// If W is 0 and BF is 7, the 60:63 bits will be set, we should set the +// implicit-def RM. def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), "mtfsfi $BF, $U, $W", IIC_IntMFFS>; - +let Defs = [CR1] in def MTFSFI_rec : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm; @@ -4324,12 +4382,15 @@ def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>; def : InstAlias<"mtfsfi. 
$BF, $U", (MTFSFI_rec crrc:$BF, i32imm:$U, 0)>; let Predicates = [HasFPU] in { +let Defs = [RM] in { def MTFSF : XFLForm_1<63, 711, (outs), - (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), + (ins i32imm:$FLM, f8rc:$FRB, u1imm:$L, i32imm:$W), "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>; +let Defs = [CR1] in def MTFSF_rec : XFLForm_1<63, 711, (outs), - (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), + (ins i32imm:$FLM, f8rc:$FRB, u1imm:$L, i32imm:$W), "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isRecordForm; +} def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>; def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSF_rec i32imm:$FLM, f8rc:$FRB, 0, 0)>; @@ -4556,6 +4617,16 @@ def : Pat<(int_ppc_dcbfl xoaddr:$dst), def : Pat<(int_ppc_dcbflp xoaddr:$dst), (DCBF 3, xoaddr:$dst)>; +let Predicates = [IsISA3_1] in { + def DCBFPS : PPCAsmPseudo<"dcbfps $dst", (ins memrr:$dst)>; + def DCBSTPS : PPCAsmPseudo<"dcbstps $dst", (ins memrr:$dst)>; + + def : Pat<(int_ppc_dcbfps xoaddr:$dst), + (DCBF 4, xoaddr:$dst)>; + def : Pat<(int_ppc_dcbstps xoaddr:$dst), + (DCBF 6, xoaddr:$dst)>; +} + def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>; def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>; @@ -4582,8 +4653,11 @@ def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>; def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>; def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>; +//Disable this alias on AIX for now because as does not support them. 
+let Predicates = [ModernAs] in { def : InstAlias<"mtudscr $Rx", (MTSPR 3, gprc:$Rx)>; def : InstAlias<"mfudscr $Rx", (MFSPR gprc:$Rx, 3)>; +} def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>; def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 2bab73418e10..b9eb3b3b7d37 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1,3 +1,8 @@ +// Mask immediates for MMA instructions (2, 4 and 8 bits). +def Msk2Imm : ImmLeaf<i32, [{ return isUInt<2>(Imm); }]>; +def Msk4Imm : ImmLeaf<i32, [{ return isUInt<4>(Imm); }]>; +def Msk8Imm : ImmLeaf<i32, [{ return isUInt<8>(Imm); }]>; + //===----------------------------------------------------------------------===// // PowerPC ISA 3.1 specific type constraints. // @@ -5,12 +10,35 @@ def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>, SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3> ]>; +def SDT_PPCAccBuild : SDTypeProfile<1, 4, [ + SDTCisVT<0, v512i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>, + SDTCisVT<3, v4i32>, SDTCisVT<4, v4i32> +]>; +def SDT_PPCPairBuild : SDTypeProfile<1, 2, [ + SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32> +]>; +def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [ + SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2> +]>; +def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [ + SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2> +]>; +def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [ + SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1> +]>; //===----------------------------------------------------------------------===// // ISA 3.1 specific PPCISD nodes. 
// def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>; +def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>; +def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>; +def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx, + []>; +def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx, + []>; +def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>; //===----------------------------------------------------------------------===// @@ -18,6 +46,15 @@ def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>; // address computations). class isPCRel { bit PCRel = 1; } +// PowerPC specific type constraints. +def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [ + SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2> +]>; + +// PPC Specific DAG Nodes. +def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX, + [SDNPHasChain, SDNPMayLoad]>; + // Top-level class for prefixed instructions. class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin> : Instruction { @@ -59,6 +96,39 @@ class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr, string BaseName = ""; } +// VX-Form: [ PO VT R VB RC XO ] +class VXForm_VTB5_RC<bits<10> xo, bits<5> R, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> VT; + bits<5> VB; + bit RC = 0; + + let Pattern = pattern; + + let Inst{6-10} = VT; + let Inst{11-15} = R; + let Inst{16-20} = VB; + let Inst{21} = RC; + let Inst{22-31} = xo; +} + +// Multiclass definition to account for record and non-record form +// instructions of VXRForm. 
+multiclass VXForm_VTB5_RCr<bits<10> xo, bits<5> R, dag OOL, dag IOL, + string asmbase, string asmstr, + InstrItinClass itin, list<dag> pattern> { + let BaseName = asmbase in { + def NAME : VXForm_VTB5_RC<xo, R, OOL, IOL, + !strconcat(asmbase, !strconcat(" ", asmstr)), + itin, pattern>, RecFormRel; + let Defs = [CR6] in + def _rec : VXForm_VTB5_RC<xo, R, OOL, IOL, + !strconcat(asmbase, !strconcat(". ", asmstr)), + itin, []>, isRecordForm, RecFormRel; + } +} + class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern> : PI<1, opcode, OOL, IOL, asmstr, itin> { @@ -242,29 +312,37 @@ class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr, } -// VX-Form: [PO VRT / UIM RB XO]. -// We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent -// "/ UIM" (unused bit followed by a 4-bit immediate) -// Destructive (insert) forms are suffixed with _ins. -class VXForm_VRT5_UIM5_RB5_ins<bits<11> xo, string opc, list<dag> pattern> - : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB), - !strconcat(opc, " $vD, $rB, $UIM"), IIC_VecGeneral, pattern>, - RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; - // VX-Form: [PO VRT RA VRB XO]. // Destructive (insert) forms are suffixed with _ins. class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern> - : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, vrrc:$vB), + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, vrrc:$vB), !strconcat(opc, " $vD, $rA, $vB"), IIC_VecGeneral, pattern>, RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; // VX-Form: [PO VRT RA RB XO]. // Destructive (insert) forms are suffixed with _ins. 
class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern> - : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), + : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, gprc:$rA, gprc:$rB), !strconcat(opc, " $vD, $rA, $rB"), IIC_VecGeneral, pattern>, RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; +// VX-Form: [ PO BF // VRA VRB XO ] +class VXForm_BF3_VAB5<bits<11> xo, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<3> BF; + bits<5> VA; + bits<5> VB; + + let Pattern = pattern; + + let Inst{6-8} = BF; + let Inst{9-10} = 0; + let Inst{11-15} = VA; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + // VN-Form: [PO VRT VRA VRB PS SD XO] // SD is "Shift Direction" class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr, @@ -285,6 +363,22 @@ class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr, let Inst{26-31} = xo; } +class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<4, OOL, IOL, asmstr, itin> { + bits<5> RD; + bits<5> VB; + bit MP; + + let Pattern = pattern; + + let Inst{6-10} = RD; + let Inst{11-14} = eo; + let Inst{15} = MP; + let Inst{16-20} = VB; + let Inst{21-31} = xo; +} + // 8RR:D-Form: [ 1 1 0 // // imm0 // PO T XO TX imm1 ]. 
class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, @@ -415,6 +509,13 @@ class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL, let Inst{31} = 0; } +// X-Form: [ PO RT BI /// XO / ] +class XForm_XT5_BI5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> { + let B = 0; +} + multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL, dag PCRel_IOL, string asmstr, InstrItinClass itin> { @@ -444,14 +545,307 @@ multiclass 8LS_DForm_R_SI34_XT6_RA5_p<bits<5> opcode, dag OOL, dag IOL, isPCRel; } +def PPCRegVSRpRCAsmOperand : AsmOperandClass { + let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber"; +} + +def vsrprc : RegisterOperand<VSRpRC> { + let ParserMatchClass = PPCRegVSRpRCAsmOperand; +} + +def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass { + let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber"; +} + +def vsrpevenrc : RegisterOperand<VSRpRC> { + let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand; + let EncoderMethod = "getVSRpEvenEncoding"; + let DecoderMethod = "decodeVSRpEvenOperands"; +} + +class DQForm_XTp5_RA17_MEM<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<5> XTp; + bits<17> DQ_RA; + let Pattern = pattern; + + let Inst{6-9} = XTp{3-0}; + let Inst{10} = XTp{4}; + let Inst{11-15} = DQ_RA{16-12}; // Register # + let Inst{16-27} = DQ_RA{11-0}; // Displacement. 
+ let Inst{28-31} = xo; +} + +class XForm_XTp5_XAB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin>, XFormMemOp { + bits<5> XTp; + bits<5> A; + bits<5> B; + + let Pattern = pattern; + let Inst{6-9} = XTp{3-0}; + let Inst{10} = XTp{4}; + let Inst{11-15} = A; + let Inst{16-20} = B; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class 8LS_DForm_R_XTp5_SI34_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<5> XTp; + bits<39> D_RA; + + let Pattern = pattern; + + // The prefix. + let Inst{6-10} = 0; + let Inst{11} = PCRel; + let Inst{12-13} = 0; + let Inst{14-31} = D_RA{33-16}; // Imm18 + + // The instruction. + let Inst{38-41} = XTp{3-0}; + let Inst{42} = XTp{4}; + let Inst{43-47} = D_RA{38-34}; // Register # + let Inst{48-63} = D_RA{15-0}; // D +} + +multiclass 8LS_DForm_R_XTp5_SI34_MEM_p<bits<6> pref, bits<6> opcode, dag OOL, + dag IOL, dag PCRel_IOL, + string asmstr, InstrItinClass itin> { + def NAME : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, IOL, + !strconcat(asmstr, ", 0"), itin, []>; + def pc : 8LS_DForm_R_XTp5_SI34_MEM<opcode, OOL, PCRel_IOL, + !strconcat(asmstr, ", 1"), itin, []>, + isPCRel; +} + +def PPCRegACCRCAsmOperand : AsmOperandClass { + let Name = "RegACCRC"; let PredicateMethod = "isACCRegNumber"; +} + +def acc : RegisterOperand<ACCRC> { + let ParserMatchClass = PPCRegACCRCAsmOperand; +} + +def uacc : RegisterOperand<UACCRC> { + let ParserMatchClass = PPCRegACCRCAsmOperand; +} + +// [PO AS XO2 XO] +class XForm_AT3<bits<6> opcode, bits<5> xo2, bits<10> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + + let Pattern = pattern; + + let Inst{6-8} = AT; + let Inst{9-10} = 0; + let Inst{11-15} = xo2; + let Inst{16-20} = 0; + let Inst{21-30} = xo; + let Inst{31} = 0; +} + +class 
XX3Form_AT3_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : I<opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6-8} = AT; + let Inst{9-10} = 0; + let Inst{11-15} = XA{4-0}; + let Inst{16-20} = XB{4-0}; + let Inst{21-28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = 0; +} + +class MMIRR_XX3Form_XY4P2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<2> PMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-17} = PMSK; + let Inst{18-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XY4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. 
+ let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_X4Y2_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<2> YMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-29} = YMSK; + let Inst{30-31} = 0; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XY4P8_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<8> PMSK; + + let Pattern = pattern; + + // The prefix. + let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-23} = PMSK; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + +class MMIRR_XX3Form_XYP4_XAB6<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, + string asmstr, InstrItinClass itin, + list<dag> pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<3> AT; + bits<6> XA; + bits<6> XB; + bits<4> XMSK; + bits<4> YMSK; + bits<4> PMSK; + + let Pattern = pattern; + + // The prefix. 
+ let Inst{6-7} = 3; + let Inst{8-11} = 9; + let Inst{12-15} = 0; + let Inst{16-19} = PMSK; + let Inst{20-23} = 0; + let Inst{24-27} = XMSK; + let Inst{28-31} = YMSK; + + // The instruction. + let Inst{38-40} = AT; + let Inst{41-42} = 0; + let Inst{43-47} = XA{4-0}; + let Inst{48-52} = XB{4-0}; + let Inst{53-60} = xo; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = 0; +} + def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; +def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">; +def MMA : Predicate<"Subtarget->hasMMA()">; + +def RCCp { + dag AToVSRC = (COPY_TO_REGCLASS $XA, VSRC); + dag BToVSRC = (COPY_TO_REGCLASS $XB, VSRC); +} let Predicates = [PrefixInstrs] in { let Interpretation64Bit = 1, isCodeGenOnly = 1 in { defm PADDI8 : MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT), @@ -461,7 +855,7 @@ let Predicates = [PrefixInstrs] in { } defm PADDI : MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI), - (ins immZero:$RA, s34imm:$SI), + (ins immZero:$RA, s34imm_pcrel:$SI), "paddi $RT, $RA, $SI", IIC_LdStLFD>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT), @@ -592,6 +986,695 @@ let Predicates = [PrefixInstrs] in { } } +// Multiclass definitions for MMA accumulator instructions. +// ---------------------------------------------------------------------------- + +// Defines 2 unmasked instructions where the xo field for acc/non-acc version +// is even/odd. 
+multiclass ACC_UM_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + let Predicates = [MMA] in { + def NAME : + XX3Form_AT3_XAB6<opcode, !or(xo, 0x01), (outs acc:$AT), IOL, + !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PP : + XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 8, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. +multiclass ACC_UM_M844_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P8_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u8imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 4, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. 
+multiclass ACC_UM_M444_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XYP4_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XYP4_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u4imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. +// The XO field for acc/non-acc version is even/odd. +multiclass ACC_UM_M244_XOEO<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>; + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x01), (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 4 instructions, masked/unmasked with masks 2, 4, 4 bits. +// Upper nibble of XO field for acc/non-acc version is 0x4/0x6. 
+multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, + string asmstr> { + let Predicates = [MMA] in { + def NAME : + XX3Form_AT3_XAB6<opcode, xo, (outs acc:$AT), IOL, + !strconcat(asmbase#" ", asmstr), IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PP : + XX3Form_AT3_XAB6< + opcode, !or(xo, 0x20), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"pp ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, xo, (outs acc:$AT), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK)), + !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"@earlyclobber $AT">; + def PM#NAME#PP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x20), (outs acc:$AT), + !con((ins acc:$ATi), + !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 10 instructions, operand negating, unmasked, masked with 2, 4, 4 +// bits. Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. 
+multiclass ACC_NEG_UM_M244_XOM84C<bits<6> opcode, bits<8> xo, dag IOL, + string asmbase, string asmstr> { + defm NAME : ACC_UM_M244_XOEO<opcode, xo, IOL, asmbase, asmstr>; + let Predicates = [MMA] in { + def PN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NP : XX3Form_AT3_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def NN : XX3Form_AT3_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), !con((ins acc:$ATi), IOL), + !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } + let Predicates = [MMA, PrefixInstrs] in { + def PM#NAME#PN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x80), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NP : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0x40), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + def PM#NAME#NN : + MMIRR_XX3Form_XY4P2_XAB6< + opcode, !or(xo, 0xC0), (outs acc:$AT), + !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), + !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK, $PMSK"), + IIC_VecFP, []>, + RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">; + } +} + +// Defines 5 instructions, unmasked, operand negating. +// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands. 
multiclass ACC_NEG_UM_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
                             string asmbase, string asmstr> {
  // Base + "pp" forms come from ACC_UM_XOEO; this adds only the unmasked
  // negating variants (pn = xo|0x80, np = xo|0x40, nn = xo|0xC0).
  defm NAME : ACC_UM_XOEO<opcode, xo, IOL, asmbase, asmstr>;
  let Predicates = [MMA] in {
    def PN : XX3Form_AT3_XAB6<opcode, !or(xo, 0x80), (outs acc:$AT),
                              !con((ins acc:$ATi), IOL),
                              !strconcat(asmbase#"pn ", asmstr), IIC_VecFP, []>,
             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def NP : XX3Form_AT3_XAB6<opcode, !or(xo, 0x40), (outs acc:$AT),
                              !con((ins acc:$ATi), IOL),
                              !strconcat(asmbase#"np ", asmstr), IIC_VecFP, []>,
             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def NN : XX3Form_AT3_XAB6<opcode, !or(xo, 0xC0), (outs acc:$AT),
                              !con((ins acc:$ATi), IOL),
                              !strconcat(asmbase#"nn ", asmstr), IIC_VecFP, []>,
             RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
  }
}

// Defines 10 instructions, operand negating, unmasked, masked with 4, 4 bits.
// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands.
multiclass ACC_NEG_UM_M44_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
                                 string asmbase, string asmstr> {
  defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
  let Predicates = [MMA, PrefixInstrs] in {
    // NOTE(review): here the prefixed unmasked form is xo|0x01 and the "pp"
    // form is plain xo — the inverse of the M244 multiclasses. This mirrors
    // the even/odd xo convention of ACC_UM_XOEO; confirm against the ISA
    // encoding tables.
    def PM#NAME :
      MMIRR_XX3Form_XY4_XAB6<
        opcode, !or(xo, 0x01), (outs acc:$AT),
        !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK)),
        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"@earlyclobber $AT">;
    def PM#NAME#PP :
      MMIRR_XX3Form_XY4_XAB6<
        opcode, xo, (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#PN :
      MMIRR_XX3Form_XY4_XAB6<
        opcode, !or(xo, 0x80), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
        !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#NP :
      MMIRR_XX3Form_XY4_XAB6<
        opcode, !or(xo, 0x40), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
        !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#NN :
      MMIRR_XX3Form_XY4_XAB6<
        opcode, !or(xo, 0xC0), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK))),
        !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
  }
}

// Defines 10 instructions, operand negating, unmasked, masked with 4, 2 bits.
// Upper nibble are masked with 0x8, 0x4, 0xC for negating operands.
multiclass ACC_NEG_UM_M42_XOM84C<bits<6> opcode, bits<8> xo, dag IOL,
                                 string asmbase, string asmstr> {
  defm NAME : ACC_NEG_UM_XOM84C<opcode, xo, IOL, asmbase, asmstr>;
  let Predicates = [MMA, PrefixInstrs] in {
    // Same xo scheme as the M44 variant, but YMSK is only 2 bits
    // (X4Y2 form) — used by the double-precision (vsrp-input) GER ops.
    def PM#NAME :
      MMIRR_XX3Form_X4Y2_XAB6<
        opcode, !or(xo, 0x01), (outs acc:$AT),
        !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK)),
        !strconcat("pm"#asmbase#" ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"@earlyclobber $AT">;
    def PM#NAME#PP :
      MMIRR_XX3Form_X4Y2_XAB6<
        opcode, xo, (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
        !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#PN :
      MMIRR_XX3Form_X4Y2_XAB6<
        opcode, !or(xo, 0x80), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
        !strconcat("pm"#asmbase#"pn ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#NP :
      MMIRR_XX3Form_X4Y2_XAB6<
        opcode, !or(xo, 0x40), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
        !strconcat("pm"#asmbase#"np ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
    def PM#NAME#NN :
      MMIRR_XX3Form_X4Y2_XAB6<
        opcode, !or(xo, 0xC0), (outs acc:$AT),
        !con((ins acc:$ATi), !con(IOL, (ins u4imm:$XMSK, u2imm:$YMSK))),
        !strconcat("pm"#asmbase#"nn ", asmstr#", $XMSK, $YMSK"),
        IIC_VecFP, []>,
      RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
  }
}

// End of class definitions.
//-----------------------------------------------------------------------------

let Predicates = [MMA] in {
  // Accumulator move to/from the associated VSRs. Both read and write the
  // same accumulator, expressed via tied operands.
  def XXMFACC :
    XForm_AT3<31, 0, 177, (outs acc:$ASo), (ins acc:$AS), "xxmfacc $AS",
              IIC_VecGeneral,
              [(set v512i1:$ASo, (int_ppc_mma_xxmfacc v512i1:$AS))]>,
    RegConstraint<"$ASo = $AS">, NoEncode<"$ASo">;
  def XXMTACC :
    XForm_AT3<31, 1, 177, (outs acc:$AT), (ins acc:$ATi), "xxmtacc $AT",
              IIC_VecGeneral,
              [(set v512i1:$AT, (int_ppc_mma_xxmtacc v512i1:$ATi))]>,
    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
  // Pseudo that marks a register pair as fully defined (expanded post-RA).
  def KILL_PAIR : PPCPostRAExpPseudo<(outs vsrprc:$XTp), (ins vsrprc:$XSp),
                                     "#KILL_PAIR", []>,
                  RegConstraint<"$XTp = $XSp">;
  // Pseudo that converts an unprimed accumulator into a primed one.
  def BUILD_UACC : PPCPostRAExpPseudo<(outs acc:$AT), (ins uacc:$AS),
                                      "#BUILD_UACC $AT, $AS", []>;
  // We define XXSETACCZ as rematerializable to undo CSE of that intrinsic in
  // the backend. We avoid CSE here because it generates a copy of the acc
  // register and this copy is more expensive than calling the intrinsic again.
  let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
    def XXSETACCZ :
      XForm_AT3<31, 3, 177, (outs acc:$AT), (ins), "xxsetaccz $AT", IIC_VecGeneral,
                [(set v512i1:$AT, (int_ppc_mma_xxsetaccz))]>;
  }
  // xvi8ger4spp does not fit the multiclass xo schemes above, so it is
  // defined standalone (pattern added separately in the intrinsics section).
  def XVI8GER4SPP :
    XX3Form_AT3_XAB6<59, 99, (outs acc:$AT), (ins acc:$ATi, vsrc:$XA, vsrc:$XB),
                     "xvi8ger4spp $AT, $XA, $XB", IIC_VecGeneral, []>,
    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
  // Spill/restore pseudos for (un)primed accumulators.
  let mayStore = 1 in {
    def SPILL_ACC: PPCEmitTimePseudo<(outs), (ins acc:$AT, memrix16:$dst),
                                     "#SPILL_ACC", []>;
    def SPILL_UACC: PPCEmitTimePseudo<(outs), (ins uacc:$AT, memrix16:$dst),
                                      "#SPILL_UACC", []>;
  }
  let mayLoad = 1, hasSideEffects = 0 in {
    def RESTORE_ACC: PPCEmitTimePseudo<(outs acc:$AT), (ins memrix16:$src),
                                       "#RESTORE_ACC", []>;
    def RESTORE_UACC: PPCEmitTimePseudo<(outs uacc:$AT), (ins memrix16:$src),
                                        "#RESTORE_UACC", []>;
  }
}

let Predicates = [MMA, PrefixInstrs] in {
  // Prefixed masked form of XVI8GER4SPP; note PMSK is a 4-bit mask here.
  def PMXVI8GER4SPP :
    MMIRR_XX3Form_XYP4_XAB6<59, 99, (outs acc:$AT),
                            (ins acc:$ATi, vsrc:$XA, vsrc:$XB, u4imm:$XMSK,
                             u4imm:$YMSK, u4imm:$PMSK),
                            "pmxvi8ger4spp $AT, $XA, $XB, $XMSK, $YMSK, $PMSK",
                            IIC_VecGeneral, []>,
    RegConstraint<"$ATi = $AT">, NoEncode<"$ATi">;
}

// MMA accumulating/non-accumulating instructions.
//------------------------------------------------------------------------------

// XVBF16GER2, XVBF16GER2PP, XVBF16GER2PN, XVBF16GER2NP, XVBF16GER2NN
// PMXVBF16GER2, PMXVBF16GER2PP, PMXVBF16GER2PN, PMXVBF16GER2NP, PMXVBF16GER2NN
defm XVBF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 50, (ins vsrc:$XA, vsrc:$XB),
                                         "xvbf16ger2", "$AT, $XA, $XB">;

// XVI4GER8, XVI4GER8PP, PMXVI4GER8, PMXVI4GER8PP
defm XVI4GER8 : ACC_UM_M844_XOEO<59, 34, (ins vsrc:$XA, vsrc:$XB),
                                 "xvi4ger8", "$AT, $XA, $XB">;

// XVI8GER4, XVI8GER4PP, PMXVI8GER4, PMXVI8GER4PP
defm XVI8GER4 : ACC_UM_M444_XOEO<59, 2, (ins vsrc:$XA, vsrc:$XB),
                                 "xvi8ger4", "$AT, $XA, $XB">;

// XVI16GER2, XVI16GER2PP, PMXVI16GER2, PMXVI16GER2PP
defm XVI16GER2 : ACC_UM_M244_XO46<59, 75, (ins vsrc:$XA, vsrc:$XB),
                                  "xvi16ger2", "$AT, $XA, $XB">;

// XVI16GER2S, XVI16GER2SPP, PMXVI16GER2S, PMXVI16GER2SPP
defm XVI16GER2S : ACC_UM_M244_XOEO<59, 42, (ins vsrc:$XA, vsrc:$XB),
                                   "xvi16ger2s", "$AT, $XA, $XB">;

// XVF16GER2, XVF16GER2PP, XVF16GER2PN, XVF16GER2NP, XVF16GER2NN
// PMXVF16GER2, PMXVF16GER2PP, PMXVF16GER2PN, PMXVF16GER2NP, PMXVF16GER2NN
defm XVF16GER2 : ACC_NEG_UM_M244_XOM84C<59, 18, (ins vsrc:$XA, vsrc:$XB),
                                        "xvf16ger2", "$AT, $XA, $XB">;

// XVF32GER, XVF32GERPP, XVF32GERPN, XVF32GERNP, XVF32GERNN
// PMXVF32GER, PMXVF32GERPP, PMXVF32GERPN, PMXVF32GERNP, PMXVF32GERNN
defm XVF32GER : ACC_NEG_UM_M44_XOM84C<59, 26, (ins vsrc:$XA, vsrc:$XB),
                                      "xvf32ger", "$AT, $XA, $XB">;

// XVF64GER, XVF64GERPP, XVF64GERPN, XVF64GERNP, XVF64GERNN
// PMXVF64GER, PMXVF64GERPP, PMXVF64GERPN, PMXVF64GERNP, PMXVF64GERNN
defm XVF64GER : ACC_NEG_UM_M42_XOM84C<59, 58, (ins vsrpevenrc:$XA, vsrc:$XB),
                                      "xvf64ger", "$AT, $XA, $XB">;
//------------------------------------------------------------------------------

// MMA Intrinsics
let Predicates = [MMA] in {
  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8 v16i8:$XA, v16i8:$XB)),
            (XVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;

  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4 v16i8:$XA, v16i8:$XB)),
            (XVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;

  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2s v16i8:$XA, v16i8:$XB)),
            (XVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;

  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2 v16i8:$XA, v16i8:$XB)),
            (XVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;

  def : Pat<(v512i1 (int_ppc_mma_xvf32ger v16i8:$XA, v16i8:$XB)),
            (XVF32GER RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  // The 64-bit GER ops take a VSR pair (v256i1) directly as $XA.
  def : Pat<(v512i1 (int_ppc_mma_xvf64ger v256i1:$XA, v16i8:$XB)),
            (XVF64GER $XA, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
            (XVF64GERPP $ATi, $XA, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
            (XVF64GERPN $ATi, $XA, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
            (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)),
            (XVF64GERNN $ATi, $XA, RCCp.BToVSRC)>;

  def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2 v16i8:$XA, v16i8:$XB)),
            (XVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2 v16i8:$XA, v16i8:$XB)),
            (XVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
  def : Pat<(v512i1 (int_ppc_mma_xvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB)),
            (XVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC)>;
}

// MMA Intrinsics
let Predicates = [MMA, PrefixInstrs] in {
  def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                            Msk4Imm:$YMSK, Msk8Imm:$PMSK)),
            (PMXVI4GER8 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                        Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvi4ger8pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                              Msk8Imm:$PMSK)),
            (PMXVI4GER8PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK, Msk8Imm:$PMSK)>;

  def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                            Msk4Imm:$YMSK, Msk4Imm:$PMSK)),
            (PMXVI8GER4 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                        Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                              Msk4Imm:$PMSK)),
            (PMXVI8GER4PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;

  def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2s v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                              Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
            (PMXVI16GER2S RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                                Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                                Msk2Imm:$PMSK)),
            (PMXVI16GER2SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  // NOTE: the original change listed the pmxvf16ger2pn and pmxvf16ger2np
  // patterns twice; the duplicates have been removed (one pattern per
  // intrinsic below).
  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                             Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
            (PMXVF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                         Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk2Imm:$PMSK)),
            (PMXVF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk2Imm:$PMSK)),
            (PMXVF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk2Imm:$PMSK)),
            (PMXVF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk2Imm:$PMSK)),
            (PMXVF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;

  def : Pat<(v512i1 (int_ppc_mma_pmxvf32ger v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                            Msk4Imm:$YMSK)),
            (PMXVF32GER RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                        Msk4Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
            (PMXVF32GERPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gerpn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
            (PMXVF32GERPN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
            (PMXVF32GERNP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf32gernn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk4Imm:$YMSK)),
            (PMXVF32GERNN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK)>;

  def : Pat<(v512i1 (int_ppc_mma_pmxvf64ger v256i1:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                            Msk2Imm:$YMSK)),
            (PMXVF64GER $XA, RCCp.BToVSRC, Msk4Imm:$XMSK, Msk2Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
            (PMXVF64GERPP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk2Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
            (PMXVF64GERPN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk2Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
            (PMXVF64GERNP $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk2Imm:$YMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB,
                                              Msk4Imm:$XMSK, Msk2Imm:$YMSK)),
            (PMXVF64GERNN $ATi, $XA, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk2Imm:$YMSK)>;

  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                              Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
            (PMXVBF16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                          Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                                Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                                Msk2Imm:$PMSK)),
            (PMXVBF16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2pn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                                Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                                Msk2Imm:$PMSK)),
            (PMXVBF16GER2PN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2np v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                                Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                                Msk2Imm:$PMSK)),
            (PMXVBF16GER2NP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvbf16ger2nn v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                                Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                                Msk2Imm:$PMSK)),
            (PMXVBF16GER2NN $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                            Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2 v16i8:$XA, v16i8:$XB, Msk4Imm:$XMSK,
                                             Msk4Imm:$YMSK, Msk2Imm:$PMSK)),
            (PMXVI16GER2 RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                         Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
  // FIX: PMSK is a 4-bit mask for the i8ger4 family — the PMXVI8GER4SPP
  // instruction declares u4imm:$PMSK and the PMXVI8GER4 patterns above use
  // Msk4Imm; the original pattern incorrectly used Msk2Imm here.
  def : Pat<(v512i1 (int_ppc_mma_pmxvi8ger4spp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk4Imm:$PMSK)),
            (PMXVI8GER4SPP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk4Imm:$PMSK)>;
  def : Pat<(v512i1 (int_ppc_mma_pmxvi16ger2pp v512i1:$ATi, v16i8:$XA, v16i8:$XB,
                                               Msk4Imm:$XMSK, Msk4Imm:$YMSK,
                                               Msk2Imm:$PMSK)),
            (PMXVI16GER2PP $ATi, RCCp.AToVSRC, RCCp.BToVSRC, Msk4Imm:$XMSK,
                           Msk4Imm:$YMSK, Msk2Imm:$PMSK)>;
}

// Helper dags for assembling VSR pairs / accumulators from individual vectors.
def Concats {
  dag VecsToVecPair0 =
    (v256i1 (INSERT_SUBREG
               (INSERT_SUBREG (IMPLICIT_DEF), $vs0, sub_vsx1),
               $vs1, sub_vsx0));
  dag VecsToVecPair1 =
    (v256i1 (INSERT_SUBREG
               (INSERT_SUBREG (IMPLICIT_DEF), $vs2, sub_vsx1),
               $vs3, sub_vsx0));
  dag VecsToVecQuad =
    (BUILD_UACC (INSERT_SUBREG
                   (INSERT_SUBREG (v512i1 (IMPLICIT_DEF)),
                                  (KILL_PAIR VecsToVecPair0), sub_pair0),
                   (KILL_PAIR VecsToVecPair1), sub_pair1));
}

// Helper dags for extracting pairs/vectors back out of an accumulator.
def Extracts {
  dag Pair0 = (v256i1 (EXTRACT_SUBREG $v, sub_pair0));
  dag Pair1 = (v256i1 (EXTRACT_SUBREG $v, sub_pair1));
  dag Vec0 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx0));
  dag Vec1 = (v4i32 (EXTRACT_SUBREG Pair0, sub_vsx1));
  dag Vec2 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx0));
  dag Vec3 = (v4i32 (EXTRACT_SUBREG Pair1, sub_vsx1));
}

let Predicates = [MMA] in {
  def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)),
            (XXMTACC Concats.VecsToVecQuad)>;
  def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0,
                                              v16i8:$vs3, v16i8:$vs2)),
            (XXMTACC Concats.VecsToVecQuad)>;
  def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>;
  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))),
            Extracts.Vec0>;
  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))),
            Extracts.Vec1>;
  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))),
            Extracts.Vec2>;
  def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))),
            Extracts.Vec3>;
}

let Predicates = [PairedVectorMemops] in {
  def : Pat<(v256i1 (PPCPairBuild v4i32:$vs1, v4i32:$vs0)),
            Concats.VecsToVecPair0>;
  def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)),
            Concats.VecsToVecPair0>;
  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))),
            (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>;
  def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))),
            (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>;
}

// Paired vector loads (DQ-form and X-form).
let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops] in {
  def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp),
                                  (ins memrix16:$DQ_RA), "lxvp $XTp, $DQ_RA",
                                  IIC_LdStLFD, []>;
  def LXVPX : XForm_XTp5_XAB5<31, 333, (outs vsrprc:$XTp), (ins memrr:$src),
                              "lxvpx $XTp, $src", IIC_LdStLFD,
                              []>;
}

// Paired vector stores (DQ-form and X-form).
let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops] in {
  def STXVP : DQForm_XTp5_RA17_MEM<6, 1, (outs), (ins vsrprc:$XTp,
                                   memrix16:$DQ_RA), "stxvp $XTp, $DQ_RA",
                                   IIC_LdStLFD, []>;
  def STXVPX : XForm_XTp5_XAB5<31, 461, (outs), (ins vsrprc:$XTp, memrr:$dst),
                               "stxvpx $XTp, $dst", IIC_LdStLFD,
                               []>;
}

// Prefixed paired load/store (with PC-relative forms via the _p multiclass).
let mayLoad = 1, mayStore = 0, Predicates = [PairedVectorMemops, PrefixInstrs] in {
  defm PLXVP :
    8LS_DForm_R_XTp5_SI34_MEM_p<1, 58, (outs vsrprc:$XTp), (ins memri34:$D_RA),
                                (ins memri34_pcrel:$D_RA), "plxvp $XTp, $D_RA",
                                IIC_LdStLFD>;
}

let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] in {
  defm PSTXVP :
    8LS_DForm_R_XTp5_SI34_MEM_p<1, 62, (outs), (ins vsrprc:$XTp, memri34:$D_RA),
                                (ins vsrprc:$XTp, memri34_pcrel:$D_RA),
                                "pstxvp $XTp, $D_RA", IIC_LdStLFD>;
}

let Predicates = [PairedVectorMemops] in {
  // Intrinsics for Paired Vector Loads.
  def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
  def : Pat<(v256i1 (int_ppc_vsx_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>;
  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
    def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
  }
  // Intrinsics for Paired Vector Stores.
+ def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX16:$dst), + (STXVP $XSp, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, xaddrX16:$dst), + (STXVPX $XSp, xaddrX16:$dst)>; + let Predicates = [PairedVectorMemops, PrefixInstrs] in { + def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX34:$dst), + (PSTXVP $XSp, memri34:$dst)>; + } +} + // TODO: We have an added complexity of 500 here. This is only a temporary // solution to have tablegen consider these patterns first. The way we do // addressing for PowerPC is complex depending on available D form, X form, or @@ -753,6 +1836,13 @@ let Predicates = [PCRelativeMemops], AddedComplexity = 500 in { // If the PPCmatpcreladdr node is not caught by any other pattern it should be // caught here and turned into a paddi instruction to materialize the address. def : Pat<(PPCmatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>; + // PPCtlsdynamatpcreladdr node is used for TLS dynamic models to materialize + // tls global address with paddi instruction. + def : Pat<(PPCtlsdynamatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>; + // PPCtlslocalexecmataddr node is used for TLS local exec models to + // materialize tls global address with paddi instruction. 
+ def : Pat<(PPCaddTls i64:$in, (PPCtlslocalexecmataddr tglobaltlsaddr:$addr)), + (PADDI8 $in, $addr)>; } let Predicates = [PrefixInstrs] in { @@ -797,6 +1887,26 @@ let Predicates = [PrefixInstrs] in { } let Predicates = [IsISA3_1] in { + def SETBC : XForm_XT5_BI5<31, 384, (outs gprc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR : XForm_XT5_BI5<31, 416, (outs gprc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC : XForm_XT5_BI5<31, 448, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR : XForm_XT5_BI5<31, 480, (outs gprc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + + let Interpretation64Bit = 1, isCodeGenOnly = 1 in { + def SETBC8 : XForm_XT5_BI5<31, 384, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbc $RT, $BI", IIC_IntCompare, []>; + def SETBCR8 : XForm_XT5_BI5<31, 416, (outs g8rc:$RT), (ins crbitrc:$BI), + "setbcr $RT, $BI", IIC_IntCompare, []>; + def SETNBC8 : XForm_XT5_BI5<31, 448, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbc $RT, $BI", IIC_IntCompare, []>; + def SETNBCR8 : XForm_XT5_BI5<31, 480, (outs g8rc:$RT), (ins crbitrc:$BI), + "setnbcr $RT, $BI", IIC_IntCompare, []>; + } + def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH), "vsldbi $VRT, $VRA, $VRB, $SH", @@ -813,87 +1923,254 @@ let Predicates = [IsISA3_1] in { (int_ppc_altivec_vsrdbi v16i8:$VRA, v16i8:$VRB, i32:$SH))]>; - def VINSW : - VXForm_VRT5_UIM5_RB5_ins<207, "vinsw", - [(set v4i32:$vD, - (int_ppc_altivec_vinsw v4i32:$vDi, i64:$rB, - timm:$UIM))]>; + defm VSTRIBR : VXForm_VTB5_RCr<13, 1, (outs vrrc:$vT), (ins vrrc:$vB), + "vstribr", "$vT, $vB", IIC_VecGeneral, + [(set v16i8:$vT, + (int_ppc_altivec_vstribr v16i8:$vB))]>; + defm VSTRIBL : VXForm_VTB5_RCr<13, 0, (outs vrrc:$vT), (ins vrrc:$vB), + "vstribl", "$vT, $vB", IIC_VecGeneral, + [(set v16i8:$vT, + (int_ppc_altivec_vstribl v16i8:$vB))]>; + defm VSTRIHR : VXForm_VTB5_RCr<13, 
3, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihr", "$vT, $vB", IIC_VecGeneral, + [(set v8i16:$vT, + (int_ppc_altivec_vstrihr v8i16:$vB))]>; + defm VSTRIHL : VXForm_VTB5_RCr<13, 2, (outs vrrc:$vT), (ins vrrc:$vB), + "vstrihl", "$vT, $vB", IIC_VecGeneral, + [(set v8i16:$vT, + (int_ppc_altivec_vstrihl v8i16:$vB))]>; + def VINSW : + VXForm_1<207, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, gprc:$rB), + "vinsw $vD, $rB, $UIM", IIC_VecGeneral, + [(set v4i32:$vD, + (int_ppc_altivec_vinsw v4i32:$vDi, i32:$rB, timm:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VINSD : - VXForm_VRT5_UIM5_RB5_ins<463, "vinsd", - [(set v2i64:$vD, - (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB, - timm:$UIM))]>; + VXForm_1<463, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB), + "vinsd $vD, $rB, $UIM", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB, timm:$UIM))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VINSBVLX : VXForm_VTB5_RA5_ins<15, "vinsbvlx", [(set v16i8:$vD, - (int_ppc_altivec_vinsbvlx v16i8:$vDi, i64:$rA, + (int_ppc_altivec_vinsbvlx v16i8:$vDi, i32:$rA, v16i8:$vB))]>; def VINSBVRX : VXForm_VTB5_RA5_ins<271, "vinsbvrx", [(set v16i8:$vD, - (int_ppc_altivec_vinsbvrx v16i8:$vDi, i64:$rA, + (int_ppc_altivec_vinsbvrx v16i8:$vDi, i32:$rA, v16i8:$vB))]>; def VINSHVLX : VXForm_VTB5_RA5_ins<79, "vinshvlx", [(set v8i16:$vD, - (int_ppc_altivec_vinshvlx v8i16:$vDi, i64:$rA, + (int_ppc_altivec_vinshvlx v8i16:$vDi, i32:$rA, v8i16:$vB))]>; def VINSHVRX : VXForm_VTB5_RA5_ins<335, "vinshvrx", [(set v8i16:$vD, - (int_ppc_altivec_vinshvrx v8i16:$vDi, i64:$rA, + (int_ppc_altivec_vinshvrx v8i16:$vDi, i32:$rA, v8i16:$vB))]>; def VINSWVLX : VXForm_VTB5_RA5_ins<143, "vinswvlx", [(set v4i32:$vD, - (int_ppc_altivec_vinswvlx v4i32:$vDi, i64:$rA, + (int_ppc_altivec_vinswvlx v4i32:$vDi, i32:$rA, v4i32:$vB))]>; def VINSWVRX : VXForm_VTB5_RA5_ins<399, "vinswvrx", [(set v4i32:$vD, - (int_ppc_altivec_vinswvrx v4i32:$vDi, i64:$rA, + 
(int_ppc_altivec_vinswvrx v4i32:$vDi, i32:$rA, v4i32:$vB))]>; def VINSBLX : VXForm_VRT5_RAB5_ins<527, "vinsblx", [(set v16i8:$vD, - (int_ppc_altivec_vinsblx v16i8:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinsblx v16i8:$vDi, i32:$rA, + i32:$rB))]>; def VINSBRX : VXForm_VRT5_RAB5_ins<783, "vinsbrx", [(set v16i8:$vD, - (int_ppc_altivec_vinsbrx v16i8:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinsbrx v16i8:$vDi, i32:$rA, + i32:$rB))]>; def VINSHLX : VXForm_VRT5_RAB5_ins<591, "vinshlx", [(set v8i16:$vD, - (int_ppc_altivec_vinshlx v8i16:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinshlx v8i16:$vDi, i32:$rA, + i32:$rB))]>; def VINSHRX : VXForm_VRT5_RAB5_ins<847, "vinshrx", [(set v8i16:$vD, - (int_ppc_altivec_vinshrx v8i16:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinshrx v8i16:$vDi, i32:$rA, + i32:$rB))]>; def VINSWLX : VXForm_VRT5_RAB5_ins<655, "vinswlx", [(set v4i32:$vD, - (int_ppc_altivec_vinswlx v4i32:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinswlx v4i32:$vDi, i32:$rA, + i32:$rB))]>; def VINSWRX : VXForm_VRT5_RAB5_ins<911, "vinswrx", [(set v4i32:$vD, - (int_ppc_altivec_vinswrx v4i32:$vDi, i64:$rA, - i64:$rB))]>; + (int_ppc_altivec_vinswrx v4i32:$vDi, i32:$rA, + i32:$rB))]>; def VINSDLX : - VXForm_VRT5_RAB5_ins<719, "vinsdlx", - [(set v2i64:$vD, - (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA, - i64:$rB))]>; + VXForm_1<719, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), + "vinsdlx $vD, $rA, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA, i64:$rB))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VINSDRX : - VXForm_VRT5_RAB5_ins<975, "vinsdrx", - [(set v2i64:$vD, - (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, - i64:$rB))]>; - + VXForm_1<975, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB), + "vinsdrx $vD, $rA, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA, i64:$rB))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VEXTRACTBM 
: VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$rD), (ins vrrc:$vB), + "vextractbm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractbm v16i8:$vB))]>; + def VEXTRACTHM : VXForm_RD5_XO5_RS5<1602, 9, (outs gprc:$rD), (ins vrrc:$vB), + "vextracthm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextracthm v8i16:$vB))]>; + def VEXTRACTWM : VXForm_RD5_XO5_RS5<1602, 10, (outs gprc:$rD), (ins vrrc:$vB), + "vextractwm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractwm v4i32:$vB))]>; + def VEXTRACTDM : VXForm_RD5_XO5_RS5<1602, 11, (outs gprc:$rD), (ins vrrc:$vB), + "vextractdm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractdm v2i64:$vB))]>; + def VEXTRACTQM : VXForm_RD5_XO5_RS5<1602, 12, (outs gprc:$rD), (ins vrrc:$vB), + "vextractqm $rD, $vB", IIC_VecGeneral, + [(set i32:$rD, + (int_ppc_altivec_vextractqm v1i128:$vB))]>; + def VEXPANDBM : VXForm_RD5_XO5_RS5<1602, 0, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandbm $vD, $vB", IIC_VecGeneral, + [(set v16i8:$vD, (int_ppc_altivec_vexpandbm + v16i8:$vB))]>; + def VEXPANDHM : VXForm_RD5_XO5_RS5<1602, 1, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandhm $vD, $vB", IIC_VecGeneral, + [(set v8i16:$vD, (int_ppc_altivec_vexpandhm + v8i16:$vB))]>; + def VEXPANDWM : VXForm_RD5_XO5_RS5<1602, 2, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandwm $vD, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vexpandwm + v4i32:$vB))]>; + def VEXPANDDM : VXForm_RD5_XO5_RS5<1602, 3, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpanddm $vD, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vexpanddm + v2i64:$vB))]>; + def VEXPANDQM : VXForm_RD5_XO5_RS5<1602, 4, (outs vrrc:$vD), (ins vrrc:$vB), + "vexpandqm $vD, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vexpandqm + v1i128:$vB))]>; + def MTVSRBM : VXForm_RD5_XO5_RS5<1602, 16, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrbm $vD, $rB", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_mtvsrbm i64:$rB))]>; + def 
MTVSRHM : VXForm_RD5_XO5_RS5<1602, 17, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrhm $vD, $rB", IIC_VecGeneral, + [(set v8i16:$vD, + (int_ppc_altivec_mtvsrhm i64:$rB))]>; + def MTVSRWM : VXForm_RD5_XO5_RS5<1602, 18, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrwm $vD, $rB", IIC_VecGeneral, + [(set v4i32:$vD, + (int_ppc_altivec_mtvsrwm i64:$rB))]>; + def MTVSRDM : VXForm_RD5_XO5_RS5<1602, 19, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrdm $vD, $rB", IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_mtvsrdm i64:$rB))]>; + def MTVSRQM : VXForm_RD5_XO5_RS5<1602, 20, (outs vrrc:$vD), (ins g8rc:$rB), + "mtvsrqm $vD, $rB", IIC_VecGeneral, + [(set v1i128:$vD, + (int_ppc_altivec_mtvsrqm i64:$rB))]>; + def MTVSRBMI : DXForm<4, 10, (outs vrrc:$vD), (ins u16imm64:$D), + "mtvsrbmi $vD, $D", IIC_VecGeneral, + [(set v16i8:$vD, + (int_ppc_altivec_mtvsrbm imm:$D))]>; + def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbb $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbb + v16i8:$vB, timm:$MP))]>; + def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbh $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbh + v8i16:$vB, timm:$MP))]>; + def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbw $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbw + v4i32:$vB, timm:$MP))]>; + def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD), + (ins vrrc:$vB, u1imm:$MP), + "vcntmbd $rD, $vB, $MP", IIC_VecGeneral, + [(set i64:$rD, (int_ppc_altivec_vcntmbd + v2i64:$vB, timm:$MP))]>; + def VEXTDUBVLX : VAForm_1a<24, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextdubvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextdubvlx v16i8:$vA, + v16i8:$vB, + i32:$rC))]>; + def VEXTDUBVRX : VAForm_1a<25, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextdubvrx 
$vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextdubvrx v16i8:$vA, + v16i8:$vB, + i32:$rC))]>; + def VEXTDUHVLX : VAForm_1a<26, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduhvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduhvlx v8i16:$vA, + v8i16:$vB, + i32:$rC))]>; + def VEXTDUHVRX : VAForm_1a<27, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduhvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduhvrx v8i16:$vA, + v8i16:$vB, + i32:$rC))]>; + def VEXTDUWVLX : VAForm_1a<28, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduwvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduwvlx v4i32:$vA, + v4i32:$vB, + i32:$rC))]>; + def VEXTDUWVRX : VAForm_1a<29, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextduwvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextduwvrx v4i32:$vA, + v4i32:$vB, + i32:$rC))]>; + def VEXTDDVLX : VAForm_1a<30, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextddvlx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextddvlx v2i64:$vA, + v2i64:$vB, + i32:$rC))]>; + def VEXTDDVRX : VAForm_1a<31, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, gprc:$rC), + "vextddvrx $vD, $vA, $vB, $rC", + IIC_VecGeneral, + [(set v2i64:$vD, + (int_ppc_altivec_vextddvrx v2i64:$vA, + v2i64:$vB, + i32:$rC))]>; def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), "vpdepd $vD, $vA, $vB", IIC_VecGeneral, [(set v2i64:$vD, @@ -961,7 +2238,61 @@ let Predicates = [IsISA3_1] in { "vclrrb $vD, $vA, $rB", IIC_VecGeneral, [(set v16i8:$vD, (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>; - + def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulld $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>; + def VMULHSW : VXForm_1<905, (outs vrrc:$vD), 
(ins vrrc:$vA, vrrc:$vB), + "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhs v4i32:$vA, v4i32:$vB))]>; + def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (mulhu v4i32:$vA, v4i32:$vB))]>; + def VMULHSD : VXForm_1<969, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhs v2i64:$vA, v2i64:$vB))]>; + def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulhud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (mulhu v2i64:$vA, v2i64:$vB))]>; + def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>; + def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmoduw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>; + def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>; + def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>; + def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>; + def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>; + def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>; + def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>; + def VDIVESW : VXForm_1<907, (outs vrrc:$vD), 
(ins vrrc:$vA, vrrc:$vB), + "vdivesw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdivesw v4i32:$vA, + v4i32:$vB))]>; + def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveuw $vD, $vA, $vB", IIC_VecGeneral, + [(set v4i32:$vD, (int_ppc_altivec_vdiveuw v4i32:$vA, + v4i32:$vB))]>; + def VDIVESD : VXForm_1<971, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivesd $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdivesd v2i64:$vA, + v2i64:$vB))]>; + def VDIVEUD : VXForm_1<715, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveud $vD, $vA, $vB", IIC_VecGeneral, + [(set v2i64:$vD, (int_ppc_altivec_vdiveud v2i64:$vA, + v2i64:$vB))]>; def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB), "xvtlsbb $BF, $XB", IIC_VecGeneral, []>; @@ -980,10 +2311,204 @@ let Predicates = [IsISA3_1] in { def STXVRWX : X_XS6_RA5_RB5<31, 205, "stxvrwx", vsrc, []>; def STXVRDX : X_XS6_RA5_RB5<31, 237, "stxvrdx", vsrc, []>; } + + def VMULESD : VXForm_1<968, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulesd $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmulesd v2i64:$vA, + v2i64:$vB))]>; + def VMULEUD : VXForm_1<712, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuleud $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmuleud v2i64:$vA, + v2i64:$vB))]>; + def VMULOSD : VXForm_1<456, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmulosd $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmulosd v2i64:$vA, + v2i64:$vB))]>; + def VMULOUD : VXForm_1<200, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmuloud $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmuloud v2i64:$vA, + v2i64:$vB))]>; + def VMSUMCUD : VAForm_1a<23, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, vrrc:$vC), + "vmsumcud $vD, $vA, $vB, $vC", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vmsumcud + v2i64:$vA, v2i64:$vB, v1i128:$vC))]>; + def VDIVSQ : 
VXForm_1<267, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivsq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (sdiv v1i128:$vA, v1i128:$vB))]>; + def VDIVUQ : VXForm_1<11, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivuq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (udiv v1i128:$vA, v1i128:$vB))]>; + def VDIVESQ : VXForm_1<779, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdivesq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vdivesq v1i128:$vA, + v1i128:$vB))]>; + def VDIVEUQ : VXForm_1<523, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vdiveuq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vdiveuq v1i128:$vA, + v1i128:$vB))]>; + def VCMPEQUQ : VCMP <455, "vcmpequq $vD, $vA, $vB" , v1i128>; + def VCMPGTSQ : VCMP <903, "vcmpgtsq $vD, $vA, $vB" , v1i128>; + def VCMPGTUQ : VCMP <647, "vcmpgtuq $vD, $vA, $vB" , v1i128>; + def VCMPEQUQ_rec : VCMP_rec <455, "vcmpequq. $vD, $vA, $vB" , v1i128>; + def VCMPGTSQ_rec : VCMP_rec <903, "vcmpgtsq. $vD, $vA, $vB" , v1i128>; + def VCMPGTUQ_rec : VCMP_rec <647, "vcmpgtuq. 
$vD, $vA, $vB" , v1i128>; + def VMODSQ : VXForm_1<1803, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmodsq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (srem v1i128:$vA, v1i128:$vB))]>; + def VMODUQ : VXForm_1<1547, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), + "vmoduq $vD, $vA, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (urem v1i128:$vA, v1i128:$vB))]>; + def VEXTSD2Q : VXForm_RD5_XO5_RS5<1538, 27, (outs vrrc:$vD), (ins vrrc:$vB), + "vextsd2q $vD, $vB", IIC_VecGeneral, + [(set v1i128:$vD, (int_ppc_altivec_vextsd2q v2i64:$vB))]>; + def VCMPUQ : VXForm_BF3_VAB5<257, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), + "vcmpuq $BF, $vA, $vB", IIC_VecGeneral, []>; + def VCMPSQ : VXForm_BF3_VAB5<321, (outs crrc:$BF), (ins vrrc:$vA, vrrc:$vB), + "vcmpsq $BF, $vA, $vB", IIC_VecGeneral, []>; + def VRLQNM : VX1_VT5_VA5_VB5<325, "vrlqnm", + [(set v1i128:$vD, + (int_ppc_altivec_vrlqnm v1i128:$vA, + v1i128:$vB))]>; + def VRLQMI : VXForm_1<69, (outs vrrc:$vD), + (ins vrrc:$vA, vrrc:$vB, vrrc:$vDi), + "vrlqmi $vD, $vA, $vB", IIC_VecFP, + [(set v1i128:$vD, + (int_ppc_altivec_vrlqmi v1i128:$vA, v1i128:$vB, + v1i128:$vDi))]>, + RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; + def VSLQ : VX1_VT5_VA5_VB5<261, "vslq", []>; + def VSRAQ : VX1_VT5_VA5_VB5<773, "vsraq", []>; + def VSRQ : VX1_VT5_VA5_VB5<517, "vsrq", []>; + def VRLQ : VX1_VT5_VA5_VB5<5, "vrlq", []>; + def XSCVQPUQZ : X_VT5_XO5_VB5<63, 0, 836, "xscvqpuqz", []>; + def XSCVQPSQZ : X_VT5_XO5_VB5<63, 8, 836, "xscvqpsqz", []>; + def XSCVUQQP : X_VT5_XO5_VB5<63, 3, 836, "xscvuqqp", []>; + def XSCVSQQP : X_VT5_XO5_VB5<63, 11, 836, "xscvsqqp", []>; +} + +let Predicates = [IsISA3_1, HasVSX] in { + def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>; + def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>; +} + +// Multiclass defining patterns for Set Boolean Extension Reverse Instructions. 
+// This is analogous to the CRNotPat multiclass but specifically for Power10 +// and newer subtargets since the extended forms use Set Boolean instructions. +// The first two anonymous patterns defined are actually a duplicate of those +// in CRNotPat, but it is preferable to define both multiclasses as complete +// ones rather than pulling that small common section out. +multiclass P10ReverseSetBool<dag pattern, dag result> { + def : Pat<pattern, (crnot result)>; + def : Pat<(not pattern), result>; + + def : Pat<(i32 (zext pattern)), + (SETBCR result)>; + def : Pat<(i64 (zext pattern)), + (SETBCR8 result)>; + + def : Pat<(i32 (sext pattern)), + (SETNBCR result)>; + def : Pat<(i64 (sext pattern)), + (SETNBCR8 result)>; + + def : Pat<(i32 (anyext pattern)), + (SETBCR result)>; + def : Pat<(i64 (anyext pattern)), + (SETBCR8 result)>; +} + +multiclass IntSetP10RevSetBool<SDNode SetCC, ValueType Ty, ImmLeaf ZExtTy, + ImmLeaf SExtTy, PatLeaf Cmpi, PatLeaf Cmpli, + PatLeaf Cmp, PatLeaf Cmpl> { + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), + (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), + (EXTRACT_SUBREG (Cmp $s1, $s2), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), + (EXTRACT_SUBREG (Cmpl $s1, $s2), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), + (EXTRACT_SUBREG (Cmp $s1, $s2), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), + (EXTRACT_SUBREG (Cmp $s1, $s2), sub_eq)>; + + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETUGE)), + (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETGE)), + (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETULE)), + (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETLE)), + (EXTRACT_SUBREG (Cmpi $s1, 
imm:$imm), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, SExtTy:$imm, SETNE)), + (EXTRACT_SUBREG (Cmpi $s1, imm:$imm), sub_eq)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, ZExtTy:$imm, SETNE)), + (EXTRACT_SUBREG (Cmpli $s1, imm:$imm), sub_eq)>; +} + +multiclass FSetP10RevSetBool<SDNode SetCC, ValueType Ty, PatLeaf FCmp> { + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>; + defm : P10ReverseSetBool<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)), + (EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>; +} + +let Predicates = [IsISA3_1] in { + def : Pat<(i32 (zext i1:$in)), + (SETBC $in)>; + def : Pat<(i64 (zext i1:$in)), + (SETBC8 $in)>; + def : Pat<(i32 (sext i1:$in)), + (SETNBC $in)>; + def : Pat<(i64 (sext i1:$in)), + (SETNBC8 $in)>; + def : Pat<(i32 (anyext i1:$in)), + (SETBC $in)>; + def : Pat<(i64 (anyext i1:$in)), + (SETBC8 $in)>; + + // Instantiation of the set boolean reverse patterns for 32-bit integers. + defm : IntSetP10RevSetBool<setcc, i32, immZExt16, imm32SExt16, + CMPWI, CMPLWI, CMPW, CMPLW>; + defm : P10ReverseSetBool<(i1 (setcc i32:$s1, imm:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLWI (XORIS $s1, (HI16 imm:$imm)), + (LO16 imm:$imm)), sub_eq)>; + + // Instantiation of the set boolean reverse patterns for 64-bit integers. 
+ defm : IntSetP10RevSetBool<setcc, i64, immZExt16, imm64SExt16, + CMPDI, CMPLDI, CMPD, CMPLD>; + defm : P10ReverseSetBool<(i1 (setcc i64:$s1, imm64ZExt32:$imm, SETNE)), + (EXTRACT_SUBREG (CMPLDI (XORIS8 $s1, (HI16 imm:$imm)), + (LO16 imm:$imm)), sub_eq)>; +} + +// Instantiation of the set boolean reverse patterns for f32, f64, f128. +let Predicates = [IsISA3_1, HasFPU] in { + defm : FSetP10RevSetBool<setcc, f32, FCMPUS>; + defm : FSetP10RevSetBool<setcc, f64, FCMPUD>; + defm : FSetP10RevSetBool<setcc, f128, XSCMPUQP>; } //---------------------------- Anonymous Patterns ----------------------------// let Predicates = [IsISA3_1] in { + // Exploit the vector multiply high instructions using intrinsics. + def : Pat<(v4i32 (int_ppc_altivec_vmulhsw v4i32:$vA, v4i32:$vB)), + (v4i32 (VMULHSW $vA, $vB))>; + def : Pat<(v4i32 (int_ppc_altivec_vmulhuw v4i32:$vA, v4i32:$vB)), + (v4i32 (VMULHUW $vA, $vB))>; + def : Pat<(v2i64 (int_ppc_altivec_vmulhsd v2i64:$vA, v2i64:$vB)), + (v2i64 (VMULHSD $vA, $vB))>; + def : Pat<(v2i64 (int_ppc_altivec_vmulhud v2i64:$vA, v2i64:$vB)), + (v2i64 (VMULHUD $vA, $vB))>; def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)), (v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>; def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)), @@ -992,12 +2517,82 @@ let Predicates = [IsISA3_1] in { (v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>; def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)), (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>; - def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, -1)), + def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 1)), (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_lt)>; def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)), (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>; + + def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 8)), + (v1i128 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VRRC))>; + def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 
16)), + (v1i128 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VRRC))>; + def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 32)), + (v1i128 (COPY_TO_REGCLASS (LXVRWX xoaddr:$src), VRRC))>; + def : Pat <(v1i128 (PPClxvrzx xoaddr:$src, 64)), + (v1i128 (COPY_TO_REGCLASS (LXVRDX xoaddr:$src), VRRC))>; + + def : Pat<(v1i128 (rotl v1i128:$vA, v1i128:$vB)), + (v1i128 (VRLQ v1i128:$vA, v1i128:$vB))>; + + def : Pat <(v2i64 (PPCxxsplti32dx v2i64:$XT, i32:$XI, i32:$IMM32)), + (v2i64 (XXSPLTI32DX v2i64:$XT, i32:$XI, i32:$IMM32))>; +} + +let Predicates = [IsISA3_1, HasVSX] in { + def : Pat<(v16i8 (int_ppc_vsx_xvcvspbf16 v16i8:$XA)), + (COPY_TO_REGCLASS (XVCVSPBF16 RCCp.AToVSRC), VRRC)>; + def : Pat<(v16i8 (int_ppc_vsx_xvcvbf16spn v16i8:$XA)), + (COPY_TO_REGCLASS (XVCVBF16SPN RCCp.AToVSRC), VRRC)>; } +let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in { + // Store element 0 of a VSX register to memory + def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$src, 0)), xoaddr:$dst), + (STXVRBX (COPY_TO_REGCLASS v16i8:$src, VSRC), xoaddr:$dst)>; + def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$src, 0)), xoaddr:$dst), + (STXVRHX (COPY_TO_REGCLASS v8i16:$src, VSRC), xoaddr:$dst)>; + def : Pat<(store (i32 (extractelt v4i32:$src, 0)), xoaddr:$dst), + (STXVRWX $src, xoaddr:$dst)>; + def : Pat<(store (f32 (extractelt v4f32:$src, 0)), xoaddr:$dst), + (STXVRWX $src, xoaddr:$dst)>; + def : Pat<(store (i64 (extractelt v2i64:$src, 0)), xoaddr:$dst), + (STXVRDX $src, xoaddr:$dst)>; + def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst), + (STXVRDX $src, xoaddr:$dst)>; + } + +// FIXME: The swap is overkill when the shift amount is a constant. +// We should just fix the constant in the DAG. 
+let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { + def : Pat<(v1i128 (shl v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSLQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; + def : Pat<(v1i128 (PPCshl v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSLQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; + def : Pat<(v1i128 (srl v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSRQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; + def : Pat<(v1i128 (PPCsrl v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSRQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; + def : Pat<(v1i128 (sra v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSRAQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; + def : Pat<(v1i128 (PPCsra v1i128:$VRA, v1i128:$VRB)), + (v1i128 (VSRAQ v1i128:$VRA, + (XXPERMDI (COPY_TO_REGCLASS $VRB, VSRC), + (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; +} + +class xxevalPattern <dag pattern, bits<8> imm> : + Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} + let AddedComplexity = 400, Predicates = [PrefixInstrs] in { def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A, i32immNonAllOneNonZero:$A, @@ -1010,6 +2605,44 @@ let AddedComplexity = 400, Predicates = [PrefixInstrs] in { def : Pat<(f64 nzFPImmAsi32:$A), (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)), VSFRC)>; + + // Anonymous patterns for XXEVAL + // AND + // and(A, B, C) + def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; + // and(A, xor(B, C)) + def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; + // and(A, or(B, C)) + def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; + // and(A, nor(B, C)) + def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (or v4i32:$vB, v4i32:$vC))), + 8>; + // and(A, eqv(B, C)) + def : xxevalPattern<(and v4i32:$vA, 
(vnot_ppc (xor v4i32:$vB, v4i32:$vC))), + 9>; + // and(A, nand(B, C)) + def : xxevalPattern<(and v4i32:$vA, (vnot_ppc (and v4i32:$vB, v4i32:$vC))), + 14>; + + // NAND + // nand(A, B, C) + def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), + !sub(255, 1)>; + // nand(A, xor(B, C)) + def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), + !sub(255, 6)>; + // nand(A, or(B, C)) + def : xxevalPattern<(vnot_ppc (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), + !sub(255, 7)>; + // nand(A, nor(B, C)) + def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), + !sub(255, 8)>; + // nand(A, eqv(B, C)) + def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), + !sub(255, 9)>; + // nand(A, nand(B, C)) + def : xxevalPattern<(or (vnot_ppc v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), + !sub(255, 14)>; } let Predicates = [PrefixInstrs] in { diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td deleted file mode 100644 index 2265af2815cb..000000000000 --- a/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ /dev/null @@ -1,1212 +0,0 @@ -//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file describes the QPX extension to the PowerPC instruction set. -// Reference: -// Book Q: QPX Architecture Definition. IBM (as updated in) 2011. 
-// -//===----------------------------------------------------------------------===// - -def PPCRegQFRCAsmOperand : AsmOperandClass { - let Name = "RegQFRC"; let PredicateMethod = "isRegNumber"; -} -def qfrc : RegisterOperand<QFRC> { - let ParserMatchClass = PPCRegQFRCAsmOperand; -} -def PPCRegQSRCAsmOperand : AsmOperandClass { - let Name = "RegQSRC"; let PredicateMethod = "isRegNumber"; -} -def qsrc : RegisterOperand<QSRC> { - let ParserMatchClass = PPCRegQSRCAsmOperand; -} -def PPCRegQBRCAsmOperand : AsmOperandClass { - let Name = "RegQBRC"; let PredicateMethod = "isRegNumber"; -} -def qbrc : RegisterOperand<QBRC> { - let ParserMatchClass = PPCRegQBRCAsmOperand; -} - -//===----------------------------------------------------------------------===// -// Helpers for defining instructions that directly correspond to intrinsics. - -// QPXA1_Int - A AForm_1 intrinsic definition. -class QPXA1_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_FPFused, - [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; -// QPXA1s_Int - A AForm_1 intrinsic definition (simple instructions). -class QPXA1s_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_1<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - !strconcat(opc, " $FRT, $FRA, $FRC, $FRB"), IIC_VecPerm, - [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; -// QPXA2_Int - A AForm_2 intrinsic definition. -class QPXA2_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_2<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPGeneral, - [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>; -// QPXA3_Int - A AForm_3 intrinsic definition. 
-class QPXA3_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_3<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), - !strconcat(opc, " $FRT, $FRA, $FRC"), IIC_FPGeneral, - [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRC))]>; -// QPXA4_Int - A AForm_4a intrinsic definition. -class QPXA4_Int<bits<6> opcode, bits<5> xo, string opc, Intrinsic IntID> - : AForm_4a<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB), - !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral, - [(set v4f64:$FRT, (IntID v4f64:$FRB))]>; -// QPXX18_Int - A XForm_18 intrinsic definition. -class QPXX18_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_18<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - !strconcat(opc, " $FRT, $FRA, $FRB"), IIC_FPCompare, - [(set v4f64:$FRT, (IntID v4f64:$FRA, v4f64:$FRB))]>; -// QPXX19_Int - A XForm_19 intrinsic definition. -class QPXX19_Int<bits<6> opcode, bits<10> xo, string opc, Intrinsic IntID> - : XForm_19<opcode, xo, (outs qfrc:$FRT), (ins qfrc:$FRB), - !strconcat(opc, " $FRT, $FRB"), IIC_FPGeneral, - [(set v4f64:$FRT, (IntID v4f64:$FRB))]>; - -//===----------------------------------------------------------------------===// -// Pattern Frags. 
- -def extloadv4f32 : PatFrag<(ops node:$ptr), (extload node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr), - (truncstore node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32; -}]>; -def pre_truncstv4f32 : PatFrag<(ops node:$val, node:$base, node:$offset), - (pre_truncst node:$val, - node:$base, node:$offset), [{ - return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32; -}]>; - -def fround_inexact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 0; -}]>; - -def fround_exact : PatFrag<(ops node:$val), (fpround node:$val), [{ - return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() == 1; -}]>; - -let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs. - def u12 : ImmLeaf<i32, [{ return (Imm & 0xFFF) == Imm; }]>; - -//===----------------------------------------------------------------------===// -// Instruction Definitions. - -def HasQPX : Predicate<"Subtarget->hasQPX()">; -let Predicates = [HasQPX] in { -let DecoderNamespace = "QPX" in { -let hasSideEffects = 0 in { // QPX instructions don't have side effects. 
-let Uses = [RM] in { - // Add Instructions - let isCommutable = 1 in { - def QVFADD : AForm_2<4, 21, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfadd $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fadd v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFADDS : QPXA2_Int<0, 21, "qvfadds", int_ppc_qpx_qvfadds>; - def QVFADDSs : AForm_2<0, 21, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfadds $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fadd v4f32:$FRA, v4f32:$FRB))]>; - } - def QVFSUB : AForm_2<4, 20, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfsub $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fsub v4f64:$FRA, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSUBS : QPXA2_Int<0, 20, "qvfsubs", int_ppc_qpx_qvfsubs>; - def QVFSUBSs : AForm_2<0, 20, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfsubs $FRT, $FRA, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fsub v4f32:$FRA, v4f32:$FRB))]>; - - // Estimate Instructions - def QVFRE : AForm_4a<4, 24, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfre $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfre v4f64:$FRB))]>; - def QVFRES : QPXA4_Int<0, 24, "qvfres", int_ppc_qpx_qvfres>; - let isCodeGenOnly = 1 in - def QVFRESs : AForm_4a<0, 24, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfres $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfre v4f32:$FRB))]>; - - def QVFRSQRTE : AForm_4a<4, 26, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrsqrte $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (PPCfrsqrte v4f64:$FRB))]>; - def QVFRSQRTES : QPXA4_Int<0, 26, "qvfrsqrtes", int_ppc_qpx_qvfrsqrtes>; - let isCodeGenOnly = 1 in - def QVFRSQRTESs : AForm_4a<0, 26, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrsqrtes $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (PPCfrsqrte v4f32:$FRB))]>; - - // Multiply Instructions - let isCommutable = 1 in { - def QVFMUL : AForm_3<4, 25, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC), - "qvfmul $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set 
v4f64:$FRT, (fmul v4f64:$FRA, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFMULS : QPXA3_Int<0, 25, "qvfmuls", int_ppc_qpx_qvfmuls>; - def QVFMULSs : AForm_3<0, 25, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC), - "qvfmuls $FRT, $FRA, $FRC", IIC_FPGeneral, - [(set v4f32:$FRT, (fmul v4f32:$FRA, v4f32:$FRC))]>; - } - def QVFXMUL : QPXA3_Int<4, 17, "qvfxmul", int_ppc_qpx_qvfxmul>; - def QVFXMULS : QPXA3_Int<0, 17, "qvfxmuls", int_ppc_qpx_qvfxmuls>; - - // Multiply-add instructions - def QVFMADD : AForm_1<4, 29, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>; - def QVFMADDSs : AForm_1<0, 29, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>; - def QVFNMADD : AForm_1<4, 31, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>; - def QVFNMADDSs : AForm_1<0, 31, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - v4f32:$FRB)))]>; - def QVFMSUB : AForm_1<4, 28, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>; - def QVFMSUBSs : AForm_1<0, 28, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, 
(fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB)))]>; - def QVFNMSUB : AForm_1<4, 30, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB), - "qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC, - (fneg v4f64:$FRB))))]>; - let isCodeGenOnly = 1 in - def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>; - def QVFNMSUBSs : AForm_1<0, 30, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB), - "qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused, - [(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC, - (fneg v4f32:$FRB))))]>; - def QVFXMADD : QPXA1_Int<4, 9, "qvfxmadd", int_ppc_qpx_qvfxmadd>; - def QVFXMADDS : QPXA1_Int<0, 9, "qvfxmadds", int_ppc_qpx_qvfxmadds>; - def QVFXXNPMADD : QPXA1_Int<4, 11, "qvfxxnpmadd", int_ppc_qpx_qvfxxnpmadd>; - def QVFXXNPMADDS : QPXA1_Int<0, 11, "qvfxxnpmadds", int_ppc_qpx_qvfxxnpmadds>; - def QVFXXCPNMADD : QPXA1_Int<4, 3, "qvfxxcpnmadd", int_ppc_qpx_qvfxxcpnmadd>; - def QVFXXCPNMADDS : QPXA1_Int<0, 3, "qvfxxcpnmadds", int_ppc_qpx_qvfxxcpnmadds>; - def QVFXXMADD : QPXA1_Int<4, 1, "qvfxxmadd", int_ppc_qpx_qvfxxmadd>; - def QVFXXMADDS : QPXA1_Int<0, 1, "qvfxxmadds", int_ppc_qpx_qvfxxmadds>; - - // Select Instruction - let isCodeGenOnly = 1 in - def QVFSEL : QPXA1s_Int<4, 23, "qvfsel", int_ppc_qpx_qvfsel>; - def QVFSELb : AForm_1<4, 23, (outs qfrc:$FRT), - (ins qbrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (vselect v4i1:$FRA, - v4f64:$FRC, v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbs : AForm_1<4, 23, (outs qsrc:$FRT), - (ins qbrc:$FRA, qsrc:$FRB, qsrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (vselect v4i1:$FRA, - v4f32:$FRC, v4f32:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFSELbb: AForm_1<4, 23, (outs qbrc:$FRT), - (ins qbrc:$FRA, qbrc:$FRB, qbrc:$FRC), - "qvfsel $FRT, $FRA, $FRC, $FRB", IIC_VecPerm, - [(set v4i1:$FRT, (vselect v4i1:$FRA, - v4i1:$FRC, v4i1:$FRB))]>; 
- - // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after - // instruction selection into a branch sequence. - def SELECT_CC_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crrc:$cond, qfrc:$T, qfrc:$F, - i32imm:$BROPC), "#SELECT_CC_QFRC", - []>; - def SELECT_CC_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crrc:$cond, qsrc:$T, qsrc:$F, - i32imm:$BROPC), "#SELECT_CC_QSRC", - []>; - def SELECT_CC_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crrc:$cond, qbrc:$T, qbrc:$F, - i32imm:$BROPC), "#SELECT_CC_QBRC", - []>; - - // SELECT_* pseudo instructions, like SELECT_CC_* but taking condition - // register bit directly. - def SELECT_QFRC: PPCCustomInserterPseudo<(outs qfrc:$dst), (ins crbitrc:$cond, - qfrc:$T, qfrc:$F), "#SELECT_QFRC", - [(set v4f64:$dst, - (select i1:$cond, v4f64:$T, v4f64:$F))]>; - def SELECT_QSRC: PPCCustomInserterPseudo<(outs qsrc:$dst), (ins crbitrc:$cond, - qsrc:$T, qsrc:$F), "#SELECT_QSRC", - [(set v4f32:$dst, - (select i1:$cond, v4f32:$T, v4f32:$F))]>; - def SELECT_QBRC: PPCCustomInserterPseudo<(outs qbrc:$dst), (ins crbitrc:$cond, - qbrc:$T, qbrc:$F), "#SELECT_QBRC", - [(set v4i1:$dst, - (select i1:$cond, v4i1:$T, v4i1:$F))]>; - - // Convert and Round Instructions - def QVFCTID : QPXX19_Int<4, 814, "qvfctid", int_ppc_qpx_qvfctid>; - let isCodeGenOnly = 1 in - def QVFCTIDb : XForm_19<4, 814, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfctid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCTIDU : QPXX19_Int<4, 942, "qvfctidu", int_ppc_qpx_qvfctidu>; - def QVFCTIDZ : QPXX19_Int<4, 815, "qvfctidz", int_ppc_qpx_qvfctidz>; - def QVFCTIDUZ : QPXX19_Int<4, 943, "qvfctiduz", int_ppc_qpx_qvfctiduz>; - def QVFCTIW : QPXX19_Int<4, 14, "qvfctiw", int_ppc_qpx_qvfctiw>; - def QVFCTIWU : QPXX19_Int<4, 142, "qvfctiwu", int_ppc_qpx_qvfctiwu>; - def QVFCTIWZ : QPXX19_Int<4, 15, "qvfctiwz", int_ppc_qpx_qvfctiwz>; - def QVFCTIWUZ : QPXX19_Int<4, 143, "qvfctiwuz", int_ppc_qpx_qvfctiwuz>; - def QVFCFID : QPXX19_Int<4, 846, 
"qvfcfid", int_ppc_qpx_qvfcfid>; - let isCodeGenOnly = 1 in - def QVFCFIDb : XForm_19<4, 846, (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfcfid $FRT, $FRB", IIC_FPGeneral, []>; - - def QVFCFIDU : QPXX19_Int<4, 974, "qvfcfidu", int_ppc_qpx_qvfcfidu>; - def QVFCFIDS : QPXX19_Int<0, 846, "qvfcfids", int_ppc_qpx_qvfcfids>; - def QVFCFIDUS : QPXX19_Int<0, 974, "qvfcfidus", int_ppc_qpx_qvfcfidus>; - - let isCodeGenOnly = 1 in - def QVFRSP : QPXX19_Int<4, 12, "qvfrsp", int_ppc_qpx_qvfrsp>; - def QVFRSPs : XForm_19<4, 12, - (outs qsrc:$FRT), (ins qfrc:$FRB), - "qvfrsp $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround_inexact v4f64:$FRB))]>; - - def QVFRIZ : XForm_19<4, 424, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ftrunc v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIZs : XForm_19<4, 424, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfriz $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ftrunc v4f32:$FRB))]>; - - def QVFRIN : XForm_19<4, 392, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fround v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRINs : XForm_19<4, 392, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrin $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fround v4f32:$FRB))]>; - - def QVFRIP : XForm_19<4, 456, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (fceil v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIPs : XForm_19<4, 456, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrip $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (fceil v4f32:$FRB))]>; - - def QVFRIM : XForm_19<4, 488, (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f64:$FRT, (ffloor v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFRIMs : XForm_19<4, 488, (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfrim $FRT, $FRB", IIC_FPGeneral, - [(set v4f32:$FRT, (ffloor v4f32:$FRB))]>; - - // Move Instructions - def QVFMR : 
XForm_19<4, 72, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4f64:$FRT, v4f64:$FRB) */]>; - let isCodeGenOnly = 1 in { - def QVFMRs : XForm_19<4, 72, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4f32:$FRT, v4f32:$FRB) */]>; - def QVFMRb : XForm_19<4, 72, - (outs qbrc:$FRT), (ins qbrc:$FRB), - "qvfmr $FRT, $FRB", IIC_VecPerm, - [/* (set v4i1:$FRT, v4i1:$FRB) */]>; - } - def QVFNEG : XForm_19<4, 40, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFNEGs : XForm_19<4, 40, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfneg $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg v4f32:$FRB))]>; - def QVFABS : XForm_19<4, 264, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fabs v4f64:$FRB))]>; - let isCodeGenOnly = 1 in - def QVFABSs : XForm_19<4, 264, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fabs v4f32:$FRB))]>; - def QVFNABS : XForm_19<4, 136, - (outs qfrc:$FRT), (ins qfrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fneg (fabs v4f64:$FRB)))]>; - let isCodeGenOnly = 1 in - def QVFNABSs : XForm_19<4, 136, - (outs qsrc:$FRT), (ins qsrc:$FRB), - "qvfnabs $FRT, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fneg (fabs v4f32:$FRB)))]>; - def QVFCPSGN : XForm_18<4, 8, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f64:$FRT, (fcopysign v4f64:$FRB, v4f64:$FRA))]>; - let isCodeGenOnly = 1 in - def QVFCPSGNs : XForm_18<4, 8, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcpsgn $FRT, $FRA, $FRB", IIC_VecPerm, - [(set v4f32:$FRT, (fcopysign v4f32:$FRB, v4f32:$FRA))]>; - - def QVALIGNI : Z23Form_1<4, 5, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - 
(PPCqvaligni v4f64:$FRA, v4f64:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIs : Z23Form_1<4, 5, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvaligni v4f32:$FRA, v4f32:$FRB, - (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVALIGNIb : Z23Form_1<4, 5, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u2imm:$idx), - "qvaligni $FRT, $FRA, $FRB, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvaligni v4i1:$FRA, v4i1:$FRB, - (i32 imm:$idx)))]>; - - def QVESPLATI : Z23Form_2<4, 37, - (outs qfrc:$FRT), (ins qfrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvesplati v4f64:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIs : Z23Form_2<4, 37, - (outs qsrc:$FRT), (ins qsrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvesplati v4f32:$FRA, (i32 imm:$idx)))]>; - let isCodeGenOnly = 1 in - def QVESPLATIb : Z23Form_2<4, 37, - (outs qbrc:$FRT), (ins qbrc:$FRA, u2imm:$idx), - "qvesplati $FRT, $FRA, $idx", IIC_VecPerm, - [(set v4i1:$FRT, - (PPCqvesplati v4i1:$FRA, (i32 imm:$idx)))]>; - - def QVFPERM : AForm_1<4, 6, - (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f64:$FRT, - (PPCqvfperm v4f64:$FRA, v4f64:$FRB, v4f64:$FRC))]>; - let isCodeGenOnly = 1 in - def QVFPERMs : AForm_1<4, 6, - (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qfrc:$FRC), - "qvfperm $FRT, $FRA, $FRB, $FRC", IIC_VecPerm, - [(set v4f32:$FRT, - (PPCqvfperm v4f32:$FRA, v4f32:$FRB, v4f64:$FRC))]>; - - let isReMaterializable = 1, isAsCheapAsAMove = 1 in - def QVGPCI : Z23Form_3<4, 133, - (outs qfrc:$FRT), (ins u12imm:$idx), - "qvgpci $FRT, $idx", IIC_VecPerm, - [(set v4f64:$FRT, (PPCqvgpci (u12:$idx)))]>; - - // Compare Instruction - let isCodeGenOnly = 1 in - def QVFTSTNAN : QPXX18_Int<4, 64, "qvftstnan", int_ppc_qpx_qvftstnan>; 
- def QVFTSTNANb : XForm_18<4, 64, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFTSTNANbs : XForm_18<4, 64, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvftstnan $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETUO))]>; - let isCodeGenOnly = 1 in - def QVFCMPLT : QPXX18_Int<4, 96, "qvfcmplt", int_ppc_qpx_qvfcmplt>; - def QVFCMPLTb : XForm_18<4, 96, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPLTbs : XForm_18<4, 96, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmplt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOLT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGT : QPXX18_Int<4, 32, "qvfcmpgt", int_ppc_qpx_qvfcmpgt>; - def QVFCMPGTb : XForm_18<4, 32, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPGTbs : XForm_18<4, 32, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpgt $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOGT))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQ : QPXX18_Int<4, 0, "qvfcmpeq", int_ppc_qpx_qvfcmpeq>; - def QVFCMPEQb : XForm_18<4, 0, (outs qbrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f64:$FRA, v4f64:$FRB, SETOEQ))]>; - let isCodeGenOnly = 1 in - def QVFCMPEQbs : XForm_18<4, 0, (outs qbrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB), - "qvfcmpeq $FRT, $FRA, $FRB", IIC_FPCompare, - [(set v4i1:$FRT, - (setcc v4f32:$FRA, v4f32:$FRB, SETOEQ))]>; - - let isCodeGenOnly = 1 in - def QVFLOGICAL : XForm_20<4, 4, - (outs 
qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - def QVFLOGICALb : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - let isCodeGenOnly = 1 in - def QVFLOGICALs : XForm_20<4, 4, - (outs qbrc:$FRT), (ins qbrc:$FRA, qbrc:$FRB, u12imm:$tttt), - "qvflogical $FRT, $FRA, $FRB, $tttt", IIC_VecPerm, []>; - - // Load indexed instructions - let mayLoad = 1 in { - def QVLFDX : XForm_1_memOp<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (load xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFDXb : XForm_1_memOp<31, 583, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfdx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFDXA : XForm_1<31, 583, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFDUX : XForm_1<31, 615, - (outs qfrc:$FRT, ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfdux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - let RC = 1 in - def QVLFDUXA : XForm_1<31, 615, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSX : XForm_1_memOp<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f64:$FRT, (extloadv4f32 xoaddr:$src))]>; - - let isCodeGenOnly = 1 in - def QVLFSXb : XForm_1<31, 519, - (outs qbrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4i1:$FRT, (PPCqvlfsb xoaddr:$src))]>; - let isCodeGenOnly = 1 in - def QVLFSXs : XForm_1_memOp<31, 519, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfsx $FRT, $src", IIC_LdStLFD, - [(set v4f32:$FRT, (load xoaddr:$src))]>; - - let RC = 1 in - def QVLFSXA : XForm_1<31, 519, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFSUX : XForm_1<31, 551, - (outs qsrc:$FRT, 
ptr_rc_nor0:$ea_result), - (ins memrr:$src), - "qvlfsux $FRT, $src", IIC_LdStLFDU, []>, - RegConstraint<"$src.ptrreg = $ea_result">, - NoEncode<"$ea_result">; - - let RC = 1 in - def QVLFSUXA : XForm_1<31, 551, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDX : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDXA : XForm_1<31, 71, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCDUX : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcdux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCDUXA : XForm_1<31, 103, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcduxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSX : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLFCSXs : XForm_1<31, 7, - (outs qsrc:$FRT), (ins memrr:$src), - "qvlfcsx $FRT, $src", IIC_LdStLFD, []>; - - let RC = 1 in - def QVLFCSXA : XForm_1<31, 7, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFCSUX : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsux $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFCSUXA : XForm_1<31, 39, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfcsuxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWAX : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwax $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWAXA : XForm_1<31, 871, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwaxa $FRT, $src", IIC_LdStLFD, []>; - - def QVLFIWZX : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzx $FRT, $src", IIC_LdStLFD, []>; - let RC = 1 in - def QVLFIWZXA : XForm_1<31, 839, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlfiwzxa $FRT, $src", IIC_LdStLFD, []>; - } - - - def QVLPCLDX : XForm_1<31, 582, - (outs qfrc:$FRT), (ins memrr:$src), - 
"qvlpcldx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCLSX : XForm_1<31, 518, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpclsx $FRT, $src", IIC_LdStLFD, []>; - let isCodeGenOnly = 1 in - def QVLPCLSXint : XForm_11<31, 518, - (outs qfrc:$FRT), (ins G8RC:$src), - "qvlpclsx $FRT, 0, $src", IIC_LdStLFD, []>; - def QVLPCRDX : XForm_1<31, 70, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrdx $FRT, $src", IIC_LdStLFD, []>; - def QVLPCRSX : XForm_1<31, 6, - (outs qfrc:$FRT), (ins memrr:$src), - "qvlpcrsx $FRT, $src", IIC_LdStLFD, []>; - - // Store indexed instructions - let mayStore = 1 in { - def QVSTFDX : XForm_8_memOp<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, - [(store qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFDXb : XForm_8_memOp<31, 711, - (outs), (ins qbrc:$FRT, memrr:$dst), - "qvstfdx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def QVSTFDXA : XForm_8<31, 711, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUX : XForm_8<31, 743, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfdux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFDUXA : XForm_8<31, 743, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDXI : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDXIA : XForm_8<31, 709, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFDUXI : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFDUXIA : XForm_8<31, 741, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSX : XForm_8_memOp<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - 
[(truncstorev4f32 qfrc:$FRT, xoaddr:$dst)]>; - let isCodeGenOnly = 1 in - def QVSTFSXs : XForm_8_memOp<31, 647, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfsx $FRT, $dst", IIC_LdStSTFD, - [(store qsrc:$FRT, xoaddr:$dst)]>; - - let RC = 1 in - def QVSTFSXA : XForm_8<31, 647, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUX : XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qsrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - let isCodeGenOnly = 1 in - def QVSTFSUXs: XForm_8<31, 679, (outs ptr_rc_nor0:$ea_res), - (ins qfrc:$FRT, memrr:$dst), - "qvstfsux $FRT, $dst", IIC_LdStSTFDU, []>, - RegConstraint<"$dst.ptrreg = $ea_res">, - NoEncode<"$ea_res">; - - let RC = 1 in - def QVSTFSUXA : XForm_8<31, 679, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSXI : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSXIA : XForm_8<31, 645, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFSUXI : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFSUXIA : XForm_8<31, 677, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDX : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXA : XForm_8<31, 199, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSX : XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - let isCodeGenOnly = 1 in - def QVSTFCSXs : XForm_8<31, 135, - (outs), (ins qsrc:$FRT, memrr:$dst), - "qvstfcsx $FRT, $dst", IIC_LdStSTFD, []>; - - let RC = 1 in - def QVSTFCSXA : 
XForm_8<31, 135, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUX : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdux $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXA : XForm_8<31, 231, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUX : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsux $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXA : XForm_8<31, 167, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxa $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDXI : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDXIA : XForm_8<31, 197, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcdxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSXI : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSXIA : XForm_8<31, 133, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCDUXI : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCDUXIA : XForm_8<31, 229, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcduxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFCSUXI : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxi $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFCSUXIA : XForm_8<31, 165, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfcsuxia $FRT, $dst", IIC_LdStSTFD, []>; - - def QVSTFIWX : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwx $FRT, $dst", IIC_LdStSTFD, []>; - let RC = 1 in - def QVSTFIWXA : XForm_8<31, 967, - (outs), (ins qfrc:$FRT, memrr:$dst), - "qvstfiwxa $FRT, $dst", IIC_LdStSTFD, []>; - } -} - -} // neverHasSideEffects -} - -def : InstAlias<"qvfclr $FRT", - (QVFLOGICALb 
qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 0)>; -def : InstAlias<"qvfand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 1)>; -def : InstAlias<"qvfandc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 4)>; -def : InstAlias<"qvfctfb $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 5)>; -def : InstAlias<"qvfxor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 6)>; -def : InstAlias<"qvfor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 7)>; -def : InstAlias<"qvfnor $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 8)>; -def : InstAlias<"qvfequ $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 9)>; -def : InstAlias<"qvfnot $FRT, $FRA", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRA, 10)>; -def : InstAlias<"qvforc $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 13)>; -def : InstAlias<"qvfnand $FRT, $FRA, $FRB", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRA, qbrc:$FRB, 14)>; -def : InstAlias<"qvfset $FRT", - (QVFLOGICALb qbrc:$FRT, qbrc:$FRT, qbrc:$FRT, 15)>; - -//===----------------------------------------------------------------------===// -// Additional QPX Patterns -// - -def : Pat<(v4f64 (scalar_to_vector f64:$A)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(v4f32 (scalar_to_vector f32:$A)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 0)), - (EXTRACT_SUBREG $S, sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, 1)), - (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 2)), - (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (extractelt v4f64:$S, 3)), - (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; - -def : Pat<(f32 (extractelt v4f32:$S, 1)), - (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 2)), - (EXTRACT_SUBREG (QVESPLATIs 
$S, 2), sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, 3)), - (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; - -def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERM $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; -def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), - (EXTRACT_SUBREG (QVFPERMs $S, $S, - (QVLPCLSXint (RLDICR $F, 2, - /* 63-2 = */ 61))), - sub_64)>; - -def : Pat<(int_ppc_qpx_qvfperm v4f64:$A, v4f64:$B, v4f64:$C), - (QVFPERM $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvfcpsgn v4f64:$A, v4f64:$B), - (QVFCPSGN $A, $B)>; - -// FCOPYSIGN's operand types need not agree. -def : Pat<(fcopysign v4f64:$frB, v4f32:$frA), - (QVFCPSGN (COPY_TO_REGCLASS $frA, QFRC), $frB)>; -def : Pat<(fcopysign QSRC:$frB, QFRC:$frA), - (QVFCPSGNs (COPY_TO_REGCLASS $frA, QSRC), $frB)>; - -def : Pat<(int_ppc_qpx_qvfneg v4f64:$A), (QVFNEG $A)>; -def : Pat<(int_ppc_qpx_qvfabs v4f64:$A), (QVFABS $A)>; -def : Pat<(int_ppc_qpx_qvfnabs v4f64:$A), (QVFNABS $A)>; - -def : Pat<(int_ppc_qpx_qvfriz v4f64:$A), (QVFRIZ $A)>; -def : Pat<(int_ppc_qpx_qvfrin v4f64:$A), (QVFRIN $A)>; -def : Pat<(int_ppc_qpx_qvfrip v4f64:$A), (QVFRIP $A)>; -def : Pat<(int_ppc_qpx_qvfrim v4f64:$A), (QVFRIM $A)>; - -def : Pat<(int_ppc_qpx_qvfre v4f64:$A), (QVFRE $A)>; -def : Pat<(int_ppc_qpx_qvfrsqrte v4f64:$A), (QVFRSQRTE $A)>; - -def : Pat<(int_ppc_qpx_qvfadd v4f64:$A, v4f64:$B), - (QVFADD $A, $B)>; -def : Pat<(int_ppc_qpx_qvfsub v4f64:$A, v4f64:$B), - (QVFSUB $A, $B)>; -def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B), - (QVFMUL $A, $B)>; - -// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b) -def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B), - (QVFNMSUB $A, $C, $B)>; -def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; -def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B), - (QVFNMSUBSs $A, $C, $B)>; - -def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C), 
- (QVFMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmadd v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMADD $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFMSUB $A, $B, $C)>; -def : Pat<(int_ppc_qpx_qvfnmsub v4f64:$A, v4f64:$B, v4f64:$C), - (QVFNMSUB $A, $B, $C)>; - -def : Pat<(int_ppc_qpx_qvlfd xoaddr:$src), - (QVLFDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfs xoaddr:$src), - (QVLFSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcda xoaddr:$src), - (QVLFCDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcd xoaddr:$src), - (QVLFCDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcsa xoaddr:$src), - (QVLFCSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfcs xoaddr:$src), - (QVLFCSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfda xoaddr:$src), - (QVLFDXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwaa xoaddr:$src), - (QVLFIWAXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwa xoaddr:$src), - (QVLFIWAX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwza xoaddr:$src), - (QVLFIWZXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfiwz xoaddr:$src), - (QVLFIWZX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlfsa xoaddr:$src), - (QVLFSXA xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcld xoaddr:$src), - (QVLPCLDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcls xoaddr:$src), - (QVLPCLSX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrd xoaddr:$src), - (QVLPCRDX xoaddr:$src)>; -def : Pat<(int_ppc_qpx_qvlpcrs xoaddr:$src), - (QVLPCRSX xoaddr:$src)>; - -def : Pat<(int_ppc_qpx_qvstfd v4f64:$T, xoaddr:$dst), - (QVSTFDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfs v4f64:$T, xoaddr:$dst), - (QVSTFSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcda v4f64:$T, xoaddr:$dst), - (QVSTFCDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcd v4f64:$T, xoaddr:$dst), - (QVSTFCDX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcsa v4f64:$T, xoaddr:$dst), - 
(QVSTFCSXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfcs v4f64:$T, xoaddr:$dst), - (QVSTFCSX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfda v4f64:$T, xoaddr:$dst), - (QVSTFDXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiwa v4f64:$T, xoaddr:$dst), - (QVSTFIWXA $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfiw v4f64:$T, xoaddr:$dst), - (QVSTFIWX $T, xoaddr:$dst)>; -def : Pat<(int_ppc_qpx_qvstfsa v4f64:$T, xoaddr:$dst), - (QVSTFSXA $T, xoaddr:$dst)>; - -def : Pat<(pre_store v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFDUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_store v4f32:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUX $rS, $ptrreg, $ptroff)>; -def : Pat<(pre_truncstv4f32 v4f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), - (QVSTFSUXs $rS, $ptrreg, $ptroff)>; - -def : Pat<(int_ppc_qpx_qvflogical v4f64:$A, v4f64:$B, (i32 imm:$idx)), - (QVFLOGICAL $A, $B, imm:$idx)>; -def : Pat<(int_ppc_qpx_qvgpci (u12:$idx)), - (QVGPCI imm:$idx)>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRA, $FRB), (i32 7))>; -def : Pat<(setcc 
v4f64:$FRA, v4f64:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETEQ), - (QVFCMPEQb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGT), - (QVFCMPGTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFCMPLTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLT), - (QVFCMPLTb $FRA, $FRB)>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFCMPGTb $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f64:$FRA, v4f64:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQb $FRA, $FRB), - (QVFCMPEQb $FRA, $FRB), (i32 10))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETOLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETONE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 8))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETO), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUEQ), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGT), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUGE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULT), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRA, $FRB), (i32 7))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETULE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - 
(QVFCMPGTbs $FRA, $FRB), (i32 13))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETUNE), - (QVFLOGICALb (QVFTSTNANbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 13))>; - -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETEQ), - (QVFCMPEQbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGT), - (QVFCMPGTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETGE), - (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFCMPLTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLT), - (QVFCMPLTbs $FRA, $FRB)>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETLE), - (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFCMPGTbs $FRA, $FRB), (i32 10))>; -def : Pat<(setcc v4f32:$FRA, v4f32:$FRB, SETNE), - (QVFLOGICALb (QVFCMPEQbs $FRA, $FRB), - (QVFCMPEQbs $FRA, $FRB), (i32 10))>; - -def : Pat<(and v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 4))>; -def : Pat<(not (or v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 8))>; -def : Pat<(not (xor v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 9))>; -def : Pat<(or v4i1:$FRA, (not v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 13))>; -def : Pat<(not (and v4i1:$FRA, v4i1:$FRB)), - (QVFLOGICALb $FRA, $FRB, (i32 14))>; - -def : Pat<(and v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 1))>; -def : Pat<(or v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 7))>; -def : Pat<(xor v4i1:$FRA, v4i1:$FRB), - (QVFLOGICALb $FRA, $FRB, (i32 6))>; -def : Pat<(not v4i1:$FRA), - (QVFLOGICALb $FRA, $FRA, (i32 10))>; - -def : Pat<(v4f64 (fpextend v4f32:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f32 (fround_exact v4f64:$src)), - (COPY_TO_REGCLASS $src, QSRC)>; - -// Extract the underlying floating-point values from the -// QPX (-1.0, 1.0) boolean representation. 
-def : Pat<(v4f64 (PPCqbflt v4i1:$src)), - (COPY_TO_REGCLASS $src, QFRC)>; - -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETLE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETULE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETEQ)), - (SELECT_QFRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGE)), - (SELECT_QFRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGE)), - (SELECT_QFRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETGT)), - (SELECT_QFRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETUGT)), - (SELECT_QFRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f64 (selectcc i1:$lhs, i1:$rhs, v4f64:$tval, v4f64:$fval, SETNE)), - (SELECT_QFRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETLE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETULE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, 
v4f32:$fval, SETEQ)), - (SELECT_QSRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGE)), - (SELECT_QSRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGE)), - (SELECT_QSRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETGT)), - (SELECT_QSRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETUGT)), - (SELECT_QSRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4f32 (selectcc i1:$lhs, i1:$rhs, v4f32:$tval, v4f32:$fval, SETNE)), - (SELECT_QSRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETLE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETULE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETEQ)), - (SELECT_QBRC (CREQV $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGE)), - (SELECT_QBRC (CRORC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGE)), - (SELECT_QBRC (CRORC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETGT)), - (SELECT_QBRC (CRANDC $rhs, $lhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETUGT)), - (SELECT_QBRC (CRANDC $lhs, $rhs), $tval, $fval)>; -def : Pat<(v4i1 (selectcc i1:$lhs, i1:$rhs, v4i1:$tval, v4i1:$fval, SETNE)), - 
(SELECT_QBRC (CRXOR $lhs, $rhs), $tval, $fval)>; - -} // end HasQPX - -let Predicates = [HasQPX, NoNaNsFPMath] in { -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPLTb $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFCMPGTb $FRA, $FRB), $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPLTbs $FRA, $FRB), $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFCMPGTbs $FRA, $FRB), $FRB, $FRA)>; -} - -let Predicates = [HasQPX, NaNsFPMath] in { -// When either of these operands is NaN, we should return the other operand. -// QVFCMPLT/QVFCMPGT return false is either operand is NaN, which means we need -// to explicitly or with a NaN test on the second operand. -def : Pat<(fminnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPLTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f64:$FRA, v4f64:$FRB), - (QVFSELb (QVFLOGICALb (QVFCMPGTb $FRA, $FRB), - (QVFTSTNANb $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; - -def : Pat<(fminnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPLTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB), - (QVFSELbs (QVFLOGICALb (QVFCMPGTbs $FRA, $FRB), - (QVFTSTNANbs $FRB, $FRB), (i32 7)), - $FRB, $FRA)>; -} diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td index 858eb0c9fe50..299b34ca8283 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td +++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td @@ -820,16 +820,6 @@ def SPESTWX : XForm_8<31, 151, (outs), (ins spe4rc:$rS, memrr:$dst), } // HasSPE let Predicates = [HasSPE] in { -def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (SPELWZ iaddr:$src), SPERC)>; -def : Pat<(f64 (extloadf32 xaddr:$src)), - (COPY_TO_REGCLASS (SPELWZX xaddr:$src), SPERC)>; - -def : Pat<(f64 (fpextend f32:$src)), - (COPY_TO_REGCLASS $src, SPERC)>; -} - -let Predicates = 
[HasSPE] in { def SELECT_CC_SPE4 : PPCCustomInserterPseudo<(outs spe4rc:$dst), (ins crrc:$cond, spe4rc:$T, spe4rc:$F, i32imm:$BROPC), "#SELECT_CC_SPE4", diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 9ba5058a6f81..db6e00c71b89 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -145,6 +145,7 @@ def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", def HasVSX : Predicate<"Subtarget->hasVSX()">; def IsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; def IsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; +def IsPPC64 : Predicate<"Subtarget->isPPC64()">; def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">; def HasP8Vector : Predicate<"Subtarget->hasP8Vector()">; def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">; @@ -167,7 +168,7 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, def _rec : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, [(set InTy:$XT, - (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, + (InTy (PPCvcmp_rec InTy:$XA, InTy:$XB, xo)))]>, isRecordForm; } } @@ -362,7 +363,8 @@ let hasSideEffects = 0 in { } } // mayStore - let Uses = [RM], mayRaiseFPException = 1 in { + let mayRaiseFPException = 1 in { + let Uses = [RM] in { // Add/Mul Instructions let isCommutable = 1 in { def XSADDDP : XX3Form<60, 32, @@ -622,12 +624,30 @@ let hasSideEffects = 0 in { "xsrsqrtedp $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfrsqrte f64:$XB))]>; + let mayRaiseFPException = 0 in { def XSTDIVDP : XX3Form_1<60, 61, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSTSQRTDP : XX2Form_1<60, 106, (outs crrc:$crD), (ins vsfrc:$XB), - "xstsqrtdp $crD, $XB", IIC_FPCompare, []>; + "xstsqrtdp $crD, $XB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt f64:$XB))]>; + def XVTDIVDP : XX3Form_1<60, 125, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; + def XVTDIVSP : XX3Form_1<60, 93, + (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), + "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>; + + def XVTSQRTDP : XX2Form_1<60, 234, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtdp $crD, $XB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt v2f64:$XB))]>; + def XVTSQRTSP : XX2Form_1<60, 170, + (outs crrc:$crD), (ins vsrc:$XB), + "xvtsqrtsp $crD, $XB", IIC_FPCompare, + [(set i32:$crD, (PPCftsqrt v4f32:$XB))]>; + } def XVDIVDP : XX3Form<60, 120, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), @@ -647,20 +667,6 @@ let hasSideEffects = 0 in { "xvsqrtsp $XT, $XB", IIC_FPSqrtS, [(set v4f32:$XT, (any_fsqrt v4f32:$XB))]>; - def XVTDIVDP : XX3Form_1<60, 125, - (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), - "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>; - def XVTDIVSP : XX3Form_1<60, 93, - (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB), - "xvtdivsp $crD, $XA, $XB", IIC_FPCompare, []>; - - def XVTSQRTDP : XX2Form_1<60, 234, - (outs crrc:$crD), (ins vsrc:$XB), 
- "xvtsqrtdp $crD, $XB", IIC_FPCompare, []>; - def XVTSQRTSP : XX2Form_1<60, 170, - (outs crrc:$crD), (ins vsrc:$XB), - "xvtsqrtsp $crD, $XB", IIC_FPCompare, []>; - def XVREDP : XX2Form<60, 218, (outs vsrc:$XT), (ins vsrc:$XB), "xvredp $XT, $XB", IIC_VecFP, @@ -707,6 +713,7 @@ let hasSideEffects = 0 in { int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; // Move Instructions + let mayRaiseFPException = 0 in { def XSABSDP : XX2Form<60, 345, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsabsdp $XT, $XB", IIC_VecFP, @@ -760,6 +767,7 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvnegsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (fneg v4f32:$XB))]>; + } // Conversion Instructions def XSCVDPSP : XX2Form<60, 265, @@ -768,50 +776,50 @@ let hasSideEffects = 0 in { def XSCVDPSXDS : XX2Form<60, 344, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxds $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfctidz f64:$XB))]>; + [(set f64:$XT, (PPCany_fctidz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPSXDSs : XX2Form<60, 344, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpsxds $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfctidz f32:$XB))]>; + [(set f32:$XT, (PPCany_fctidz f32:$XB))]>; def XSCVDPSXWS : XX2Form<60, 88, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxws $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfctiwz f64:$XB))]>; + [(set f64:$XT, (PPCany_fctiwz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPSXWSs : XX2Form<60, 88, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpsxws $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfctiwz f32:$XB))]>; + [(set f32:$XT, (PPCany_fctiwz f32:$XB))]>; def XSCVDPUXDS : XX2Form<60, 328, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxds $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfctiduz f64:$XB))]>; + [(set f64:$XT, (PPCany_fctiduz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPUXDSs : XX2Form<60, 328, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpuxds $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfctiduz f32:$XB))]>; + [(set f32:$XT, (PPCany_fctiduz f32:$XB))]>; def XSCVDPUXWS : 
XX2Form<60, 72, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxws $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; + [(set f64:$XT, (PPCany_fctiwuz f64:$XB))]>; let isCodeGenOnly = 1 in def XSCVDPUXWSs : XX2Form<60, 72, (outs vssrc:$XT), (ins vssrc:$XB), "xscvdpuxws $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfctiwuz f32:$XB))]>; + [(set f32:$XT, (PPCany_fctiwuz f32:$XB))]>; def XSCVSPDP : XX2Form<60, 329, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvspdp $XT, $XB", IIC_VecFP, []>; def XSCVSXDDP : XX2Form<60, 376, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvsxddp $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfcfid f64:$XB))]>; + [(set f64:$XT, (PPCany_fcfid f64:$XB))]>; def XSCVUXDDP : XX2Form<60, 360, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvuxddp $XT, $XB", IIC_VecFP, - [(set f64:$XT, (PPCfcfidu f64:$XB))]>; + [(set f64:$XT, (PPCany_fcfidu f64:$XB))]>; def XVCVDPSP : XX2Form<60, 393, (outs vsrc:$XT), (ins vsrc:$XB), @@ -820,7 +828,7 @@ let hasSideEffects = 0 in { def XVCVDPSXDS : XX2Form<60, 472, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpsxds $XT, $XB", IIC_VecFP, - [(set v2i64:$XT, (fp_to_sint v2f64:$XB))]>; + [(set v2i64:$XT, (any_fp_to_sint v2f64:$XB))]>; def XVCVDPSXWS : XX2Form<60, 216, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpsxws $XT, $XB", IIC_VecFP, @@ -828,7 +836,7 @@ let hasSideEffects = 0 in { def XVCVDPUXDS : XX2Form<60, 456, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpuxds $XT, $XB", IIC_VecFP, - [(set v2i64:$XT, (fp_to_uint v2f64:$XB))]>; + [(set v2i64:$XT, (any_fp_to_uint v2f64:$XB))]>; def XVCVDPUXWS : XX2Form<60, 200, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvdpuxws $XT, $XB", IIC_VecFP, @@ -844,56 +852,105 @@ let hasSideEffects = 0 in { def XVCVSPSXWS : XX2Form<60, 152, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspsxws $XT, $XB", IIC_VecFP, - [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>; + [(set v4i32:$XT, (any_fp_to_sint v4f32:$XB))]>; def XVCVSPUXDS : XX2Form<60, 392, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspuxds $XT, $XB", IIC_VecFP, []>; def XVCVSPUXWS : 
XX2Form<60, 136, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspuxws $XT, $XB", IIC_VecFP, - [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>; + [(set v4i32:$XT, (any_fp_to_uint v4f32:$XB))]>; def XVCVSXDDP : XX2Form<60, 504, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxddp $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (sint_to_fp v2i64:$XB))]>; + [(set v2f64:$XT, (any_sint_to_fp v2i64:$XB))]>; def XVCVSXDSP : XX2Form<60, 440, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxdsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (int_ppc_vsx_xvcvsxdsp v2i64:$XB))]>; - def XVCVSXWDP : XX2Form<60, 248, - (outs vsrc:$XT), (ins vsrc:$XB), - "xvcvsxwdp $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>; def XVCVSXWSP : XX2Form<60, 184, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxwsp $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (sint_to_fp v4i32:$XB))]>; + [(set v4f32:$XT, (any_sint_to_fp v4i32:$XB))]>; def XVCVUXDDP : XX2Form<60, 488, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxddp $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (uint_to_fp v2i64:$XB))]>; + [(set v2f64:$XT, (any_uint_to_fp v2i64:$XB))]>; def XVCVUXDSP : XX2Form<60, 424, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxdsp $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (int_ppc_vsx_xvcvuxdsp v2i64:$XB))]>; + def XVCVUXWSP : XX2Form<60, 168, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvuxwsp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (any_uint_to_fp v4i32:$XB))]>; + + let mayRaiseFPException = 0 in { + def XVCVSXWDP : XX2Form<60, 248, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvcvsxwdp $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (int_ppc_vsx_xvcvsxwdp v4i32:$XB))]>; def XVCVUXWDP : XX2Form<60, 232, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvuxwdp $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>; - def XVCVUXWSP : XX2Form<60, 168, + } + + // Rounding Instructions respecting current rounding mode + def XSRDPIC : XX2Form<60, 107, + (outs vsfrc:$XT), (ins vsfrc:$XB), + "xsrdpic $XT, $XB", IIC_VecFP, + [(set f64:$XT, (fnearbyint f64:$XB))]>; + def 
XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), - "xvcvuxwsp $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>; + "xvrdpic $XT, $XB", IIC_VecFP, + [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + def XVRSPIC : XX2Form<60, 171, + (outs vsrc:$XT), (ins vsrc:$XB), + "xvrspic $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; + // Max/Min Instructions + let isCommutable = 1 in { + def XSMAXDP : XX3Form<60, 160, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; + def XSMINDP : XX3Form<60, 168, + (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), + "xsmindp $XT, $XA, $XB", IIC_VecFP, + [(set vsfrc:$XT, + (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; + + def XVMAXDP : XX3Form<60, 224, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxdp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; + def XVMINDP : XX3Form<60, 232, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmindp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; + + def XVMAXSP : XX3Form<60, 192, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmaxsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; + def XVMINSP : XX3Form<60, 200, + (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvminsp $XT, $XA, $XB", IIC_VecFP, + [(set vsrc:$XT, + (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; + } // isCommutable + } // Uses = [RM] - // Rounding Instructions + // Rounding Instructions with static direction. 
def XSRDPI : XX2Form<60, 73, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpi $XT, $XB", IIC_VecFP, [(set f64:$XT, (any_fround f64:$XB))]>; - def XSRDPIC : XX2Form<60, 107, - (outs vsfrc:$XT), (ins vsfrc:$XB), - "xsrdpic $XT, $XB", IIC_VecFP, - [(set f64:$XT, (any_fnearbyint f64:$XB))]>; def XSRDPIM : XX2Form<60, 121, (outs vsfrc:$XT), (ins vsfrc:$XB), "xsrdpim $XT, $XB", IIC_VecFP, @@ -911,10 +968,6 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpi $XT, $XB", IIC_VecFP, [(set v2f64:$XT, (any_fround v2f64:$XB))]>; - def XVRDPIC : XX2Form<60, 235, - (outs vsrc:$XT), (ins vsrc:$XB), - "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>; def XVRDPIM : XX2Form<60, 249, (outs vsrc:$XT), (ins vsrc:$XB), "xvrdpim $XT, $XB", IIC_VecFP, @@ -932,10 +985,6 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrspi $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (any_fround v4f32:$XB))]>; - def XVRSPIC : XX2Form<60, 171, - (outs vsrc:$XT), (ins vsrc:$XB), - "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>; def XVRSPIM : XX2Form<60, 185, (outs vsrc:$XT), (ins vsrc:$XB), "xvrspim $XT, $XB", IIC_VecFP, @@ -948,43 +997,7 @@ let hasSideEffects = 0 in { (outs vsrc:$XT), (ins vsrc:$XB), "xvrspiz $XT, $XB", IIC_VecFP, [(set v4f32:$XT, (any_ftrunc v4f32:$XB))]>; - - // Max/Min Instructions - let isCommutable = 1 in { - def XSMAXDP : XX3Form<60, 160, - (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), - "xsmaxdp $XT, $XA, $XB", IIC_VecFP, - [(set vsfrc:$XT, - (int_ppc_vsx_xsmaxdp vsfrc:$XA, vsfrc:$XB))]>; - def XSMINDP : XX3Form<60, 168, - (outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB), - "xsmindp $XT, $XA, $XB", IIC_VecFP, - [(set vsfrc:$XT, - (int_ppc_vsx_xsmindp vsfrc:$XA, vsfrc:$XB))]>; - - def XVMAXDP : XX3Form<60, 224, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmaxdp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmaxdp vsrc:$XA, vsrc:$XB))]>; - def XVMINDP : XX3Form<60, 232, - (outs 
vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmindp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmindp vsrc:$XA, vsrc:$XB))]>; - - def XVMAXSP : XX3Form<60, 192, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvmaxsp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvmaxsp vsrc:$XA, vsrc:$XB))]>; - def XVMINSP : XX3Form<60, 200, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), - "xvminsp $XT, $XA, $XB", IIC_VecFP, - [(set vsrc:$XT, - (int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>; - } // isCommutable - } // Uses = [RM], mayRaiseFPException + } // mayRaiseFPException // Logical Instructions let isCommutable = 1 in @@ -1170,7 +1183,7 @@ let Predicates = [HasVSX, HasP8Vector] in { "xsresp $XT, $XB", IIC_VecFP, [(set f32:$XT, (PPCfre f32:$XB))]>; // FIXME: Setting the hasSideEffects flag here to match current behaviour. - let hasSideEffects = 1, mayRaiseFPException = 1 in + let hasSideEffects = 1 in def XSRSP : XX2Form<60, 281, (outs vssrc:$XT), (ins vsfrc:$XB), "xsrsp $XT, $XB", IIC_VecFP, @@ -1268,18 +1281,18 @@ let Predicates = [HasVSX, HasP8Vector] in { def XSCVSXDSP : XX2Form<60, 312, (outs vssrc:$XT), (ins vsfrc:$XB), "xscvsxdsp $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfcfids f64:$XB))]>; + [(set f32:$XT, (PPCany_fcfids f64:$XB))]>; def XSCVUXDSP : XX2Form<60, 296, (outs vssrc:$XT), (ins vsfrc:$XB), "xscvuxdsp $XT, $XB", IIC_VecFP, - [(set f32:$XT, (PPCfcfidus f64:$XB))]>; + [(set f32:$XT, (PPCany_fcfidus f64:$XB))]>; + } // mayRaiseFPException // Conversions between vector and scalar single precision def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), "xscvdpspn $XT, $XB", IIC_VecFP, []>; def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), "xscvspdpn $XT, $XB", IIC_VecFP, []>; - } // mayRaiseFPException let Predicates = [HasVSX, HasDirectMove] in { // VSX direct move instructions @@ -1440,15 +1453,16 @@ let Predicates = [HasVSX, HasP9Vector] in { // FIXME: Setting the hasSideEffects flag here to match current 
behaviour. // QP Compare Ordered/Unordered let hasSideEffects = 1 in { - def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>; - def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>; - // DP/QP Compare Exponents def XSCMPEXPDP : XX3Form_1<60, 59, (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB), "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>; def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>; + let mayRaiseFPException = 1 in { + def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>; + def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>; + // DP Compare ==, >=, >, != // Use vsrc for XT, because the entire register of XT is set. // XT.dword[1] = 0x0000_0000_0000_0000 @@ -1458,6 +1472,7 @@ let Predicates = [HasVSX, HasP9Vector] in { IIC_FPCompare, []>; def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc, IIC_FPCompare, []>; + } } //===--------------------------------------------------------------------===// @@ -1476,9 +1491,8 @@ let Predicates = [HasVSX, HasP9Vector] in { f128:$vB))]>; } - // FIXME: Setting the hasSideEffects flag here to match current behaviour. // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero) - let hasSideEffects = 1 in { + let mayRaiseFPException = 1 in { def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>; def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>; def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>; @@ -1494,11 +1508,12 @@ let Predicates = [HasVSX, HasP9Vector] in { // vsfrc for src and dest register. xscvhpdp's src only use the left 16 bits, // but we still use vsfrc for it. // FIXME: Setting the hasSideEffects flag here to match current behaviour. 
- let hasSideEffects = 1 in { + let hasSideEffects = 1, mayRaiseFPException = 1 in { def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>; def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>; } + let mayRaiseFPException = 1 in { // Vector HP -> SP // FIXME: Setting the hasSideEffects flag here to match current behaviour. let hasSideEffects = 1 in @@ -1507,16 +1522,15 @@ let Predicates = [HasVSX, HasP9Vector] in { [(set v4f32:$XT, (int_ppc_vsx_xvcvsphp v4f32:$XB))]>; - let mayRaiseFPException = 1 in { - // Round to Quad-Precision Integer [with Inexact] - def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>; - def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>; - } + // Round to Quad-Precision Integer [with Inexact] + def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>; + def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>; // Round Quad-Precision to Double-Extended Precision (fp80) // FIXME: Setting the hasSideEffects flag here to match current behaviour. let hasSideEffects = 1 in def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>; + } //===--------------------------------------------------------------------===// // Insert/Extract Instructions @@ -1607,6 +1621,7 @@ let Predicates = [HasVSX, HasP9Vector] in { (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>; // Maximum/Minimum Type-C/Type-J DP + let mayRaiseFPException = 1 in { def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc, IIC_VecFP, [(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>; @@ -1621,6 +1636,7 @@ let Predicates = [HasVSX, HasP9Vector] in { def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; } + } // Vector Byte-Reverse H/W/D/Q Word // FIXME: Setting the hasSideEffects flag here to match current behaviour. @@ -2392,33 +2408,48 @@ def MrgWords { // arbitrarily chosen to be Big, Little. 
// // Predicate combinations available: +// [HasVSX, IsLittleEndian, HasP8Altivec] Altivec patterns using VSX instr. +// [HasVSX, IsBigEndian, HasP8Altivec] Altivec patterns using VSX instr. // [HasVSX] // [HasVSX, IsBigEndian] // [HasVSX, IsLittleEndian] // [HasVSX, NoP9Vector] +// [HasVSX, NoP9Vector, IsLittleEndian] // [HasVSX, HasOnlySwappingMemOps] // [HasVSX, HasOnlySwappingMemOps, IsBigEndian] // [HasVSX, HasP8Vector] -// [HasVSX, HasP8Vector, IsBigEndian] +// [HasVSX, HasP8Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP8Vector, IsLittleEndian] -// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian] +// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] // [HasVSX, HasDirectMove] // [HasVSX, HasDirectMove, IsBigEndian] // [HasVSX, HasDirectMove, IsLittleEndian] -// [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian] +// [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian, IsPPC64] +// [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian] -// [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian] // [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] // [HasVSX, HasP9Vector] -// [HasVSX, HasP9Vector, IsBigEndian] +// [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] // [HasVSX, HasP9Vector, IsLittleEndian] // [HasVSX, HasP9Altivec] -// [HasVSX, HasP9Altivec, IsBigEndian] +// [HasVSX, HasP9Altivec, IsBigEndian, IsPPC64] // [HasVSX, HasP9Altivec, IsLittleEndian] -// [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian] +// [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64] // [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian] +// These Altivec patterns are here because we need a VSX instruction to match +// the intrinsic (but only for little endian system). 
+let Predicates = [HasVSX, IsLittleEndian, HasP8Altivec] in + def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a, + v16i8:$b, v16i8:$c)), + (v16i8 (VPERMXOR $a, $b, (XXLNOR (COPY_TO_REGCLASS $c, VSRC), + (COPY_TO_REGCLASS $c, VSRC))))>; +let Predicates = [HasVSX, IsBigEndian, HasP8Altivec] in + def : Pat<(v16i8 (int_ppc_altivec_crypto_vpermxor v16i8:$a, + v16i8:$b, v16i8:$c)), + (v16i8 (VPERMXOR $a, $b, $c))>; + let AddedComplexity = 400 in { // Valid for any VSX subtarget, regardless of endianness. let Predicates = [HasVSX] in { @@ -2450,6 +2481,10 @@ def : Pat<(fneg (PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C)), def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)), (XVNMADDASP $C, $A, $B)>; +def : Pat<(PPCfsqrt f64:$frA), (XSSQRTDP $frA)>; +def : Pat<(PPCfsqrt v2f64:$frA), (XVSQRTDP $frA)>; +def : Pat<(PPCfsqrt v4f32:$frA), (XVSQRTSP $frA)>; + def : Pat<(v2f64 (bitconvert v4f32:$A)), (COPY_TO_REGCLASS $A, VSRC)>; def : Pat<(v2f64 (bitconvert v4i32:$A)), @@ -2579,6 +2614,16 @@ def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B), def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B), (XVDIVDP $A, $B)>; +// Vector test for software divide and sqrt. 
+def : Pat<(i32 (int_ppc_vsx_xvtdivdp v2f64:$A, v2f64:$B)), + (COPY_TO_REGCLASS (XVTDIVDP $A, $B), GPRC)>; +def : Pat<(i32 (int_ppc_vsx_xvtdivsp v4f32:$A, v4f32:$B)), + (COPY_TO_REGCLASS (XVTDIVSP $A, $B), GPRC)>; +def : Pat<(i32 (int_ppc_vsx_xvtsqrtdp v2f64:$A)), + (COPY_TO_REGCLASS (XVTSQRTDP $A), GPRC)>; +def : Pat<(i32 (int_ppc_vsx_xvtsqrtsp v4f32:$A)), + (COPY_TO_REGCLASS (XVTSQRTSP $A), GPRC)>; + // Reciprocal estimate def : Pat<(int_ppc_vsx_xvresp v4f32:$A), (XVRESP $A)>; @@ -2679,7 +2724,7 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(f32 (any_fround f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f32 (any_fnearbyint f32:$S)), +def : Pat<(f32 (fnearbyint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; def : Pat<(f32 (any_ffloor f32:$S)), @@ -2694,11 +2739,11 @@ def : Pat<(f32 (any_ftrunc f32:$S)), def : Pat<(f32 (any_frint f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIC (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(v4f32 (frint v4f32:$S)), (v4f32 (XVRSPIC $S))>; +def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>; // Rounding for double precision. -def : Pat<(f64 (frint f64:$S)), (f64 (XSRDPIC $S))>; -def : Pat<(v2f64 (frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; +def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>; +def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), @@ -2975,6 +3020,19 @@ defm : ScalToVecWPermute< VSFRC)), sub_64)>; } // HasVSX, NoP9Vector +// Any little endian pre-Power9 VSX subtarget. +let Predicates = [HasVSX, NoP9Vector, IsLittleEndian] in { +// Load-and-splat using only X-Form VSX loads. 
+defm : ScalToVecWPermute< + v2i64, (i64 (load xoaddr:$src)), + (XXPERMDIs (XFLOADf64 xoaddr:$src), 2), + (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>; +defm : ScalToVecWPermute< + v2f64, (f64 (load xoaddr:$src)), + (XXPERMDIs (XFLOADf64 xoaddr:$src), 2), + (SUBREG_TO_REG (i64 1), (XFLOADf64 xoaddr:$src), sub_64)>; +} // HasVSX, NoP9Vector, IsLittleEndian + // Any VSX subtarget that only has loads and stores that load in big endian // order regardless of endianness. This is really pre-Power9 subtargets. let Predicates = [HasVSX, HasOnlySwappingMemOps] in { @@ -2986,8 +3044,8 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in { def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>; } // HasVSX, HasOnlySwappingMemOps -// Big endian VSX subtarget that only has loads and stores that always load -// in big endian order. Really big endian pre-Power9 subtargets. +// Big endian VSX subtarget that only has loads and stores that always +// load in big endian order. Really big endian pre-Power9 subtargets. let Predicates = [HasVSX, HasOnlySwappingMemOps, IsBigEndian] in { def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>; @@ -3080,7 +3138,7 @@ def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))), } // HasVSX, HasP8Vector // Big endian Power8 VSX subtarget. -let Predicates = [HasVSX, HasP8Vector, IsBigEndian] in { +let Predicates = [HasVSX, HasP8Vector, IsBigEndian, IsPPC64] in { def : Pat<DWToSPExtractConv.El0SS1, (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>; def : Pat<DWToSPExtractConv.El1SS1, @@ -3158,7 +3216,7 @@ foreach Idx = [ [0,3], [2,1], [3,2] ] in { (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))), sub_64), xoaddr:$src)>; } -} // HasVSX, HasP8Vector, IsBigEndian +} // HasVSX, HasP8Vector, IsBigEndian, IsPPC64 // Little endian Power8 VSX subtarget. 
let Predicates = [HasVSX, HasP8Vector, IsLittleEndian] in { @@ -3257,7 +3315,7 @@ foreach Idx = [ [0,2], [1,1], [3,3] ] in { } // HasVSX, HasP8Vector, IsLittleEndian // Big endian pre-Power9 VSX subtarget. -let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian] in { +let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64] in { def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src), (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>; def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src), @@ -3268,7 +3326,7 @@ def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src), def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src), (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64), xoaddr:$src)>; -} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian +} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian, IsPPC64 // Little endian pre-Power9 VSX subtarget. let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] in { @@ -3525,8 +3583,8 @@ def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), (i32 VectorExtractions.LE_VARIABLE_WORD)>; } // HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian -// Big endian pre-Power9 VSX subtarget that has direct moves. -let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian] in { +// Big endian pre-Power9 64Bit VSX subtarget that has direct moves. +let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64] in { // Big endian integer vectors using direct moves. 
def : Pat<(v2i64 (build_vector i64:$A, i64:$B)), (v2i64 (XXPERMDI @@ -3540,7 +3598,7 @@ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>; def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; -} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian +} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian, IsPPC64 // Little endian pre-Power9 VSX subtarget that has direct moves. let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] in { @@ -3569,25 +3627,25 @@ def : Pat<(fneg (PPCfnmsub f128:$A, f128:$B, f128:$C)), def : Pat<(PPCfnmsub f128:$A, f128:$B, (fneg f128:$C)), (XSNMADDQP $C, $A, $B)>; -def : Pat<(f128 (sint_to_fp i64:$src)), +def : Pat<(f128 (any_sint_to_fp i64:$src)), (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>; -def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))), +def : Pat<(f128 (any_sint_to_fp (i64 (PPCmfvsr f64:$src)))), (f128 (XSCVSDQP $src))>; -def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))), +def : Pat<(f128 (any_sint_to_fp (i32 (PPCmfvsr f64:$src)))), (f128 (XSCVSDQP (VEXTSW2Ds $src)))>; -def : Pat<(f128 (uint_to_fp i64:$src)), +def : Pat<(f128 (any_uint_to_fp i64:$src)), (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>; -def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))), +def : Pat<(f128 (any_uint_to_fp (i64 (PPCmfvsr f64:$src)))), (f128 (XSCVUDQP $src))>; // Convert (Un)Signed Word -> QP. 
-def : Pat<(f128 (sint_to_fp i32:$src)), +def : Pat<(f128 (any_sint_to_fp i32:$src)), (f128 (XSCVSDQP (MTVSRWA $src)))>; -def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))), +def : Pat<(f128 (any_sint_to_fp (i32 (load xoaddr:$src)))), (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>; -def : Pat<(f128 (uint_to_fp i32:$src)), +def : Pat<(f128 (any_uint_to_fp i32:$src)), (f128 (XSCVUDQP (MTVSRWZ $src)))>; -def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))), +def : Pat<(f128 (any_uint_to_fp (i32 (load xoaddr:$src)))), (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>; // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a @@ -3761,11 +3819,11 @@ def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)), (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>; // Truncate & Convert QP -> (Un)Signed (D)Word. -def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>; -def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>; -def : Pat<(i32 (fp_to_sint f128:$src)), +def : Pat<(i64 (any_fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>; +def : Pat<(i64 (any_fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>; +def : Pat<(i32 (any_fp_to_sint f128:$src)), (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>; -def : Pat<(i32 (fp_to_uint f128:$src)), +def : Pat<(i32 (any_fp_to_uint f128:$src)), (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>; // Instructions for store(fptosi). @@ -3893,8 +3951,8 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)), (v4i32 (LXVWSX xoaddr:$A))>; } // HasVSX, HasP9Vector -// Big endian Power9 subtarget. -let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in { +// Big endian 64Bit Power9 subtarget. 
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in { def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))), (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>; def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))), @@ -4067,7 +4125,7 @@ foreach Idx = 0-15 in { def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))), (f128 (XSCVUDQP (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>; -} // HasVSX, HasP9Vector, IsBigEndian +} // HasVSX, HasP9Vector, IsBigEndian, IsPPC64 // Little endian Power9 subtarget. let Predicates = [HasVSX, HasP9Vector, IsLittleEndian] in { @@ -4292,8 +4350,8 @@ def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))), (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>; } // HasVSX, HasP9Altivec -// Big endian Power9 VSX subtargets with P9 Altivec support. -let Predicates = [HasVSX, HasP9Altivec, IsBigEndian] in { +// Big endian Power9 64Bit VSX subtargets with P9 Altivec support. +let Predicates = [HasVSX, HasP9Altivec, IsBigEndian, IsPPC64] in { def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), (VEXTUBLX $Idx, $S)>; @@ -4426,7 +4484,7 @@ def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1, (v4i32 (VEXTSB2W $A))>; def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)), (v2i64 (VEXTSB2D $A))>; -} // HasVSX, HasP9Altivec, IsBigEndian +} // HasVSX, HasP9Altivec, IsBigEndian, IsPPC64 // Little endian Power9 VSX subtargets with P9 Altivec support. let Predicates = [HasVSX, HasP9Altivec, IsLittleEndian] in { @@ -4563,8 +4621,9 @@ def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)), (v2i64 (VEXTSB2D $A))>; } // HasVSX, HasP9Altivec, IsLittleEndian -// Big endian VSX subtarget that supports additional direct moves from ISA3.0. -let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian] in { +// Big endian 64Bit VSX subtarget that supports additional direct moves from +// ISA3.0. 
+let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64] in { def : Pat<(i64 (extractelt v2i64:$A, 1)), (i64 (MFVSRLD $A))>; // Better way to build integer vectors if we have MTVSRDD. Big endian. @@ -4577,7 +4636,7 @@ def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), (MTVSRDD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>; def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)), (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>; -} // HasVSX, IsISA3_0, HasDirectMove, IsBigEndian +} // HasVSX, IsISA3_0, HasDirectMove, IsBigEndian, IsPPC64 // Little endian VSX subtarget that supports direct moves from ISA3.0. let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian] in { @@ -4602,20 +4661,24 @@ def : InstAlias<"xvmovdp $XT, $XB", def : InstAlias<"xvmovsp $XT, $XB", (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>; -def : InstAlias<"xxspltd $XT, $XB, 0", - (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>; -def : InstAlias<"xxspltd $XT, $XB, 1", - (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>; +// Certain versions of the AIX assembler may misassemble these mnemonics. 
+let Predicates = [ModernAs] in { + def : InstAlias<"xxspltd $XT, $XB, 0", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>; + def : InstAlias<"xxspltd $XT, $XB, 1", + (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>; + def : InstAlias<"xxspltd $XT, $XB, 0", + (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>; + def : InstAlias<"xxspltd $XT, $XB, 1", + (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>; +} + def : InstAlias<"xxmrghd $XT, $XA, $XB", (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>; def : InstAlias<"xxmrgld $XT, $XA, $XB", (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>; def : InstAlias<"xxswapd $XT, $XB", (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>; -def : InstAlias<"xxspltd $XT, $XB, 0", - (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>; -def : InstAlias<"xxspltd $XT, $XB, 1", - (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>; def : InstAlias<"xxswapd $XT, $XB", (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>; def : InstAlias<"mfvrd $rA, $XT", diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index a7546d2be5d8..c24240909797 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -60,6 +60,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" @@ -80,10 +81,8 @@ using namespace llvm; -// By default, we limit this to creating 16 common bases out of loops per -// function. 16 is a little over half of the allocatable register set. static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars", - cl::Hidden, cl::init(16), + cl::Hidden, cl::init(24), cl::desc("Potential common base number threshold per function for PPC loop " "prep")); @@ -93,8 +92,7 @@ static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update", // Sum of following 3 per loop thresholds for all loops can not be larger // than MaxVarsPrep. 
-// By default, we limit this to creating 9 PHIs for one loop. -// 9 and 3 for each kind prep are exterimental values on Power9. +// Now the thresholds for each kind prep are experimental values on Power9. static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars", cl::Hidden, cl::init(3), cl::desc("Potential PHI threshold per loop for PPC loop prep of update " @@ -105,7 +103,7 @@ static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars", cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form")); static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars", - cl::Hidden, cl::init(3), + cl::Hidden, cl::init(8), cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form")); @@ -277,8 +275,11 @@ static Value *GetPointerOperand(Value *MemI) { } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { return SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) return IMemI->getArgOperand(0); + if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) + return IMemI->getArgOperand(1); } return nullptr; @@ -345,9 +346,13 @@ SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates( MemI = SMemI; PtrValue = SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { + if (IMemI->getIntrinsicID() == Intrinsic::prefetch || + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(0); + } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) { + MemI = IMemI; + PtrValue = IMemI->getArgOperand(1); } else continue; } else continue; @@ -606,6 +611,10 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, NewBasePtr = NewPHI; } + // Clear the rewriter cache, because 
values that are in the rewriter's cache + // can be deleted below, causing the AssertingVH in the cache to trigger. + SCEVE.clear(); + if (Instruction *IDel = dyn_cast<Instruction>(BasePtr)) BBChanged.insert(IDel->getParent()); BasePtr->replaceAllUsesWith(NewBasePtr); @@ -791,7 +800,7 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { bool MadeChange = false; // Only prep. the inner-most loop - if (!L->empty()) + if (!L->isInnermost()) return MadeChange; // Return if already done enough preparation. @@ -823,6 +832,11 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { if (ST && ST->hasAltivec() && PtrValue->getType()->getPointerElementType()->isVectorTy()) return false; + // There are no update forms for P10 lxvp/stxvp intrinsic. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)) + return false; // See getPreIndexedAddressParts, the displacement for LDU/STDU has to // be 4's multiple (DS-form). For i64 loads/stores when the displacement // fits in a 16-bit signed field but isn't a multiple of 4, it will be @@ -860,7 +874,13 @@ bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { // Check if a load/store has DQ form. auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) { assert((PtrValue && I) && "Invalid parameter!"); - return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() && + // Check if it is a P10 lxvp/stxvp intrinsic. + auto *II = dyn_cast<IntrinsicInst>(I); + if (II) + return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp; + // Check if it is a P9 vector load/store. 
+ return ST && ST->hasP9Vector() && (PtrValue->getType()->getPointerElementType()->isVectorTy()); }; diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp index 2b0e604e0ccd..27b2c9a628d0 100644 --- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" @@ -64,8 +65,7 @@ private: /// Checks if the specified function name represents an entry in the MASSV /// library. bool PPCLowerMASSVEntries::isMASSVFunc(StringRef Name) { - auto Iter = std::find(std::begin(MASSVFuncs), std::end(MASSVFuncs), Name); - return Iter != std::end(MASSVFuncs); + return llvm::is_contained(MASSVFuncs, Name); } // FIXME: @@ -105,7 +105,7 @@ bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func, return false; if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1))) - if (ConstantFP *CFP = dyn_cast<ConstantFP>(Exp->getSplatValue())) { + if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Exp->getSplatValue())) { // If the argument is 0.75 or 0.25 it is cheaper to turn it into pow // intrinsic so that it could be optimzed as sequence of sqrt's. 
if (!CI->hasNoInfs() || !CI->hasApproxFunc()) diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 236f98f32e18..5cc180d770b2 100644 --- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -74,7 +74,9 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO; break; case PPCII::MO_TLS: - RefKind = MCSymbolRefExpr::VK_PPC_TLS; + bool IsPCRel = (MO.getTargetFlags() & ~access) == PPCII::MO_PCREL_FLAG; + RefKind = IsPCRel ? MCSymbolRefExpr::VK_PPC_TLS_PCREL + : MCSymbolRefExpr::VK_PPC_TLS; break; } @@ -84,6 +86,14 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, RefKind = MCSymbolRefExpr::VK_PCREL; else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_GOT_FLAG)) RefKind = MCSymbolRefExpr::VK_PPC_GOT_PCREL; + else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG)) + RefKind = MCSymbolRefExpr::VK_TPREL; + else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG) + RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSGD_PCREL; + else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG) + RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_PCREL; + else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG) + RefKind = MCSymbolRefExpr::VK_PPC_GOT_TPREL_PCREL; const MachineInstr *MI = MO.getParent(); const MachineFunction *MF = MI->getMF(); @@ -100,6 +110,8 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, MIOpcode == PPC::BL8_NOTOC) { RefKind = MCSymbolRefExpr::VK_PPC_NOTOC; } + if (MO.getTargetFlags() == PPCII::MO_PCREL_OPT_FLAG) + RefKind = MCSymbolRefExpr::VK_PPC_PCREL_OPT; } const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx); diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index d2aba6bd6e8d..c8b01aaef828 100644 --- 
a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -267,6 +267,113 @@ void PPCMIPeephole::UpdateTOCSaves( TOCSaves[MI] = Keep; } +// This function returns a list of all PHI nodes in the tree starting from +// the RootPHI node. We perform a BFS traversal to get an ordered list of nodes. +// The list initially only contains the root PHI. When we visit a PHI node, we +// add it to the list. We continue to look for other PHI node operands while +// there are nodes to visit in the list. The function returns false if the +// optimization cannot be applied on this tree. +static bool collectUnprimedAccPHIs(MachineRegisterInfo *MRI, + MachineInstr *RootPHI, + SmallVectorImpl<MachineInstr *> &PHIs) { + PHIs.push_back(RootPHI); + unsigned VisitedIndex = 0; + while (VisitedIndex < PHIs.size()) { + MachineInstr *VisitedPHI = PHIs[VisitedIndex]; + for (unsigned PHIOp = 1, NumOps = VisitedPHI->getNumOperands(); + PHIOp != NumOps; PHIOp += 2) { + Register RegOp = VisitedPHI->getOperand(PHIOp).getReg(); + if (!Register::isVirtualRegister(RegOp)) + return false; + MachineInstr *Instr = MRI->getVRegDef(RegOp); + // While collecting the PHI nodes, we check if they can be converted (i.e. + // all the operands are either copies, implicit defs or PHI nodes). + unsigned Opcode = Instr->getOpcode(); + if (Opcode == PPC::COPY) { + Register Reg = Instr->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Reg) || + MRI->getRegClass(Reg) != &PPC::ACCRCRegClass) + return false; + } else if (Opcode != PPC::IMPLICIT_DEF && Opcode != PPC::PHI) + return false; + // If we detect a cycle in the PHI nodes, we exit. It would be + // possible to change cycles as well, but that would add a lot + // of complexity for a case that is unlikely to occur with MMA + // code. 
+ if (Opcode != PPC::PHI) + continue; + if (llvm::is_contained(PHIs, Instr)) + return false; + PHIs.push_back(Instr); + } + VisitedIndex++; + } + return true; +} + +// This function changes the unprimed accumulator PHI nodes in the PHIs list to +// primed accumulator PHI nodes. The list is traversed in reverse order to +// change all the PHI operands of a PHI node before changing the node itself. +// We keep a map to associate each changed PHI node to its non-changed form. +static void convertUnprimedAccPHIs(const PPCInstrInfo *TII, + MachineRegisterInfo *MRI, + SmallVectorImpl<MachineInstr *> &PHIs, + Register Dst) { + DenseMap<MachineInstr *, MachineInstr *> ChangedPHIMap; + for (auto It = PHIs.rbegin(), End = PHIs.rend(); It != End; ++It) { + MachineInstr *PHI = *It; + SmallVector<std::pair<MachineOperand, MachineOperand>, 4> PHIOps; + // We check if the current PHI node can be changed by looking at its + // operands. If all the operands are either copies from primed + // accumulators, implicit definitions or other unprimed accumulator + // PHI nodes, we change it. 
+ for (unsigned PHIOp = 1, NumOps = PHI->getNumOperands(); PHIOp != NumOps; + PHIOp += 2) { + Register RegOp = PHI->getOperand(PHIOp).getReg(); + MachineInstr *PHIInput = MRI->getVRegDef(RegOp); + unsigned Opcode = PHIInput->getOpcode(); + assert((Opcode == PPC::COPY || Opcode == PPC::IMPLICIT_DEF || + Opcode == PPC::PHI) && + "Unexpected instruction"); + if (Opcode == PPC::COPY) { + assert(MRI->getRegClass(PHIInput->getOperand(1).getReg()) == + &PPC::ACCRCRegClass && + "Unexpected register class"); + PHIOps.push_back({PHIInput->getOperand(1), PHI->getOperand(PHIOp + 1)}); + } else if (Opcode == PPC::IMPLICIT_DEF) { + Register AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass); + BuildMI(*PHIInput->getParent(), PHIInput, PHIInput->getDebugLoc(), + TII->get(PPC::IMPLICIT_DEF), AccReg); + PHIOps.push_back({MachineOperand::CreateReg(AccReg, false), + PHI->getOperand(PHIOp + 1)}); + } else if (Opcode == PPC::PHI) { + // We found a PHI operand. At this point we know this operand + // has already been changed so we get its associated changed form + // from the map. + assert(ChangedPHIMap.count(PHIInput) == 1 && + "This PHI node should have already been changed."); + MachineInstr *PrimedAccPHI = ChangedPHIMap.lookup(PHIInput); + PHIOps.push_back({MachineOperand::CreateReg( + PrimedAccPHI->getOperand(0).getReg(), false), + PHI->getOperand(PHIOp + 1)}); + } + } + Register AccReg = Dst; + // If the PHI node we are changing is the root node, the register it defines + // will be the destination register of the original copy (of the PHI def). + // For all other PHI's in the list, we need to create another primed + // accumulator virtual register as the PHI will no longer define the + // unprimed accumulator. 
+ if (PHI != PHIs[0]) + AccReg = MRI->createVirtualRegister(&PPC::ACCRCRegClass); + MachineInstrBuilder NewPHI = BuildMI( + *PHI->getParent(), PHI, PHI->getDebugLoc(), TII->get(PPC::PHI), AccReg); + for (auto RegMBB : PHIOps) + NewPHI.add(RegMBB.first).add(RegMBB.second); + ChangedPHIMap[PHI] = NewPHI.getInstr(); + } +} + // Perform peephole optimizations. bool PPCMIPeephole::simplifyCode(void) { bool Simplified = false; @@ -321,6 +428,38 @@ bool PPCMIPeephole::simplifyCode(void) { default: break; + case PPC::COPY: { + Register Src = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + if (!Register::isVirtualRegister(Src) || + !Register::isVirtualRegister(Dst)) + break; + if (MRI->getRegClass(Src) != &PPC::UACCRCRegClass || + MRI->getRegClass(Dst) != &PPC::ACCRCRegClass) + break; + + // We are copying an unprimed accumulator to a primed accumulator. + // If the input to the copy is a PHI that is fed only by (i) copies in + // the other direction (ii) implicitly defined unprimed accumulators or + // (iii) other PHI nodes satisfying (i) and (ii), we can change + // the PHI to a PHI on primed accumulators (as long as we also change + // its operands). To detect and change such copies, we first get a list + // of all the PHI nodes starting from the root PHI node in BFS order. + // We then visit all these PHI nodes to check if they can be changed to + // primed accumulator PHI nodes and if so, we change them. 
+ MachineInstr *RootPHI = MRI->getVRegDef(Src); + if (RootPHI->getOpcode() != PPC::PHI) + break; + + SmallVector<MachineInstr *, 4> PHIs; + if (!collectUnprimedAccPHIs(MRI, RootPHI, PHIs)) + break; + + convertUnprimedAccPHIs(TII, MRI, PHIs, Dst); + + ToErase = &MI; + break; + } case PPC::LI: case PPC::LI8: { // If we are materializing a zero, look for any use operands for which @@ -573,7 +712,7 @@ bool PPCMIPeephole::simplifyCode(void) { Simplified = true; Register ConvReg1 = RoundInstr->getOperand(1).getReg(); Register FRSPDefines = RoundInstr->getOperand(0).getReg(); - MachineInstr &Use = *(MRI->use_instr_begin(FRSPDefines)); + MachineInstr &Use = *(MRI->use_instr_nodbg_begin(FRSPDefines)); for (int i = 0, e = Use.getNumOperands(); i < e; ++i) if (Use.getOperand(i).isReg() && Use.getOperand(i).getReg() == FRSPDefines) @@ -848,142 +987,9 @@ bool PPCMIPeephole::simplifyCode(void) { case PPC::RLWINM_rec: case PPC::RLWINM8: case PPC::RLWINM8_rec: { - unsigned FoldingReg = MI.getOperand(1).getReg(); - if (!Register::isVirtualRegister(FoldingReg)) - break; - - MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); - if (SrcMI->getOpcode() != PPC::RLWINM && - SrcMI->getOpcode() != PPC::RLWINM_rec && - SrcMI->getOpcode() != PPC::RLWINM8 && - SrcMI->getOpcode() != PPC::RLWINM8_rec) - break; - assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && - MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && - SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && - "Invalid PPC::RLWINM Instruction!"); - uint64_t SHSrc = SrcMI->getOperand(2).getImm(); - uint64_t SHMI = MI.getOperand(2).getImm(); - uint64_t MBSrc = SrcMI->getOperand(3).getImm(); - uint64_t MBMI = MI.getOperand(3).getImm(); - uint64_t MESrc = SrcMI->getOperand(4).getImm(); - uint64_t MEMI = MI.getOperand(4).getImm(); - - assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) && - "Invalid PPC::RLWINM Instruction!"); - - // If MBMI is bigger than MEMI, we always can not get run of ones. 
- // RotatedSrcMask non-wrap: - // 0........31|32........63 - // RotatedSrcMask: B---E B---E - // MaskMI: -----------|--E B------ - // Result: ----- --- (Bad candidate) - // - // RotatedSrcMask wrap: - // 0........31|32........63 - // RotatedSrcMask: --E B----|--E B---- - // MaskMI: -----------|--E B------ - // Result: --- -----|--- ----- (Bad candidate) - // - // One special case is RotatedSrcMask is a full set mask. - // RotatedSrcMask full: - // 0........31|32........63 - // RotatedSrcMask: ------EB---|-------EB--- - // MaskMI: -----------|--E B------ - // Result: -----------|--- ------- (Good candidate) - - // Mark special case. - bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31); - - // For other MBMI > MEMI cases, just return. - if ((MBMI > MEMI) && !SrcMaskFull) - break; - - // Handle MBMI <= MEMI cases. - APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI); - // In MI, we only need low 32 bits of SrcMI, just consider about low 32 - // bit of SrcMI mask. Note that in APInt, lowerest bit is at index 0, - // while in PowerPC ISA, lowerest bit is at index 63. - APInt MaskSrc = - APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc); - - APInt RotatedSrcMask = MaskSrc.rotl(SHMI); - APInt FinalMask = RotatedSrcMask & MaskMI; - uint32_t NewMB, NewME; - - // If final mask is 0, MI result should be 0 too. - if (FinalMask.isNullValue()) { - bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 || - MI.getOpcode() == PPC::RLWINM8_rec); - - Simplified = true; - - LLVM_DEBUG(dbgs() << "Replace Instr: "); - LLVM_DEBUG(MI.dump()); - - if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { - // Replace MI with "LI 0" - MI.RemoveOperand(4); - MI.RemoveOperand(3); - MI.RemoveOperand(2); - MI.getOperand(1).ChangeToImmediate(0); - MI.setDesc(TII->get(Is64Bit ? 
PPC::LI8 : PPC::LI)); - } else { - // Replace MI with "ANDI_rec reg, 0" - MI.RemoveOperand(4); - MI.RemoveOperand(3); - MI.getOperand(2).setImm(0); - MI.setDesc(TII->get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); - MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); - if (SrcMI->getOperand(1).isKill()) { - MI.getOperand(1).setIsKill(true); - SrcMI->getOperand(1).setIsKill(false); - } else - // About to replace MI.getOperand(1), clear its kill flag. - MI.getOperand(1).setIsKill(false); - } - - LLVM_DEBUG(dbgs() << "With: "); - LLVM_DEBUG(MI.dump()); - } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, - NewME) && NewMB <= NewME)|| SrcMaskFull) { - // Here we only handle MBMI <= MEMI case, so NewMB must be no bigger - // than NewME. Otherwise we get a 64 bit value after folding, but MI - // return a 32 bit value. - - Simplified = true; - LLVM_DEBUG(dbgs() << "Converting Instr: "); - LLVM_DEBUG(MI.dump()); - - uint16_t NewSH = (SHSrc + SHMI) % 32; - MI.getOperand(2).setImm(NewSH); - // If SrcMI mask is full, no need to update MBMI and MEMI. - if (!SrcMaskFull) { - MI.getOperand(3).setImm(NewMB); - MI.getOperand(4).setImm(NewME); - } - MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); - if (SrcMI->getOperand(1).isKill()) { - MI.getOperand(1).setIsKill(true); - SrcMI->getOperand(1).setIsKill(false); - } else - // About to replace MI.getOperand(1), clear its kill flag. - MI.getOperand(1).setIsKill(false); - - LLVM_DEBUG(dbgs() << "To: "); - LLVM_DEBUG(MI.dump()); - } - if (Simplified) { - // If FoldingReg has no non-debug use and it has no implicit def (it - // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI. - // Otherwise keep it. 
+ Simplified = TII->combineRLWINM(MI, &ToErase); + if (Simplified) ++NumRotatesCollapsed; - if (MRI->use_nodbg_empty(FoldingReg) && !SrcMI->hasImplicitDef()) { - ToErase = SrcMI; - LLVM_DEBUG(dbgs() << "Delete dead instruction: "); - LLVM_DEBUG(SrcMI->dump()); - } - } break; } } @@ -1555,6 +1561,8 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) { MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); MI.getOperand(2).setImm(NewSH); MI.getOperand(3).setImm(NewMB); + MI.getOperand(1).setIsKill(SrcMI->getOperand(1).isKill()); + SrcMI->getOperand(1).setIsKill(false); LLVM_DEBUG(dbgs() << "To: "); LLVM_DEBUG(MI.dump()); diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index daf88589bb52..c976a9c62d3b 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -8,6 +8,7 @@ #include "PPCMachineFunctionInfo.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/XCOFF.h" #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCContext.h" #include "llvm/Support/CommandLine.h" @@ -63,3 +64,36 @@ bool PPCFunctionInfo::isLiveInZExt(Register VReg) const { return LiveIn.second.isZExt(); return false; } + +void PPCFunctionInfo::appendParameterType(ParamType Type) { + uint32_t CopyParamType = ParameterType; + int Bits = 0; + + // If it is fixed type, we only need to increase the FixedParamNum, for + // the bit encode of fixed type is bit of zero, we do not need to change the + // ParamType. + if (Type == FixedType) { + ++FixedParamNum; + return; + } + + ++FloatingPointParamNum; + + for (int I = 0; + I < static_cast<int>(FloatingPointParamNum + FixedParamNum - 1); ++I) { + if (CopyParamType & XCOFF::TracebackTable::ParmTypeIsFloatingBit) { + // '10'b => floating point short parameter. + // '11'b => floating point long parameter. + CopyParamType <<= 2; + Bits += 2; + } else { + // '0'b => fixed parameter. 
+ CopyParamType <<= 1; + ++Bits; + } + } + + assert(Type != FixedType && "FixedType should already be handled."); + if (Bits < 31) + ParameterType |= Type << (30 - Bits); +} diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 29ca53e273d7..4b73b36318b4 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -22,6 +22,16 @@ namespace llvm { /// PPCFunctionInfo - This class is derived from MachineFunction private /// PowerPC target-specific information for each MachineFunction. class PPCFunctionInfo : public MachineFunctionInfo { +public: + // The value in the ParamType are used to indicate the bitstrings used in the + // encoding format. + enum ParamType { + FixedType = 0x0, + ShortFloatPoint = 0x2, + LongFloatPoint = 0x3 + }; + +private: virtual void anchor(); /// FramePointerSaveIndex - Frame index of where the old frame pointer is @@ -69,9 +79,6 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// disabled. bool DisableNonVolatileCR = false; - /// Indicates whether VRSAVE is spilled in the current function. - bool SpillsVRSAVE = false; - /// LRStoreRequired - The bool indicates whether there is some explicit use of /// the LR/LR8 stack slot that is not obvious from scanning the code. This /// requires that the code generator produce a store of LR to the stack on @@ -110,6 +117,20 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// register for parameter passing. unsigned VarArgsNumFPR = 0; + /// FixedParamNum - Number of fixed parameter. + unsigned FixedParamNum = 0; + + /// FloatingParamNum - Number of floating point parameter. + unsigned FloatingPointParamNum = 0; + + /// ParamType - Encode type for every parameter + /// in the order of parameters passing in. + /// Bitstring starts from the most significant (leftmost) bit. + /// '0'b => fixed parameter. + /// '10'b => floating point short parameter. 
+ /// '11'b => floating point long parameter. + uint32_t ParameterType = 0; + /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4. int CRSpillFrameIndex = 0; @@ -175,9 +196,6 @@ public: void setDisableNonVolatileCR() { DisableNonVolatileCR = true; } bool isNonVolatileCRDisabled() const { return DisableNonVolatileCR; } - void setSpillsVRSAVE() { SpillsVRSAVE = true; } - bool isVRSAVESpilled() const { return SpillsVRSAVE; } - void setLRStoreRequired() { LRStoreRequired = true; } bool isLRStoreRequired() const { return LRStoreRequired; } @@ -196,6 +214,13 @@ public: unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; } void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; } + unsigned getFixedParamNum() const { return FixedParamNum; } + + unsigned getFloatingPointParamNum() const { return FloatingPointParamNum; } + + uint32_t getParameterType() const { return ParameterType; } + void appendParameterType(ParamType Type); + unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; } void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; } diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp index 5649d7d13966..ce615e554d94 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp +++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp @@ -49,10 +49,103 @@ bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand, void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const { - GenericScheduler::tryCandidate(Cand, TryCand, Zone); + // From GenericScheduler::tryCandidate - if (!Cand.isValid() || !Zone) + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; return; + } + + // Bias PhysReg Defs and copies to their uses and defined respectively. 
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), + biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) + return; + + // Avoid exceeding the target's limit. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, + RegExcess, TRI, DAG->MF)) + return; + + // Avoid increasing the max critical pressure in the scheduled region. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, + TryCand, Cand, RegCritical, TRI, DAG->MF)) + return; + + // We only compare a subset of features when comparing nodes between + // Top and Bottom boundary. Some properties are simply incomparable, in many + // other instances we should only override the other boundary if something + // is a clear good pick on one boundary. Skip heuristics that are more + // "tie-breaking" in nature. + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // For loops that are acyclic path limited, aggressively schedule for + // latency. Within an single cycle, whenever CurrMOps > 0, allow normal + // heuristics to take precedence. + if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && + tryLatency(TryCand, Cand, *Zone)) + return; + + // Prioritize instructions that read unbuffered resources by stall cycles. + if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) + return; + } + + // Keep clustered nodes together to encourage downstream peephole + // optimizations which may reduce resource requirements. + // + // This is a best effort to set things up for a post-RA pass. Optimizations + // like generating loads of multiple registers should ideally be done within + // the scheduler pass by combining the loads during DAG postprocessing. + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? 
DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + if (tryGreater(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) + return; + + if (SameBoundary) { + // Weak edges are for clustering and other constraints. + if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) + return; + } + + // Avoid increasing the max pressure of the entire region. + if (DAG->isTrackingPressure() && + tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, + Cand, RegMax, TRI, DAG->MF)) + return; + + if (SameBoundary) { + // Avoid critical resource consumption and balance the schedule. + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return; + + // Avoid serializing long latency dependence chains. + // For acyclic path limited loops, latency was already checked above. + if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) + return; + + // Fall through to original instruction order. + if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || + (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { + TryCand.Reason = NodeOrder; + } + } + + // GenericScheduler::tryCandidate end // Add powerpc specific heuristic only when TryCand isn't selected or // selected as node order. @@ -61,8 +154,10 @@ void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand, // There are some benefits to schedule the ADDI before the load to hide the // latency, as RA may create a true dependency between the load and addi. 
- if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) - return; + if (SameBoundary) { + if (biasAddiLoadCandidate(Cand, TryCand, *Zone)) + return; + } } bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand, @@ -79,11 +174,44 @@ bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand, void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) { - PostGenericScheduler::tryCandidate(Cand, TryCand); + // From PostGenericScheduler::tryCandidate + + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + // Prioritize instructions that read unbuffered resources by stall cycles. + if (tryLess(Top.getLatencyStallCycles(TryCand.SU), + Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) + return; - if (!Cand.isValid()) + // Keep clustered nodes together. + if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), + Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster)) return; + // Avoid critical resource consumption and balance the schedule. + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, TryCand, Cand, + ResourceDemand)) + return; + + // Avoid serializing long latency dependence chains. + if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) { + return; + } + + // Fall through to original instruction order. + if (TryCand.SU->NodeNum < Cand.SU->NodeNum) + TryCand.Reason = NodeOrder; + + // PostGenericScheduler::tryCandidate end + // Add powerpc post ra specific heuristic only when TryCand isn't selected or // selected as node order. 
if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand) diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp index 815dfd1402f4..d12c6d9cd406 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp @@ -51,8 +51,8 @@ public: Kd(Kind), Supported(HasFeature), DepOpIdx(Index), OpSet1(First), OpSet2(Second) {} - bool hasOp1(unsigned Opc) const { return OpSet1.count(Opc) != 0; } - bool hasOp2(unsigned Opc) const { return OpSet2.count(Opc) != 0; } + bool hasOp1(unsigned Opc) const { return OpSet1.contains(Opc); } + bool hasOp2(unsigned Opc) const { return OpSet2.contains(Opc); } bool isSupported() const { return Supported; } Optional<unsigned> depOpIdx() const { if (DepOpIdx < 0) diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index 4ea714ff15f7..a8853609a7c8 100644 --- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -21,8 +21,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCContext.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -39,10 +39,54 @@ STATISTIC(NumFrameOffFoldInPreEmit, "Number of folding frame offset by using r+r in pre-emit peephole"); static cl::opt<bool> +EnablePCRelLinkerOpt("ppc-pcrel-linker-opt", cl::Hidden, cl::init(true), + cl::desc("enable PC Relative linker optimization")); + +static cl::opt<bool> RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true), cl::desc("Run pre-emit peephole optimizations.")); namespace { + +static bool hasPCRelativeForm(MachineInstr &Use) { + switch (Use.getOpcode()) { + default: + return false; + case PPC::LBZ: + case PPC::LBZ8: + case PPC::LHA: + case PPC::LHA8: + case PPC::LHZ: + case PPC::LHZ8: + 
case PPC::LWZ: + case PPC::LWZ8: + case PPC::STB: + case PPC::STB8: + case PPC::STH: + case PPC::STH8: + case PPC::STW: + case PPC::STW8: + case PPC::LD: + case PPC::STD: + case PPC::LWA: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::LXV: + case PPC::STXSD: + case PPC::STXSSP: + case PPC::STXV: + case PPC::LFD: + case PPC::LFS: + case PPC::STFD: + case PPC::STFS: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + return true; + } +} + class PPCPreEmitPeephole : public MachineFunctionPass { public: static char ID; @@ -77,7 +121,7 @@ namespace { for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) { // Skip load immediate that is marked to be erased later because it // cannot be used to replace any other instructions. - if (InstrsToErase.find(&*BBI) != InstrsToErase.end()) + if (InstrsToErase.contains(&*BBI)) continue; // Skip non-load immediate. unsigned Opc = BBI->getOpcode(); @@ -172,6 +216,196 @@ namespace { return !InstrsToErase.empty(); } + // Check if this instruction is a PLDpc that is part of a GOT indirect + // access. + bool isGOTPLDpc(MachineInstr &Instr) { + if (Instr.getOpcode() != PPC::PLDpc) + return false; + + // The result must be a register. + const MachineOperand &LoadedAddressReg = Instr.getOperand(0); + if (!LoadedAddressReg.isReg()) + return false; + + // Make sure that this is a global symbol. + const MachineOperand &SymbolOp = Instr.getOperand(1); + if (!SymbolOp.isGlobal()) + return false; + + // Finally return true only if the GOT flag is present. + return (SymbolOp.getTargetFlags() & PPCII::MO_GOT_FLAG); + } + + bool addLinkerOpt(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) { + MachineFunction *MF = MBB.getParent(); + // If the linker opt is disabled then just return. + if (!EnablePCRelLinkerOpt) + return false; + + // Add this linker opt only if we are using PC Relative memops. 
+ if (!MF->getSubtarget<PPCSubtarget>().isUsingPCRelativeCalls()) + return false; + + // Struct to keep track of one def/use pair for a GOT indirect access. + struct GOTDefUsePair { + MachineBasicBlock::iterator DefInst; + MachineBasicBlock::iterator UseInst; + Register DefReg; + Register UseReg; + bool StillValid; + }; + // Vector of def/ues pairs in this basic block. + SmallVector<GOTDefUsePair, 4> CandPairs; + SmallVector<GOTDefUsePair, 4> ValidPairs; + bool MadeChange = false; + + // Run through all of the instructions in the basic block and try to + // collect potential pairs of GOT indirect access instructions. + for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) { + // Look for the initial GOT indirect load. + if (isGOTPLDpc(*BBI)) { + GOTDefUsePair CurrentPair{BBI, MachineBasicBlock::iterator(), + BBI->getOperand(0).getReg(), + PPC::NoRegister, true}; + CandPairs.push_back(CurrentPair); + continue; + } + + // We haven't encountered any new PLD instructions, nothing to check. + if (CandPairs.empty()) + continue; + + // Run through the candidate pairs and see if any of the registers + // defined in the PLD instructions are used by this instruction. + // Note: the size of CandPairs can change in the loop. + for (unsigned Idx = 0; Idx < CandPairs.size(); Idx++) { + GOTDefUsePair &Pair = CandPairs[Idx]; + // The instruction does not use or modify this PLD's def reg, + // ignore it. + if (!BBI->readsRegister(Pair.DefReg, TRI) && + !BBI->modifiesRegister(Pair.DefReg, TRI)) + continue; + + // The use needs to be used in the address compuation and not + // as the register being stored for a store. + const MachineOperand *UseOp = + hasPCRelativeForm(*BBI) ? &BBI->getOperand(2) : nullptr; + + // Check for a valid use. 
+ if (UseOp && UseOp->isReg() && UseOp->getReg() == Pair.DefReg && + UseOp->isUse() && UseOp->isKill()) { + Pair.UseInst = BBI; + Pair.UseReg = BBI->getOperand(0).getReg(); + ValidPairs.push_back(Pair); + } + CandPairs.erase(CandPairs.begin() + Idx); + } + } + + // Go through all of the pairs and check for any more valid uses. + for (auto Pair = ValidPairs.begin(); Pair != ValidPairs.end(); Pair++) { + // We shouldn't be here if we don't have a valid pair. + assert(Pair->UseInst.isValid() && Pair->StillValid && + "Kept an invalid def/use pair for GOT PCRel opt"); + // We have found a potential pair. Search through the instructions + // between the def and the use to see if it is valid to mark this as a + // linker opt. + MachineBasicBlock::iterator BBI = Pair->DefInst; + ++BBI; + for (; BBI != Pair->UseInst; ++BBI) { + if (BBI->readsRegister(Pair->UseReg, TRI) || + BBI->modifiesRegister(Pair->UseReg, TRI)) { + Pair->StillValid = false; + break; + } + } + + if (!Pair->StillValid) + continue; + + // The load/store instruction that uses the address from the PLD will + // either use a register (for a store) or define a register (for the + // load). That register will be added as an implicit def to the PLD + // and as an implicit use on the second memory op. This is a precaution + // to prevent future passes from using that register between the two + // instructions. + MachineOperand ImplDef = + MachineOperand::CreateReg(Pair->UseReg, true, true); + MachineOperand ImplUse = + MachineOperand::CreateReg(Pair->UseReg, false, true); + Pair->DefInst->addOperand(ImplDef); + Pair->UseInst->addOperand(ImplUse); + + // Create the symbol. 
+ MCContext &Context = MF->getContext(); + MCSymbol *Symbol = Context.createNamedTempSymbol("pcrel"); + MachineOperand PCRelLabel = + MachineOperand::CreateMCSymbol(Symbol, PPCII::MO_PCREL_OPT_FLAG); + Pair->DefInst->addOperand(*MF, PCRelLabel); + Pair->UseInst->addOperand(*MF, PCRelLabel); + MadeChange |= true; + } + return MadeChange; + } + + // This function removes redundant pairs of accumulator prime/unprime + // instructions. In some situations, it's possible the compiler inserts an + // accumulator prime instruction followed by an unprime instruction (e.g. + // when we store an accumulator after restoring it from a spill). If the + // accumulator is not used between the two, they can be removed. This + // function removes these redundant pairs from basic blocks. + // The algorithm is quite straightforward - every time we encounter a prime + // instruction, the primed register is added to a candidate set. Any use + // other than a prime removes the candidate from the set and any de-prime + // of a current candidate marks both the prime and de-prime for removal. + // This way we ensure we only remove prime/de-prime *pairs* with no + // intervening uses. + bool removeAccPrimeUnprime(MachineBasicBlock &MBB) { + DenseSet<MachineInstr *> InstrsToErase; + // Initially, none of the acc registers are candidates. + SmallVector<MachineInstr *, 8> Candidates( + PPC::UACCRCRegClass.getNumRegs(), nullptr); + + for (MachineInstr &BBI : MBB.instrs()) { + unsigned Opc = BBI.getOpcode(); + // If we are visiting a xxmtacc instruction, we add it and its operand + // register to the candidate set. + if (Opc == PPC::XXMTACC) { + Register Acc = BBI.getOperand(0).getReg(); + assert(PPC::ACCRCRegClass.contains(Acc) && + "Unexpected register for XXMTACC"); + Candidates[Acc - PPC::ACC0] = &BBI; + } + // If we are visiting a xxmfacc instruction and its operand register is + // in the candidate set, we mark the two instructions for removal. 
+ else if (Opc == PPC::XXMFACC) { + Register Acc = BBI.getOperand(0).getReg(); + assert(PPC::ACCRCRegClass.contains(Acc) && + "Unexpected register for XXMFACC"); + if (!Candidates[Acc - PPC::ACC0]) + continue; + InstrsToErase.insert(&BBI); + InstrsToErase.insert(Candidates[Acc - PPC::ACC0]); + } + // If we are visiting an instruction using an accumulator register + // as operand, we remove it from the candidate set. + else { + for (MachineOperand &Operand : BBI.operands()) { + if (!Operand.isReg()) + continue; + Register Reg = Operand.getReg(); + if (PPC::ACCRCRegClass.contains(Reg)) + Candidates[Reg - PPC::ACC0] = nullptr; + } + } + } + + for (MachineInstr *MI : InstrsToErase) + MI->eraseFromParent(); + NumRemovedInPreEmit += InstrsToErase.size(); + return !InstrsToErase.empty(); + } + bool runOnMachineFunction(MachineFunction &MF) override { if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) { // Remove UNENCODED_NOP even when this pass is disabled. @@ -192,6 +426,8 @@ namespace { SmallVector<MachineInstr *, 4> InstrsToErase; for (MachineBasicBlock &MBB : MF) { Changed |= removeRedundantLIs(MBB, TRI); + Changed |= addLinkerOpt(MBB, TRI); + Changed |= removeAccPrimeUnprime(MBB); for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); if (Opc == PPC::UNENCODED_NOP) { diff --git a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp b/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp deleted file mode 100644 index 6e9042643820..000000000000 --- a/llvm/lib/Target/PowerPC/PPCQPXLoadSplat.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===----- PPCQPXLoadSplat.cpp - QPX Load Splat Simplification ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// The QPX vector registers overlay the scalar floating-point registers, and -// any scalar floating-point loads splat their value across all vector lanes. -// Thus, if we have a scalar load followed by a splat, we can remove the splat -// (i.e. replace the load with a load-and-splat pseudo instruction). -// -// This pass must run after anything that might do store-to-load forwarding. -// -//===----------------------------------------------------------------------===// - -#include "PPC.h" -#include "PPCInstrBuilder.h" -#include "PPCInstrInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "ppc-qpx-load-splat" - -STATISTIC(NumSimplified, "Number of QPX load splats simplified"); - -namespace { - struct PPCQPXLoadSplat : public MachineFunctionPass { - static char ID; - PPCQPXLoadSplat() : MachineFunctionPass(ID) { - initializePPCQPXLoadSplatPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &Fn) override; - - StringRef getPassName() const override { - return "PowerPC QPX Load Splat Simplification"; - } - }; - char PPCQPXLoadSplat::ID = 0; -} - -INITIALIZE_PASS(PPCQPXLoadSplat, "ppc-qpx-load-splat", - "PowerPC QPX Load Splat Simplification", - false, false) - -FunctionPass *llvm::createPPCQPXLoadSplatPass() { - return new PPCQPXLoadSplat(); -} - -bool PPCQPXLoadSplat::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(MF.getFunction())) - return false; - - bool MadeChange = false; - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - - for (auto MFI = MF.begin(), MFIE = MF.end(); MFI != MFIE; ++MFI) { - 
MachineBasicBlock *MBB = &*MFI; - SmallVector<MachineInstr *, 4> Splats; - - for (auto MBBI = MBB->rbegin(); MBBI != MBB->rend(); ++MBBI) { - MachineInstr *MI = &*MBBI; - - if (MI->hasUnmodeledSideEffects() || MI->isCall()) { - Splats.clear(); - continue; - } - - // We're looking for a sequence like this: - // %f0 = LFD 0, killed %x3, implicit-def %qf0; mem:LD8[%a](tbaa=!2) - // %qf1 = QVESPLATI killed %qf0, 0, implicit %rm - - for (auto SI = Splats.begin(); SI != Splats.end();) { - MachineInstr *SMI = *SI; - Register SplatReg = SMI->getOperand(0).getReg(); - Register SrcReg = SMI->getOperand(1).getReg(); - - if (MI->modifiesRegister(SrcReg, TRI)) { - switch (MI->getOpcode()) { - default: - SI = Splats.erase(SI); - continue; - case PPC::LFS: - case PPC::LFD: - case PPC::LFSU: - case PPC::LFDU: - case PPC::LFSUX: - case PPC::LFDUX: - case PPC::LFSX: - case PPC::LFDX: - case PPC::LFIWAX: - case PPC::LFIWZX: - if (SplatReg != SrcReg) { - // We need to change the load to define the scalar subregister of - // the QPX splat source register. - unsigned SubRegIndex = - TRI->getSubRegIndex(SrcReg, MI->getOperand(0).getReg()); - Register SplatSubReg = TRI->getSubReg(SplatReg, SubRegIndex); - - // Substitute both the explicit defined register, and also the - // implicit def of the containing QPX register. - MI->getOperand(0).setReg(SplatSubReg); - MI->substituteRegister(SrcReg, SplatReg, 0, *TRI); - } - - SI = Splats.erase(SI); - - // If SMI is directly after MI, then MBBI's base iterator is - // pointing at SMI. Adjust MBBI around the call to erase SMI to - // avoid invalidating MBBI. - ++MBBI; - SMI->eraseFromParent(); - --MBBI; - - ++NumSimplified; - MadeChange = true; - continue; - } - } - - // If this instruction defines the splat register, then we cannot move - // the previous definition above it. 
If it reads from the splat - // register, then it must already be alive from some previous - // definition, and if the splat register is different from the source - // register, then this definition must not be the load for which we're - // searching. - if (MI->modifiesRegister(SplatReg, TRI) || - (SrcReg != SplatReg && - MI->readsRegister(SplatReg, TRI))) { - SI = Splats.erase(SI); - continue; - } - - ++SI; - } - - if (MI->getOpcode() != PPC::QVESPLATI && - MI->getOpcode() != PPC::QVESPLATIs && - MI->getOpcode() != PPC::QVESPLATIb) - continue; - if (MI->getOperand(2).getImm() != 0) - continue; - - // If there are other uses of the scalar value after this, replacing - // those uses might be non-trivial. - if (!MI->getOperand(1).isKill()) - continue; - - Splats.push_back(MI); - } - } - - return MadeChange; -} diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 90cc81beb89d..5cee00c61fc1 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -206,9 +206,9 @@ static bool splitMBB(BlockSplitInfo &BSI) { NewMBB->splice(NewMBB->end(), ThisMBB, InsertPoint, ThisMBB->end()); NewMBB->transferSuccessors(ThisMBB); if (!ProbOrigTarget.isUnknown()) { - auto MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigTarget); + auto MBBI = find(NewMBB->successors(), OrigTarget); NewMBB->setSuccProbability(MBBI, ProbOrigTarget); - MBBI = std::find(NewMBB->succ_begin(), NewMBB->succ_end(), OrigFallThrough); + MBBI = find(NewMBB->successors(), OrigFallThrough); NewMBB->setSuccProbability(MBBI, ProbOrigFallThrough); } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 35f5e1fbebcd..178a13443e2a 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -75,6 +75,21 @@ MaxCRBitSpillDist("ppc-max-crbit-spill-dist", "spill on ppc"), cl::Hidden, 
cl::init(100)); +// Copies/moves of physical accumulators are expensive operations +// that should be avoided whenever possible. MMA instructions are +// meant to be used in performance-sensitive computational kernels. +// This option is provided, at least for the time being, to give the +// user a tool to detect this expensive operation and either rework +// their code or report a compiler bug if that turns out to be the +// cause. +#ifndef NDEBUG +static cl::opt<bool> +ReportAccMoves("ppc-report-acc-moves", + cl::desc("Emit information about accumulator register spills " + "and copies"), + cl::Hidden, cl::init(false)); +#endif + static unsigned offsetMinAlignForOpcode(unsigned OpC); PPCRegisterInfo::PPCRegisterInfo(const PPCTargetMachine &TM) @@ -141,6 +156,10 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const MCPhysReg* PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>(); + if (Subtarget.isAIXABI() && + (Subtarget.hasAltivec() && !TM.getAIXExtendedAltivecABI())) + report_fatal_error("the default AIX Altivec ABI is not yet " + "supported."); if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) { if (!TM.isPPC64() && Subtarget.isAIXABI()) report_fatal_error("AnyReg unimplemented on 32-bit AIX."); @@ -187,8 +206,11 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return SaveR2 ? CSR_PPC64_R2_SaveList : CSR_PPC64_SaveList; } // 32-bit targets. - if (Subtarget.isAIXABI()) + if (Subtarget.isAIXABI()) { + if (Subtarget.hasAltivec()) + return CSR_AIX32_Altivec_SaveList; return CSR_AIX32_SaveList; + } if (Subtarget.hasAltivec()) return CSR_SVR432_Altivec_SaveList; else if (Subtarget.hasSPE()) @@ -209,8 +231,10 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } if (Subtarget.isAIXABI()) { - assert(!Subtarget.hasAltivec() && "Altivec is not implemented on AIX yet."); - return TM.isPPC64() ? 
CSR_PPC64_RegMask : CSR_AIX32_RegMask; + return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask + : CSR_PPC64_RegMask) + : (Subtarget.hasAltivec() ? CSR_AIX32_Altivec_RegMask + : CSR_AIX32_RegMask); } if (CC == CallingConv::Cold) { @@ -404,9 +428,6 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } case PPC::F8RCRegClassID: case PPC::F4RCRegClassID: - case PPC::QFRCRegClassID: - case PPC::QSRCRegClassID: - case PPC::QBRCRegClassID: case PPC::VRRCRegClassID: case PPC::VFRCRegClassID: case PPC::VSLRCRegClassID: @@ -624,21 +645,30 @@ void PPCRegisterInfo::lowerPrepareProbedAlloca( bool LP64 = TM.isPPC64(); DebugLoc dl = MI.getDebugLoc(); Register FramePointer = MI.getOperand(0).getReg(); - Register FinalStackPtr = MI.getOperand(1).getReg(); + const Register ActualNegSizeReg = MI.getOperand(1).getReg(); bool KillNegSizeReg = MI.getOperand(2).isKill(); Register NegSizeReg = MI.getOperand(2).getReg(); - prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer); - if (LP64) { - BuildMI(MBB, II, dl, TII.get(PPC::ADD8), FinalStackPtr) - .addReg(PPC::X1) - .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); - - } else { - BuildMI(MBB, II, dl, TII.get(PPC::ADD4), FinalStackPtr) - .addReg(PPC::R1) - .addReg(NegSizeReg, getKillRegState(KillNegSizeReg)); + const MCInstrDesc &CopyInst = TII.get(LP64 ? PPC::OR8 : PPC::OR); + // RegAllocator might allocate FramePointer and NegSizeReg in the same phyreg. + if (FramePointer == NegSizeReg) { + assert(KillNegSizeReg && "FramePointer is a def and NegSizeReg is an use, " + "NegSizeReg should be killed"); + // FramePointer is clobbered earlier than the use of NegSizeReg in + // prepareDynamicAlloca, save NegSizeReg in ActualNegSizeReg to avoid + // misuse. 
+ BuildMI(MBB, II, dl, CopyInst, ActualNegSizeReg) + .addReg(NegSizeReg) + .addReg(NegSizeReg); + NegSizeReg = ActualNegSizeReg; + KillNegSizeReg = false; } - + prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer); + // NegSizeReg might be updated in prepareDynamicAlloca if MaxAlign > + // TargetAlign. + if (NegSizeReg != ActualNegSizeReg) + BuildMI(MBB, II, dl, CopyInst, ActualNegSizeReg) + .addReg(NegSizeReg) + .addReg(NegSizeReg); MBB.erase(II); } @@ -821,6 +851,16 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, SpillsKnownBit = true; break; default: + // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all + // bits (specifically, it produces a -1 if the CR bit is set). Ultimately, + // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit + // register), and SETNBC will set this. + if (Subtarget.isISA3_1()) { + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg) + .addReg(SrcReg, RegState::Undef); + break; + } + // On Power9, we can use SETB to extract the LT bit. This only works for // the LT bit since SETB produces -1/1/0 for LT/GT/<neither>. So the value // of the bit we care about (32-bit sign bit) will be set to the value of @@ -920,54 +960,104 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, MBB.erase(II); } -void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, - unsigned FrameIndex) const { - // Get the instruction. - MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset> - // Get the instruction's basic block. +void PPCRegisterInfo::emitAccCopyInfo(MachineBasicBlock &MBB, + MCRegister DestReg, MCRegister SrcReg) { +#ifdef NDEBUG + return; +#else + if (ReportAccMoves) { + std::string Dest = PPC::ACCRCRegClass.contains(DestReg) ? "acc" : "uacc"; + std::string Src = PPC::ACCRCRegClass.contains(SrcReg) ? 
"acc" : "uacc"; + dbgs() << "Emitting copy from " << Src << " to " << Dest << ":\n"; + MBB.dump(); + } +#endif +} + +static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed, + bool IsRestore) { +#ifdef NDEBUG + return; +#else + if (ReportAccMoves) { + dbgs() << "Emitting " << (IsPrimed ? "acc" : "uacc") << " register " + << (IsRestore ? "restore" : "spill") << ":\n"; + MBB.dump(); + } +#endif +} + +/// lowerACCSpilling - Generate the code for spilling the accumulator register. +/// Similarly to other spills/reloads that use pseudo-ops, we do not actually +/// eliminate the FrameIndex here nor compute the stack offset. We simply +/// create a real instruction with an FI and rely on eliminateFrameIndex to +/// handle the FI elimination. +void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + MachineInstr &MI = *II; // SPILL_ACC <SrcReg>, <offset> MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); - - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); + DebugLoc DL = MI.getDebugLoc(); Register SrcReg = MI.getOperand(0).getReg(); - - BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - - addFrameReference( - BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill), - FrameIndex); + bool IsKilled = MI.getOperand(0).isKill(); + + bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg); + Register Reg = + PPC::VSRp0 + (SrcReg - (IsPrimed ? 
PPC::ACC0 : PPC::UACC0)) * 2; + bool IsLittleEndian = Subtarget.isLittleEndian(); + + emitAccSpillRestoreInfo(MBB, IsPrimed, false); + + // De-prime the register being spilled, create two stores for the pair + // subregisters accounting for endianness and then re-prime the register if + // it isn't killed. This uses the Offset parameter to addFrameReference() to + // adjust the offset of the store that is within the 64-byte stack slot. + if (IsPrimed) + BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg); + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) + .addReg(Reg, getKillRegState(IsKilled)), + FrameIndex, IsLittleEndian ? 32 : 0); + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP)) + .addReg(Reg + 1, getKillRegState(IsKilled)), + FrameIndex, IsLittleEndian ? 0 : 32); + if (IsPrimed && !IsKilled) + BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg); // Discard the pseudo instruction. MBB.erase(II); } -void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, - unsigned FrameIndex) const { - // Get the instruction. - MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset> - // Get the instruction's basic block. +/// lowerACCRestore - Generate the code to restore the accumulator register. 
+void PPCRegisterInfo::lowerACCRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const { + MachineInstr &MI = *II; // <DestReg> = RESTORE_ACC <offset> MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - DebugLoc dl = MI.getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); - const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; - Register Reg = MF.getRegInfo().createVirtualRegister(GPRC); Register DestReg = MI.getOperand(0).getReg(); assert(MI.definesRegister(DestReg) && - "RESTORE_VRSAVE does not define its destination"); + "RESTORE_ACC does not define its destination"); - addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ), - Reg), FrameIndex); + bool IsPrimed = PPC::ACCRCRegClass.contains(DestReg); + Register Reg = + PPC::VSRp0 + (DestReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2; + bool IsLittleEndian = Subtarget.isLittleEndian(); - BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg) - .addReg(Reg, RegState::Kill); + emitAccSpillRestoreInfo(MBB, IsPrimed, true); + + // Create two loads for the pair subregisters accounting for endianness and + // then prime the accumulator register being restored. + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg), + FrameIndex, IsLittleEndian ? 32 : 0); + addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::LXVP), Reg + 1), + FrameIndex, IsLittleEndian ? 0 : 32); + if (IsPrimed) + BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), DestReg).addReg(DestReg); // Discard the pseudo instruction. 
MBB.erase(II); @@ -1084,7 +1174,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (FPSI && FrameIndex == FPSI && (OpC == PPC::PREPARE_PROBED_ALLOCA_64 || - OpC == PPC::PREPARE_PROBED_ALLOCA_32)) { + OpC == PPC::PREPARE_PROBED_ALLOCA_32 || + OpC == PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64 || + OpC == PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32)) { lowerPrepareProbedAlloca(II); return; } @@ -1102,11 +1194,11 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else if (OpC == PPC::RESTORE_CRBIT) { lowerCRBitRestore(II, FrameIndex); return; - } else if (OpC == PPC::SPILL_VRSAVE) { - lowerVRSAVESpilling(II, FrameIndex); + } else if (OpC == PPC::SPILL_ACC || OpC == PPC::SPILL_UACC) { + lowerACCSpilling(II, FrameIndex); return; - } else if (OpC == PPC::RESTORE_VRSAVE) { - lowerVRSAVERestore(II, FrameIndex); + } else if (OpC == PPC::RESTORE_ACC || OpC == PPC::RESTORE_UACC) { + lowerACCRestore(II, FrameIndex); return; } @@ -1283,10 +1375,9 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { /// Insert defining instruction(s) for BaseReg to /// be a pointer to FrameIdx at the beginning of the basic block. -void PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - Register BaseReg, - int FrameIdx, - int64_t Offset) const { +Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + int FrameIdx, + int64_t Offset) const { unsigned ADDriOpc = TM.isPPC64() ? 
PPC::ADDI8 : PPC::ADDI; MachineBasicBlock::iterator Ins = MBB->begin(); @@ -1299,10 +1390,14 @@ void PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *RC = getPointerRegClass(MF); + Register BaseReg = MRI.createVirtualRegister(RC); MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF)); BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); + + return BaseReg; } void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 61acd955e1cb..93f330ab56b6 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -119,10 +119,14 @@ public: unsigned FrameIndex) const; void lowerCRBitRestore(MachineBasicBlock::iterator II, unsigned FrameIndex) const; - void lowerVRSAVESpilling(MachineBasicBlock::iterator II, - unsigned FrameIndex) const; - void lowerVRSAVERestore(MachineBasicBlock::iterator II, - unsigned FrameIndex) const; + + void lowerACCSpilling(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + void lowerACCRestore(MachineBasicBlock::iterator II, + unsigned FrameIndex) const; + + static void emitAccCopyInfo(MachineBasicBlock &MBB, MCRegister DestReg, + MCRegister SrcReg); bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg, int &FrameIdx) const override; @@ -132,9 +136,8 @@ public: // Support for virtual base registers. 
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg, - int FrameIdx, - int64_t Offset) const override; + Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, + int64_t Offset) const override; void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, @@ -151,12 +154,18 @@ public: /// register name so that only the number is left. Used by for linux asm. static const char *stripRegisterPrefix(const char *RegName) { switch (RegName[0]) { + case 'a': + if (RegName[1] == 'c' && RegName[2] == 'c') + return RegName + 3; + break; case 'r': case 'f': - case 'q': // for QPX case 'v': - if (RegName[1] == 's') + if (RegName[1] == 's') { + if (RegName[2] == 'p') + return RegName + 3; return RegName + 2; + } return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index b45757c1acc5..e03617aa75ff 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -16,6 +16,10 @@ def sub_eq : SubRegIndex<1, 2>; def sub_un : SubRegIndex<1, 3>; def sub_32 : SubRegIndex<32>; def sub_64 : SubRegIndex<64>; +def sub_vsx0 : SubRegIndex<128>; +def sub_vsx1 : SubRegIndex<128, 128>; +def sub_pair0 : SubRegIndex<256>; +def sub_pair1 : SubRegIndex<256, 256>; } @@ -54,13 +58,6 @@ class FPR<bits<5> num, string n> : PPCReg<n> { let HWEncoding{4-0} = num; } -// QFPR - One of the 32 256-bit floating-point vector registers (used for QPX) -class QFPR<FPR SubReg, string n> : PPCReg<n> { - let HWEncoding = SubReg.HWEncoding; - let SubRegs = [SubReg]; - let SubRegIndices = [sub_64]; -} - // VF - One of the 32 64-bit floating-point subregisters of the vector // registers (used by VSX). 
class VF<bits<5> num, string n> : PPCReg<n> { @@ -101,6 +98,27 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> { let HWEncoding{4-0} = num; } +// ACC - One of the 8 512-bit VSX accumulators. +class ACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> { + let HWEncoding{2-0} = num; + let SubRegs = subregs; +} + +// UACC - One of the 8 512-bit VSX accumulators prior to being primed. +// Without using this register class, the register allocator has no way to +// differentiate a primed accumulator from an unprimed accumulator. +// This may result in invalid copies between primed and unprimed accumulators. +class UACC<bits<3> num, string n, list<Register> subregs> : PPCReg<n> { + let HWEncoding{2-0} = num; + let SubRegs = subregs; +} + +// VSR Pairs - One of the 32 paired even-odd consecutive VSRs. +class VSRPair<bits<5> num, string n, list<Register> subregs> : PPCReg<n> { + let HWEncoding{4-0} = num; + let SubRegs = subregs; +} + // General-purpose registers foreach Index = 0-31 in { def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>; @@ -132,12 +150,6 @@ foreach Index = 0-31 in { DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; } -// QPX Floating-point registers -foreach Index = 0-31 in { - def QF#Index : QFPR<!cast<FPR>("F"#Index), "q"#Index>, - DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; -} - // Vector registers foreach Index = 0-31 in { def V#Index : VR<!cast<VF>("VF"#Index), "v"#Index>, @@ -156,6 +168,23 @@ foreach Index = 32-63 in { def VSX#Index : VSXReg<Index, "vs"#Index>; } +let SubRegIndices = [sub_vsx0, sub_vsx1] in { + // VSR pairs 0 - 15 (corresponding to VSRs 0 - 30 paired with 1 - 31). + foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in { + def VSRp#!srl(Index, 1) : VSRPair<!srl(Index, 1), "vsp"#Index, + [!cast<VSRL>("VSL"#Index), !cast<VSRL>("VSL"#!add(Index, 1))]>, + DwarfRegNum<[0, 0]>; + } + + // VSR pairs 16 - 31 (corresponding to VSRs 32 - 62 paired with 33 - 63). 
+ foreach Index = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 } in { + def VSRp#!add(!srl(Index, 1), 16) : + VSRPair<!add(!srl(Index, 1), 16), "vsp"#!add(Index, 32), + [!cast<VR>("V"#Index), !cast<VR>("V"#!add(Index, 1))]>, + DwarfRegNum<[0, 0]>; + } +} + // The representation of r0 when treated as the constant 0. def ZERO : GPR<0, "0">, DwarfRegAlias<R0>; def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>; @@ -343,16 +372,6 @@ def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC, // Register class for single precision scalars in VSX registers def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>; -// For QPX -def QFRC : RegisterClass<"PPC", [v4f64], 256, (add (sequence "QF%u", 0, 13), - (sequence "QF%u", 31, 14))>; -def QSRC : RegisterClass<"PPC", [v4f32], 128, (add QFRC)>; -def QBRC : RegisterClass<"PPC", [v4i1], 256, (add QFRC)> { - // These are actually stored as floating-point values where a positive - // number is true and anything else (including NaN) is false. 
- let Size = 256; -} - def CRBITRC : RegisterClass<"PPC", [i1], 32, (add CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT, CR3EQ, CR3UN, @@ -395,3 +414,44 @@ def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; } +let SubRegIndices = [sub_pair0, sub_pair1] in { + def ACC0 : ACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>; + def ACC1 : ACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>; + def ACC2 : ACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>; + def ACC3 : ACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>; + def ACC4 : ACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>; + def ACC5 : ACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>; + def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>; + def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>; +} +def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3, + ACC4, ACC5, ACC6, ACC7)> { + let Size = 512; +} + +let SubRegIndices = [sub_pair0, sub_pair1] in { + def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[0, 0]>; + def UACC1 : UACC<1, "acc1", [VSRp2, VSRp3]>, DwarfRegNum<[0, 0]>; + def UACC2 : UACC<2, "acc2", [VSRp4, VSRp5]>, DwarfRegNum<[0, 0]>; + def UACC3 : UACC<3, "acc3", [VSRp6, VSRp7]>, DwarfRegNum<[0, 0]>; + def UACC4 : UACC<4, "acc4", [VSRp8, VSRp9]>, DwarfRegNum<[0, 0]>; + def UACC5 : UACC<5, "acc5", [VSRp10, VSRp11]>, DwarfRegNum<[0, 0]>; + def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[0, 0]>; + def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[0, 0]>; +} +def UACCRC : RegisterClass<"PPC", [v512i1], 128, + (add UACC0, UACC1, UACC2, UACC3, + UACC4, UACC5, UACC6, UACC7)> { + let Size = 512; +} + +// Allocate in the same order as the underlying VSX registers. 
+def VSRpRC : + RegisterClass<"PPC", [v256i1], 128, + (add (sequence "VSRp%u", 0, 6), + (sequence "VSRp%u", 15, 7), VSRp17, VSRp18, + VSRp16, VSRp19, VSRp20, VSRp21, VSRp22, VSRp23, + VSRp24, VSRp25, VSRp31, VSRp30, VSRp29, VSRp28, + VSRp27, VSRp26)> { + let Size = 256; +} diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td index 0a1ae7e55b3c..571cc219ff2b 100644 --- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td +++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td @@ -40,12 +40,11 @@ def P9Model : SchedMachineModel { let CompleteModel = 1; - // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing - // Engine), prefixed instructions on Power 9, PC relative mem ops, or - // instructions introduced in ISA 3.1. - let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops, - IsISA3_1]; - + // Do not support SPE (Signal Processing Engine), prefixed instructions on + // Power 9, paired vector mem ops, MMA, PC relative mem ops, or instructions + // introduced in ISA 3.1. 
+ let UnsupportedFeatures = [HasSPE, PrefixInstrs, PairedVectorMemops, MMA, + PCRelativeMemops, IsISA3_1]; } let SchedModel = P9Model in { diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 3836cc960394..d31195f67ef1 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -11,9 +11,13 @@ //===----------------------------------------------------------------------===// #include "PPCSubtarget.h" +#include "GISel/PPCCallLowering.h" +#include "GISel/PPCLegalizerInfo.h" +#include "GISel/PPCRegisterBankInfo.h" #include "PPC.h" #include "PPCRegisterInfo.h" #include "PPCTargetMachine.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/Attributes.h" @@ -35,10 +39,6 @@ using namespace llvm; static cl::opt<bool> UseSubRegLiveness("ppc-track-subreg-liveness", cl::desc("Enable subregister liveness tracking for PPC"), cl::Hidden); -static cl::opt<bool> QPXStackUnaligned("qpx-stack-unaligned", - cl::desc("Even when QPX is enabled the stack is not 32-byte aligned"), - cl::Hidden); - static cl::opt<bool> EnableMachinePipeliner("ppc-enable-pipeliner", cl::desc("Enable Machine Pipeliner for PPC"), @@ -53,11 +53,19 @@ PPCSubtarget &PPCSubtarget::initializeSubtargetDependencies(StringRef CPU, PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const PPCTargetMachine &TM) - : PPCGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT), + : PPCGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), TargetTriple(TT), IsPPC64(TargetTriple.getArch() == Triple::ppc64 || TargetTriple.getArch() == Triple::ppc64le), TM(TM), FrameLowering(initializeSubtargetDependencies(CPU, FS)), - InstrInfo(*this), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) { + CallLoweringInfo.reset(new PPCCallLowering(*getTargetLowering())); + Legalizer.reset(new 
PPCLegalizerInfo(*this)); + auto *RBI = new PPCRegisterBankInfo(*getRegisterInfo()); + RegBankInfo.reset(RBI); + + InstSelector.reset(createPPCInstructionSelector( + *static_cast<const PPCTargetMachine *>(&TM), *this, *RBI)); +} void PPCSubtarget::initializeEnvironment() { StackAlignment = Align(16); @@ -69,8 +77,8 @@ void PPCSubtarget::initializeEnvironment() { HasHardFloat = false; HasAltivec = false; HasSPE = false; + HasEFPU2 = false; HasFPU = false; - HasQPX = false; HasVSX = false; NeedsTwoConstNR = false; HasP8Vector = false; @@ -78,6 +86,7 @@ void PPCSubtarget::initializeEnvironment() { HasP8Crypto = false; HasP9Vector = false; HasP9Altivec = false; + HasMMA = false; HasP10Vector = false; HasPrefixInstrs = false; HasPCRelativeMemops = false; @@ -109,10 +118,10 @@ void PPCSubtarget::initializeEnvironment() { HasInvariantFunctionDescriptors = false; HasPartwordAtomics = false; HasDirectMove = false; - IsQPXStackUnaligned = false; HasHTM = false; HasFloat128 = false; HasFusion = false; + HasStoreFusion = false; HasAddiLoadFusion = false; HasAddisLoadFusion = false; IsISA3_0 = false; @@ -122,7 +131,10 @@ void PPCSubtarget::initializeEnvironment() { VectorsUseTwoUnits = false; UsePPCPreRASchedStrategy = false; UsePPCPostRASchedStrategy = false; + PairedVectorMemops = false; PredictableSelectIsExpensive = false; + HasModernAIXAs = false; + IsAIX = false; HasPOPCNTD = POPCNTD_Unavailable; } @@ -144,7 +156,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { InstrItins = getInstrItineraryForCPU(CPUName); // Parse features string. - ParseSubtargetFeatures(CPUName, FS); + ParseSubtargetFeatures(CPUName, /*TuneCPU*/ CPUName, FS); // If the user requested use of 64-bit regs, but the cpu selected doesn't // support it, ignore. 
@@ -158,7 +170,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); - if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) + if (HasSPE && (HasAltivec || HasVSX || HasFPU)) report_fatal_error( "SPE and traditional floating point cannot both be enabled.\n", false); @@ -166,15 +178,12 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!HasSPE) HasFPU = true; - // QPX requires a 32-byte aligned stack. Note that we need to do this if - // we're compiling for a BG/Q system regardless of whether or not QPX - // is enabled because external functions will assume this alignment. - IsQPXStackUnaligned = QPXStackUnaligned; StackAlignment = getPlatformStackAlignment(); // Determine endianness. // FIXME: Part of the TargetMachine. - IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le); + IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le || + TargetTriple.getArch() == Triple::ppcle); } bool PPCSubtarget::enableMachineScheduler() const { return true; } @@ -235,3 +244,20 @@ bool PPCSubtarget::isUsingPCRelativeCalls() const { return isPPC64() && hasPCRelativeMemops() && isELFv2ABI() && CodeModel::Medium == getTargetMachine().getCodeModel(); } + +// GlobalISEL +const CallLowering *PPCSubtarget::getCallLowering() const { + return CallLoweringInfo.get(); +} + +const RegisterBankInfo *PPCSubtarget::getRegBankInfo() const { + return RegBankInfo.get(); +} + +const LegalizerInfo *PPCSubtarget::getLegalizerInfo() const { + return Legalizer.get(); +} + +InstructionSelector *PPCSubtarget::getInstructionSelector() const { + return InstSelector.get(); +} diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index ec329022c457..50d89390d5bc 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -17,6 +17,9 @@ #include "PPCISelLowering.h" 
#include "PPCInstrInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" @@ -97,7 +100,7 @@ protected: bool HasAltivec; bool HasFPU; bool HasSPE; - bool HasQPX; + bool HasEFPU2; bool HasVSX; bool NeedsTwoConstNR; bool HasP8Vector; @@ -108,6 +111,7 @@ protected: bool HasP10Vector; bool HasPrefixInstrs; bool HasPCRelativeMemops; + bool HasMMA; bool HasFCPSGN; bool HasFSQRT; bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES; @@ -137,6 +141,7 @@ protected: bool HasHTM; bool HasFloat128; bool HasFusion; + bool HasStoreFusion; bool HasAddiLoadFusion; bool HasAddisLoadFusion; bool IsISA3_0; @@ -146,21 +151,25 @@ protected: bool VectorsUseTwoUnits; bool UsePPCPreRASchedStrategy; bool UsePPCPostRASchedStrategy; + bool PairedVectorMemops; bool PredictableSelectIsExpensive; + bool HasModernAIXAs; + bool IsAIX; POPCNTDKind HasPOPCNTD; - /// When targeting QPX running a stock PPC64 Linux kernel where the stack - /// alignment has not been changed, we need to keep the 16-byte alignment - /// of the stack. - bool IsQPXStackUnaligned; - const PPCTargetMachine &TM; PPCFrameLowering FrameLowering; PPCInstrInfo InstrInfo; PPCTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; + /// GlobalISel related APIs. + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<InstructionSelector> InstSelector; + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -170,16 +179,13 @@ public: /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. 
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); /// getStackAlignment - Returns the minimum alignment known to hold of the /// stack frame on entry to the function and which must be maintained by every /// function for this subtarget. Align getStackAlignment() const { return StackAlignment; } - /// getDarwinDirective - Returns the -m directive specified for the cpu. - unsigned getDarwinDirective() const { return CPUDirective; } - /// getCPUDirective - Returns the -m directive specified for the cpu. /// unsigned getCPUDirective() const { return CPUDirective; } @@ -254,8 +260,8 @@ public: bool hasFPCVT() const { return HasFPCVT; } bool hasAltivec() const { return HasAltivec; } bool hasSPE() const { return HasSPE; } + bool hasEFPU2() const { return HasEFPU2; } bool hasFPU() const { return HasFPU; } - bool hasQPX() const { return HasQPX; } bool hasVSX() const { return HasVSX; } bool needsTwoConstNR() const { return NeedsTwoConstNR; } bool hasP8Vector() const { return HasP8Vector; } @@ -266,6 +272,8 @@ public: bool hasP10Vector() const { return HasP10Vector; } bool hasPrefixInstrs() const { return HasPrefixInstrs; } bool hasPCRelativeMemops() const { return HasPCRelativeMemops; } + bool hasMMA() const { return HasMMA; } + bool pairedVectorMemops() const { return PairedVectorMemops; } bool hasMFOCRF() const { return HasMFOCRF; } bool hasISEL() const { return HasISEL; } bool hasBPERMD() const { return HasBPERMD; } @@ -291,11 +299,7 @@ public: bool hasPartwordAtomics() const { return HasPartwordAtomics; } bool hasDirectMove() const { return HasDirectMove; } - bool isQPXStackUnaligned() const { return IsQPXStackUnaligned; } Align getPlatformStackAlignment() const { - if ((hasQPX() || isBGQ()) && !isQPXStackUnaligned()) - return Align(32); - return Align(16); } @@ -315,6 +319,7 @@ public: bool isISA3_1() const { return IsISA3_1; } bool useLongCalls() const { return UseLongCalls; } bool 
hasFusion() const { return HasFusion; } + bool hasStoreFusion() const { return HasStoreFusion; } bool hasAddiLoadFusion() const { return HasAddiLoadFusion; } bool hasAddisLoadFusion() const { return HasAddisLoadFusion; } bool needsSwapsForVSXMemOps() const { @@ -325,9 +330,6 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } - /// isBGQ - True if this is a BG/Q platform. - bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } @@ -404,6 +406,12 @@ public: bool isPredictableSelectIsExpensive() const { return PredictableSelectIsExpensive; } + + // GlobalISEL + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; + const LegalizerInfo *getLegalizerInfo() const override; + InstructionSelector *getInstructionSelector() const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 4b809e0c8553..43dcc5844c4e 100644 --- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -50,16 +50,17 @@ protected: bool Changed = false; bool NeedFence = true; bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64(); + bool IsPCREL = false; for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end(); I != IE;) { MachineInstr &MI = *I; + IsPCREL = isPCREL(MI); if (MI.getOpcode() != PPC::ADDItlsgdLADDR && MI.getOpcode() != PPC::ADDItlsldLADDR && MI.getOpcode() != PPC::ADDItlsgdLADDR32 && - MI.getOpcode() != PPC::ADDItlsldLADDR32) { - + MI.getOpcode() != PPC::ADDItlsldLADDR32 && !IsPCREL) { // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP // as scheduling fences, we skip creating fences if we already // have existing 
ADJCALLSTACKDOWN/UP to avoid nesting, @@ -76,12 +77,16 @@ protected: LLVM_DEBUG(dbgs() << "TLS Dynamic Call Fixup:\n " << MI); Register OutReg = MI.getOperand(0).getReg(); - Register InReg = MI.getOperand(1).getReg(); - DebugLoc DL = MI.getDebugLoc(); + Register InReg = PPC::NoRegister; Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3; - unsigned Opc1, Opc2; - const Register OrigRegs[] = {OutReg, InReg, GPR3}; + SmallVector<Register, 3> OrigRegs = {OutReg, GPR3}; + if (!IsPCREL) { + InReg = MI.getOperand(1).getReg(); + OrigRegs.push_back(InReg); + } + DebugLoc DL = MI.getDebugLoc(); + unsigned Opc1, Opc2; switch (MI.getOpcode()) { default: llvm_unreachable("Opcode inconsistency error"); @@ -101,6 +106,13 @@ protected: Opc1 = PPC::ADDItlsldL32; Opc2 = PPC::GETtlsldADDR32; break; + case PPC::PADDI8pc: + assert(IsPCREL && "Expecting General/Local Dynamic PCRel"); + Opc1 = PPC::PADDI8pc; + Opc2 = MI.getOperand(2).getTargetFlags() == + PPCII::MO_GOT_TLSGD_PCREL_FLAG + ? PPC::GETtlsADDRPCREL + : PPC::GETtlsldADDRPCREL; } // We create ADJCALLSTACKUP and ADJCALLSTACKDOWN around _tls_get_addr @@ -113,9 +125,15 @@ protected: BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) .addImm(0); - // Expand into two ops built prior to the existing instruction. - MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) - .addReg(InReg); + MachineInstr *Addi; + if (IsPCREL) { + Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addImm(0); + } else { + // Expand into two ops built prior to the existing instruction. 
+ assert(InReg != PPC::NoRegister && "Operand must be a register"); + Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3).addReg(InReg); + } + Addi->addOperand(MI.getOperand(2)); // The ADDItls* instruction is the first instruction in the @@ -125,7 +143,10 @@ protected: MachineInstr *Call = (BuildMI(MBB, I, DL, TII->get(Opc2), GPR3) .addReg(GPR3)); - Call->addOperand(MI.getOperand(3)); + if (IsPCREL) + Call->addOperand(MI.getOperand(2)); + else + Call->addOperand(MI.getOperand(3)); if (NeedFence) BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); @@ -150,6 +171,14 @@ protected: } public: + bool isPCREL(const MachineInstr &MI) { + return (MI.getOpcode() == PPC::PADDI8pc) && + (MI.getOperand(2).getTargetFlags() == + PPCII::MO_GOT_TLSGD_PCREL_FLAG || + MI.getOperand(2).getTargetFlags() == + PPCII::MO_GOT_TLSLD_PCREL_FLAG); + } + bool runOnMachineFunction(MachineFunction &MF) override { TII = MF.getSubtarget<PPCSubtarget>().getInstrInfo(); LIS = &getAnalysis<LiveIntervals>(); diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index f15f9c7f4942..0634833e64dc 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -24,12 +24,18 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include 
"llvm/Support/CommandLine.h" @@ -64,10 +70,6 @@ opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); static cl:: -opt<bool> DisableQPXLoadSplat("disable-ppc-qpx-load-splat", cl::Hidden, - cl::desc("Disable QPX load splat simplification")); - -static cl:: opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, cl::desc("Disable machine peepholes for PPC")); @@ -98,8 +100,9 @@ static cl::opt<bool> extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target()); - RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target()); - RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget()); + RegisterTargetMachine<PPCTargetMachine> B(getThePPC32LETarget()); + RegisterTargetMachine<PPCTargetMachine> C(getThePPC64Target()); + RegisterTargetMachine<PPCTargetMachine> D(getThePPC64LETarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); #ifndef NDEBUG @@ -114,13 +117,13 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializePPCReduceCRLogicalsPass(PR); initializePPCBSelPass(PR); initializePPCBranchCoalescingPass(PR); - initializePPCQPXLoadSplatPass(PR); initializePPCBoolRetToIntPass(PR); initializePPCExpandISELPass(PR); initializePPCPreEmitPeepholePass(PR); initializePPCTLSDynamicCallPass(PR); initializePPCMIPeepholePass(PR); initializePPCLowerMASSVEntriesPass(PR); + initializeGlobalISel(PR); } /// Return the datalayout string of a subtarget. @@ -128,8 +131,8 @@ static std::string getDataLayoutString(const Triple &T) { bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le; std::string Ret; - // Most PPC* platforms are big endian, PPC64LE is little endian. - if (T.getArch() == Triple::ppc64le) + // Most PPC* platforms are big endian, PPC(64)LE is little endian. 
+ if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle) Ret = "e"; else Ret = "E"; @@ -143,10 +146,7 @@ static std::string getDataLayoutString(const Triple &T) { // Note, the alignment values for f64 and i64 on ppc64 in Darwin // documentation are wrong; these are correct (i.e. "what gcc does"). - if (is64Bit || !T.isOSDarwin()) - Ret += "-i64:64"; - else - Ret += "-f64:32:64"; + Ret += "-i64:64"; // PPC64 has 32 and 64 bit registers, PPC32 has only 32 bit ones. if (is64Bit) @@ -154,6 +154,13 @@ static std::string getDataLayoutString(const Triple &T) { else Ret += "-n32"; + // Specify the vector alignment explicitly. For v256i1 and v512i1, the + // calculated alignment would be 256*alignment(i1) and 512*alignment(i1), + // which is 256 and 512 bytes - way over aligned. + if ((T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppc64) && + (T.isOSAIX() || T.isOSLinux())) + Ret += "-v256:256:256-v512:512:512"; + return Ret; } @@ -183,13 +190,17 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, FullFS = "+invariant-function-descriptors"; } + if (TT.isOSAIX()) { + if (!FullFS.empty()) + FullFS = "+aix," + FullFS; + else + FullFS = "+aix"; + } + return FullFS; } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - if (TT.isOSDarwin()) - return std::make_unique<TargetLoweringObjectFileMachO>(); - if (TT.isOSAIX()) return std::make_unique<TargetLoweringObjectFileXCOFF>(); @@ -198,9 +209,6 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, const TargetOptions &Options) { - if (TT.isOSDarwin()) - report_fatal_error("Darwin is no longer supported for PowerPC"); - if (Options.MCOptions.getABIName().startswith("elfv1")) return PPCTargetMachine::PPC_ABI_ELFv1; else if (Options.MCOptions.getABIName().startswith("elfv2")) @@ -230,10 +238,6 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, if 
(RM.hasValue()) return *RM; - // Darwin defaults to dynamic-no-pic. - if (TT.isOSDarwin()) - return Reloc::DynamicNoPIC; - // Big Endian PPC and AIX default to PIC. if (TT.getArch() == Triple::ppc64 || TT.isOSAIX()) return Reloc::PIC_; @@ -276,6 +280,8 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) { std::make_unique<GenericScheduler>(C)); // add DAG Mutations here. DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); + if (ST.hasStoreFusion()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); @@ -290,6 +296,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler( std::make_unique<PPCPostRASchedStrategy>(C) : std::make_unique<PostGenericScheduler>(C), true); // add DAG Mutations here. + if (ST.hasStoreFusion()) + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createPowerPCMacroFusionDAGMutation()); return DAG; @@ -321,12 +329,10 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); - std::string CPU = !CPUAttr.hasAttribute(Attribute::None) - ? CPUAttr.getValueAsString().str() - : TargetCPU; - std::string FS = !FSAttr.hasAttribute(Attribute::None) - ? FSAttr.getValueAsString().str() - : TargetFS; + std::string CPU = + CPUAttr.isValid() ? CPUAttr.getValueAsString().str() : TargetCPU; + std::string FS = + FSAttr.isValid() ? 
FSAttr.getValueAsString().str() : TargetFS; // FIXME: This is related to the code below to reset the target options, // we need to know whether or not the soft float flag is set on the @@ -388,6 +394,12 @@ public: void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + // GlobalISEL + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; + bool addGlobalInstructionSelect() override; + ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { return createPPCMachineScheduler(C); @@ -411,14 +423,9 @@ void PPCPassConfig::addIRPasses() { // Lower generic MASSV routines to PowerPC subtarget-specific entries. addPass(createPPCLowerMASSVEntriesPass()); - - // For the BG/Q (or if explicitly requested), add explicit data prefetch - // intrinsics. - bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && - getOptLevel() != CodeGenOpt::None; + + // If explicitly requested, add explicit data prefetch intrinsics. if (EnablePrefetch.getNumOccurrences() > 0) - UsePrefetching = EnablePrefetch; - if (UsePrefetching) addPass(createLoopDataPrefetchPass()); if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) { @@ -515,15 +522,8 @@ void PPCPassConfig::addPreRegAlloc() { } void PPCPassConfig::addPreSched2() { - if (getOptLevel() != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) addPass(&IfConverterID); - - // This optimization must happen after anything that might do store-to-load - // forwarding. Here we're after RA (and, thus, when spills are inserted) - // but before post-RA scheduling. 
- if (!DisableQPXLoadSplat) - addPass(createPPCQPXLoadSplatPass()); - } } void PPCPassConfig::addPreEmitPass() { @@ -550,3 +550,24 @@ static MachineSchedRegistry PPCPostRASchedRegistry("ppc-postra", "Run PowerPC PostRA specific scheduler", createPPCPostMachineScheduler); + +// Global ISEL +bool PPCPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool PPCPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); + return false; +} + +bool PPCPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} + +bool PPCPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); + return false; +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/llvm/lib/Target/PowerPC/PPCTargetMachine.h index fd1d14ae32d4..21faa4e710e3 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.h +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -58,6 +58,11 @@ public: const Triple &TT = getTargetTriple(); return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le); }; + + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. 
+ return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 53556ffc267d..b3d8100fe016 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -8,13 +8,19 @@ #include "PPCTargetTransformInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "ppctti" @@ -22,8 +28,7 @@ using namespace llvm; static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden); -// This is currently only used for the data prefetch pass which is only enabled -// for BG/Q by default. +// This is currently only used for the data prefetch pass static cl::opt<unsigned> CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64), cl::desc("The loop prefetch cache line size")); @@ -59,6 +64,109 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +Optional<Instruction *> +PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + Intrinsic::ID IID = II.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + // Turn PPC lvx -> load if the pointer is known aligned. 
+ if (getOrEnforceKnownAlignment( + II.getArgOperand(0), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Value *Ptr = IC.Builder.CreateBitCast( + II.getArgOperand(0), PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, "", false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: { + // Turn PPC VSX loads into normal loads. + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(0), + PointerType::getUnqual(II.getType())); + return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1)); + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + // Turn stvx -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment( + II.getArgOperand(1), Align(16), IC.getDataLayout(), &II, + &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) { + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16)); + } + break; + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x: { + // Turn PPC VSX stores into normal stores. + Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType()); + Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy); + return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1)); + } + case Intrinsic::ppc_altivec_vperm: + // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + // Note that ppc_altivec_vperm has a big-endian bias, so when creating + // a vectorshuffle for little endian, we must undo the transformation + // performed on vec_perm in altivec.h. That is, we must complement + // the permutation mask with respect to 31 and reverse the order of + // V1 and V2. 
+ if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) { + assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 && + "Bad type for intrinsic!"); + + // Check that all of the elements are integer constants or undefs. + bool AllEltsOk = true; + for (unsigned i = 0; i != 16; ++i) { + Constant *Elt = Mask->getAggregateElement(i); + if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) { + AllEltsOk = false; + break; + } + } + + if (AllEltsOk) { + // Cast the input vectors to byte vectors. + Value *Op0 = + IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType()); + Value *Op1 = + IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType()); + Value *Result = UndefValue::get(Op0->getType()); + + // Only extract each element once. + Value *ExtractedElts[32]; + memset(ExtractedElts, 0, sizeof(ExtractedElts)); + + for (unsigned i = 0; i != 16; ++i) { + if (isa<UndefValue>(Mask->getAggregateElement(i))) + continue; + unsigned Idx = + cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue(); + Idx &= 31; // Match the hardware behavior. + if (DL.isLittleEndian()) + Idx = 31 - Idx; + + if (!ExtractedElts[Idx]) { + Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0; + Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1; + ExtractedElts[Idx] = IC.Builder.CreateExtractElement( + Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15)); + } + + // Insert this value into the result vector. 
+ Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx], + IC.Builder.getInt32(i)); + } + return CastInst::Create(Instruction::BitCast, Result, II.getType()); + } + } + break; + } + return None; +} + int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { if (DisablePPCConstHoist) @@ -126,9 +234,10 @@ int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { + TTI::TargetCostKind CostKind, + Instruction *Inst) { if (DisablePPCConstHoist) - return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind); + return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst); assert(Ty->isIntegerTy()); @@ -226,6 +335,29 @@ PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands, return BaseT::getUserCost(U, Operands, CostKind); } +// Determining the address of a TLS variable results in a function call in +// certain TLS models. +static bool memAddrUsesCTR(const Value *MemAddr, const PPCTargetMachine &TM, + SmallPtrSetImpl<const Value *> &Visited) { + // No need to traverse again if we already checked this operand. + if (!Visited.insert(MemAddr).second) + return false; + const auto *GV = dyn_cast<GlobalValue>(MemAddr); + if (!GV) { + // Recurse to check for constants that refer to TLS global variables. 
+ if (const auto *CV = dyn_cast<Constant>(MemAddr)) + for (const auto &CO : CV->operands()) + if (memAddrUsesCTR(CO, TM, Visited)) + return true; + return false; + } + + if (!GV->isThreadLocal()) + return false; + TLSModel::Model Model = TM.getTLSModel(GV); + return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; +} + bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, SmallPtrSetImpl<const Value *> &Visited) { const PPCTargetMachine &TM = ST->getTargetMachine(); @@ -244,31 +376,6 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, return false; }; - // Determining the address of a TLS variable results in a function call in - // certain TLS models. - std::function<bool(const Value *)> memAddrUsesCTR = - [&memAddrUsesCTR, &TM, &Visited](const Value *MemAddr) -> bool { - // No need to traverse again if we already checked this operand. - if (!Visited.insert(MemAddr).second) - return false; - const auto *GV = dyn_cast<GlobalValue>(MemAddr); - if (!GV) { - // Recurse to check for constants that refer to TLS global variables. - if (const auto *CV = dyn_cast<Constant>(MemAddr)) - for (const auto &CO : CV->operands()) - if (memAddrUsesCTR(CO)) - return true; - - return false; - } - - if (!GV->isThreadLocal()) - return false; - TLSModel::Model Model = TM.getTLSModel(GV); - return Model == TLSModel::GeneralDynamic || - Model == TLSModel::LocalDynamic; - }; - auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) { if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) return ITy->getBitWidth() > (Is32Bit ? 
32U : 64U); @@ -276,8 +383,34 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, return false; }; + auto supportedHalfPrecisionOp = [](Instruction *Inst) { + switch (Inst->getOpcode()) { + default: + return false; + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::Load: + case Instruction::Store: + case Instruction::FPToUI: + case Instruction::UIToFP: + case Instruction::FPToSI: + case Instruction::SIToFP: + return true; + } + }; + for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) { + // There are no direct operations on half precision so assume that + // anything with that type requires a call except for a few select + // operations with Power9. + if (Instruction *CurrInst = dyn_cast<Instruction>(J)) { + for (const auto &Op : CurrInst->operands()) { + if (Op->getType()->getScalarType()->isHalfTy() || + CurrInst->getType()->getScalarType()->isHalfTy()) + return !(ST->isISA3_0() && supportedHalfPrecisionOp(CurrInst)); + } + } if (CallInst *CI = dyn_cast<CallInst>(J)) { // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) { @@ -299,6 +432,30 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, case Intrinsic::loop_decrement: return true; + // Binary operations on 128-bit value will use CTR. 
+ case Intrinsic::experimental_constrained_fadd: + case Intrinsic::experimental_constrained_fsub: + case Intrinsic::experimental_constrained_fmul: + case Intrinsic::experimental_constrained_fdiv: + case Intrinsic::experimental_constrained_frem: + if (F->getType()->getScalarType()->isFP128Ty() || + F->getType()->getScalarType()->isPPC_FP128Ty()) + return true; + break; + + case Intrinsic::experimental_constrained_fptosi: + case Intrinsic::experimental_constrained_fptoui: + case Intrinsic::experimental_constrained_sitofp: + case Intrinsic::experimental_constrained_uitofp: { + Type *SrcType = CI->getArgOperand(0)->getType()->getScalarType(); + Type *DstType = CI->getType()->getScalarType(); + if (SrcType->isPPC_FP128Ty() || DstType->isPPC_FP128Ty() || + isLargeIntegerTy(!TM.isPPC64(), SrcType) || + isLargeIntegerTy(!TM.isPPC64(), DstType)) + return true; + break; + } + // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp // because, although it does clobber the counter register, the // control can't then return to inside the loop unless there is also @@ -317,6 +474,15 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, case Intrinsic::pow: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: return true; case Intrinsic::copysign: if (CI->getArgOperand(0)->getType()->getScalarType()-> @@ -338,6 +504,54 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, case Intrinsic::llround: Opcode = ISD::LLROUND; break; case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; case Intrinsic::maxnum: 
Opcode = ISD::FMAXNUM; break; + case Intrinsic::experimental_constrained_fcmp: + Opcode = ISD::STRICT_FSETCC; + break; + case Intrinsic::experimental_constrained_fcmps: + Opcode = ISD::STRICT_FSETCCS; + break; + case Intrinsic::experimental_constrained_fma: + Opcode = ISD::STRICT_FMA; + break; + case Intrinsic::experimental_constrained_sqrt: + Opcode = ISD::STRICT_FSQRT; + break; + case Intrinsic::experimental_constrained_floor: + Opcode = ISD::STRICT_FFLOOR; + break; + case Intrinsic::experimental_constrained_ceil: + Opcode = ISD::STRICT_FCEIL; + break; + case Intrinsic::experimental_constrained_trunc: + Opcode = ISD::STRICT_FTRUNC; + break; + case Intrinsic::experimental_constrained_rint: + Opcode = ISD::STRICT_FRINT; + break; + case Intrinsic::experimental_constrained_lrint: + Opcode = ISD::STRICT_LRINT; + break; + case Intrinsic::experimental_constrained_llrint: + Opcode = ISD::STRICT_LLRINT; + break; + case Intrinsic::experimental_constrained_nearbyint: + Opcode = ISD::STRICT_FNEARBYINT; + break; + case Intrinsic::experimental_constrained_round: + Opcode = ISD::STRICT_FROUND; + break; + case Intrinsic::experimental_constrained_lround: + Opcode = ISD::STRICT_LROUND; + break; + case Intrinsic::experimental_constrained_llround: + Opcode = ISD::STRICT_LLROUND; + break; + case Intrinsic::experimental_constrained_minnum: + Opcode = ISD::STRICT_FMINNUM; + break; + case Intrinsic::experimental_constrained_maxnum: + Opcode = ISD::STRICT_FMAXNUM; + break; case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; } @@ -486,7 +700,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo, } for (Value *Operand : J->operands()) - if (memAddrUsesCTR(Operand)) + if (memAddrUsesCTR(Operand, TM, Visited)) return true; } @@ -546,6 +760,24 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, } } + // If an exit block has a PHI that accesses a TLS variable as one of the 
+ // incoming values from the loop, we cannot produce a CTR loop because the + // address for that value will be computed in the loop. + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getExitBlocks(ExitBlocks); + for (auto &BB : ExitBlocks) { + for (auto &PHI : BB->phis()) { + for (int Idx = 0, EndIdx = PHI.getNumIncomingValues(); Idx < EndIdx; + Idx++) { + const BasicBlock *IncomingBB = PHI.getIncomingBlock(Idx); + const Value *IncomingValue = PHI.getIncomingValue(Idx); + if (L->contains(IncomingBB) && + memAddrUsesCTR(IncomingValue, TM, Visited)) + return false; + } + } + } + LLVMContext &C = L->getHeader()->getContext(); HWLoopInfo.CountType = TM.isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); @@ -581,10 +813,7 @@ bool PPCTTIImpl::useColdCCForColdCall(Function &F) { } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { - // On the A2, always unroll aggressively. For QPX unaligned loads, we depend - // on combining the loads generated for consecutive accesses, and failure to - // do so is particularly expensive. This makes it much more likely (compared - // to only using concatenation unrolling). + // On the A2, always unroll aggressively. if (ST->getCPUDirective() == PPC::DIR_A2) return true; @@ -644,7 +873,6 @@ const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const { if (Vector) { - if (ST->hasQPX()) return 256; if (ST->hasAltivec()) return 128; return 0; } @@ -673,8 +901,6 @@ unsigned PPCTTIImpl::getCacheLineSize() const { } unsigned PPCTTIImpl::getPrefetchDistance() const { - // This seems like a reasonable default for the BG/Q (this pass is enabled, by - // default, only on the BG/Q). return 300; } @@ -763,7 +989,7 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec/VSX, support cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). We need one such shuffle instruction for each actual // register (this is not true for arbitrary shuffles, but is true for the @@ -780,11 +1006,12 @@ int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { } int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); - int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I); + int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src); // TODO: Allow non-throughput costs that aren't binary. if (CostKind != TTI::TCK_RecipThroughput) @@ -793,9 +1020,11 @@ int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) { - int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); + int Cost = + BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return Cost; @@ -819,13 +1048,6 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; - } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) { - // Floating point scalars are already located in index #0. 
- if (Index == 0) - return 0; - - return Cost; - } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) { if (ST->hasP9Altivec()) { if (ISD == ISD::INSERT_VECTOR_ELT) @@ -849,7 +1071,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { // The cost of the load constant for a vector extract is disregarded // (invariant, easily schedulable). return vectorCostAdjustment(1, Opcode, Val, nullptr); - + } else if (ST->hasDirectMove()) // Assume permute has standard cost. // Assume move-to/move-from VSR have 2x standard cost. @@ -900,8 +1122,6 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, LT.second == MVT::v4i32 || LT.second == MVT::v4f32); bool IsVSXType = ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); - bool IsQPXType = ST->hasQPX() && - (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and @@ -924,8 +1144,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // for Altivec types using the VSX instructions, but that's more expensive // than using the permutation-based load sequence. On the P8, that's no // longer true. - if (Opcode == Instruction::Load && - ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) && *Alignment >= LT.second.getScalarType().getStoreSize()) return Cost + LT.first; // Add the cost of the permutations. @@ -978,7 +1197,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost( getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); - // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // PPC, for both Altivec/VSX, support cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant // instruction). 
For each result vector, we need one shuffle per incoming // vector (except that the first shuffle can take two incoming vectors @@ -1028,3 +1247,51 @@ bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, else return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } + +bool PPCTTIImpl::isNumRegsMajorCostOfLSR() { + return false; +} + +bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) { + switch (Inst->getIntrinsicID()) { + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + case Intrinsic::ppc_altivec_lvebx: + case Intrinsic::ppc_altivec_lvehx: + case Intrinsic::ppc_altivec_lvewx: + case Intrinsic::ppc_vsx_lxvd2x: + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x_be: + case Intrinsic::ppc_vsx_lxvw4x_be: + case Intrinsic::ppc_vsx_lxvl: + case Intrinsic::ppc_vsx_lxvll: + case Intrinsic::ppc_vsx_lxvp: { + Info.PtrVal = Inst->getArgOperand(0); + Info.ReadMem = true; + Info.WriteMem = false; + return true; + } + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + case Intrinsic::ppc_altivec_stvebx: + case Intrinsic::ppc_altivec_stvehx: + case Intrinsic::ppc_altivec_stvewx: + case Intrinsic::ppc_vsx_stxvd2x: + case Intrinsic::ppc_vsx_stxvw4x: + case Intrinsic::ppc_vsx_stxvd2x_be: + case Intrinsic::ppc_vsx_stxvw4x_be: + case Intrinsic::ppc_vsx_stxvl: + case Intrinsic::ppc_vsx_stxvll: + case Intrinsic::ppc_vsx_stxvp: { + Info.PtrVal = Inst->getArgOperand(1); + Info.ReadMem = false; + Info.WriteMem = true; + return true; + } + default: + break; + } + + return false; +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index d998521084e1..bc946715156f 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -41,6 +41,9 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + 
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, + IntrinsicInst &II) const; + /// \name Scalar TTI Implementations /// @{ @@ -49,7 +52,8 @@ public: TTI::TargetCostKind CostKind); int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty, TTI::TargetCostKind CostKind); + Type *Ty, TTI::TargetCostKind CostKind, + Instruction *Inst = nullptr); int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); @@ -64,12 +68,14 @@ public: bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo); + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); + bool isNumRegsMajorCostOfLSR(); /// @} @@ -103,10 +109,11 @@ public: const Instruction *CxtI = nullptr); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, - TTI::TargetCostKind CostKind, + TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 3e6d1c7939f1..e72e29112da7 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -315,9 +315,9 @@ protected: // Extend the live interval of the addend source (it 
might end at the // copy to be removed, or somewhere in between there and here). This // is necessary only if it is a physical register. - if (!Register::isVirtualRegister(AddendSrcReg)) - for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); - ++Units) { + if (!AddendSrcReg.isVirtual()) + for (MCRegUnitIterator Units(AddendSrcReg.asMCReg(), TRI); + Units.isValid(); ++Units) { unsigned Unit = *Units; LiveRange &AddendSrcRange = LIS->getRegUnit(Unit); diff --git a/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index c3729da0b07b..ff251f55afff 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -254,10 +254,11 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { if (!MO.isReg()) continue; Register Reg = MO.getReg(); - if (isAnyVecReg(Reg, Partial)) { + // All operands need to be checked because there are instructions that + // operate on a partial register and produce a full register (such as + // XXPERMDIs). + if (isAnyVecReg(Reg, Partial)) RelevantInstr = true; - break; - } } if (!RelevantInstr) @@ -689,6 +690,29 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { LLVM_DEBUG(UseMI.dump()); LLVM_DEBUG(dbgs() << "\n"); } + + // It is possible that the load feeds a swap and that swap feeds a + // store. In such a case, the code is actually trying to store a swapped + // vector. We must reject such webs. 
+ if (SwapVector[UseIdx].IsSwap && !SwapVector[UseIdx].IsLoad && + !SwapVector[UseIdx].IsStore) { + Register SwapDefReg = UseMI.getOperand(0).getReg(); + for (MachineInstr &UseOfUseMI : + MRI->use_nodbg_instructions(SwapDefReg)) { + int UseOfUseIdx = SwapMap[&UseOfUseMI]; + if (SwapVector[UseOfUseIdx].IsStore) { + SwapVector[Repr].WebRejected = 1; + LLVM_DEBUG( + dbgs() << format( + "Web %d rejected for load/swap feeding a store\n", Repr)); + LLVM_DEBUG(dbgs() << " def " << EntryIdx << ": "); + LLVM_DEBUG(MI->dump()); + LLVM_DEBUG(dbgs() << " use " << UseIdx << ": "); + LLVM_DEBUG(UseMI.dump()); + LLVM_DEBUG(dbgs() << "\n"); + } + } + } } // Reject webs that contain swapping stores that are fed by something diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp index 649bd648a6cf..6bb952f27fee 100644 --- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp +++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -14,6 +14,10 @@ Target &llvm::getThePPC32Target() { static Target ThePPC32Target; return ThePPC32Target; } +Target &llvm::getThePPC32LETarget() { + static Target ThePPC32LETarget; + return ThePPC32LETarget; +} Target &llvm::getThePPC64Target() { static Target ThePPC64Target; return ThePPC64Target; @@ -24,9 +28,12 @@ Target &llvm::getThePPC64LETarget() { } extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() { - RegisterTarget<Triple::ppc, /*HasJIT=*/true> X(getThePPC32Target(), "ppc32", + RegisterTarget<Triple::ppc, /*HasJIT=*/true> W(getThePPC32Target(), "ppc32", "PowerPC 32", "PPC"); + RegisterTarget<Triple::ppcle, /*HasJIT=*/true> X( + getThePPC32LETarget(), "ppc32le", "PowerPC 32 LE", "PPC"); + RegisterTarget<Triple::ppc64, /*HasJIT=*/true> Y(getThePPC64Target(), "ppc64", "PowerPC 64", "PPC"); diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h index 
2d0afbfb1be0..f9d20ef00df8 100644 --- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h +++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.h @@ -14,6 +14,7 @@ namespace llvm { class Target; Target &getThePPC32Target(); +Target &getThePPC32LETarget(); Target &getThePPC64Target(); Target &getThePPC64LETarget(); |