From 706b4fc47bbc608932d3b491ae19a3b9cde9497b Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 17 Jan 2020 20:45:01 +0000 Subject: Vendor import of llvm-project master e26a78e70, the last commit before the llvmorg-11-init tag, from which release/10.x was branched. --- llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp | 90 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 6 +- .../Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp | 8 +- .../Target/PowerPC/MCTargetDesc/PPCInstPrinter.h | 6 +- .../Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 28 +- .../lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h | 7 - .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 23 +- .../Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 24 + llvm/lib/Target/PowerPC/P9InstrResources.td | 229 ++-- llvm/lib/Target/PowerPC/PPC.h | 11 +- llvm/lib/Target/PowerPC/PPC.td | 175 ++- llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp | 356 +++-- llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp | 1 + llvm/lib/Target/PowerPC/PPCCTRLoops.cpp | 4 +- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 7 +- llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp | 4 +- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 401 +++--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 1357 +++++++++++++------- llvm/lib/Target/PowerPC/PPCISelLowering.h | 936 +++++++------- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 78 +- llvm/lib/Target/PowerPC/PPCInstrAltivec.td | 121 +- llvm/lib/Target/PowerPC/PPCInstrFormats.td | 61 +- llvm/lib/Target/PowerPC/PPCInstrHTM.td | 16 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 549 +++++--- llvm/lib/Target/PowerPC/PPCInstrInfo.h | 14 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 383 +++--- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 190 ++- llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 894 +++++++++++++ llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp | 605 --------- llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp | 164 +++ llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 390 ++++-- llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h | 2 +- llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 25 +- llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp | 13 +- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 35 +- llvm/lib/Target/PowerPC/PPCRegisterInfo.h | 9 + llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 7 +- llvm/lib/Target/PowerPC/PPCSubtarget.h | 45 +- llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 1 + llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 20 +- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 96 +- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h | 17 +- llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp | 1 + .../PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 2 +- 44 files changed, 4573 insertions(+), 2838 deletions(-) create mode 100644 llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp delete mode 100644 llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp create mode 100644 llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index aedf5b713c3f..7e7902c27a81 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -800,9 +800,9 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, Inst = TmpInst; break; } - case PPC::SUBICo: { + case PPC::SUBIC_rec: { MCInst TmpInst; - TmpInst.setOpcode(PPC::ADDICo); + TmpInst.setOpcode(PPC::ADDIC_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); addNegOperand(TmpInst,
Inst.getOperand(2), getContext()); @@ -810,11 +810,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::EXTLWI: - case PPC::EXTLWIo: { + case PPC::EXTLWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::EXTLWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(B)); @@ -824,11 +824,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::EXTRWI: - case PPC::EXTRWIo: { + case PPC::EXTRWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::EXTRWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(B + N)); @@ -838,11 +838,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::INSLWI: - case PPC::INSLWIo: { + case PPC::INSLWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo); + TmpInst.setOpcode(Opcode == PPC::INSLWI ? PPC::RLWIMI : PPC::RLWIMI_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); @@ -853,11 +853,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::INSRWI: - case PPC::INSRWIo: { + case PPC::INSRWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo); + TmpInst.setOpcode(Opcode == PPC::INSRWI ? PPC::RLWIMI : PPC::RLWIMI_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); @@ -868,10 +868,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::ROTRWI: - case PPC::ROTRWIo: { + case PPC::ROTRWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::ROTRWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(32 - N)); @@ -881,10 +881,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::SLWI: - case PPC::SLWIo: { + case PPC::SLWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::SLWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(N)); @@ -894,10 +894,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::SRWI: - case PPC::SRWIo: { + case PPC::SRWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::SRWI ? 
PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(32 - N)); @@ -907,10 +907,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::CLRRWI: - case PPC::CLRRWIo: { + case PPC::CLRRWI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::CLRRWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::CLRRWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(0)); @@ -920,11 +920,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::CLRLSLWI: - case PPC::CLRLSLWIo: { + case PPC::CLRLSLWI_rec: { MCInst TmpInst; int64_t B = Inst.getOperand(2).getImm(); int64_t N = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::CLRLSLWI ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(N)); @@ -934,11 +934,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::EXTLDI: - case PPC::EXTLDIo: { + case PPC::EXTLDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo); + TmpInst.setOpcode(Opcode == PPC::EXTLDI ? PPC::RLDICR : PPC::RLDICR_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(B)); @@ -947,11 +947,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::EXTRDI: - case PPC::EXTRDIo: { + case PPC::EXTRDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.setOpcode(Opcode == PPC::EXTRDI ? PPC::RLDICL : PPC::RLDICL_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(B + N)); @@ -960,11 +960,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::INSRDI: - case PPC::INSRDIo: { + case PPC::INSRDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); int64_t B = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo); + TmpInst.setOpcode(Opcode == PPC::INSRDI ? PPC::RLDIMI : PPC::RLDIMI_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); @@ -974,10 +974,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::ROTRDI: - case PPC::ROTRDIo: { + case PPC::ROTRDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.setOpcode(Opcode == PPC::ROTRDI ? PPC::RLDICL : PPC::RLDICL_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(64 - N)); @@ -986,10 +986,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::SLDI: - case PPC::SLDIo: { + case PPC::SLDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo); + TmpInst.setOpcode(Opcode == PPC::SLDI ? 
PPC::RLDICR : PPC::RLDICR_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(N)); @@ -1007,10 +1007,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::SRDI: - case PPC::SRDIo: { + case PPC::SRDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo); + TmpInst.setOpcode(Opcode == PPC::SRDI ? PPC::RLDICL : PPC::RLDICL_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(64 - N)); @@ -1019,10 +1019,10 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::CLRRDI: - case PPC::CLRRDIo: { + case PPC::CLRRDI_rec: { MCInst TmpInst; int64_t N = Inst.getOperand(2).getImm(); - TmpInst.setOpcode(Opcode == PPC::CLRRDI? PPC::RLDICR : PPC::RLDICRo); + TmpInst.setOpcode(Opcode == PPC::CLRRDI ? PPC::RLDICR : PPC::RLDICR_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(0)); @@ -1031,11 +1031,11 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::CLRLSLDI: - case PPC::CLRLSLDIo: { + case PPC::CLRLSLDI_rec: { MCInst TmpInst; int64_t B = Inst.getOperand(2).getImm(); int64_t N = Inst.getOperand(3).getImm(); - TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo); + TmpInst.setOpcode(Opcode == PPC::CLRLSLDI ? PPC::RLDIC : PPC::RLDIC_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(N)); @@ -1044,14 +1044,14 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::RLWINMbm: - case PPC::RLWINMobm: { + case PPC::RLWINMbm_rec: { unsigned MB, ME; int64_t BM = Inst.getOperand(3).getImm(); if (!isRunOfOnes(BM, MB, ME)) break; MCInst TmpInst; - TmpInst.setOpcode(Opcode == PPC::RLWINMbm ? PPC::RLWINM : PPC::RLWINMo); + TmpInst.setOpcode(Opcode == PPC::RLWINMbm ? PPC::RLWINM : PPC::RLWINM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(Inst.getOperand(2)); @@ -1061,14 +1061,14 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::RLWIMIbm: - case PPC::RLWIMIobm: { + case PPC::RLWIMIbm_rec: { unsigned MB, ME; int64_t BM = Inst.getOperand(3).getImm(); if (!isRunOfOnes(BM, MB, ME)) break; MCInst TmpInst; - TmpInst.setOpcode(Opcode == PPC::RLWIMIbm ? PPC::RLWIMI : PPC::RLWIMIo); + TmpInst.setOpcode(Opcode == PPC::RLWIMIbm ? PPC::RLWIMI : PPC::RLWIMI_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(0)); // The tied operand. TmpInst.addOperand(Inst.getOperand(1)); @@ -1079,14 +1079,14 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::RLWNMbm: - case PPC::RLWNMobm: { + case PPC::RLWNMbm_rec: { unsigned MB, ME; int64_t BM = Inst.getOperand(3).getImm(); if (!isRunOfOnes(BM, MB, ME)) break; MCInst TmpInst; - TmpInst.setOpcode(Opcode == PPC::RLWNMbm ? PPC::RLWNM : PPC::RLWNMo); + TmpInst.setOpcode(Opcode == PPC::RLWNMbm ? PPC::RLWNM : PPC::RLWNM_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(Inst.getOperand(2)); @@ -1116,8 +1116,8 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, case PPC::CP_PASTEx : case PPC::CP_PASTE_LAST: { MCInst TmpInst; - TmpInst.setOpcode(Opcode == PPC::CP_PASTEx ? - PPC::CP_PASTE : PPC::CP_PASTEo); + TmpInst.setOpcode(Opcode == PPC::CP_PASTEx ? 
PPC::CP_PASTE + : PPC::CP_PASTE_rec); TmpInst.addOperand(Inst.getOperand(0)); TmpInst.addOperand(Inst.getOperand(1)); TmpInst.addOperand(MCOperand::createImm(Opcode == PPC::CP_PASTEx ? 0 : 1)); @@ -1786,7 +1786,7 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) { /// Force static initialization. -extern "C" void LLVMInitializePowerPCAsmParser() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser A(getThePPC32Target()); RegisterMCAsmParser B(getThePPC64Target()); RegisterMCAsmParser C(getThePPC64LETarget()); diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 3597fd15eeb1..e3c0f958c7ed 100644 --- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -34,7 +34,6 @@ public: DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, - raw_ostream &VStream, raw_ostream &CStream) const override; }; } // end anonymous namespace @@ -51,7 +50,7 @@ static MCDisassembler *createPPCLEDisassembler(const Target &T, return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true); } -extern "C" void LLVMInitializePowerPCDisassembler() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() { // Register the disassembler for each target. TargetRegistry::RegisterMCDisassembler(getThePPC32Target(), createPPCDisassembler); @@ -323,7 +322,7 @@ static DecodeStatus decodeCRBitMOperand(MCInst &Inst, uint64_t Imm, DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes, - uint64_t Address, raw_ostream &OS, + uint64_t Address, raw_ostream &CS) const { // Get the four bytes of the instruction. Size = 4; @@ -350,4 +349,3 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI); } - diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 7fc231618fa9..9cc1c539e24a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -64,8 +64,9 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { OS << RegName; } -void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot, const MCSubtargetInfo &STI) { +void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address, + StringRef Annot, const MCSubtargetInfo &STI, + raw_ostream &O) { // Customize printing of the addis instruction on AIX. 
When an operand is a // symbol reference, the instruction syntax is changed to look like a load // operation, i.e: @@ -193,11 +194,10 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, } if (!printAliasInstr(MI, O)) - printInstruction(MI, O); + printInstruction(MI, Address, O); printAnnotation(O, Annot); } - void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O, const char *Modifier) { diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h index 725ae2a7081b..a3ec41aa348d 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h @@ -32,11 +32,11 @@ public: : MCInstPrinter(MAI, MII, MRI), TT(T) {} void printRegName(raw_ostream &OS, unsigned RegNo) const override; - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; + void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, + const MCSubtargetInfo &STI, raw_ostream &O) override; // Autogenerated by tblgen. - void printInstruction(const MCInst *MI, raw_ostream &O); + void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); bool printAliasInstr(const MCInst *MI, raw_ostream &OS); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 1216cd727289..dc2c216a3efd 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -15,33 +15,6 @@ using namespace llvm; -void PPCMCAsmInfoDarwin::anchor() { } - -PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) { - if (is64Bit) { - CodePointerSize = CalleeSaveStackSlotSize = 8; - } - IsLittleEndian = false; - - SeparatorString = "@"; - CommentString = ";"; - ExceptionsType = ExceptionHandling::DwarfCFI; - - if (!is64Bit) - Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode. - - AssemblerDialect = 1; // New-Style mnemonics. - SupportsDebugInformation= true; // Debug information. - - // The installed assembler for OSX < 10.6 lacks some directives. - // FIXME: this should really be a check on the assembler characteristics - // rather than OS version - if (T.isMacOSX() && T.isMacOSXVersionLT(10, 6)) - HasWeakDefCanBeHiddenDirective = false; - - UseIntegratedAssembler = true; -} - void PPCELFMCAsmInfo::anchor() { } PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) { @@ -87,4 +60,5 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { assert(!IsLittleEndian && "Little-endian XCOFF not supported."); CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 
8 : 4; ZeroDirective = "\t.space\t"; + SymbolsHaveSMC = true; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 42cb62ad26a4..8c52bbbd8a56 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -20,13 +20,6 @@ namespace llvm { class Triple; -class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - -public: - explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); -}; - class PPCELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 90c3c8d20edb..cbfb8e2ff282 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" @@ -76,14 +77,13 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(const Triple &TT, } static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, - const Triple &TheTriple) { + const Triple &TheTriple, + const MCTargetOptions &Options) { bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 || TheTriple.getArch() == Triple::ppc64le); MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) - MAI = new PPCMCAsmInfoDarwin(isPPC64, TheTriple); - else if (TheTriple.isOSBinFormatXCOFF()) + if (TheTriple.isOSBinFormatXCOFF()) MAI = new PPCXCOFFMCAsmInfo(isPPC64, TheTriple); else MAI = new PPCELFMCAsmInfo(isPPC64, TheTriple); @@ -107,8 +107,11 @@ public: : PPCTargetStreamer(S), OS(OS) {} void emitTCEntry(const MCSymbol &S) override { + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); OS << "\t.tc "; - OS << S.getName(); + OS << (MAI->getSymbolsHaveSMC() + ? cast(S).getUnqualifiedName() + : S.getName()); OS << "[TC],"; OS << S.getName(); OS << '\n'; @@ -196,7 +199,8 @@ public: void finish() override { for (auto *Sym : UpdateOther) - copyLocalEntry(Sym, Sym->getVariableValue()); + if (Sym->isVariable()) + copyLocalEntry(Sym, Sym->getVariableValue()); } private: @@ -242,7 +246,10 @@ public: PPCTargetXCOFFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} void emitTCEntry(const MCSymbol &S) override { - report_fatal_error("TOC entries not supported yet."); + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); + const unsigned PointerSize = MAI->getCodePointerSize(); + Streamer.EmitValueToAlignment(PointerSize); + Streamer.EmitSymbolValue(&S, PointerSize); } void emitMachine(StringRef CPU) override { @@ -285,7 +292,7 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, return new PPCInstPrinter(MAI, MII, MRI, T); } -extern "C" void LLVMInitializePowerPCTargetMC() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() { for (Target *T : {&getThePPC32Target(), &getThePPC64Target(), &getThePPC64LETarget()}) { // Register the MC asm info. 
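As context for the PPCAsmParser.cpp hunks above: each case rewrites an extended mnemonic (extlwi, extrwi, slwi, srwi, clrrwi, and their 64-bit counterparts) into a rotate-and-mask instruction, and the renamed `_rec` opcodes are simply the record forms (Rc=1) of the same rewrites. Below is a minimal standalone sketch of the operand arithmetic two of those cases perform, following the Power ISA extended-mnemonic definitions; the function names are illustrative, and the real code builds MCInst operands rather than printing:

```cpp
#include <cstdint>
#include <cstdio>

struct RLWINMOperands {
  int64_t SH, MB, ME; // rotate amount, mask begin, mask end
};

// Mirrors the EXTRWI/EXTRWI_rec case above:
//   extrwi rD,rS,n,b -> rlwinm rD,rS,b+n,32-n,31
static RLWINMOperands expandEXTRWI(int64_t N, int64_t B) {
  return {B + N, 32 - N, 31};
}

// Mirrors the SLWI/SLWI_rec case above:
//   slwi rD,rS,n -> rlwinm rD,rS,n,0,31-n
static RLWINMOperands expandSLWI(int64_t N) {
  return {N, 0, 31 - N};
}

int main() {
  RLWINMOperands E = expandEXTRWI(8, 4); // extrwi rD,rS,8,4
  RLWINMOperands S = expandSLWI(2);      // slwi rD,rS,2
  std::printf("extrwi 8,4 -> rlwinm SH=%lld MB=%lld ME=%lld\n",
              (long long)E.SH, (long long)E.MB, (long long)E.ME);
  std::printf("slwi 2     -> rlwinm SH=%lld MB=%lld ME=%lld\n",
              (long long)S.SH, (long long)S.MB, (long long)S.ME);
}
```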
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 74b67bd2e928..49443679bb31 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -82,6 +82,30 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) { return false; } +static inline bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) { + if (!Val) + return false; + + if (isShiftedMask_64(Val)) { + // look for the first non-zero bit + MB = countLeadingZeros(Val); + // look for the first zero bit after the run of ones + ME = countLeadingZeros((Val - 1) ^ Val); + return true; + } else { + Val = ~Val; // invert mask + if (isShiftedMask_64(Val)) { + // effectively look for the first zero bit + ME = countLeadingZeros(Val) - 1; + // effectively look for the first one bit after the run of zeros + MB = countLeadingZeros((Val - 1) ^ Val) + 1; + return true; + } + } + // no run present + return false; +} + } // end namespace llvm // Generated files will use "namespace PPC". To avoid symbol clash, diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index f6cd8ed00c82..9b3d13989ee2 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -107,7 +107,7 @@ def : InstRW<[P9_ALU_3C, IP_EXEC_1C, DISP_1C], (instregex "XSMAX(C|J)?DP$"), (instregex "XSMIN(C|J)?DP$"), (instregex "XSCMP(EQ|EXP|GE|GT|O|U)DP$"), - (instregex "CNT(L|T)Z(D|W)(8)?(o)?$"), + (instregex "CNT(L|T)Z(D|W)(8)?(_rec)?$"), (instregex "POPCNT(D|W)$"), (instregex "CMPB(8)?$"), (instregex "SETB(8)?$"), @@ -129,26 +129,26 @@ def : InstRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C], (instregex "MTV(S)?RW(A|Z)$"), (instregex "CMP(WI|LWI|W|LW)(8)?$"), (instregex "CMP(L)?D(I)?$"), - (instregex "SUBF(I)?C(8)?$"), - (instregex "ANDI(S)?o(8)?$"), - (instregex "ADDC(8)?$"), - (instregex "ADDIC(8)?(o)?$"), - (instregex "ADD(8|4)(o)?$"), - (instregex "ADD(E|ME|ZE)(8)?(o)?$"), - (instregex "SUBF(E|ME|ZE)?(8)?(o)?$"), - (instregex "NEG(8)?(o)?$"), + (instregex "SUBF(I)?C(8)?(O)?$"), + (instregex "ANDI(S)?(8)?(_rec)?$"), + (instregex "ADDC(8)?(O)?$"), + (instregex "ADDIC(8)?(_rec)?$"), + (instregex "ADD(8|4)(O)?(_rec)?$"), + (instregex "ADD(E|ME|ZE)(8)?(O)?(_rec)?$"), + (instregex "SUBF(E|ME|ZE)?(8)?(O)?(_rec)?$"), + (instregex "NEG(8)?(O)?(_rec)?$"), (instregex "POPCNTB$"), (instregex "ADD(I|IS)?(8)?$"), (instregex "LI(S)?(8)?$"), - (instregex "(X)?OR(I|IS)?(8)?(o)?$"), - (instregex "NAND(8)?(o)?$"), - (instregex "AND(C)?(8)?(o)?$"), - (instregex "NOR(8)?(o)?$"), - (instregex "OR(C)?(8)?(o)?$"), - (instregex "EQV(8)?(o)?$"), - (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(o)?$"), + (instregex "(X)?OR(I|IS)?(8)?(_rec)?$"), + (instregex "NAND(8)?(_rec)?$"), + (instregex "AND(C)?(8)?(_rec)?$"), + (instregex "NOR(8)?(_rec)?$"), + (instregex "OR(C)?(8)?(_rec)?$"), + (instregex "EQV(8)?(_rec)?$"), + (instregex "EXTS(B|H|W)(8)?(_32)?(_64)?(_rec)?$"), (instregex "ADD(4|8)(TLS)?(_)?$"), - (instregex "NEG(8)?$"), + (instregex "NEG(8)?(O)?$"), (instregex "ADDI(S)?toc(HA|L)(8)?$"), COPY, MCRF, @@ -211,8 +211,8 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instregex "VABSDU(B|H|W)$"), (instregex "VADDU(B|H|W)S$"), (instregex "VAVG(S|U)(B|H|W)$"), - (instregex "VCMP(EQ|GE|GT)FP(o)?$"), - (instregex "VCMPBFP(o)?$"), + (instregex "VCMP(EQ|GE|GT)FP(_rec)?$"), + (instregex "VCMPBFP(_rec)?$"), (instregex 
"VC(L|T)Z(B|H|W|D)$"), (instregex "VADDS(B|H|W)S$"), (instregex "V(MIN|MAX)FP$"), @@ -233,43 +233,43 @@ def : InstRW<[P9_ALUE_3C, P9_ALUO_3C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], VSUBUWS, VSUBCUW, VCMPGTSB, - VCMPGTSBo, + VCMPGTSB_rec, VCMPGTSD, - VCMPGTSDo, + VCMPGTSD_rec, VCMPGTSH, - VCMPGTSHo, + VCMPGTSH_rec, VCMPGTSW, - VCMPGTSWo, + VCMPGTSW_rec, VCMPGTUB, - VCMPGTUBo, + VCMPGTUB_rec, VCMPGTUD, - VCMPGTUDo, + VCMPGTUD_rec, VCMPGTUH, - VCMPGTUHo, + VCMPGTUH_rec, VCMPGTUW, - VCMPGTUWo, - VCMPNEBo, - VCMPNEHo, - VCMPNEWo, - VCMPNEZBo, - VCMPNEZHo, - VCMPNEZWo, - VCMPEQUBo, - VCMPEQUDo, - VCMPEQUHo, - VCMPEQUWo, + VCMPGTUW_rec, + VCMPNEB_rec, + VCMPNEH_rec, + VCMPNEW_rec, + VCMPNEZB_rec, + VCMPNEZH_rec, + VCMPNEZW_rec, + VCMPEQUB_rec, + VCMPEQUD_rec, + VCMPEQUH_rec, + VCMPEQUW_rec, XVCMPEQDP, - XVCMPEQDPo, + XVCMPEQDP_rec, XVCMPEQSP, - XVCMPEQSPo, + XVCMPEQSP_rec, XVCMPGEDP, - XVCMPGEDPo, + XVCMPGEDP_rec, XVCMPGESP, - XVCMPGESPo, + XVCMPGESP_rec, XVCMPGTDP, - XVCMPGTDPo, + XVCMPGTDP_rec, XVCMPGTSP, - XVCMPGTSPo, + XVCMPGTSP_rec, XVMAXDP, XVMAXSP, XVMINDP, @@ -399,7 +399,7 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], def : InstRW<[P9_DP_5C, IP_EXEC_1C, DISP_3SLOTS_1C], (instrs (instregex "MADD(HD|HDU|LD|LD8)$"), - (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?$") + (instregex "MUL(HD|HW|LD|LI|LI8|LW)(U)?(O)?$") )>; // 7 cycle Restricted DP operation. One DP unit, one EXEC pipeline and all three @@ -451,14 +451,14 @@ def : InstRW<[P9_DP_7C, IP_EXEC_1C, DISP_3SLOTS_1C], def : InstRW<[P9_DP_7C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - (instregex "FSEL(D|S)o$") + (instregex "FSEL(D|S)_rec$") )>; // 5 Cycle Restricted DP operation and one 2 cycle ALU operation. def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - (instregex "MUL(H|L)(D|W)(U)?o$") + (instregex "MUL(H|L)(D|W)(U)?(O)?_rec$") )>; // 7 cycle Restricted DP operation and one 3 cycle ALU operation. @@ -467,18 +467,18 @@ def : InstRW<[P9_DPOpAndALUOp_7C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_DPOpAndALU2Op_10C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - (instregex "FRI(N|P|Z|M)(D|S)o$"), - (instregex "FRE(S)?o$"), - (instregex "FADD(S)?o$"), - (instregex "FSUB(S)?o$"), - (instregex "F(N)?MSUB(S)?o$"), - (instregex "F(N)?MADD(S)?o$"), - (instregex "FCFID(U)?(S)?o$"), - (instregex "FCTID(U)?(Z)?o$"), - (instregex "FCTIW(U)?(Z)?o$"), - (instregex "FMUL(S)?o$"), - (instregex "FRSQRTE(S)?o$"), - FRSPo + (instregex "FRI(N|P|Z|M)(D|S)_rec$"), + (instregex "FRE(S)?_rec$"), + (instregex "FADD(S)?_rec$"), + (instregex "FSUB(S)?_rec$"), + (instregex "F(N)?MSUB(S)?_rec$"), + (instregex "F(N)?MADD(S)?_rec$"), + (instregex "FCFID(U)?(S)?_rec$"), + (instregex "FCTID(U)?(Z)?_rec$"), + (instregex "FCTIW(U)?(Z)?_rec$"), + (instregex "FMUL(S)?_rec$"), + (instregex "FRSQRTE(S)?_rec$"), + FRSP_rec )>; // 7 cycle DP operation. One DP unit, one EXEC pipeline and 1 dispatch units. @@ -613,16 +613,16 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], XSCMPUQP, XSTSTDCQP, XSXSIGQP, - BCDCFNo, - BCDCFZo, - BCDCPSGNo, - BCDCTNo, - BCDCTZo, - BCDSETSGNo, - BCDSo, - BCDTRUNCo, - BCDUSo, - BCDUTRUNCo + BCDCFN_rec, + BCDCFZ_rec, + BCDCPSGN_rec, + BCDCTN_rec, + BCDCTZ_rec, + BCDSETSGN_rec, + BCDS_rec, + BCDTRUNC_rec, + BCDUS_rec, + BCDUTRUNC_rec )>; // 12 Cycle DFU operation. Only one DFU unit per CPU so we use a whole @@ -630,7 +630,7 @@ def : InstRW<[P9_PM_3C, IP_EXECO_1C, IP_EXECE_1C, DISP_1C], // dispatch. 
def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs - BCDSRo, + BCDSR_rec, XSADDQP, XSADDQPO, XSCVDPQP, @@ -654,7 +654,7 @@ def : InstRW<[P9_DFU_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], // dispatch. def : InstRW<[P9_DFU_23C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs - BCDCTSQo + BCDCTSQ_rec )>; // 24 Cycle DFU operation. Only one DFU unit per CPU so we use a whole @@ -679,7 +679,7 @@ def : InstRW<[P9_DFU_24C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], // dispatch. def : InstRW<[P9_DFU_37C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C], (instrs - BCDCFSQo + BCDCFSQ_rec )>; // 58 Cycle DFU operation. Only one DFU unit per CPU so we use a whole @@ -819,7 +819,7 @@ def : InstRW<[P9_LoadAndALUOp_6C, IP_EXEC_1C, IP_AGEN_1C, DISP_1C, DISP_1C], (instrs (instregex "LHA(X)?(8)?$"), - (instregex "CP_PASTE(8)?o$"), + (instregex "CP_PASTE(8)?_rec$"), (instregex "LWA(X)?(_32)?$"), TCHECK )>; @@ -946,7 +946,9 @@ def : InstRW<[P9_DIV_12C, IP_EXECE_1C, IP_EXECO_1C, DISP_EVEN_1C], def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVW, + DIVWO, DIVWU, + DIVWUO, MODSW )>; @@ -956,9 +958,13 @@ def : InstRW<[P9_DIV_16C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVWE, + DIVWEO, DIVD, + DIVDO, DIVWEU, + DIVWEUO, DIVDU, + DIVDUO, MODSD, MODUD, MODUW @@ -970,7 +976,9 @@ def : InstRW<[P9_DIV_24C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], (instrs DIVDE, - DIVDEU + DIVDEO, + DIVDEU, + DIVDEUO )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation @@ -979,7 +987,7 @@ def : InstRW<[P9_DIV_40C_8, IP_EXECO_1C, IP_EXECE_1C, DISP_EVEN_1C], def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, DISP_EVEN_1C, DISP_1C], (instrs - (instregex "DIVW(U)?(O)?o$") + (instregex "DIVW(U)?(O)?_rec$") )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation @@ -988,10 +996,14 @@ def : InstRW<[P9_IntDivAndALUOp_18C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, DISP_EVEN_1C, DISP_1C], (instrs - DIVDo, - DIVDUo, - DIVWEo, - DIVWEUo + DIVD_rec, + DIVDO_rec, + DIVDU_rec, + DIVDUO_rec, + DIVWE_rec, + DIVWEO_rec, + DIVWEU_rec, + DIVWEUO_rec )>; // Cracked DIV and ALU operation. Requires one full slice for the ALU operation @@ -1000,8 +1012,10 @@ def : InstRW<[P9_IntDivAndALUOp_26C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, def : InstRW<[P9_IntDivAndALUOp_42C_8, IP_EXECE_1C, IP_EXECO_1C, IP_EXEC_1C, DISP_EVEN_1C, DISP_1C], (instrs - DIVDEo, - DIVDEUo + DIVDE_rec, + DIVDEO_rec, + DIVDEU_rec, + DIVDEUO_rec )>; // CR access instructions in _BrMCR, IIC_BrMCRX. @@ -1026,8 +1040,8 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - (instregex "ADDC(8)?o$"), - (instregex "SUBFC(8)?o$") + (instregex "ADDC(8)?(O)?_rec$"), + (instregex "SUBFC(8)?(O)?_rec$") )>; // Cracked ALU operations. @@ -1038,10 +1052,10 @@ def : InstRW<[P9_ALU_2C, P9_ALU_2C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_ALU_2C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - (instregex "F(N)?ABS(D|S)o$"), - (instregex "FCPSGN(D|S)o$"), - (instregex "FNEG(D|S)o$"), - FMRo + (instregex "F(N)?ABS(D|S)_rec$"), + (instregex "FCPSGN(D|S)_rec$"), + (instregex "FNEG(D|S)_rec$"), + FMR_rec )>; // Cracked ALU operations. 
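The isRunOfOnes64 helper added to PPCMCTargetDesc.h earlier in this patch returns the big-endian bit indices (0 = most significant bit) delimiting a contiguous run of ones, handling masks whose ones wrap around the word via the inverted case. The following is a standalone sketch of the same logic, substituting GCC/Clang builtins for LLVM's MathExtras helpers (an assumption of this sketch, not how the patch is written):

```cpp
#include <cstdint>
#include <cstdio>

static bool isMask64(uint64_t V) { return V && (((V + 1) & V) == 0); }
static bool isShiftedMask64(uint64_t V) { return V && isMask64((V - 1) | V); }
static unsigned clz64(uint64_t V) { return V ? __builtin_clzll(V) : 64; }

// On success, MB/ME hold the big-endian indices of the first and last one
// bit of the run; a wrap-around mask is detected by finding the contiguous
// run of zeros in the inverted value.
static bool isRunOfOnes64(uint64_t Val, unsigned &MB, unsigned &ME) {
  if (!Val)
    return false;
  if (isShiftedMask64(Val)) {
    MB = clz64(Val);             // first one bit
    ME = clz64((Val - 1) ^ Val); // last one bit
    return true;
  }
  Val = ~Val; // wrap-around case: the zeros form a contiguous run
  if (isShiftedMask64(Val)) {
    ME = clz64(Val) - 1;             // one bit just above the zero run
    MB = clz64((Val - 1) ^ Val) + 1; // one bit just below the zero run
    return true;
  }
  return false;
}

int main() {
  unsigned MB, ME;
  if (isRunOfOnes64(0x00FFFF0000000000ULL, MB, ME))
    std::printf("contiguous: MB=%u ME=%u\n", MB, ME); // MB=8 ME=23
  if (isRunOfOnes64(0xF00000000000000FULL, MB, ME))
    std::printf("wrapped:    MB=%u ME=%u\n", MB, ME); // MB=60 ME=3
}
```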
@@ -1063,8 +1077,8 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs - (instregex "MTFSF(b|o)?$"), - (instregex "MTFSFI(o)?$") + (instregex "MTFSF(b|_rec)?$"), + (instregex "MTFSFI(_rec)?$") )>; // Cracked instruction made of two ALU ops. @@ -1073,13 +1087,13 @@ def : InstRW<[P9_ALU_3C, P9_ALU_3C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - (instregex "RLD(I)?C(R|L)o$"), - (instregex "RLW(IMI|INM|NM)(8)?o$"), - (instregex "SLW(8)?o$"), - (instregex "SRAW(I)?o$"), - (instregex "SRW(8)?o$"), - RLDICL_32o, - RLDIMIo + (instregex "RLD(I)?C(R|L)_rec$"), + (instregex "RLW(IMI|INM|NM)(8)?_rec$"), + (instregex "SLW(8)?_rec$"), + (instregex "SRAW(I)?_rec$"), + (instregex "SRW(8)?_rec$"), + RLDICL_32_rec, + RLDIMI_rec )>; // Cracked instruction made of two ALU ops. @@ -1088,7 +1102,7 @@ def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, def : InstRW<[P9_ALU2OpAndALU2Op_6C, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_3SLOTS_1C], (instrs - (instregex "MFFS(L|CE|o)?$") + (instregex "MFFS(L|CE|_rec)?$") )>; // Cracked ALU instruction composed of three consecutive 2 cycle loads for a @@ -1104,12 +1118,12 @@ def : InstRW<[P9_ALUOpAndALUOpAndALUOp_6C, IP_EXEC_1C, IP_EXEC_1C, IP_EXEC_1C, // The two ops cannot be done in parallel. def : InstRW<[P9_ALUOpAndALUOp_4C, IP_EXEC_1C, IP_EXEC_1C, DISP_1C, DISP_1C], (instrs - (instregex "EXTSWSLI_32_64o$"), - (instregex "SRAD(I)?o$"), - EXTSWSLIo, - SLDo, - SRDo, - RLDICo + (instregex "EXTSWSLI_32_64_rec$"), + (instregex "SRAD(I)?_rec$"), + EXTSWSLI_rec, + SLD_rec, + SRD_rec, + RLDIC_rec )>; // 33 Cycle DP Instruction Restricted. Takes one slice and 3 dispatches. @@ -1122,7 +1136,7 @@ def : InstRW<[P9_DP_33C_8, IP_EXEC_1C, DISP_3SLOTS_1C], def : InstRW<[P9_DPOpAndALU2Op_36C_8, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - FDIVo + FDIV_rec )>; // 36 Cycle DP Instruction. @@ -1156,7 +1170,7 @@ def : InstRW<[P9_DPE_27C_10, P9_DPO_27C_10, IP_EXECE_1C, IP_EXECO_1C, def : InstRW<[P9_DPOpAndALU2Op_39C_10, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - FSQRTo + FSQRT_rec )>; // 26 Cycle DP Instruction. @@ -1175,7 +1189,7 @@ def : InstRW<[P9_DP_26C_5, IP_EXEC_1C, DISP_3SLOTS_1C], def : InstRW<[P9_DPOpAndALU2Op_29C_5, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - FSQRTSo + FSQRTS_rec )>; // 33 Cycle DP Instruction. Takes one slice and 1 dispatch. @@ -1194,7 +1208,7 @@ def : InstRW<[P9_DP_22C_5, IP_EXEC_1C, DISP_3SLOTS_1C], def : InstRW<[P9_DPOpAndALU2Op_25C_5, IP_EXEC_1C, IP_EXEC_1C, DISP_3SLOTS_1C, DISP_1C], (instrs - FDIVSo + FDIVS_rec )>; // 22 Cycle DP Instruction. Takes one slice and 1 dispatch. 
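The P9InstrResources.td changes above are mechanical renames: the trailing `o` on record-form opcodes becomes `_rec`, and overflow variants are spelled with a trailing `O`, so the instregex patterns gain `(O)?` and `(_rec)?` groups. A quick standalone check of one renamed pattern, assuming instregex behaves like an ordinary anchored regular expression over opcode names:

```cpp
#include <cstdio>
#include <regex>

int main() {
  // One of the renamed patterns from the 2-cycle ALU class above.
  std::regex Pat("^ADD(8|4)(O)?(_rec)?$");
  for (const char *Op :
       {"ADD4", "ADD8O", "ADD8_rec", "ADD8O_rec", "ADD8o" /* old name */})
    std::printf("%-9s %s\n", Op,
                std::regex_match(Op, Pat) ? "matches" : "no match");
}
```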
@@ -1304,6 +1318,7 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C], BCLalways, BCLn, BCTRL8_LDinto_toc, + BCTRL_LWZinto_toc, BCn, CTRL_DEP )>; @@ -1400,7 +1415,7 @@ def : InstRW<[], MBAR, MSYNC, SLBSYNC, - SLBFEEo, + SLBFEE_rec, NAP, STOP, TRAP, diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 0534773c4c9e..a83509f0e687 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -28,12 +28,13 @@ namespace llvm { class AsmPrinter; class MCInst; class MCOperand; - + class ModulePass; + FunctionPass *createPPCCTRLoops(); #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif - FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM); + FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM); FunctionPass *createPPCTOCRegDepsPass(); FunctionPass *createPPCEarlyReturnPass(); FunctionPass *createPPCVSXCopyPass(); @@ -59,7 +60,7 @@ namespace llvm { #ifndef NDEBUG void initializePPCCTRLoopsVerifyPass(PassRegistry&); #endif - void initializePPCLoopPreIncPrepPass(PassRegistry&); + void initializePPCLoopInstrFormPrepPass(PassRegistry&); void initializePPCTOCRegDepsPass(PassRegistry&); void initializePPCEarlyReturnPass(PassRegistry&); void initializePPCVSXCopyPass(PassRegistry&); @@ -77,6 +78,10 @@ namespace llvm { extern char &PPCVSXFMAMutateID; + ModulePass *createPPCLowerMASSVEntriesPass(); + void initializePPCLowerMASSVEntriesPass(PassRegistry &); + extern char &PPCLowerMASSVEntriesID; + namespace PPCII { /// Target Operand Flag enum. diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 8e94a2ae15e0..bef0a81ee3ad 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -22,35 +22,37 @@ include "llvm/Target/Target.td" // CPU Directives // //===----------------------------------------------------------------------===// -def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">; -def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">; -def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">; -def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; -def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; -def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">; -def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">; -def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">; -def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">; -def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">; -def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">; -def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">; -def DirectiveE500 : SubtargetFeature<"", "DarwinDirective", +def Directive440 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_440", "">; +def Directive601 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_601", "">; +def Directive602 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_602", "">; +def Directive603 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">; +def Directive604 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">; +def Directive620 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_603", "">; +def Directive7400: SubtargetFeature<"", "CPUDirective", "PPC::DIR_7400", "">; +def Directive750 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_750", "">; +def Directive970 : 
SubtargetFeature<"", "CPUDirective", "PPC::DIR_970", "">; +def Directive32 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_32", "">; +def Directive64 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_64", "">; +def DirectiveA2 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_A2", "">; +def DirectiveE500 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_E500", "">; -def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective", +def DirectiveE500mc : SubtargetFeature<"", "CPUDirective", "PPC::DIR_E500mc", "">; -def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective", +def DirectiveE5500 : SubtargetFeature<"", "CPUDirective", "PPC::DIR_E5500", "">; -def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">; -def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">; -def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">; +def DirectivePwr3: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR3", "">; +def DirectivePwr4: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR4", "">; +def DirectivePwr5: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR5", "">; def DirectivePwr5x - : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">; -def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">; + : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR5X", "">; +def DirectivePwr6: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR6", "">; def DirectivePwr6x - : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">; -def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">; -def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; -def DirectivePwr9: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR9", "">; + : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR6X", "">; +def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">; +def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">; +def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">; +def DirectivePwrFuture + : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -164,6 +166,9 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", "Implement mftb using the mfspr instruction">; +def FeatureUnalignedFloats : + SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", + "true", "CPU does not trap on unaligned FP access">; def FeaturePPCPreRASched: SubtargetFeature<"ppc-prera-sched", "UsePPCPreRASchedStrategy", "true", "Use PowerPC pre-RA scheduling strategy">; @@ -209,36 +214,95 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", // came before them, the idea is to make implementations of new processors // less error prone and easier to read. // Namely: -// list Power8FeatureList = ... -// list FutureProcessorSpecificFeatureList = -// [ features that Power8 does not support ] -// list FutureProcessorFeatureList = -// !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList) +// list P8InheritableFeatures = ... 
+// list FutureProcessorAddtionalFeatures = +// [ features that Power8 does not support but inheritable ] +// list FutureProcessorSpecificFeatures = +// [ features that Power8 does not support and not inheritable ] +// list FutureProcessorInheritableFeatures = +// !listconcat(P8InheritableFeatures, FutureProcessorAddtionalFeatures) +// list FutureProcessorFeatures = +// !listconcat(FutureProcessorInheritableFeatures, +// FutureProcessorSpecificFeatures) // Makes it explicit and obvious what is new in FutureProcesor vs. Power8 as // well as providing a single point of definition if the feature set will be // used elsewhere. def ProcessorFeatures { - list Power7FeatureList = - [DirectivePwr7, FeatureAltivec, FeatureVSX, - FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, - FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, - FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, - FeatureBPERMD, FeatureExtDiv, - FeatureMFTB, DeprecatedDST, FeatureTwoConstNR]; - list Power8SpecificFeatures = - [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; - list Power8FeatureList = - !listconcat(Power7FeatureList, Power8SpecificFeatures); - list Power9SpecificFeatures = - [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0, - FeatureVectorsUseTwoUnits, FeaturePPCPreRASched, FeaturePPCPostRASched]; - list Power9FeatureList = - !listconcat(Power8FeatureList, Power9SpecificFeatures); + // Power7 + list P7InheritableFeatures = [DirectivePwr7, + FeatureAltivec, + FeatureVSX, + FeatureMFOCRF, + FeatureFCPSGN, + FeatureFSqrt, + FeatureFRE, + FeatureFRES, + FeatureFRSQRTE, + FeatureFRSQRTES, + FeatureRecipPrec, + FeatureSTFIWX, + FeatureLFIWAX, + FeatureFPRND, + FeatureFPCVT, + FeatureISEL, + FeaturePOPCNTD, + FeatureCMPB, + FeatureLDBRX, + Feature64Bit, + /* Feature64BitRegs, */ + FeatureBPERMD, + FeatureExtDiv, + FeatureMFTB, + DeprecatedDST, + FeatureTwoConstNR, + FeatureUnalignedFloats]; + list P7SpecificFeatures = []; + list P7Features = + !listconcat(P7InheritableFeatures, P7SpecificFeatures); + + // Power8 + list P8AdditionalFeatures = [DirectivePwr8, + FeatureP8Altivec, + FeatureP8Vector, + FeatureP8Crypto, + FeatureHTM, + FeatureDirectMove, + FeatureICBT, + FeaturePartwordAtomic]; + list P8SpecificFeatures = []; + list P8InheritableFeatures = + !listconcat(P7InheritableFeatures, P8AdditionalFeatures); + list P8Features = + !listconcat(P8InheritableFeatures, P8SpecificFeatures); + + // Power9 + list P9AdditionalFeatures = [DirectivePwr9, + FeatureP9Altivec, + FeatureP9Vector, + FeatureISA3_0]; + // Some features are unique to Power9 and there is no reason to assume + // they will be part of any future CPUs. One example is the narrower + // dispatch for vector operations than scalar ones. For the time being, + // this list also includes scheduling-related features since we do not have + // enough info to create custom scheduling strategies for future CPUs. 
+ list P9SpecificFeatures = [FeatureVectorsUseTwoUnits, + FeaturePPCPreRASched, + FeaturePPCPostRASched]; + list P9InheritableFeatures = + !listconcat(P8InheritableFeatures, P9AdditionalFeatures); + list P9Features = + !listconcat(P9InheritableFeatures, P9SpecificFeatures); + + // Future + // For future CPU we assume that all of the existing features from Power 9 + // still exist with the exception of those we know are Power 9 specific. + list FutureAdditionalFeatures = []; + list FutureSpecificFeatures = []; + list FutureInheritableFeatures = + !listconcat(P9InheritableFeatures, FutureAdditionalFeatures); + list FutureFeatures = + !listconcat(FutureInheritableFeatures, FutureSpecificFeatures); } // Note: Future features to add when support is extended to more @@ -378,7 +442,7 @@ def : ProcessorModel<"g5", G5Model, def : ProcessorModel<"e500", PPCE500Model, [DirectiveE500, FeatureICBT, FeatureBookE, - FeatureISEL, FeatureMFTB]>; + FeatureISEL, FeatureMFTB, FeatureSPE]>; def : ProcessorModel<"e500mc", PPCE500mcModel, [DirectiveE500mc, FeatureSTFIWX, FeatureICBT, FeatureBookE, @@ -438,9 +502,12 @@ def : ProcessorModel<"pwr6x", G5Model, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, FeatureFPRND, Feature64Bit, FeatureMFTB, DeprecatedDST]>; -def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; -def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; -def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; +def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>; +def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>; +def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>; +// No scheduler model for future CPU. +def : ProcessorModel<"future", NoSchedModel, + ProcessorFeatures.FutureFeatures>; def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat, FeatureMFTB]>; def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat, @@ -451,7 +518,7 @@ def : ProcessorModel<"ppc64", G5Model, FeatureFRSQRTE, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>; +def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.P8Features>; //===----------------------------------------------------------------------===// // Calling Conventions diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 66236b72a1a3..4311df5dbeb8 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -79,9 +80,11 @@ namespace { class PPCAsmPrinter : public AsmPrinter { protected: MapVector TOC; - const PPCSubtarget *Subtarget; + const PPCSubtarget *Subtarget = nullptr; StackMaps SM; + virtual MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO); + public: explicit PPCAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -144,23 +147,12 @@ public: void EmitInstruction(const MachineInstr *MI) override; }; -/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac -/// OS X -class PPCDarwinAsmPrinter : public PPCAsmPrinter { -public: - explicit PPCDarwinAsmPrinter(TargetMachine &TM, - std::unique_ptr Streamer) - : PPCAsmPrinter(TM, 
std::move(Streamer)) {} - - StringRef getPassName() const override { - return "Darwin PPC Assembly Printer"; - } - - bool doFinalization(Module &M) override; - void EmitStartOfAsmFile(Module &M) override; -}; - class PPCAIXAsmPrinter : public PPCAsmPrinter { +private: + static void ValidateGV(const GlobalVariable *GV); +protected: + MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO) override; + public: PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : PPCAsmPrinter(TM, std::move(Streamer)) {} @@ -169,6 +161,8 @@ public: void SetupMachineFunction(MachineFunction &MF) override; + const MCExpr *lowerConstant(const Constant *CV) override; + void EmitGlobalVariable(const GlobalVariable *GV) override; void EmitFunctionDescriptor() override; @@ -351,8 +345,12 @@ void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) { void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = MI.getOperand(1).getImm(); + + auto &Ctx = OutStreamer->getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(MILabel); - SM.recordStackMap(MI); + SM.recordStackMap(*MILabel, MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); // Scan ahead to trim the shadow. @@ -377,7 +375,11 @@ void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { // Lower a patchpoint of the form: // [], , , , void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { - SM.recordPatchPoint(MI); + auto &Ctx = OutStreamer->getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer->EmitLabel(MILabel); + + SM.recordPatchPoint(*MILabel, MI); PatchPointOpers Opers(&MI); unsigned EncodedBytes = 0; @@ -490,9 +492,9 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, (!Subtarget->isPPC64() && MI->getOperand(1).getReg() == PPC::R3)) && "GETtls[ld]ADDR[32] must read GPR3"); - if (!Subtarget->isPPC64() && !Subtarget->isDarwin() && - isPositionIndependent()) + if (Subtarget->is32BitELFABI() && isPositionIndependent()) Kind = MCSymbolRefExpr::VK_PLT; + const MCExpr *TlsRef = MCSymbolRefExpr::create(TlsGetAddr, Kind, OutContext); @@ -514,17 +516,16 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, /// Map a machine operand for a TOC pseudo-machine instruction to its /// corresponding MCSymbol. -static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, - AsmPrinter &AP) { +MCSymbol *PPCAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) { switch (MO.getType()) { case MachineOperand::MO_GlobalAddress: - return AP.getSymbol(MO.getGlobal()); + return getSymbol(MO.getGlobal()); case MachineOperand::MO_ConstantPoolIndex: - return AP.GetCPISymbol(MO.getIndex()); + return GetCPISymbol(MO.getIndex()); case MachineOperand::MO_JumpTableIndex: - return AP.GetJTISymbol(MO.getIndex()); + return GetJTISymbol(MO.getIndex()); case MachineOperand::MO_BlockAddress: - return AP.GetBlockAddressSymbol(MO.getBlockAddress()); + return GetBlockAddressSymbol(MO.getBlockAddress()); default: llvm_unreachable("Unexpected operand type to get symbol."); } @@ -688,7 +689,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for LWZtoc."); // Map the operand to its corresponding MCSymbol. - const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Create a reference to the GOT entry for the symbol. The GOT entry will be // synthesized later. 
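One structural change in PPCAsmPrinter.cpp above: getMCSymbolForTOCPseudoMO was a file-local static helper taking the printer as a parameter, and is now a protected virtual member so that PPCAIXAsmPrinter can override it to hand back csect symbols. A minimal sketch of that refactoring pattern follows; the class and member names are illustrative, not the real LLVM types:

```cpp
#include <cstdio>

// Before: static const char *symbolFor(bool IsGlobal, BasePrinter &AP);
// After: a virtual member the subclass can specialize.
struct BasePrinter {
  virtual ~BasePrinter() = default;
  virtual const char *symbolForTOCOperand(bool IsGlobal) {
    return "plain-symbol";
  }
  void emitTOCLoad(bool IsGlobal) {
    std::printf("ld r, %s@toc\n", symbolForTOCOperand(IsGlobal));
  }
};

struct AIXPrinter : BasePrinter {
  // The AIX override resolves global operands to a csect qualname symbol.
  const char *symbolForTOCOperand(bool IsGlobal) override {
    return IsGlobal ? "csect-qualname-symbol"
                    : BasePrinter::symbolForTOCOperand(IsGlobal);
  }
};

int main() {
  AIXPrinter P;
  P.emitTOCLoad(true);  // ld r, csect-qualname-symbol@toc
  P.emitTOCLoad(false); // ld r, plain-symbol@toc
}
```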
@@ -749,7 +750,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // global address operand to be a reference to the TOC entry we will // synthesize later. MCSymbol *TOCEntry = - lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO)); const MCSymbolRefExpr::VariantKind VK = IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; @@ -775,7 +776,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for ADDIStocHA."); // Map the machine operand to its corresponding MCSymbol. - MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Always use TOC on AIX. Map the global address operand to be a reference // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to @@ -805,7 +806,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for LWZtocL."); // Map the machine operand to its corresponding MCSymbol. - MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Always use TOC on AIX. Map the global address operand to be a reference // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to @@ -835,7 +836,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand for ADDIStocHA8!"); - const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); const bool GlobalToc = MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal()); @@ -881,7 +882,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "LDtocL used on symbol that could be accessed directly is " "invalid. Must match ADDIStocHA8.")); - const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -911,7 +912,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Interposable definitions must use indirect access.")); const MCExpr *Exp = - MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this), + MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO), MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); @@ -1578,151 +1579,6 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() { } } -void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { - static const char *const CPUDirectives[] = { - "", - "ppc", - "ppc440", - "ppc601", - "ppc602", - "ppc603", - "ppc7400", - "ppc750", - "ppc970", - "ppcA2", - "ppce500", - "ppce500mc", - "ppce5500", - "power3", - "power4", - "power5", - "power5x", - "power6", - "power6x", - "power7", - // FIXME: why is power8 missing here? - "ppc64", - "ppc64le", - "power9" - }; - - // Get the numerically largest directive. - // FIXME: How should we merge darwin directives? - unsigned Directive = PPC::DIR_NONE; - for (const Function &F : M) { - const PPCSubtarget &STI = TM.getSubtarget(F); - unsigned FDir = STI.getDarwinDirective(); - Directive = Directive > FDir ? 
FDir : STI.getDarwinDirective(); - if (STI.hasMFOCRF() && Directive < PPC::DIR_970) - Directive = PPC::DIR_970; - if (STI.hasAltivec() && Directive < PPC::DIR_7400) - Directive = PPC::DIR_7400; - if (STI.isPPC64() && Directive < PPC::DIR_64) - Directive = PPC::DIR_64; - } - - assert(Directive <= PPC::DIR_64 && "Directive out of range."); - - assert(Directive < array_lengthof(CPUDirectives) && - "CPUDirectives[] might not be up-to-date!"); - PPCTargetStreamer &TStreamer = - *static_cast(OutStreamer->getTargetStreamer()); - TStreamer.emitMachine(CPUDirectives[Directive]); - - // Prime text sections so they are adjacent. This reduces the likelihood a - // large data or debug section causes a branch to exceed 16M limit. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast(getObjFileLowering()); - OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection()); - if (TM.getRelocationModel() == Reloc::PIC_) { - OutStreamer->SwitchSection( - OutContext.getMachOSection("__TEXT", "__picsymbolstub1", - MachO::S_SYMBOL_STUBS | - MachO::S_ATTR_PURE_INSTRUCTIONS, - 32, SectionKind::getText())); - } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) { - OutStreamer->SwitchSection( - OutContext.getMachOSection("__TEXT","__symbol_stub1", - MachO::S_SYMBOL_STUBS | - MachO::S_ATTR_PURE_INSTRUCTIONS, - 16, SectionKind::getText())); - } - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); -} - -bool PPCDarwinAsmPrinter::doFinalization(Module &M) { - bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; - - // Darwin/PPC always uses mach-o. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast(getObjFileLowering()); - if (MMI) { - MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo(); - - if (MAI->doesSupportExceptionHandling()) { - // Add the (possibly multiple) personalities to the set of global values. - // Only referenced functions get into the Personalities list. - for (const Function *Personality : MMI->getPersonalities()) { - if (Personality) { - MCSymbol *NLPSym = - getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMIMacho.getGVStubEntry(NLPSym); - StubSym = - MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true); - } - } - } - - // Output stubs for dynamically-linked functions. - MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList(); - - // Output macho stubs for external and common global variables. - if (!Stubs.empty()) { - // Switch with ".non_lazy_symbol_pointer" directive. - OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(isPPC64 ? Align(8) : Align(4)); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - // L_foo$stub: - OutStreamer->EmitLabel(Stubs[i].first); - // .indirect_symbol _foo - MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; - OutStreamer->EmitSymbolAttribute(MCSym.getPointer(), - MCSA_IndirectSymbol); - - if (MCSym.getInt()) - // External to current translation unit. - OutStreamer->EmitIntValue(0, isPPC64 ? 8 : 4 /*size*/); - else - // Internal to current translation unit. - // - // When we place the LSDA into the TEXT section, the type info - // pointers - // need to be indirect and pc-rel. We accomplish this by using NLPs. - // However, sometimes the types are local to the file. So we need to - // fill in the value for the NLP in those cases. - OutStreamer->EmitValue( - MCSymbolRefExpr::create(MCSym.getPointer(), OutContext), - isPPC64 ? 
8 : 4 /*size*/); - } - - Stubs.clear(); - OutStreamer->AddBlankLine(); - } - } - - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never generates - // code that does this, it is always safe to set. - OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - - return AsmPrinter::doFinalization(M); -} - void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { // Get the function descriptor symbol. CurrentFnDescSym = getSymbol(&MF.getFunction()); @@ -1735,7 +1591,7 @@ void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { return AsmPrinter::SetupMachineFunction(MF); } -void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { +void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) { // Early error checking limiting what is supported. if (GV->isThreadLocal()) report_fatal_error("Thread local not yet supported on AIX."); @@ -1745,22 +1601,51 @@ void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GV->hasComdat()) report_fatal_error("COMDAT not yet supported by AIX."); +} + +const MCExpr *PPCAIXAsmPrinter::lowerConstant(const Constant *CV) { + if (const Function *F = dyn_cast(CV)) { + MCSymbolXCOFF *FSym = cast(getSymbol(F)); + if (!FSym->hasContainingCsect()) { + const XCOFF::StorageClass SC = + F->isDeclaration() + ? TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F) + : XCOFF::C_HIDEXT; + MCSectionXCOFF *Csect = OutStreamer->getContext().getXCOFFSection( + FSym->getName(), XCOFF::XMC_DS, + F->isDeclaration() ? XCOFF::XTY_ER : XCOFF::XTY_SD, SC, + SectionKind::getData()); + FSym->setContainingCsect(Csect); + } + return MCSymbolRefExpr::create( + FSym->getContainingCsect()->getQualNameSymbol(), OutContext); + } + return PPCAsmPrinter::lowerConstant(CV); +} + +void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + ValidateGV(GV); + + // External global variables are already handled. + if (!GV->hasInitializer()) + return; + + // Create the symbol, set its storage class. + MCSymbolXCOFF *GVSym = cast(getSymbol(GV)); + GVSym->setStorageClass( + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM); - if (!GVKind.isCommon() && !GVKind.isBSSLocal() && !GVKind.isData()) + if ((!GVKind.isGlobalWriteableData() && !GVKind.isReadOnly()) || + GVKind.isMergeable2ByteCString() || GVKind.isMergeable4ByteCString()) report_fatal_error("Encountered a global variable kind that is " "not supported yet."); // Create the containing csect and switch to it. - MCSectionXCOFF *CSect = cast( + MCSectionXCOFF *Csect = cast( getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); - OutStreamer->SwitchSection(CSect); - - // Create the symbol, set its storage class, and emit it. 
-  MCSymbolXCOFF *GVSym = cast<MCSymbolXCOFF>(getSymbol(GV));
-  GVSym->setStorageClass(
-      TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
-  GVSym->setContainingCsect(CSect);
+  OutStreamer->SwitchSection(Csect);
+  GVSym->setContainingCsect(Csect);

   const DataLayout &DL = GV->getParent()->getDataLayout();
@@ -1771,9 +1656,10 @@ void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());

     if (GVKind.isBSSLocal())
-      OutStreamer->EmitXCOFFLocalCommonSymbol(GVSym, Size, Align);
+      OutStreamer->EmitXCOFFLocalCommonSymbol(
+          GVSym, Size, Csect->getQualNameSymbol(), Align);
     else
-      OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+      OutStreamer->EmitCommonSymbol(Csect->getQualNameSymbol(), Size, Align);
     return;
   }
@@ -1797,7 +1683,10 @@ void PPCAIXAsmPrinter::EmitFunctionDescriptor() {
   OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
                          PointerSize);
   // Emit TOC base address.
-  MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]"));
+  const MCSectionXCOFF *TOCBaseSec = OutStreamer->getContext().getXCOFFSection(
+      StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT,
+      SectionKind::getData());
+  const MCSymbol *TOCBaseSym = TOCBaseSec->getQualNameSymbol();
   OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
                          PointerSize);
   // Emit a null environment pointer.
@@ -1813,15 +1702,90 @@ void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) {
     return;

   // Emit TOC base.
-  MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]"));
   MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection(
       StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT,
       SectionKind::getData());
-  cast<MCSymbolXCOFF>(TOCBaseSym)->setContainingCsect(TOCBaseSection);
+  // The TOC-base always has 0 size, but 4 byte alignment.
+  TOCBaseSection->setAlignment(Align(4));
   // Switch to section to emit TOC base.
   OutStreamer->SwitchSection(TOCBaseSection);
+
+  PPCTargetStreamer &TS =
+      static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+  for (auto &I : TOC) {
+    // Setup the csect for the current TC entry.
+    MCSectionXCOFF *TCEntry = OutStreamer->getContext().getXCOFFSection(
+        cast<MCSymbolXCOFF>(I.first)->getUnqualifiedName(), XCOFF::XMC_TC,
+        XCOFF::XTY_SD, XCOFF::C_HIDEXT, SectionKind::getData());
+    cast<MCSymbolXCOFF>(I.second)->setContainingCsect(TCEntry);
+    OutStreamer->SwitchSection(TCEntry);
+
+    OutStreamer->EmitLabel(I.second);
+    TS.emitTCEntry(*I.first);
+  }
 }

+MCSymbol *
+PPCAIXAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) {
+  const GlobalObject *GO = nullptr;
+
+  // If the MO is a function or certain kinds of globals, we want to make sure
+  // to refer to the csect symbol, otherwise we can just do the default handling.
+  if (MO.getType() != MachineOperand::MO_GlobalAddress ||
+      !(GO = dyn_cast<GlobalObject>(MO.getGlobal())))
+    return PPCAsmPrinter::getMCSymbolForTOCPseudoMO(MO);
+
+  // Do an early error check for globals we don't support. This will go away
+  // eventually.
+  const auto *GV = dyn_cast<GlobalVariable>(GO);
+  if (GV) {
+    ValidateGV(GV);
+  }
+
+  MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(getSymbol(GO));
+
+  // If the global object is a global variable without an initializer or is a
+  // declaration of a function, then XSym is an externally referenced symbol.
+  // Hence we may need to explicitly create a MCSectionXCOFF for it so that we
+  // can return its symbol later.
+  if (GO->isDeclaration()) {
+    if (!XSym->hasContainingCsect()) {
+      // Make sure the storage class is set.
+      const XCOFF::StorageClass SC =
+          TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
+      XSym->setStorageClass(SC);
+
+      MCSectionXCOFF *Csect = OutStreamer->getContext().getXCOFFSection(
+          XSym->getName(), isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA,
+          XCOFF::XTY_ER, SC, SectionKind::getMetadata());
+      XSym->setContainingCsect(Csect);
+    }
+
+    return XSym->getContainingCsect()->getQualNameSymbol();
+  }
+
+  // Handle initialized global variables and defined functions.
+  SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
+
+  if (GOKind.isText()) {
+    // If the MO is a function, we want to make sure to refer to the function
+    // descriptor csect.
+    return OutStreamer->getContext()
+        .getXCOFFSection(XSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD,
+                         XCOFF::C_HIDEXT, SectionKind::getData())
+        ->getQualNameSymbol();
+  } else if (GOKind.isCommon() || GOKind.isBSSLocal()) {
+    // If the operand is a common symbol, then we should refer to the csect
+    // symbol.
+    return cast<MCSectionXCOFF>(
+               getObjFileLowering().SectionForGlobal(GO, GOKind, TM))
+        ->getQualNameSymbol();
+  }
+
+  // Other global variables are referred to by labels inside a single csect,
+  // so refer to the label directly.
+  return getSymbol(GV);
+}

 /// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
 /// for a MachineFunction to the given output stream, in a format that the
@@ -1830,8 +1794,6 @@ void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) {
 static AsmPrinter *
 createPPCAsmPrinterPass(TargetMachine &tm,
                         std::unique_ptr<MCStreamer> &&Streamer) {
-  if (tm.getTargetTriple().isMacOSX())
-    return new PPCDarwinAsmPrinter(tm, std::move(Streamer));
   if (tm.getTargetTriple().isOSAIX())
     return new PPCAIXAsmPrinter(tm, std::move(Streamer));

@@ -1839,7 +1801,7 @@ createPPCAsmPrinterPass(TargetMachine &tm,
 }

 // Force static initialization.
-extern "C" void LLVMInitializePowerPCAsmPrinter() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getThePPC32Target(), createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getThePPC64Target(), diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp index d325b078979f..109b665e0d57 100644 --- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp +++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index 2b8d9b87724f..4ce705300e1b 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -33,10 +33,8 @@ #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Constants.h" @@ -47,6 +45,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/PassSupport.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -54,6 +53,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #ifndef NDEBUG diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 06a4d183e781..4c608520e265 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -782,8 +782,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); DebugLoc dl; - bool needsCFI = MMI.hasDebugInfo() || - MF.getFunction().needsUnwindTableEntry(); + bool needsCFI = MF.needsFrameMoves(); // Get processor type. bool isPPC64 = Subtarget.isPPC64(); @@ -2442,10 +2441,6 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, } unsigned PPCFrameLowering::getTOCSaveOffset() const { - if (Subtarget.isAIXABI()) - // TOC save/restore is normally handled by the linker. - // Indirect calls should hit this limitation. - report_fatal_error("TOC save is not implemented on AIX yet."); return TOCSaveOffset; } diff --git a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp index 391ebcc1a143..ffaa3e05c847 100644 --- a/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp +++ b/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp @@ -158,7 +158,7 @@ unsigned PPCDispatchGroupSBHazardRecognizer::PreEmitNoops(SUnit *SU) { // new group. if (isLoadAfterStore(SU) && CurSlots < 6) { unsigned Directive = - DAG->MF.getSubtarget().getDarwinDirective(); + DAG->MF.getSubtarget().getCPUDirective(); // If we're using a special group-terminating nop, then we need only one. 
// FIXME: the same for P9 as previous gen until POWER9 scheduling is ready if (Directive == PPC::DIR_PWR6 || Directive == PPC::DIR_PWR7 || @@ -218,7 +218,7 @@ void PPCDispatchGroupSBHazardRecognizer::Reset() { void PPCDispatchGroupSBHazardRecognizer::EmitNoop() { unsigned Directive = - DAG->MF.getSubtarget().getDarwinDirective(); + DAG->MF.getSubtarget().getCPUDirective(); // If the group has now filled all of its slots, or if we're using a special // group-terminating nop, the group is complete. // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 4ad6c88233fe..776ec52e2604 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -138,9 +138,9 @@ namespace { /// class PPCDAGToDAGISel : public SelectionDAGISel { const PPCTargetMachine &TM; - const PPCSubtarget *PPCSubTarget; - const PPCTargetLowering *PPCLowering; - unsigned GlobalBaseReg; + const PPCSubtarget *PPCSubTarget = nullptr; + const PPCTargetLowering *PPCLowering = nullptr; + unsigned GlobalBaseReg = 0; public: explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel) @@ -204,6 +204,7 @@ namespace { bool tryBitfieldInsert(SDNode *N); bool tryBitPermutation(SDNode *N); bool tryIntCompareInGPR(SDNode *N); + bool tryAndWithMask(SDNode *N); // tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into // an X-Form load instruction with the offset being a relocation coming from @@ -309,7 +310,6 @@ namespace { errs() << "ConstraintID: " << ConstraintID << "\n"; llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_es: - case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: @@ -512,13 +512,14 @@ static bool isInt64Immediate(SDValue N, uint64_t &Imm) { return isInt64Immediate(N.getNode(), Imm); } -static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, +static unsigned getBranchHint(unsigned PCC, + const FunctionLoweringInfo &FuncInfo, const SDValue &DestMBB) { assert(isa(DestMBB)); - if (!FuncInfo->BPI) return PPC::BR_NO_HINT; + if (!FuncInfo.BPI) return PPC::BR_NO_HINT; - const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const BasicBlock *BB = FuncInfo.MBB->getBasicBlock(); const Instruction *BBTerm = BB->getTerminator(); if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; @@ -526,8 +527,8 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, const BasicBlock *TBB = BBTerm->getSuccessor(0); const BasicBlock *FBB = BBTerm->getSuccessor(1); - auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); - auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); + auto TProb = FuncInfo.BPI->getEdgeProbability(BB, TBB); + auto FProb = FuncInfo.BPI->getEdgeProbability(BB, FBB); // We only want to handle cases which are easy to predict at static time, e.g. 
// C++ throw statement, that is very likely not taken, or calling never @@ -547,7 +548,7 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) return PPC::BR_NO_HINT; - LLVM_DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() + LLVM_DEBUG(dbgs() << "Use branch hint for '" << FuncInfo.Fn->getName() << "::" << BB->getName() << "'\n" << " -> " << TBB->getName() << ": " << TProb << "\n" << " -> " << FBB->getName() << ": " << FProb << "\n"); @@ -1044,7 +1045,7 @@ static unsigned allUsesTruncate(SelectionDAG *CurDAG, SDNode *N) { if (Use->isMachineOpcode()) return 0; MaxTruncation = - std::max(MaxTruncation, Use->getValueType(0).getSizeInBits()); + std::max(MaxTruncation, (unsigned)Use->getValueType(0).getSizeInBits()); continue; case ISD::STORE: { if (Use->isMachineOpcode()) @@ -1399,11 +1400,14 @@ class BitPermutationSelector { for (unsigned i = 0; i < NumValidBits; ++i) Bits[i] = (*LHSBits)[i]; - // These bits are known to be zero. + // These bits are known to be zero but the AssertZext may be from a value + // that already has some constant zero bits (i.e. from a masking and). for (unsigned i = NumValidBits; i < NumBits; ++i) - Bits[i] = ValueBit((*LHSBits)[i].getValue(), - (*LHSBits)[i].getValueBitIndex(), - ValueBit::VariableKnownToBeZero); + Bits[i] = (*LHSBits)[i].hasValue() + ? ValueBit((*LHSBits)[i].getValue(), + (*LHSBits)[i].getValueBitIndex(), + ValueBit::VariableKnownToBeZero) + : ValueBit(ValueBit::ConstZero); return std::make_pair(Interesting, &Bits); } @@ -1811,11 +1815,14 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) - ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, - VRot, getI32Imm(ANDIMask, dl)), 0); + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDI_rec, dl, MVT::i32, + VRot, getI32Imm(ANDIMask, dl)), + 0); if (ANDISMask != 0) - ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, - VRot, getI32Imm(ANDISMask, dl)), 0); + ANDISVal = + SDValue(CurDAG->getMachineNode(PPC::ANDIS_rec, dl, MVT::i32, VRot, + getI32Imm(ANDISMask, dl)), + 0); SDValue TotalVal; if (!ANDIVal) @@ -1904,11 +1911,14 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) - ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo, dl, MVT::i32, - Res, getI32Imm(ANDIMask, dl)), 0); + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDI_rec, dl, MVT::i32, + Res, getI32Imm(ANDIMask, dl)), + 0); if (ANDISMask != 0) - ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo, dl, MVT::i32, - Res, getI32Imm(ANDISMask, dl)), 0); + ANDISVal = + SDValue(CurDAG->getMachineNode(PPC::ANDIS_rec, dl, MVT::i32, Res, + getI32Imm(ANDISMask, dl)), + 0); if (!ANDIVal) Res = ANDISVal; @@ -2181,15 +2191,16 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) - ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDI8_rec, dl, MVT::i64, ExtendToInt64(VRot, dl), getI32Imm(ANDIMask, dl)), 0); if (ANDISMask != 0) - ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - ExtendToInt64(VRot, dl), - getI32Imm(ANDISMask, dl)), - 0); + ANDISVal = + SDValue(CurDAG->getMachineNode(PPC::ANDIS8_rec, dl, MVT::i64, + ExtendToInt64(VRot, dl), + getI32Imm(ANDISMask, dl)), + 0); if (!ANDIVal) TotalVal = ANDISVal; @@ -2330,11 +2341,16 @@ class BitPermutationSelector { SDValue ANDIVal, ANDISVal; if (ANDIMask != 0) - ANDIVal = 
SDValue(CurDAG->getMachineNode(PPC::ANDIo8, dl, MVT::i64, - ExtendToInt64(Res, dl), getI32Imm(ANDIMask, dl)), 0); + ANDIVal = SDValue(CurDAG->getMachineNode(PPC::ANDI8_rec, dl, MVT::i64, + ExtendToInt64(Res, dl), + getI32Imm(ANDIMask, dl)), + 0); if (ANDISMask != 0) - ANDISVal = SDValue(CurDAG->getMachineNode(PPC::ANDISo8, dl, MVT::i64, - ExtendToInt64(Res, dl), getI32Imm(ANDISMask, dl)), 0); + ANDISVal = + SDValue(CurDAG->getMachineNode(PPC::ANDIS8_rec, dl, MVT::i64, + ExtendToInt64(Res, dl), + getI32Imm(ANDISMask, dl)), + 0); if (!ANDIVal) Res = ANDISVal; @@ -2385,7 +2401,7 @@ class BitPermutationSelector { SmallVector Bits; - bool NeedMask; + bool NeedMask = false; SmallVector RLAmt; SmallVector BitGroups; @@ -2393,7 +2409,7 @@ class BitPermutationSelector { DenseMap, ValueRotInfo> ValueRots; SmallVector ValueRotsVec; - SelectionDAG *CurDAG; + SelectionDAG *CurDAG = nullptr; public: BitPermutationSelector(SelectionDAG *DAG) @@ -2623,8 +2639,9 @@ SDNode *IntegerCompareEliminator::tryLogicOpOfCompares(SDNode *N) { assert((NewOpc != -1 || !IsBitwiseNegate) && "No record form available for AND8/OR8/XOR8?"); WideOp = - SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl, - MVT::i64, MVT::Glue, LHS, RHS), 0); + SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDI8_rec : NewOpc, + dl, MVT::i64, MVT::Glue, LHS, RHS), + 0); } // Select this node to a single bit from CR0 set by the record-form node @@ -3597,7 +3614,7 @@ SDValue IntegerCompareEliminator::getSETCCInGPR(SDValue Compare, if (ConvOpts == SetccInGPROpts::ZExtInvert || ConvOpts == SetccInGPROpts::SExtInvert) - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, InputVT); bool Inputs32Bit = InputVT == MVT::i32; @@ -3832,7 +3849,11 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0); } -static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { +static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC, const EVT &VT, + const PPCSubtarget *Subtarget) { + // For SPE instructions, the result is in GT bit of the CR + bool UseSPE = Subtarget->hasSPE() && VT.isFloatingPoint(); + switch (CC) { case ISD::SETUEQ: case ISD::SETONE: @@ -3841,17 +3862,23 @@ static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) { llvm_unreachable("Should be lowered by legalize!"); default: llvm_unreachable("Unknown condition!"); case ISD::SETOEQ: - case ISD::SETEQ: return PPC::PRED_EQ; + case ISD::SETEQ: + return UseSPE ? PPC::PRED_GT : PPC::PRED_EQ; case ISD::SETUNE: - case ISD::SETNE: return PPC::PRED_NE; + case ISD::SETNE: + return UseSPE ? PPC::PRED_LE : PPC::PRED_NE; case ISD::SETOLT: - case ISD::SETLT: return PPC::PRED_LT; + case ISD::SETLT: + return UseSPE ? PPC::PRED_GT : PPC::PRED_LT; case ISD::SETULE: - case ISD::SETLE: return PPC::PRED_LE; + case ISD::SETLE: + return UseSPE ? PPC::PRED_LE : PPC::PRED_LE; case ISD::SETOGT: - case ISD::SETGT: return PPC::PRED_GT; + case ISD::SETGT: + return UseSPE ? PPC::PRED_GT : PPC::PRED_GT; case ISD::SETUGE: - case ISD::SETGE: return PPC::PRED_GE; + case ISD::SETGE: + return UseSPE ? PPC::PRED_LE : PPC::PRED_GE; case ISD::SETO: return PPC::PRED_NU; case ISD::SETUO: return PPC::PRED_UN; // These two are invalid for floating point. Assume we have int. 
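A concrete illustration of the SPE mapping above may help. The SPE floating-point compares (efscmpeq, efscmpgt, efscmplt and their double/vector variants) record their result only in the GT bit of the target CR field, so both the "equal" and "greater" predicates end up testing PRED_GT, and their negations test PRED_LE. A hedged sketch of the branch sequences this produces (assumed codegen, abbreviated; not output from this patch):

// efscmp* set just the GT bit (bit 1) of the chosen CR field; nothing else
// in the field is meaningful afterwards.
//
//   efscmpeq 0, 3, 4      // cr0.gt <- (r3 == r4), SPE single-precision
//   bt 1, .LBB0_taken     // SETEQ -> PRED_GT: branch if cr0.gt is set
//
//   efscmpeq 0, 3, 4
//   bf 1, .LBB0_taken     // SETNE -> PRED_LE: branch if cr0.gt is clear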
@@ -4344,6 +4371,142 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG, return true; } +bool PPCDAGToDAGISel::tryAndWithMask(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return false; + + SDLoc dl(N); + SDValue Val = N->getOperand(0); + unsigned Imm, Imm2, SH, MB, ME; + uint64_t Imm64; + + // If this is an and of a value rotated between 0 and 31 bits and then and'd + // with a mask, emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm) && + isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) { + SDValue Val = N->getOperand(0).getOperand(0); + SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + + // If this is just a masked value where the input is not handled, and + // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm + if (isInt32Immediate(N->getOperand(1), Imm)) { + if (isRunOfOnes(Imm, MB, ME) && + N->getOperand(0).getOpcode() != ISD::ROTL) { + SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops); + return true; + } + // AND X, 0 -> 0, not "rlwinm 32". + if (Imm == 0) { + ReplaceUses(SDValue(N, 0), N->getOperand(1)); + return true; + } + + // ISD::OR doesn't get all the bitfield insertion fun. + // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a + // bitfield insert. + if (N->getOperand(0).getOpcode() == ISD::OR && + isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { + // The idea here is to check whether this is equivalent to: + // (c1 & m) | (x & ~m) + // where m is a run-of-ones mask. The logic here is that, for each bit in + // c1 and c2: + // - if both are 1, then the output will be 1. + // - if both are 0, then the output will be 0. + // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will + // come from x. + // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will + // be 0. + // If that last condition is never the case, then we can form m from the + // bits that are the same between c1 and c2. + unsigned MB, ME; + if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) { + SDValue Ops[] = { N->getOperand(0).getOperand(0), + N->getOperand(0).getOperand(1), + getI32Imm(0, dl), getI32Imm(MB, dl), + getI32Imm(ME, dl) }; + ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); + return true; + } + } + } else if (isInt64Immediate(N->getOperand(1).getNode(), Imm64)) { + // If this is a 64-bit zero-extension mask, emit rldicl. + if (isMask_64(Imm64)) { + MB = 64 - countTrailingOnes(Imm64); + SH = 0; + + if (Val.getOpcode() == ISD::ANY_EXTEND) { + auto Op0 = Val.getOperand(0); + if ( Op0.getOpcode() == ISD::SRL && + isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) { + + auto ResultType = Val.getNode()->getValueType(0); + auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, + ResultType); + SDValue IDVal (ImDef, 0); + + Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, + ResultType, IDVal, Op0.getOperand(0), + getI32Imm(1, dl)), 0); + SH = 64 - Imm; + } + } + + // If the operand is a logical right shift, we can fold it into this + // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) + // for n <= mb. The right shift is really a left rotate followed by a + // mask, and this mask is a more-restrictive sub-mask of the mask implied + // by the shift. 
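A worked instance of the shift-into-rotate fold described in the comment above (a sketch with concrete numbers, not code from the patch):

// Take (x >> 8) & 0xFFFF on i64. The mask gives MB = 64 - 16 = 48 and the
// shift gives n = 8; since n <= MB, the srl folds into the rldicl itself:
//
//   uint64_t f(uint64_t x) { return (x >> 8) & 0xFFFF; }
//
// selects to the single instruction
//
//   rldicl 3, 3, 56, 48   // rotate left by 64-8=56, clear the top 48 bits
//
// The rotate brings bits 8..23 of x into the low 16 positions, and the 8
// low-order bits that wrapped around to the top are cleared by the MB = 48
// mask -- which is exactly why the fold requires n <= MB.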
+      if (Val.getOpcode() == ISD::SRL &&
+          isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+        assert(Imm < 64 && "Illegal shift amount");
+        Val = Val.getOperand(0);
+        SH = 64 - Imm;
+      }
+
+      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+      return true;
+    } else if (isMask_64(~Imm64)) {
+      // If this is a negated 64-bit zero-extension mask,
+      // i.e. the immediate is a sequence of ones from the most significant
+      // side and all zero for the remainder, we should use rldicr.
+      MB = 63 - countTrailingOnes(~Imm64);
+      SH = 0;
+      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+      return true;
+    }
+
+    // The immediate is not a 16-bit imm, which means we would need at least
+    // two instructions using "and". Try to exploit it with rotate-and-mask
+    // instructions instead.
+    if (isRunOfOnes64(Imm64, MB, ME)) {
+      if (MB >= 32 && MB <= ME) {
+        //                MB  ME
+        // +----------------------+
+        // |xxxxxxxxxxx00011111000|
+        // +----------------------+
+        //  0         32         64
+        // We can only do it if MB is no smaller than 32 and MB <= ME,
+        // as RLWINM will replace the contents of [0 - 32) with [32 - 64)
+        // even if we didn't rotate it.
+        SDValue Ops[] = { Val, getI64Imm(0, dl), getI64Imm(MB - 32, dl),
+                          getI64Imm(ME - 32, dl) };
+        CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops);
+        return true;
+      }
+      // TODO - handle it with rldicl + rldicl
+    }
+  }
+
+  return false;
+}
+
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
 void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4565,121 +4728,13 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     }
   }

-  case ISD::AND: {
-    unsigned Imm, Imm2, SH, MB, ME;
-    uint64_t Imm64;
-
-    // If this is an and of a value rotated between 0 and 31 bits and then and'd
-    // with a mask, emit rlwinm
-    if (isInt32Immediate(N->getOperand(1), Imm) &&
-        isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
-      SDValue Val = N->getOperand(0).getOperand(0);
-      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
-                        getI32Imm(ME, dl) };
-      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
-      return;
-    }
-    // If this is just a masked value where the input is not handled above, and
-    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
-    if (isInt32Immediate(N->getOperand(1), Imm) &&
-        isRunOfOnes(Imm, MB, ME) &&
-        N->getOperand(0).getOpcode() != ISD::ROTL) {
-      SDValue Val = N->getOperand(0);
-      SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
-                        getI32Imm(ME, dl) };
-      CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
-      return;
-    }
-    // If this is a 64-bit zero-extension mask, emit rldicl.
- if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && - isMask_64(Imm64)) { - SDValue Val = N->getOperand(0); - MB = 64 - countTrailingOnes(Imm64); - SH = 0; - - if (Val.getOpcode() == ISD::ANY_EXTEND) { - auto Op0 = Val.getOperand(0); - if ( Op0.getOpcode() == ISD::SRL && - isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) { - - auto ResultType = Val.getNode()->getValueType(0); - auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, - ResultType); - SDValue IDVal (ImDef, 0); - - Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, - ResultType, IDVal, Op0.getOperand(0), - getI32Imm(1, dl)), 0); - SH = 64 - Imm; - } - } - - // If the operand is a logical right shift, we can fold it into this - // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb) - // for n <= mb. The right shift is really a left rotate followed by a - // mask, and this mask is a more-restrictive sub-mask of the mask implied - // by the shift. - if (Val.getOpcode() == ISD::SRL && - isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) { - assert(Imm < 64 && "Illegal shift amount"); - Val = Val.getOperand(0); - SH = 64 - Imm; - } - - SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops); - return; - } - // If this is a negated 64-bit zero-extension mask, - // i.e. the immediate is a sequence of ones from most significant side - // and all zero for reminder, we should use rldicr. - if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) && - isMask_64(~Imm64)) { - SDValue Val = N->getOperand(0); - MB = 63 - countTrailingOnes(~Imm64); - SH = 0; - SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) }; - CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops); - return; - } - - // AND X, 0 -> 0, not "rlwinm 32". - if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) { - ReplaceUses(SDValue(N, 0), N->getOperand(1)); + case ISD::AND: + // If this is an 'and' with a mask, try to emit rlwinm/rldicl/rldicr + if (tryAndWithMask(N)) return; - } - // ISD::OR doesn't get all the bitfield insertion fun. - // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a - // bitfield insert. - if (isInt32Immediate(N->getOperand(1), Imm) && - N->getOperand(0).getOpcode() == ISD::OR && - isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) { - // The idea here is to check whether this is equivalent to: - // (c1 & m) | (x & ~m) - // where m is a run-of-ones mask. The logic here is that, for each bit in - // c1 and c2: - // - if both are 1, then the output will be 1. - // - if both are 0, then the output will be 0. - // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will - // come from x. - // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will - // be 0. - // If that last condition is never the case, then we can form m from the - // bits that are the same between c1 and c2. - unsigned MB, ME; - if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) { - SDValue Ops[] = { N->getOperand(0).getOperand(0), - N->getOperand(0).getOperand(1), - getI32Imm(0, dl), getI32Imm(MB, dl), - getI32Imm(ME, dl) }; - ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops)); - return; - } - } // Other cases are autogenerated. 
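Two concrete instances of the mask selections above (the same logic now lives in tryAndWithMask); these are illustrative sketches with hand-picked constants, not added test coverage:

// (and (or x, c1), c2) -> rlwimi, with c1 = 0x00000F00 and c2 = 0xFFFFFF00:
//   ~c2 & c1       == 0, so no bit has c1 = 1 while c2 = 0, and
//   m = ~(c1 ^ c2) == 0x00000FFF, a run of ones (MB = 20, ME = 31).
// The result is (c1 & m) | (x & ~m): the low 12 bits come from the constant,
// the rest from x -- a bitfield insert:
//   li     4, 0xF00          // materialize c1
//   rlwimi 3, 4, 0, 20, 31   // insert bits 20..31 (IBM numbering) into x
//
// x & 0xFFFF000000000000 -> rldicr: ~mask has 48 trailing ones, so
// MB = 63 - 48 = 15 and the selection keeps only bits 0..15, i.e. the 16
// most significant bits:
//   rldicr 3, 3, 0, 15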
break; - } case ISD::OR: { if (N->getValueType(0) == MVT::i32) if (tryBitfieldInsert(N)) @@ -4781,24 +4836,24 @@ void PPCDAGToDAGISel::Select(SDNode *N) { break; } // FIXME: Remove this once the ANDI glue bug is fixed: - case PPCISD::ANDIo_1_EQ_BIT: - case PPCISD::ANDIo_1_GT_BIT: { + case PPCISD::ANDI_rec_1_EQ_BIT: + case PPCISD::ANDI_rec_1_GT_BIT: { if (!ANDIGlueBug) break; EVT InVT = N->getOperand(0).getValueType(); assert((InVT == MVT::i64 || InVT == MVT::i32) && - "Invalid input type for ANDIo_1_EQ_BIT"); + "Invalid input type for ANDI_rec_1_EQ_BIT"); - unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDIo8 : PPC::ANDIo; + unsigned Opcode = (InVT == MVT::i64) ? PPC::ANDI8_rec : PPC::ANDI_rec; SDValue AndI(CurDAG->getMachineNode(Opcode, dl, InVT, MVT::Glue, N->getOperand(0), CurDAG->getTargetConstant(1, dl, InVT)), 0); SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32); - SDValue SRIdxVal = - CurDAG->getTargetConstant(N->getOpcode() == PPCISD::ANDIo_1_EQ_BIT ? - PPC::sub_eq : PPC::sub_gt, dl, MVT::i32); + SDValue SRIdxVal = CurDAG->getTargetConstant( + N->getOpcode() == PPCISD::ANDI_rec_1_EQ_BIT ? PPC::sub_eq : PPC::sub_gt, + dl, MVT::i32); CurDAG->SelectNodeTo(N, TargetOpcode::EXTRACT_SUBREG, MVT::i1, CR0Reg, SRIdxVal, SDValue(AndI.getNode(), 1) /* glue */); @@ -4889,7 +4944,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { return; } - unsigned BROpc = getPredicateForSetCC(CC); + unsigned BROpc = + getPredicateForSetCC(CC, N->getOperand(0).getValueType(), PPCSubTarget); unsigned SelectCCOp; if (N->getValueType(0) == MVT::i32) @@ -5002,7 +5058,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { // Prevent PPC::PRED_* from being selected into LI. unsigned PCC = cast(N->getOperand(1))->getZExtValue(); if (EnableBranchHint) - PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); + PCC |= getBranchHint(PCC, *FuncInfo, N->getOperand(3)); SDValue Pred = getI32Imm(PCC, dl); SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), @@ -5012,7 +5068,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } case ISD::BR_CC: { ISD::CondCode CC = cast(N->getOperand(1))->get(); - unsigned PCC = getPredicateForSetCC(CC); + unsigned PCC = + getPredicateForSetCC(CC, N->getOperand(2).getValueType(), PPCSubTarget); if (N->getOperand(2).getValueType() == MVT::i1) { unsigned Opc; @@ -5045,7 +5102,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } if (EnableBranchHint) - PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); + PCC |= getBranchHint(PCC, *FuncInfo, N->getOperand(4)); SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, @@ -6181,8 +6238,8 @@ static bool PeepholePPC64ZExtGather(SDValue Op32, // For ANDI and ANDIS, the higher-order bits are zero if either that is true // of the first operand, or if the second operand is positive (so that it is // not sign extended). 
- if (Op32.getMachineOpcode() == PPC::ANDIo || - Op32.getMachineOpcode() == PPC::ANDISo) { + if (Op32.getMachineOpcode() == PPC::ANDI_rec || + Op32.getMachineOpcode() == PPC::ANDIS_rec) { SmallPtrSet ToPromote1; bool Op0OK = PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1); @@ -6304,8 +6361,12 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() { case PPC::ORI: NewOpcode = PPC::ORI8; break; case PPC::ORIS: NewOpcode = PPC::ORIS8; break; case PPC::AND: NewOpcode = PPC::AND8; break; - case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break; - case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break; + case PPC::ANDI_rec: + NewOpcode = PPC::ANDI8_rec; + break; + case PPC::ANDIS_rec: + NewOpcode = PPC::ANDIS8_rec; + break; } // Note: During the replacement process, the nodes will be in an diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8cf6a660b08b..60ed72e1018b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -52,6 +52,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" @@ -66,6 +67,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" @@ -119,6 +121,9 @@ cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden); static cl::opt EnableQuadPrecision("enable-ppc-quad-precision", cl::desc("enable quad precision float support on ppc"), cl::Hidden); +static cl::opt UseAbsoluteJumpTables("ppc-use-absolute-jumptables", +cl::desc("use absolute jump tables on ppc"), cl::Hidden); + STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumSiblingCalls, "Number of sibling calls"); @@ -132,10 +137,6 @@ extern cl::opt ANDIGlueBug; PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - // Use _setjmp/_longjmp instead of setjmp/longjmp. - setUseUnderscoreSetJmp(true); - setUseUnderscoreLongJmp(true); - // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all // arguments are at least 4/8 bytes aligned. 
bool isPPC64 = Subtarget.isPPC64(); @@ -389,6 +390,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::i32, Legal); setOperationAction(ISD::BITCAST, MVT::i64, Legal); setOperationAction(ISD::BITCAST, MVT::f64, Legal); + if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::LRINT, MVT::f64, Legal); + setOperationAction(ISD::LRINT, MVT::f32, Legal); + setOperationAction(ISD::LLRINT, MVT::f64, Legal); + setOperationAction(ISD::LLRINT, MVT::f32, Legal); + setOperationAction(ISD::LROUND, MVT::f64, Legal); + setOperationAction(ISD::LROUND, MVT::f32, Legal); + setOperationAction(ISD::LLROUND, MVT::f64, Legal); + setOperationAction(ISD::LLROUND, MVT::f32, Legal); + } } else { setOperationAction(ISD::BITCAST, MVT::f32, Expand); setOperationAction(ISD::BITCAST, MVT::i32, Expand); @@ -548,6 +559,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); } + if (Subtarget.hasVSX()) { + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); + } + if (Subtarget.hasAltivec()) { // First set operation action for all vector types to expand. Then we // will selectively turn on ones that can be effectively codegen'd. @@ -702,6 +720,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (!Subtarget.hasP8Altivec()) setOperationAction(ISD::ABS, MVT::v2i64, Expand); + // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w). + if (Subtarget.hasAltivec()) + for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8}) + setOperationAction(ISD::ROTL, VT, Legal); + // With hasP8Altivec set, we can lower ISD::ROTL to vrld. + if (Subtarget.hasP8Altivec()) + setOperationAction(ISD::ROTL, MVT::v2i64, Legal); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); @@ -756,13 +782,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + // The nearbyint variants are not allowed to raise the inexact exception + // so we can only code-gen them with unsafe math. 
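The constraint being encoded just below is the ISO C one: nearbyint must not raise the inexact floating-point exception, while the VSX round-to-integer instructions it would map to may set that flag. A short illustration of the contract (a self-contained sketch; nothing here is from the patch):

#include <cfenv>
#include <cmath>
#pragma STDC FENV_ACCESS ON

// nearbyint(0.5) must leave FE_INEXACT clear even though the result (0.0)
// differs from the input; rint(0.5) is allowed to raise it. Lowering
// FNEARBYINT to a hardware rounding instruction that can set the flag is
// therefore only sound under UnsafeFPMath, where the floating-point
// environment is assumed to be unobserved.
bool nearbyintStaysExact() {
  std::feclearexcept(FE_ALL_EXCEPT);
  (void)std::nearbyint(0.5);
  return std::fetestexcept(FE_INEXACT) == 0; // expected: true
}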
+ if (TM.Options.UnsafeFPMath) { + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + } + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); setOperationAction(ISD::FROUND, MVT::v2f64, Legal); + setOperationAction(ISD::FROUND, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::MUL, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); @@ -910,12 +946,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FREM, MVT::f128, Expand); } setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); - + setOperationAction(ISD::BSWAP, MVT::v8i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i32, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i64, Legal); + setOperationAction(ISD::BSWAP, MVT::v1i128, Legal); } if (Subtarget.hasP9Altivec()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal); } } @@ -1183,7 +1230,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.isDarwin()) setPrefFunctionAlignment(Align(16)); - switch (Subtarget.getDarwinDirective()) { + switch (Subtarget.getCPUDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_A2: @@ -1198,6 +1245,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: setPrefLoopAlignment(Align(16)); setPrefFunctionAlignment(Align(16)); break; @@ -1212,15 +1260,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // The Freescale cores do better with aggressive inlining of memcpy and // friends. GCC uses same threshold of 128 bytes (= 32 word stores). - if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc || - Subtarget.getDarwinDirective() == PPC::DIR_E5500) { + if (Subtarget.getCPUDirective() == PPC::DIR_E500mc || + Subtarget.getCPUDirective() == PPC::DIR_E5500) { MaxStoresPerMemset = 32; MaxStoresPerMemsetOptSize = 16; MaxStoresPerMemcpy = 32; MaxStoresPerMemcpyOptSize = 8; MaxStoresPerMemmove = 32; MaxStoresPerMemmoveOptSize = 8; - } else if (Subtarget.getDarwinDirective() == PPC::DIR_A2) { + } else if (Subtarget.getCPUDirective() == PPC::DIR_A2) { // The A2 also benefits from (very) aggressive inlining of memcpy and // friends. The overhead of a the function call, even when warm, can be // over one hundred cycles. 
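For context on the thresholds set here: MaxStoresPerMemcpy and friends bound how many scalar stores SelectionDAG will emit before falling back to a libcall, so raising them makes small fixed-size copies expand inline. A hedged example of the effect, assuming 32-bit e500mc codegen at -O2:

#include <cstring>

struct Header { char raw[64]; };

// 64 bytes is 16 word-sized stores on a 32-bit target: well under the raised
// MaxStoresPerMemcpy = 32, so this memcpy is expanded into inline load/store
// pairs instead of paying the function-call overhead the comment above
// describes for the A2.
void copyHeader(Header *dst, const Header *src) {
  std::memcpy(dst, src, sizeof(Header));
}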
@@ -1294,6 +1342,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; case PPCISD::FSEL: return "PPCISD::FSEL"; + case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP"; + case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP"; case PPCISD::FCFID: return "PPCISD::FCFID"; case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; @@ -1314,7 +1364,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::VPERM: return "PPCISD::VPERM"; case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; - case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE"; case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; case PPCISD::VECSHL: return "PPCISD::VECSHL"; case PPCISD::CMPB: return "PPCISD::CMPB"; @@ -1345,8 +1394,10 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; - case PPCISD::ANDIo_1_EQ_BIT: return "PPCISD::ANDIo_1_EQ_BIT"; - case PPCISD::ANDIo_1_GT_BIT: return "PPCISD::ANDIo_1_GT_BIT"; + case PPCISD::ANDI_rec_1_EQ_BIT: + return "PPCISD::ANDI_rec_1_EQ_BIT"; + case PPCISD::ANDI_rec_1_GT_BIT: + return "PPCISD::ANDI_rec_1_GT_BIT"; case PPCISD::VCMP: return "PPCISD::VCMP"; case PPCISD::VCMPo: return "PPCISD::VCMPo"; case PPCISD::LBRX: return "PPCISD::LBRX"; @@ -2699,9 +2750,9 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, ConstantPoolSDNode *CP = cast(Op); const Constant *C = CP->getConstVal(); - // 64-bit SVR4 ABI code is always position-independent. + // 64-bit SVR4 ABI and AIX ABI code are always position-independent. // The actual address of the GlobalValue is stored in the TOC. - if (Subtarget.is64BitELFABI()) { + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0); return getTOCEntry(DAG, SDLoc(CP), GA); @@ -2735,14 +2786,16 @@ unsigned PPCTargetLowering::getJumpTableEncoding() const { } bool PPCTargetLowering::isJumpTableRelative() const { - if (Subtarget.isPPC64()) + if (UseAbsoluteJumpTables) + return false; + if (Subtarget.isPPC64() || Subtarget.isAIXABI()) return true; return TargetLowering::isJumpTableRelative(); } SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { - if (!Subtarget.isPPC64()) + if (!Subtarget.isPPC64() || Subtarget.isAIXABI()) return TargetLowering::getPICJumpTableRelocBase(Table, DAG); switch (getTargetMachine().getCodeModel()) { @@ -2759,7 +2812,7 @@ const MCExpr * PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { - if (!Subtarget.isPPC64()) + if (!Subtarget.isPPC64() || Subtarget.isAIXABI()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); switch (getTargetMachine().getCodeModel()) { @@ -2775,9 +2828,9 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); JumpTableSDNode *JT = cast(Op); - // 64-bit SVR4 ABI code is always position-independent. + // 64-bit SVR4 ABI and AIX ABI code are always position-independent. // The actual address of the GlobalValue is stored in the TOC. 
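These LowerConstantPool/LowerJumpTable/LowerBlockAddress changes route AIX through the same TOC indirection already used by 64-bit ELF: getTOCEntry() turns the address into a load from a synthesized TOC slot. A hedged sketch of the 64-bit ELF shape (medium code model; exact directives vary, and AIX spells the entry differently, but the idea is identical):

// What the TOC-entry lowering amounts to at the assembly level (sketch):
//
//   addis 3, 2, .LC0@toc@ha    // r2 holds the TOC base pointer
//   ld    3, .LC0@toc@l(3)     // load the constant pool's address from TOC
//
//   .section .toc,"aw"
// .LC0:
//   .tc .LCPI0_0[TC], .LCPI0_0 // TOC entry synthesized for the constant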
- if (Subtarget.is64BitELFABI()) { + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT); return getTOCEntry(DAG, SDLoc(JT), GA); @@ -2804,9 +2857,9 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, BlockAddressSDNode *BASDN = cast(Op); const BlockAddress *BA = BASDN->getBlockAddress(); - // 64-bit SVR4 ABI code is always position-independent. + // 64-bit SVR4 ABI and AIX ABI code are always position-independent. // The actual BlockAddress is stored in the TOC. - if (Subtarget.is64BitELFABI()) { + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { setUsesTOCBasePtr(DAG); SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()); return getTOCEntry(DAG, SDLoc(BASDN), GA); @@ -3129,11 +3182,17 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget.isAIXABI()) + report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX."); + return Op.getOperand(0); } SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget.isAIXABI()) + report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX."); + SDValue Chain = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function @@ -3394,15 +3453,16 @@ SDValue PPCTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + if (Subtarget.isAIXABI()) + return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); if (Subtarget.is64BitELFABI()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); - else if (Subtarget.is32BitELFABI()) + if (Subtarget.is32BitELFABI()) return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); - // FIXME: We are using this for both AIX and Darwin. We should add appropriate - // AIX testing, and rename it appropriately. return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); } @@ -4934,213 +4994,6 @@ static bool isFunctionGlobalAddress(SDValue Callee) { return false; } -static unsigned -PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain, - SDValue CallSeqStart, const SDLoc &dl, int SPDiff, bool isTailCall, - bool isPatchPoint, bool hasNest, - SmallVectorImpl> &RegsToPass, - SmallVectorImpl &Ops, std::vector &NodeTys, - ImmutableCallSite CS, const PPCSubtarget &Subtarget) { - bool isPPC64 = Subtarget.isPPC64(); - bool isSVR4ABI = Subtarget.isSVR4ABI(); - bool is64BitELFv1ABI = isPPC64 && isSVR4ABI && !Subtarget.isELFv2ABI(); - bool isAIXABI = Subtarget.isAIXABI(); - - EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - NodeTys.push_back(MVT::Other); // Returns a chain - NodeTys.push_back(MVT::Glue); // Returns a flag for retval copy to use. - - unsigned CallOpc = PPCISD::CALL; - - bool needIndirectCall = true; - if (!isSVR4ABI || !isPPC64) - if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) { - // If this is an absolute destination address, use the munged value. 
- Callee = SDValue(Dest, 0); - needIndirectCall = false; - } - - // PC-relative references to external symbols should go through $stub, unless - // we're building with the leopard linker or later, which automatically - // synthesizes these stubs. - const TargetMachine &TM = DAG.getTarget(); - const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); - const GlobalValue *GV = nullptr; - if (auto *G = dyn_cast(Callee)) - GV = G->getGlobal(); - bool Local = TM.shouldAssumeDSOLocal(*Mod, GV); - bool UsePlt = !Local && Subtarget.isTargetELF() && !isPPC64; - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, - // every direct call is) turn it into a TargetGlobalAddress / - // TargetExternalSymbol node so that legalize doesn't hack it. - if (isFunctionGlobalAddress(Callee)) { - GlobalAddressSDNode *G = cast(Callee); - - // A call to a TLS address is actually an indirect call to a - // thread-specific pointer. - unsigned OpFlags = 0; - if (UsePlt) - OpFlags = PPCII::MO_PLT; - - Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, - Callee.getValueType(), 0, OpFlags); - needIndirectCall = false; - } - - if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - unsigned char OpFlags = 0; - - if (UsePlt) - OpFlags = PPCII::MO_PLT; - - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(), - OpFlags); - needIndirectCall = false; - } - - if (isPatchPoint) { - // We'll form an invalid direct call when lowering a patchpoint; the full - // sequence for an indirect call is complicated, and many of the - // instructions introduced might have side effects (and, thus, can't be - // removed later). The call itself will be removed as soon as the - // argument/return lowering is complete, so the fact that it has the wrong - // kind of operands should not really matter. - needIndirectCall = false; - } - - if (needIndirectCall) { - // Otherwise, this is an indirect call. We have to use a MTCTR/BCTRL pair - // to do the call, we can't use PPCISD::CALL. - SDValue MTCTROps[] = {Chain, Callee, InFlag}; - - if (is64BitELFv1ABI) { - // Function pointers in the 64-bit SVR4 ABI do not point to the function - // entry point, but to the function descriptor (the function entry point - // address is part of the function descriptor though). - // The function descriptor is a three doubleword structure with the - // following fields: function entry point, TOC base address and - // environment pointer. - // Thus for a call through a function pointer, the following actions need - // to be performed: - // 1. Save the TOC of the caller in the TOC save area of its stack - // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). - // 2. Load the address of the function entry point from the function - // descriptor. - // 3. Load the TOC of the callee from the function descriptor into r2. - // 4. Load the environment pointer from the function descriptor into - // r11. - // 5. Branch to the function entry point address. - // 6. On return of the callee, the TOC of the caller needs to be - // restored (this is done in FinishCall()). - // - // The loads are scheduled at the beginning of the call sequence, and the - // register copies are flagged together to ensure that no other - // operations can be scheduled in between. E.g. 
without flagging the - // copies together, a TOC access in the caller could be scheduled between - // the assignment of the callee TOC and the branch to the callee, which - // results in the TOC access going through the TOC of the callee instead - // of going through the TOC of the caller, which leads to incorrect code. - - // Load the address of the function entry point from the function - // descriptor. - SDValue LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-1); - if (LDChain.getValueType() == MVT::Glue) - LDChain = CallSeqStart.getValue(CallSeqStart->getNumValues()-2); - - auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() - ? (MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant) - : MachineMemOperand::MONone; - - MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); - SDValue LoadFuncPtr = DAG.getLoad(MVT::i64, dl, LDChain, Callee, MPI, - /* Alignment = */ 8, MMOFlags); - - // Load environment pointer into r11. - SDValue PtrOff = DAG.getIntPtrConstant(16, dl); - SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff); - SDValue LoadEnvPtr = - DAG.getLoad(MVT::i64, dl, LDChain, AddPtr, MPI.getWithOffset(16), - /* Alignment = */ 8, MMOFlags); - - SDValue TOCOff = DAG.getIntPtrConstant(8, dl); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff); - SDValue TOCPtr = - DAG.getLoad(MVT::i64, dl, LDChain, AddTOC, MPI.getWithOffset(8), - /* Alignment = */ 8, MMOFlags); - - setUsesTOCBasePtr(DAG); - SDValue TOCVal = DAG.getCopyToReg(Chain, dl, PPC::X2, TOCPtr, - InFlag); - Chain = TOCVal.getValue(0); - InFlag = TOCVal.getValue(1); - - // If the function call has an explicit 'nest' parameter, it takes the - // place of the environment pointer. - if (!hasNest) { - SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr, - InFlag); - - Chain = EnvVal.getValue(0); - InFlag = EnvVal.getValue(1); - } - - MTCTROps[0] = Chain; - MTCTROps[1] = LoadFuncPtr; - MTCTROps[2] = InFlag; - } - - Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, - makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2)); - InFlag = Chain.getValue(1); - - NodeTys.clear(); - NodeTys.push_back(MVT::Other); - NodeTys.push_back(MVT::Glue); - Ops.push_back(Chain); - CallOpc = PPCISD::BCTRL; - Callee.setNode(nullptr); - // Add use of X11 (holding environment pointer) - if (is64BitELFv1ABI && !hasNest) - Ops.push_back(DAG.getRegister(PPC::X11, PtrVT)); - // Add CTR register as callee so a bctr can be emitted later. - if (isTailCall) - Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT)); - } - - // If this is a direct call, pass the chain and the callee. - if (Callee.getNode()) { - Ops.push_back(Chain); - Ops.push_back(Callee); - } - // If this is a tail call add stack pointer delta. - if (isTailCall) - Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // All calls, in the AIX ABI and 64-bit ELF ABIs, need the TOC register - // live into the call. - // We do need to reserve R2/X2 to appease the verifier for the PATCHPOINT. - if ((isSVR4ABI && isPPC64) || isAIXABI) { - setUsesTOCBasePtr(DAG); - - // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is - // no way to mark dependencies as implicit here. 
-    // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
-    if (!isPatchPoint)
-      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::X2
-                                            : PPC::R2, PtrVT));
-  }
-
-  return CallOpc;
-}
-
 SDValue PPCTargetLowering::LowerCallResult(
     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -5205,30 +5058,357 @@ SDValue PPCTargetLowering::LowerCallResult(
   return Chain;
 }

-SDValue PPCTargetLowering::FinishCall(
-    CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
-    bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
-    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue InFlag,
-    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
-    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
-    SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
-  std::vector<EVT> NodeTys;
-  SmallVector<SDValue, 8> Ops;
-  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
-                                 SPDiff, isTailCall, isPatchPoint, hasNest,
-                                 RegsToPass, Ops, NodeTys, CS, Subtarget);
+static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
+                           const PPCSubtarget &Subtarget, bool isPatchPoint) {
+  // PatchPoint calls are not indirect.
+  if (isPatchPoint)
+    return false;
+
+  if (isFunctionGlobalAddress(Callee) || dyn_cast<ExternalSymbolSDNode>(Callee))
+    return false;
+
+  // Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot,
+  // because the immediate function pointer points to a descriptor instead of
+  // a function entry point. The ELFv2 ABI cannot use a BLA because the function
+  // pointer immediate points to the global entry point, while the BLA would
+  // need to jump to the local entry point (see rL211174).
+  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
+      isBLACompatibleAddress(Callee, DAG))
+    return false;
+
+  return true;
+}
+
+static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
+                              bool isTailCall, const Function &Caller,
+                              const SDValue &Callee,
+                              const PPCSubtarget &Subtarget,
+                              const TargetMachine &TM) {
+  if (isTailCall)
+    return PPCISD::TC_RETURN;
+
+  // This is a call through a function pointer.
+  if (isIndirectCall) {
+    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
+    // indirect calls. The save of the caller's TOC pointer to the stack will be
+    // inserted into the DAG as part of call lowering. The restore of the TOC
+    // pointer is modeled by using a pseudo instruction for the call opcode that
+    // represents the two-instruction sequence of an indirect branch and link,
+    // immediately followed by a load of the TOC pointer from the stack save
+    // slot into gpr2.
+    if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+      return PPCISD::BCTRL_LOAD_TOC;
+
+    // An indirect call that does not need a TOC restore.
+    return PPCISD::BCTRL;
+  }
+
+  // The ABIs that maintain a TOC pointer across calls need to have a nop
+  // immediately following the call instruction if the caller and callee may
+  // have different TOC bases. At link time, if the linker determines the calls
+  // may not share a TOC base, the call is redirected to a trampoline inserted
+  // by the linker. The trampoline will (among other things) save the caller's
+  // TOC pointer at an ABI-designated offset in the linkage area and the linker
+  // will rewrite the nop to be a load of the TOC pointer from the linkage area
+  // into gpr2.
+  if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+    return callsShareTOCBase(&Caller, Callee, TM) ?
PPCISD::CALL + : PPCISD::CALL_NOP; + + return PPCISD::CALL; +} + +static bool isValidAIXExternalSymSDNode(StringRef SymName) { + return StringSwitch(SymName) + .Cases("__divdi3", "__fixunsdfdi", "__floatundidf", "__floatundisf", + "__moddi3", "__udivdi3", "__umoddi3", true) + .Cases("ceil", "floor", "memcpy", "memmove", "memset", "round", true) + .Default(false); +} + +static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, + const SDLoc &dl, const PPCSubtarget &Subtarget) { + if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI()) + if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) + return SDValue(Dest, 0); + + // Returns true if the callee is local, and false otherwise. + auto isLocalCallee = [&]() { + const GlobalAddressSDNode *G = dyn_cast(Callee); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); + const GlobalValue *GV = G ? G->getGlobal() : nullptr; + + return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) && + !dyn_cast_or_null(GV); + }; + + // The PLT is only used in 32-bit ELF PIC mode. Attempting to use the PLT in + // a static relocation model causes some versions of GNU LD (2.17.50, at + // least) to force BSS-PLT, instead of secure-PLT, even if all objects are + // built with secure-PLT. + bool UsePlt = + Subtarget.is32BitELFABI() && !isLocalCallee() && + Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_; + + // On AIX, direct function calls reference the symbol for the function's + // entry point, which is named by prepending a "." before the function's + // C-linkage name. + const auto getAIXFuncEntryPointSymbolSDNode = + [&](StringRef FuncName, bool IsDeclaration, + const XCOFF::StorageClass &SC) { + auto &Context = DAG.getMachineFunction().getMMI().getContext(); + + MCSymbolXCOFF *S = cast( + Context.getOrCreateSymbol(Twine(".") + Twine(FuncName))); + + if (IsDeclaration && !S->hasContainingCsect()) { + // On AIX, an undefined symbol needs to be associated with a + // MCSectionXCOFF to get the correct storage mapping class. + // In this case, XCOFF::XMC_PR. + MCSectionXCOFF *Sec = Context.getXCOFFSection( + S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC, + SectionKind::getMetadata()); + S->setContainingCsect(Sec); + } + + MVT PtrVT = + DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + return DAG.getMCSymbol(S, PtrVT); + }; + + if (isFunctionGlobalAddress(Callee)) { + const GlobalAddressSDNode *G = cast(Callee); + const GlobalValue *GV = G->getGlobal(); + + if (!Subtarget.isAIXABI()) + return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0, + UsePlt ? PPCII::MO_PLT : 0); + + assert(!isa(GV) && "IFunc is not supported on AIX."); + const GlobalObject *GO = cast(GV); + const XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO); + return getAIXFuncEntryPointSymbolSDNode(GO->getName(), GO->isDeclaration(), + SC); + } + + if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { + const char *SymName = S->getSymbol(); + if (!Subtarget.isAIXABI()) + return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(), + UsePlt ? PPCII::MO_PLT : 0); + + // If there exists a user-declared function whose name is the same as the + // ExternalSymbol's, then we pick up the user-declared version. 
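The dot-name convention that getAIXFuncEntryPointSymbolSDNode implements is easy to state in isolation. A minimal sketch of the mapping, purely illustrative (the real code builds an MCSymbolXCOFF and a containing csect rather than a string):

#include <string>

// On AIX, a call to foo() branches to the symbol ".foo"; the undecorated
// name "foo" labels the function descriptor in the data section.
std::string aixEntryPointName(const std::string &LinkageName) {
  return "." + LinkageName; // e.g. "memcpy" -> ".memcpy"
}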
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); + if (const Function *F = + dyn_cast_or_null(Mod->getNamedValue(SymName))) { + const XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F); + return getAIXFuncEntryPointSymbolSDNode(F->getName(), F->isDeclaration(), + SC); + } + + // TODO: Remove this when the support for ExternalSymbolSDNode is complete. + if (isValidAIXExternalSymSDNode(SymName)) { + return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT); + } + + report_fatal_error("Unexpected ExternalSymbolSDNode: " + Twine(SymName)); + } + + // No transformation needed. + assert(Callee.getNode() && "What no callee?"); + return Callee; +} + +static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) { + assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START && + "Expected a CALLSEQ_STARTSDNode."); + + // The last operand is the chain, except when the node has glue. If the node + // has glue, then the last operand is the glue, and the chain is the second + // last operand. + SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1); + if (LastValue.getValueType() != MVT::Glue) + return LastValue; + + return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2); +} + +// Creates the node that moves a functions address into the count register +// to prepare for an indirect call instruction. +static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee, + SDValue &Glue, SDValue &Chain, + const SDLoc &dl) { + SDValue MTCTROps[] = {Chain, Callee, Glue}; + EVT ReturnTypes[] = {MVT::Other, MVT::Glue}; + Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2), + makeArrayRef(MTCTROps, Glue.getNode() ? 3 : 2)); + // The glue is the second value produced. + Glue = Chain.getValue(1); +} + +static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee, + SDValue &Glue, SDValue &Chain, + SDValue CallSeqStart, + ImmutableCallSite CS, const SDLoc &dl, + bool hasNest, + const PPCSubtarget &Subtarget) { + // Function pointers in the 64-bit SVR4 ABI do not point to the function + // entry point, but to the function descriptor (the function entry point + // address is part of the function descriptor though). + // The function descriptor is a three doubleword structure with the + // following fields: function entry point, TOC base address and + // environment pointer. + // Thus for a call through a function pointer, the following actions need + // to be performed: + // 1. Save the TOC of the caller in the TOC save area of its stack + // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()). + // 2. Load the address of the function entry point from the function + // descriptor. + // 3. Load the TOC of the callee from the function descriptor into r2. + // 4. Load the environment pointer from the function descriptor into + // r11. + // 5. Branch to the function entry point address. + // 6. On return of the callee, the TOC of the caller needs to be + // restored (this is done in FinishCall()). + // + // The loads are scheduled at the beginning of the call sequence, and the + // register copies are flagged together to ensure that no other + // operations can be scheduled in between. E.g. without flagging the + // copies together, a TOC access in the caller could be scheduled between + // the assignment of the callee TOC and the branch to the callee, which leads + // to incorrect code. + + // Start by loading the function address from the descriptor. 
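Concretely, the loads that follow walk a structure of this shape. A hedged sketch of the 64-bit layout (field names are illustrative; the offsets are what descriptorTOCAnchorOffset and descriptorEnvironmentPointerOffset return on 64-bit targets, and 32-bit AIX uses the same shape with 4-byte fields):

struct FunctionDescriptor {   // 64-bit ELFv1 / AIX function descriptor
  void *EntryPoint;           // +0:  code address, moved into CTR
  void *TOCBase;              // +8:  loaded into gpr2
  void *Environment;          // +16: loaded into r11, unless 'nest' takes its place
};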
+ SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart); + auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors() + ? (MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant) + : MachineMemOperand::MONone; + + MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr); + + // Registers used in building the DAG. + const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister(); + const MCRegister TOCReg = Subtarget.getTOCPointerRegister(); + + // Offsets of descriptor members. + const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset(); + const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset(); + + const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4; + + // One load for the function's entry point address. + SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI, + Alignment, MMOFlags); + + // One for loading the TOC anchor for the module that contains the called + // function. + SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff); + SDValue TOCPtr = + DAG.getLoad(RegVT, dl, LDChain, AddTOC, + MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags); + + // One for loading the environment pointer. + SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff); + SDValue LoadEnvPtr = + DAG.getLoad(RegVT, dl, LDChain, AddPtr, + MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags); + + // Then copy the newly loaded TOC anchor to the TOC pointer. + SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue); + Chain = TOCVal.getValue(0); + Glue = TOCVal.getValue(1); + + // If the function call has an explicit 'nest' parameter, it takes the + // place of the environment pointer. + assert((!hasNest || !Subtarget.isAIXABI()) && + "Nest parameter is not supported on AIX."); + if (!hasNest) { + SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue); + Chain = EnvVal.getValue(0); + Glue = EnvVal.getValue(1); + } + + // The rest of the indirect call sequence is the same as the non-descriptor + // DAG. + prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl); +} + +static void +buildCallOperands(SmallVectorImpl &Ops, CallingConv::ID CallConv, + const SDLoc &dl, bool isTailCall, bool isVarArg, + bool isPatchPoint, bool hasNest, SelectionDAG &DAG, + SmallVector, 8> &RegsToPass, + SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff, + const PPCSubtarget &Subtarget, bool isIndirect) { + const bool IsPPC64 = Subtarget.isPPC64(); + // MVT for a general purpose register. + const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + + // First operand is always the chain. + Ops.push_back(Chain); + + // If it's a direct call, pass the callee as the second operand. + if (!isIndirect) + Ops.push_back(Callee); + else { + assert(!isPatchPoint && "Patch point calls are not indirect."); + + // For the TOC-based ABIs, we have saved the TOC pointer to the linkage area + // on the stack (this would have been done in `LowerCall_64SVR4` or + // `LowerCall_AIX`). The call instruction is a pseudo instruction that + // represents both the indirect branch and a load that restores the TOC + // pointer from the linkage area. The operand for the TOC restore is an add + // of the TOC save offset to the stack pointer. This must be the second + // operand: after the chain input but before any other variadic arguments.
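The TOC save offset used by the restore operand below comes from frame lowering. A minimal sketch of the values it is expected to produce under the standard linkage-area layouts (hedged; PPCFrameLowering::getTOCSaveOffset is the authoritative source):

// Back chain, CR save and LR save precede the TOC slot; ELFv1 and AIX
// additionally reserve two pointer-sized words ahead of it.
unsigned tocSaveOffset(bool IsPPC64, bool IsELFv2) {
  if (!IsPPC64)
    return 20;              // 32-bit AIX: 5 words * 4 bytes
  return IsELFv2 ? 24 : 40; // 64-bit: 3 vs. 5 doublewords
}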
+ if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) { + const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); + + SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT); + unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); + SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); + SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff); + Ops.push_back(AddTOC); + } + + // Add the register used for the environment pointer. + if (Subtarget.usesFunctionDescriptors() && !hasNest) + Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(), + RegVT)); + + + // Add CTR register as callee so a bctr can be emitted later. + if (isTailCall) + Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT)); + } + + // If this is a tail call add stack pointer delta. + if (isTailCall) + Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is + // no way to mark dependencies as implicit here. + // We will add the R2/X2 dependency in EmitInstrWithCustomInserter. + if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint) + Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT)); // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls - if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64()) + if (isVarArg && Subtarget.is32BitELFABI()) Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); - // When performing tail call optimization the callee pops its arguments off - // the stack. Account for this here so these bytes can be pushed back on in - // PPCFrameLowering::eliminateCallFramePseudoInstr. - int BytesCalleePops = - (CallConv == CallingConv::Fast && - getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; - // Add a register mask operand representing the call-preserved registers. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *Mask = @@ -5236,8 +5416,40 @@ SDValue PPCTargetLowering::FinishCall( assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); - if (InFlag.getNode()) - Ops.push_back(InFlag); + // If the glue is valid, it is the last operand. 
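Taken together, buildCallOperands lays the operands out in the following order; a hedged summary of the code above and the glue push just below (each entry appears only when its condition holds):

// [0] Chain
// [1] Callee (direct call), or the sp + TOC-save-offset add (indirect call
//     on a TOC-based ABI), then the environment-pointer register
//     (descriptor ABIs without 'nest') and CTR/CTR8 (indirect tail call)
// ... SPDiff (tail call), argument registers, TOC register (TOC-based ABIs,
//     non-patchpoint), CR1EQ (32-bit ELF vararg), register mask
// [n] Glue, when present, always last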
+ if (Glue.getNode()) + Ops.push_back(Glue); +} + +SDValue PPCTargetLowering::FinishCall( + CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg, + bool isPatchPoint, bool hasNest, SelectionDAG &DAG, + SmallVector, 8> &RegsToPass, SDValue Glue, + SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff, + unsigned NumBytes, const SmallVectorImpl &Ins, + SmallVectorImpl &InVals, ImmutableCallSite CS) const { + + if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) + setUsesTOCBasePtr(DAG); + + const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint); + unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall, + DAG.getMachineFunction().getFunction(), + Callee, Subtarget, DAG.getTarget()); + + if (!isIndirect) + Callee = transformCallee(Callee, DAG, dl, Subtarget); + else if (Subtarget.usesFunctionDescriptors()) + prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS, + dl, hasNest, Subtarget); + else + prepareIndirectCall(DAG, Callee, Glue, Chain, dl); + + // Build the operand list for the call instruction. + SmallVector Ops; + buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint, + hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff, + Subtarget, isIndirect); // Emit tail call. if (isTailCall) { @@ -5246,81 +5458,32 @@ SDValue PPCTargetLowering::FinishCall( Callee.getOpcode() == ISD::TargetExternalSymbol || Callee.getOpcode() == ISD::TargetGlobalAddress || isa(Callee)) && - "Expecting an global address, external symbol, absolute value or register"); - + "Expecting a global address, external symbol, absolute value or " + "register"); + assert(CallOpc == PPCISD::TC_RETURN && + "Unexpected call opcode for a tail call."); DAG.getMachineFunction().getFrameInfo().setHasTailCall(); - return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops); + return DAG.getNode(CallOpc, dl, MVT::Other, Ops); } - // Add a NOP immediately after the branch instruction when using the 64-bit - // SVR4 or the AIX ABI. - // At link time, if caller and callee are in a different module and - // thus have a different TOC, the call will be replaced with a call to a stub - // function which saves the current TOC, loads the TOC of the callee and - // branches to the callee. The NOP will be replaced with a load instruction - // which restores the TOC of the caller from the TOC save slot of the current - // stack frame. If caller and callee belong to the same module (and have the - // same TOC), the NOP will remain unchanged, or become some other NOP. - - MachineFunction &MF = DAG.getMachineFunction(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - if (!isTailCall && !isPatchPoint && - ((Subtarget.isSVR4ABI() && Subtarget.isPPC64()) || - Subtarget.isAIXABI())) { - if (CallOpc == PPCISD::BCTRL) { - if (Subtarget.isAIXABI()) - report_fatal_error("Indirect call on AIX is not implemented."); - - // This is a call through a function pointer. - // Restore the caller TOC from the save area into R2. - // See PrepareCall() for more information about calls through function - // pointers in the 64-bit SVR4 ABI. - // We are using a target-specific load with r2 hard coded, because the - // result of a target-independent load would never go directly into r2, - // since r2 is a reserved register (which prevents the register allocator - // from allocating it), resulting in an additional register being - // allocated and an unnecessary move instruction being generated. 
- CallOpc = PPCISD::BCTRL_LOAD_TOC; - - SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT); - unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); - SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); - SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff); + std::array ReturnTypes = {{MVT::Other, MVT::Glue}}; + Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops); + Glue = Chain.getValue(1); - // The address needs to go after the chain input but before the flag (or - // any other variadic arguments). - Ops.insert(std::next(Ops.begin()), AddTOC); - } else if (CallOpc == PPCISD::CALL && - !callsShareTOCBase(&MF.getFunction(), Callee, DAG.getTarget())) { - // Otherwise insert NOP for non-local calls. - CallOpc = PPCISD::CALL_NOP; - } - } - - if (Subtarget.isAIXABI() && isFunctionGlobalAddress(Callee)) { - // On AIX, direct function calls reference the symbol for the function's - // entry point, which is named by inserting a "." before the function's - // C-linkage name. - GlobalAddressSDNode *G = cast(Callee); - auto &Context = DAG.getMachineFunction().getMMI().getContext(); - MCSymbol *S = Context.getOrCreateSymbol(Twine(".") + - Twine(G->getGlobal()->getName())); - Callee = DAG.getMCSymbol(S, PtrVT); - // Replace the GlobalAddressSDNode Callee with the MCSymbolSDNode. - Ops[1] = Callee; - } - - Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); - InFlag = Chain.getValue(1); + // When performing tail call optimization the callee pops its arguments off + // the stack. Account for this here so these bytes can be pushed back on in + // PPCFrameLowering::eliminateCallFramePseudoInstr. + int BytesCalleePops = (CallConv == CallingConv::Fast && + getTargetMachine().Options.GuaranteedTailCallOpt) + ? NumBytes + : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(BytesCalleePops, dl, true), - InFlag, dl); - if (!Ins.empty()) - InFlag = Chain.getValue(1); + Glue, dl); + Glue = Chain.getValue(1); - return LowerCallResult(Chain, InFlag, CallConv, isVarArg, - Ins, dl, DAG, InVals); + return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals); } SDValue @@ -6273,8 +6436,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Check if this is an indirect call (MTCTR/BCTRL). - // See PrepareCall() for more information about calls through function - // pointers in the 64-bit SVR4 ABI. + // See prepareDescriptorIndirectCall and buildCallOperands for more + // information about calls through function pointers in the 64-bit SVR4 ABI. if (!isTailCall && !isPatchPoint && !isFunctionGlobalAddress(Callee) && !isa(Callee)) { @@ -6695,6 +6858,205 @@ SDValue PPCTargetLowering::LowerCall_Darwin( NumBytes, Ins, InVals, CS); } +static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State) { + + if (ValVT == MVT::f128) + report_fatal_error("f128 is unimplemented on AIX."); + + if (ArgFlags.isByVal()) + report_fatal_error("Passing structure by value is unimplemented."); + + if (ArgFlags.isNest()) + report_fatal_error("Nest arguments are unimplemented."); + + if (ValVT.isVector() || LocVT.isVector()) + report_fatal_error("Vector arguments are unimplemented on AIX."); + + const PPCSubtarget &Subtarget = static_cast( + State.getMachineFunction().getSubtarget()); + const bool IsPPC64 = Subtarget.isPPC64(); + const unsigned PtrByteSize = IsPPC64 ? 
8 : 4; + + static const MCPhysReg GPR_32[] = {// 32-bit registers. + PPC::R3, PPC::R4, PPC::R5, PPC::R6, + PPC::R7, PPC::R8, PPC::R9, PPC::R10}; + static const MCPhysReg GPR_64[] = {// 64-bit registers. + PPC::X3, PPC::X4, PPC::X5, PPC::X6, + PPC::X7, PPC::X8, PPC::X9, PPC::X10}; + + // Arguments always reserve parameter save area. + switch (ValVT.SimpleTy) { + default: + report_fatal_error("Unhandled value type for argument."); + case MVT::i64: + // i64 arguments should have been split to i32 for PPC32. + assert(IsPPC64 && "PPC32 should have split i64 values."); + LLVM_FALLTHROUGH; + case MVT::i1: + case MVT::i32: + State.AllocateStack(PtrByteSize, PtrByteSize); + if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) { + MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + // Promote integers if needed. + if (ValVT.getSizeInBits() < RegVT.getSizeInBits()) + LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt + : CCValAssign::LocInfo::ZExt; + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo)); + } + else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + return false; + + case MVT::f32: + case MVT::f64: { + // Parameter save area (PSA) is reserved even if the float passes in fpr. + const unsigned StoreSize = LocVT.getStoreSize(); + // Floats are always 4-byte aligned in the PSA on AIX. + // This includes f64 in 64-bit mode for ABI compatibility. + State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4); + if (unsigned Reg = State.AllocateReg(FPR)) + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + + // AIX requires that GPRs are reserved for float arguments. + // Successfully reserved GPRs are only initialized for vararg calls. + MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + for (unsigned I = 0; I < StoreSize; I += PtrByteSize) { + if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) { + if (State.isVarArg()) { + // Custom handling is required for: + // f64 in PPC32 needs to be split into 2 GPRs. + // f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR. + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo)); + } + } else if (State.isVarArg()) { + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + } + } + + return false; + } + } + return true; +} + +static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT, + bool IsPPC64) { + assert((IsPPC64 || SVT != MVT::i64) && + "i64 should have been split for 32-bit codegen."); + + switch (SVT) { + default: + report_fatal_error("Unexpected value type for formal argument"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + return IsPPC64 ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; + case MVT::f32: + return &PPC::F4RCRegClass; + case MVT::f64: + return &PPC::F8RCRegClass; + } +} + +static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT, + SelectionDAG &DAG, SDValue ArgValue, + MVT LocVT, const SDLoc &dl) { + assert(ValVT.isScalarInteger() && LocVT.isScalarInteger()); + assert(ValVT.getSizeInBits() < LocVT.getSizeInBits()); + + if (Flags.isSExt()) + ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue, + DAG.getValueType(ValVT)); + else if (Flags.isZExt()) + ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue, + DAG.getValueType(ValVT)); + + return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue); +} + +SDValue PPCTargetLowering::LowerFormalArguments_AIX( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + + assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold || + CallConv == CallingConv::Fast) && + "Unexpected calling convention!"); + + if (isVarArg) + report_fatal_error("This call type is unimplemented on AIX."); + + if (getTargetMachine().Options.GuaranteedTailCallOpt) + report_fatal_error("Tail call support is unimplemented on AIX."); + + if (useSoftFloat()) + report_fatal_error("Soft float support is unimplemented on AIX."); + + const PPCSubtarget &Subtarget = + static_cast(DAG.getSubtarget()); + if (Subtarget.hasQPX()) + report_fatal_error("QPX support is not supported on AIX."); + + const bool IsPPC64 = Subtarget.isPPC64(); + const unsigned PtrByteSize = IsPPC64 ? 8 : 4; + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + MachineFunction &MF = DAG.getMachineFunction(); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Reserve space for the linkage area on the stack. + const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + // On AIX a minimum of 8 words is saved to the parameter save area. + const unsigned MinParameterSaveArea = 8 * PtrByteSize; + CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize); + CCInfo.AnalyzeFormalArguments(Ins, CC_AIX); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + if (VA.isRegLoc()) { + EVT ValVT = VA.getValVT(); + MVT LocVT = VA.getLocVT(); + MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy; + unsigned VReg = + MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64)); + ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT); + if (ValVT.isScalarInteger() && + (ValVT.getSizeInBits() < LocVT.getSizeInBits())) { + ArgValue = + truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl); + } + InVals.push_back(ArgValue); + } else { + report_fatal_error("Handling of formal arguments on the stack is " + "unimplemented!"); + } + } + + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea = CCInfo.getNextStackOffset(); + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized function's reserved stack space needs to be aligned so + // that taking the difference between two stack areas will result in an + // aligned stack. 
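Before the alignment bookkeeping below, a worked example of the CC_AIX assignments exercised above (hedged, derived by tracing the allocation logic rather than from ABI documentation):

// void f(int a, double b, int c);   // 64-bit AIX, non-vararg
//   a -> r3   (PSA bytes 0-7 reserved)
//   b -> f1   (PSA bytes 8-15 reserved; r4 shadow-reserved, uninitialized)
//   c -> r5   (PSA bytes 16-23 reserved)
// In a vararg call, b would additionally be mirrored into r4 so the callee
// can dump the GPRs and walk the arguments in memory.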
+ MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); + PPCFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setMinReservedArea(MinReservedArea); + + return Chain; +} SDValue PPCTargetLowering::LowerCall_AIX( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, @@ -6705,22 +7067,33 @@ SDValue PPCTargetLowering::LowerCall_AIX( SelectionDAG &DAG, SmallVectorImpl &InVals, ImmutableCallSite CS) const { - assert((CallConv == CallingConv::C || CallConv == CallingConv::Fast) && - "Unimplemented calling convention!"); - if (isVarArg || isPatchPoint) + assert((CallConv == CallingConv::C || + CallConv == CallingConv::Cold || + CallConv == CallingConv::Fast) && "Unexpected calling convention!"); + + if (isPatchPoint) report_fatal_error("This call type is unimplemented on AIX."); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - bool isPPC64 = PtrVT == MVT::i64; - unsigned PtrByteSize = isPPC64 ? 8 : 4; - unsigned NumOps = Outs.size(); + const PPCSubtarget& Subtarget = + static_cast(DAG.getSubtarget()); + if (Subtarget.hasQPX()) + report_fatal_error("QPX is not supported on AIX."); + if (Subtarget.hasAltivec()) + report_fatal_error("Altivec support is unimplemented on AIX."); + MachineFunction &MF = DAG.getMachineFunction(); + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); - // Count how many bytes are to be pushed on the stack, including the linkage - // area, parameter list area. - // On XCOFF, we start with 24/48, which is reserved space for - // [SP][CR][LR][2 x reserved][TOC]. - unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + // Reserve space for the linkage save area (LSA) on the stack. + // In both PPC32 and PPC64 there are 6 reserved slots in the LSA: + // [SP][CR][LR][2 x reserved][TOC]. + // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64. + const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + const bool IsPPC64 = Subtarget.isPPC64(); + const unsigned PtrByteSize = IsPPC64 ? 8 : 4; + CCInfo.AllocateStack(LinkageSize, PtrByteSize); + CCInfo.AnalyzeCallOperands(Outs, CC_AIX); // The prolog code of the callee may store up to 8 GPR argument registers to // the stack, allowing va_start to index over them in memory if the callee @@ -6728,98 +7101,101 @@ SDValue PPCTargetLowering::LowerCall_AIX( // Because we cannot tell if this is needed on the caller side, we have to // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. - unsigned NumBytes = LinkageSize + 8 * PtrByteSize; + const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize; + const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize; // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog - // inserter pass. + // These operations are automatically eliminated by the prolog/epilog pass. Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; - static const MCPhysReg GPR_32[] = { // 32-bit registers. - PPC::R3, PPC::R4, PPC::R5, PPC::R6, - PPC::R7, PPC::R8, PPC::R9, PPC::R10 - }; - static const MCPhysReg GPR_64[] = { // 64-bit registers. - PPC::X3, PPC::X4, PPC::X5, PPC::X6, - PPC::X7, PPC::X8, PPC::X9, PPC::X10 - }; - - const unsigned NumGPRs = isPPC64 ? 
array_lengthof(GPR_64) - : array_lengthof(GPR_32); - const unsigned NumFPRs = array_lengthof(FPR); - assert(NumFPRs == 13 && "Only FPR 1-13 could be used for parameter passing " - "on AIX"); + SmallVector, 8> RegsToPass; - const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32; - unsigned GPR_idx = 0, FPR_idx = 0; + for (unsigned I = 0, E = ArgLocs.size(); I != E;) { + CCValAssign &VA = ArgLocs[I++]; - SmallVector, 8> RegsToPass; + if (VA.isMemLoc()) + report_fatal_error("Handling of placing parameters on the stack is " + "unimplemented!"); + if (!VA.isRegLoc()) + report_fatal_error( + "Unexpected non-register location for function call argument."); - if (isTailCall) - report_fatal_error("Handling of tail call is unimplemented!"); - int SPDiff = 0; + SDValue Arg = OutVals[VA.getValNo()]; - for (unsigned i = 0; i != NumOps; ++i) { - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (!VA.needsCustom()) { + switch (VA.getLocInfo()) { + default: + report_fatal_error("Unexpected argument extension type."); + case CCValAssign::Full: + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); + break; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - // Promote integers if needed. - if (Arg.getValueType() == MVT::i1 || - (isPPC64 && Arg.getValueType() == MVT::i32)) { - unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - Arg = DAG.getNode(ExtOp, dl, PtrVT, Arg); + continue; } - // Note: "by value" is code for passing a structure by value, not - // basic types. - if (Flags.isByVal()) - report_fatal_error("Passing structure by value is unimplemented!"); + // Custom handling is used for GPR initializations for vararg float + // arguments. + assert(isVarArg && VA.getValVT().isFloatingPoint() && + VA.getLocVT().isInteger() && + "Unexpected custom register handling for calling convention."); - switch (Arg.getSimpleValueType().SimpleTy) { - default: llvm_unreachable("Unexpected ValueType for argument!"); - case MVT::i1: - case MVT::i32: - case MVT::i64: - if (GPR_idx != NumGPRs) - RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg)); - else - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); - break; - case MVT::f32: - case MVT::f64: - if (FPR_idx != NumFPRs) { - RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg)); + SDValue ArgAsInt = + DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg); - // If we have any FPRs remaining, we may also have GPRs remaining. - // Args passed in FPRs consume 1 or 2 (f64 in 32 bit mode) available - // GPRs. - if (GPR_idx != NumGPRs) - ++GPR_idx; - if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64) - ++GPR_idx; - } else - report_fatal_error("Handling of placing parameters on the stack is " - "unimplemented!"); - break; - case MVT::v4f32: - case MVT::v4i32: - case MVT::v8i16: - case MVT::v16i8: - case MVT::v2f64: - case MVT::v2i64: - case MVT::v1i128: - case MVT::f128: - case MVT::v4f64: - case MVT::v4i1: - report_fatal_error("Handling of this parameter type is unimplemented!"); - } - } + if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize()) + // f32 in 32-bit GPR + // f64 in 64-bit GPR + RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt)); + else if (Arg.getValueType().getSizeInBits() < VA.getLocVT().getSizeInBits()) + // f32 in 64-bit GPR. 
+ RegsToPass.push_back(std::make_pair( + VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT()))); + else { + // f64 in two 32-bit GPRs + // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs. + assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 && + "Unexpected custom register for argument!"); + CCValAssign &GPR1 = VA; + SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt, + DAG.getConstant(32, dl, MVT::i8)); + RegsToPass.push_back(std::make_pair( + GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32))); + assert(I != E && "A second custom GPR is expected!"); + CCValAssign &GPR2 = ArgLocs[I++]; + assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() && + GPR2.needsCustom() && "A second custom GPR is expected!"); + RegsToPass.push_back(std::make_pair( + GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32))); + } + } + + // For indirect calls, we need to save the TOC base to the stack for + // restoration after the call. + if (!isTailCall && !isPatchPoint && + !isFunctionGlobalAddress(Callee) && !isa(Callee)) { + const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister(); + const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); + const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const unsigned TOCSaveOffset = + Subtarget.getFrameLowering()->getTOCSaveOffset(); - if (!isFunctionGlobalAddress(Callee) && - !isa(Callee)) - report_fatal_error("Handling of indirect call is unimplemented!"); + setUsesTOCBasePtr(DAG); + SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT); + SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); + SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT); + SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); + Chain = DAG.getStore( + Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset)); + } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. @@ -6829,10 +7205,11 @@ SDValue PPCTargetLowering::LowerCall_AIX( InFlag = Chain.getValue(1); } + const int SPDiff = 0; return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, - /* unused except on PPC64 ELFv1 */ false, DAG, - RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff, - NumBytes, Ins, InVals, CS); + /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass, + InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, + InVals, CS); } bool @@ -7121,8 +7498,7 @@ SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { "Custom lowering only for i1 results"); SDLoc DL(Op); - return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1, - Op.getOperand(0)); + return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0)); } SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, @@ -7188,17 +7564,15 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { !Op.getOperand(2).getValueType().isFloatingPoint()) return Op; + bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath; + bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath; // We might be able to do better than this under some circumstances, but in // general, fsel-based lowering of select is a finite-math-only optimization. // For more information, see section F.3 of the 2.06 ISA specification. 
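A short illustration of why this guard exists (hedged): the fsel lowering turns a comparison into a sign test on a subtraction, which is unsound once NaNs or infinities can appear.

// select(setolt(x, y), tv, fv)  ==>  fsel(x - y, fv, tv)
//   (fsel yields its second operand when the first is >= 0.0, else the third)
// For x = y = +inf: x - y is NaN, NaN is not >= 0.0, so fsel yields tv,
// yet setolt(+inf, +inf) is false and the select should yield fv.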
- if (!DAG.getTarget().Options.NoInfsFPMath || - !DAG.getTarget().Options.NoNaNsFPMath) + // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the + // presence of infinities. + if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs)) return Op; - // TODO: Propagate flags from the select rather than global settings. - SDNodeFlags Flags; - Flags.setNoInfs(true); - Flags.setNoNaNs(true); - ISD::CondCode CC = cast(Op.getOperand(4))->get(); EVT ResVT = Op.getValueType(); @@ -7207,6 +7581,27 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue TV = Op.getOperand(2), FV = Op.getOperand(3); SDLoc dl(Op); + if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) { + switch (CC) { + default: + // Not a min/max but with finite math, we may still be able to use fsel. + if (HasNoInfs && HasNoNaNs) + break; + return Op; + case ISD::SETOGT: + case ISD::SETGT: + return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS); + } + } + + // TODO: Propagate flags from the select rather than global settings. + SDNodeFlags Flags; + Flags.setNoInfs(true); + Flags.setNoNaNs(true); + // If the RHS of the comparison is a 0.0, we don't need to do the // subtraction at all. SDValue Sel1; @@ -8055,8 +8450,6 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const { /// SplatSize. Cast the result to VT. static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT, SelectionDAG &DAG, const SDLoc &dl) { - assert(Val >= -16 && Val <= 15 && "vsplti is out of range!"); - static const MVT VTys[] = { // canonical VT to use for each size. MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32 }; @@ -8376,29 +8769,10 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } // We have XXSPLTIB for constant splats one byte wide - if (Subtarget.hasP9Vector() && SplatSize == 1) { - // This is a splat of 1-byte elements with some elements potentially undef. - // Rather than trying to match undef in the SDAG patterns, ensure that all - // elements are the same constant. - if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { - SmallVector Ops(16, DAG.getConstant(SplatBits, - dl, MVT::i32)); - SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); - if (Op.getValueType() != MVT::v16i8) - return DAG.getBitcast(Op.getValueType(), NewBV); - return NewBV; - } - - // BuildVectorSDNode::isConstantSplat() is actually pretty smart. It'll - // detect that constant splats like v8i16: 0xABAB are really just splats - // of a 1-byte constant. In this case, we need to convert the node to a - // splat of v16i8 and a bitcast. - if (Op.getValueType() != MVT::v16i8) - return DAG.getBitcast(Op.getValueType(), - DAG.getConstant(SplatBits, dl, MVT::v16i8)); - - return Op; - } + // FIXME: SplatBits is an unsigned int being cast to an int while passing it + // as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here. + if (Subtarget.hasP9Vector() && SplatSize == 1) + return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl); // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. 
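A worked instance of the range test below (hedged arithmetic, matching the SextVal expression that follows): for a v16i8 splat of the byte 0xF0, SplatBits = 0xF0 and SplatBitSize = 8, so

// int32_t(0xF0 << (32 - 8)) >> (32 - 8)
//   = int32_t(0xF0000000) >> 24   (arithmetic shift)
//   = -16
// -16 lies in [-16,15], so a single vspltisb with immediate -16 builds the
// vector.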
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> @@ -8930,19 +9304,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (Subtarget.hasP9Vector()) { if (PPC::isXXBRHShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); - SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv); + SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord); } else if (PPC::isXXBRWShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); - SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv); + SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord); } else if (PPC::isXXBRDShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1); - SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv); + SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord); } else if (PPC::isXXBRQShuffleMask(SVOp)) { SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1); - SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv); + SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv); return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord); } } @@ -9503,7 +9877,7 @@ SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const { Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0), Op.getOperand(0)); // XXBRD - Op = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Op); + Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op); // MFVSRD int VectorIndex = 0; if (Subtarget.isLittleEndian()) @@ -10845,9 +11219,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, DebugLoc dl = MI.getDebugLoc(); TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond, MI.getOperand(2).getReg(), MI.getOperand(3).getReg()); - } else if (MI.getOpcode() == PPC::SELECT_CC_I4 || - MI.getOpcode() == PPC::SELECT_CC_I8 || - MI.getOpcode() == PPC::SELECT_CC_F4 || + } else if (MI.getOpcode() == PPC::SELECT_CC_F4 || MI.getOpcode() == PPC::SELECT_CC_F8 || MI.getOpcode() == PPC::SELECT_CC_F16 || MI.getOpcode() == PPC::SELECT_CC_QFRC || @@ -10859,8 +11231,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOpcode() == PPC::SELECT_CC_VSRC || MI.getOpcode() == PPC::SELECT_CC_SPE4 || MI.getOpcode() == PPC::SELECT_CC_SPE || - MI.getOpcode() == PPC::SELECT_I4 || - MI.getOpcode() == PPC::SELECT_I8 || MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 || MI.getOpcode() == PPC::SELECT_F16 || @@ -11397,28 +11767,28 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Restore FPSCR value. BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg); - } else if (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || - MI.getOpcode() == PPC::ANDIo_1_GT_BIT || - MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || - MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) { - unsigned Opcode = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8 || - MI.getOpcode() == PPC::ANDIo_1_GT_BIT8) - ? 
PPC::ANDIo8 - : PPC::ANDIo; - bool isEQ = (MI.getOpcode() == PPC::ANDIo_1_EQ_BIT || - MI.getOpcode() == PPC::ANDIo_1_EQ_BIT8); + } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT || + MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT || + MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 || + MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) { + unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 || + MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) + ? PPC::ANDI8_rec + : PPC::ANDI_rec; + bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT || + MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8); MachineRegisterInfo &RegInfo = F->getRegInfo(); Register Dest = RegInfo.createVirtualRegister( - Opcode == PPC::ANDIo ? &PPC::GPRCRegClass : &PPC::G8RCRegClass); + Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass); - DebugLoc dl = MI.getDebugLoc(); - BuildMI(*BB, MI, dl, TII->get(Opcode), Dest) + DebugLoc Dl = MI.getDebugLoc(); + BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest) .addReg(MI.getOperand(1).getReg()) .addImm(1); - BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), + BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) - .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT); + .addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT); } else if (MI.getOpcode() == PPC::TCHECK_RET) { DebugLoc Dl = MI.getDebugLoc(); MachineRegisterInfo &RegInfo = F->getRegInfo(); @@ -11638,7 +12008,7 @@ unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are two or more FDIVs (for embedded cores with only // one FP pipeline) for three or more FDIVs (for generic OOO cores). - switch (Subtarget.getDarwinDirective()) { + switch (Subtarget.getCPUDirective()) { default: return 3; case PPC::DIR_440: @@ -14111,7 +14481,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, } Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { - switch (Subtarget.getDarwinDirective()) { + switch (Subtarget.getCPUDirective()) { default: break; case PPC::DIR_970: case PPC::DIR_PWR4: @@ -14121,7 +14491,8 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: - case PPC::DIR_PWR9: { + case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: { if (!ML) break; @@ -14309,6 +14680,17 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::VSFRCRegClass); } + // If we name a VSX register, we can't defer to the base class because it + // will not recognize the correct register (their names will be VSL{0-31} + // and V{0-31} so they won't match). So we match them here. + if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') { + int VSNum = atoi(Constraint.data() + 3); + assert(VSNum >= 0 && VSNum <= 63 && + "Attempted to access a vsr out of range"); + if (VSNum < 32) + return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass); + return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass); + } std::pair R = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); @@ -14513,16 +14895,15 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
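A worked example of the VSX constraint matching added above (hedged; the numbers follow directly from the code):

// A constraint written as "{vs35}" arrives with the braces intact, so
// Constraint[1..2] == "vs" and atoi("35}") == 35. Since 35 >= 32 the match
// is PPC::V0 + (35 - 32) == V3 (VS32-VS63 alias the Altivec registers),
// while "{vs7}" resolves to VSL7; both are placed in VSRCRegClass.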
-Register PPCTargetLowering::getRegisterByName(const char* RegName, EVT VT, +Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { bool isPPC64 = Subtarget.isPPC64(); bool IsDarwinABI = Subtarget.isDarwinABI(); - if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) || - (!isPPC64 && VT != MVT::i32)) + bool is64Bit = isPPC64 && VT == LLT::scalar(64); + if (!is64Bit && VT != LLT::scalar(32)) report_fatal_error("Invalid register global variable type"); - bool is64Bit = isPPC64 && VT == MVT::i64; Register Reg = StringSwitch(RegName) .Case("r1", is64Bit ? PPC::X1 : PPC::R1) .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2) @@ -14870,6 +15251,9 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple()) return false; + if (VT.isFloatingPoint() && !Subtarget.allowsUnalignedFPAccess()) + return false; + if (VT.getSimpleVT().isVector()) { if (Subtarget.hasVSX()) { if (VT != MVT::v2f64 && VT != MVT::v2i64 && @@ -14889,7 +15273,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return true; } -bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { +bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) @@ -15278,7 +15663,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const { return SDValue(); auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool { - switch (this->Subtarget.getDarwinDirective()) { + switch (this->Subtarget.getCPUDirective()) { default: // TODO: enhance the condition for subtarget before pwr8 return false; @@ -15288,6 +15673,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const { // vector 7 2 2 return true; case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: // type mul add shl // scalar 5 2 2 // vector 7 2 2 @@ -15357,12 +15743,6 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - // If tail calls are disabled for the caller then we are done. - const Function *Caller = CI->getParent()->getParent(); - auto Attr = Caller->getFnAttribute("disable-tail-calls"); - if (Attr.getValueAsString() == "true") - return false; - // If sibling calls have been disabled and tail-calls aren't guaranteed // there is no reason to duplicate. auto &TM = getTargetMachine(); @@ -15375,6 +15755,7 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return false; // Make sure the callee and caller calling conventions are eligible for tco. + const Function *Caller = CI->getParent()->getParent(); if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(), CI->getCallingConv())) return false; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 62922ea2d4c4..e0c381827b87 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -43,460 +43,475 @@ namespace llvm { // that come before it. For example, ADD or MUL should be placed before // the ISD::FIRST_TARGET_MEMORY_OPCODE while a LOAD or STORE should come // after it. - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// FSEL - Traditional three-operand fsel node. 
- /// - FSEL, - - /// FCFID - The FCFID instruction, taking an f64 operand and producing - /// and f64 value containing the FP representation of the integer that - /// was temporarily in the f64 operand. - FCFID, - - /// Newer FCFID[US] integer-to-floating-point conversion instructions for - /// unsigned integers and single-precision outputs. - FCFIDU, FCFIDS, FCFIDUS, - - /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 - /// operand, producing an f64 value containing the integer representation - /// of that FP value. - FCTIDZ, FCTIWZ, - - /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers with round toward zero. - FCTIDUZ, FCTIWUZ, - - /// Floating-point-to-interger conversion instructions - FP_TO_UINT_IN_VSR, FP_TO_SINT_IN_VSR, - - /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in - /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. - VEXTS, - - /// SExtVElems, takes an input vector of a smaller type and sign - /// extends to an output vector of a larger type. - SExtVElems, - - /// Reciprocal estimate instructions (unary FP ops). - FRE, FRSQRTE, - - // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking - // three v4f32 operands and producing a v4f32 result. - VMADDFP, VNMSUBFP, - - /// VPERM - The PPC VPERM Instruction. - /// - VPERM, - - /// XXSPLT - The PPC VSX splat instructions - /// - XXSPLT, - - /// VECINSERT - The PPC vector insert instruction - /// - VECINSERT, - - /// XXREVERSE - The PPC VSX reverse instruction - /// - XXREVERSE, - - /// VECSHL - The PPC vector shift left instruction - /// - VECSHL, - - /// XXPERMDI - The PPC XXPERMDI instruction - /// - XXPERMDI, - - /// The CMPB instruction (takes two operands of i32 or i64). - CMPB, - - /// Hi/Lo - These represent the high and low 16-bit parts of a global - /// address respectively. These nodes have two operands, the first of - /// which must be a TargetGlobalAddress, and the second of which must be a - /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', - /// though these are usually folded into other nodes. - Hi, Lo, - - /// The following two target-specific nodes are used for calls through - /// function pointers in the 64-bit SVR4 ABI. - - /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an allocation on the stack. - DYNALLOC, - - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an offset from native SP to the address of the most recent - /// dynamic alloca. - DYNAREAOFFSET, - - /// GlobalBaseReg - On Darwin, this node represents the result of the mflr - /// at function entry, used for PIC code. - GlobalBaseReg, - - /// These nodes represent PPC shifts. - /// - /// For scalar types, only the last `n + 1` bits of the shift amounts - /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. - /// for exact behaviors. - /// - /// For vector types, only the last n bits are used. See vsld. - SRL, SRA, SHL, - - /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign - /// word and shift left immediate. - EXTSWSLI, - - /// The combination of sra[wd]i and addze used to implemented signed - /// integer division by a power of 2. The first operand is the dividend, - /// and the second is the constant shift amount (representing the - /// divisor). - SRA_ADDZE, - - /// CALL - A direct function call. 
-      /// CALL_NOP is a call with the special NOP which follows 64-bit
-      /// SVR4 calls and 32-bit/64-bit AIX calls.
-      CALL, CALL_NOP,
-
-      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
-      /// MTCTR instruction.
-      MTCTR,
-
-      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
-      /// BCTRL instruction.
-      BCTRL,
-
-      /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
-      /// instruction and the TOC reload required on SVR4 PPC64.
-      BCTRL_LOAD_TOC,
-
-      /// Return with a flag operand, matched by 'blr'
-      RET_FLAG,
-
-      /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
-      /// This copies the bits corresponding to the specified CRREG into the
-      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
-      MFOCRF,
-
-      /// Direct move from a VSX register to a GPR
-      MFVSR,
-
-      /// Direct move from a GPR to a VSX register (algebraic)
-      MTVSRA,
-
-      /// Direct move from a GPR to a VSX register (zero)
-      MTVSRZ,
-
-      /// Direct move of 2 consecutive GPR to a VSX register.
-      BUILD_FP128,
-
-      /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
-      /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
-      /// unsupported for this target.
-      /// Merge 2 GPRs to a single SPE register.
-      BUILD_SPE64,
-
-      /// Extract SPE register component, second argument is high or low.
-      EXTRACT_SPE,
-
-      /// Extract a subvector from signed integer vector and convert to FP.
-      /// It is primarily used to convert a (widened) illegal integer vector
-      /// type to a legal floating point vector type.
-      /// For example v2i32 -> widened to v4i32 -> v2f64
-      SINT_VEC_TO_FP,
-
-      /// Extract a subvector from unsigned integer vector and convert to FP.
-      /// As with SINT_VEC_TO_FP, used for converting illegal types.
-      UINT_VEC_TO_FP,
-
-      // FIXME: Remove these once the ANDI glue bug is fixed:
-      /// i1 = ANDIo_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
-      /// eq or gt bit of CR0 after executing andi. x, 1. This is used to
-      /// implement truncation of i32 or i64 to i1.
-      ANDIo_1_EQ_BIT, ANDIo_1_GT_BIT,
-
-      // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
-      // target (returns (Lo, Hi)). It takes a chain operand.
-      READ_TIME_BASE,
-
-      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
-      EH_SJLJ_SETJMP,
-
-      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
-      EH_SJLJ_LONGJMP,
-
-      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
-      /// instructions.  For lack of better number, we use the opcode number
-      /// encoding for the OPC field to identify the compare.  For example, 838
-      /// is VCMPGTSH.
-      VCMP,
-
-      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
-      /// altivec VCMP*o instructions.  For lack of better number, we use the
-      /// opcode number encoding for the OPC field to identify the compare.  For
-      /// example, 838 is VCMPGTSH.
-      VCMPo,
-
-      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
-      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
-      /// condition register to branch on, OPC is the branch opcode to use (e.g.
-      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
-      /// an optional input flag argument.
-      COND_BRANCH,
-
-      /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
-      /// loops.
-      BDNZ, BDZ,
-
-      /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
-      /// towards zero.  Used only as part of the long double-to-int
-      /// conversion sequence.
-      FADDRTZ,
-
-      /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
-      MFFS,
-
-      /// TC_RETURN - A tail call return.
-      ///   operand #0 chain
-      ///   operand #1 callee (register or absolute)
-      ///   operand #2 stack adjustment
-      ///   operand #3 optional in flag
-      TC_RETURN,
-
-      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
-      CR6SET,
-      CR6UNSET,
-
-      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
-      /// for non-position independent code on PPC32.
-      PPC32_GOT,
-
-      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
-      /// local dynamic TLS and position indendepent code on PPC32.
-      PPC32_PICGOT,
-
-      /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
-      /// TLS model, produces an ADDIS8 instruction that adds the GOT
-      /// base to sym\@got\@tprel\@ha.
-      ADDIS_GOT_TPREL_HA,
-
-      /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
-      /// TLS model, produces a LD instruction with base register G8RReg
-      /// and offset sym\@got\@tprel\@l.  This completes the addition that
-      /// finds the offset of "sym" relative to the thread pointer.
-      LD_GOT_TPREL_L,
-
-      /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
-      /// model, produces an ADD instruction that adds the contents of
-      /// G8RReg to the thread pointer.  Symbol contains a relocation
-      /// sym\@tls which is to be replaced by the thread pointer and
-      /// identifies to the linker that the instruction is part of a
-      /// TLS sequence.
-      ADD_TLS,
-
-      /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
-      /// model, produces an ADDIS8 instruction that adds the GOT base
-      /// register to sym\@got\@tlsgd\@ha.
-      ADDIS_TLSGD_HA,
-
-      /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
-      /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym\@got\@tlsgd\@l and stores the result in X3.  Hidden by
-      /// ADDIS_TLSGD_L_ADDR until after register assignment.
-      ADDI_TLSGD_L,
-
-      /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym\@tlsgd).  Hidden by
-      /// ADDIS_TLSGD_L_ADDR until after register assignment.
-      GET_TLS_ADDR,
-
-      /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
-      /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
-      /// register assignment.
-      ADDI_TLSGD_L_ADDR,
-
-      /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
-      /// model, produces an ADDIS8 instruction that adds the GOT base
-      /// register to sym\@got\@tlsld\@ha.
-      ADDIS_TLSLD_HA,
-
-      /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
-      /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym\@got\@tlsld\@l and stores the result in X3.  Hidden by
-      /// ADDIS_TLSLD_L_ADDR until after register assignment.
-      ADDI_TLSLD_L,
-
-      /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym\@tlsld).  Hidden by
-      /// ADDIS_TLSLD_L_ADDR until after register assignment.
-      GET_TLSLD_ADDR,
-
-      /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
-      /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
-      /// following register assignment.
-      ADDI_TLSLD_L_ADDR,
-
-      /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS
-      /// model, produces an ADDIS8 instruction that adds X3 to
-      /// sym\@dtprel\@ha.
-      ADDIS_DTPREL_HA,
-
-      /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
-      /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym\@got\@dtprel\@l.
-      ADDI_DTPREL_L,
-
-      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
-      /// during instruction selection to optimize a BUILD_VECTOR into
-      /// operations on splats.  This is necessary to avoid losing these
-      /// optimizations due to constant folding.
-      VADD_SPLAT,
-
-      /// CHAIN = SC CHAIN, Imm128 - System call.  The 7-bit unsigned
-      /// operand identifies the operating system entry point.
-      SC,
-
-      /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
-      CLRBHRB,
-
-      /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
-      /// history rolling buffer entry.
-      MFBHRBE,
-
-      /// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
-      RFEBB,
-
-      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
-      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
-      /// or stxvd2x instruction.  The chain is necessary because the
-      /// sequence replaces a load and needs to provide the same number
-      /// of outputs.
-      XXSWAPD,
-
-      /// An SDNode for swaps that are not associated with any loads/stores
-      /// and thereby have no chain.
-      SWAP_NO_CHAIN,
-
-      /// An SDNode for Power9 vector absolute value difference.
-      /// operand #0 vector
-      /// operand #1 vector
-      /// operand #2 constant i32 0 or 1, to indicate whether needs to patch
-      /// the most significant bit for signed i32
-      ///
-      /// Power9 VABSD* instructions are designed to support unsigned integer
-      /// vectors (byte/halfword/word), if we want to make use of them for signed
-      /// integer vectors, we have to flip their sign bits first. To flip sign bit
-      /// for byte/halfword integer vector would become inefficient, but for word
-      /// integer vector, we can leverage XVNEGSP to make it efficiently. eg:
-      /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
-      ///               => VABSDUW((XVNEGSP a), (XVNEGSP b))
-      VABSD,
-
-      /// QVFPERM = This corresponds to the QPX qvfperm instruction.
-      QVFPERM,
-
-      /// QVGPCI = This corresponds to the QPX qvgpci instruction.
-      QVGPCI,
-
-      /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
-      QVALIGNI,
-
-      /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
-      QVESPLATI,
-
-      /// QBFLT = Access the underlying QPX floating-point boolean
-      /// representation.
-      QBFLT,
-
-      /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
-      /// lower (IDX=1) half of v4f32 to v2f64.
-      FP_EXTEND_HALF,
-
-      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
-      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
-      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
-      /// i32.
-      STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
-
-      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
-      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
-      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
-      /// or i32.
-      LBRX,
-
-      /// STFIWX - The STFIWX instruction.  The first operand is an input token
-      /// chain, then an f64 value to store, then an address to store it to.
-      STFIWX,
-
-      /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
-      /// load which sign-extends from a 32-bit integer value into the
-      /// destination 64-bit register.
-      LFIWAX,
-
-      /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
-      /// load which zero-extends from a 32-bit integer value into the
-      /// destination 64-bit register.
-      LFIWZX,
-
-      /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
-      /// integer smaller than 64 bits into a VSR. The integer is zero-extended.
-      /// This can be used for converting loaded integers to floating point.
-      LXSIZX,
-
-      /// STXSIX - The STXSI[bh]X instruction. The first operand is an input
-      /// chain, then an f64 value to store, then an address to store it to,
-      /// followed by a byte-width for the store.
-      STXSIX,
-
-      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
-      /// Maps directly to an lxvd2x instruction that will be followed by
-      /// an xxswapd.
-      LXVD2X,
-
-      /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
-      /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
-      /// the vector type to load vector in big-endian element order.
-      LOAD_VEC_BE,
-
-      /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
-      /// v2f32 value into the lower half of a VSR register.
-      LD_VSX_LH,
-
-      /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
-      /// instructions such as LXVDSX, LXVWSX.
-      LD_SPLAT,
-
-      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
-      /// Maps directly to an stxvd2x instruction that will be preceded by
-      /// an xxswapd.
-      STXVD2X,
-
-      /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian.
-      /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on
-      /// the vector type to store vector in big-endian element order.
-      STORE_VEC_BE,
-
-      /// Store scalar integers from VSR.
-      ST_VSR_SCAL_INT,
-
-      /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
-      /// The 4xf32 load used for v4i1 constants.
-      QVLFSb,
-
-      /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
-      /// except they ensure that the compare input is zero-extended for
-      /// sub-word versions because the atomic loads zero-extend.
-      ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
-
-      /// GPRC = TOC_ENTRY GA, TOC
-      /// Loads the entry for GA from the TOC, where the TOC base is given by
-      /// the last operand.
-      TOC_ENTRY
-    };
+    enum NodeType : unsigned {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+
+      /// XSMAXCDP, XSMINCDP - C-type min/max instructions.
+      XSMAXCDP,
+      XSMINCDP,
+
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// an f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+
+      /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+      /// unsigned integers and single-precision outputs.
+      FCFIDU,
+      FCFIDS,
+      FCFIDUS,
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ,
+      FCTIWZ,
+
+      /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions
+      /// for unsigned integers with round toward zero.
+      FCTIDUZ,
+      FCTIWUZ,
+
+      /// Floating-point-to-integer conversion instructions
+      FP_TO_UINT_IN_VSR,
+      FP_TO_SINT_IN_VSR,
+
+      /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in
+      /// VSFRC that is sign-extended from ByteWidth to a 64-bit integer.
+      VEXTS,
+
+      /// SExtVElems, takes an input vector of a smaller type and sign
+      /// extends to an output vector of a larger type.
+      SExtVElems,
+
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE,
+      FRSQRTE,
+
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP,
+      VNMSUBFP,
+
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+
+      /// XXSPLT - The PPC VSX splat instructions
+      ///
+      XXSPLT,
+
+      /// VECINSERT - The PPC vector insert instruction
+      ///
+      VECINSERT,
+
+      /// VECSHL - The PPC vector shift left instruction
+      ///
+      VECSHL,
+
+      /// XXPERMDI - The PPC XXPERMDI instruction
+      ///
+      XXPERMDI,
+
+      /// The CMPB instruction (takes two operands of i32 or i64).
+      CMPB,
+
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi,
+      Lo,
+
+      /// The following two target-specific nodes are used for calls through
+      /// function pointers in the 64-bit SVR4 ABI.
+
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an offset from native SP to the address of the most recent
+      /// dynamic alloca.
+      DYNAREAOFFSET,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// These nodes represent PPC shifts.
+      ///
+      /// For scalar types, only the last `n + 1` bits of the shift amounts
+      /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
+      /// for exact behaviors.
+      ///
+      /// For vector types, only the last n bits are used. See vsld.
+      SRL,
+      SRA,
+      SHL,
+
+      /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign
+      /// word and shift left immediate.
+      EXTSWSLI,
+
+      /// The combination of sra[wd]i and addze used to implement signed
+      /// integer division by a power of 2. The first operand is the dividend,
+      /// and the second is the constant shift amount (representing the
+      /// divisor).
+      SRA_ADDZE,
+
+      /// CALL - A direct function call.
+      /// CALL_NOP is a call with the special NOP which follows 64-bit
+      /// SVR4 calls and 32-bit/64-bit AIX calls.
+      CALL,
+      CALL_NOP,
+
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl
+      /// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX
+      /// and 64-bit AIX.
+      BCTRL_LOAD_TOC,
+
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+
+      /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFOCRF,
+
+      /// Direct move from a VSX register to a GPR
+      MFVSR,
+
+      /// Direct move from a GPR to a VSX register (algebraic)
+      MTVSRA,
+
+      /// Direct move from a GPR to a VSX register (zero)
+      MTVSRZ,
+
+      /// Direct move of 2 consecutive GPRs to a VSX register.
+      BUILD_FP128,
+
+      /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and
+      /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is
+      /// unsupported for this target.
+      /// Merge 2 GPRs to a single SPE register.
+      BUILD_SPE64,
+
+      /// Extract SPE register component, second argument is high or low.
+      EXTRACT_SPE,
+
+      /// Extract a subvector from signed integer vector and convert to FP.
+      /// It is primarily used to convert a (widened) illegal integer vector
+      /// type to a legal floating point vector type.
+      /// For example v2i32 -> widened to v4i32 -> v2f64
+      SINT_VEC_TO_FP,
+
+      /// Extract a subvector from unsigned integer vector and convert to FP.
+      /// As with SINT_VEC_TO_FP, used for converting illegal types.
+      UINT_VEC_TO_FP,
+
+      // FIXME: Remove these once the ANDI glue bug is fixed:
+      /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of
+      /// the eq or gt bit of CR0 after executing andi. x, 1. This is used to
+      /// implement truncation of i32 or i64 to i1.
+      ANDI_rec_1_EQ_BIT,
+      ANDI_rec_1_GT_BIT,
+
+      // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit
+      // target (returns (Lo, Hi)). It takes a chain operand.
+      READ_TIME_BASE,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+
+      /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+      /// loops.
+      BDNZ,
+      BDZ,
+
+      /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+      /// towards zero.  Used only as part of the long double-to-int
+      /// conversion sequence.
+      FADDRTZ,
+
+      /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+      MFFS,
+
+      /// TC_RETURN - A tail call return.
+      ///   operand #0 chain
+      ///   operand #1 callee (register or absolute)
+      ///   operand #2 stack adjustment
+      ///   operand #3 optional in flag
+      TC_RETURN,
+
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS
+      /// for non-position independent code on PPC32.
+      PPC32_GOT,
+
+      /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and
+      /// local dynamic TLS and position independent code on PPC32.
+      PPC32_PICGOT,
+
+      /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec
+      /// TLS model, produces an ADDIS8 instruction that adds the GOT
+      /// base to sym\@got\@tprel\@ha.
+      ADDIS_GOT_TPREL_HA,
+
+      /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
+      /// TLS model, produces a LD instruction with base register G8RReg
+      /// and offset sym\@got\@tprel\@l.  This completes the addition that
+      /// finds the offset of "sym" relative to the thread pointer.
+      LD_GOT_TPREL_L,
+
+      /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
+      /// model, produces an ADD instruction that adds the contents of
+      /// G8RReg to the thread pointer.  Symbol contains a relocation
+      /// sym\@tls which is to be replaced by the thread pointer and
+      /// identifies to the linker that the instruction is part of a
+      /// TLS sequence.
+      ADD_TLS,
+
+      /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsgd\@ha.
+      ADDIS_TLSGD_HA,
+
+      /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsgd\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
+      ADDI_TLSGD_L,
+
+      /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsgd).  Hidden by
+      /// ADDIS_TLSGD_L_ADDR until after register assignment.
+      GET_TLS_ADDR,
+
+      /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following
+      /// register assignment.
+      ADDI_TLSGD_L_ADDR,
+
+      /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym\@got\@tlsld\@ha.
+      ADDIS_TLSLD_HA,
+
+      /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@tlsld\@l and stores the result in X3.  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
+      ADDI_TLSLD_L,
+
+      /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym\@tlsld).  Hidden by
+      /// ADDIS_TLSLD_L_ADDR until after register assignment.
+      GET_TLSLD_ADDR,
+
+      /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that
+      /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion
+      /// following register assignment.
+      ADDI_TLSLD_L_ADDR,
+
+      /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds X3 to
+      /// sym\@dtprel\@ha.
+      ADDIS_DTPREL_HA,
+
+      /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym\@got\@dtprel\@l.
+      ADDI_DTPREL_L,
+
+      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+      /// during instruction selection to optimize a BUILD_VECTOR into
+      /// operations on splats.  This is necessary to avoid losing these
+      /// optimizations due to constant folding.
+      VADD_SPLAT,
+
+      /// CHAIN = SC CHAIN, Imm128 - System call.  The 7-bit unsigned
+      /// operand identifies the operating system entry point.
+      SC,
+
+      /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer.
+      CLRBHRB,
+
+      /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch
+      /// history rolling buffer entry.
+      MFBHRBE,
+
+      /// CHAIN = RFEBB CHAIN, State - Return from event-based branch.
+      RFEBB,
+
+      /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little
+      /// endian.  Maps to an xxswapd instruction that corrects an lxvd2x
+      /// or stxvd2x instruction.  The chain is necessary because the
+      /// sequence replaces a load and needs to provide the same number
+      /// of outputs.
+      XXSWAPD,
+
+      /// An SDNode for swaps that are not associated with any loads/stores
+      /// and thereby have no chain.
+      SWAP_NO_CHAIN,
+
+      /// An SDNode for Power9 vector absolute value difference.
+      /// operand #0 vector
+      /// operand #1 vector
+      /// operand #2 constant i32 0 or 1, to indicate whether it needs to patch
+      /// the most significant bit for signed i32
+      ///
+      /// Power9 VABSD* instructions are designed to support unsigned integer
+      /// vectors (byte/halfword/word). To use them for signed integer vectors,
+      /// we have to flip their sign bits first. Flipping the sign bit of a
+      /// byte/halfword integer vector would be inefficient, but for a word
+      /// integer vector we can leverage XVNEGSP to do it efficiently. eg:
+      /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
+      ///               => VABSDUW((XVNEGSP a), (XVNEGSP b))
+      VABSD,
+
+      /// QVFPERM = This corresponds to the QPX qvfperm instruction.
+      QVFPERM,
+
+      /// QVGPCI = This corresponds to the QPX qvgpci instruction.
+      QVGPCI,
+
+      /// QVALIGNI = This corresponds to the QPX qvaligni instruction.
+      QVALIGNI,
+
+      /// QVESPLATI = This corresponds to the QPX qvesplati instruction.
+      QVESPLATI,
+
+      /// QBFLT = Access the underlying QPX floating-point boolean
+      /// representation.
+      QBFLT,
+
+      /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or
+      /// lower (IDX=1) half of v4f32 to v2f64.
+      FP_EXTEND_HALF,
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX,
+
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to.
+      STFIWX,
+
+      /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
+      /// load which sign-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWAX,
+
+      /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+      /// load which zero-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWZX,
+
+      /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an
+      /// integer smaller than 64 bits into a VSR. The integer is zero-extended.
+      /// This can be used for converting loaded integers to floating point.
+      LXSIZX,
+
+      /// STXSIX - The STXSI[bh]X instruction. The first operand is an input
+      /// chain, then an f64 value to store, then an address to store it to,
+      /// followed by a byte-width for the store.
+      STXSIX,
+
+      /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to an lxvd2x instruction that will be followed by
+      /// an xxswapd.
+      LXVD2X,
+
+      /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian.
+      /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on
+      /// the vector type to load vector in big-endian element order.
+      LOAD_VEC_BE,
+
+      /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a
+      /// v2f32 value into the lower half of a VSR register.
+      LD_VSX_LH,
+
+      /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
+      /// instruction, such as LXVDSX, LXVWSX.
+      LD_SPLAT,
+
+      /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to an stxvd2x instruction that will be preceded by
+      /// an xxswapd.
+      STXVD2X,
+
+      /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian.
+      /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on
+      /// the vector type to store vector in big-endian element order.
+      STORE_VEC_BE,
+
+      /// Store scalar integers from VSR.
+      ST_VSR_SCAL_INT,
+
+      /// QBRC, CHAIN = QVLFSb CHAIN, Ptr
+      /// The 4xf32 load used for v4i1 constants.
+      QVLFSb,
+
+      /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes
+      /// except they ensure that the compare input is zero-extended for
+      /// sub-word versions because the atomic loads zero-extend.
+      ATOMIC_CMP_SWAP_8,
+      ATOMIC_CMP_SWAP_16,
+
+      /// GPRC = TOC_ENTRY GA, TOC
+      /// Loads the entry for GA from the TOC, where the TOC base is given by
+      /// the last operand.
+      TOC_ENTRY
+    };
 
 } // end namespace PPCISD
 
@@ -647,6 +662,10 @@ namespace llvm {
       return true;
     }
 
+    bool isEqualityCmpFoldedWithSignedCmp() const override {
+      return false;
+    }
+
     bool hasAndNotCompare(SDValue) const override {
       return true;
     }
@@ -733,7 +752,7 @@ namespace llvm {
     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                           SmallVectorImpl<SDNode *> &Created) const override;
 
-    Register getRegisterByName(const char* RegName, EVT VT,
+    Register getRegisterByName(const char* RegName, LLT VT,
                                const MachineFunction &MF) const override;
 
     void computeKnownBitsForTargetNode(const SDValue Op,
@@ -900,7 +919,8 @@ namespace llvm {
    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
    /// expanded to FMAs when this method returns true, otherwise fmuladd is
    /// expanded to fmul + fadd.
-    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+    bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                    EVT VT) const override;
 
     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
@@ -1113,6 +1133,10 @@ namespace llvm {
                                 SelectionDAG &DAG, SDValue ArgVal,
                                 const SDLoc &dl) const;
 
+    SDValue LowerFormalArguments_AIX(
+        SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+        const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+        SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerFormalArguments_Darwin(
         SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
         const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index f16187149d36..43431a1e0069 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -71,13 +71,14 @@ def SRL64 : SDNodeXForm<imm, [{
               Requires<[In64BitMode]>;
 
 let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in {
-  def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
-              []>,
-      Requires<[In64BitMode]>;
+  let isPredicable = 1 in
+  def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB,
+              []>,
+      Requires<[In64BitMode]>;
 
   def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
                             "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB,
                             []>,
@@ -141,9 +142,10 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
                      [(PPCcall_nop (i64 imm:$func))]>;
   }
   let Uses = [CTR8, RM] in {
-    def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
-                 "bctrl", IIC_BrB, [(PPCbctrl)]>,
-                 Requires<[In64BitMode]>;
+    let isPredicable = 1 in
+    def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                 "bctrl", IIC_BrB, [(PPCbctrl)]>,
+                 Requires<[In64BitMode]>;
 
     let isCodeGenOnly = 1 in {
       def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
@@ -253,7 +255,7 @@ def LDARX : XForm_1_memOp<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
 // Instruction to support lock versions of atomics
 // (EH=1 - see Power ISA 2.07 Book II 4.4.2)
 def LDARXL : XForm_1<31, 84, (outs g8rc:$rD), (ins memrr:$ptr),
-                     "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isDOT;
+                     "ldarx $rD, $ptr, 1", IIC_LdStLDARX, []>, isRecordForm;
 
 let hasExtraDefRegAllocReq = 1 in
 def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
@@ -263,7 +265,7 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
 
 let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
 def STDCX : XForm_1_memOp<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
-                          "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
+                          "stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isRecordForm;
 
 let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
 def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
@@ -410,6 +412,7 @@ def DYNALLOC8 : PPCEmitTimePseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri
 def DYNAREAOFFSET8 : PPCEmitTimePseudo<(outs i64imm:$result), (ins memri:$fpsi),
                        "#DYNAREAOFFSET8",
                        [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
 
+let hasSideEffects = 0 in {
 let Defs = [LR8] in {
 def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS),
             "mtlr $rS", IIC_SprMTSPR>,
@@ -421,6 +424,7 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs g8rc:$rT), (ins),
             PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 } // Interpretation64Bit
+}
 
 //===----------------------------------------------------------------------===//
 // Fixed point instructions.
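[A note on the isFMAFasterThanFMulAndFAdd change above: the hook now receives the MachineFunction, so the answer can depend on the subtarget the function is compiled for rather than on the value type alone. The diff only shows the declaration; as a rough, hedged illustration (not the body from this patch), an override with the new signature could consult the subtarget like this:

    // Minimal sketch in C++. The subtarget queries (hasFPU/hasVSX) are
    // illustrative assumptions about a plausible policy; the actual PPC
    // implementation is not part of this hunk.
    bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                       EVT VT) const {
      const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
      if (!VT.isSimple())
        return false;
      switch (VT.getSimpleVT().SimpleTy) {
      case MVT::f32:
      case MVT::f64:
        return Subtarget.hasFPU(); // scalar fmadd/fmsub need a hardware FPU
      case MVT::v4f32:
      case MVT::v2f64:
        return Subtarget.hasVSX(); // vector FMAs come from VSX
      default:
        return false;
      }
    }

Per the doc comment in the header, returning true here makes fmuladd intrinsics expand to FMAs; returning false expands them to fmul + fadd.]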
@@ -474,14 +478,14 @@ defm XOR8 : XForm_6r<31, 316, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
 
 // Logical ops with immediate.
 let Defs = [CR0] in {
-def ANDIo8  : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+def ANDI8_rec : DForm_4<28, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
                      "andi. $dst, $src1, $src2", IIC_IntGeneral,
                      [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
-                     isDOT;
-def ANDISo8 : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
+                     isRecordForm;
+def ANDIS8_rec : DForm_4<29, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
                      "andis. $dst, $src1, $src2", IIC_IntGeneral,
                      [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
-                     isDOT;
+                     isRecordForm;
 }
 def ORI8 : DForm_4<24, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
                    "ori $dst, $src1, $src2", IIC_IntSimple,
@@ -497,9 +501,9 @@ def XORIS8 : DForm_4<27, (outs g8rc:$dst), (ins g8rc:$src1, u16imm64:$src2),
                      [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
 
 let isCommutable = 1 in
-defm ADD8 : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                      "add", "$rT, $rA, $rB", IIC_IntSimple,
-                      [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
+defm ADD8 : XOForm_1rx<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                       "add", "$rT, $rA, $rB", IIC_IntSimple,
+                       [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
 // ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
 // initial-exec thread-local storage model. We need to forbid r0 here -
 // while it works for add just fine, the linker can relax this to local-exec
@@ -576,9 +580,9 @@ defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                          "subfc", "$rT, $rA, $rB", IIC_IntGeneral,
                          [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
                          PPC970_DGroup_Cracked;
-defm SUBF8 : XOForm_1r<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                       "subf", "$rT, $rA, $rB", IIC_IntGeneral,
-                       [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
+defm SUBF8 : XOForm_1rx<31, 40, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "subf", "$rT, $rA, $rB", IIC_IntGeneral,
+                        [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
 defm NEG8 : XOForm_3r<31, 104, 0, (outs g8rc:$rT), (ins g8rc:$rA),
                       "neg", "$rT, $rA", IIC_IntSimple,
                       [(set i64:$rT, (ineg i64:$rA))]>;
@@ -777,10 +781,10 @@ defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
 defm DIVDU : XOForm_1rcr<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                          "divdu", "$rT, $rA, $rB", IIC_IntDivD,
                          [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64;
-def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                     "divde $rT, $rA, $rB", IIC_IntDivD,
-                     [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
-                     isPPC64, Requires<[HasExtDiv]>;
+defm DIVDE : XOForm_1rcr<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                         "divde", "$rT, $rA, $rB", IIC_IntDivD,
+                         [(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
+                         isPPC64, Requires<[HasExtDiv]>;
 
 let Predicates = [IsISA3_0] in {
 def MADDHD : VAForm_1a<48, (outs g8rc :$RT), (ins g8rc:$RA, g8rc:$RB, g8rc:$RC),
@@ -815,24 +819,14 @@ def MODUD : XForm_8<31, 265, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                     [(set i64:$rT, (urem i64:$rA, i64:$rB))]>;
 }
 
-let Defs = [CR0] in
-def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                      "divde. $rT, $rA, $rB", IIC_IntDivD,
-                      []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
-                      isPPC64, Requires<[HasExtDiv]>;
-def DIVDEU : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                      "divdeu $rT, $rA, $rB", IIC_IntDivD,
-                      [(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>,
-                      isPPC64, Requires<[HasExtDiv]>;
-let Defs = [CR0] in
-def DIVDEUo : XOForm_1<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                       "divdeu. $rT, $rA, $rB", IIC_IntDivD,
-                       []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First,
-                       isPPC64, Requires<[HasExtDiv]>;
+defm DIVDEU : XOForm_1rcr<31, 393, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                          "divdeu", "$rT, $rA, $rB", IIC_IntDivD,
+                          [(set i64:$rT, (int_ppc_divdeu g8rc:$rA, g8rc:$rB))]>,
+                          isPPC64, Requires<[HasExtDiv]>;
 let isCommutable = 1 in
-defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
-                       "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
-                       [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+defm MULLD : XOForm_1rx<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+                        "mulld", "$rT, $rA, $rB", IIC_IntMulHD,
+                        [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in
 def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
                      "mulli $rD, $rA, $imm", IIC_IntMulLI,
@@ -943,7 +937,7 @@ def LWAX : XForm_1_memOp<31, 341, (outs g8rc:$rD), (ins memrr:$src),
                          [(set i64:$rD, (sextloadi32 xaddrX4:$src))]>, isPPC64,
                          PPC970_DGroup_Cracked;
 // For fast-isel:
-let isCodeGenOnly = 1, mayLoad = 1 in {
+let isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
 def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
                       "lwa $rD, $src", IIC_LdStLWA, []>, isPPC64,
                       PPC970_DGroup_Cracked;
@@ -1469,7 +1463,7 @@ class X_L1_RA5_RB5<bits<6> opcode, bits<10> xo, string opc, RegisterOperand ty,
 let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
 def CP_COPY8   : X_L1_RA5_RB5<31, 774, "copy"  , g8rc, IIC_LdStCOPY, []>;
 def CP_PASTE8  : X_L1_RA5_RB5<31, 902, "paste" , g8rc, IIC_LdStPASTE, []>;
-def CP_PASTE8o : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isDOT;
+def CP_PASTE8_rec : X_L1_RA5_RB5<31, 902, "paste.", g8rc, IIC_LdStPASTE, []>,isRecordForm;
 }
 
 // SLB Invalidate Entry Global
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index fd3fc2af2327..f94816a35f79 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -261,6 +261,11 @@ def vecspltisw : PatLeaf<(build_vector), [{
   return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != nullptr;
 }], VSPLTISW_get_imm>;
 
+def immEQOneV : PatLeaf<(build_vector), [{
+  if (ConstantSDNode *C = cast<BuildVectorSDNode>(N)->getConstantSplatNode())
+    return C->isOne();
+  return false;
+}]>;
 //===----------------------------------------------------------------------===//
 // Helpers for defining instructions that directly correspond to intrinsics.
@@ -518,19 +523,19 @@ def VANDC : VXForm_1<1092, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
 def VCFSX  : VXForm_1<842, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                       "vcfsx $vD, $vB, $UIMM", IIC_VecFP,
                       [(set v4f32:$vD,
-                             (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
+                             (int_ppc_altivec_vcfsx v4i32:$vB, timm:$UIMM))]>;
 def VCFUX  : VXForm_1<778, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                       "vcfux $vD, $vB, $UIMM", IIC_VecFP,
                       [(set v4f32:$vD,
-                             (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
+                             (int_ppc_altivec_vcfux v4i32:$vB, timm:$UIMM))]>;
 def VCTSXS : VXForm_1<970, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                       "vctsxs $vD, $vB, $UIMM", IIC_VecFP,
                       [(set v4i32:$vD,
-                             (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
+                             (int_ppc_altivec_vctsxs v4f32:$vB, timm:$UIMM))]>;
 def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                       "vctuxs $vD, $vB, $UIMM", IIC_VecFP,
                       [(set v4i32:$vD,
-                             (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
+                             (int_ppc_altivec_vctuxs v4f32:$vB, timm:$UIMM))]>;
 
 // Defines with the UIM field set to 0 for floating-point
 // to integer (fp_to_sint/fp_to_uint) conversions and integer
@@ -706,7 +711,7 @@ def VSPLTW : VXForm_1<652, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
                       "vspltw $vD, $vB, $UIMM", IIC_VecPerm,
                       [(set v16i8:$vD,
                         (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
   def VSPLTBs : VXForm_1<524, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
                          "vspltb $vD, $vB, $UIMM", IIC_VecPerm, []>;
   def VSPLTHs : VXForm_1<588, (outs vrrc:$vD), (ins u5imm:$UIMM, vfrc:$vB),
@@ -789,37 +794,37 @@ class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
 
 // f32 element comparisons.
 def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
-def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPBFP_rec  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
 def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
-def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPEQFP_rec : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
 def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
-def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP_rec : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
 def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
-def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP_rec : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
 
 // i8 element comparisons.
 def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
-def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPEQUB_rec : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
 def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
-def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB_rec : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
 def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
-def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB_rec : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
 
 // i16 element comparisons.
 def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
-def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPEQUH_rec : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
 def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
-def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH_rec : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
 def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
-def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH_rec : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
 
 // i32 element comparisons.
 def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
-def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPEQUW_rec : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
 def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
-def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW_rec : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
 def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
-def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW_rec : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
 
 let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
     isReMaterializable = 1 in {
@@ -856,6 +861,14 @@ def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
 def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
 def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
 
+// Rotates.
+def : Pat<(v16i8 (rotl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VRLB v16i8:$vA, v16i8:$vB))>;
+def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VRLH v8i16:$vA, v8i16:$vB))>;
+def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VRLW v4i32:$vA, v4i32:$vB))>;
+
 // Loads.
 def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
@@ -1092,6 +1105,20 @@ def : Pat<(v4f32 (vselect v4i32:$vA, v4f32:$vB, v4f32:$vC)),
 def : Pat<(v2f64 (vselect v2i64:$vA, v2f64:$vB, v2f64:$vC)),
           (VSEL $vC, $vB, $vA)>;
 
+// Vector Integer Average Instructions
+def : Pat<(v4i32 (sra (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
+                      (v4i32 (immEQOneV)))), (v4i32 (VAVGSW $vA, $vB))>;
+def : Pat<(v8i16 (sra (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
+                      (v8i16 (immEQOneV)))), (v8i16 (VAVGSH $vA, $vB))>;
+def : Pat<(v16i8 (sra (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
+                      (v16i8 (immEQOneV)))), (v16i8 (VAVGSB $vA, $vB))>;
+def : Pat<(v4i32 (srl (sub v4i32:$vA, (vnot_ppc v4i32:$vB)),
+                      (v4i32 (immEQOneV)))), (v4i32 (VAVGUW $vA, $vB))>;
+def : Pat<(v8i16 (srl (sub v8i16:$vA, (v8i16 (bitconvert(vnot_ppc v4i32:$vB)))),
+                      (v8i16 (immEQOneV)))), (v8i16 (VAVGUH $vA, $vB))>;
+def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
+                      (v16i8 (immEQOneV)))), (v16i8 (VAVGUB $vA, $vB))>;
+
 } // end HasAltivec
 
 def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
@@ -1140,9 +1167,13 @@ def:Pat<(vmrgew_swapped_shuffle v16i8:$vA, v16i8:$vB),
 def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
         (VMRGOW $vB, $vA)>;
 
+// Vector rotates.
+def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
+
+def : Pat<(v2i64 (rotl v2i64:$vA, v2i64:$vB)),
+          (v2i64 (VRLD v2i64:$vA, v2i64:$vB))>;
 
 // Vector shifts
-def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
 def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                     "vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
 def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
@@ -1245,11 +1276,11 @@ def VORC : VXForm_1<1348, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
 
 // i64 element comparisons.
 def VCMPEQUD  : VCMP <199, "vcmpequd $vD, $vA, $vB" , v2i64>;
-def VCMPEQUDo : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
+def VCMPEQUD_rec : VCMPo<199, "vcmpequd. $vD, $vA, $vB", v2i64>;
 def VCMPGTSD  : VCMP <967, "vcmpgtsd $vD, $vA, $vB" , v2i64>;
-def VCMPGTSDo : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
+def VCMPGTSD_rec : VCMPo<967, "vcmpgtsd. $vD, $vA, $vB", v2i64>;
 def VCMPGTUD  : VCMP <711, "vcmpgtud $vD, $vA, $vB" , v2i64>;
-def VCMPGTUDo : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
+def VCMPGTUD_rec : VCMPo<711, "vcmpgtud. $vD, $vA, $vB", v2i64>;
 
 // The cryptography instructions that do not require Category:Vector.Crypto
 def VPMSUMB : VX1_Int_Ty<1032, "vpmsumb",
@@ -1313,21 +1344,21 @@ let Predicates = [HasP9Altivec] in {
 
 // i8 element comparisons.
 def VCMPNEB   : VCMP <  7, "vcmpneb $vD, $vA, $vB"  , v16i8>;
-def VCMPNEBo  : VCMPo <  7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
+def VCMPNEB_rec  : VCMPo <  7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
 def VCMPNEZB  : VCMP <263, "vcmpnezb $vD, $vA, $vB" , v16i8>;
-def VCMPNEZBo : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
+def VCMPNEZB_rec : VCMPo<263, "vcmpnezb. $vD, $vA, $vB", v16i8>;
 
 // i16 element comparisons.
 def VCMPNEH   : VCMP < 71, "vcmpneh $vD, $vA, $vB"  , v8i16>;
-def VCMPNEHo  : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
+def VCMPNEH_rec  : VCMPo< 71, "vcmpneh. $vD, $vA, $vB" , v8i16>;
 def VCMPNEZH  : VCMP <327, "vcmpnezh $vD, $vA, $vB" , v8i16>;
-def VCMPNEZHo : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
+def VCMPNEZH_rec : VCMPo<327, "vcmpnezh. $vD, $vA, $vB", v8i16>;
 
 // i32 element comparisons.
 def VCMPNEW   : VCMP <135, "vcmpnew $vD, $vA, $vB"  , v4i32>;
-def VCMPNEWo  : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
+def VCMPNEW_rec  : VCMPo<135, "vcmpnew. $vD, $vA, $vB" , v4i32>;
 def VCMPNEZW  : VCMP <391, "vcmpnezw $vD, $vA, $vB" , v4i32>;
-def VCMPNEZWo : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
+def VCMPNEZW_rec : VCMPo<391, "vcmpnezw. $vD, $vA, $vB", v4i32>;
 
 // VX-Form: [PO VRT / UIM VRB XO].
 // We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent
@@ -1347,12 +1378,14 @@ def VEXTRACTUW : VX1_VT5_UIM5_VB5<653, "vextractuw", []>;
 def VEXTRACTD  : VX1_VT5_UIM5_VB5<717, "vextractd" , []>;
 
 // Vector Extract Unsigned Byte/Halfword/Word Left/Right-Indexed
+let hasSideEffects = 0 in {
 def VEXTUBLX : VX1_RT5_RA5_VB5<1549, "vextublx", []>;
 def VEXTUBRX : VX1_RT5_RA5_VB5<1805, "vextubrx", []>;
 def VEXTUHLX : VX1_RT5_RA5_VB5<1613, "vextuhlx", []>;
 def VEXTUHRX : VX1_RT5_RA5_VB5<1869, "vextuhrx", []>;
 def VEXTUWLX : VX1_RT5_RA5_VB5<1677, "vextuwlx", []>;
 def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
+}
 
 // Vector Insert Element Instructions
 def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
@@ -1410,6 +1443,12 @@ let isCodeGenOnly = 1 in {
   def VEXTSW2Ds : VX_VT5_EO5_VB5s<1538, 26, "vextsw2d", []>;
 }
 
+def : Pat<(v4i32 (sext_inreg v4i32:$VRB, v4i8)), (v4i32 (VEXTSB2W $VRB))>;
+def : Pat<(v4i32 (sext_inreg v4i32:$VRB, v4i16)), (v4i32 (VEXTSH2W $VRB))>;
+def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i8)), (v2i64 (VEXTSB2D $VRB))>;
+def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i16)), (v2i64 (VEXTSH2D $VRB))>;
+def : Pat<(v2i64 (sext_inreg v2i64:$VRB, v2i32)), (v2i64 (VEXTSW2D $VRB))>;
+
 // Vector Integer Negate
 def VNEGW : VX_VT5_EO5_VB5<1538, 6, "vnegw",
                            [(set v4i32:$vD,
@@ -1496,18 +1535,18 @@ class VX_VT5_EO5_VB5_XO9_o<bits<5> eo, bits<9> xo, string opc,
 }
 
 // Decimal Convert From/to National/Zoned/Signed-QWord
-def BCDCFNo  : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
-def BCDCFZo  : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
-def BCDCTNo  : VX_VT5_EO5_VB5_XO9_o    <5, 385, "bcdctn." , []>;
-def BCDCTZo  : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
-def BCDCFSQo : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
-def BCDCTSQo : VX_VT5_EO5_VB5_XO9_o    <0, 385, "bcdctsq.", []>;
+def BCDCFN_rec  : VX_VT5_EO5_VB5_PS1_XO9_o<7, 385, "bcdcfn." , []>;
+def BCDCFZ_rec  : VX_VT5_EO5_VB5_PS1_XO9_o<6, 385, "bcdcfz." , []>;
+def BCDCTN_rec  : VX_VT5_EO5_VB5_XO9_o    <5, 385, "bcdctn." , []>;
+def BCDCTZ_rec  : VX_VT5_EO5_VB5_PS1_XO9_o<4, 385, "bcdctz." , []>;
+def BCDCFSQ_rec : VX_VT5_EO5_VB5_PS1_XO9_o<2, 385, "bcdcfsq.", []>;
+def BCDCTSQ_rec : VX_VT5_EO5_VB5_XO9_o    <0, 385, "bcdctsq.", []>;
 
 // Decimal Copy-Sign/Set-Sign
 let Defs = [CR6] in
-def BCDCPSGNo : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
+def BCDCPSGN_rec : VX1_VT5_VA5_VB5<833, "bcdcpsgn.", []>;
 
-def BCDSETSGNo : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
+def BCDSETSGN_rec : VX_VT5_EO5_VB5_PS1_XO9_o<31, 385, "bcdsetsgn.", []>;
 
 // [PO VRT VRA VRB 1 PS XO], "_o" means CR6 is set.
 class VX_VT5_VA5_VB5_PS1_XO9_o<bits<9> xo, string opc, list<dag> pattern>
@@ -1526,13 +1565,13 @@ class VX_VT5_VA5_VB5_XO9_o<bits<9> xo, string opc, list<dag> pattern>
 }
 
 // Decimal Shift/Unsigned-Shift/Shift-and-Round
-def BCDSo  : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
-def BCDUSo : VX_VT5_VA5_VB5_XO9_o    <129, "bcdus.", []>;
-def BCDSRo : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
+def BCDS_rec  : VX_VT5_VA5_VB5_PS1_XO9_o<193, "bcds." , []>;
+def BCDUS_rec : VX_VT5_VA5_VB5_XO9_o    <129, "bcdus.", []>;
+def BCDSR_rec : VX_VT5_VA5_VB5_PS1_XO9_o<449, "bcdsr.", []>;
 
 // Decimal (Unsigned) Truncate
-def BCDTRUNCo  : VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
-def BCDUTRUNCo : VX_VT5_VA5_VB5_XO9_o    <321, "bcdutrunc.", []>;
+def BCDTRUNC_rec  : VX_VT5_VA5_VB5_PS1_XO9_o<257, "bcdtrunc." , []>;
+def BCDUTRUNC_rec : VX_VT5_VA5_VB5_XO9_o    <321, "bcdutrunc.", []>;
 
 // Absolute Difference
 def VABSDUB : VXForm_1<1027, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 96b9c9a119c0..115bd44ea202 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -262,8 +262,8 @@ class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>
   : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern> {
 
-  // Even though ADDICo does not really have an RC bit, provide
-  // the declaration of one here so that isDOT has something to set.
+  // Even though ADDIC_rec does not really have an RC bit, provide
+  // the declaration of one here so that isRecordForm has something to set.
   bit RC = 0;
 }
 
@@ -428,7 +428,7 @@ class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RST;
   let Inst{11-15} = A;
@@ -463,7 +463,7 @@ class XForm_base_r3xo_swapped<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   bits<5> RST;
   bits<5> B;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RST;
   let Inst{11-15} = A;
@@ -744,7 +744,7 @@ class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RST;
   let Inst{11-20} = 0;
@@ -757,7 +757,7 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Pattern = pattern;
   bits<5> FM;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = FM;
   let Inst{11-20} = 0;
@@ -902,7 +902,7 @@ class XForm_htm2<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   : I<opcode, OOL, IOL, asmstr, itin> {
   bit L;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{7-9}  = 0;
   let Inst{10}   = L;
@@ -1265,7 +1265,7 @@ class XX3Form_Rc<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = XT{4-0};
   let Inst{11-15} = XA{4-0};
@@ -1529,6 +1529,29 @@ class XLForm_2_ext_and_DSForm_1<bits<6> opcode1, bits<10> xo1,
   let BH = 0;
 }
 
+class XLForm_2_ext_and_DForm_1<bits<6> opcode1, bits<10> xo1, bits<5> bo,
+                               bits<5> bi, bit lk, bits<6> opcode2, dag OOL,
+                               dag IOL, string asmstr, InstrItinClass itin,
+                               list<dag> pattern>
+  : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+
+  bits<5> RST;
+  bits<21> D_RA;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = bi;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = 0; // Unused (BH)
+  let Inst{21-30} = xo1;
+  let Inst{31}    = lk;
+
+  let Inst{38-42} = RST;
+  let Inst{43-47} = D_RA{20-16}; // Base Register
+  let Inst{48-63} = D_RA{15-0};  // Displacement
+}
+
 // 1.7.8 XFX-Form
 class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin>
@@ -1628,7 +1651,7 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bits<8> FM;
   bits<5> rT;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
   let Pattern = pattern;
 
   let Inst{6} = 0;
@@ -1647,7 +1670,7 @@ class XFLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   bit W;
   bits<5> FRB;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
   let Pattern = pattern;
 
   let Inst{6} = L;
@@ -1666,7 +1689,7 @@ class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
   bits<5> RS;
   bits<6> SH;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
   let Pattern = pattern;
 
   let Inst{6-10}  = RS;
@@ -1687,7 +1710,7 @@ class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RT;
   let Inst{11-15} = RA;
@@ -1714,7 +1737,7 @@ class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = FRT;
   let Inst{11-15} = FRA;
@@ -1774,7 +1797,7 @@ class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RS;
   let Inst{11-15} = RA;
@@ -1800,7 +1823,7 @@ class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RS;
   let Inst{11-15} = RA;
@@ -1821,7 +1844,7 @@ class MDSForm_1<bits<6> opcode, bits<4> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = RS;
   let Inst{11-15} = RA;
@@ -2083,7 +2106,7 @@ class Z23Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = FRT;
   let Inst{11-15} = FRA;
@@ -2107,7 +2130,7 @@ class Z23Form_3<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = FRT;
  let Inst{11-22} = idx;
@@ -2125,7 +2148,7 @@ class Z23Form_8<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
 
   let Pattern = pattern;
 
-  bit RC = 0;    // set by isDOT
+  bit RC = 0;    // set by isRecordForm
 
   let Inst{6-10}  = VRT;
   let Inst{11-14} = 0;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
index 104b57a70a2e..6cbf999ca73d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -36,7 +36,7 @@ def TEND : XForm_htm1 <31, 686,
 
 def TABORT : XForm_base_r3xo <31, 910,
              (outs), (ins gprc:$A), "tabort. $A", IIC_SprMTSPR,
-             []>, isDOT {
+             []>, isRecordForm {
   let RST = 0;
   let B = 0;
 }
@@ -44,38 +44,38 @@ def TABORT : XForm_base_r3xo <31, 910,
 def TABORTWC : XForm_base_r3xo <31, 782,
                (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B),
                "tabortwc. $RTS, $A, $B", IIC_SprMTSPR, []>,
-               isDOT;
+               isRecordForm;
 
 def TABORTWCI : XForm_base_r3xo <31, 846,
                 (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
                 "tabortwci. $RTS, $A, $B", IIC_SprMTSPR, []>,
-                isDOT;
+                isRecordForm;
 
 def TABORTDC : XForm_base_r3xo <31, 814,
                (outs), (ins u5imm:$RTS, gprc:$A, gprc:$B),
                "tabortdc. $RTS, $A, $B", IIC_SprMTSPR, []>,
-               isDOT;
+               isRecordForm;
 
 def TABORTDCI : XForm_base_r3xo <31, 878,
                 (outs), (ins u5imm:$RTS, gprc:$A, u5imm:$B),
                 "tabortdci. $RTS, $A, $B", IIC_SprMTSPR, []>,
-                isDOT;
+                isRecordForm;
 
 def TSR : XForm_htm2 <31, 750,
          (outs), (ins u1imm:$L), "tsr. $L", IIC_SprMTSPR, []>,
-          isDOT;
+          isRecordForm;
 
 def TRECLAIM : XForm_base_r3xo <31, 942,
                (outs), (ins gprc:$A), "treclaim. $A", IIC_SprMTSPR,
               []>,
-               isDOT {
+               isRecordForm {
   let RST = 0;
   let B = 0;
 }
 
 def TRECHKPT : XForm_base_r3xo <31, 1006,
                (outs), (ins), "trechkpt.", IIC_SprMTSPR, []>,
-               isDOT {
+               isRecordForm {
  let RST = 0;
   let A = 0;
   let B = 0;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 6b10672965c9..30906a32b00c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -108,7 +108,7 @@ ScheduleHazardRecognizer *
 PPCInstrInfo::CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
                                            const ScheduleDAG *DAG) const {
   unsigned Directive =
-      static_cast<const PPCSubtarget *>(STI)->getDarwinDirective();
+      static_cast<const PPCSubtarget *>(STI)->getCPUDirective();
   if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
       Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
     const InstrItineraryData *II =
@@ -125,7 +125,7 @@ ScheduleHazardRecognizer *
 PPCInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                  const ScheduleDAG *DAG) const {
   unsigned Directive =
-      DAG->MF.getSubtarget<PPCSubtarget>().getDarwinDirective();
+      DAG->MF.getSubtarget<PPCSubtarget>().getCPUDirective();
 
   // FIXME: Leaving this as-is until we have POWER9 scheduling info
   if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8)
@@ -202,7 +202,7 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
 
   // On some cores, there is an additional delay between writing to a condition
   // register, and using it from a branch.
-  unsigned Directive = Subtarget.getDarwinDirective();
+  unsigned Directive = Subtarget.getCPUDirective();
   switch (Directive) {
   default: break;
   case PPC::DIR_7400:
@@ -371,7 +371,7 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   MachineFunction &MF = *MI.getParent()->getParent();
 
   // Normal instructions can be commuted the obvious way.
-  if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMIo)
+  if (MI.getOpcode() != PPC::RLWIMI && MI.getOpcode() != PPC::RLWIMI_rec)
     return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
   // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
   // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
@@ -391,7 +391,7 @@ MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
 
   // Swap op1/op2
   assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
-         "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo.");
+         "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMI_rec.");
   Register Reg0 = MI.getOperand(0).getReg();
   Register Reg1 = MI.getOperand(1).getReg();
   Register Reg2 = MI.getOperand(2).getReg();
@@ -469,7 +469,7 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI) const {
   // This function is used for scheduling, and the nop wanted here is the type
   // that terminates dispatch groups on the POWER cores.
-  unsigned Directive = Subtarget.getDarwinDirective();
+  unsigned Directive = Subtarget.getCPUDirective();
   unsigned Opcode;
   switch (Directive) {
   default:            Opcode = PPC::NOP; break;
@@ -903,15 +903,15 @@ static unsigned getCRBitValue(unsigned CRBit) {
 
 void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
-                               const DebugLoc &DL, unsigned DestReg,
-                               unsigned SrcReg, bool KillSrc) const {
+                               const DebugLoc &DL, MCRegister DestReg,
+                               MCRegister SrcReg, bool KillSrc) const {
   // We can end up with self copies and similar things as a result of VSX copy
   // legalization. Promote them here.
   const TargetRegisterInfo *TRI = &getRegisterInfo();
   if (PPC::F8RCRegClass.contains(DestReg) &&
       PPC::VSRCRegClass.contains(SrcReg)) {
-    unsigned SuperReg =
-        TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
+    MCRegister SuperReg =
+        TRI->getMatchingSuperReg(DestReg, PPC::sub_64, &PPC::VSRCRegClass);
 
     if (VSXSelfCopyCrash && SrcReg == SuperReg)
       llvm_unreachable("nop VSX copy");
@@ -919,8 +919,8 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     DestReg = SuperReg;
   } else if (PPC::F8RCRegClass.contains(SrcReg) &&
             PPC::VSRCRegClass.contains(DestReg)) {
-    unsigned SuperReg =
-        TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
+    MCRegister SuperReg =
+        TRI->getMatchingSuperReg(SrcReg, PPC::sub_64, &PPC::VSRCRegClass);
 
     if (VSXSelfCopyCrash && DestReg == SuperReg)
       llvm_unreachable("nop VSX copy");
@@ -931,7 +931,7 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // Different class register copy
   if (PPC::CRBITRCRegClass.contains(SrcReg) &&
      PPC::GPRCRegClass.contains(DestReg)) {
-    unsigned CRReg = getCRFromCRBit(SrcReg);
+    MCRegister CRReg = getCRFromCRBit(SrcReg);
     BuildMI(MBB, I, DL, get(PPC::MFOCRF), DestReg).addReg(CRReg);
     getKillRegState(KillSrc);
 
     // Rotate the CR bit in the CR fields to be the least significant bit and
@@ -1587,22 +1587,6 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
   return Found;
 }
 
-bool PPCInstrInfo::isPredicable(const MachineInstr &MI) const {
-  unsigned OpC = MI.getOpcode();
-  switch (OpC) {
-  default:
-    return false;
-  case PPC::B:
-  case PPC::BLR:
-  case PPC::BLR8:
-  case PPC::BCTR:
-  case PPC::BCTR8:
-  case PPC::BCTRL:
-  case PPC::BCTRL8:
-    return true;
-  }
-}
-
 bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
                                   unsigned &SrcReg2, int &Mask,
                                   int &Value) const {
@@ -1836,8 +1820,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
 
   int NewOpC = -1;
   int MIOpC = MI->getOpcode();
-  if (MIOpC == PPC::ANDIo || MIOpC == PPC::ANDIo8 ||
-      MIOpC == PPC::ANDISo || MIOpC == PPC::ANDISo8)
+  if (MIOpC == PPC::ANDI_rec || MIOpC == PPC::ANDI8_rec ||
+      MIOpC == PPC::ANDIS_rec || MIOpC == PPC::ANDIS8_rec)
     NewOpC = MIOpC;
   else {
     NewOpC = PPC::getRecordFormOpcode(MIOpC);
@@ -1943,9 +1927,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
       Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
       // The mask value needs to shift right 16 if we're emitting andis.
       Mask >>= MBInLoHWord ? 0 : 16;
-      NewOpC = MIOpC == PPC::RLWINM ?
-        (MBInLoHWord ? PPC::ANDIo : PPC::ANDISo) :
-        (MBInLoHWord ? PPC::ANDIo8 :PPC::ANDISo8);
+      NewOpC = MIOpC == PPC::RLWINM
+                   ? (MBInLoHWord ? PPC::ANDI_rec : PPC::ANDIS_rec)
+                   : (MBInLoHWord ? PPC::ANDI8_rec : PPC::ANDIS8_rec);
     } else if (MRI->use_empty(GPRRes) && (ME == 31) &&
                (ME - MB + 1 == SH) && (MB >= 16)) {
       // If we are rotating by the exact number of bits as are in the mask
      // and the mask is in the least significant bits of the register,
       // that's just an andis. (as long as the GPR result has no uses).
       Mask = ((1LLU << 32) - 1) & ~((1LLU << (32 - SH)) - 1);
       Mask >>= 16;
-      NewOpC = MIOpC == PPC::RLWINM ? PPC::ANDISo :PPC::ANDISo8;
+      NewOpC = MIOpC == PPC::RLWINM ? PPC::ANDIS_rec : PPC::ANDIS8_rec;
     }
 
     // If we've set the mask, we can transform.
if (Mask != ~0LLU) { @@ -1966,7 +1950,7 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, int64_t MB = MI->getOperand(3).getImm(); if (MB >= 48) { uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; - NewOpC = PPC::ANDIo8; + NewOpC = PPC::ANDI8_rec; MI->RemoveOperand(3); MI->getOperand(2).setImm(Mask); NumRcRotatesConvertedToRcAnd++; @@ -2306,7 +2290,7 @@ void PPCInstrInfo::replaceInstrWithLI(MachineInstr &MI, // Replace the instruction. if (LII.SetCR) { - MI.setDesc(get(LII.Is64Bit ? PPC::ANDIo8 : PPC::ANDIo)); + MI.setDesc(get(LII.Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); // Set the immediate. MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addImm(LII.Imm).addReg(PPC::CR0, RegState::ImplicitDefine); @@ -2370,15 +2354,13 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI( ImmInstrInfo III; unsigned Opc = MI.getOpcode(); bool ConvertibleImmForm = - Opc == PPC::CMPWI || Opc == PPC::CMPLWI || - Opc == PPC::CMPDI || Opc == PPC::CMPLDI || - Opc == PPC::ADDI || Opc == PPC::ADDI8 || - Opc == PPC::ORI || Opc == PPC::ORI8 || - Opc == PPC::XORI || Opc == PPC::XORI8 || - Opc == PPC::RLDICL || Opc == PPC::RLDICLo || - Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 || - Opc == PPC::RLWINM || Opc == PPC::RLWINMo || - Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; + Opc == PPC::CMPWI || Opc == PPC::CMPLWI || Opc == PPC::CMPDI || + Opc == PPC::CMPLDI || Opc == PPC::ADDI || Opc == PPC::ADDI8 || + Opc == PPC::ORI || Opc == PPC::ORI8 || Opc == PPC::XORI || + Opc == PPC::XORI8 || Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec || + Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 || + Opc == PPC::RLWINM || Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8 || + Opc == PPC::RLWINM8_rec; bool IsVFReg = (MI.getNumOperands() && MI.getOperand(0).isReg()) ? isVFRegister(MI.getOperand(0).getReg()) : false; @@ -2527,6 +2509,225 @@ void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI, "RegNo should be killed or dead"); } +// This opt tries to convert the following imm form to an index form to save +// an add for stack variables. +// Return false if no such pattern is found. +// +// ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, OffsetAddi +// ADD instr: ToBeDeletedReg = ADD ToBeChangedReg(killed), ScaleReg +// Imm instr: Reg = op OffsetImm, ToBeDeletedReg(killed) +// +// can be converted to: +// +// new ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, (OffsetAddi + OffsetImm) +// Index instr: Reg = opx ScaleReg, ToBeChangedReg(killed) +// +// In order to eliminate the ADD instr, make sure that: +// 1: (OffsetAddi + OffsetImm) must fit in int16, since this offset will be +// used in the new ADDI instr and ADDI can only take an int16 Imm. +// 2: ToBeChangedReg must be killed in the ADD instr, with no other use between +// the ADDI and ADD instrs, since its original def in the ADDI will be changed +// in the new ADDI instr. There must also be no new def of it between the ADD +// and Imm instrs, as ToBeChangedReg will be used in the Index instr. +// 3: ToBeDeletedReg must be killed in the Imm instr, with no other use between +// the ADD and Imm instrs, since the ADD instr will be eliminated. +// 4: ScaleReg must not be redefined between the ADD and Imm instrs, since it +// will be moved to the Index instr. +// (A worked example is sketched just after this hunk.) +bool PPCInstrInfo::foldFrameOffset(MachineInstr &MI) const { + MachineFunction *MF = MI.getParent()->getParent(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + bool PostRA = !MRI->isSSA(); + // Do this opt after PEI, which runs after RA.
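To make the transformation described above concrete, here is an illustrative before/after in PPC assembly (hypothetical registers and offsets, not taken from this patch):

// Before:                               After:
//   addi 5, 1, 100  ; r5 = r1 + 100       addi 5, 1, 132  ; 100 + 32 folded in
//   add  4, 5, 6    ; r4 = r5 + r6        lwzx 3, 6, 5    ; indexed (X-form) load
//   lwz  3, 32(4)   ; r3 = load 32(r4)
// The fold is only legal while the combined displacement still fits the
// signed 16-bit addi immediate field, i.e. isInt<16>(100 + 32) here.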
The reason is that stack slot expansion + // in PEI may expose such opportunities, since it is in PEI that stack slot + // offsets from the frame base (OffsetAddi) are determined. + if (!PostRA) + return false; + unsigned ToBeDeletedReg = 0; + int64_t OffsetImm = 0; + unsigned XFormOpcode = 0; + ImmInstrInfo III; + + // Check if the Imm instr meets the requirements. + if (!isImmInstrEligibleForFolding(MI, ToBeDeletedReg, XFormOpcode, OffsetImm, + III)) + return false; + + bool OtherIntermediateUse = false; + MachineInstr *ADDMI = getDefMIPostRA(ToBeDeletedReg, MI, OtherIntermediateUse); + + // Exit if there is another use between the ADD and Imm instrs, or no def is found. + if (OtherIntermediateUse || !ADDMI) + return false; + + // Check if the ADD instr meets the requirements. + if (!isADDInstrEligibleForFolding(*ADDMI)) + return false; + + unsigned ScaleRegIdx = 0; + int64_t OffsetAddi = 0; + MachineInstr *ADDIMI = nullptr; + + // Check if there is a valid ToBeChangedReg in ADDMI. + // 1: It must be killed. + // 2: Its definition must be a valid ADDIMI. + // 3: It must satisfy the int16 offset requirement. + if (isValidToBeChangedReg(ADDMI, 1, ADDIMI, OffsetAddi, OffsetImm)) + ScaleRegIdx = 2; + else if (isValidToBeChangedReg(ADDMI, 2, ADDIMI, OffsetAddi, OffsetImm)) + ScaleRegIdx = 1; + else + return false; + + assert(ADDIMI && "There should be ADDIMI for valid ToBeChangedReg."); + unsigned ToBeChangedReg = ADDIMI->getOperand(0).getReg(); + unsigned ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg(); + auto NewDefFor = [&](unsigned Reg, MachineBasicBlock::iterator Start, + MachineBasicBlock::iterator End) { + for (auto It = ++Start; It != End; It++) + if (It->modifiesRegister(Reg, &getRegisterInfo())) + return true; + return false; + }; + // Make sure there is no other def of ToBeChangedReg or ScaleReg between the + // ADD instr and the Imm instr. + if (NewDefFor(ToBeChangedReg, *ADDMI, MI) || NewDefFor(ScaleReg, *ADDMI, MI)) + return false; + + // Now perform the transformation. + LLVM_DEBUG(dbgs() << "Replace instruction: " + << "\n"); + LLVM_DEBUG(ADDIMI->dump()); + LLVM_DEBUG(ADDMI->dump()); + LLVM_DEBUG(MI.dump()); + LLVM_DEBUG(dbgs() << "with: " + << "\n"); + + // Update the ADDI instr. + ADDIMI->getOperand(2).setImm(OffsetAddi + OffsetImm); + + // Update the Imm instr. + MI.setDesc(get(XFormOpcode)); + MI.getOperand(III.ImmOpNo) + .ChangeToRegister(ScaleReg, false, false, + ADDMI->getOperand(ScaleRegIdx).isKill()); + + MI.getOperand(III.OpNoForForwarding) + .ChangeToRegister(ToBeChangedReg, false, false, true); + + // Eliminate the ADD instr. + ADDMI->eraseFromParent(); + + LLVM_DEBUG(ADDIMI->dump()); + LLVM_DEBUG(MI.dump()); + + return true; +} + +bool PPCInstrInfo::isADDIInstrEligibleForFolding(MachineInstr &ADDIMI, + int64_t &Imm) const { + unsigned Opc = ADDIMI.getOpcode(); + + // Exit if the instruction is not ADDI. + if (Opc != PPC::ADDI && Opc != PPC::ADDI8) + return false; + + Imm = ADDIMI.getOperand(2).getImm(); + + return true; +} + +bool PPCInstrInfo::isADDInstrEligibleForFolding(MachineInstr &ADDMI) const { + unsigned Opc = ADDMI.getOpcode(); + + // Exit if the instruction is not ADD. + return Opc == PPC::ADD4 || Opc == PPC::ADD8; +} + +bool PPCInstrInfo::isImmInstrEligibleForFolding(MachineInstr &MI, + unsigned &ToBeDeletedReg, + unsigned &XFormOpcode, + int64_t &OffsetImm, + ImmInstrInfo &III) const { + // Only handle load/store. + if (!MI.mayLoadOrStore()) + return false; + + unsigned Opc = MI.getOpcode(); + + XFormOpcode = RI.getMappedIdxOpcForImmOpc(Opc); + + // Exit if the instruction has no index form.
+ if (XFormOpcode == PPC::INSTRUCTION_LIST_END) + return false; + + // TODO: sync the logic between instrHasImmForm() and ImmToIdxMap. + if (!instrHasImmForm(XFormOpcode, isVFRegister(MI.getOperand(0).getReg()), + III, true)) + return false; + + if (!III.IsSummingOperands) + return false; + + MachineOperand ImmOperand = MI.getOperand(III.ImmOpNo); + MachineOperand RegOperand = MI.getOperand(III.OpNoForForwarding); + // Only support imm operands, not relocation slots or others. + if (!ImmOperand.isImm()) + return false; + + assert(RegOperand.isReg() && "Instruction format is not right"); + + // If there are other uses of ToBeDeletedReg after the Imm instr, we cannot delete it. + if (!RegOperand.isKill()) + return false; + + ToBeDeletedReg = RegOperand.getReg(); + OffsetImm = ImmOperand.getImm(); + + return true; +} + +bool PPCInstrInfo::isValidToBeChangedReg(MachineInstr *ADDMI, unsigned Index, + MachineInstr *&ADDIMI, + int64_t &OffsetAddi, + int64_t OffsetImm) const { + assert((Index == 1 || Index == 2) && "Invalid operand index for add."); + MachineOperand &MO = ADDMI->getOperand(Index); + + if (!MO.isKill()) + return false; + + bool OtherIntermediateUse = false; + + ADDIMI = getDefMIPostRA(MO.getReg(), *ADDMI, OtherIntermediateUse); + // Currently we handle only the single "add + Imm instr" pair case; exit if + // another intermediate use of ToBeChangedReg is found. + // TODO: handle cases where there are other "add + Imm instr" pairs + // with the same offset in the Imm instr, such as: + // + // ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, OffsetAddi + // ADD instr1: ToBeDeletedReg1 = ADD ToBeChangedReg, ScaleReg1 + // Imm instr1: Reg1 = op1 OffsetImm, ToBeDeletedReg1(killed) + // ADD instr2: ToBeDeletedReg2 = ADD ToBeChangedReg(killed), ScaleReg2 + // Imm instr2: Reg2 = op2 OffsetImm, ToBeDeletedReg2(killed) + // + // can be converted to: + // + // new ADDI instr: ToBeChangedReg = ADDI FrameBaseReg, + // (OffsetAddi + OffsetImm) + // Index instr1: Reg1 = opx1 ScaleReg1, ToBeChangedReg + // Index instr2: Reg2 = opx2 ScaleReg2, ToBeChangedReg(killed) + if (OtherIntermediateUse || !ADDIMI) + return false; + // Check if the ADDI instr meets the requirements. + if (!isADDIInstrEligibleForFolding(*ADDIMI, OffsetAddi)) + return false; + + if (isInt<16>(OffsetAddi + OffsetImm)) + return true; + return false; +} + // If this instruction has an immediate form and one of its operands is a // result of a load-immediate or an add-immediate, convert it to // the immediate form if the constant is in range. @@ -2660,34 +2861,34 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, return false; } case PPC::RLDICL: - case PPC::RLDICLo: + case PPC::RLDICL_rec: case PPC::RLDICL_32: case PPC::RLDICL_32_64: { // Use APInt's rotate function. int64_t SH = MI.getOperand(2).getImm(); int64_t MB = MI.getOperand(3).getImm(); - APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICLo) ? - 64 : 32, SExtImm, true); + APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec) ? 64 : 32, + SExtImm, true); InVal = InVal.rotl(SH); uint64_t Mask = (1LLU << (63 - MB + 1)) - 1; InVal &= Mask; // Can't replace negative values with an LI as that will sign-extend // and not clear the left bits. If we're setting the CR bit, we will use - // ANDIo which won't sign extend, so that's safe. + // ANDI_rec which won't sign extend, so that's safe.
if (isUInt<15>(InVal.getSExtValue()) || - (Opc == PPC::RLDICLo && isUInt<16>(InVal.getSExtValue()))) { + (Opc == PPC::RLDICL_rec && isUInt<16>(InVal.getSExtValue()))) { ReplaceWithLI = true; Is64BitLI = Opc != PPC::RLDICL_32; NewImm = InVal.getSExtValue(); - SetCR = Opc == PPC::RLDICLo; + SetCR = Opc == PPC::RLDICL_rec; break; } return false; } case PPC::RLWINM: case PPC::RLWINM8: - case PPC::RLWINMo: - case PPC::RLWINM8o: { + case PPC::RLWINM_rec: + case PPC::RLWINM8_rec: { int64_t SH = MI.getOperand(2).getImm(); int64_t MB = MI.getOperand(3).getImm(); int64_t ME = MI.getOperand(4).getImm(); @@ -2698,15 +2899,15 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, InVal &= Mask; // Can't replace negative values with an LI as that will sign-extend // and not clear the left bits. If we're setting the CR bit, we will use - // ANDIo which won't sign extend, so that's safe. + // ANDI_rec which won't sign extend, so that's safe. bool ValueFits = isUInt<15>(InVal.getSExtValue()); - ValueFits |= ((Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o) && + ValueFits |= ((Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec) && isUInt<16>(InVal.getSExtValue())); if (ValueFits) { ReplaceWithLI = true; - Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o; + Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8_rec; NewImm = InVal.getSExtValue(); - SetCR = Opc == PPC::RLWINMo || Opc == PPC::RLWINM8o; + SetCR = Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec; break; } return false; @@ -2768,7 +2969,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI, LII.Is64Bit = Is64BitLI; LII.SetCR = SetCR; // If we're setting the CR, the original load-immediate must be kept (as an - // operand to ANDIo/ANDI8o). + // operand to ANDI_rec/ANDI8_rec). if (KilledDef && SetCR) *KilledDef = nullptr; replaceInstrWithLI(MI, LII); @@ -2819,13 +3020,13 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, III.IsSummingOperands = true; III.ImmOpcode = Opc == PPC::ADDC ? PPC::ADDIC : PPC::ADDIC8; break; - case PPC::ADDCo: + case PPC::ADDC_rec: III.SignedImm = true; III.ZeroIsSpecialOrig = 0; III.ZeroIsSpecialNew = 0; III.IsCommutative = true; III.IsSummingOperands = true; - III.ImmOpcode = PPC::ADDICo; + III.ImmOpcode = PPC::ADDIC_rec; break; case PPC::SUBFC: case PPC::SUBFC8: @@ -2851,8 +3052,8 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, III.IsCommutative = false; III.ImmOpcode = Opc == PPC::CMPLW ? 
PPC::CMPLWI : PPC::CMPLDI; break; - case PPC::ANDo: - case PPC::AND8o: + case PPC::AND_rec: + case PPC::AND8_rec: case PPC::OR: case PPC::OR8: case PPC::XOR: @@ -2863,8 +3064,12 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, III.IsCommutative = true; switch(Opc) { default: llvm_unreachable("Unknown opcode"); - case PPC::ANDo: III.ImmOpcode = PPC::ANDIo; break; - case PPC::AND8o: III.ImmOpcode = PPC::ANDIo8; break; + case PPC::AND_rec: + III.ImmOpcode = PPC::ANDI_rec; + break; + case PPC::AND8_rec: + III.ImmOpcode = PPC::ANDI8_rec; + break; case PPC::OR: III.ImmOpcode = PPC::ORI; break; case PPC::OR8: III.ImmOpcode = PPC::ORI8; break; case PPC::XOR: III.ImmOpcode = PPC::XORI; break; @@ -2873,18 +3078,18 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, break; case PPC::RLWNM: case PPC::RLWNM8: - case PPC::RLWNMo: - case PPC::RLWNM8o: + case PPC::RLWNM_rec: + case PPC::RLWNM8_rec: case PPC::SLW: case PPC::SLW8: - case PPC::SLWo: - case PPC::SLW8o: + case PPC::SLW_rec: + case PPC::SLW8_rec: case PPC::SRW: case PPC::SRW8: - case PPC::SRWo: - case PPC::SRW8o: + case PPC::SRW_rec: + case PPC::SRW8_rec: case PPC::SRAW: - case PPC::SRAWo: + case PPC::SRAW_rec: III.SignedImm = false; III.ZeroIsSpecialOrig = 0; III.ZeroIsSpecialNew = 0; @@ -2894,8 +3099,8 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, // This does not apply to shift right algebraic because a value // out of range will produce a -1/0. III.ImmWidth = 16; - if (Opc == PPC::RLWNM || Opc == PPC::RLWNM8 || - Opc == PPC::RLWNMo || Opc == PPC::RLWNM8o) + if (Opc == PPC::RLWNM || Opc == PPC::RLWNM8 || Opc == PPC::RLWNM_rec || + Opc == PPC::RLWNM8_rec) III.TruncateImmTo = 5; else III.TruncateImmTo = 6; @@ -2903,38 +3108,50 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, default: llvm_unreachable("Unknown opcode"); case PPC::RLWNM: III.ImmOpcode = PPC::RLWINM; break; case PPC::RLWNM8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::RLWNMo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::RLWNM8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::RLWNM_rec: + III.ImmOpcode = PPC::RLWINM_rec; + break; + case PPC::RLWNM8_rec: + III.ImmOpcode = PPC::RLWINM8_rec; + break; case PPC::SLW: III.ImmOpcode = PPC::RLWINM; break; case PPC::SLW8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::SLWo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::SLW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SLW_rec: + III.ImmOpcode = PPC::RLWINM_rec; + break; + case PPC::SLW8_rec: + III.ImmOpcode = PPC::RLWINM8_rec; + break; case PPC::SRW: III.ImmOpcode = PPC::RLWINM; break; case PPC::SRW8: III.ImmOpcode = PPC::RLWINM8; break; - case PPC::SRWo: III.ImmOpcode = PPC::RLWINMo; break; - case PPC::SRW8o: III.ImmOpcode = PPC::RLWINM8o; break; + case PPC::SRW_rec: + III.ImmOpcode = PPC::RLWINM_rec; + break; + case PPC::SRW8_rec: + III.ImmOpcode = PPC::RLWINM8_rec; + break; case PPC::SRAW: III.ImmWidth = 5; III.TruncateImmTo = 0; III.ImmOpcode = PPC::SRAWI; break; - case PPC::SRAWo: + case PPC::SRAW_rec: III.ImmWidth = 5; III.TruncateImmTo = 0; - III.ImmOpcode = PPC::SRAWIo; + III.ImmOpcode = PPC::SRAWI_rec; break; } break; case PPC::RLDCL: - case PPC::RLDCLo: + case PPC::RLDCL_rec: case PPC::RLDCR: - case PPC::RLDCRo: + case PPC::RLDCR_rec: case PPC::SLD: - case PPC::SLDo: + case PPC::SLD_rec: case PPC::SRD: - case PPC::SRDo: + case PPC::SRD_rec: case PPC::SRAD: - case PPC::SRADo: + case PPC::SRAD_rec: III.SignedImm = false; III.ZeroIsSpecialOrig = 0; III.ZeroIsSpecialNew = 0; @@ -2944,30 
+3161,38 @@ bool PPCInstrInfo::instrHasImmForm(unsigned Opc, bool IsVFReg, // This does not apply to shift right algebraic because a value // out of range will produce a -1/0. III.ImmWidth = 16; - if (Opc == PPC::RLDCL || Opc == PPC::RLDCLo || - Opc == PPC::RLDCR || Opc == PPC::RLDCRo) + if (Opc == PPC::RLDCL || Opc == PPC::RLDCL_rec || Opc == PPC::RLDCR || + Opc == PPC::RLDCR_rec) III.TruncateImmTo = 6; else III.TruncateImmTo = 7; switch(Opc) { default: llvm_unreachable("Unknown opcode"); case PPC::RLDCL: III.ImmOpcode = PPC::RLDICL; break; - case PPC::RLDCLo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::RLDCL_rec: + III.ImmOpcode = PPC::RLDICL_rec; + break; case PPC::RLDCR: III.ImmOpcode = PPC::RLDICR; break; - case PPC::RLDCRo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::RLDCR_rec: + III.ImmOpcode = PPC::RLDICR_rec; + break; case PPC::SLD: III.ImmOpcode = PPC::RLDICR; break; - case PPC::SLDo: III.ImmOpcode = PPC::RLDICRo; break; + case PPC::SLD_rec: + III.ImmOpcode = PPC::RLDICR_rec; + break; case PPC::SRD: III.ImmOpcode = PPC::RLDICL; break; - case PPC::SRDo: III.ImmOpcode = PPC::RLDICLo; break; + case PPC::SRD_rec: + III.ImmOpcode = PPC::RLDICL_rec; + break; case PPC::SRAD: III.ImmWidth = 6; III.TruncateImmTo = 0; III.ImmOpcode = PPC::SRADI; break; - case PPC::SRADo: + case PPC::SRAD_rec: III.ImmWidth = 6; III.TruncateImmTo = 0; - III.ImmOpcode = PPC::SRADIo; + III.ImmOpcode = PPC::SRADI_rec; break; } break; @@ -3538,14 +3763,16 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, ForwardKilledOperandReg = MI.getOperand(ConstantOpNo).getReg(); unsigned Opc = MI.getOpcode(); - bool SpecialShift32 = - Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo; - bool SpecialShift64 = - Opc == PPC::SLD || Opc == PPC::SLDo || Opc == PPC::SRD || Opc == PPC::SRDo; - bool SetCR = Opc == PPC::SLWo || Opc == PPC::SRWo || - Opc == PPC::SLDo || Opc == PPC::SRDo; - bool RightShift = - Opc == PPC::SRW || Opc == PPC::SRWo || Opc == PPC::SRD || Opc == PPC::SRDo; + bool SpecialShift32 = Opc == PPC::SLW || Opc == PPC::SLW_rec || + Opc == PPC::SRW || Opc == PPC::SRW_rec || + Opc == PPC::SLW8 || Opc == PPC::SLW8_rec || + Opc == PPC::SRW8 || Opc == PPC::SRW8_rec; + bool SpecialShift64 = Opc == PPC::SLD || Opc == PPC::SLD_rec || + Opc == PPC::SRD || Opc == PPC::SRD_rec; + bool SetCR = Opc == PPC::SLW_rec || Opc == PPC::SRW_rec || + Opc == PPC::SLD_rec || Opc == PPC::SRD_rec; + bool RightShift = Opc == PPC::SRW || Opc == PPC::SRW_rec || Opc == PPC::SRD || + Opc == PPC::SRD_rec; MI.setDesc(get(III.ImmOpcode)); if (ConstantOpNo == III.OpNoForForwarding) { @@ -3649,27 +3876,21 @@ int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) { // i.e. 0 to 31-th bits are same as 32-th bit. 
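Since the comment above uses IBM MSB-0 bit numbering, a restatement in LSB-0 terms may help (a hedged sketch, not from this patch; the helper name is invented): the property holds when bits 63..32 of the 64-bit value are all copies of bit 31.

#include <cstdint>
// True exactly when V is the sign extension of its own low 32 bits.
static bool isSignExtended64(int64_t V) {
  return V == static_cast<int32_t>(V);
}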
static bool isSignExtendingOp(const MachineInstr &MI) { int Opcode = MI.getOpcode(); - if (Opcode == PPC::LI || Opcode == PPC::LI8 || - Opcode == PPC::LIS || Opcode == PPC::LIS8 || - Opcode == PPC::SRAW || Opcode == PPC::SRAWo || - Opcode == PPC::SRAWI || Opcode == PPC::SRAWIo || - Opcode == PPC::LWA || Opcode == PPC::LWAX || - Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 || - Opcode == PPC::LHA || Opcode == PPC::LHAX || - Opcode == PPC::LHA8 || Opcode == PPC::LHAX8 || - Opcode == PPC::LBZ || Opcode == PPC::LBZX || - Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || - Opcode == PPC::LBZU || Opcode == PPC::LBZUX || - Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || - Opcode == PPC::LHZ || Opcode == PPC::LHZX || - Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || - Opcode == PPC::LHZU || Opcode == PPC::LHZUX || - Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || - Opcode == PPC::EXTSB || Opcode == PPC::EXTSBo || - Opcode == PPC::EXTSH || Opcode == PPC::EXTSHo || - Opcode == PPC::EXTSB8 || Opcode == PPC::EXTSH8 || - Opcode == PPC::EXTSW || Opcode == PPC::EXTSWo || - Opcode == PPC::SETB || Opcode == PPC::SETB8 || + if (Opcode == PPC::LI || Opcode == PPC::LI8 || Opcode == PPC::LIS || + Opcode == PPC::LIS8 || Opcode == PPC::SRAW || Opcode == PPC::SRAW_rec || + Opcode == PPC::SRAWI || Opcode == PPC::SRAWI_rec || Opcode == PPC::LWA || + Opcode == PPC::LWAX || Opcode == PPC::LWA_32 || Opcode == PPC::LWAX_32 || + Opcode == PPC::LHA || Opcode == PPC::LHAX || Opcode == PPC::LHA8 || + Opcode == PPC::LHAX8 || Opcode == PPC::LBZ || Opcode == PPC::LBZX || + Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || Opcode == PPC::LBZU || + Opcode == PPC::LBZUX || Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || + Opcode == PPC::LHZ || Opcode == PPC::LHZX || Opcode == PPC::LHZ8 || + Opcode == PPC::LHZX8 || Opcode == PPC::LHZU || Opcode == PPC::LHZUX || + Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || Opcode == PPC::EXTSB || + Opcode == PPC::EXTSB_rec || Opcode == PPC::EXTSH || + Opcode == PPC::EXTSH_rec || Opcode == PPC::EXTSB8 || + Opcode == PPC::EXTSH8 || Opcode == PPC::EXTSW || + Opcode == PPC::EXTSW_rec || Opcode == PPC::SETB || Opcode == PPC::SETB8 || Opcode == PPC::EXTSH8_32_64 || Opcode == PPC::EXTSW_32_64 || Opcode == PPC::EXTSB8_32_64) return true; @@ -3677,8 +3898,8 @@ static bool isSignExtendingOp(const MachineInstr &MI) { if (Opcode == PPC::RLDICL && MI.getOperand(3).getImm() >= 33) return true; - if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || - Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo) && + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINM_rec || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNM_rec) && MI.getOperand(3).getImm() > 0 && MI.getOperand(3).getImm() <= MI.getOperand(4).getImm()) return true; @@ -3701,52 +3922,46 @@ static bool isZeroExtendingOp(const MachineInstr &MI) { // We have some variations of rotate-and-mask instructions // that clear higher 32-bits. 
- if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo || - Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo || + if ((Opcode == PPC::RLDICL || Opcode == PPC::RLDICL_rec || + Opcode == PPC::RLDCL || Opcode == PPC::RLDCL_rec || Opcode == PPC::RLDICL_32_64) && MI.getOperand(3).getImm() >= 32) return true; - if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) && + if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDIC_rec) && MI.getOperand(3).getImm() >= 32 && MI.getOperand(3).getImm() <= 63 - MI.getOperand(2).getImm()) return true; - if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || - Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo || + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINM_rec || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNM_rec || Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) && MI.getOperand(3).getImm() <= MI.getOperand(4).getImm()) return true; // There are other instructions that clear higher 32-bits. - if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo || - Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo || + if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZW_rec || + Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZW_rec || Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8 || - Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo || - Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo || - Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW || - Opcode == PPC::SLW || Opcode == PPC::SLWo || - Opcode == PPC::SRW || Opcode == PPC::SRWo || - Opcode == PPC::SLW8 || Opcode == PPC::SRW8 || - Opcode == PPC::SLWI || Opcode == PPC::SLWIo || - Opcode == PPC::SRWI || Opcode == PPC::SRWIo || - Opcode == PPC::LWZ || Opcode == PPC::LWZX || - Opcode == PPC::LWZU || Opcode == PPC::LWZUX || - Opcode == PPC::LWBRX || Opcode == PPC::LHBRX || - Opcode == PPC::LHZ || Opcode == PPC::LHZX || - Opcode == PPC::LHZU || Opcode == PPC::LHZUX || - Opcode == PPC::LBZ || Opcode == PPC::LBZX || - Opcode == PPC::LBZU || Opcode == PPC::LBZUX || - Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 || - Opcode == PPC::LWZU8 || Opcode == PPC::LWZUX8 || - Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 || - Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || - Opcode == PPC::LHZU8 || Opcode == PPC::LHZUX8 || - Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || - Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || - Opcode == PPC::ANDIo || Opcode == PPC::ANDISo || - Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWIo || - Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWIo || + Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZD_rec || + Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZD_rec || + Opcode == PPC::POPCNTD || Opcode == PPC::POPCNTW || Opcode == PPC::SLW || + Opcode == PPC::SLW_rec || Opcode == PPC::SRW || Opcode == PPC::SRW_rec || + Opcode == PPC::SLW8 || Opcode == PPC::SRW8 || Opcode == PPC::SLWI || + Opcode == PPC::SLWI_rec || Opcode == PPC::SRWI || + Opcode == PPC::SRWI_rec || Opcode == PPC::LWZ || Opcode == PPC::LWZX || + Opcode == PPC::LWZU || Opcode == PPC::LWZUX || Opcode == PPC::LWBRX || + Opcode == PPC::LHBRX || Opcode == PPC::LHZ || Opcode == PPC::LHZX || + Opcode == PPC::LHZU || Opcode == PPC::LHZUX || Opcode == PPC::LBZ || + Opcode == PPC::LBZX || Opcode == PPC::LBZU || Opcode == PPC::LBZUX || + Opcode == PPC::LWZ8 || Opcode == PPC::LWZX8 || Opcode == PPC::LWZU8 || + Opcode == PPC::LWZUX8 || Opcode == PPC::LWBRX8 || Opcode == PPC::LHBRX8 || + Opcode == PPC::LHZ8 || Opcode == PPC::LHZX8 || Opcode == PPC::LHZU8 || + Opcode == PPC::LHZUX8 || Opcode == PPC::LBZ8 || Opcode == PPC::LBZX8 || + Opcode == PPC::LBZU8 || Opcode == PPC::LBZUX8 || + 
Opcode == PPC::ANDI_rec || Opcode == PPC::ANDIS_rec || + Opcode == PPC::ROTRWI || Opcode == PPC::ROTRWI_rec || + Opcode == PPC::EXTLWI || Opcode == PPC::EXTLWI_rec || Opcode == PPC::MFVSRWZ) return true; @@ -3840,14 +4055,14 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt, return false; } - case PPC::ANDIo: - case PPC::ANDISo: + case PPC::ANDI_rec: + case PPC::ANDIS_rec: case PPC::ORI: case PPC::ORIS: case PPC::XORI: case PPC::XORIS: - case PPC::ANDIo8: - case PPC::ANDISo8: + case PPC::ANDI8_rec: + case PPC::ANDIS8_rec: case PPC::ORI8: case PPC::ORIS8: case PPC::XORI8: @@ -4042,12 +4257,10 @@ MachineInstr *PPCInstrInfo::findLoopInstr( // Return true if get the base operand, byte offset of an instruction and the // memory width. Width is the size of memory that is being loaded/stored. bool PPCInstrInfo::getMemOperandWithOffsetWidth( - const MachineInstr &LdSt, - const MachineOperand *&BaseReg, - int64_t &Offset, - unsigned &Width, - const TargetRegisterInfo *TRI) const { - assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); + const MachineInstr &LdSt, const MachineOperand *&BaseReg, int64_t &Offset, + unsigned &Width, const TargetRegisterInfo *TRI) const { + if (!LdSt.mayLoadOrStore()) + return false; // Handle only loads/stores with base register followed by immediate offset. if (LdSt.getNumExplicitOperands() != 3) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 19ab30cb0908..2fe8df0e1d68 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -280,7 +280,7 @@ public: unsigned FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -346,8 +346,6 @@ public: bool DefinesPredicate(MachineInstr &MI, std::vector &Pred) const override; - bool isPredicable(const MachineInstr &MI) const override; - // Comparison optimization. bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, @@ -422,6 +420,16 @@ public: bool convertToImmediateForm(MachineInstr &MI, MachineInstr **KilledDef = nullptr) const; + bool foldFrameOffset(MachineInstr &MI) const; + bool isADDIInstrEligibleForFolding(MachineInstr &ADDIMI, int64_t &Imm) const; + bool isADDInstrEligibleForFolding(MachineInstr &ADDMI) const; + bool isImmInstrEligibleForFolding(MachineInstr &MI, unsigned &BaseReg, + unsigned &XFormOpcode, + int64_t &OffsetOfImmInstr, + ImmInstrInfo &III) const; + bool isValidToBeChangedReg(MachineInstr *ADDMI, unsigned Index, + MachineInstr *&ADDIMI, int64_t &OffsetAddi, + int64_t OffsetImm) const; /// Fixup killed/dead flag for register \p RegNo between instructions [\p /// StartMI, \p EndMI]. 
Some PostRA transformations may violate register diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 24183277519b..b38ca3af63f5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -61,10 +61,6 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; -def SDT_PPCVecReverse: SDTypeProfile<1, 1, [ SDTCisVec<0>, - SDTCisVec<1> -]>; - def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> ]>; @@ -117,6 +113,10 @@ def SDT_PPCextswsli : SDTypeProfile<1, 2, [ // extswsli SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisInt<2> ]>; +def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> +]>; + //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. // @@ -165,7 +165,8 @@ def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; - +def PPCxsmaxc : SDNode<"PPCISD::XSMAXCDP", SDT_PPCFPMinMax, []>; +def PPCxsminc : SDNode<"PPCISD::XSMINCDP", SDT_PPCFPMinMax, []>; def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, @@ -199,7 +200,6 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; -def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>; def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; @@ -486,7 +486,7 @@ def mul_without_simm16 : BinOpWithoutSImm16Operand; // PowerPC Flag Definitions. class isPPC64 { bit PPC64 = 1; } -class isDOT { bit RC = 1; } +class isRecordForm { bit RC = 1; } class RegConstraint { string Constraints = C; @@ -961,9 +961,9 @@ multiclass XForm_6r opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XForm_6, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -976,9 +976,9 @@ multiclass XForm_6rc opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CARRY, CR0] in - def o : XForm_6, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -991,9 +991,9 @@ multiclass XForm_10rc opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CARRY, CR0] in - def o : XForm_10, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1005,9 +1005,9 @@ multiclass XForm_11r opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XForm_11, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1019,9 +1019,35 @@ multiclass XOForm_1r opcode, bits<9> xo, bit oe, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XOForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; + } +} + +// Multiclass for instructions which have a record overflow form as well +// as a record form but no carry (i.e. 
mulld, mulldo, subf, subfo, etc.) +multiclass XOForm_1rx opcode, bits<9> xo, bit oe, dag OOL, dag IOL, + string asmbase, string asmstr, InstrItinClass itin, + list pattern> { + let BaseName = asmbase in { + def NAME : XOForm_1, RecFormRel; + let Defs = [CR0] in + def _rec : XOForm_1, isRecordForm, RecFormRel; + } + let BaseName = !strconcat(asmbase, "O") in { + let Defs = [XER] in + def O : XOForm_1, RecFormRel; + let Defs = [XER, CR0] in + def O_rec : XOForm_1, isRecordForm, RecFormRel; } } @@ -1035,11 +1061,21 @@ multiclass XOForm_1rcr opcode, bits<9> xo, bit oe, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XOForm_1, isDOT, RecFormRel, PPC970_DGroup_First, + []>, isRecordForm, RecFormRel, PPC970_DGroup_First, PPC970_DGroup_Cracked; } + let BaseName = !strconcat(asmbase, "O") in { + let Defs = [XER] in + def O : XOForm_1, RecFormRel; + let Defs = [XER, CR0] in + def O_rec : XOForm_1, isRecordForm, RecFormRel; + } } multiclass XOForm_1rc opcode, bits<9> xo, bit oe, dag OOL, dag IOL, @@ -1051,9 +1087,19 @@ multiclass XOForm_1rc opcode, bits<9> xo, bit oe, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CARRY, CR0] in - def o : XOForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; + } + let BaseName = !strconcat(asmbase, "O") in { + let Defs = [CARRY, XER] in + def O : XOForm_1, RecFormRel; + let Defs = [CARRY, XER, CR0] in + def O_rec : XOForm_1, isRecordForm, RecFormRel; } } @@ -1065,9 +1111,19 @@ multiclass XOForm_3r opcode, bits<9> xo, bit oe, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XOForm_3, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; + } + let BaseName = !strconcat(asmbase, "O") in { + let Defs = [XER] in + def O : XOForm_3, RecFormRel; + let Defs = [XER, CR0] in + def O_rec : XOForm_3, isRecordForm, RecFormRel; } } @@ -1080,9 +1136,19 @@ multiclass XOForm_3rc opcode, bits<9> xo, bit oe, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CARRY, CR0] in - def o : XOForm_3, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; + } + let BaseName = !strconcat(asmbase, "O") in { + let Defs = [CARRY, XER] in + def O : XOForm_3, RecFormRel; + let Defs = [CARRY, XER, CR0] in + def O_rec : XOForm_3, isRecordForm, RecFormRel; } } @@ -1094,9 +1160,9 @@ multiclass MForm_2r opcode, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : MForm_2, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1108,9 +1174,9 @@ multiclass MDForm_1r opcode, bits<3> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : MDForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1122,9 +1188,9 @@ multiclass MDSForm_1r opcode, bits<4> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : MDSForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1137,9 +1203,9 @@ multiclass XSForm_1rc opcode, bits<9> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CARRY, CR0] in - def o : XSForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1151,9 +1217,9 @@ multiclass XSForm_1r opcode, bits<9> xo, dag OOL, dag IOL, !strconcat(asmbase, 
!strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR0] in - def o : XSForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1165,9 +1231,9 @@ multiclass XForm_26r opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR1] in - def o : XForm_26, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1179,9 +1245,9 @@ multiclass XForm_28r opcode, bits<10> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR1] in - def o : XForm_28, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1193,9 +1259,9 @@ multiclass AForm_1r opcode, bits<5> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR1] in - def o : AForm_1, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1207,9 +1273,9 @@ multiclass AForm_2r opcode, bits<5> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR1] in - def o : AForm_2, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1221,9 +1287,9 @@ multiclass AForm_3r opcode, bits<5> xo, dag OOL, dag IOL, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, pattern>, RecFormRel; let Defs = [CR1] in - def o : AForm_3, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } } @@ -1324,12 +1390,13 @@ def RESTORE_CRBIT : PPCEmitTimePseudo<(outs crbitrc:$cond), (ins memri:$F), } let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in { - let isReturn = 1, Uses = [LR, RM] in + let isPredicable = 1, isReturn = 1, Uses = [LR, RM] in def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, [(retflag)]>, Requires<[In32BitMode]>; let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in { - def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, - []>; + let isPredicable = 1 in + def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, + []>; let isCodeGenOnly = 1 in { def BCCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond), @@ -1362,9 +1429,10 @@ let Defs = [LR] in let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in { let isBarrier = 1 in { - def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst), - "b $dst", IIC_BrB, - [(br bb:$dst)]>; + let isPredicable = 1 in + def B : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst), + "b $dst", IIC_BrB, + [(br bb:$dst)]>; def BA : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst), "ba $dst", IIC_BrB, []>; } @@ -1485,9 +1553,10 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { } } let Uses = [CTR, RM] in { - def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), - "bctrl", IIC_BrB, [(PPCbctrl)]>, - Requires<[In32BitMode]>; + let isPredicable = 1 in + def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl)]>, + Requires<[In32BitMode]>; let isCodeGenOnly = 1 in { def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond), @@ -1574,6 +1643,15 @@ def TCRETURNri : PPCEmitTimePseudo<(outs), (ins CTRRC:$dst, i32imm:$offset), "#TC_RETURNr $dst $offset", []>; +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2], Uses = [CTR, RM], RST = 2 in { + def BCTRL_LWZinto_toc: + XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), + (ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB, + [(PPCbctrl_load_toc iaddr:$src)]>, Requires<[In32BitMode]>; + +} + let isCodeGenOnly = 1 in { @@ -1839,15 +1917,15 @@ def LWARX : XForm_1_memOp<31, 
20, (outs gprc:$rD), (ins memrr:$src), // Instructions to support lock versions of atomics // (EH=1 - see Power ISA 2.07 Book II 4.4.2) def LBARXL : XForm_1_memOp<31, 52, (outs gprc:$rD), (ins memrr:$src), - "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + "lbarx $rD, $src, 1", IIC_LdStLWARX, []>, isRecordForm, Requires<[HasPartwordAtomics]>; def LHARXL : XForm_1_memOp<31, 116, (outs gprc:$rD), (ins memrr:$src), - "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT, + "lharx $rD, $src, 1", IIC_LdStLWARX, []>, isRecordForm, Requires<[HasPartwordAtomics]>; def LWARXL : XForm_1_memOp<31, 20, (outs gprc:$rD), (ins memrr:$src), - "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isDOT; + "lwarx $rD, $src, 1", IIC_LdStLWARX, []>, isRecordForm; // The atomic instructions use the destination register as well as the next one // or two registers in order (modulo 31). @@ -1860,14 +1938,14 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC), let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { def STBCX : XForm_1_memOp<31, 694, (outs), (ins gprc:$rS, memrr:$dst), "stbcx. $rS, $dst", IIC_LdStSTWCX, []>, - isDOT, Requires<[HasPartwordAtomics]>; + isRecordForm, Requires<[HasPartwordAtomics]>; def STHCX : XForm_1_memOp<31, 726, (outs), (ins gprc:$rS, memrr:$dst), "sthcx. $rS, $dst", IIC_LdStSTWCX, []>, - isDOT, Requires<[HasPartwordAtomics]>; + isRecordForm, Requires<[HasPartwordAtomics]>; def STWCX : XForm_1_memOp<31, 150, (outs), (ins gprc:$rS, memrr:$dst), - "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT; + "stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isRecordForm; } let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in @@ -2034,6 +2112,7 @@ def LFIWZX : XForm_25_memOp<31, 887, (outs f8rc:$frD), (ins memrr:$src), } // Load Multiple +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src), "lmw $rD, $src", IIC_LdStLMW, []>; @@ -2188,6 +2267,7 @@ def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff), } // Store Multiple +let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst), "stmw $rS, $dst", IIC_LdStLMW, []>; @@ -2221,9 +2301,9 @@ def ADDIC : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>, RecFormRel, PPC970_DGroup_Cracked; let Defs = [CARRY, CR0] in -def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), +def ADDIC_rec : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm), "addic. $rD, $rA, $imm", IIC_IntGeneral, - []>, isDOT, RecFormRel; + []>, isRecordForm, RecFormRel; } def ADDIS : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm), "addis $rD, $rA, $imm", IIC_IntSimple, @@ -2253,14 +2333,14 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { let PPC970_Unit = 1 in { // FXU Operations. let Defs = [CR0] in { -def ANDIo : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), +def ANDI_rec : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), "andi. $dst, $src1, $src2", IIC_IntGeneral, [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>, - isDOT; -def ANDISo : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), + isRecordForm; +def ANDIS_rec : DForm_4<29, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), "andis. 
$dst, $src1, $src2", IIC_IntGeneral, [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>, - isDOT; + isRecordForm; } def ORI : DForm_4<24, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2), "ori $dst, $src1, $src2", IIC_IntSimple, @@ -2621,6 +2701,7 @@ def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS), PPC970_DGroup_First, PPC970_Unit_FXU; } +let hasSideEffects = 0 in { let Defs = [LR] in { def MTLR : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS), "mtlr $rS", IIC_SprMTSPR>, @@ -2631,6 +2712,7 @@ def MFLR : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins), "mflr $rT", IIC_SprMFSPR>, PPC970_DGroup_First, PPC970_Unit_FXU; } +} let isCodeGenOnly = 1 in { // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed @@ -2732,8 +2814,8 @@ let Uses = [RM] in { PPC970_DGroup_Single, PPC970_Unit_FPU; let Defs = [CR1] in - def MFFSo : XForm_42<63, 583, (outs f8rc:$rT), (ins), - "mffs. $rT", IIC_IntMFFS, []>, isDOT; + def MFFS_rec : XForm_42<63, 583, (outs f8rc:$rT), (ins), + "mffs. $rT", IIC_IntMFFS, []>, isRecordForm; def MFFSCE : X_FRT5_XO2_XO3_XO10<63, 0, 1, 583, (outs f8rc:$rT), (ins), "mffsce $rT", IIC_IntMFFS, []>, @@ -2778,9 +2860,9 @@ def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations. // XO-Form instructions. Arithmetic instructions that can set overflow bit let isCommutable = 1 in -defm ADD4 : XOForm_1r<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "add", "$rT, $rA, $rB", IIC_IntSimple, - [(set i32:$rT, (add i32:$rA, i32:$rB))]>; +defm ADD4 : XOForm_1rx<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "add", "$rT, $rA, $rB", IIC_IntSimple, + [(set i32:$rT, (add i32:$rA, i32:$rB))]>; let isCodeGenOnly = 1 in def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$rT), (ins gprc:$rA, tlsreg32:$rB), "add $rT, $rA, $rB", IIC_IntSimple, @@ -2797,24 +2879,14 @@ defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), defm DIVWU : XOForm_1rcr<31, 459, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), "divwu", "$rT, $rA, $rB", IIC_IntDivW, [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>; -def DIVWE : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "divwe $rT, $rA, $rB", IIC_IntDivW, - [(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>, - Requires<[HasExtDiv]>; -let Defs = [CR0] in -def DIVWEo : XOForm_1<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "divwe. $rT, $rA, $rB", IIC_IntDivW, - []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, - Requires<[HasExtDiv]>; -def DIVWEU : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "divweu $rT, $rA, $rB", IIC_IntDivW, - [(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>, - Requires<[HasExtDiv]>; -let Defs = [CR0] in -def DIVWEUo : XOForm_1<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "divweu. 
$rT, $rA, $rB", IIC_IntDivW, - []>, isDOT, PPC970_DGroup_Cracked, PPC970_DGroup_First, - Requires<[HasExtDiv]>; +defm DIVWE : XOForm_1rcr<31, 427, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divwe", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (int_ppc_divwe gprc:$rA, gprc:$rB))]>, + Requires<[HasExtDiv]>; +defm DIVWEU : XOForm_1rcr<31, 395, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "divweu", "$rT, $rA, $rB", IIC_IntDivW, + [(set i32:$rT, (int_ppc_divweu gprc:$rA, gprc:$rB))]>, + Requires<[HasExtDiv]>; let isCommutable = 1 in { defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), "mulhw", "$rT, $rA, $rB", IIC_IntMulHW, @@ -2822,13 +2894,13 @@ defm MULHW : XOForm_1r<31, 75, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), defm MULHWU : XOForm_1r<31, 11, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), "mulhwu", "$rT, $rA, $rB", IIC_IntMulHWU, [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>; -defm MULLW : XOForm_1r<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "mullw", "$rT, $rA, $rB", IIC_IntMulHW, - [(set i32:$rT, (mul i32:$rA, i32:$rB))]>; +defm MULLW : XOForm_1rx<31, 235, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "mullw", "$rT, $rA, $rB", IIC_IntMulHW, + [(set i32:$rT, (mul i32:$rA, i32:$rB))]>; } // isCommutable -defm SUBF : XOForm_1r<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), - "subf", "$rT, $rA, $rB", IIC_IntGeneral, - [(set i32:$rT, (sub i32:$rB, i32:$rA))]>; +defm SUBF : XOForm_1rx<31, 40, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), + "subf", "$rT, $rA, $rB", IIC_IntGeneral, + [(set i32:$rT, (sub i32:$rB, i32:$rA))]>; defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB), "subfc", "$rT, $rA, $rB", IIC_IntGeneral, [(set i32:$rT, (subc i32:$rB, i32:$rA))]>, @@ -2984,10 +3056,10 @@ def RLWINM : MForm_2<21, "rlwinm $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, []>, RecFormRel; let Defs = [CR0] in -def RLWINMo : MForm_2<21, +def RLWINM_rec : MForm_2<21, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME), "rlwinm. $rA, $rS, $SH, $MB, $ME", IIC_IntGeneral, - []>, isDOT, RecFormRel, PPC970_DGroup_Cracked; + []>, isRecordForm, RecFormRel, PPC970_DGroup_Cracked; } defm RLWNM : MForm_2r<23, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB, u5imm:$MB, u5imm:$ME), @@ -3177,6 +3249,11 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr // the function label. def UpdateGBR : PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins gprc:$rI), "#UpdateGBR", []>; +// Pseudo-instruction marked for deletion. When deleting the instruction would +// cause iterator invalidation in MIR transformation passes, this pseudo can be +// used instead. It will be removed unconditionally at pre-emit time (prior to +// branch selection). +def UNENCODED_NOP: PPCEmitTimePseudo<(outs), (ins), "#UNENCODED_NOP", []>; // Standard shifts. These are represented separately from the real shifts above // so that we can distinguish between shifts that allow 5-bit and 6-bit shift @@ -3219,10 +3296,10 @@ def : Pat<(f64 (fpextend f32:$src)), // source: http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html // The rule for seq_cst is duplicated to work with both 64 bits and 32 bits // versions of Power. 
-def : Pat<(atomic_fence (i64 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>; -def : Pat<(atomic_fence (i32 7), (imm)), (SYNC 0)>, Requires<[HasSYNC]>; -def : Pat<(atomic_fence (imm), (imm)), (SYNC 1)>, Requires<[HasSYNC]>; -def : Pat<(atomic_fence (imm), (imm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>; +def : Pat<(atomic_fence (i64 7), (timm)), (SYNC 0)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (i32 7), (timm)), (SYNC 0)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (timm), (timm)), (SYNC 1)>, Requires<[HasSYNC]>; +def : Pat<(atomic_fence (timm), (timm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>; let Predicates = [HasFPU] in { // Additional FNMSUB patterns: -a*c + b == -(a*c - b) @@ -4010,24 +4087,24 @@ def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETUGT)), def : Pat<(v4i32 (selectcc i1:$lhs, i1:$rhs, v4i32:$tval, v4i32:$fval, SETNE)), (SELECT_VRRC (CRXOR $lhs, $rhs), $tval, $fval)>; -def ANDIo_1_EQ_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), - "#ANDIo_1_EQ_BIT", +def ANDI_rec_1_EQ_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDI_rec_1_EQ_BIT", [(set i1:$dst, (trunc (not i32:$in)))]>; -def ANDIo_1_GT_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), - "#ANDIo_1_GT_BIT", +def ANDI_rec_1_GT_BIT : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins gprc:$in), + "#ANDI_rec_1_GT_BIT", [(set i1:$dst, (trunc i32:$in))]>; -def ANDIo_1_EQ_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), - "#ANDIo_1_EQ_BIT8", +def ANDI_rec_1_EQ_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDI_rec_1_EQ_BIT8", [(set i1:$dst, (trunc (not i64:$in)))]>; -def ANDIo_1_GT_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), - "#ANDIo_1_GT_BIT8", +def ANDI_rec_1_GT_BIT8 : PPCCustomInserterPseudo<(outs crbitrc:$dst), (ins g8rc:$in), + "#ANDI_rec_1_GT_BIT8", [(set i1:$dst, (trunc i64:$in))]>; def : Pat<(i1 (not (trunc i32:$in))), - (ANDIo_1_EQ_BIT $in)>; + (ANDI_rec_1_EQ_BIT $in)>; def : Pat<(i1 (not (trunc i64:$in))), - (ANDIo_1_EQ_BIT8 $in)>; + (ANDI_rec_1_EQ_BIT8 $in)>; //===----------------------------------------------------------------------===// // PowerPC Instructions used for assembler/disassembler only @@ -4111,22 +4188,22 @@ def MCRFS : XLForm_3<63, 64, (outs crrc:$BF), (ins crrc:$BFA), def MTFSFI : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), "mtfsfi $BF, $U, $W", IIC_IntMFFS>; -def MTFSFIo : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), - "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isDOT; +def MTFSFI_rec : XLForm_4<63, 134, (outs crrc:$BF), (ins i32imm:$U, i32imm:$W), + "mtfsfi. $BF, $U, $W", IIC_IntMFFS>, isRecordForm; def : InstAlias<"mtfsfi $BF, $U", (MTFSFI crrc:$BF, i32imm:$U, 0)>; -def : InstAlias<"mtfsfi. $BF, $U", (MTFSFIo crrc:$BF, i32imm:$U, 0)>; +def : InstAlias<"mtfsfi. $BF, $U", (MTFSFI_rec crrc:$BF, i32imm:$U, 0)>; let Predicates = [HasFPU] in { def MTFSF : XFLForm_1<63, 711, (outs), (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), "mtfsf $FLM, $FRB, $L, $W", IIC_IntMFFS, []>; -def MTFSFo : XFLForm_1<63, 711, (outs), +def MTFSF_rec : XFLForm_1<63, 711, (outs), (ins i32imm:$FLM, f8rc:$FRB, i32imm:$L, i32imm:$W), - "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isDOT; + "mtfsf. $FLM, $FRB, $L, $W", IIC_IntMFFS, []>, isRecordForm; def : InstAlias<"mtfsf $FLM, $FRB", (MTFSF i32imm:$FLM, f8rc:$FRB, 0, 0)>; -def : InstAlias<"mtfsf. $FLM, $FRB", (MTFSFo i32imm:$FLM, f8rc:$FRB, 0, 0)>; +def : InstAlias<"mtfsf. 
$FLM, $FRB", (MTFSF_rec i32imm:$FLM, f8rc:$FRB, 0, 0)>; } def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB), @@ -4144,8 +4221,8 @@ def SLBMFEV : XLForm_1_gen<31, 851, (outs gprc:$RT), (ins gprc:$RB), def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", IIC_SprSLBIA, []>; let Defs = [CR0] in -def SLBFEEo : XForm_26<31, 979, (outs gprc:$RT), (ins gprc:$RB), - "slbfee. $RT, $RB", IIC_SprSLBFEE, []>, isDOT; +def SLBFEE_rec : XForm_26<31, 979, (outs gprc:$RT), (ins gprc:$RB), + "slbfee. $RT, $RB", IIC_SprSLBFEE, []>, isRecordForm; def TLBIA : XForm_0<31, 370, (outs), (ins), "tlbia", IIC_SprTLBIA, []>; @@ -4188,7 +4265,7 @@ def TLBSX2 : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), def TLBSX2D : XForm_base_r3xo<31, 914, (outs), (ins gprc:$RST, gprc:$A, gprc:$B), "tlbsx. $RST, $A, $B", IIC_LdStLoad, []>, - Requires<[IsPPC4xx]>, isDOT; + Requires<[IsPPC4xx]>, isRecordForm; def RFID : XForm_0<19, 18, (outs), (ins), "rfid", IIC_IntRFID, []>; @@ -4406,10 +4483,10 @@ def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>; def : InstAlias<"xnop", (XORI R0, R0, 0)>; def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; -def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"mr. $rA, $rB", (OR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>; def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>; -def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>; +def : InstAlias<"not. $rA, $rB", (NOR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>; def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>; @@ -4475,13 +4552,13 @@ def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm", (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm", (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; -def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm", +def SUBIC_rec : PPCAsmPseudo<"subic. $rA, $rB, $imm", (ins gprc:$rA, gprc:$rB, s16imm:$imm)>; def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; -def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>; def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>; -def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>; +def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>; def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>; def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>; @@ -4534,109 +4611,109 @@ def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>, def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; -def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b", +def EXTLWI_rec : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; -def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b", +def EXTRWI_rec : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; -def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b", +def INSLWI_rec : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; -def INSRWIo : PPCAsmPseudo<"insrwi. 
$rA, $rS, $n, $b", +def INSRWI_rec : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b", (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>; def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; -def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n", +def ROTRWI_rec : PPCAsmPseudo<"rotrwi. $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; -def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n", +def SLWI_rec : PPCAsmPseudo<"slwi. $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; -def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n", +def SRWI_rec : PPCAsmPseudo<"srwi. $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; -def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n", +def CLRRWI_rec : PPCAsmPseudo<"clrrwi. $rA, $rS, $n", (ins gprc:$rA, gprc:$rS, u5imm:$n)>; def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n", (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; -def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n", +def CLRLSLWI_rec : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n", (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>; def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; -def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; +def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINM_rec gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>; def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; -def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; +def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNM_rec gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>; def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; -def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; +def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINM_rec gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; -def : InstAlias<"cntlzw. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw. $rA, $rS", (CNTLZW_rec gprc:$rA, gprc:$rS)>; // The POWER variant def : MnemonicAlias<"cntlz", "cntlzw">; def : MnemonicAlias<"cntlz.", "cntlzw.">; def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; -def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b", +def EXTLDI_rec : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; -def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b", +def EXTRDI_rec : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; -def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b", +def INSRDI_rec : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; -def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n", +def ROTRDI_rec : PPCAsmPseudo<"rotrdi. $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; -def SLDIo : PPCAsmPseudo<"sldi. 
$rA, $rS, $n", +def SLDI_rec : PPCAsmPseudo<"sldi. $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; -def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n", +def SRDI_rec : PPCAsmPseudo<"srdi. $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; -def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n", +def CLRRDI_rec : PPCAsmPseudo<"clrrdi. $rA, $rS, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>; def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; -def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", +def CLRLSLDI_rec : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n", (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>; def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>; def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; -def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; +def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICL_rec g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>; def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; -def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; +def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCL_rec g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL_32_64 g8rc:$rA, gprc:$rS, 0, u6imm:$n)>; -def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICL_rec g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def : InstAlias<"lnia $RT", (ADDPCIS g8rc:$RT, 0)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; -def RLWINMobm : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b", +def RLWINMbm_rec : PPCAsmPseudo<"rlwinm. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; def RLWIMIbm : PPCAsmPseudo<"rlwimi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; -def RLWIMIobm : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b", +def RLWIMIbm_rec : PPCAsmPseudo<"rlwimi. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; def RLWNMbm : PPCAsmPseudo<"rlwnm $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; -def RLWNMobm : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b", +def RLWNMbm_rec : PPCAsmPseudo<"rlwnm. $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u5imm:$n, i32imm:$b)>; // These generic branch instruction forms are used for the assembler parser only. 
@@ -4865,7 +4942,7 @@ let mayStore = 1 in def CP_PASTE : X_L1_RA5_RB5<31, 902, "paste" , gprc, IIC_LdStPASTE, []>; let mayStore = 1, Defs = [CR0] in -def CP_PASTEo : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isDOT; +def CP_PASTE_rec : X_L1_RA5_RB5<31, 902, "paste.", gprc, IIC_LdStPASTE, []>, isRecordForm; def CP_COPYx : PPCAsmPseudo<"copy $rA, $rB" , (ins gprc:$rA, gprc:$rB)>; def CP_PASTEx : PPCAsmPseudo<"paste $rA, $rB", (ins gprc:$rA, gprc:$rB)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 2aad5860d87f..be6b30ffa08b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -120,11 +120,11 @@ multiclass XX3Form_Rcr opcode, bits<7> xo, string asmbase, !strconcat(asmbase, !strconcat(" ", asmstr)), itin, [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; let Defs = [CR6] in - def o : XX3Form_Rc, - isDOT; + isRecordForm; } } @@ -152,7 +152,6 @@ def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">; let Predicates = [HasVSX] in { let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. let hasSideEffects = 0 in { // VSX instructions don't have side effects. -let Uses = [RM] in { // Load indexed instructions let mayLoad = 1, mayStore = 0 in { @@ -214,6 +213,7 @@ let Uses = [RM] in { } } // mayStore + let Uses = [RM] in { // Add/Mul Instructions let isCommutable = 1 in { def XSADDDP : XX3Form<60, 32, @@ -1255,6 +1255,55 @@ def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))), } // AddedComplexity } // HasVSX +def FpMinMax { + dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC), + (COPY_TO_REGCLASS $B, VSFRC)), + VSSRC); + dag F32Max = (COPY_TO_REGCLASS (XSMAXDP (COPY_TO_REGCLASS $A, VSFRC), + (COPY_TO_REGCLASS $B, VSFRC)), + VSSRC); +} + +let AddedComplexity = 400, Predicates = [HasVSX] in { + // f32 Min. + def : Pat<(f32 (fminnum_ieee f32:$A, f32:$B)), + (f32 FpMinMax.F32Min)>; + def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), f32:$B)), + (f32 FpMinMax.F32Min)>; + def : Pat<(f32 (fminnum_ieee f32:$A, (fcanonicalize f32:$B))), + (f32 FpMinMax.F32Min)>; + def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))), + (f32 FpMinMax.F32Min)>; + // F32 Max. + def : Pat<(f32 (fmaxnum_ieee f32:$A, f32:$B)), + (f32 FpMinMax.F32Max)>; + def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), f32:$B)), + (f32 FpMinMax.F32Max)>; + def : Pat<(f32 (fmaxnum_ieee f32:$A, (fcanonicalize f32:$B))), + (f32 FpMinMax.F32Max)>; + def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))), + (f32 FpMinMax.F32Max)>; + + // f64 Min. + def : Pat<(f64 (fminnum_ieee f64:$A, f64:$B)), + (f64 (XSMINDP $A, $B))>; + def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), f64:$B)), + (f64 (XSMINDP $A, $B))>; + def : Pat<(f64 (fminnum_ieee f64:$A, (fcanonicalize f64:$B))), + (f64 (XSMINDP $A, $B))>; + def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))), + (f64 (XSMINDP $A, $B))>; + // f64 Max. 
+ def : Pat<(f64 (fmaxnum_ieee f64:$A, f64:$B)), + (f64 (XSMAXDP $A, $B))>; + def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), f64:$B)), + (f64 (XSMAXDP $A, $B))>; + def : Pat<(f64 (fmaxnum_ieee f64:$A, (fcanonicalize f64:$B))), + (f64 (XSMAXDP $A, $B))>; + def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))), + (f64 (XSMAXDP $A, $B))>; +} + def ScalarLoads { dag Li8 = (i32 (extloadi8 xoaddr:$src)); dag ZELi8 = (i32 (zextloadi8 xoaddr:$src)); @@ -1330,7 +1379,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. [(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>; // VSX scalar loads introduced in ISA 2.07 - let mayLoad = 1, mayStore = 0 in { + let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in { let CodeSize = 3 in def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src), "lxsspx $XT, $src", IIC_LdStLFD, []>; @@ -1355,7 +1404,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. } // mayLoad // VSX scalar stores introduced in ISA 2.07 - let mayStore = 1, mayLoad = 0 in { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { let CodeSize = 3 in def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst), "stxsspx $XT, $dst", IIC_LdStSTFD, []>; @@ -1912,7 +1961,7 @@ def VectorExtractions { - The order of elements after the move to GPR is reversed, so we invert the bits of the index prior to truncating to the range 0-7 */ - dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDIo8 $Idx, 8))); + dag BE_VBYTE_PERM_VEC = (v16i8 (LVSL ZERO8, (ANDI8_rec $Idx, 8))); dag BE_VBYTE_PERMUTE = (v16i8 (VPERM $S, $S, BE_VBYTE_PERM_VEC)); dag BE_MV_VBYTE = (MFVSRD (EXTRACT_SUBREG @@ -1931,7 +1980,7 @@ def VectorExtractions { the bits of the index prior to truncating to the range 0-3 */ dag BE_VHALF_PERM_VEC = (v16i8 (LVSL ZERO8, - (RLDICR (ANDIo8 $Idx, 4), 1, 62))); + (RLDICR (ANDI8_rec $Idx, 4), 1, 62))); dag BE_VHALF_PERMUTE = (v16i8 (VPERM $S, $S, BE_VHALF_PERM_VEC)); dag BE_MV_VHALF = (MFVSRD (EXTRACT_SUBREG @@ -1949,7 +1998,7 @@ def VectorExtractions { the bits of the index prior to truncating to the range 0-1 */ dag BE_VWORD_PERM_VEC = (v16i8 (LVSL ZERO8, - (RLDICR (ANDIo8 $Idx, 2), 2, 61))); + (RLDICR (ANDI8_rec $Idx, 2), 2, 61))); dag BE_VWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VWORD_PERM_VEC)); dag BE_MV_VWORD = (MFVSRD (EXTRACT_SUBREG @@ -1965,7 +2014,7 @@ def VectorExtractions { element indices. */ dag BE_VDWORD_PERM_VEC = (v16i8 (LVSL ZERO8, - (RLDICR (ANDIo8 $Idx, 1), 3, 60))); + (RLDICR (ANDI8_rec $Idx, 1), 3, 60))); dag BE_VDWORD_PERMUTE = (v16i8 (VPERM $S, $S, BE_VDWORD_PERM_VEC)); dag BE_VARIABLE_DWORD = (MFVSRD (EXTRACT_SUBREG @@ -2477,6 +2526,43 @@ def : Pat<(i64 (bitconvert f64:$S)), // (move to FPR, nothing else needed) def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; + +// Rounding to integer. 
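+// In the patterns below, lrint/llrint lower to fctid (convert to a signed
+// doubleword using the rounding mode currently set in the FPSCR) followed by
+// a direct move, while lround/llround first round to the nearest integer,
+// with ties away from zero, using xsrdpi before the conversion.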
+def : Pat<(i64 (lrint f64:$S)), + (i64 (MFVSRD (FCTID $S)))>; +def : Pat<(i64 (lrint f32:$S)), + (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; +def : Pat<(i64 (llrint f64:$S)), + (i64 (MFVSRD (FCTID $S)))>; +def : Pat<(i64 (llrint f32:$S)), + (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>; +def : Pat<(i64 (lround f64:$S)), + (i64 (MFVSRD (FCTID (XSRDPI $S))))>; +def : Pat<(i64 (lround f32:$S)), + (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; +def : Pat<(i64 (llround f64:$S)), + (i64 (MFVSRD (FCTID (XSRDPI $S))))>; +def : Pat<(i64 (llround f32:$S)), + (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; +} + +let Predicates = [HasVSX] in { +// Rounding for single precision. +def : Pat<(f32 (fround f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPI + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f32 (fnearbyint f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIC + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f32 (ffloor f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIM + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f32 (fceil f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIP + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f32 (ftrunc f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIZ + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; } // Materialize a zero-vector of long long @@ -2502,7 +2588,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /] class X_VT5_XO5_VB5_Ro opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> - : X_VT5_XO5_VB5, isDOT; + : X_VT5_XO5_VB5, isRecordForm; // [PO VRT XO VRB XO /], but the VRB is only used the left 64 bits (or less), // So we use different operand class for VRB @@ -2520,7 +2606,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /] class X_VT5_XO5_VB5_VSFR_Ro opcode, bits<5> xo2, bits<10> xo, string opc, list pattern> - : X_VT5_XO5_VB5_VSFR, isDOT; + : X_VT5_XO5_VB5_VSFR, isRecordForm; // [PO T XO B XO BX /] class XX2_RT5_XO5_XB6 opcode, bits<5> xo2, bits<9> xo, string opc, @@ -2550,7 +2636,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_Ro opcode, bits<10> xo, string opc, list pattern> - : X_VT5_VA5_VB5, isDOT; + : X_VT5_VA5_VB5, isRecordForm; // [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_FMA opcode, bits<10> xo, string opc, @@ -2562,7 +2648,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /] class X_VT5_VA5_VB5_FMA_Ro opcode, bits<10> xo, string opc, list pattern> - : X_VT5_VA5_VB5_FMA, isDOT; + : X_VT5_VA5_VB5_FMA, isRecordForm; //===--------------------------------------------------------------------===// // Quad-Precision Scalar Move Instructions: @@ -2884,13 +2970,14 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { //===--------------------------------------------------------------------===// // Maximum/Minimum Type-C/Type-J DP - // XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU, so we use vsrc for XT - def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsrc, vsfrc, vsfrc, - IIC_VecFP, []>; + def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc, + IIC_VecFP, + [(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>; def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; - def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, 
"xsmincdp", vsrc, vsfrc, vsfrc, - IIC_VecFP, []>; + def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsfrc, vsfrc, vsfrc, + IIC_VecFP, + [(set f64:$XT, (PPCxsminc f64:$XA, f64:$XB))]>; def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc, IIC_VecFP, []>; @@ -2898,18 +2985,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Vector Byte-Reverse H/W/D/Q Word def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>; - def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>; - def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>; + def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, + [(set v4i32:$XT, (bswap v4i32:$XB))]>; + def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, + [(set v2i64:$XT, (bswap v2i64:$XB))]>; def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>; // Vector Reverse - def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)), + def : Pat<(v8i16 (bswap v8i16 :$A)), (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; - def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)), - (v4i32 (XXBRW $A))>; - def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)), - (v2i64 (XXBRD $A))>; - def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)), + def : Pat<(v1i128 (bswap v1i128 :$A)), (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; // Vector Permute @@ -2927,7 +3012,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. - let mayLoad = 1, mayStore = 0 in { + let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in { // Load Vector def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src), "lxv $XT, $src", IIC_LdStLFD, []>; @@ -2972,7 +3057,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging. 
- let mayStore = 1, mayLoad = 0 in { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in { // Store Vector def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst), "stxv $XT, $dst", IIC_LdStSTFD, []>; @@ -3697,6 +3782,15 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { def : Pat<(f128 (fpextend f32:$src)), (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>; + def : Pat<(f32 (PPCxsmaxc f32:$XA, f32:$XB)), + (f32 (COPY_TO_REGCLASS (XSMAXCDP (COPY_TO_REGCLASS $XA, VSSRC), + (COPY_TO_REGCLASS $XB, VSSRC)), + VSSRC))>; + def : Pat<(f32 (PPCxsminc f32:$XA, f32:$XB)), + (f32 (COPY_TO_REGCLASS (XSMINCDP (COPY_TO_REGCLASS $XA, VSSRC), + (COPY_TO_REGCLASS $XB, VSSRC)), + VSSRC))>; + } // end HasP9Vector, AddedComplexity let AddedComplexity = 400 in { @@ -3710,7 +3804,7 @@ let AddedComplexity = 400 in { } } -let Predicates = [HasP9Vector] in { +let Predicates = [HasP9Vector], hasSideEffects = 0 in { let mayStore = 1 in { def SPILLTOVSR_STX : PseudoXFormMemOp<(outs), (ins spilltovsrrc:$XT, memrr:$dst), @@ -3865,8 +3959,20 @@ def DblToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A))))); } +// FP load dags (for f32 -> v4f32) +def LoadFP { + dag A = (f32 (load xoaddr:$A)); + dag B = (f32 (load xoaddr:$B)); + dag C = (f32 (load xoaddr:$C)); + dag D = (f32 (load xoaddr:$D)); +} + // FP merge dags (for f32 -> v4f32) def MrgFP { + dag LD32A = (COPY_TO_REGCLASS (LIWZX xoaddr:$A), VSRC); + dag LD32B = (COPY_TO_REGCLASS (LIWZX xoaddr:$B), VSRC); + dag LD32C = (COPY_TO_REGCLASS (LIWZX xoaddr:$C), VSRC); + dag LD32D = (COPY_TO_REGCLASS (LIWZX xoaddr:$D), VSRC); dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $C, VSRC), 0)); dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), @@ -4022,7 +4128,18 @@ let AddedComplexity = 400 in { (v2f64 (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), (COPY_TO_REGCLASS $B, VSRC), 0))>; - + // Using VMRGEW to assemble the final vector would be a lower latency + // solution. However, we choose to go with the slightly higher latency + // XXPERMDI for 2 reasons: + // 1. This is likely to occur in unrolled loops where regpressure is high, + // so we want to use the latter as it has access to all 64 VSX registers. + // 2. Using Altivec instructions in this sequence would likely cause the + // allocation of Altivec registers even for the loads which in turn would + // force the use of LXSIWZX for the loads, adding a cycle of latency to + // each of the loads which would otherwise be able to use LFIWZX. + def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)), + (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B), + (XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>; def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, @@ -4089,7 +4206,18 @@ let AddedComplexity = 400 in { (v2f64 (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), (COPY_TO_REGCLASS $A, VSRC), 0))>; - + // Using VMRGEW to assemble the final vector would be a lower latency + // solution. However, we choose to go with the slightly higher latency + // XXPERMDI for 2 reasons: + // 1. This is likely to occur in unrolled loops where regpressure is high, + // so we want to use the latter as it has access to all 64 VSX registers. + // 2. 
Using Altivec instructions in this sequence would likely cause the + // allocation of Altivec registers even for the loads which in turn would + // force the use of LXSIWZX for the loads, adding a cycle of latency to + // each of the loads which would otherwise be able to use LFIWZX. + def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)), + (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C), + (XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>; def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)), (VMRGEW MrgFP.AC, MrgFP.BD)>; def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp new file mode 100644 index 000000000000..b761f337533b --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -0,0 +1,894 @@ +//===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to prepare loops for ppc preferred addressing +// modes, leveraging different instruction form. (eg: DS/DQ form, D/DS form with +// update) +// Additional PHIs are created for loop induction variables used by load/store +// instructions so that preferred addressing modes can be used. +// +// 1: DS/DQ form preparation, prepare the load/store instructions so that they +// can satisfy the DS/DQ form displacement requirements. +// Generically, this means transforming loops like this: +// for (int i = 0; i < n; ++i) { +// unsigned long x1 = *(unsigned long *)(p + i + 5); +// unsigned long x2 = *(unsigned long *)(p + i + 9); +// } +// +// to look like this: +// +// unsigned NewP = p + 5; +// for (int i = 0; i < n; ++i) { +// unsigned long x1 = *(unsigned long *)(i + NewP); +// unsigned long x2 = *(unsigned long *)(i + NewP + 4); +// } +// +// 2: D/DS form with update preparation, prepare the load/store instructions so +// that we can use update form to do pre-increment. 
+// Generically, this means transforming loops like this:
+//  for (int i = 0; i < n; ++i)
+//    array[i] = c;
+//
+// to look like this:
+//
+//  T *p = array[-1];
+//  for (int i = 0; i < n; ++i)
+//    *++p = c;
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-loop-instr-form-prep"
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+// By default, we limit this to creating 16 common bases out of loops per
+// function. 16 is a little over half of the allocatable register set.
+static cl::opt<unsigned> MaxVarsPrep("ppc-formprep-max-vars",
+                                 cl::Hidden, cl::init(16),
+  cl::desc("Potential common base number threshold per function for PPC loop "
+           "prep"));
+
+static cl::opt<bool> PreferUpdateForm("ppc-formprep-prefer-update",
+                                 cl::init(true), cl::Hidden,
+  cl::desc("prefer update form when ds form is also an update form"));
+
+// The sum of the following three per-loop thresholds across all loops cannot
+// be larger than MaxVarsPrep.
+// By default, we limit this to creating 9 PHIs for one loop.
+// 9 in total, and 3 for each kind of preparation, are experimental values
+// tuned on Power9.
+static cl::opt<unsigned> MaxVarsUpdateForm("ppc-preinc-prep-max-vars",
+                                 cl::Hidden, cl::init(3),
+  cl::desc("Potential PHI threshold per loop for PPC loop prep of update "
+           "form"));
+
+static cl::opt<unsigned> MaxVarsDSForm("ppc-dsprep-max-vars",
+                                 cl::Hidden, cl::init(3),
+  cl::desc("Potential PHI threshold per loop for PPC loop prep of DS form"));
+
+static cl::opt<unsigned> MaxVarsDQForm("ppc-dqprep-max-vars",
+                                 cl::Hidden, cl::init(3),
+  cl::desc("Potential PHI threshold per loop for PPC loop prep of DQ form"));
+
+
+// It would not be profitable if the common base has only one load/store: ISEL
+// should already be able to choose the best load/store form based on the
+// offset for a single load/store. Set the minimal profitable value default to
+// 2 and make it an option.
+static cl::opt<unsigned> DispFormPrepMinThreshold("ppc-dispprep-min-threshold",
+                                 cl::Hidden, cl::init(2),
+  cl::desc("Minimal common base load/store instructions triggering DS/DQ form "
+           "preparation"));
+
+STATISTIC(PHINodeAlreadyExistsUpdate, "PHI node already in pre-increment form");
+STATISTIC(PHINodeAlreadyExistsDS, "PHI node already in DS form");
+STATISTIC(PHINodeAlreadyExistsDQ, "PHI node already in DQ form");
+STATISTIC(DSFormChainRewritten, "Num of DS form chain rewritten");
+STATISTIC(DQFormChainRewritten, "Num of DQ form chain rewritten");
+STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten");
+
+namespace {
+  struct BucketElement {
+    BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {}
+    BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {}
+
+    const SCEVConstant *Offset;
+    Instruction *Instr;
+  };
+
+  struct Bucket {
+    Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B),
+                                            Elements(1, BucketElement(I)) {}
+
+    const SCEV *BaseSCEV;
+    SmallVector<BucketElement, 16> Elements;
+  };
+
+  // "UpdateForm" is not a real PPC instruction form; it stands for a D-form
+  // load/store with update, like ldu/stdu, or the prefetch intrinsic.
+  // For DS form instructions, their displacements must be a multiple of 4.
+  // For DQ form instructions, their displacements must be a multiple of 16.
+  enum InstrForm { UpdateForm = 1, DSForm = 4, DQForm = 16 };
+
+  class PPCLoopInstrFormPrep : public FunctionPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+
+    PPCLoopInstrFormPrep() : FunctionPass(ID) {
+      initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+      initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addPreserved<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addPreserved<LoopInfoWrapperPass>();
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+    }
+
+    bool runOnFunction(Function &F) override;
+
+  private:
+    PPCTargetMachine *TM = nullptr;
+    const PPCSubtarget *ST;
+    DominatorTree *DT;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    bool PreserveLCSSA;
+
+    /// Number of successful preparations for update/DS/DQ form in all
+    /// innermost loops. Each successful preparation hoists one common base
+    /// out of the loop, which may increase register pressure, much like LICM
+    /// does. Make sure the total preparation count can be controlled by an
+    /// option.
+    unsigned SuccPrepCount;
+
+    bool runOnLoop(Loop *L);
+
+    /// Check if the required PHI node already exists in Loop \p L.
+    bool alreadyPrepared(Loop *L, Instruction *MemI,
+                         const SCEV *BasePtrStartSCEV,
+                         const SCEVConstant *BasePtrIncSCEV,
+                         InstrForm Form);
+
+    /// Collect candidates in Loop \p L that satisfy the given condition
+    /// (\p isValidCandidate() returns true).
+    SmallVector<Bucket, 16>
+    collectCandidates(Loop *L,
+                      std::function<bool(const Instruction *, const Value *)>
+                          isValidCandidate,
+                      unsigned MaxCandidateNum);
+
+    /// Add a candidate to the candidate \p Buckets.
+    void addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
+                         SmallVector<Bucket, 16> &Buckets,
+                         unsigned MaxCandidateNum);
+
+    /// Prepare all candidates in \p Buckets for the update form.
+    bool updateFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets);
+
+    /// Prepare all candidates in \p Buckets for a displacement form,
+    /// currently DS/DQ.
+    bool dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets,
+                      InstrForm Form);
+
+    /// Prepare one chain \p BucketChain: find the best base element and
+    /// update all other elements in \p BucketChain accordingly.
+    /// \p Form is used to find the best base element.
+    /// On success, the best base element is stored as the first element of
+    /// \p BucketChain.
+    /// Return false if no base element is found, otherwise return true.
+    bool prepareBaseForDispFormChain(Bucket &BucketChain,
+                                     InstrForm Form);
+
+    /// Prepare one chain \p BucketChain: find the best base element and
+    /// update all other elements in \p BucketChain accordingly.
+    /// On success, the best base element is stored as the first element of
+    /// \p BucketChain.
+    /// Return false if no base element is found, otherwise return true.
+    bool prepareBaseForUpdateFormChain(Bucket &BucketChain);
+
+    /// Rewrite load/store instructions in \p BucketChain according to the
+    /// preparation.
+    bool rewriteLoadStores(Loop *L, Bucket &BucketChain,
+                           SmallSet<BasicBlock *, 16> &BBChanged,
+                           InstrForm Form);
+  };
+
+} // end anonymous namespace
+
+char PPCLoopInstrFormPrep::ID = 0;
+static const char *name = "Prepare loop for ppc preferred instruction forms";
+INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
+
+static const std::string PHINodeNameSuffix = ".phi";
+static const std::string CastNodeNameSuffix = ".cast";
+static const std::string GEPNodeIncNameSuffix = ".inc";
+static const std::string GEPNodeOffNameSuffix = ".off";
+
+FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) {
+  return new PPCLoopInstrFormPrep(TM);
+}
+
+static bool IsPtrInBounds(Value *BasePtr) {
+  Value *StrippedBasePtr = BasePtr;
+  while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr))
+    StrippedBasePtr = BC->getOperand(0);
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr))
+    return GEP->isInBounds();
+
+  return false;
+}
+
+static std::string getInstrName(const Value *I, const std::string Suffix) {
+  assert(I && "Invalid parameter!");
+  if (I->hasName())
+    return (I->getName() + Suffix).str();
+  else
+    return "";
+}
+
+static Value *GetPointerOperand(Value *MemI) {
+  if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) {
+    return LMemI->getPointerOperand();
+  } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) {
+    return SMemI->getPointerOperand();
+  } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) {
+    if (IMemI->getIntrinsicID() == Intrinsic::prefetch)
+      return IMemI->getArgOperand(0);
+  }
+
+  return nullptr;
+}
+
+bool PPCLoopInstrFormPrep::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+  ST = TM ? TM->getSubtargetImpl(F) : nullptr;
+  SuccPrepCount = 0;
+
+  bool MadeChange = false;
+
+  for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I)
+    for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L)
+      MadeChange |= runOnLoop(*L);
+
+  return MadeChange;
+}
+
+void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV,
+                                           SmallVector<Bucket, 16> &Buckets,
+                                           unsigned MaxCandidateNum) {
+  assert((MemI && GetPointerOperand(MemI)) &&
+         "Candidate should be a memory instruction.");
+  assert(LSCEV && "Invalid SCEV for Ptr value.");
+  bool FoundBucket = false;
+  for (auto &B : Buckets) {
+    const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV);
+    if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) {
+      B.Elements.push_back(BucketElement(CDiff, MemI));
+      FoundBucket = true;
+      break;
+    }
+  }
+
+  if (!FoundBucket) {
+    if (Buckets.size() == MaxCandidateNum)
+      return;
+    Buckets.push_back(Bucket(LSCEV, MemI));
+  }
+}
+
+SmallVector<Bucket, 16> PPCLoopInstrFormPrep::collectCandidates(
+    Loop *L,
+    std::function<bool(const Instruction *, const Value *)> isValidCandidate,
+    unsigned MaxCandidateNum) {
+  SmallVector<Bucket, 16> Buckets;
+  for (const auto &BB : L->blocks())
+    for (auto &J : *BB) {
+      Value *PtrValue;
+      Instruction *MemI;
+
+      if (LoadInst *LMemI = dyn_cast<LoadInst>(&J)) {
+        MemI = LMemI;
+        PtrValue = LMemI->getPointerOperand();
+      } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&J)) {
+        MemI = SMemI;
+        PtrValue = SMemI->getPointerOperand();
+      } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) {
+        if (IMemI->getIntrinsicID() == Intrinsic::prefetch) {
+          MemI = IMemI;
+          PtrValue = IMemI->getArgOperand(0);
+        } else continue;
+      } else continue;
+
+      unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+      if (PtrAddrSpace)
+        continue;
+
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L);
+      const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      if (!LARSCEV || LARSCEV->getLoop() != L)
+        continue;
+
+      if (isValidCandidate(&J, PtrValue))
+        addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum);
+    }
+  return Buckets;
+}
+
+bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain,
+                                                       InstrForm Form) {
+  // RemainderOffsetInfo details:
+  // key:            value of (Offset urem DispConstraint). For DSForm, it can
+  //                 be [0, 4).
+  // first of pair:  the index of the first BucketElement whose remainder is
+  //                 equal to key. For key 0, this value must be 0.
+  // second of pair: number of load/stores with the same remainder.
+  DenseMap<unsigned, std::pair<unsigned, unsigned>> RemainderOffsetInfo;
+
+  for (unsigned j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
+    if (!BucketChain.Elements[j].Offset)
+      RemainderOffsetInfo[0] = std::make_pair(0, 1);
+    else {
+      unsigned Remainder =
+          BucketChain.Elements[j].Offset->getAPInt().urem(Form);
+      if (RemainderOffsetInfo.find(Remainder) == RemainderOffsetInfo.end())
+        RemainderOffsetInfo[Remainder] = std::make_pair(j, 1);
+      else
+        RemainderOffsetInfo[Remainder].second++;
+    }
+  }
+  // Currently we choose the most profitable base as the one with the max
+  // number of load/stores having the same remainder.
+  // FIXME: adjust the base selection strategy according to the load/store
+  // offset distribution.
+  // For example, if we have one candidate chain for DS form preparation,
+  // which contains the following load/stores with different remainders:
+  //   1: 10 load/stores whose remainder is 1;
+  //   2: 9 load/stores whose remainder is 2;
+  //   3: 1 for remainder 3 and 0 for remainder 0;
+  // Now we will choose the first load/store whose remainder is 1 as the base
+  // and adjust all other load/stores according to the new base, so we will
+  // get 10 DS form and 10 X form.
+  // But we should be more clever: for this case we could use two bases, one
+  // for remainder 1 and the other for remainder 2; then we could get 19 DS
+  // form and 1 X form.
+  unsigned MaxCountRemainder = 0;
+  for (unsigned j = 0; j < (unsigned)Form; j++)
+    if ((RemainderOffsetInfo.find(j) != RemainderOffsetInfo.end()) &&
+        RemainderOffsetInfo[j].second >
+            RemainderOffsetInfo[MaxCountRemainder].second)
+      MaxCountRemainder = j;
+
+  // Abort when there are too few instructions with a common base.
+  if (RemainderOffsetInfo[MaxCountRemainder].second < DispFormPrepMinThreshold)
+    return false;
+
+  // If the first value is the most profitable, there is no need to adjust the
+  // BucketChain elements, as the first value was already subtracted from them
+  // during collection.
+  if (MaxCountRemainder == 0)
+    return true;
+
+  // Adjust the load/stores to the newly chosen base.
+  const SCEV *Offset =
+      BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first].Offset;
+  BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
+  for (auto &E : BucketChain.Elements) {
+    if (E.Offset)
+      E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+    else
+      E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+  }
+
+  std::swap(BucketChain.Elements[RemainderOffsetInfo[MaxCountRemainder].first],
+            BucketChain.Elements[0]);
+  return true;
+}
+
+// FIXME: implement a more clever base choosing policy.
+// Currently we always choose an existing load/store offset. This may lead to
+// suboptimal code sequences. For example, for one DS chain with offsets
+// {-32769, 2003, 2007, 2011}, we choose -32769 as the base offset, and the
+// remaining displacements for the load/stores are {0, 34772, 34776, 34780}.
+// Though each offset is now a multiple of 4, it cannot be represented in a
+// signed 16-bit field.
+bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) {
+  // We have a choice now of which instruction's memory operand we use as the
+  // base for the generated PHI. Always picking the first instruction in each
+  // bucket does not work well, specifically because that instruction might
+  // be a prefetch (and there are no pre-increment dcbt variants). Otherwise,
+  // the choice is somewhat arbitrary, because the backend will happily
+  // generate direct offsets from both the pre-incremented and
+  // post-incremented pointer values. Thus, we'll pick the first non-prefetch
+  // instruction in each bucket, and adjust the recurrence and other offsets
+  // accordingly.
+  for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) {
+    if (auto *II = dyn_cast<IntrinsicInst>(BucketChain.Elements[j].Instr))
+      if (II->getIntrinsicID() == Intrinsic::prefetch)
+        continue;
+
+    // If we'd otherwise pick the first element anyway, there's nothing to do.
+    if (j == 0)
+      break;
+
+    // If our chosen element has no offset from the base pointer, there's
+    // nothing to do.
+    if (!BucketChain.Elements[j].Offset ||
+        BucketChain.Elements[j].Offset->isZero())
+      break;
+
+    const SCEV *Offset = BucketChain.Elements[j].Offset;
+    BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset);
+    for (auto &E : BucketChain.Elements) {
+      if (E.Offset)
+        E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset));
+      else
+        E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset));
+    }
+
+    std::swap(BucketChain.Elements[j], BucketChain.Elements[0]);
+    break;
+  }
+  return true;
+}
+
+bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain,
+                                             SmallSet<BasicBlock *, 16> &BBChanged,
+                                             InstrForm Form) {
+  bool MadeChange = false;
+  const SCEVAddRecExpr *BasePtrSCEV =
+      cast<SCEVAddRecExpr>(BucketChain.BaseSCEV);
+  if (!BasePtrSCEV->isAffine())
+    return MadeChange;
+
+  LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n");
+
+  assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?");
+
+  // The instruction corresponding to the Bucket's BaseSCEV must be the first
+  // in the vector of elements.
+  Instruction *MemI = BucketChain.Elements.begin()->Instr;
+  Value *BasePtr = GetPointerOperand(MemI);
+  assert(BasePtr && "No pointer operand");
+
+  Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext());
+  Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(),
+                                     BasePtr->getType()->getPointerAddressSpace());
+
+  if (!SE->isLoopInvariant(BasePtrSCEV->getStart(), L))
+    return MadeChange;
+
+  const SCEVConstant *BasePtrIncSCEV =
+      dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE));
+  if (!BasePtrIncSCEV)
+    return MadeChange;
+
+  // Some DS form load/store instructions also have an update form; this is
+  // the case when the stride is a multiple of 4. Use the update form if it is
+  // preferred.
+  bool CanPreInc = (Form == UpdateForm ||
+                    ((Form == DSForm) && !BasePtrIncSCEV->getAPInt().urem(4) &&
+                     PreferUpdateForm));
+  const SCEV *BasePtrStartSCEV = nullptr;
+  if (CanPreInc)
+    BasePtrStartSCEV =
+        SE->getMinusSCEV(BasePtrSCEV->getStart(), BasePtrIncSCEV);
+  else
+    BasePtrStartSCEV = BasePtrSCEV->getStart();
+
+  if (!isSafeToExpand(BasePtrStartSCEV, *SE))
+    return MadeChange;
+
+  if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV, Form))
+    return MadeChange;
+
+  LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n");
+
+  BasicBlock *Header = L->getHeader();
+  unsigned HeaderLoopPredCount = pred_size(Header);
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+
+  PHINode *NewPHI =
+      PHINode::Create(I8PtrTy, HeaderLoopPredCount,
+                      getInstrName(MemI, PHINodeNameSuffix),
+                      Header->getFirstNonPHI());
+
+  SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart");
+  Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy,
+                                            LoopPredecessor->getTerminator());
+
+  // Note that LoopPredecessor might occur in the predecessor list multiple
+  // times, and we need to add it the right number of times.
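+  // (A block can appear more than once when, for example, it ends in a
+  // switch with several cases branching to the loop header; the PHI needs
+  // one incoming value per edge.)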
+  for (auto PI : predecessors(Header)) {
+    if (PI != LoopPredecessor)
+      continue;
+
+    NewPHI->addIncoming(BasePtrStart, LoopPredecessor);
+  }
+
+  Instruction *PtrInc = nullptr;
+  Instruction *NewBasePtr = nullptr;
+  if (CanPreInc) {
+    Instruction *InsPoint = &*Header->getFirstInsertionPt();
+    PtrInc = GetElementPtrInst::Create(
+        I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
+        getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint);
+    cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
+    for (auto PI : predecessors(Header)) {
+      if (PI == LoopPredecessor)
+        continue;
+
+      NewPHI->addIncoming(PtrInc, PI);
+    }
+    if (PtrInc->getType() != BasePtr->getType())
+      NewBasePtr = new BitCastInst(
+          PtrInc, BasePtr->getType(),
+          getInstrName(PtrInc, CastNodeNameSuffix), InsPoint);
+    else
+      NewBasePtr = PtrInc;
+  } else {
+    // Note that LoopPredecessor might occur in the predecessor list multiple
+    // times; its incoming values were already added above, so make sure the
+    // PHI gets no further incoming values for it here.
+    for (auto PI : predecessors(Header)) {
+      if (PI == LoopPredecessor)
+        continue;
+
+      // For the latch predecessor, we need to insert a GEP just before the
+      // terminator to increase the address.
+      BasicBlock *BB = PI;
+      Instruction *InsPoint = BB->getTerminator();
+      PtrInc = GetElementPtrInst::Create(
+          I8Ty, NewPHI, BasePtrIncSCEV->getValue(),
+          getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint);
+
+      cast<GetElementPtrInst>(PtrInc)->setIsInBounds(IsPtrInBounds(BasePtr));
+
+      NewPHI->addIncoming(PtrInc, PI);
+    }
+    PtrInc = NewPHI;
+    if (NewPHI->getType() != BasePtr->getType())
+      NewBasePtr =
+          new BitCastInst(NewPHI, BasePtr->getType(),
+                          getInstrName(NewPHI, CastNodeNameSuffix),
+                          &*Header->getFirstInsertionPt());
+    else
+      NewBasePtr = NewPHI;
+  }
+
+  if (Instruction *IDel = dyn_cast<Instruction>(BasePtr))
+    BBChanged.insert(IDel->getParent());
+  BasePtr->replaceAllUsesWith(NewBasePtr);
+  RecursivelyDeleteTriviallyDeadInstructions(BasePtr);
+
+  // Keep track of the replacement pointer values we've inserted so that we
+  // don't generate more pointer values than necessary.
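+  // (Several bucket elements can share one pointer operand; once a
+  // replacement for that operand exists, it is reused rather than emitting a
+  // redundant GEP.)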
+  SmallPtrSet<Value *, 16> NewPtrs;
+  NewPtrs.insert(NewBasePtr);
+
+  for (auto I = std::next(BucketChain.Elements.begin()),
+            IE = BucketChain.Elements.end(); I != IE; ++I) {
+    Value *Ptr = GetPointerOperand(I->Instr);
+    assert(Ptr && "No pointer operand");
+    if (NewPtrs.count(Ptr))
+      continue;
+
+    Instruction *RealNewPtr;
+    if (!I->Offset || I->Offset->getValue()->isZero()) {
+      RealNewPtr = NewBasePtr;
+    } else {
+      Instruction *PtrIP = dyn_cast<Instruction>(Ptr);
+      if (PtrIP && isa<Instruction>(NewBasePtr) &&
+          cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent())
+        PtrIP = nullptr;
+      else if (PtrIP && isa<PHINode>(PtrIP))
+        PtrIP = &*PtrIP->getParent()->getFirstInsertionPt();
+      else if (!PtrIP)
+        PtrIP = I->Instr;
+
+      GetElementPtrInst *NewPtr = GetElementPtrInst::Create(
+          I8Ty, PtrInc, I->Offset->getValue(),
+          getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP);
+      if (!PtrIP)
+        NewPtr->insertAfter(cast<Instruction>(PtrInc));
+      NewPtr->setIsInBounds(IsPtrInBounds(Ptr));
+      RealNewPtr = NewPtr;
+    }
+
+    if (Instruction *IDel = dyn_cast<Instruction>(Ptr))
+      BBChanged.insert(IDel->getParent());
+
+    Instruction *ReplNewPtr;
+    if (Ptr->getType() != RealNewPtr->getType()) {
+      ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(),
+                                   getInstrName(Ptr, CastNodeNameSuffix));
+      ReplNewPtr->insertAfter(RealNewPtr);
+    } else
+      ReplNewPtr = RealNewPtr;
+
+    Ptr->replaceAllUsesWith(ReplNewPtr);
+    RecursivelyDeleteTriviallyDeadInstructions(Ptr);
+
+    NewPtrs.insert(RealNewPtr);
+  }
+
+  MadeChange = true;
+
+  SuccPrepCount++;
+
+  if (Form == DSForm && !CanPreInc)
+    DSFormChainRewritten++;
+  else if (Form == DQForm)
+    DQFormChainRewritten++;
+  else if (Form == UpdateForm || (Form == DSForm && CanPreInc))
+    UpdFormChainRewritten++;
+
+  return MadeChange;
+}
+
+bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L,
+                                          SmallVector<Bucket, 16> &Buckets) {
+  bool MadeChange = false;
+  if (Buckets.empty())
+    return MadeChange;
+  SmallSet<BasicBlock *, 16> BBChanged;
+  for (auto &Bucket : Buckets)
+    // The base address of each bucket is transformed into a PHI and the other
+    // elements are rewritten based on the new base.
+    if (prepareBaseForUpdateFormChain(Bucket))
+      MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, UpdateForm);
+
+  if (MadeChange)
+    for (auto &BB : L->blocks())
+      if (BBChanged.count(BB))
+        DeleteDeadPHIs(BB);
+  return MadeChange;
+}
+
+bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, SmallVector<Bucket, 16> &Buckets,
+                                        InstrForm Form) {
+  bool MadeChange = false;
+
+  if (Buckets.empty())
+    return MadeChange;
+
+  SmallSet<BasicBlock *, 16> BBChanged;
+  for (auto &Bucket : Buckets) {
+    if (Bucket.Elements.size() < DispFormPrepMinThreshold)
+      continue;
+    if (prepareBaseForDispFormChain(Bucket, Form))
+      MadeChange |= rewriteLoadStores(L, Bucket, BBChanged, Form);
+  }
+
+  if (MadeChange)
+    for (auto &BB : L->blocks())
+      if (BBChanged.count(BB))
+        DeleteDeadPHIs(BB);
+  return MadeChange;
+}
+
+// In order to prepare for the preferred instruction form, a PHI is added.
+// This function checks to see if that PHI already exists and returns true if
+// it found an existing PHI with a start and increment matching the PHI that
+// we wanted to create.
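+// For instance, an earlier update-form preparation leaves a header PHI of
+// roughly this shape (illustrative IR; the value names are invented):
+//   %p.phi = phi i8* [ %p.start, %preheader ], [ %p.inc, %latch ]
+//   %p.inc = getelementptr i8, i8* %p.phi, i64 <step>
+// A candidate whose start and step SCEVs match such a PHI needs no new one.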
+bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction *MemI,
+                                           const SCEV *BasePtrStartSCEV,
+                                           const SCEVConstant *BasePtrIncSCEV,
+                                           InstrForm Form) {
+  BasicBlock *BB = MemI->getParent();
+  if (!BB)
+    return false;
+
+  BasicBlock *PredBB = L->getLoopPredecessor();
+  BasicBlock *LatchBB = L->getLoopLatch();
+
+  if (!PredBB || !LatchBB)
+    return false;
+
+  // Run through the PHIs and see if we have any that look like a preparation.
+  iterator_range<BasicBlock::phi_iterator> PHIIter = BB->phis();
+  for (auto &CurrentPHI : PHIIter) {
+    PHINode *CurrentPHINode = dyn_cast<PHINode>(&CurrentPHI);
+    if (!CurrentPHINode)
+      continue;
+
+    if (!SE->isSCEVable(CurrentPHINode->getType()))
+      continue;
+
+    const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L);
+
+    const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast<SCEVAddRecExpr>(PHISCEV);
+    if (!PHIBasePtrSCEV)
+      continue;
+
+    const SCEVConstant *PHIBasePtrIncSCEV =
+        dyn_cast<SCEVConstant>(PHIBasePtrSCEV->getStepRecurrence(*SE));
+    if (!PHIBasePtrIncSCEV)
+      continue;
+
+    if (CurrentPHINode->getNumIncomingValues() == 2) {
+      if ((CurrentPHINode->getIncomingBlock(0) == LatchBB &&
+           CurrentPHINode->getIncomingBlock(1) == PredBB) ||
+          (CurrentPHINode->getIncomingBlock(1) == LatchBB &&
+           CurrentPHINode->getIncomingBlock(0) == PredBB)) {
+        if (PHIBasePtrIncSCEV == BasePtrIncSCEV) {
+          // The existing PHI (CurrentPHINode) has the same start and
+          // increment as the PHI that we wanted to create.
+          if (Form == UpdateForm &&
+              PHIBasePtrSCEV->getStart() == BasePtrStartSCEV) {
+            ++PHINodeAlreadyExistsUpdate;
+            return true;
+          }
+          if (Form == DSForm || Form == DQForm) {
+            const SCEVConstant *Diff = dyn_cast<SCEVConstant>(
+                SE->getMinusSCEV(PHIBasePtrSCEV->getStart(), BasePtrStartSCEV));
+            if (Diff && !Diff->getAPInt().urem(Form)) {
+              if (Form == DSForm)
+                ++PHINodeAlreadyExistsDS;
+              else
+                ++PHINodeAlreadyExistsDQ;
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) {
+  bool MadeChange = false;
+
+  // Only prepare the innermost loop.
+  if (!L->empty())
+    return MadeChange;
+
+  // Return if we have already done enough preparation.
+  if (SuccPrepCount >= MaxVarsPrep)
+    return MadeChange;
+
+  LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n");
+
+  BasicBlock *LoopPredecessor = L->getLoopPredecessor();
+  // If there is no loop predecessor, or the loop predecessor's terminator
+  // returns a value (which might contribute to determining the loop's
+  // iteration space), insert a new preheader for the loop.
+  if (!LoopPredecessor ||
+      !LoopPredecessor->getTerminator()->getType()->isVoidTy()) {
+    LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
+    if (LoopPredecessor)
+      MadeChange = true;
+  }
+  if (!LoopPredecessor) {
+    LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n");
+    return MadeChange;
+  }
+  // Check if a load/store has an update form. This lambda is used by
+  // collectCandidates, which collects candidates for the form defined by the
+  // given lambda.
+  auto isUpdateFormCandidate = [&] (const Instruction *I,
+                                    const Value *PtrValue) {
+    assert((PtrValue && I) && "Invalid parameter!");
+    // There are no update forms for Altivec vector load/stores.
+    if (ST && ST->hasAltivec() &&
+        PtrValue->getType()->getPointerElementType()->isVectorTy())
+      return false;
+    // See getPreIndexedAddressParts: the displacement for LDU/STDU has to be
+    // a multiple of 4 (DS-form). For i64 loads/stores whose displacement fits
+    // in a 16-bit signed field but is not a multiple of 4, this pre-increment
+    // preparation would be useless and could even break an otherwise
+    // well-formed addressing mode.
+    if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) {
+      const SCEV *LSCEV = SE->getSCEVAtScope(const_cast<Value *>(PtrValue), L);
+      const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      if (!LARSCEV || LARSCEV->getLoop() != L)
+        return false;
+      if (const SCEVConstant *StepConst =
+              dyn_cast<SCEVConstant>(LARSCEV->getStepRecurrence(*SE))) {
+        const APInt &ConstInt = StepConst->getValue()->getValue();
+        if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0)
+          return false;
+      }
+    }
+    return true;
+  };
+
+  // Check if a load/store has the DS form.
+  auto isDSFormCandidate = [] (const Instruction *I, const Value *PtrValue) {
+    assert((PtrValue && I) && "Invalid parameter!");
+    if (isa<IntrinsicInst>(I))
+      return false;
+    Type *PointerElementType = PtrValue->getType()->getPointerElementType();
+    return (PointerElementType->isIntegerTy(64)) ||
+           (PointerElementType->isFloatTy()) ||
+           (PointerElementType->isDoubleTy()) ||
+           (PointerElementType->isIntegerTy(32) &&
+            llvm::any_of(I->users(),
+                         [](const User *U) { return isa<SExtInst>(U); }));
+  };
+
+  // Check if a load/store has the DQ form.
+  auto isDQFormCandidate = [&] (const Instruction *I, const Value *PtrValue) {
+    assert((PtrValue && I) && "Invalid parameter!");
+    return !isa<IntrinsicInst>(I) && ST && ST->hasP9Vector() &&
+           (PtrValue->getType()->getPointerElementType()->isVectorTy());
+  };
+
+  // Collect buckets of comparable addresses used by loads, stores and the
+  // prefetch intrinsic for the update form.
+  SmallVector<Bucket, 16> UpdateFormBuckets =
+      collectCandidates(L, isUpdateFormCandidate, MaxVarsUpdateForm);
+
+  // Prepare for the update form.
+  if (!UpdateFormBuckets.empty())
+    MadeChange |= updateFormPrep(L, UpdateFormBuckets);
+
+  // Collect buckets of comparable addresses used by loads and stores for the
+  // DS form.
+  SmallVector<Bucket, 16> DSFormBuckets =
+      collectCandidates(L, isDSFormCandidate, MaxVarsDSForm);
+
+  // Prepare for the DS form.
+  if (!DSFormBuckets.empty())
+    MadeChange |= dispFormPrep(L, DSFormBuckets, DSForm);
+
+  // Collect buckets of comparable addresses used by loads and stores for the
+  // DQ form.
+  SmallVector<Bucket, 16> DQFormBuckets =
+      collectCandidates(L, isDQFormCandidate, MaxVarsDQForm);
+
+  // Prepare for the DQ form.
+  if (!DQFormBuckets.empty())
+    MadeChange |= dispFormPrep(L, DQFormBuckets, DQForm);
+
+  return MadeChange;
+}
diff --git a/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
deleted file mode 100644
index d252cfbd26b1..000000000000
--- a/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ /dev/null
@@ -1,605 +0,0 @@
-//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass to prepare loops for pre-increment addressing
-// modes. Additional PHIs are created for loop induction variables used by
-// load/store instructions so that the pre-increment forms can be used.
-// Generically, this means transforming loops like this: -// for (int i = 0; i < n; ++i) -// array[i] = c; -// to look like this: -// T *p = array[-1]; -// for (int i = 0; i < n; ++i) -// *++p = c; -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "ppc-loop-preinc-prep" - -#include "PPC.h" -#include "PPCSubtarget.h" -#include "PPCTargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Value.h" -#include "llvm/Pass.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include -#include -#include - -using namespace llvm; - -// By default, we limit this to creating 16 PHIs (which is a little over half -// of the allocatable register set). -static cl::opt MaxVars("ppc-preinc-prep-max-vars", - cl::Hidden, cl::init(16), - cl::desc("Potential PHI threshold for PPC preinc loop prep")); - -STATISTIC(PHINodeAlreadyExists, "PHI node already in pre-increment form"); -STATISTIC(UpdFormChainRewritten, "Num of update form chain rewritten"); - -namespace { - struct BucketElement { - BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} - BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - - const SCEVConstant *Offset; - Instruction *Instr; - }; - - struct Bucket { - Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), - Elements(1, BucketElement(I)) {} - - const SCEV *BaseSCEV; - SmallVector Elements; - }; - - class PPCLoopPreIncPrep : public FunctionPass { - public: - static char ID; // Pass ID, replacement for typeid - - PPCLoopPreIncPrep() : FunctionPass(ID) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); - } - - PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - } - - bool runOnFunction(Function &F) override; - - private: - PPCTargetMachine *TM = nullptr; - const PPCSubtarget *ST; - DominatorTree *DT; - LoopInfo *LI; - ScalarEvolution *SE; - bool PreserveLCSSA; - - bool runOnLoop(Loop *L); - - /// Check if required PHI node is already exist in Loop \p L. - bool alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV); - - /// Collect condition matched(\p isValidCandidate() returns true) - /// candidates in Loop \p L. - SmallVector - collectCandidates(Loop *L, - std::function - isValidCandidate, - unsigned MaxCandidateNum); - - /// Add a candidate to candidates \p Buckets. 
- void addOneCandidate(Instruction *MemI, const SCEV *LSCEV, - SmallVector &Buckets, - unsigned MaxCandidateNum); - - /// Prepare all candidates in \p Buckets for update form. - bool updateFormPrep(Loop *L, SmallVector &Buckets); - - /// Prepare for one chain \p BucketChain, find the best base element and - /// update all other elements in \p BucketChain accordingly. - bool prepareBaseForUpdateFormChain(Bucket &BucketChain); - - /// Rewrite load/store instructions in \p BucketChain according to - /// preparation. - bool rewriteLoadStores(Loop *L, Bucket &BucketChain, - SmallSet &BBChanged); - }; - -} // end anonymous namespace - -char PPCLoopPreIncPrep::ID = 0; -static const char *name = "Prepare loop for pre-inc. addressing modes"; -INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) - -static const std::string PHINodeNameSuffix = ".phi"; -static const std::string CastNodeNameSuffix = ".cast"; -static const std::string GEPNodeIncNameSuffix = ".inc"; -static const std::string GEPNodeOffNameSuffix = ".off"; - -FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { - return new PPCLoopPreIncPrep(TM); -} - -static bool IsPtrInBounds(Value *BasePtr) { - Value *StrippedBasePtr = BasePtr; - while (BitCastInst *BC = dyn_cast(StrippedBasePtr)) - StrippedBasePtr = BC->getOperand(0); - if (GetElementPtrInst *GEP = dyn_cast(StrippedBasePtr)) - return GEP->isInBounds(); - - return false; -} - -static std::string getInstrName(const Value *I, const std::string Suffix) { - assert(I && "Invalid paramater!"); - if (I->hasName()) - return (I->getName() + Suffix).str(); - else - return ""; -} - -static Value *GetPointerOperand(Value *MemI) { - if (LoadInst *LMemI = dyn_cast(MemI)) { - return LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(MemI)) { - return SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast(MemI)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) - return IMemI->getArgOperand(0); - } - - return nullptr; -} - -bool PPCLoopPreIncPrep::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - LI = &getAnalysis().getLoopInfo(); - SE = &getAnalysis().getSE(); - auto *DTWP = getAnalysisIfAvailable(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - ST = TM ? 
TM->getSubtargetImpl(F) : nullptr; - - bool MadeChange = false; - - for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) - for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) - MadeChange |= runOnLoop(*L); - - return MadeChange; -} - -void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, - SmallVector &Buckets, - unsigned MaxCandidateNum) { - assert((MemI && GetPointerOperand(MemI)) && - "Candidate should be a memory instruction."); - assert(LSCEV && "Invalid SCEV for Ptr value."); - bool FoundBucket = false; - for (auto &B : Buckets) { - const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); - if (const auto *CDiff = dyn_cast(Diff)) { - B.Elements.push_back(BucketElement(CDiff, MemI)); - FoundBucket = true; - break; - } - } - - if (!FoundBucket) { - if (Buckets.size() == MaxCandidateNum) - return; - Buckets.push_back(Bucket(LSCEV, MemI)); - } -} - -SmallVector PPCLoopPreIncPrep::collectCandidates( - Loop *L, - std::function isValidCandidate, - unsigned MaxCandidateNum) { - SmallVector Buckets; - for (const auto &BB : L->blocks()) - for (auto &J : *BB) { - Value *PtrValue; - Instruction *MemI; - - if (LoadInst *LMemI = dyn_cast(&J)) { - MemI = LMemI; - PtrValue = LMemI->getPointerOperand(); - } else if (StoreInst *SMemI = dyn_cast(&J)) { - MemI = SMemI; - PtrValue = SMemI->getPointerOperand(); - } else if (IntrinsicInst *IMemI = dyn_cast(&J)) { - if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { - MemI = IMemI; - PtrValue = IMemI->getArgOperand(0); - } else continue; - } else continue; - - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); - if (PtrAddrSpace) - continue; - - if (L->isLoopInvariant(PtrValue)) - continue; - - const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); - const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); - if (!LARSCEV || LARSCEV->getLoop() != L) - continue; - - if (isValidCandidate(&J, PtrValue)) - addOneCandidate(MemI, LSCEV, Buckets, MaxCandidateNum); - } - return Buckets; -} - -// TODO: implement a more clever base choosing policy. -// Currently we always choose an exist load/store offset. This maybe lead to -// suboptimal code sequences. For example, for one DS chain with offsets -// {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp -// for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a -// multipler of 4, it cannot be represented by sint16. -bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { - // We have a choice now of which instruction's memory operand we use as the - // base for the generated PHI. Always picking the first instruction in each - // bucket does not work well, specifically because that instruction might - // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, - // the choice is somewhat arbitrary, because the backend will happily - // generate direct offsets from both the pre-incremented and - // post-incremented pointer values. Thus, we'll pick the first non-prefetch - // instruction in each bucket, and adjust the recurrence and other offsets - // accordingly. - for (int j = 0, je = BucketChain.Elements.size(); j != je; ++j) { - if (auto *II = dyn_cast(BucketChain.Elements[j].Instr)) - if (II->getIntrinsicID() == Intrinsic::prefetch) - continue; - - // If we'd otherwise pick the first element anyway, there's nothing to do. - if (j == 0) - break; - - // If our chosen element has no offset from the base pointer, there's - // nothing to do. 
- if (!BucketChain.Elements[j].Offset || - BucketChain.Elements[j].Offset->isZero()) - break; - - const SCEV *Offset = BucketChain.Elements[j].Offset; - BucketChain.BaseSCEV = SE->getAddExpr(BucketChain.BaseSCEV, Offset); - for (auto &E : BucketChain.Elements) { - if (E.Offset) - E.Offset = cast(SE->getMinusSCEV(E.Offset, Offset)); - else - E.Offset = cast(SE->getNegativeSCEV(Offset)); - } - - std::swap(BucketChain.Elements[j], BucketChain.Elements[0]); - break; - } - return true; -} - -bool PPCLoopPreIncPrep::rewriteLoadStores( - Loop *L, Bucket &BucketChain, SmallSet &BBChanged) { - bool MadeChange = false; - const SCEVAddRecExpr *BasePtrSCEV = - cast(BucketChain.BaseSCEV); - if (!BasePtrSCEV->isAffine()) - return MadeChange; - - LLVM_DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); - - assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - - // The instruction corresponding to the Bucket's BaseSCEV must be the first - // in the vector of elements. - Instruction *MemI = BucketChain.Elements.begin()->Instr; - Value *BasePtr = GetPointerOperand(MemI); - assert(BasePtr && "No pointer operand"); - - Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); - Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), - BasePtr->getType()->getPointerAddressSpace()); - - const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); - if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) - return MadeChange; - - const SCEVConstant *BasePtrIncSCEV = - dyn_cast(BasePtrSCEV->getStepRecurrence(*SE)); - if (!BasePtrIncSCEV) - return MadeChange; - BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); - if (!isSafeToExpand(BasePtrStartSCEV, *SE)) - return MadeChange; - - if (alreadyPrepared(L, MemI, BasePtrStartSCEV, BasePtrIncSCEV)) - return MadeChange; - - LLVM_DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); - - BasicBlock *Header = L->getHeader(); - unsigned HeaderLoopPredCount = pred_size(Header); - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); - - PHINode *NewPHI = - PHINode::Create(I8PtrTy, HeaderLoopPredCount, - getInstrName(MemI, PHINodeNameSuffix), - Header->getFirstNonPHI()); - - SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); - Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, - LoopPredecessor->getTerminator()); - - // Note that LoopPredecessor might occur in the predecessor list multiple - // times, and we need to add it the right number of times. 
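The base-swapping loop in prepareBaseForUpdateFormChain() reduces to simple offset arithmetic. A minimal sketch, assuming integer offsets in place of SCEVConstants (the helper name is invented):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Offset models the constant distance from the bucket base; IsPrefetch
// marks elements that cannot become the base (no pre-increment dcbt).
struct Elem { int64_t Offset; bool IsPrefetch; };

void rebaseOnFirstNonPrefetch(int64_t &Base, std::vector<Elem> &Elems) {
  for (size_t J = 0; J != Elems.size(); ++J) {
    if (Elems[J].IsPrefetch)
      continue;
    if (J == 0 || Elems[J].Offset == 0)
      break;                       // already a usable base, nothing to do
    int64_t Off = Elems[J].Offset;
    Base += Off;                   // BaseSCEV = BaseSCEV + Offset
    for (Elem &E : Elems)
      E.Offset -= Off;             // rebase every element
    std::swap(Elems[J], Elems[0]); // the chosen element becomes the base
    break;
  }
}

int main() {
  int64_t Base = 1000;
  std::vector<Elem> Elems = {{0, true}, {16, false}, {32, false}};
  rebaseOnFirstNonPrefetch(Base, Elems);
  // The prefetch at offset 0 is skipped; the base moves to 1016 and all
  // offsets are re-expressed relative to the new base.
  assert(Base == 1016 && Elems[0].Offset == 0 && Elems[1].Offset == -16);
  return 0;
}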
- for (const auto &PI : predecessors(Header)) { - if (PI != LoopPredecessor) - continue; - - NewPHI->addIncoming(BasePtrStart, LoopPredecessor); - } - - Instruction *InsPoint = &*Header->getFirstInsertionPt(); - GetElementPtrInst *PtrInc = GetElementPtrInst::Create( - I8Ty, NewPHI, BasePtrIncSCEV->getValue(), - getInstrName(MemI, GEPNodeIncNameSuffix), InsPoint); - PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); - for (const auto &PI : predecessors(Header)) { - if (PI == LoopPredecessor) - continue; - - NewPHI->addIncoming(PtrInc, PI); - } - - Instruction *NewBasePtr; - if (PtrInc->getType() != BasePtr->getType()) - NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), - getInstrName(PtrInc, CastNodeNameSuffix), InsPoint); - else - NewBasePtr = PtrInc; - - if (Instruction *IDel = dyn_cast(BasePtr)) - BBChanged.insert(IDel->getParent()); - BasePtr->replaceAllUsesWith(NewBasePtr); - RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - - // Keep track of the replacement pointer values we've inserted so that we - // don't generate more pointer values than necessary. - SmallPtrSet NewPtrs; - NewPtrs.insert(NewBasePtr); - - for (auto I = std::next(BucketChain.Elements.begin()), - IE = BucketChain.Elements.end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->Instr); - assert(Ptr && "No pointer operand"); - if (NewPtrs.count(Ptr)) - continue; - - Instruction *RealNewPtr; - if (!I->Offset || I->Offset->getValue()->isZero()) { - RealNewPtr = NewBasePtr; - } else { - Instruction *PtrIP = dyn_cast(Ptr); - if (PtrIP && isa(NewBasePtr) && - cast(NewBasePtr)->getParent() == PtrIP->getParent()) - PtrIP = nullptr; - else if (PtrIP && isa(PtrIP)) - PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); - else if (!PtrIP) - PtrIP = I->Instr; - - GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, I->Offset->getValue(), - getInstrName(I->Instr, GEPNodeOffNameSuffix), PtrIP); - if (!PtrIP) - NewPtr->insertAfter(cast(PtrInc)); - NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); - RealNewPtr = NewPtr; - } - - if (Instruction *IDel = dyn_cast(Ptr)) - BBChanged.insert(IDel->getParent()); - - Instruction *ReplNewPtr; - if (Ptr->getType() != RealNewPtr->getType()) { - ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), - getInstrName(Ptr, CastNodeNameSuffix)); - ReplNewPtr->insertAfter(RealNewPtr); - } else - ReplNewPtr = RealNewPtr; - - Ptr->replaceAllUsesWith(ReplNewPtr); - RecursivelyDeleteTriviallyDeadInstructions(Ptr); - - NewPtrs.insert(RealNewPtr); - } - - MadeChange = true; - UpdFormChainRewritten++; - - return MadeChange; -} - -bool PPCLoopPreIncPrep::updateFormPrep(Loop *L, - SmallVector &Buckets) { - bool MadeChange = false; - if (Buckets.empty()) - return MadeChange; - SmallSet BBChanged; - for (auto &Bucket : Buckets) - // The base address of each bucket is transformed into a phi and the others - // are rewritten based on new base. - if (prepareBaseForUpdateFormChain(Bucket)) - MadeChange |= rewriteLoadStores(L, Bucket, BBChanged); - if (MadeChange) - for (auto &BB : L->blocks()) - if (BBChanged.count(BB)) - DeleteDeadPHIs(BB); - return MadeChange; -} - -// In order to prepare for the pre-increment a PHI is added. -// This function will check to see if that PHI already exists and will return -// true if it found an existing PHI with the same start and increment as the -// one we wanted to create. 
-bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, - const SCEV *BasePtrStartSCEV, - const SCEVConstant *BasePtrIncSCEV) { - BasicBlock *BB = MemI->getParent(); - if (!BB) - return false; - - BasicBlock *PredBB = L->getLoopPredecessor(); - BasicBlock *LatchBB = L->getLoopLatch(); - - if (!PredBB || !LatchBB) - return false; - - // Run through the PHIs and see if we have some that looks like a preparation - iterator_range PHIIter = BB->phis(); - for (auto & CurrentPHI : PHIIter) { - PHINode *CurrentPHINode = dyn_cast(&CurrentPHI); - if (!CurrentPHINode) - continue; - - if (!SE->isSCEVable(CurrentPHINode->getType())) - continue; - - const SCEV *PHISCEV = SE->getSCEVAtScope(CurrentPHINode, L); - - const SCEVAddRecExpr *PHIBasePtrSCEV = dyn_cast(PHISCEV); - if (!PHIBasePtrSCEV) - continue; - - const SCEVConstant *PHIBasePtrIncSCEV = - dyn_cast(PHIBasePtrSCEV->getStepRecurrence(*SE)); - if (!PHIBasePtrIncSCEV) - continue; - - if (CurrentPHINode->getNumIncomingValues() == 2) { - if ((CurrentPHINode->getIncomingBlock(0) == LatchBB && - CurrentPHINode->getIncomingBlock(1) == PredBB) || - (CurrentPHINode->getIncomingBlock(1) == LatchBB && - CurrentPHINode->getIncomingBlock(0) == PredBB)) { - if (PHIBasePtrSCEV->getStart() == BasePtrStartSCEV && - PHIBasePtrIncSCEV == BasePtrIncSCEV) { - // The existing PHI (CurrentPHINode) has the same start and increment - // as the PHI that we wanted to create. - ++PHINodeAlreadyExists; - return true; - } - } - } - } - return false; -} - -bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { - bool MadeChange = false; - - // Only prep. the inner-most loop - if (!L->empty()) - return MadeChange; - - LLVM_DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); - - BasicBlock *LoopPredecessor = L->getLoopPredecessor(); - // If there is no loop predecessor, or the loop predecessor's terminator - // returns a value (which might contribute to determining the loop's - // iteration space), insert a new preheader for the loop. - if (!LoopPredecessor || - !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); - if (LoopPredecessor) - MadeChange = true; - } - if (!LoopPredecessor) { - LLVM_DEBUG(dbgs() << "PIP fails since no predecessor for current loop.\n"); - return MadeChange; - } - - // Check if a load/store has update form. This lambda is used by function - // collectCandidates which can collect candidates for types defined by lambda. - auto isUpdateFormCandidate = [&] (const Instruction *I, - const Value *PtrValue) { - assert((PtrValue && I) && "Invalid parameter!"); - // There are no update forms for Altivec vector load/stores. - if (ST && ST->hasAltivec() && - PtrValue->getType()->getPointerElementType()->isVectorTy()) - return false; - // See getPreIndexedAddressParts, the displacement for LDU/STDU has to - // be 4's multiple (DS-form). For i64 loads/stores when the displacement - // fits in a 16-bit signed field but isn't a multiple of 4, it will be - // useless and possible to break some original well-form addressing mode - // to make this pre-inc prep for it. 
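The DS-form restriction described in the comment above can be checked in isolation. A hypothetical helper, not part of the pass, capturing the same test:

#include <cassert>
#include <cstdint>

// An i64 step that fits in a signed 16-bit field but is not 4-aligned
// cannot be encoded DS-form, so preparing it would only break an
// existing well-formed addressing mode.
bool isProfitableI64Step(int64_t Step) {
  bool FitsSInt16 = Step >= INT16_MIN && Step <= INT16_MAX;
  return !(FitsSInt16 && Step % 4 != 0);
}

int main() {
  assert(isProfitableI64Step(8));       // fits and is a multiple of 4
  assert(!isProfitableI64Step(2003));   // fits but is not 4-aligned
  assert(isProfitableI64Step(34772));   // too big for 16 bits anyway
  return 0;
}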
- if (PtrValue->getType()->getPointerElementType()->isIntegerTy(64)) { - const SCEV *LSCEV = SE->getSCEVAtScope(const_cast(PtrValue), L); - const SCEVAddRecExpr *LARSCEV = dyn_cast(LSCEV); - if (!LARSCEV || LARSCEV->getLoop() != L) - return false; - if (const SCEVConstant *StepConst = - dyn_cast(LARSCEV->getStepRecurrence(*SE))) { - const APInt &ConstInt = StepConst->getValue()->getValue(); - if (ConstInt.isSignedIntN(16) && ConstInt.srem(4) != 0) - return false; - } - } - return true; - }; - - // Collect buckets of comparable addresses used by loads, stores and prefetch - // intrinsic for update form. - SmallVector UpdateFormBuckets = - collectCandidates(L, isUpdateFormCandidate, MaxVars); - - // Prepare for update form. - if (!UpdateFormBuckets.empty()) - MadeChange |= updateFormPrep(L, UpdateFormBuckets); - - return MadeChange; -} diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp new file mode 100644 index 000000000000..83cca11b27a3 --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -0,0 +1,164 @@ +//===-- PPCLowerMASSVEntries.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements lowering of MASSV (SIMD) entries for specific PowerPC +// subtargets. +// Following is an example of a conversion specific to Power9 subtarget: +// __sind2_massv ---> __sind2_P9 +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "PPCSubtarget.h" +#include "PPCTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "ppc-lower-massv-entries" + +using namespace llvm; + +namespace { + +// Length of the suffix "massv", which is specific to IBM MASSV library entries. +const unsigned MASSVSuffixLength = 5; + +static StringRef MASSVFuncs[] = { +#define TLI_DEFINE_MASSV_VECFUNCS_NAMES +#include "llvm/Analysis/VecFuncs.def" +}; + +class PPCLowerMASSVEntries : public ModulePass { +public: + static char ID; + + PPCLowerMASSVEntries() : ModulePass(ID) {} + + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { return "PPC Lower MASS Entries"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + +private: + static bool isMASSVFunc(StringRef Name); + static StringRef getCPUSuffix(const PPCSubtarget *Subtarget); + static std::string createMASSVFuncName(Function &Func, + const PPCSubtarget *Subtarget); + bool lowerMASSVCall(CallInst *CI, Function &Func, Module &M, + const PPCSubtarget *Subtarget); +}; + +} // namespace + +/// Checks if the specified function name represents an entry in the MASSV +/// library. +bool PPCLowerMASSVEntries::isMASSVFunc(StringRef Name) { + auto Iter = std::find(std::begin(MASSVFuncs), std::end(MASSVFuncs), Name); + return Iter != std::end(MASSVFuncs); +} + +// FIXME: +/// Returns a string corresponding to the specified PowerPC subtarget. e.g.: +/// "P8" for Power8, "P9" for Power9. The string is used as a suffix while +/// generating subtarget-specific MASSV library functions. Current support +/// includes Power8 and Power9 subtargets. 
+StringRef PPCLowerMASSVEntries::getCPUSuffix(const PPCSubtarget *Subtarget) {
+  // Assume Power8 when Subtarget is unavailable.
+  if (!Subtarget)
+    return "P8";
+  if (Subtarget->hasP9Vector())
+    return "P9";
+  if (Subtarget->hasP8Vector())
+    return "P8";
+
+  report_fatal_error("Unsupported Subtarget: MASSV is supported only on "
+                     "Power8 and Power9 subtargets.");
+}
+
+/// Creates a PowerPC subtarget-specific name corresponding to the specified
+/// generic MASSV function and the given PowerPC subtarget.
+std::string
+PPCLowerMASSVEntries::createMASSVFuncName(Function &Func,
+                                          const PPCSubtarget *Subtarget) {
+  StringRef Suffix = getCPUSuffix(Subtarget);
+  auto GenericName = Func.getName().drop_back(MASSVSuffixLength).str();
+  std::string MASSVEntryName = GenericName + Suffix.str();
+  return MASSVEntryName;
+}
+
+/// Lowers generic MASSV entries to PowerPC subtarget-specific MASSV entries.
+/// e.g.: __sind2_massv --> __sind2_P9 for a Power9 subtarget.
+/// Both function prototypes and their callsites are updated during lowering.
+bool PPCLowerMASSVEntries::lowerMASSVCall(CallInst *CI, Function &Func,
+                                          Module &M,
+                                          const PPCSubtarget *Subtarget) {
+  if (CI->use_empty())
+    return false;
+
+  std::string MASSVEntryName = createMASSVFuncName(Func, Subtarget);
+  FunctionCallee FCache = M.getOrInsertFunction(
+      MASSVEntryName, Func.getFunctionType(), Func.getAttributes());
+
+  CallSite CS(CI);
+  CI->setCalledFunction(FCache);
+
+  return true;
+}
+
+bool PPCLowerMASSVEntries::runOnModule(Module &M) {
+  bool Changed = false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return Changed;
+
+  auto &TM = TPC->getTM<PPCTargetMachine>();
+  const PPCSubtarget *Subtarget;
+
+  for (Function &Func : M) {
+    if (!Func.isDeclaration())
+      continue;
+
+    if (!isMASSVFunc(Func.getName()))
+      continue;
+
+    // A call to lowerMASSVCall() invalidates the iterator over users when it
+    // replaces them, so precompute the current list of users to be able to
+    // rewrite all of the call sites.
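The renaming itself is just string surgery on the "massv" suffix. A self-contained sketch of createMASSVFuncName()'s effect, using plain std::string in place of StringRef:

#include <cassert>
#include <cstddef>
#include <string>

// Drop the trailing "massv" (MASSVSuffixLength == 5) and append the
// subtarget suffix produced by getCPUSuffix().
std::string massvEntryName(const std::string &Generic, const std::string &Cpu) {
  const std::size_t SuffixLen = 5; // strlen("massv")
  assert(Generic.size() > SuffixLen);
  return Generic.substr(0, Generic.size() - SuffixLen) + Cpu;
}

int main() {
  assert(massvEntryName("__sind2_massv", "P9") == "__sind2_P9");
  assert(massvEntryName("__logd2_massv", "P8") == "__logd2_P8");
  return 0;
}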
+ SmallVector MASSVUsers; + for (auto *User: Func.users()) + MASSVUsers.push_back(User); + + for (auto *User : MASSVUsers) { + auto *CI = dyn_cast(User); + if (!CI) + continue; + + Subtarget = &TM.getSubtarget(*CI->getParent()->getParent()); + Changed |= lowerMASSVCall(CI, Func, M, Subtarget); + } + } + + return Changed; +} + +char PPCLowerMASSVEntries::ID = 0; + +char &llvm::PPCLowerMASSVEntriesID = PPCLowerMASSVEntries::ID; + +INITIALIZE_PASS(PPCLowerMASSVEntries, DEBUG_TYPE, "Lower MASSV entries", false, + false) + +ModulePass *llvm::createPPCLowerMASSVEntriesPass() { + return new PPCLowerMASSVEntries(); +} diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index ac8ac060f460..74192cb20cd0 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -18,6 +18,8 @@ // //===---------------------------------------------------------------------===// +#include "MCTargetDesc/PPCMCTargetDesc.h" +#include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCInstrBuilder.h" #include "PPCInstrInfo.h" @@ -26,12 +28,12 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" -#include "MCTargetDesc/PPCPredicates.h" using namespace llvm; @@ -160,33 +162,33 @@ static MachineInstr *getVRegDefOrNull(MachineOperand *Op, static unsigned getKnownLeadingZeroCount(MachineInstr *MI, const PPCInstrInfo *TII) { unsigned Opcode = MI->getOpcode(); - if (Opcode == PPC::RLDICL || Opcode == PPC::RLDICLo || - Opcode == PPC::RLDCL || Opcode == PPC::RLDCLo) + if (Opcode == PPC::RLDICL || Opcode == PPC::RLDICL_rec || + Opcode == PPC::RLDCL || Opcode == PPC::RLDCL_rec) return MI->getOperand(3).getImm(); - if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDICo) && - MI->getOperand(3).getImm() <= 63 - MI->getOperand(2).getImm()) + if ((Opcode == PPC::RLDIC || Opcode == PPC::RLDIC_rec) && + MI->getOperand(3).getImm() <= 63 - MI->getOperand(2).getImm()) return MI->getOperand(3).getImm(); - if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINMo || - Opcode == PPC::RLWNM || Opcode == PPC::RLWNMo || + if ((Opcode == PPC::RLWINM || Opcode == PPC::RLWINM_rec || + Opcode == PPC::RLWNM || Opcode == PPC::RLWNM_rec || Opcode == PPC::RLWINM8 || Opcode == PPC::RLWNM8) && - MI->getOperand(3).getImm() <= MI->getOperand(4).getImm()) + MI->getOperand(3).getImm() <= MI->getOperand(4).getImm()) return 32 + MI->getOperand(3).getImm(); - if (Opcode == PPC::ANDIo) { + if (Opcode == PPC::ANDI_rec) { uint16_t Imm = MI->getOperand(2).getImm(); return 48 + countLeadingZeros(Imm); } - if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZWo || - Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZWo || + if (Opcode == PPC::CNTLZW || Opcode == PPC::CNTLZW_rec || + Opcode == PPC::CNTTZW || Opcode == PPC::CNTTZW_rec || Opcode == PPC::CNTLZW8 || Opcode == PPC::CNTTZW8) // The result ranges from 0 to 32. return 58; - if (Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZDo || - Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZDo) + if (Opcode == PPC::CNTLZD || Opcode == PPC::CNTLZD_rec || + Opcode == PPC::CNTTZD || Opcode == PPC::CNTTZD_rec) // The result ranges from 0 to 64. 
return 57; @@ -331,108 +333,121 @@ bool PPCMIPeephole::simplifyCode(void) { // is identified by an immediate value of 0 or 3. int Immed = MI.getOperand(3).getImm(); - if (Immed != 1) { - - // For each of these simplifications, we need the two source - // regs to match. Unfortunately, MachineCSE ignores COPY and - // SUBREG_TO_REG, so for example we can see - // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. - // We have to look through chains of COPY and SUBREG_TO_REG - // to find the real source values for comparison. - unsigned TrueReg1 = - TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); - unsigned TrueReg2 = - TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); - - if (TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1)) { - MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); - unsigned DefOpc = DefMI ? DefMI->getOpcode() : 0; - - // If this is a splat fed by a splatting load, the splat is - // redundant. Replace with a copy. This doesn't happen directly due - // to code in PPCDAGToDAGISel.cpp, but it can happen when converting - // a load of a double to a vector of 64-bit integers. - auto isConversionOfLoadAndSplat = [=]() -> bool { - if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS) - return false; - unsigned DefReg = - TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); - if (Register::isVirtualRegister(DefReg)) { - MachineInstr *LoadMI = MRI->getVRegDef(DefReg); - if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) - return true; - } - return false; - }; - if (DefMI && (Immed == 0 || Immed == 3)) { - if (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat()) { - LLVM_DEBUG(dbgs() << "Optimizing load-and-splat/splat " - "to load-and-splat/copy: "); - LLVM_DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), - MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - ToErase = &MI; - Simplified = true; - } - } + if (Immed == 1) + break; - // If this is a splat or a swap fed by another splat, we - // can replace it with a copy. - if (DefOpc == PPC::XXPERMDI) { - unsigned FeedImmed = DefMI->getOperand(3).getImm(); - unsigned FeedReg1 = - TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); - unsigned FeedReg2 = - TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI); - - if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { - LLVM_DEBUG(dbgs() << "Optimizing splat/swap or splat/splat " - "to splat/copy: "); - LLVM_DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), - MI.getOperand(0).getReg()) - .add(MI.getOperand(1)); - ToErase = &MI; - Simplified = true; - } - - // If this is a splat fed by a swap, we can simplify modify - // the splat to splat the other value from the swap's input - // parameter. - else if ((Immed == 0 || Immed == 3) - && FeedImmed == 2 && FeedReg1 == FeedReg2) { - LLVM_DEBUG(dbgs() << "Optimizing swap/splat => splat: "); - LLVM_DEBUG(MI.dump()); - MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); - MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); - MI.getOperand(3).setImm(3 - Immed); - Simplified = true; - } - - // If this is a swap fed by a swap, we can replace it - // with a copy from the first swap's input. 
- else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { - LLVM_DEBUG(dbgs() << "Optimizing swap/swap => copy: "); - LLVM_DEBUG(MI.dump()); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), - MI.getOperand(0).getReg()) - .add(DefMI->getOperand(1)); - ToErase = &MI; - Simplified = true; - } - } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs && - (DefMI->getOperand(2).getImm() == 0 || - DefMI->getOperand(2).getImm() == 3)) { - // Splat fed by another splat - switch the output of the first - // and remove the second. - DefMI->getOperand(0).setReg(MI.getOperand(0).getReg()); - ToErase = &MI; - Simplified = true; - LLVM_DEBUG(dbgs() << "Removing redundant splat: "); - LLVM_DEBUG(MI.dump()); - } + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = + TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI); + unsigned TrueReg2 = + TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI); + + if (!(TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1))) + break; + + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + if (!DefMI) + break; + + unsigned DefOpc = DefMI->getOpcode(); + + // If this is a splat fed by a splatting load, the splat is + // redundant. Replace with a copy. This doesn't happen directly due + // to code in PPCDAGToDAGISel.cpp, but it can happen when converting + // a load of a double to a vector of 64-bit integers. + auto isConversionOfLoadAndSplat = [=]() -> bool { + if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS) + return false; + unsigned FeedReg1 = + TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI); + if (Register::isVirtualRegister(FeedReg1)) { + MachineInstr *LoadMI = MRI->getVRegDef(FeedReg1); + if (LoadMI && LoadMI->getOpcode() == PPC::LXVDSX) + return true; + } + return false; + }; + if ((Immed == 0 || Immed == 3) && + (DefOpc == PPC::LXVDSX || isConversionOfLoadAndSplat())) { + LLVM_DEBUG(dbgs() << "Optimizing load-and-splat/splat " + "to load-and-splat/copy: "); + LLVM_DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. + if (DefOpc == PPC::XXPERMDI) { + unsigned DefReg1 = DefMI->getOperand(1).getReg(); + unsigned DefReg2 = DefMI->getOperand(2).getReg(); + unsigned DefImmed = DefMI->getOperand(3).getImm(); + + // If the two inputs are not the same register, check to see if + // they originate from the same virtual register after only + // copy-like instructions. 
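These XXPERMDI rewrites can be sanity-checked with a two-doubleword model. A sketch, assuming the usual DM-immediate semantics when both inputs are the same register (imm 0/3 splat an element, imm 2 swaps, imm 1 is the identity):

#include <array>
#include <cassert>
#include <cstdint>

using V2 = std::array<uint64_t, 2>;  // two doublewords of a VSX register

// DM bit 1 picks the doubleword taken from A, DM bit 0 the one from B.
V2 xxpermdi(const V2 &A, const V2 &B, unsigned Imm) {
  return {A[(Imm >> 1) & 1], B[Imm & 1]};
}

int main() {
  V2 S = {0x1111, 0x2222};
  V2 Swapped = xxpermdi(S, S, 2);
  // Splat fed by a swap: splat the other element, i.e. imm becomes 3-imm.
  assert(xxpermdi(Swapped, Swapped, 0) == xxpermdi(S, S, 3 - 0));
  assert(xxpermdi(Swapped, Swapped, 3) == xxpermdi(S, S, 3 - 3));
  // Swap fed by a swap: a plain copy of the original input.
  assert(xxpermdi(Swapped, Swapped, 2) == S);
  return 0;
}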
+ if (DefReg1 != DefReg2) { + unsigned FeedReg1 = TRI->lookThruCopyLike(DefReg1, MRI); + unsigned FeedReg2 = TRI->lookThruCopyLike(DefReg2, MRI); + + if (!(FeedReg1 == FeedReg2 && + Register::isVirtualRegister(FeedReg1))) + break; + } + + if (DefImmed == 0 || DefImmed == 3) { + LLVM_DEBUG(dbgs() << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + LLVM_DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; } + + // If this is a splat fed by a swap, we can simplify modify + // the splat to splat the other value from the swap's input + // parameter. + else if ((Immed == 0 || Immed == 3) && DefImmed == 2) { + LLVM_DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + LLVM_DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefReg1); + MI.getOperand(2).setReg(DefReg2); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && DefImmed == 2) { + LLVM_DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + LLVM_DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY), + MI.getOperand(0).getReg()) + .add(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs && + (DefMI->getOperand(2).getImm() == 0 || + DefMI->getOperand(2).getImm() == 3)) { + // Splat fed by another splat - switch the output of the first + // and remove the second. + DefMI->getOperand(0).setReg(MI.getOperand(0).getReg()); + ToErase = &MI; + Simplified = true; + LLVM_DEBUG(dbgs() << "Removing redundant splat: "); + LLVM_DEBUG(MI.dump()); } break; } @@ -805,6 +820,153 @@ bool PPCMIPeephole::simplifyCode(void) { combineSEXTAndSHL(MI, ToErase); break; } + case PPC::RLWINM: + case PPC::RLWINM_rec: + case PPC::RLWINM8: + case PPC::RLWINM8_rec: { + unsigned FoldingReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(FoldingReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); + if (SrcMI->getOpcode() != PPC::RLWINM && + SrcMI->getOpcode() != PPC::RLWINM_rec && + SrcMI->getOpcode() != PPC::RLWINM8 && + SrcMI->getOpcode() != PPC::RLWINM8_rec) + break; + assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && + MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && + SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && + "Invalid PPC::RLWINM Instruction!"); + uint64_t SHSrc = SrcMI->getOperand(2).getImm(); + uint64_t SHMI = MI.getOperand(2).getImm(); + uint64_t MBSrc = SrcMI->getOperand(3).getImm(); + uint64_t MBMI = MI.getOperand(3).getImm(); + uint64_t MESrc = SrcMI->getOperand(4).getImm(); + uint64_t MEMI = MI.getOperand(4).getImm(); + + assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) && + "Invalid PPC::RLWINM Instruction!"); + + // If MBMI is bigger than MEMI, we always can not get run of ones. + // RotatedSrcMask non-wrap: + // 0........31|32........63 + // RotatedSrcMask: B---E B---E + // MaskMI: -----------|--E B------ + // Result: ----- --- (Bad candidate) + // + // RotatedSrcMask wrap: + // 0........31|32........63 + // RotatedSrcMask: --E B----|--E B---- + // MaskMI: -----------|--E B------ + // Result: --- -----|--- ----- (Bad candidate) + // + // One special case is RotatedSrcMask is a full set mask. 
+      // RotatedSrcMask full:
+      //                 0........31|32........63
+      // RotatedSrcMask: ------EB---|-------EB---
+      // MaskMI:         -----------|--E  B------
+      // Result:         -----------|---  ------- (Good candidate)
+
+      // Mark special case.
+      bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31);
+
+      // For other MBMI > MEMI cases, just return.
+      if ((MBMI > MEMI) && !SrcMaskFull)
+        break;
+
+      // Handle MBMI <= MEMI cases.
+      APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI);
+      // In MI we only need the low 32 bits of SrcMI, so only consider the
+      // low 32 bits of SrcMI's mask. Note that in APInt the lowest bit is at
+      // index 0, while in the PowerPC ISA the lowest bit is at index 63.
+      APInt MaskSrc =
+          APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
+      // The current APInt::getBitsSetWithWrap sets all bits to 0 if loBit is
+      // equal to hiBit.
+      // If MBSrc - MESrc == 1, we expect a full set mask instead of a null
+      // one.
+      if (SrcMaskFull && (MBSrc - MESrc == 1))
+        MaskSrc.setAllBits();
+
+      APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
+      APInt FinalMask = RotatedSrcMask & MaskMI;
+      uint32_t NewMB, NewME;
+
+      // If the final mask is 0, the result of MI must be 0 too.
+      if (FinalMask.isNullValue()) {
+        bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 ||
+                        MI.getOpcode() == PPC::RLWINM8_rec);
+
+        Simplified = true;
+
+        LLVM_DEBUG(dbgs() << "Replace Instr: ");
+        LLVM_DEBUG(MI.dump());
+
+        if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) {
+          // Replace MI with "LI 0"
+          MI.RemoveOperand(4);
+          MI.RemoveOperand(3);
+          MI.RemoveOperand(2);
+          MI.getOperand(1).ChangeToImmediate(0);
+          MI.setDesc(TII->get(Is64Bit ? PPC::LI8 : PPC::LI));
+        } else {
+          // Replace MI with "ANDI_rec reg, 0"
+          MI.RemoveOperand(4);
+          MI.RemoveOperand(3);
+          MI.getOperand(2).setImm(0);
+          MI.setDesc(TII->get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec));
+          MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+          if (SrcMI->getOperand(1).isKill()) {
+            MI.getOperand(1).setIsKill(true);
+            SrcMI->getOperand(1).setIsKill(false);
+          } else
+            // About to replace MI.getOperand(1), clear its kill flag.
+            MI.getOperand(1).setIsKill(false);
+        }
+
+        LLVM_DEBUG(dbgs() << "With: ");
+        LLVM_DEBUG(MI.dump());
+      } else if ((isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB,
+                              NewME) && NewMB <= NewME) || SrcMaskFull) {
+        // We only handle the MBMI <= MEMI case here, so NewMB must be no
+        // bigger than NewME. Otherwise we would get a 64-bit value after
+        // folding, while MI returns a 32-bit value.
+
+        Simplified = true;
+        LLVM_DEBUG(dbgs() << "Converting Instr: ");
+        LLVM_DEBUG(MI.dump());
+
+        uint16_t NewSH = (SHSrc + SHMI) % 32;
+        MI.getOperand(2).setImm(NewSH);
+        // If the SrcMI mask is full, there is no need to update MBMI and
+        // MEMI.
+        if (!SrcMaskFull) {
+          MI.getOperand(3).setImm(NewMB);
+          MI.getOperand(4).setImm(NewME);
+        }
+        MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg());
+        if (SrcMI->getOperand(1).isKill()) {
+          MI.getOperand(1).setIsKill(true);
+          SrcMI->getOperand(1).setIsKill(false);
+        } else
+          // About to replace MI.getOperand(1), clear its kill flag.
+          MI.getOperand(1).setIsKill(false);
+
+        LLVM_DEBUG(dbgs() << "To: ");
+        LLVM_DEBUG(MI.dump());
+      }
+      if (Simplified) {
+        // If FoldingReg has no non-debug use and no implicit def (i.e. it is
+        // not RLWINM_rec or RLWINM8_rec), it is safe to delete its def SrcMI.
+        // Otherwise keep it.
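The folding arithmetic above can be verified on host integers. A standalone check of the full-source-mask special case, where only the rotation amounts compose (helper names here are ad hoc, not LLVM APIs):

#include <cassert>
#include <cstdint>

// PowerPC numbers bits from the MSB, so a MB..ME mask may wrap around;
// rotation is a plain 32-bit left rotate.
uint32_t rotl32(uint32_t V, unsigned N) {
  N &= 31;
  return N ? (V << N) | (V >> (32 - N)) : V;
}

uint32_t mbmeMask(unsigned MB, unsigned ME) {
  uint32_t FromMB = 0xFFFFFFFFu >> MB;         // bits MB..31 set
  uint32_t UpToME = 0xFFFFFFFFu << (31 - ME);  // bits 0..ME set
  return MB <= ME ? (FromMB & UpToME) : (FromMB | UpToME);
}

// rlwinm rd, rs, SH, MB, ME  ==  rotl32(rs, SH) & mbmeMask(MB, ME)
uint32_t rlwinm(uint32_t RS, unsigned SH, unsigned MB, unsigned ME) {
  return rotl32(RS, SH) & mbmeMask(MB, ME);
}

int main() {
  // SrcMaskFull case: when the inner mask covers all 32 bits, the two
  // rotations compose, NewSH = (SHSrc + SHMI) % 32, and the outer mask is
  // kept as-is, exactly as in the transformation above.
  uint32_t X = 0xDEADBEEFu;
  for (unsigned SHSrc = 0; SHSrc < 32; ++SHSrc)
    for (unsigned SHMI = 0; SHMI < 32; ++SHMI)
      assert(rlwinm(rlwinm(X, SHSrc, 0, 31), SHMI, 8, 23) ==
             rlwinm(X, (SHSrc + SHMI) % 32, 8, 23));
  return 0;
}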
+ ++NumRotatesCollapsed; + if (MRI->use_nodbg_empty(FoldingReg) && !SrcMI->hasImplicitDef()) { + ToErase = SrcMI; + LLVM_DEBUG(dbgs() << "Delete dead instruction: "); + LLVM_DEBUG(SrcMI->dump()); + } + } + break; + } } } diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index dfae19804d94..2b341b5952c8 100644 --- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -42,7 +42,7 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current /// function. This is only valid after the initial scan of the function by /// PEI. - bool MustSaveLR; + bool MustSaveLR = false; /// MustSaveTOC - Indicates that the TOC save needs to be performed in the /// prologue of the function. This is typically the case when there are diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp index b1c0433641dd..a4b4bf2973d1 100644 --- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp @@ -35,6 +35,8 @@ STATISTIC(NumRemovedInPreEmit, "Number of instructions deleted in pre-emit peephole"); STATISTIC(NumberOfSelfCopies, "Number of self copy instructions eliminated"); +STATISTIC(NumFrameOffFoldInPreEmit, + "Number of folding frame offset by using r+r in pre-emit peephole"); static cl::opt RunPreEmitPeephole("ppc-late-peephole", cl::Hidden, cl::init(true), @@ -161,8 +163,19 @@ namespace { } bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) + if (skipFunction(MF.getFunction()) || !RunPreEmitPeephole) { + // Remove UNENCODED_NOP even when this pass is disabled. + // This needs to be done unconditionally so we don't emit zeros + // in the instruction stream. + SmallVector InstrsToErase; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.getOpcode() == PPC::UNENCODED_NOP) + InstrsToErase.push_back(&MI); + for (MachineInstr *MI : InstrsToErase) + MI->eraseFromParent(); return false; + } bool Changed = false; const PPCInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); @@ -171,6 +184,10 @@ namespace { Changed |= removeRedundantLIs(MBB, TRI); for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); + if (Opc == PPC::UNENCODED_NOP) { + InstrsToErase.push_back(&MI); + continue; + } // Detect self copies - these can result from running AADB. 
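The UNENCODED_NOP cleanup above uses the usual collect-then-erase idiom so the walk never erases under its own iterator. The same shape on plain containers (opcodes invented):

#include <cassert>
#include <list>
#include <vector>

enum Opcode { UNENCODED_NOP, ADD, STORE };

// Offending "instructions" are collected first and erased afterwards.
void removeUnencodedNops(std::list<Opcode> &Block) {
  std::vector<std::list<Opcode>::iterator> ToErase;
  for (auto It = Block.begin(); It != Block.end(); ++It)
    if (*It == UNENCODED_NOP)
      ToErase.push_back(It);
  for (auto It : ToErase)
    Block.erase(It);  // list::erase only invalidates the erased iterator
}

int main() {
  std::list<Opcode> Block = {ADD, UNENCODED_NOP, STORE, UNENCODED_NOP};
  removeUnencodedNops(Block);
  assert(Block.size() == 2 && Block.front() == ADD && Block.back() == STORE);
  return 0;
}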
if (PPCInstrInfo::isSameClassPhysRegCopy(Opc)) { const MCInstrDesc &MCID = TII->get(Opc); @@ -202,6 +219,12 @@ namespace { InstrsToErase.push_back(DefMIToErase); } } + if (TII->foldFrameOffset(MI)) { + Changed = true; + NumFrameOffFoldInPreEmit++; + LLVM_DEBUG(dbgs() << "Frame offset folding by using index form: "); + LLVM_DEBUG(MI.dump()); + } } // Eliminate conditional branch based on a constant CR bit by diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp index 3b71ed219c17..90cc81beb89d 100644 --- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp +++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Config/llvm-config.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -375,10 +376,10 @@ public: }; private: - const PPCInstrInfo *TII; - MachineFunction *MF; - MachineRegisterInfo *MRI; - const MachineBranchProbabilityInfo *MBPI; + const PPCInstrInfo *TII = nullptr; + MachineFunction *MF = nullptr; + MachineRegisterInfo *MRI = nullptr; + const MachineBranchProbabilityInfo *MBPI = nullptr; // A vector to contain all the CR logical operations SmallVector AllCRLogicalOps; @@ -470,21 +471,21 @@ PPCReduceCRLogicals::createCRLogicalOpInfo(MachineInstr &MIParam) { } else { MachineInstr *Def1 = lookThroughCRCopy(MIParam.getOperand(1).getReg(), Ret.SubregDef1, Ret.CopyDefs.first); + assert(Def1 && "Must be able to find a definition of operand 1."); Ret.DefsSingleUse &= MRI->hasOneNonDBGUse(Def1->getOperand(0).getReg()); Ret.DefsSingleUse &= MRI->hasOneNonDBGUse(Ret.CopyDefs.first->getOperand(0).getReg()); - assert(Def1 && "Must be able to find a definition of operand 1."); if (isBinary(MIParam)) { Ret.IsBinary = 1; MachineInstr *Def2 = lookThroughCRCopy(MIParam.getOperand(2).getReg(), Ret.SubregDef2, Ret.CopyDefs.second); + assert(Def2 && "Must be able to find a definition of operand 2."); Ret.DefsSingleUse &= MRI->hasOneNonDBGUse(Def2->getOperand(0).getReg()); Ret.DefsSingleUse &= MRI->hasOneNonDBGUse(Ret.CopyDefs.second->getOperand(0).getReg()); - assert(Def2 && "Must be able to find a definition of operand 2."); Ret.TrueDefs = std::make_pair(Def1, Def2); } else { Ret.TrueDefs = std::make_pair(Def1, nullptr); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 9ec26a19bdaa..01b97ba6ab20 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -380,7 +380,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co // This is eiher: // 1) A fixed frame index object which we know are aligned so // as long as we have a valid DForm/DSForm/DQForm (non XForm) we don't - // need to consider the alignement here. + // need to consider the alignment here. // 2) A not fixed object but in that case we now know that the min required // alignment is no more than 1 based on the previous check. if (InstrInfo->isXFormMemOp(Opcode)) @@ -747,12 +747,18 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, Register SrcReg = MI.getOperand(0).getReg(); // Search up the BB to find the definition of the CR bit. 
- MachineBasicBlock::reverse_iterator Ins; + MachineBasicBlock::reverse_iterator Ins = MI; + MachineBasicBlock::reverse_iterator Rend = MBB.rend(); + ++Ins; unsigned CRBitSpillDistance = 0; - for (Ins = MI; Ins != MBB.rend(); Ins++) { + bool SeenUse = false; + for (; Ins != Rend; ++Ins) { // Definition found. if (Ins->modifiesRegister(SrcReg, TRI)) break; + // Use found. + if (Ins->readsRegister(SrcReg, TRI)) + SeenUse = true; // Unable to find CR bit definition within maximum search distance. if (CRBitSpillDistance == MaxCRBitSpillDist) { Ins = MI; @@ -767,17 +773,35 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, if (Ins == MBB.rend()) Ins = MI; + bool SpillsKnownBit = false; // There is no need to extract the CR bit if its value is already known. switch (Ins->getOpcode()) { case PPC::CRUNSET: BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LI8 : PPC::LI), Reg) .addImm(0); + SpillsKnownBit = true; break; case PPC::CRSET: BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LIS8 : PPC::LIS), Reg) .addImm(-32768); + SpillsKnownBit = true; break; default: + // On Power9, we can use SETB to extract the LT bit. This only works for + // the LT bit since SETB produces -1/1/0 for LT/GT/. So the value + // of the bit we care about (32-bit sign bit) will be set to the value of + // the LT bit (regardless of the other bits in the CR field). + if (Subtarget.isISA3_0()) { + if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR1LT || + SrcReg == PPC::CR2LT || SrcReg == PPC::CR3LT || + SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT || + SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) { + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg) + .addReg(getCRFromCRBit(SrcReg), RegState::Undef); + break; + } + } + // We need to move the CR field that contains the CR bit we are spilling. // The super register may not be explicitly defined (i.e. it can be defined // by a CR-logical that only defines the subreg) so we state that the CR @@ -803,8 +827,13 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, .addReg(Reg, RegState::Kill), FrameIndex); + bool KillsCRBit = MI.killsRegister(SrcReg, TRI); // Discard the pseudo instruction. MBB.erase(II); + if (SpillsKnownBit && KillsCRBit && !SeenUse) { + Ins->setDesc(TII.get(PPC::UNENCODED_NOP)); + Ins->RemoveOperand(0); + } } void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index a50e05920cd4..a5fbb0c6ec64 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -61,6 +61,15 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { public: PPCRegisterInfo(const PPCTargetMachine &TM); + /// getMappedIdxOpcForImmOpc - Return the mapped index form load/store opcode + /// for a given imm form load/store opcode \p ImmFormOpcode. + /// FIXME: move this to PPCInstrInfo class. + unsigned getMappedIdxOpcForImmOpc(unsigned ImmOpcode) const { + if (!ImmToIdxMap.count(ImmOpcode)) + return PPC::INSTRUCTION_LIST_END; + return ImmToIdxMap.find(ImmOpcode)->second; + } + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. 
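getMappedIdxOpcForImmOpc() above is a thin wrapper over the ImmToIdxMap lookup. A minimal model with invented opcode values (the real map relates D-form/DS-form load/store opcodes to their X-form, register+register counterparts):

#include <cassert>
#include <map>

enum Opc : unsigned { LD, LDX, STD_, STDX, INSTRUCTION_LIST_END };

unsigned mappedIdxOpc(const std::map<unsigned, unsigned> &ImmToIdxMap,
                      unsigned ImmOpcode) {
  // A single find() gives the same result as the count()-then-find() pair.
  auto It = ImmToIdxMap.find(ImmOpcode);
  return It == ImmToIdxMap.end() ? INSTRUCTION_LIST_END : It->second;
}

int main() {
  std::map<unsigned, unsigned> ImmToIdxMap = {{LD, LDX}, {STD_, STDX}};
  assert(mappedIdxOpc(ImmToIdxMap, LD) == LDX);
  assert(mappedIdxOpc(ImmToIdxMap, 999) == INSTRUCTION_LIST_END);
  return 0;
}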
const TargetRegisterClass * diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 10568ed4b655..0997f68bd999 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -61,7 +61,7 @@ PPCSubtarget::PPCSubtarget(const Triple &TT, const std::string &CPU, void PPCSubtarget::initializeEnvironment() { StackAlignment = Align(16); - DarwinDirective = PPC::DIR_NONE; + CPUDirective = PPC::DIR_NONE; HasMFOCRF = false; Has64BitSupport = false; Use64BitRegs = false; @@ -100,6 +100,7 @@ void PPCSubtarget::initializeEnvironment() { IsPPC6xx = false; IsE500 = false; FeatureMFTB = false; + AllowsUnalignedFPAccess = false; DeprecatedDST = false; HasLazyResolverStubs = false; HasICBT = false; @@ -126,6 +127,8 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // If cross-compiling with -march=ppc64le without -mcpu if (TargetTriple.getArch() == Triple::ppc64le) CPUName = "ppc64le"; + else if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe) + CPUName = "e500"; else CPUName = "generic"; } @@ -190,7 +193,7 @@ bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const { bool PPCSubtarget::enableMachineScheduler() const { return true; } bool PPCSubtarget::enableMachinePipeliner() const { - return (DarwinDirective == PPC::DIR_PWR9) && EnableMachinePipeliner; + return (CPUDirective == PPC::DIR_PWR9) && EnableMachinePipeliner; } bool PPCSubtarget::useDFAforSMS() const { return false; } diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index d96c2893aee9..044e982740e9 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -57,6 +57,7 @@ namespace PPC { DIR_PWR7, DIR_PWR8, DIR_PWR9, + DIR_PWR_FUTURE, DIR_64 }; } @@ -84,7 +85,7 @@ protected: InstrItineraryData InstrItins; /// Which cpu directive was used. - unsigned DarwinDirective; + unsigned CPUDirective; /// Used by the ISel to turn in optimizations for POWER4-derived architectures bool HasMFOCRF; @@ -123,6 +124,7 @@ protected: bool IsPPC4xx; bool IsPPC6xx; bool FeatureMFTB; + bool AllowsUnalignedFPAccess; bool DeprecatedDST; bool HasLazyResolverStubs; bool IsLittleEndian; @@ -169,8 +171,11 @@ public: Align getStackAlignment() const { return StackAlignment; } /// getDarwinDirective - Returns the -m directive specified for the cpu. + unsigned getDarwinDirective() const { return CPUDirective; } + + /// getCPUDirective - Returns the -m directive specified for the cpu. /// - unsigned getDarwinDirective() const { return DarwinDirective; } + unsigned getCPUDirective() const { return CPUDirective; } /// getInstrItins - Return the instruction itineraries based on subtarget /// selection. @@ -270,6 +275,7 @@ public: bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; } bool isE500() const { return IsE500; } bool isFeatureMFTB() const { return FeatureMFTB; } + bool allowsUnalignedFPAccess() const { return AllowsUnalignedFPAccess; } bool isDeprecatedDST() const { return DeprecatedDST; } bool hasICBT() const { return HasICBT; } bool hasInvariantFunctionDescriptors() const { @@ -347,6 +353,41 @@ public: /// True if the GV will be accessed via an indirect symbol. bool isGVIndirectSymbol(const GlobalValue *GV) const; + /// True if the ABI is descriptor based. + bool usesFunctionDescriptors() const { + // Both 32-bit and 64-bit AIX are descriptor based. For ELF only the 64-bit + // v1 ABI uses descriptors. 
+ return isAIXABI() || (is64BitELFABI() && !isELFv2ABI()); + } + + unsigned descriptorTOCAnchorOffset() const { + assert(usesFunctionDescriptors() && + "Should only be called when the target uses descriptors."); + return IsPPC64 ? 8 : 4; + } + + unsigned descriptorEnvironmentPointerOffset() const { + assert(usesFunctionDescriptors() && + "Should only be called when the target uses descriptors."); + return IsPPC64 ? 16 : 8; + } + + MCRegister getEnvironmentPointerRegister() const { + assert(usesFunctionDescriptors() && + "Should only be called when the target uses descriptors."); + return IsPPC64 ? PPC::X11 : PPC::R11; + } + + MCRegister getTOCPointerRegister() const { + assert((is64BitELFABI() || isAIXABI()) && + "Should only be called when the target is a TOC based ABI."); + return IsPPC64 ? PPC::X2 : PPC::R2; + } + + MCRegister getStackPointerRegister() const { + return IsPPC64 ? PPC::X1 : PPC::R1; + } + bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; } }; } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 8f313d9d01c4..17e1196eea59 100644 --- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index abefee8b339d..2caf4c99a1f8 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -51,8 +51,8 @@ opt DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden, cl::desc("Disable CTR loops for PPC")); static cl:: -opt DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden, - cl::desc("Disable PPC loop preinc prep")); +opt DisableInstrFormPrep("disable-ppc-instr-form-prep", cl::Hidden, + cl::desc("Disable PPC loop instr form prep")); static cl::opt VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early", @@ -77,7 +77,7 @@ EnableGEPOpt("ppc-gep-opt", cl::Hidden, static cl::opt EnablePrefetch("enable-ppc-prefetching", - cl::desc("disable software prefetching on PPC"), + cl::desc("enable software prefetching on PPC"), cl::init(false), cl::Hidden); static cl::opt @@ -94,7 +94,7 @@ static cl::opt ReduceCRLogical("ppc-reduce-cr-logicals", cl::desc("Expand eligible cr-logical binary ops to branches"), cl::init(true), cl::Hidden); -extern "C" void LLVMInitializePowerPCTarget() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine A(getThePPC32Target()); RegisterTargetMachine B(getThePPC64Target()); @@ -104,7 +104,7 @@ extern "C" void LLVMInitializePowerPCTarget() { #ifndef NDEBUG initializePPCCTRLoopsVerifyPass(PR); #endif - initializePPCLoopPreIncPrepPass(PR); + initializePPCLoopInstrFormPrepPass(PR); initializePPCTOCRegDepsPass(PR); initializePPCEarlyReturnPass(PR); initializePPCVSXCopyPass(PR); @@ -119,6 +119,7 @@ extern "C" void LLVMInitializePowerPCTarget() { initializePPCPreEmitPeepholePass(PR); initializePPCTLSDynamicCallPass(PR); initializePPCMIPeepholePass(PR); + initializePPCLowerMASSVEntriesPass(PR); } /// Return the datalayout string of a subtarget. 
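The descriptor offsets in the subtarget helpers above (8 and 16 on 64-bit, 4 and 8 on 32-bit) follow directly from a three-pointer layout. A sketch, assuming the conventional entry/TOC/environment ordering:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Each field is pointer-sized; the offsets match descriptorTOCAnchorOffset
// and descriptorEnvironmentPointerOffset in this hunk.
template <typename Ptr> struct FunctionDescriptor {
  Ptr EntryPoint;
  Ptr TOCAnchor;
  Ptr Environment;
};

int main() {
  assert(offsetof(FunctionDescriptor<uint64_t>, TOCAnchor) == 8);
  assert(offsetof(FunctionDescriptor<uint64_t>, Environment) == 16);
  assert(offsetof(FunctionDescriptor<uint32_t>, TOCAnchor) == 4);
  assert(offsetof(FunctionDescriptor<uint32_t>, Environment) == 8);
  return 0;
}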
@@ -214,8 +215,6 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, case Triple::ppc64le: return PPCTargetMachine::PPC_ABI_ELFv2; case Triple::ppc64: - if (TT.getEnvironment() == llvm::Triple::ELFv2) - return PPCTargetMachine::PPC_ABI_ELFv2; return PPCTargetMachine::PPC_ABI_ELFv1; default: return PPCTargetMachine::PPC_ABI_UNKNOWN; @@ -401,6 +400,9 @@ void PPCPassConfig::addIRPasses() { addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass()); + // Lower generic MASSV routines to PowerPC subtarget-specific entries. + addPass(createPPCLowerMASSVEntriesPass()); + // For the BG/Q (or if explicitly requested), add explicit data prefetch // intrinsics. bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ && @@ -427,8 +429,8 @@ void PPCPassConfig::addIRPasses() { } bool PPCPassConfig::addPreISel() { - if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None) - addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); + if (!DisableInstrFormPrep && getOptLevel() != CodeGenOpt::None) + addPass(createPPCLoopInstrFormPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) addPass(createHardwareLoopsPass()); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index f51300c656aa..e05699cc95ec 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -84,10 +84,10 @@ int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) { +int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) - return BaseT::getIntImmCost(IID, Idx, Imm, Ty); + return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty); assert(Ty->isIntegerTy()); @@ -118,10 +118,10 @@ int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) { +int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) - return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); + return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty); assert(Ty->isIntegerTy()); @@ -283,24 +283,6 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, case Intrinsic::loop_decrement: return true; -// VisualStudio defines setjmp as _setjmp -#if defined(_MSC_VER) && defined(setjmp) && \ - !defined(setjmp_undefined_for_msvc) -# pragma push_macro("setjmp") -# undef setjmp -# define setjmp_undefined_for_msvc -#endif - - case Intrinsic::setjmp: - -#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) - // let's return it to _setjmp state -# pragma pop_macro("setjmp") -# undef setjmp_undefined_for_msvc -#endif - - case Intrinsic::longjmp: - // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp // because, although it does clobber the counter register, the // control can't then return to inside the loop unless there is also @@ -331,8 +313,12 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, case Intrinsic::ceil: Opcode = ISD::FCEIL; break; case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; case Intrinsic::rint: Opcode = ISD::FRINT; break; + case Intrinsic::lrint: Opcode = ISD::LRINT; break; + case Intrinsic::llrint: Opcode = ISD::LLRINT; break; case 
Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; case Intrinsic::round: Opcode = ISD::FROUND; break; + case Intrinsic::lround: Opcode = ISD::LROUND; break; + case Intrinsic::llround: Opcode = ISD::LLROUND; break; case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; @@ -550,7 +536,7 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { - if (ST->getDarwinDirective() == PPC::DIR_A2) { + if (ST->getCPUDirective() == PPC::DIR_A2) { // The A2 is in-order with a deep pipeline, and concatenation unrolling // helps expose latency-hiding opportunities to the instruction scheduler. UP.Partial = UP.Runtime = true; @@ -576,7 +562,7 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { // on combining the loads generated for consecutive accesses, and failure to // do so is particularly expensive. This makes it much more likely (compared // to only using concatenation unrolling). - if (ST->getDarwinDirective() == PPC::DIR_A2) + if (ST->getCPUDirective() == PPC::DIR_A2) return true; return LoopHasReductions; @@ -598,8 +584,8 @@ unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC || ClassID == VSXRC); if (ST->hasVSX()) { - assert(ClassID == GPRRC || ClassID == VSXRC); - return ClassID == GPRRC ? 32 : 64; + assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC); + return ClassID == VSXRC ? 64 : 32; } assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC); return 32; @@ -608,8 +594,14 @@ unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const { unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const { if (Vector) return ST->hasVSX() ? VSXRC : VRRC; - else if (Ty && Ty->getScalarType()->isFloatTy()) + else if (Ty && (Ty->getScalarType()->isFloatTy() || + Ty->getScalarType()->isDoubleTy())) return ST->hasVSX() ? VSXRC : FPRRC; + else if (Ty && (Ty->getScalarType()->isFP128Ty() || + Ty->getScalarType()->isPPC_FP128Ty())) + return VRRC; + else if (Ty && Ty->getScalarType()->isHalfTy()) + return VSXRC; else return GPRRC; } @@ -646,9 +638,10 @@ unsigned PPCTTIImpl::getCacheLineSize() const { return CacheLineSize; // On P7, P8 or P9 we have a cache line size of 128. - unsigned Directive = ST->getDarwinDirective(); + unsigned Directive = ST->getCPUDirective(); + // Assume that Future CPU has the same cache line size as the others. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || - Directive == PPC::DIR_PWR9) + Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE) return 128; // On other processors return a default of 64 bytes. @@ -662,7 +655,7 @@ unsigned PPCTTIImpl::getPrefetchDistance() const { } unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { - unsigned Directive = ST->getDarwinDirective(); + unsigned Directive = ST->getCPUDirective(); // The 440 has no SIMD support, but floating-point instructions // have a 5-cycle latency, so unroll by 5x for latency hiding. if (Directive == PPC::DIR_440) @@ -680,8 +673,9 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { // For P7 and P8, floating-point instructions have a 6-cycle latency and // there are two execution units, so unroll by 12x for latency hiding. 
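The unroll factors chosen by getMaxInterleaveFactor() condense to a directive switch, with DIR_PWR_FUTURE deliberately inheriting the POWER7/8/9 value. A simplified model (enum ordering and the helper itself are invented; the factors mirror the surrounding hunk):

enum Directive { DIR_440, DIR_A2, DIR_PWR7, DIR_PWR8, DIR_PWR9,
                 DIR_PWR_FUTURE, DIR_OTHER };

unsigned maxInterleaveFactor(Directive D) {
  switch (D) {
  case DIR_440: return 5;           // 5-cycle FP latency, no SIMD
  case DIR_A2:  return 6;           // in-order, deep pipeline
  case DIR_PWR7:
  case DIR_PWR8:
  case DIR_PWR9:
  case DIR_PWR_FUTURE: return 12;   // 6-cycle FP latency x two units
  default: return 2;                // two execution units on most cores
  }
}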
// FIXME: the same for P9 as previous gen until POWER9 scheduling is ready + // Assume that future is the same as the others. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || - Directive == PPC::DIR_PWR9) + Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE) return 12; // For most things, modern systems have two execution units (and @@ -716,10 +710,13 @@ int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, return Cost * 2; } -int PPCTTIImpl::getArithmeticInstrCost( - unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, - TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, - TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) { +int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::OperandValueKind Op1Info, + TTI::OperandValueKind Op2Info, + TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args, + const Instruction *CxtI) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. @@ -829,8 +826,9 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return Cost; } -int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace, const Instruction *I) { +int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + MaybeAlign Alignment, unsigned AddressSpace, + const Instruction *I) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -888,7 +886,8 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, // to be decomposed based on the alignment factor. // Add the cost of each scalar load or store. - Cost += LT.first*(SrcBytes/Alignment-1); + assert(Alignment); + Cost += LT.first * ((SrcBytes / Alignment->value()) - 1); // For a vector type, there is also scalarization overhead (only for // stores, loads are expanded using the vector-load + permutation sequence, @@ -919,7 +918,8 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, std::pair LT = TLI->getTypeLegalizationCost(DL, VecTy); // Firstly, the cost of load/store operation. 
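The unaligned-access cost in getMemoryOpCost(), LT.first * (SrcBytes / Alignment - 1), charges one extra operation per aligned piece after the first. A worked check in plain arithmetic:

#include <cassert>

// BaseCost models LT.first; the access is decomposed into
// SrcBytes / AlignBytes pieces based on the alignment factor.
int unalignedMemOpCost(int BaseCost, int SrcBytes, int AlignBytes) {
  assert(AlignBytes > 0 && SrcBytes % AlignBytes == 0);
  return BaseCost + BaseCost * (SrcBytes / AlignBytes - 1);
}

int main() {
  assert(unalignedMemOpCost(1, 16, 4) == 4);   // 16-byte vector, 4-byte align
  assert(unalignedMemOpCost(1, 16, 16) == 1);  // fully aligned: no extra cost
  return 0;
}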
- int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + int Cost = + getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace); // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations // (at least in the sense that there need only be one non-loop-invariant @@ -931,6 +931,20 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, return Cost; } +unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, FastMathFlags FMF, unsigned VF) { + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); +} + +unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed) { + if (ID == Intrinsic::bswap && ST->hasP9Vector()) + return TLI->getTypeLegalizationCost(DL, RetTy).first; + return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, + ScalarizationCostPassed); +} + bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 83a70364bf68..35388d14f606 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -46,9 +46,10 @@ public: using BaseT::getIntImmCost; int getIntImmCost(const APInt &Imm, Type *Ty); - int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); - int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty); + int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); unsigned getUserCost(const User *U, ArrayRef Operands); @@ -90,14 +91,15 @@ public: TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, - ArrayRef Args = ArrayRef()); + ArrayRef Args = ArrayRef(), + const Instruction *CxtI = nullptr); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, @@ -106,6 +108,11 @@ public: unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Args, FastMathFlags FMF, unsigned VF); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX); /// @} }; diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 5e150be544ed..3e6d1c7939f1 100644 --- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" +#include 
"llvm/InitializePasses.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp index 99b5dec74668..649bd648a6cf 100644 --- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp +++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp @@ -23,7 +23,7 @@ Target &llvm::getThePPC64LETarget() { return ThePPC64LETarget; } -extern "C" void LLVMInitializePowerPCTargetInfo() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() { RegisterTarget X(getThePPC32Target(), "ppc32", "PowerPC 32", "PPC"); -- cgit v1.2.3