Diffstat (limited to 'contrib/llvm/lib/Target')
129 files changed, 10102 insertions, 976 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 3ef3c8b840cb..f398117de953 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2487,15 +2487,36 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, return true; } -/// Return true when there is potentially a faster code sequence -/// for an instruction chain ending in \p Root. All potential patterns are -/// listed -/// in the \p Pattern vector. Pattern should be sorted in priority order since -/// the pattern evaluator stops checking as soon as it finds a faster sequence. +// TODO: There are many more machine instruction opcodes to match: +// 1. Other data types (integer, vectors) +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case AArch64::FADDDrr: + case AArch64::FADDSrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FMULDrr: + case AArch64::FMULSrr: + case AArch64::FMULX32: + case AArch64::FMULX64: + case AArch64::FMULXv2f32: + case AArch64::FMULXv2f64: + case AArch64::FMULXv4f32: + case AArch64::FMULv2f32: + case AArch64::FMULv2f64: + case AArch64::FMULv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + return false; + } +} -bool AArch64InstrInfo::getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern> &Patterns) const { +/// Find instructions that can be turned into madd. +static bool getMaddPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2600,6 +2621,20 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( return Found; } +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool AArch64InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + if (getMaddPatterns(Root, Patterns)) + return true; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// genMadd - Generate madd instruction and combine mul and add. /// Example: /// MUL I=A,B,0 @@ -2713,8 +2748,10 @@ void AArch64InstrInfo::genAlternativeCodeSequence( unsigned Opc; switch (Pattern) { default: - // signal error. - break; + // Reassociate instructions. 
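// Illustrative sketch (hypothetical functions, not from this patch): what the
// isAssociativeAndCommutative hook above lets the machine combiner do. For the
// FP opcodes it lists, reassociation is only legal under UnsafeFPMath, which
// is exactly the value the switch returns.
float fadd_chain(float a, float b, float c, float d) {
  return ((a + b) + c) + d;        // three dependent fadds: critical path = 3
}
float fadd_reassoc(float a, float b, float c, float d) {
  return (a + b) + (c + d);        // (a+b) and (c+d) are independent: path = 2
}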
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; case MachineCombinerPattern::MULADDW_OP1: case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index ae02822a32e6..b5bb446f8c16 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -169,7 +169,9 @@ public: bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; - + /// Return true when Inst is associative and commutative so that it can be + /// reassociated. + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 8c3cb567fc7e..5d00e1cb3be5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -20,8 +20,10 @@ class AMDGPUInstrPrinter; class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; +class MachineSchedContext; class MCAsmInfo; class raw_ostream; +class ScheduleDAGInstrs; class Target; class TargetMachine; @@ -49,6 +51,8 @@ FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); +ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); + ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 9c3790264377..1239dfb235ef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -91,6 +91,25 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) {} +void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + // Need to construct an MCSubtargetInfo here in case we have no functions + // in the module. 
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo( + TM.getTargetTriple().str(), TM.getTargetCPU(), + TM.getTargetFeatureString())); + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + TS->EmitDirectiveHSACodeObjectVersion(1, 0); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); + TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, + "AMD", "AMDGPU"); +} + void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; @@ -148,11 +167,15 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); } + MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); const DataLayout &DL = getDataLayout(); + + // Emit the size + uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); + OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); OutStreamer->PushSection(); OutStreamer->SwitchSection( getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); - MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); OutStreamer->EmitLabel(GVSym); EmitGlobalConstant(DL, C); @@ -178,13 +201,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, KernelInfo); } - // Emit directives - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); - AMDGPU::IsaVersion ISA = STM.getIsaVersion(); - TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, - "AMD", "AMDGPU"); } else { EmitProgramInfoR600(MF); } @@ -417,16 +433,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } - if (VCCUsed || FlatUsed || STM.isXNACKEnabled()) { - MaxSGPR += 2; + unsigned ExtraSGPRs = 0; - if (FlatUsed) - MaxSGPR += 2; + if (VCCUsed) + ExtraSGPRs = 2; + if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (FlatUsed) + ExtraSGPRs = 4; + } else { if (STM.isXNACKEnabled()) - MaxSGPR += 2; + ExtraSGPRs = 4; + + if (FlatUsed) + ExtraSGPRs = 6; } + MaxSGPR += ExtraSGPRs; + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
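// Illustrative sketch (hypothetical helper, not from this patch): the SGPR
// reservation rule encoded by the hunk above, restated on its own. The
// register names in the trailing comments are the editor's reading of the
// 2/4/6 constants.
static unsigned computeExtraSGPRs(bool VCCUsed, bool FlatUsed,
                                  bool XNACKEnabled, bool IsVIOrLater) {
  unsigned ExtraSGPRs = 0;
  if (VCCUsed)
    ExtraSGPRs = 2;                // vcc pair
  if (!IsVIOrLater) {
    if (FlatUsed)
      ExtraSGPRs = 4;              // vcc + flat_scratch
  } else {
    if (XNACKEnabled)
      ExtraSGPRs = 4;              // vcc + xnack_mask
    if (FlatUsed)
      ExtraSGPRs = 6;              // vcc + xnack_mask + flat_scratch
  }
  return ExtraSGPRs;               // added to MaxSGPR before the final +1
}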
ProgInfo.NumVGPR = MaxVGPR + 1; @@ -563,7 +587,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->PSInputAddr, 4); + OutStreamer->EmitIntValue(MFI->PSInputEna, 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 817cbfc0c0eb..99d4091670fe 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -103,6 +103,8 @@ public: void EmitGlobalVariable(const GlobalVariable *GV) override; + void EmitStartOfAsmFile(Module &M) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 6ffa7a083583..b0db26124a0c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -20,28 +20,83 @@ def CC_SI : CallingConv<[ CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, - SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21 + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 ]>>>, CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>>, + // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
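// Spelled out: 32*4 + 4 = 132, i.e. presumably 32 vec4 inputs plus 4 extra
// registers, so the VGPR list that follows is extended through VGPR135
// (136 registers) to cover that minimum.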
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, - VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 ]>>>, CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow< - [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], - [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14, + SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30, + SGPR32, SGPR34, SGPR36, SGPR38 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15, + SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31, + SGPR33, SGPR35, SGPR37, SGPR39 ] >>> ]>; +def RetCC_SI : CallingConv<[ + CCIfType<[i32] , CCAssignToReg<[ + SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, + SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, + SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, + SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31, + SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39 + ]>>, + + // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
+ CCIfType<[f32] , CCAssignToReg<[ + VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, + VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, + VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, + VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31, + VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39, + VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47, + VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55, + VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63, + VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71, + VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79, + VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87, + VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95, + VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103, + VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111, + VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119, + VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127, + VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135 + ]>> +]>; + // Calling convention for R600 def CC_R600 : CallingConv<[ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 222f63161be5..1a59a460ee7d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -282,12 +282,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (!Subtarget->hasFFBH()) + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + else setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); if (!Subtarget->hasFFBL()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTLZ, MVT::i64, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -565,6 +572,12 @@ void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, State.AnalyzeFormalArguments(Ins, CC_AMDGPU); } +void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const { + + State.AnalyzeReturn(Outs, RetCC_SI); +} + SDValue AMDGPUTargetLowering::LowerReturn( SDValue Chain, CallingConv::ID CallConv, @@ -633,6 +646,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + return LowerCTLZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -2159,6 +2175,145 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + if (ZeroUndef && Src.getValueType() == MVT::i32) + return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + + 
SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); + + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::i32); + + SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + + SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); + SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + + const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); + SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); + + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + + if (!ZeroUndef) { + // Test if the full 64-bit input is zero. + + // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, + // which we probably don't want. + SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + + // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction + // with the same cycles, otherwise it is slower. + // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src, + // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ); + + const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32); + + // The instruction returns -1 for 0 input, but the defined intrinsic + // behavior is to return the number of bits. + NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewCtlz); + } + + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); +} + +SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + // Unsigned + // cul2f(ulong u) + //{ + // uint lz = clz(u); + // uint e = (u != 0) ? 127U + 63U - lz : 0; + // u = (u << lz) & 0x7fffffffffffffffUL; + // ulong t = u & 0xffffffffffUL; + // uint v = (e << 23) | (uint)(u >> 40); + // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U); + // return as_float(v + r); + //} + // Signed + // cl2f(long l) + //{ + // long s = l >> 63; + // float r = cul2f((l + s) ^ s); + // return s ? 
-r : r; + //} + + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + SDValue L = Src; + + SDValue S; + if (Signed) { + const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64); + S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit); + + SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S); + L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S); + } + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), MVT::f32); + + + SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32); + SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64); + SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L); + LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ); + + SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32); + SDValue E = DAG.getSelect(SL, MVT::i32, + DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE), + DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ), + ZeroI32); + + SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64, + DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ), + DAG.getConstant((-1ULL) >> 1, SL, MVT::i64)); + + SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U, + DAG.getConstant(0xffffffffffULL, SL, MVT::i64)); + + SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64, + U, DAG.getConstant(40, SL, MVT::i64)); + + SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32, + DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)), + DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl)); + + SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64); + SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT); + SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One); + + SDValue R = DAG.getSelect(SL, MVT::i32, + RCmp, + One, + DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32)); + R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R); + R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R); + + if (!Signed) + return R; + + SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R); + return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R); +} + SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); @@ -2184,35 +2339,29 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue S0 = Op.getOperand(0); - if (S0.getValueType() != MVT::i64) - return SDValue(); + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); EVT DestVT = Op.getValueType(); if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, false); - assert(DestVT == MVT::f32); - - SDLoc DL(Op); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, false); - // f32 uint_to_fp i64 - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(0, DL, MVT::i32)); - SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, - DAG.getConstant(1, DL, MVT::i32)); - SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); - // TODO: Should this propagate fast-math-flags? 
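// Illustrative sketch (not from this patch): a scalar C++ model of the cul2f()
// / cl2f() routines quoted in the LowerINT_TO_FP32 comment above, useful for
// checking the round-to-nearest-even logic outside the DAG. It assumes the
// GCC/Clang __builtin_clzll intrinsic; everything else follows the pseudocode.
#include <cstdint>
#include <cstring>

static float cul2f(uint64_t u) {
  if (u == 0)
    return 0.0f;                                      // the "e = 0" case
  unsigned lz = __builtin_clzll(u);
  uint32_t e = 127u + 63u - lz;                       // biased exponent
  uint64_t m = (u << lz) & 0x7fffffffffffffffULL;     // normalize, drop leading 1
  uint64_t t = m & 0xffffffffffULL;                   // bits below the 23-bit mantissa
  uint32_t v = (e << 23) | static_cast<uint32_t>(m >> 40);
  uint32_t r = t > 0x8000000000ULL
                   ? 1u
                   : (t == 0x8000000000ULL ? (v & 1u) : 0u); // ties to even
  v += r;
  float f;
  std::memcpy(&f, &v, sizeof f);                      // as_float(v + r)
  return f;
}

static float cl2f(int64_t l) {                        // signed wrapper
  uint64_t s = l < 0 ? ~0ULL : 0ULL;
  float r = cul2f((static_cast<uint64_t>(l) + s) ^ s);
  return l < 0 ? -r : r;
}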
- FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, - DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 - return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); + return SDValue(); } SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { - SDValue Src = Op.getOperand(0); - if (Src.getValueType() == MVT::i64 && Op.getValueType() == MVT::f64) + assert(Op.getOperand(0).getValueType() == MVT::i64 && + "operation should be legal"); + + EVT DestVT = Op.getValueType(); + if (DestVT == MVT::f32) + return LowerINT_TO_FP32(Op, DAG, true); + + if (DestVT == MVT::f64) return LowerINT_TO_FP64(Op, DAG, true); return SDValue(); @@ -2447,6 +2596,97 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, return DAG.getSExtOrTrunc(Mul, DL, VT); } +static bool isNegativeOne(SDValue Val) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) + return C->isAllOnesValue(); + return false; +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +// Get FFBH node if the incoming op may have been type legalized from a smaller +// type VT. +// Need to match pre-legalized type because the generic legalization inserts the +// add/sub between the select and compare. +static SDValue getFFBH_U32(const TargetLowering &TLI, + SelectionDAG &DAG, SDLoc SL, SDValue Op) { + EVT VT = Op.getValueType(); + EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + if (LegalVT != MVT::i32) + return SDValue(); + + if (VT != MVT::i32) + Op = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Op); + + SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Op); + if (VT != MVT::i32) + FFBH = DAG.getNode(ISD::TRUNCATE, SL, VT, FFBH); + + return FFBH; +} + +// The native instructions return -1 on 0 input. Optimize out a select that +// produces -1 on 0. +// +// TODO: If zero is not undef, we could also do this if the output is compared +// against the bitwidth. +// +// TODO: Should probably combine against FFBH_U32 instead of ctlz directly. 
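// Illustrative sketch (not from this patch): a scalar model of the select
// combine implemented in performCtlzCombine below. The native ffbh_u32 /
// s_flbit_i32_b32 already returns -1 (all ones) for a zero input, so the
// compare-against-zero select is redundant.
#include <cstdint>

// select (setcc x, 0, eq), -1, (ctlz_zero_undef x)   -- form before the combine
static uint32_t select_form(uint32_t x) {
  return x == 0 ? 0xffffffffu : static_cast<uint32_t>(__builtin_clz(x));
}
// After the combine the whole expression is a single ffbh_u32(x), whose
// defined result for x == 0 is exactly the -1 the select was producing.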
+SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, + SDValue Cond, + SDValue LHS, + SDValue RHS, + DAGCombinerInfo &DCI) const { + ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (!CmpRhs || !CmpRhs->isNullValue()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue CmpLHS = Cond.getOperand(0); + + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + if (CCOpcode == ISD::SETEQ && + isCtlzOpc(RHS.getOpcode()) && + RHS.getOperand(0) == CmpLHS && + isNegativeOne(LHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + if (CCOpcode == ISD::SETNE && + isCtlzOpc(LHS.getOpcode()) && + LHS.getOperand(0) == CmpLHS && + isNegativeOne(RHS)) { + return getFFBH_U32(*this, DAG, SL, CmpLHS); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + if (VT == MVT::f32 && Cond.hasOneUse()) + return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + + // There's no reason to not do this if the condition has other uses. + return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -2471,23 +2711,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT: { - SDValue Cond = N->getOperand(0); - if (Cond.getOpcode() == ISD::SETCC && Cond.hasOneUse()) { - EVT VT = N->getValueType(0); - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - SDValue CC = Cond.getOperand(2); - - SDValue True = N->getOperand(1); - SDValue False = N->getOperand(2); - - if (VT == MVT::f32) - return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - } - - break; - } + case ISD::SELECT: + return performSelectCombine(N, DCI); case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2699,6 +2924,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 7314cc050ba5..37925416a9c4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -54,6 +54,9 @@ private: SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; @@ -67,6 +70,9 @@ private: SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue 
performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, + DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -109,6 +115,8 @@ protected: SmallVectorImpl<ISD::InputArg> &OrigIns) const; void AnalyzeFormalArguments(CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const; + void AnalyzeReturn(CCState &State, + const SmallVectorImpl<ISD::OutputArg> &Outs) const; public: AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -263,6 +271,7 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + FFBH_U32, // ctlz with -1 if input is zero. MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 70e589c28429..575dfe413658 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,6 +191,8 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, @@ -240,4 +242,4 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai // Call/Return DAG Nodes //===----------------------------------------------------------------------===// def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 22f85b3e663c..b1be6197a6c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -66,8 +66,12 @@ static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { } static MachineSchedRegistry -SchedCustomRegistry("r600", "Run R600's custom scheduler", - createR600MachineScheduler); +R600SchedRegistry("r600", "Run R600's custom scheduler", + createR600MachineScheduler); + +static MachineSchedRegistry +SISchedRegistry("si", "Run SI's custom scheduler", + createSIMachineScheduler); static std::string computeDataLayout(const Triple &TT) { std::string Ret = "e-p:32:32"; diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 779a14e95d22..2245f1417e53 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -349,7 +349,7 @@ def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>; def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>; -def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", ctlz_zero_undef, VecALU>; +def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; def FFBL_INT 
: R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; let hasSideEffects = 1 in { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 68b1d1ae83cc..4bc80a028936 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -28,7 +28,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { //===--- Global Variable Emission Directives --------------------------===// HasAggressiveSymbolFolding = true; COMMDirectiveAlignmentIsInBytes = false; - HasDotTypeDotSizeDirective = false; HasNoDeadStrip = true; WeakRefDirective = ".weakref\t"; //===--- Dwarf Emission Directives -----------------------------------===// diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 7f79dd34f3ba..aa1e352ed748 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -137,7 +137,7 @@ namespace SIOutMods { #define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC - +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 96e37c566240..f59d9948f98e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -215,7 +215,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { unsigned SrcReg = MI.getOperand(I).getReg(); - unsigned SrcSubReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); assert(TRI->isSGPRClass(SrcRC) && diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 02a39307e74e..6230d1e28b74 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -334,12 +334,20 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; - // FIXME: Fold operands with subregs. if (OpToFold.isReg() && - (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) || - OpToFold.getSubReg())) + !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; + // Prevent folding operands backwards in the function. For example, + // the COPY opcode must not be replaced by 1 in this example: + // + // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // ... 
+ // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + MachineOperand &Dst = MI.getOperand(0); + if (Dst.isReg() && + !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + continue; // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e043cb47da7..544867513d9c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -259,7 +259,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); setTargetDAGCombine(ISD::UMAX); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); @@ -598,18 +597,20 @@ SDValue SITargetLowering::LowerFormalArguments( // First check if it's a PS input addr if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal()) { + !Arg.Flags.isByVal() && PSInputNum <= 15) { - assert((PSInputNum <= 15) && "Too many PS inputs!"); - - if (!Arg.Used) { + if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; } - Info->PSInputAddr |= 1 << PSInputNum++; + Info->markPSInputAllocated(PSInputNum); + if (Arg.Used) + Info->PSInputEna |= 1 << PSInputNum; + + ++PSInputNum; } // Second split vertices into their elements @@ -639,11 +640,25 @@ SDValue SITargetLowering::LowerFormalArguments( *DAG.getContext()); // At least one interpolation mode must be enabled or else the GPU will hang. + // + // Check PSInputAddr instead of PSInputEna. The idea is that if the user set + // PSInputAddr, the user wants to enable some bits after the compilation + // based on run-time states. Since we can't know what the final PSInputEna + // will look like, so we shouldn't do anything here and the user should take + // responsibility for the correct programming. + // + // Otherwise, the following restrictions apply: + // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. + // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be + // enabled too. if (Info->getShaderType() == ShaderType::PIXEL && - (Info->PSInputAddr & 0x7F) == 0) { - Info->PSInputAddr |= 1; + ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->PSInputEna |= 1; } if (Info->getShaderType() == ShaderType::COMPUTE) { @@ -872,6 +887,97 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } +SDValue SITargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getShaderType() == ShaderType::COMPUTE) + return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, + OutVals, DL, DAG); + + Info->setIfReturnsVoid(Outs.size() == 0); + + SmallVector<ISD::OutputArg, 48> Splits; + SmallVector<SDValue, 48> SplitVals; + + // Split vectors into their elements. 
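// Illustrative sketch (hypothetical helper, not from this patch): the
// SPI_PS_INPUT_ADDR restriction described in LowerFormalArguments above,
// restated as a standalone predicate. Masks are taken from the hunk:
// 0x7F = PERSP_* | LINEAR_*, 0xF = PERSP_*, bit 11 = POS_W_FLOAT.
static bool needsForcedInterpolant(unsigned PSInputAddr) {
  bool AnyInterp = (PSInputAddr & 0x7F) != 0;  // some PERSP_* or LINEAR_* bit
  bool AnyPersp  = (PSInputAddr & 0xF) != 0;   // some PERSP_* bit
  bool PosWFloat = (PSInputAddr & (1u << 11)) != 0;
  // When this holds, the code above enables input 0 and reserves VGPR0/VGPR1.
  return !AnyInterp || (PosWFloat && !AnyPersp);
}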
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + const ISD::OutputArg &Out = Outs[i]; + + if (Out.VT.isVector()) { + MVT VT = Out.VT.getVectorElementType(); + ISD::OutputArg NewOut = Out; + NewOut.Flags.setSplit(); + NewOut.VT = VT; + + // We want the original number of vector elements here, e.g. + // three or five, not four or eight. + unsigned NumElements = Out.ArgVT.getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], + DAG.getConstant(j, DL, MVT::i32)); + SplitVals.push_back(Elem); + Splits.push_back(NewOut); + NewOut.PartOffset += NewOut.VT.getStoreSize(); + } + } else { + SplitVals.push_back(OutVals[i]); + Splits.push_back(Out); + } + } + + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 48> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + + // Analyze outgoing return values. + AnalyzeReturn(CCInfo, Splits); + + SDValue Flag; + SmallVector<SDValue, 48> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + + // Copy the result values into the output registers. + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + + SDValue Arg = SplitVals[realRVLocIdx]; + + // Copied from other backends. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + // Update chain and glue. 
+ RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); +} + MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { @@ -1158,6 +1264,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + if (!Subtarget->isAmdHsaOS()) { + DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), + "hsa intrinsic without hsa target"); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); @@ -2027,7 +2140,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: { return performUCharToFloatCombine(N, DCI); - + } case ISD::FADD: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; @@ -2109,7 +2222,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } - } case ISD::LOAD: case ISD::STORE: case ISD::ATOMIC_LOAD: diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index e2f8cb19d6be..f01b2c0d09f3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -95,6 +95,13 @@ public: SDLoc DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + SDLoc DL, SelectionDAG &DAG) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, MachineBasicBlock * BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 821aada526c7..94e614750d2f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -84,6 +84,9 @@ private: bool LastInstWritesM0; + /// \brief Whether the machine function returns void + bool ReturnsVoid; + /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -322,7 +325,9 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, const Counters &Required) { // End of program? No need to wait on anything - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + // A function not returning void needs to wait, because other bytecode will + // be appended after it and we don't know what it will be. + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) return false; // Figure out if the async instructions execute in order @@ -465,6 +470,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; + ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); @@ -488,6 +494,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + + // Functions returning something shouldn't contain S_ENDPGM, because other + // bytecode will be appended after it. 
+ if (!ReturnsVoid) { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + I->eraseFromParent(); + } } return Changes; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a08a5a8fed36..1e10d25e8fb7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1777,6 +1777,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo()); + RC = TRI->getSubRegClass(RC, MO.getSubReg()); + // In order to be legal, the common sub-class must be equal to the // class of the current operand. For example: // @@ -3075,3 +3079,15 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { return Rsrc23; } + +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isSMRD(Opc); +} + +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 307ef67ed263..cce1ae725611 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -462,6 +462,9 @@ public: uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; + + bool isLowLatencyInstruction(const MachineInstr *MI) const; + bool isHighLatencyInstruction(const MachineInstr *MI) const; }; namespace AMDGPU { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index b7df058b7c0c..89692ab71f4d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -144,7 +144,7 @@ defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (ctlz_zero_undef i32:$src0))] + [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index bf15516bea7b..49677fc2b0a3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -46,8 +46,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), - LDSWaveSpillSize(0), PSInputAddr(0), + ReturnsVoid(true), + LDSWaveSpillSize(0), + PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), @@ -72,6 +74,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); const Function *F = MF.getFunction(); + PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); if (getShaderType() == ShaderType::COMPUTE) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h 
b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9c528d63bd0e..846ee5de057d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -57,10 +57,14 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // Graphics info. + unsigned PSInputAddr; + bool ReturnsVoid; + public: // FIXME: Make private unsigned LDSWaveSpillSize; - unsigned PSInputAddr; + unsigned PSInputEna; std::map<unsigned, unsigned> LaneVGPRs; unsigned ScratchOffsetReg; unsigned NumUserSGPRs; @@ -273,6 +277,26 @@ public: HasSpilledVGPRs = Spill; } + unsigned getPSInputAddr() const { + return PSInputAddr; + } + + bool isPSInputAllocated(unsigned Index) const { + return PSInputAddr & (1 << Index); + } + + void markPSInputAllocated(unsigned Index) { + PSInputAddr |= 1 << Index; + } + + bool returnsVoid() const { + return ReturnsVoid; + } + + void setIfReturnsVoid(bool Value) { + ReturnsVoid = Value; + } + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp new file mode 100644 index 000000000000..1cfa98430020 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -0,0 +1,1968 @@ +//===-- SIMachineScheduler.cpp - SI Scheduler Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#include "SIMachineScheduler.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +#define DEBUG_TYPE "misched" + +// This scheduler implements a different scheduling algorithm than +// GenericScheduler. +// +// There are several specific architecture behaviours that can't be modelled +// for GenericScheduler: +// . When accessing the result of an SGPR load instruction, you have to wait +// for all the SGPR load instructions before your current instruction to +// have finished. +// . When accessing the result of an VGPR load instruction, you have to wait +// for all the VGPR load instructions previous to the VGPR load instruction +// you are interested in to finish. +// . The less the register pressure, the best load latencies are hidden +// +// Moreover some specifities (like the fact a lot of instructions in the shader +// have few dependencies) makes the generic scheduler have some unpredictable +// behaviours. For example when register pressure becomes high, it can either +// manage to prevent register pressure from going too high, or it can +// increase register pressure even more than if it hadn't taken register +// pressure into account. +// +// Also some other bad behaviours are generated, like loading at the beginning +// of the shader a constant in VGPR you won't need until the end of the shader. +// +// The scheduling problem for SI can distinguish three main parts: +// . 
Hiding high latencies (texture sampling, etc) +// . Hiding low latencies (SGPR constant loading, etc) +// . Keeping register usage low for better latency hiding and general +// performance +// +// Some other things can also affect performance, but are hard to predict +// (cache usage, the fact the HW can issue several instructions from different +// wavefronts if different types, etc) +// +// This scheduler tries to solve the scheduling problem by dividing it into +// simpler sub-problems. It divides the instructions into blocks, schedules +// locally inside the blocks where it takes care of low latencies, and then +// chooses the order of the blocks by taking care of high latencies. +// Dividing the instructions into blocks helps control keeping register +// usage low. +// +// First the instructions are put into blocks. +// We want the blocks help control register usage and hide high latencies +// later. To help control register usage, we typically want all local +// computations, when for example you create a result that can be comsummed +// right away, to be contained in a block. Block inputs and outputs would +// typically be important results that are needed in several locations of +// the shader. Since we do want blocks to help hide high latencies, we want +// the instructions inside the block to have a minimal set of dependencies +// on high latencies. It will make it easy to pick blocks to hide specific +// high latencies. +// The block creation algorithm is divided into several steps, and several +// variants can be tried during the scheduling process. +// +// Second the order of the instructions inside the blocks is choosen. +// At that step we do take into account only register usage and hiding +// low latency instructions +// +// Third the block order is choosen, there we try to hide high latencies +// and keep register usage low. +// +// After the third step, a pass is done to improve the hiding of low +// latencies. +// +// Actually when talking about 'low latency' or 'high latency' it includes +// both the latency to get the cache (or global mem) data go to the register, +// and the bandwith limitations. +// Increasing the number of active wavefronts helps hide the former, but it +// doesn't solve the latter, thus why even if wavefront count is high, we have +// to try have as many instructions hiding high latencies as possible. +// The OpenCL doc says for example latency of 400 cycles for a global mem access, +// which is hidden by 10 instructions if the wavefront count is 10. + +// Some figures taken from AMD docs: +// Both texture and constant L1 caches are 4-way associative with 64 bytes +// lines. +// Constant cache is shared with 4 CUs. +// For texture sampling, the address generation unit receives 4 texture +// addresses per cycle, thus we could expect texture sampling latency to be +// equivalent to 4 instructions in the very best case (a VGPR is 64 work items, +// instructions in a wavefront group are executed every 4 cycles), +// or 16 instructions if the other wavefronts associated to the 3 other VALUs +// of the CU do texture sampling too. (Don't take these figures too seriously, +// as I'm not 100% sure of the computation) +// Data exports should get similar latency. +// For constant loading, the cache is shader with 4 CUs. +// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" +// I guess if the other CU don't read the cache, it can go up to 64B/cycle. 
+// It means a simple s_buffer_load should take one instruction to hide, as +// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same +// cache line. +// +// As of today the driver doesn't preload the constants in cache, thus the +// first loads get extra latency. The doc says global memory access can be +// 300-600 cycles. We do not specially take that into account when scheduling +// As we expect the driver to be able to preload the constants soon. + + +// common code // + +#ifndef NDEBUG + +static const char *getReasonStr(SIScheduleCandReason Reason) { + switch (Reason) { + case NoCand: return "NOCAND"; + case RegUsage: return "REGUSAGE"; + case Latency: return "LATENCY"; + case Successor: return "SUCCESSOR"; + case Depth: return "DEPTH"; + case NodeOrder: return "ORDER"; + } + llvm_unreachable("Unknown reason!"); +} + +#endif + +static bool tryLess(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal < CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal > CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +static bool tryGreater(int TryVal, int CandVal, + SISchedulerCandidate &TryCand, + SISchedulerCandidate &Cand, + SIScheduleCandReason Reason) { + if (TryVal > CandVal) { + TryCand.Reason = Reason; + return true; + } + if (TryVal < CandVal) { + if (Cand.Reason > Reason) + Cand.Reason = Reason; + return true; + } + Cand.setRepeat(Reason); + return false; +} + +// SIScheduleBlock // + +void SIScheduleBlock::addUnit(SUnit *SU) { + NodeNum2Index[SU->NodeNum] = SUnits.size(); + SUnits.push_back(SU); +} + +#ifndef NDEBUG + +void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { + + dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); + dbgs() << '\n'; +} +#endif + +void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, + SISchedCandidate &TryCand) { + // Initialize the candidate if needed. + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return; + } + + if (Cand.SGPRUsage > 60 && + tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Schedule low latency instructions as top as possible. + // Order of priority is: + // . Low latency instructions which do not depend on other low latency + // instructions we haven't waited for + // . Other instructions which do not depend on low latency instructions + // we haven't waited for + // . Low latencies + // . All other instructions + // Goal is to get: low latency instructions - independant instructions + // - (eventually some more low latency instructions) + // - instructions that depend on the first low latency instructions. + // If in the block there is a lot of constant loads, the SGPR usage + // could go quite high, thus above the arbitrary limit of 60 will encourage + // use the already loaded constants (in order to release some SGPRs) before + // loading more. 
+ if (tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (TryCand.IsLowLatency && + tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) + return; + + if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + return; + + // Fall through to original instruction order. + if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { + TryCand.Reason = NodeOrder; + } +} + +SUnit* SIScheduleBlock::pickNode() { + SISchedCandidate TopCand; + + for (SUnit* SU : TopReadySUs) { + SISchedCandidate TryCand; + std::vector<unsigned> pressure; + std::vector<unsigned> MaxPressure; + // Predict register usage after this instruction. + TryCand.SU = SU; + TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); + TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; + TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; + TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; + TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; + TryCand.HasLowLatencyNonWaitedParent = + HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; + tryCandidateTopDown(TopCand, TryCand); + if (TryCand.Reason != NoCand) + TopCand.setBest(TryCand); + } + + return TopCand.SU; +} + + +// Schedule something valid. +void SIScheduleBlock::fastSchedule() { + TopReadySUs.clear(); + if (Scheduled) + undoSchedule(); + + for (SUnit* SU : SUnits) { + if (!SU->NumPredsLeft) + TopReadySUs.push_back(SU); + } + + while (!TopReadySUs.empty()) { + SUnit *SU = TopReadySUs[0]; + ScheduledSUnits.push_back(SU); + nodeScheduled(SU); + } + + Scheduled = true; +} + +// Returns if the register was set between first and last. +static bool isDefBetween(unsigned Reg, + SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { + for (MachineRegisterInfo::def_instr_iterator + UI = MRI->def_instr_begin(Reg), + UE = MRI->def_instr_end(); UI != UE; ++UI) { + const MachineInstr* MI = &*UI; + if (MI->isDebugValue()) + continue; + SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + if (InstSlot >= First && InstSlot <= Last) + return true; + } + return false; +} + +void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock) { + IntervalPressure Pressure, BotPressure; + RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); + LiveIntervals *LIS = DAG->getLIS(); + MachineRegisterInfo *MRI = DAG->getMRI(); + DAG->initRPTracker(TopRPTracker); + DAG->initRPTracker(BotRPTracker); + DAG->initRPTracker(RPTracker); + + // Goes though all SU. RPTracker captures what had to be alive for the SUs + // to execute, and what is still alive at the end. + for (SUnit* SU : ScheduledSUnits) { + RPTracker.setPos(SU->getInstr()); + RPTracker.advance(); + } + + // Close the RPTracker to finalize live ins/outs. + RPTracker.closeRegion(); + + // Initialize the live ins and live outs. + TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); + BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); + + // Do not Track Physical Registers, because it messes up. 
+ for (unsigned Reg : RPTracker.getPressure().LiveInRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ LiveInRegs.insert(Reg);
+ }
+ LiveOutRegs.clear();
+ // There are several cases to distinguish:
+ // 1) Reg is not input to any instruction in the block, but is output of one
+ // 2) 1) + read in the block and not needed after it
+ // 3) 1) + read in the block but needed in another block
+ // 4) Reg is input of an instruction but another block will read it too
+ // 5) Reg is input of an instruction and then rewritten in the block.
+ // result is not read in the block (implies used in another block)
+ // 6) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block and not needed in another block
+ // 7) Reg is input of an instruction and then rewritten in the block.
+ // result is read in the block but also needed in another block
+ // LiveInRegs will contain all the regs in situations 4, 5, 6 and 7.
+ // We want LiveOutRegs to contain only regs whose content will be read after
+ // in another block, and whose content was written in the current block,
+ // that is, we want it to get 1, 3, 5 and 7.
+ // Since the MIs of a block are packed all together before scheduling, the
+ // LiveIntervals are correct and the RPTracker is able to correctly handle
+ // 5 vs 6 and 2 vs 3.
+ // (Note: this is not sufficient to prevent the RPTracker from making
+ // mistakes for case 4.)
+ // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect) 4, 5
+ // and 7.
+ // Comparing against LiveInRegs is not sufficient to differentiate 4 vs 5
+ // and 7. The use of isDefBetween removes case 4.
+ for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(),
+ LIS->getInstructionIndex(EndBlock).getRegSlot(),
+ MRI, LIS)) {
+ LiveOutRegs.insert(Reg);
+ }
+ }
+
+ // Pressure = sum over live registers of their register sizes.
+ // Internally LLVM may represent a register as one big 128-bit register, for
+ // example, even though it corresponds to 4 actual 32-bit registers.
+ // Thus Pressure is not equal to num_live_registers * constant.
+ LiveInPressure = TopPressure.MaxSetPressure;
+ LiveOutPressure = BotPressure.MaxSetPressure;
+
+ // Prepare TopRPTracker for top-down scheduling.
+ TopRPTracker.closeTop();
+}
+
+void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
+ MachineBasicBlock::iterator EndBlock) {
+ if (!Scheduled)
+ fastSchedule();
+
+ // Pre-scheduling phase to set LiveIn and LiveOut.
+ initRegPressure(BeginBlock, EndBlock);
+ undoSchedule();
+
+ // Schedule for real now.
+
+ TopReadySUs.clear();
+
+ for (SUnit* SU : SUnits) {
+ if (!SU->NumPredsLeft)
+ TopReadySUs.push_back(SU);
+ }
+
+ while (!TopReadySUs.empty()) {
+ SUnit *SU = pickNode();
+ ScheduledSUnits.push_back(SU);
+ TopRPTracker.setPos(SU->getInstr());
+ TopRPTracker.advance();
+ nodeScheduled(SU);
+ }
+
+ // TODO: compute InternalAdditionnalPressure.
+ InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
+
+ // Check everything is right. 
+#ifndef NDEBUG + assert(SUnits.size() == ScheduledSUnits.size() && + TopReadySUs.empty()); + for (SUnit* SU : SUnits) { + assert(SU->isScheduled && + SU->NumPredsLeft == 0); + } +#endif + + Scheduled = true; +} + +void SIScheduleBlock::undoSchedule() { + for (SUnit* SU : SUnits) { + SU->isScheduled = false; + for (SDep& Succ : SU->Succs) { + if (BC->isSUInBlock(Succ.getSUnit(), ID)) + undoReleaseSucc(SU, &Succ); + } + } + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + ScheduledSUnits.clear(); + Scheduled = false; +} + +void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + ++SuccSU->WeakPredsLeft; + return; + } + ++SuccSU->NumPredsLeft; +} + +void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { + SUnit *SuccSU = SuccEdge->getSUnit(); + + if (SuccEdge->isWeak()) { + --SuccSU->WeakPredsLeft; + return; + } +#ifndef NDEBUG + if (SuccSU->NumPredsLeft == 0) { + dbgs() << "*** Scheduling failed! ***\n"; + SuccSU->dump(DAG); + dbgs() << " has been released too many times!\n"; + llvm_unreachable(nullptr); + } +#endif + + --SuccSU->NumPredsLeft; +} + +/// Release Successors of the SU that are in the block or not. +void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { + for (SDep& Succ : SU->Succs) { + SUnit *SuccSU = Succ.getSUnit(); + + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) + continue; + + releaseSucc(SU, &Succ); + if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) + TopReadySUs.push_back(SuccSU); + } +} + +void SIScheduleBlock::nodeScheduled(SUnit *SU) { + // Is in TopReadySUs + assert (!SU->NumPredsLeft); + std::vector<SUnit*>::iterator I = + std::find(TopReadySUs.begin(), TopReadySUs.end(), SU); + if (I == TopReadySUs.end()) { + dbgs() << "Data Structure Bug in SI Scheduler\n"; + llvm_unreachable(nullptr); + } + TopReadySUs.erase(I); + + releaseSuccessors(SU, true); + // Scheduling this node will trigger a wait, + // thus propagate to other instructions that they do not need to wait either. + if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) + HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); + + if (DAG->IsLowLatencySU[SU->NodeNum]) { + for (SDep& Succ : SU->Succs) { + std::map<unsigned, unsigned>::iterator I = + NodeNum2Index.find(Succ.getSUnit()->NodeNum); + if (I != NodeNum2Index.end()) + HasLowLatencyNonWaitedParent[I->second] = 1; + } + } + SU->isScheduled = true; +} + +void SIScheduleBlock::finalizeUnits() { + // We remove links from outside blocks to enable scheduling inside the block. + for (SUnit* SU : SUnits) { + releaseSuccessors(SU, false); + if (DAG->IsHighLatencySU[SU->NodeNum]) + HighLatencyBlock = true; + } + HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); +} + +// we maintain ascending order of IDs +void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { + unsigned PredID = Pred->getID(); + + // Check if not already predecessor. + for (SIScheduleBlock* P : Preds) { + if (PredID == P->getID()) + return; + } + Preds.push_back(Pred); + +#ifndef NDEBUG + for (SIScheduleBlock* S : Succs) { + if (PredID == S->getID()) + assert(!"Loop in the Block Graph!\n"); + } +#endif +} + +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { + unsigned SuccID = Succ->getID(); + + // Check if not already predecessor. 
+ for (SIScheduleBlock* S : Succs) { + if (SuccID == S->getID()) + return; + } + if (Succ->isHighLatencyBlock()) + ++NumHighLatencySuccessors; + Succs.push_back(Succ); +#ifndef NDEBUG + for (SIScheduleBlock* P : Preds) { + if (SuccID == P->getID()) + assert("Loop in the Block Graph!\n"); + } +#endif +} + +#ifndef NDEBUG +void SIScheduleBlock::printDebug(bool full) { + dbgs() << "Block (" << ID << ")\n"; + if (!full) + return; + + dbgs() << "\nContains High Latency Instruction: " + << HighLatencyBlock << '\n'; + dbgs() << "\nDepends On:\n"; + for (SIScheduleBlock* P : Preds) { + P->printDebug(false); + } + + dbgs() << "\nSuccessors:\n"; + for (SIScheduleBlock* S : Succs) { + S->printDebug(false); + } + + if (Scheduled) { + dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' + << LiveInPressure[DAG->getVGPRSetID()] << '\n'; + dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' + << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; + dbgs() << "LiveIns:\n"; + for (unsigned Reg : LiveInRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + + dbgs() << "\nLiveOuts:\n"; + for (unsigned Reg : LiveOutRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + } + + dbgs() << "\nInstructions:\n"; + if (!Scheduled) { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } else { + for (SUnit* SU : SUnits) { + SU->dump(DAG); + } + } + + dbgs() << "///////////////////////\n"; +} + +#endif + +// SIScheduleBlockCreator // + +SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : +DAG(DAG) { +} + +SIScheduleBlockCreator::~SIScheduleBlockCreator() { +} + +SIScheduleBlocks +SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { + std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = + Blocks.find(BlockVariant); + if (B == Blocks.end()) { + SIScheduleBlocks Res; + createBlocksForVariant(BlockVariant); + topologicalSort(); + scheduleInsideBlocks(); + fillStats(); + Res.Blocks = CurrentBlocks; + Res.TopDownIndex2Block = TopDownIndex2Block; + Res.TopDownBlock2Index = TopDownBlock2Index; + Blocks[BlockVariant] = Res; + return Res; + } else { + return B->second; + } +} + +bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { + if (SU->NodeNum >= DAG->SUnits.size()) + return false; + return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; +} + +void SIScheduleBlockCreator::colorHighLatenciesAlone() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + CurrentColoring[SU->NodeNum] = NextReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorHighLatenciesGroups() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned NumHighLatencies = 0; + unsigned GroupSize; + unsigned Color = NextReservedID; + unsigned Count = 0; + std::set<unsigned> FormingGroup; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) + ++NumHighLatencies; + } + + if (NumHighLatencies == 0) + return; + + if (NumHighLatencies <= 6) + GroupSize = 2; + else if (NumHighLatencies <= 12) + GroupSize = 3; + else + GroupSize = 4; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + if (DAG->IsHighLatencySU[SU->NodeNum]) { + unsigned CompatibleGroup = true; + unsigned ProposedColor = Color; + for (unsigned j : FormingGroup) { + // TODO: Currently CompatibleGroup will always be false, + // 
because the graph enforces the load order. This + // can be fixed, but as keeping the load order is often + // good for performance that causes a performance hit (both + // the default scheduler and this scheduler). + // When this scheduler determines a good load order, + // this can be fixed. + if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || + !DAG->canAddEdge(&DAG->SUnits[j], SU)) + CompatibleGroup = false; + } + if (!CompatibleGroup || ++Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + if (!CompatibleGroup) { + ProposedColor = Color; + FormingGroup.insert(SU->NodeNum); + } + Count = 0; + } else { + FormingGroup.insert(SU->NodeNum); + } + CurrentColoring[SU->NodeNum] = ProposedColor; + } + } +} + +void SIScheduleBlockCreator::colorComputeReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::set<unsigned>, unsigned> ColorCombinations; + + CurrentTopDownReservedDependencyColoring.clear(); + CurrentBottomUpReservedDependencyColoring.clear(); + + CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); + CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); + + // Traverse TopDown, and give different colors to SUs depending + // on which combination of High Latencies they depend on. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) + SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); + } + // Color 0 by default. + if (SUColors.empty()) + continue; + // Same color than parents. + if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentTopDownReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } + + ColorCombinations.clear(); + + // Same as before, but BottomUp. + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + // Already given. + if (CurrentColoring[SU->NodeNum]) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + CurrentColoring[SU->NodeNum]; + continue; + } + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); + } + // Keep color 0. + if (SUColors.empty()) + continue; + // Same color than parents. 
+ if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + *SUColors.begin(); + else { + std::map<std::set<unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; + } else { + CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = + NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } + } +} + +void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; + + // Every combination of colors given by the top down + // and bottom up Reserved node dependency + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + std::pair<unsigned, unsigned> SUColors; + + // High latency instructions: already given. + if (CurrentColoring[SU->NodeNum]) + continue; + + SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + + std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = + ColorCombinations.find(SUColors); + if (Pos != ColorCombinations.end()) { + CurrentColoring[SU->NodeNum] = Pos->second; + } else { + CurrentColoring[SU->NodeNum] = NextNonReservedID; + ColorCombinations[SUColors] = NextNonReservedID++; + } + } +} + +void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { + unsigned DAGSize = DAG->SUnits.size(); + std::vector<int> PendingColoring = CurrentColoring; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + std::set<unsigned> SUColorsPending; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || + CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) + SUColors.insert(CurrentColoring[Succ->NodeNum]); + SUColorsPending.insert(PendingColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && SUColorsPending.size() == 1) + PendingColoring[SU->NodeNum] = *SUColors.begin(); + else // TODO: Attribute new colors depending on color + // combination of children. 
+ PendingColoring[SU->NodeNum] = NextNonReservedID++; + } + CurrentColoring = PendingColoring; +} + + +void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { + unsigned DAGSize = DAG->SUnits.size(); + unsigned PreviousColor; + std::set<unsigned> SeenColors; + + if (DAGSize <= 1) + return; + + PreviousColor = CurrentColoring[0]; + + for (unsigned i = 1, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned CurrentColor = CurrentColoring[i]; + unsigned PreviousColorSave = PreviousColor; + assert(i == SU->NodeNum); + + if (CurrentColor != PreviousColor) + SeenColors.insert(PreviousColor); + PreviousColor = CurrentColor; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (SeenColors.find(CurrentColor) == SeenColors.end()) + continue; + + if (PreviousColorSave != CurrentColor) + CurrentColoring[i] = NextNonReservedID++; + else + CurrentColoring[i] = CurrentColoring[i-1]; + } +} + +void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + // No predecessor: Vgpr constant loading. + // Low latency instructions usually have a predecessor (the address) + if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { + unsigned DAGSize = DAG->SUnits.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + } +} + +void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned, unsigned> ColorCount; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + unsigned color = CurrentColoring[SU->NodeNum]; + std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); + if (Pos != ColorCount.end()) { + ++ColorCount[color]; + } else { + ColorCount[color] = 1; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + 
unsigned color = CurrentColoring[SU->NodeNum]; + std::set<unsigned> SUColors; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + if (ColorCount[color] > 1) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + SUColors.insert(CurrentColoring[Succ->NodeNum]); + } + if (SUColors.size() == 1 && *SUColors.begin() != color) { + --ColorCount[color]; + CurrentColoring[SU->NodeNum] = *SUColors.begin(); + ++ColorCount[*SUColors.begin()]; + } + } +} + +void SIScheduleBlockCreator::cutHugeBlocks() { + // TODO +} + +void SIScheduleBlockCreator::regroupNoUserInstructions() { + unsigned DAGSize = DAG->SUnits.size(); + int GroupID = NextNonReservedID++; + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + bool hasSuccessor = false; + + if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) + continue; + + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + hasSuccessor = true; + } + if (!hasSuccessor) + CurrentColoring[SU->NodeNum] = GroupID; + } +} + +void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { + unsigned DAGSize = DAG->SUnits.size(); + std::map<unsigned,unsigned> RealID; + + CurrentBlocks.clear(); + CurrentColoring.clear(); + CurrentColoring.resize(DAGSize, 0); + Node2CurrentBlock.clear(); + + // Restore links previous scheduling variant has overridden. + DAG->restoreSULinksLeft(); + + NextReservedID = 1; + NextNonReservedID = DAGSize + 1; + + DEBUG(dbgs() << "Coloring the graph\n"); + + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) + colorHighLatenciesGroups(); + else + colorHighLatenciesAlone(); + colorComputeReservedDependencies(); + colorAccordingToReservedDependencies(); + colorEndsAccordingToDependencies(); + if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) + colorForceConsecutiveOrderInGroup(); + regroupNoUserInstructions(); + colorMergeConstantLoadsNextGroup(); + colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Put SUs of same color into same block + Node2CurrentBlock.resize(DAGSize, -1); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + unsigned Color = CurrentColoring[SU->NodeNum]; + if (RealID.find(Color) == RealID.end()) { + int ID = CurrentBlocks.size(); + BlockPtrs.push_back( + make_unique<SIScheduleBlock>(DAG, this, ID)); + CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); + RealID[Color] = ID; + } + CurrentBlocks[RealID[Color]]->addUnit(SU); + Node2CurrentBlock[SU->NodeNum] = RealID[Color]; + } + + // Build dependencies between blocks. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &DAG->SUnits[i]; + int SUID = Node2CurrentBlock[i]; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Succ->NodeNum] != SUID) + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + } + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) + continue; + if (Node2CurrentBlock[Pred->NodeNum] != SUID) + CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); + } + } + + // Free root and leafs of all blocks to enable scheduling inside them. 
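+ // (finalizeUnits drops, for each SU, the successor links that point outside
+ // its block, so each block can then be scheduled as an independent sub-DAG;
+ // it also flags blocks that contain a high latency instruction.)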
+ for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->finalizeUnits(); + } + DEBUG( + dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +// Two functions taken from Codegen/MachineScheduler.cpp + +/// If this iterator is a debug value, increment until reaching the End or a +/// non-debug instruction. +static MachineBasicBlock::const_iterator +nextIfDebug(MachineBasicBlock::const_iterator I, + MachineBasicBlock::const_iterator End) { + for(; I != End; ++I) { + if (!I->isDebugValue()) + break; + } + return I; +} + +/// Non-const version. +static MachineBasicBlock::iterator +nextIfDebug(MachineBasicBlock::iterator I, + MachineBasicBlock::const_iterator End) { + // Cast the return value to nonconst MachineInstr, then cast to an + // instr_iterator, which does not check for null, finally return a + // bundle_iterator. + return MachineBasicBlock::instr_iterator( + const_cast<MachineInstr*>( + &*nextIfDebug(MachineBasicBlock::const_iterator(I), End))); +} + +void SIScheduleBlockCreator::topologicalSort() { + unsigned DAGSize = CurrentBlocks.size(); + std::vector<int> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + + WorkList.reserve(DAGSize); + TopDownIndex2Block.resize(DAGSize); + TopDownBlock2Index.resize(DAGSize); + BottomUpIndex2Block.resize(DAGSize); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + unsigned Degree = Block->getSuccs().size(); + TopDownBlock2Index[i] = Degree; + if (Degree == 0) { + WorkList.push_back(i); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + int i = WorkList.back(); + SIScheduleBlock *Block = CurrentBlocks[i]; + WorkList.pop_back(); + TopDownBlock2Index[i] = --Id; + TopDownIndex2Block[Id] = i; + for (SIScheduleBlock* Pred : Block->getPreds()) { + if (!--TopDownBlock2Index[Pred->getID()]) + WorkList.push_back(Pred->getID()); + } + } + +#ifndef NDEBUG + // Check correctness of the ordering. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock* Pred : Block->getPreds()) { + assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && + "Wrong Top Down topological sorting"); + } + } +#endif + + BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), + TopDownIndex2Block.rend()); +} + +void SIScheduleBlockCreator::scheduleInsideBlocks() { + unsigned DAGSize = CurrentBlocks.size(); + + DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + + // We do schedule a valid scheduling such that a Block corresponds + // to a range of instructions. + DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->fastSchedule(); + } + + // Note: the following code, and the part restoring previous position + // is by far the most expensive operation of the Scheduler. + + // Do not update CurrentTop. 
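+ // (A separate iterator, CurrentTopFastSched, walks the fast schedule below
+ // so that DAG->getCurrentTop() still points at the start of the region for
+ // the final scheduling pass done later in SIScheduleDAGMI::schedule().)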
+ MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); + std::vector<MachineBasicBlock::iterator> PosOld; + std::vector<MachineBasicBlock::iterator> PosNew; + PosOld.reserve(DAG->SUnits.size()); + PosNew.reserve(DAG->SUnits.size()); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) { + MachineInstr *MI = SU->getInstr(); + MachineBasicBlock::iterator Pos = MI; + PosOld.push_back(Pos); + if (&*CurrentTopFastSched == MI) { + PosNew.push_back(Pos); + CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, + DAG->getCurrentBottom()); + } else { + // Update the instruction stream. + DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); + + // Update LiveIntervals. + // Note: Moving all instructions and calling handleMove everytime + // is the most cpu intensive operation of the scheduler. + // It would gain a lot if there was a way to recompute the + // LiveIntervals for the entire scheduling region. + DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true); + PosNew.push_back(CurrentTopFastSched); + } + } + } + + // Now we have Block of SUs == Block of MI. + // We do the final schedule for the instructions inside the block. + // The property that all the SUs of the Block are grouped together as MI + // is used for correct reg usage tracking. + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); + } + + DEBUG(dbgs() << "Restoring MI Pos\n"); + // Restore old ordering (which prevents a LIS->handleMove bug). + for (unsigned i = PosOld.size(), e = 0; i != e; --i) { + MachineBasicBlock::iterator POld = PosOld[i-1]; + MachineBasicBlock::iterator PNew = PosNew[i-1]; + if (PNew != POld) { + // Update the instruction stream. + DAG->getBB()->splice(POld, DAG->getBB(), PNew); + + // Update LiveIntervals. 
+ DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + } + } + + DEBUG( + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + } + ); +} + +void SIScheduleBlockCreator::fillStats() { + unsigned DAGSize = CurrentBlocks.size(); + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = TopDownIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getPreds().size() == 0) + Block->Depth = 0; + else { + unsigned Depth = 0; + for (SIScheduleBlock *Pred : Block->getPreds()) { + if (Depth < Pred->Depth + 1) + Depth = Pred->Depth + 1; + } + Block->Depth = Depth; + } + } + + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + int BlockIndice = BottomUpIndex2Block[i]; + SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; + if (Block->getSuccs().size() == 0) + Block->Height = 0; + else { + unsigned Height = 0; + for (SIScheduleBlock *Succ : Block->getSuccs()) { + if (Height < Succ->Height + 1) + Height = Succ->Height + 1; + } + Block->Height = Height; + } + } +} + +// SIScheduleBlockScheduler // + +SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct) : + DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), + LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), + SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { + + // Fill the usage of every output + // Warning: while by construction we always have a link between two blocks + // when one needs a result from the other, the number of users of an output + // is not the sum of child blocks having as input the same virtual register. + // Here is an example. A produces x and y. B eats x and produces x'. + // C eats x' and y. The register coalescer may have attributed the same + // virtual register to x and x'. + // To count accurately, we do a topological sort. In case the register is + // found for several parents, we increment the usage of the one with the + // highest topological index. 
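+ // Illustrative (hypothetical) example: if %x is output by two predecessors
+ // of a consumer block, B1 (topological index 2) and B2 (topological
+ // index 5), only B2, the latest producer in topological order, gets its
+ // usage count for %x incremented.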
+ LiveOutRegsNumUsages.resize(Blocks.size()); + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + int topoInd = -1; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { + topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; + } + } + } + + if (!Found) + continue; + + int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; + std::map<unsigned, unsigned>::iterator RegPos = + LiveOutRegsNumUsages[PredID].find(Reg); + if (RegPos != LiveOutRegsNumUsages[PredID].end()) { + ++LiveOutRegsNumUsages[PredID][Reg]; + } else { + LiveOutRegsNumUsages[PredID][Reg] = 1; + } + } + } + + LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); + BlockNumPredsLeft.resize(Blocks.size()); + BlockNumSuccsLeft.resize(Blocks.size()); + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + BlockNumPredsLeft[i] = Block->getPreds().size(); + BlockNumSuccsLeft[i] = Block->getSuccs().size(); + } + +#ifndef NDEBUG + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + assert(Block->getID() == i); + } +#endif + + std::set<unsigned> InRegs = DAG->getInRegs(); + addLiveRegs(InRegs); + + // Fill LiveRegsConsumers for regs that were already + // defined before scheduling. + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + for (unsigned Reg : Block->getInRegs()) { + bool Found = false; + for (SIScheduleBlock* Pred: Block->getPreds()) { + std::set<unsigned> PredOutRegs = Pred->getOutRegs(); + std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); + + if (RegPos != PredOutRegs.end()) { + Found = true; + break; + } + } + + if (!Found) { + if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) + LiveRegsConsumers[Reg] = 1; + else + ++LiveRegsConsumers[Reg]; + } + } + } + + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + SIScheduleBlock *Block = Blocks[i]; + if (BlockNumPredsLeft[i] == 0) { + ReadyBlocks.push_back(Block); + } + } + + while (SIScheduleBlock *Block = pickBlock()) { + BlocksScheduled.push_back(Block); + blockScheduled(Block); + } + + DEBUG( + dbgs() << "Block Order:"; + for (SIScheduleBlock* Block : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } + ); +} + +bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + // Try to hide high latencies. + if (tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + return true; + // Schedule high latencies early so you can hide them better. 
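+ // (Summary of the tryGreater chain below: a block holding a high latency
+ // instruction is preferred; among high latency blocks the one with the
+ // greater Height wins; then the block with more high latency successors.)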
+ if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) + return true; + if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, + TryCand, Cand, Depth)) + return true; + if (tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) + return true; + return false; +} + +bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand) { + if (!Cand.isValid()) { + TryCand.Reason = NodeOrder; + return true; + } + + if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) + return true; + if (tryGreater(TryCand.NumSuccessors > 0, + Cand.NumSuccessors > 0, + TryCand, Cand, Successor)) + return true; + if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) + return true; + if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) + return true; + return false; +} + +SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { + SIBlockSchedCandidate Cand; + std::vector<SIScheduleBlock*>::iterator Best; + SIScheduleBlock *Block; + if (ReadyBlocks.empty()) + return nullptr; + + DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), + VregCurrentUsage, SregCurrentUsage); + if (VregCurrentUsage > maxVregUsage) + maxVregUsage = VregCurrentUsage; + if (VregCurrentUsage > maxSregUsage) + maxSregUsage = VregCurrentUsage; + DEBUG( + dbgs() << "Picking New Blocks\n"; + dbgs() << "Available: "; + for (SIScheduleBlock* Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (unsigned Reg : LiveRegs) + dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; + ); + + Cand.Block = nullptr; + for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), + E = ReadyBlocks.end(); I != E; ++I) { + SIBlockSchedCandidate TryCand; + TryCand.Block = *I; + TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); + TryCand.VGPRUsageDiff = + checkRegUsageImpact(TryCand.Block->getInRegs(), + TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; + TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); + TryCand.NumHighLatencySuccessors = + TryCand.Block->getNumHighLatencySuccessors(); + TryCand.LastPosHighLatParentScheduled = + (unsigned int) std::max<int> (0, + LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - + LastPosWaitedHighLatency); + TryCand.Height = TryCand.Block->Height; + // Try not to increase VGPR usage too much, else we may spill. + if (VregCurrentUsage > 120 || + Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { + if (!tryCandidateRegUsage(Cand, TryCand) && + Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) + tryCandidateLatency(Cand, TryCand); + } else { + if (!tryCandidateLatency(Cand, TryCand)) + tryCandidateRegUsage(Cand, TryCand); + } + if (TryCand.Reason != NoCand) { + Cand.setBest(TryCand); + Best = I; + DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' + << getReasonStr(Cand.Reason) << '\n'); + } + } + + DEBUG( + dbgs() << "Picking: " << Cand.Block->getID() << '\n'; + dbgs() << "Is a block with high latency instruction: " + << (Cand.IsHighLatency ? 
"yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n'; + ); + + Block = Cand.Block; + ReadyBlocks.erase(Best); + return Block; +} + +// Tracking of currently alive registers to determine VGPR Usage. + +void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + // If not already in the live set, then add it. + (void) LiveRegs.insert(Reg); + } +} + +void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, + std::set<unsigned> &Regs) { + for (unsigned Reg : Regs) { + // For now only track virtual registers. + std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); + assert (Pos != LiveRegs.end() && // Reg must be live. + LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && + LiveRegsConsumers[Reg] >= 1); + --LiveRegsConsumers[Reg]; + if (LiveRegsConsumers[Reg] == 0) + LiveRegs.erase(Pos); + } +} + +void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { + for (SIScheduleBlock* Block : Parent->getSuccs()) { + --BlockNumPredsLeft[Block->getID()]; + if (BlockNumPredsLeft[Block->getID()] == 0) { + ReadyBlocks.push_back(Block); + } + // TODO: Improve check. When the dependency between the high latency + // instructions and the instructions of the other blocks are WAR or WAW + // there will be no wait triggered. We would like these cases to not + // update LastPosHighLatencyParentScheduled. + if (Parent->isHighLatencyBlock()) + LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + } +} + +void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { + decreaseLiveRegs(Block, Block->getInRegs()); + addLiveRegs(Block->getOutRegs()); + releaseBlockSuccs(Block); + for (std::map<unsigned, unsigned>::iterator RegI = + LiveOutRegsNumUsages[Block->getID()].begin(), + E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { + std::pair<unsigned, unsigned> RegP = *RegI; + if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) + LiveRegsConsumers[RegP.first] = RegP.second; + else { + assert(LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; + } + } + if (LastPosHighLatencyParentScheduled[Block->getID()] > + (unsigned)LastPosWaitedHighLatency) + LastPosWaitedHighLatency = + LastPosHighLatencyParentScheduled[Block->getID()]; + ++NumBlockScheduled; +} + +std::vector<int> +SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs) { + std::vector<int> DiffSetPressure; + DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); + + for (unsigned Reg : InRegs) { + // For now only track virtual registers. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + if (LiveRegsConsumers[Reg] > 1) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] -= PSetI.getWeight(); + } + } + + for (unsigned Reg : OutRegs) { + // For now only track virtual registers. 
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + DiffSetPressure[*PSetI] += PSetI.getWeight(); + } + } + + return DiffSetPressure; +} + +// SIScheduler // + +struct SIScheduleBlockResult +SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant) { + SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); + SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); + std::vector<SIScheduleBlock*> ScheduledBlocks; + struct SIScheduleBlockResult Res; + + ScheduledBlocks = Scheduler.getBlocks(); + + for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { + SIScheduleBlock *Block = ScheduledBlocks[b]; + std::vector<SUnit*> SUs = Block->getScheduledUnits(); + + for (SUnit* SU : SUs) + Res.SUs.push_back(SU->NodeNum); + } + + Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); + Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); + return Res; +} + +// SIScheduleDAGMI // + +SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : + ScheduleDAGMILive(C, make_unique<GenericScheduler>(C)) { + SITII = static_cast<const SIInstrInfo*>(TII); + SITRI = static_cast<const SIRegisterInfo*>(TRI); + + VGPRSetID = SITRI->getVGPR32PressureSet(); + SGPRSetID = SITRI->getSGPR32PressureSet(); +} + +SIScheduleDAGMI::~SIScheduleDAGMI() { +} + +ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { + return new SIScheduleDAGMI(C); +} + +// Code adapted from scheduleDAG.cpp +// Does a topological sort over the SUs. +// Both TopDown and BottomUp +void SIScheduleDAGMI::topologicalSort() { + std::vector<int> TopDownSU2Index; + unsigned DAGSize = SUnits.size(); + std::vector<SUnit*> WorkList; + + DEBUG(dbgs() << "Topological Sort\n"); + WorkList.reserve(DAGSize); + + TopDownIndex2SU.resize(DAGSize); + TopDownSU2Index.resize(DAGSize); + BottomUpIndex2SU.resize(DAGSize); + + WorkList.push_back(&getExitSU()); + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + int NodeNum = SU->NodeNum; + unsigned Degree = SU->Succs.size(); + TopDownSU2Index[NodeNum] = Degree; + if (Degree == 0) { + assert(SU->Succs.empty() && "SUnit should have no successors"); + WorkList.push_back(SU); + } + } + + int Id = DAGSize; + while (!WorkList.empty()) { + SUnit *SU = WorkList.back(); + WorkList.pop_back(); + if (SU->NodeNum < DAGSize) { + TopDownSU2Index[SU->NodeNum] = --Id; + TopDownIndex2SU[Id] = SU->NodeNum; + } + for (SDep& Pred : SU->Preds) { + SUnit *SU = Pred.getSUnit(); + if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) + WorkList.push_back(SU); + } + } + + BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), + TopDownIndex2SU.rend()); + +#ifndef NDEBUG + // Check correctness of the ordering + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Pred : SU->Preds) { + if (Pred.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] > + TopDownSU2Index[Pred.getSUnit()->NodeNum] && + "Wrong Top Down topological sorting"); + } + } + for (unsigned i = 0, e = DAGSize; i != e; ++i) { + SUnit *SU = &SUnits[i]; + for (SDep& Succ : SU->Succs) { + if (Succ.getSUnit()->NodeNum >= DAGSize) + continue; + assert(TopDownSU2Index[SU->NodeNum] < + TopDownSU2Index[Succ.getSUnit()->NodeNum] && + "Wrong Bottom Up topological sorting"); + } + } +#endif +} + +// Move low latencies further from their user without +// increasing SGPR usage (in general) +// 
This is to be replaced by a better pass that would +// take into account SGPR usage (based on VGPR Usage +// and the corresponding wavefront count), that would +// try to merge groups of loads if it make sense, etc +void SIScheduleDAGMI::moveLowLatencies() { + unsigned DAGSize = SUnits.size(); + int LastLowLatencyUser = -1; + int LastLowLatencyPos = -1; + + for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[ScheduledSUnits[i]]; + bool IsLowLatencyUser = false; + unsigned MinPos = 0; + + for (SDep& PredDep : SU->Preds) { + SUnit *Pred = PredDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + IsLowLatencyUser = true; + } + if (Pred->NodeNum >= DAGSize) + continue; + unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; + if (PredPos >= MinPos) + MinPos = PredPos + 1; + } + + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BestPos = LastLowLatencyUser + 1; + if ((int)BestPos <= LastLowLatencyPos) + BestPos = LastLowLatencyPos + 1; + if (BestPos < MinPos) + BestPos = MinPos; + if (BestPos < i) { + for (unsigned u = i; u > BestPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[BestPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = BestPos; + } + LastLowLatencyPos = BestPos; + if (IsLowLatencyUser) + LastLowLatencyUser = BestPos; + } else if (IsLowLatencyUser) { + LastLowLatencyUser = i; + // Moves COPY instructions on which depends + // the low latency instructions too. + } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { + bool CopyForLowLat = false; + for (SDep& SuccDep : SU->Succs) { + SUnit *Succ = SuccDep.getSUnit(); + if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + CopyForLowLat = true; + } + } + if (!CopyForLowLat) + continue; + if (MinPos < i) { + for (unsigned u = i; u > MinPos; --u) { + ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; + ScheduledSUnits[u] = ScheduledSUnits[u-1]; + } + ScheduledSUnits[MinPos] = SU->NodeNum; + ScheduledSUnitsInv[SU->NodeNum] = MinPos; + } + } + } +} + +void SIScheduleDAGMI::restoreSULinksLeft() { + for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + SUnits[i].isScheduled = false; + SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; + SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; + SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; + SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; + } +} + +// Return the Vgpr and Sgpr usage corresponding to some virtual registers. 
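+// The usage is accumulated using pressure-set weights, so a live 128-bit
+// virtual register typically counts as 4 rather than 1 (see the Pressure
+// comments above).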
+template<typename _Iterator> void +SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, + unsigned &VgprUsage, unsigned &SgprUsage) { + VgprUsage = 0; + SgprUsage = 0; + for (_Iterator RegI = First; RegI != End; ++RegI) { + unsigned Reg = *RegI; + // For now only track virtual registers + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + PSetIterator PSetI = MRI.getPressureSets(Reg); + for (; PSetI.isValid(); ++PSetI) { + if (*PSetI == VGPRSetID) + VgprUsage += PSetI.getWeight(); + else if (*PSetI == SGPRSetID) + SgprUsage += PSetI.getWeight(); + } + } +} + +void SIScheduleDAGMI::schedule() +{ + SmallVector<SUnit*, 8> TopRoots, BotRoots; + SIScheduleBlockResult Best, Temp; + DEBUG(dbgs() << "Preparing Scheduling\n"); + + buildDAGWithRegPressure(); + DEBUG( + for(SUnit& SU : SUnits) + SU.dumpAll(this) + ); + + Topo.InitDAGTopologicalSorting(); + topologicalSort(); + findRootsAndBiasEdges(TopRoots, BotRoots); + // We reuse several ScheduleDAGMI and ScheduleDAGMILive + // functions, but to make them happy we must initialize + // the default Scheduler implementation (even if we do not + // run it) + SchedImpl->initialize(this); + initQueues(TopRoots, BotRoots); + + // Fill some stats to help scheduling. + + SUnitsLinksBackup = SUnits; + IsLowLatencySU.clear(); + LowLatencyOffset.clear(); + IsHighLatencySU.clear(); + + IsLowLatencySU.resize(SUnits.size(), 0); + LowLatencyOffset.resize(SUnits.size(), 0); + IsHighLatencySU.resize(SUnits.size(), 0); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + SUnit *SU = &SUnits[i]; + unsigned BaseLatReg, OffLatReg; + if (SITII->isLowLatencyInstruction(SU->getInstr())) { + IsLowLatencySU[i] = 1; + if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, + OffLatReg, TRI)) + LowLatencyOffset[i] = OffLatReg; + } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + IsHighLatencySU[i] = 1; + } + + SIScheduler Scheduler(this); + Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, + SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); +#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants + // which could lead to lower VGPR usage + if (Best.MaxVGPRUsage > 180) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + { LatenciesAlone, BlockRegUsageLatency }, +// { LatenciesAlone, BlockRegUsage }, + { LatenciesGrouped, BlockLatencyRegUsage }, +// { LatenciesGrouped, BlockRegUsageLatency }, +// { LatenciesGrouped, BlockRegUsage }, + { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, +// { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } + // if VGPR usage is still extremely high, we may spill. Try other variants + // which are less performing, but that could lead to lower VGPR usage. 
+ if (Best.MaxVGPRUsage > 200) { + std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { +// { LatenciesAlone, BlockRegUsageLatency }, + { LatenciesAlone, BlockRegUsage }, +// { LatenciesGrouped, BlockLatencyRegUsage }, + { LatenciesGrouped, BlockRegUsageLatency }, + { LatenciesGrouped, BlockRegUsage }, +// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, + { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, + { LatenciesAlonePlusConsecutive, BlockRegUsage } + }; + for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { + Temp = Scheduler.scheduleVariant(v.first, v.second); + if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) + Best = Temp; + } + } +#endif + ScheduledSUnits = Best.SUs; + ScheduledSUnitsInv.resize(SUnits.size()); + + for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { + ScheduledSUnitsInv[ScheduledSUnits[i]] = i; + } + + moveLowLatencies(); + + // Tell the outside world about the result of the scheduling. + + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + + for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), + E = ScheduledSUnits.end(); I != E; ++I) { + SUnit *SU = &SUnits[*I]; + + scheduleMI(SU, true); + + DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); + } + + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + DEBUG({ + unsigned BBNum = begin()->getParent()->getNumber(); + dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h new file mode 100644 index 000000000000..b270136811c6 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -0,0 +1,489 @@ +//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Machine Scheduler interface +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H + +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/RegisterPressure.h" + +using namespace llvm; + +namespace llvm { + +enum SIScheduleCandReason { + NoCand, + RegUsage, + Latency, + Successor, + Depth, + NodeOrder +}; + +struct SISchedulerCandidate { + // The reason for this candidate. + SIScheduleCandReason Reason; + + // Set of reasons that apply to multiple candidates. + uint32_t RepeatReasonSet; + + SISchedulerCandidate() + : Reason(NoCand), RepeatReasonSet(0) {} + + bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); } + void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); } +}; + +class SIScheduleDAGMI; +class SIScheduleBlockCreator; + +class SIScheduleBlock { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator *BC; + + std::vector<SUnit*> SUnits; + std::map<unsigned, unsigned> NodeNum2Index; + std::vector<SUnit*> TopReadySUs; + std::vector<SUnit*> ScheduledSUnits; + + /// The top of the unscheduled zone. 
+ IntervalPressure TopPressure; + RegPressureTracker TopRPTracker; + + // Pressure: number of said class of registers needed to + // store the live virtual and real registers. + // We do care only of SGPR32 and VGPR32 and do track only virtual registers. + // Pressure of additional registers required inside the block. + std::vector<unsigned> InternalAdditionnalPressure; + // Pressure of input and output registers + std::vector<unsigned> LiveInPressure; + std::vector<unsigned> LiveOutPressure; + // Registers required by the block, and outputs. + // We do track only virtual registers. + // Note that some registers are not 32 bits, + // and thus the pressure is not equal + // to the number of live registers. + std::set<unsigned> LiveInRegs; + std::set<unsigned> LiveOutRegs; + + bool Scheduled; + bool HighLatencyBlock; + + std::vector<unsigned> HasLowLatencyNonWaitedParent; + + // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table. + unsigned ID; + + std::vector<SIScheduleBlock*> Preds; // All blocks predecessors. + std::vector<SIScheduleBlock*> Succs; // All blocks successors. + unsigned NumHighLatencySuccessors; + +public: + SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC, + unsigned ID): + DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(), + TopRPTracker(TopPressure), Scheduled(false), + HighLatencyBlock(false), ID(ID), + Preds(), Succs(), NumHighLatencySuccessors(0) {}; + + ~SIScheduleBlock() {}; + + unsigned getID() const { return ID; } + + /// Functions for Block construction. + void addUnit(SUnit *SU); + + // When all SUs have been added. + void finalizeUnits(); + + // Add block pred, which has instruction predecessor of SU. + void addPred(SIScheduleBlock *Pred); + void addSucc(SIScheduleBlock *Succ); + + const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; } + const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; } + + unsigned Height; // Maximum topdown path length to block without outputs + unsigned Depth; // Maximum bottomup path length to block without inputs + + unsigned getNumHighLatencySuccessors() const { + return NumHighLatencySuccessors; + } + + bool isHighLatencyBlock() { return HighLatencyBlock; } + + // This is approximative. + // Ideally should take into accounts some instructions (rcp, etc) + // are 4 times slower. + int getCost() { return SUnits.size(); } + + // The block Predecessors and Successors must be all registered + // before fastSchedule(). + // Fast schedule with no particular requirement. + void fastSchedule(); + + std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; } + + // Complete schedule that will try to minimize reg pressure and + // low latencies, and will fill liveins and liveouts. + // Needs all MIs to be grouped between BeginBlock and EndBlock. + // The MIs can be moved after the scheduling, + // it is just used to allow correct track of live registers. + void schedule(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); + + bool isScheduled() { return Scheduled; } + + + // Needs the block to be scheduled inside + // TODO: find a way to compute it. + std::vector<unsigned> &getInternalAdditionnalRegUsage() { + return InternalAdditionnalPressure; + } + + std::set<unsigned> &getInRegs() { return LiveInRegs; } + std::set<unsigned> &getOutRegs() { return LiveOutRegs; } + + void printDebug(bool Full); + +private: + struct SISchedCandidate : SISchedulerCandidate { + // The best SUnit candidate. 
+ SUnit *SU; + + unsigned SGPRUsage; + unsigned VGPRUsage; + bool IsLowLatency; + unsigned LowLatencyOffset; + bool HasLowLatencyNonWaitedParent; + + SISchedCandidate() + : SU(nullptr) {} + + bool isValid() const { return SU; } + + // Copy the status of another candidate without changing policy. + void setBest(SISchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + SU = Best.SU; + Reason = Best.Reason; + SGPRUsage = Best.SGPRUsage; + VGPRUsage = Best.VGPRUsage; + IsLowLatency = Best.IsLowLatency; + LowLatencyOffset = Best.LowLatencyOffset; + HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent; + } + }; + + void undoSchedule(); + + void undoReleaseSucc(SUnit *SU, SDep *SuccEdge); + void releaseSucc(SUnit *SU, SDep *SuccEdge); + // InOrOutBlock: restrict to links pointing inside the block (true), + // or restrict to links pointing outside the block (false). + void releaseSuccessors(SUnit *SU, bool InOrOutBlock); + + void nodeScheduled(SUnit *SU); + void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand); + void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand); + SUnit* pickNode(); + void traceCandidate(const SISchedCandidate &Cand); + void initRegPressure(MachineBasicBlock::iterator BeginBlock, + MachineBasicBlock::iterator EndBlock); +}; + +struct SIScheduleBlocks { + std::vector<SIScheduleBlock*> Blocks; + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; +}; + +enum SISchedulerBlockCreatorVariant { + LatenciesAlone, + LatenciesGrouped, + LatenciesAlonePlusConsecutive +}; + +class SIScheduleBlockCreator { + SIScheduleDAGMI *DAG; + // unique_ptr handles freeing memory for us. + std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs; + std::map<SISchedulerBlockCreatorVariant, + SIScheduleBlocks> Blocks; + std::vector<SIScheduleBlock*> CurrentBlocks; + std::vector<int> Node2CurrentBlock; + + // Topological sort + // Maps topological index to the node number. + std::vector<int> TopDownIndex2Block; + std::vector<int> TopDownBlock2Index; + std::vector<int> BottomUpIndex2Block; + + // 0 -> Color not given. + // 1 to SUnits.size() -> Reserved group (you should only add elements to them). + // Above -> Other groups. + int NextReservedID; + int NextNonReservedID; + std::vector<int> CurrentColoring; + std::vector<int> CurrentTopDownReservedDependencyColoring; + std::vector<int> CurrentBottomUpReservedDependencyColoring; + +public: + SIScheduleBlockCreator(SIScheduleDAGMI *DAG); + ~SIScheduleBlockCreator(); + + SIScheduleBlocks + getBlocks(SISchedulerBlockCreatorVariant BlockVariant); + + bool isSUInBlock(SUnit *SU, unsigned ID); + +private: + // Give a Reserved color to every high latency. + void colorHighLatenciesAlone(); + + // Create groups of high latencies with a Reserved color. + void colorHighLatenciesGroups(); + + // Compute coloring for topdown and bottom traversals with + // different colors depending on dependencies on Reserved colors. + void colorComputeReservedDependencies(); + + // Give color to all non-colored SUs according to Reserved groups dependencies. + void colorAccordingToReservedDependencies(); + + // Divides Blocks having no bottom up or top down dependencies on Reserved groups. + // The new colors are computed according to the dependencies on the other blocks + // formed with colorAccordingToReservedDependencies. + void colorEndsAccordingToDependencies(); + + // Cut groups into groups with SUs in consecutive order (except for Reserved groups). 
+ void colorForceConsecutiveOrderInGroup(); + + // Merge Constant loads that have all their users into another group to the group. + // (TODO: else if all their users depend on the same group, put them there) + void colorMergeConstantLoadsNextGroup(); + + // Merge SUs that have all their users into another group to the group + void colorMergeIfPossibleNextGroup(); + + // Merge SUs that have all their users into another group to the group, + // but only for Reserved groups. + void colorMergeIfPossibleNextGroupOnlyForReserved(); + + // Merge SUs that have all their users into another group to the group, + // but only if the group is no more than a few SUs. + void colorMergeIfPossibleSmallGroupsToNextGroup(); + + // Divides Blocks with important size. + // Idea of implementation: attribute new colors depending on topdown and + // bottom up links to other blocks. + void cutHugeBlocks(); + + // Put in one group all instructions with no users in this scheduling region + // (we'd want these groups be at the end). + void regroupNoUserInstructions(); + + void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); + + void topologicalSort(); + + void scheduleInsideBlocks(); + + void fillStats(); +}; + +enum SISchedulerBlockSchedulerVariant { + BlockLatencyRegUsage, + BlockRegUsageLatency, + BlockRegUsage +}; + +class SIScheduleBlockScheduler { + SIScheduleDAGMI *DAG; + SISchedulerBlockSchedulerVariant Variant; + std::vector<SIScheduleBlock*> Blocks; + + std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages; + std::set<unsigned> LiveRegs; + // Num of schedulable unscheduled blocks reading the register. + std::map<unsigned, unsigned> LiveRegsConsumers; + + std::vector<unsigned> LastPosHighLatencyParentScheduled; + int LastPosWaitedHighLatency; + + std::vector<SIScheduleBlock*> BlocksScheduled; + unsigned NumBlockScheduled; + std::vector<SIScheduleBlock*> ReadyBlocks; + + unsigned VregCurrentUsage; + unsigned SregCurrentUsage; + + // Currently is only approximation. + unsigned maxVregUsage; + unsigned maxSregUsage; + + std::vector<unsigned> BlockNumPredsLeft; + std::vector<unsigned> BlockNumSuccsLeft; + +public: + SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, + SISchedulerBlockSchedulerVariant Variant, + SIScheduleBlocks BlocksStruct); + ~SIScheduleBlockScheduler() {}; + + std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }; + + unsigned getVGPRUsage() { return maxVregUsage; }; + unsigned getSGPRUsage() { return maxSregUsage; }; + +private: + struct SIBlockSchedCandidate : SISchedulerCandidate { + // The best Block candidate. + SIScheduleBlock *Block; + + bool IsHighLatency; + int VGPRUsageDiff; + unsigned NumSuccessors; + unsigned NumHighLatencySuccessors; + unsigned LastPosHighLatParentScheduled; + unsigned Height; + + SIBlockSchedCandidate() + : Block(nullptr) {} + + bool isValid() const { return Block; } + + // Copy the status of another candidate without changing policy. 
+ void setBest(SIBlockSchedCandidate &Best) { + assert(Best.Reason != NoCand && "uninitialized Sched candidate"); + Block = Best.Block; + Reason = Best.Reason; + IsHighLatency = Best.IsHighLatency; + VGPRUsageDiff = Best.VGPRUsageDiff; + NumSuccessors = Best.NumSuccessors; + NumHighLatencySuccessors = Best.NumHighLatencySuccessors; + LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled; + Height = Best.Height; + } + }; + + bool tryCandidateLatency(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand, + SIBlockSchedCandidate &TryCand); + SIScheduleBlock *pickBlock(); + + void addLiveRegs(std::set<unsigned> &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs); + void releaseBlockSuccs(SIScheduleBlock *Parent); + void blockScheduled(SIScheduleBlock *Block); + + // Check register pressure change + // by scheduling a block with these LiveIn and LiveOut. + std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs, + std::set<unsigned> &OutRegs); + + void schedule(); +}; + +struct SIScheduleBlockResult { + std::vector<unsigned> SUs; + unsigned MaxSGPRUsage; + unsigned MaxVGPRUsage; +}; + +class SIScheduler { + SIScheduleDAGMI *DAG; + SIScheduleBlockCreator BlockCreator; + +public: + SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}; + + ~SIScheduler() {}; + + struct SIScheduleBlockResult + scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, + SISchedulerBlockSchedulerVariant ScheduleVariant); +}; + +class SIScheduleDAGMI : public ScheduleDAGMILive { + const SIInstrInfo *SITII; + const SIRegisterInfo *SITRI; + + std::vector<SUnit> SUnitsLinksBackup; + + // For moveLowLatencies. After all Scheduling variants are tested. + std::vector<unsigned> ScheduledSUnits; + std::vector<unsigned> ScheduledSUnitsInv; + + unsigned VGPRSetID; + unsigned SGPRSetID; + +public: + SIScheduleDAGMI(MachineSchedContext *C); + + ~SIScheduleDAGMI() override; + + // Entry point for the schedule. + void schedule() override; + + // To init Block's RPTracker. + void initRPTracker(RegPressureTracker &RPTracker) { + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + } + + MachineBasicBlock *getBB() { return BB; } + MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }; + MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }; + LiveIntervals *getLIS() { return LIS; } + MachineRegisterInfo *getMRI() { return &MRI; } + const TargetRegisterInfo *getTRI() { return TRI; } + SUnit& getEntrySU() { return EntrySU; }; + SUnit& getExitSU() { return ExitSU; }; + + void restoreSULinksLeft(); + + template<typename _Iterator> void fillVgprSgprCost(_Iterator First, + _Iterator End, + unsigned &VgprUsage, + unsigned &SgprUsage); + std::set<unsigned> getInRegs() { + std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), + RPTracker.getPressure().LiveInRegs.end()); + return InRegs; + }; + + unsigned getVGPRSetID() const { return VGPRSetID; } + unsigned getSGPRSetID() const { return SGPRSetID; } + +private: + void topologicalSort(); + // After scheduling is done, improve low latency placements. + void moveLowLatencies(); + +public: + // Some stats for scheduling inside blocks. + std::vector<unsigned> IsLowLatencySU; + std::vector<unsigned> LowLatencyOffset; + std::vector<unsigned> IsHighLatencySU; + // Topological sort + // Maps topological index to the node number. 
+ std::vector<int> TopDownIndex2SU; + std::vector<int> BottomUpIndex2SU; +}; + +} // namespace llvm + +#endif /* SIMACHINESCHEDULER_H_ */ diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 2afa00996609..609f5e7df549 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -23,7 +23,20 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {} +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { + unsigned NumRegPressureSets = getNumRegPressureSets(); + + SGPR32SetID = NumRegPressureSets; + VGPR32SetID = NumRegPressureSets; + for (unsigned i = 0; i < NumRegPressureSets; ++i) { + if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0) + SGPR32SetID = i; + else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) + VGPR32SetID = i; + } + assert(SGPR32SetID < NumRegPressureSets && + VGPR32SetID < NumRegPressureSets); +} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { MCRegAliasIterator R(Reg, this, true); @@ -36,18 +49,15 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { - unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; - if (ST.isXNACKEnabled()) - BaseIdx -= 4; - + // Leave space for flat_scr, xnack_mask, vcc, and alignment + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // 98/99 need to be reserved for flat_scr or 96/97 for flat_scr and - // 98/99 for xnack_mask, and 100/101 for vcc. This is the next sgpr128 down - // either way. + // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and + // 100/101 for vcc. This is the next sgpr128 down. return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; } @@ -58,25 +68,14 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); if (ST.hasSGPRInitBug()) { - unsigned Idx; - - if (!ST.isXNACKEnabled()) - Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; - else - Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; - + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; return AMDGPU::SGPR_32RegClass.getRegister(Idx); } if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - if (!ST.isXNACKEnabled()) { - // Next register before reservations for flat_scr and vcc. - return AMDGPU::SGPR97; - } else { - // Next register before reservations for flat_scr, xnack_mask, vcc, - // and scratch resource. - return AMDGPU::SGPR91; - } + // Next register before reservations for flat_scr, xnack_mask, vcc, + // and scratch resource. + return AMDGPU::SGPR91; } return AMDGPU::SGPR95; @@ -99,23 +98,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation - // for VCC/FLAT_SCR. + // for VCC/XNACK_MASK/FLAT_SCR. + // + // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose + // SGPRs when the XNACK feature is not used. 
This is currently not done + // because the code that counts SGPRs cannot account for such holes. + reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); - - if (ST.isXNACKEnabled()) - reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4; - - if (ST.isXNACKEnabled()) - Limit -= 2; + // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). + unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; for (unsigned i = Limit; i < NumSGPRs; ++i) { unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); @@ -479,12 +477,38 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( if (SubIdx == AMDGPU::NoSubRegister) return RC; - // If this register has a sub-register, we can safely assume it is a 32-bit - // register, because all of SI's sub-registers are 32-bit. + // We can assume that each lane corresponds to one 32-bit register. + unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx)); if (isSGPRClass(RC)) { - return &AMDGPU::SGPR_32RegClass; + switch (Count) { + case 1: + return &AMDGPU::SGPR_32RegClass; + case 2: + return &AMDGPU::SReg_64RegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } } else { - return &AMDGPU::VGPR_32RegClass; + switch (Count) { + case 1: + return &AMDGPU::VGPR_32RegClass; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return &AMDGPU::VReg_128RegClass; + case 8: + return &AMDGPU::VReg_256RegClass; + case 16: /* fall-through */ + default: + llvm_unreachable("Invalid sub-register class size"); + } } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 1795237c2140..9410e2049cba 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -25,6 +25,9 @@ namespace llvm { struct SIRegisterInfo : public AMDGPURegisterInfo { private: + unsigned SGPR32SetID; + unsigned VGPR32SetID; + void reserveRegisterTuples(BitVector &, unsigned Reg) const; public: @@ -146,6 +149,9 @@ public: unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; + unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; + unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, unsigned Value, diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index dbdc76b917f3..d36c5d29b127 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -98,6 +98,9 @@ void SITypeRewriter::visitCallInst(CallInst &I) { SmallVector <Type*, 8> Types; bool NeedToReplace = false; Function *F = I.getCalledFunction(); + if (!F) + return; + std::string Name = F->getName(); for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) { Value *Arg = I.getArgOperand(i); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index add415e215cf..3b4c235c0dc9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -106,20 +106,27 @@ bool isReadOnlySegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } -static const char ShaderTypeAttribute[] = "ShaderType"; - -unsigned getShaderType(const Function &F) { - Attribute A = F.getFnAttribute(ShaderTypeAttribute); - unsigned ShaderType = ShaderType::COMPUTE; +static unsigned getIntegerAttribute(const Function &F, const char *Name, + unsigned Default) { + Attribute A = F.getFnAttribute(Name); + unsigned Result = Default; if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) { + if (Str.getAsInteger(0, Result)) { LLVMContext &Ctx = F.getContext(); Ctx.emitError("can't parse shader type"); } } - return ShaderType; + return Result; +} + +unsigned getShaderType(const Function &F) { + return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +} + +unsigned getInitialPSInputAddr(const Function &F) { + return getIntegerAttribute(F, "InitialPSInputAddr", 0); } bool isSI(const MCSubtargetInfo &STI) { diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 19419a29f5e0..57cbe1b58f98 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -45,6 +45,8 @@ bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); unsigned getShaderType(const Function &F); +unsigned getInitialPSInputAddr(const Function &F); + bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 419717c85a79..a5207705fc69 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -87,9 +87,22 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } } + if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<ARMFunctionInfo>()->isSplitCSR() + ? CSR_iOS_CXX_TLS_PE_SaveList + : CSR_iOS_CXX_TLS_SaveList; return RegList; } +const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<ARMFunctionInfo>()->isSplitCSR()) + return CSR_iOS_CXX_TLS_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -97,6 +110,8 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) // This is academic becase all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; + if (STI.isTargetDarwin() && CC == CallingConv::CXX_FAST_TLS) + return CSR_iOS_CXX_TLS_RegMask; return STI.isTargetDarwin() ? 
CSR_iOS_RegMask : CSR_AAPCS_RegMask; } @@ -106,6 +121,14 @@ ARMBaseRegisterInfo::getNoPreservedMask() const { } const uint32_t * +ARMBaseRegisterInfo::getTLSCallPreservedMask(const MachineFunction &MF) const { + assert(MF.getSubtarget<ARMSubtarget>().isTargetDarwin() && + "only know about special TLS call on Darwin"); + return CSR_iOS_TLSCall_RegMask; +} + + +const uint32_t * ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index cea8b80c7821..6a9a45a65687 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -62,6 +62,12 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) { switch (Reg) { case D15: case D14: case D13: case D12: case D11: case D10: case D9: case D8: + case D7: case D6: case D5: case D4: + case D3: case D2: case D1: case D0: + case D31: case D30: case D29: case D28: + case D27: case D26: case D25: case D24: + case D23: case D22: case D21: case D20: + case D19: case D18: case D17: case D16: return true; default: return false; @@ -92,9 +98,12 @@ protected: public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; + const uint32_t *getTLSCallPreservedMask(const MachineFunction &MF) const; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i32 first argument if the calling convention diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 233516415149..847ef87c1b26 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -225,6 +225,21 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS_ThisReturn, R9))>; +def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, + (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// C++ TLS access function saves all registers except SP. Try to match +// the order of CSRs in CSR_iOS. +def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1), + (sequence "D%u", 31, 0))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>; + +// CSRs that are handled explicitly via copies. +def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>; + // The "interrupt" attribute is used to generate code that is acceptable in // exception-handlers of various kinds. 
It makes us use a different return // instruction (handled elsewhere) and affects which registers we must return to diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index 9bdf823c85bd..ff2fcfa349dc 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -578,7 +578,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. - if (VT != MVT::i32) return 0; + if (VT != MVT::i32 || GV->isThreadLocal()) return 0; Reloc::Model RelocM = TM.getRelocationModel(); bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM); @@ -2083,6 +2083,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 024244092a34..dfbb96959470 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -622,7 +622,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else Base = N; @@ -801,7 +802,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -1067,7 +1069,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -1186,7 +1189,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, if (N.getOpcode() == ISD::ADD) { return false; // We want to select register offset instead } else if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); } else { Base = N; @@ -1292,7 +1296,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress && + N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. 
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9cfb06b00c4b..37c0795af283 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -744,7 +744,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } - if (!Subtarget->isThumb1Only()) + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); // ARM does not have ROTL. @@ -1385,6 +1385,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, else return CallingConv::ARM_AAPCS; case CallingConv::Fast: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; @@ -2347,6 +2348,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (ARM::GPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else if (ARM::DPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } // Update chain and glue. RetOps[0] = Chain; @@ -2530,6 +2544,72 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address for Darwin, and return an +/// SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i32] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first word, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "r0". +/// +/// Since this descriptor may be in a different unit, in general access must +/// proceed along the usual ARM rules. A common sequence to produce is: +/// +/// movw rT1, :lower16:_var$non_lazy_ptr +/// movt rT1, :upper16:_var$non_lazy_ptr +/// ldr r0, [rT1] +/// ldr rT2, [r0] +/// blx rT2 +/// [...address now in r0...] +SDValue +ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + SDLoc DL(Op); + + // First step is to get the address of the actua global symbol. This is where + // the TLS descriptor lives. + SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. 
+ SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i32, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, true, true, 4); + Chain = FuncTLVGet.getValue(1); + + MachineFunction &F = DAG.getMachineFunction(); + MachineFrameInfo *MFI = F.getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be + // silly). + auto TRI = + getTargetMachine().getSubtargetImpl(*F.getFunction())->getRegisterInfo(); + auto ARI = static_cast<const ARMRegisterInfo *>(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: r0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); + Chain = + DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); +} + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, @@ -2631,9 +2711,11 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerGlobalTLSAddressDarwin(Op, DAG); + // TODO: implement the "local dynamic" model - assert(Subtarget->isTargetELF() && - "TLS not implemented for non-ELF targets"); + assert(Subtarget->isTargetELF() && "Only ELF implemented here"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); @@ -11407,7 +11489,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'J': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is @@ -11476,7 +11558,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'M': - if (Subtarget->isThumb()) { // FIXME thumb2 + if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) @@ -12324,3 +12406,49 @@ unsigned ARMTargetLowering::getExceptionSelectorRegister( // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } + +void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in ARMFunctionInfo. 
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void ARMTargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (ARM::GPRRegClass.contains(*I)) + RC = &ARM::GPRRegClass; + else if (ARM::DPRRegClass.contains(*I)) + RC = &ARM::DPRRegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index b764624f1492..96b56c3ec330 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -526,6 +526,8 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; + SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -578,6 +580,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index b9de83bfe6dc..c446ba3109e4 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -5398,6 +5398,27 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), Requires<[IsARM, UseMovt]>; } // isReMaterializable +// The many different faces of TLS access. 
+def : ARMPat<(ARMWrapper tglobaltlsaddr :$dst), + (MOVi32imm tglobaltlsaddr :$dst)>, + Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapper tglobaltlsaddr:$src), + (LDRLIT_ga_abs tglobaltlsaddr:$src)>, + Requires<[IsARM, DontUseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (MOV_ga_pcrel tglobaltlsaddr:$addr)>, Requires<[IsARM, UseMovt]>; + +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (LDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsARM, DontUseMovt]>; +let AddedComplexity = 10 in +def : Pat<(load (ARMWrapperPIC tglobaltlsaddr:$addr)), + (MOV_ga_pcrel_ldr tglobaltlsaddr:$addr)>, + Requires<[IsARM, UseMovt]>; + + // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index 7020ffb41b64..defef4ea9073 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -5689,7 +5689,10 @@ def : NEONInstAlias<"vmov${p} $Vd, $Vm", // VMOV : Vector Move (Immediate) -let isReMaterializable = 1 in { +// Although VMOVs are not strictly speaking cheap, they are as expensive +// as their copies counterpart (VORR), so we should prefer rematerialization +// over splitting when it applies. +let isReMaterializable = 1, isAsCheapAsAMove=1 in { def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd), (ins nImmSplatI8:$SIMM), IIC_VMOVImm, "vmov", "i8", "$Vd, $SIMM", "", @@ -5744,7 +5747,7 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, "vmov", "f32", "$Vd, $SIMM", "", [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; -} // isReMaterializable +} // isReMaterializable, isAsCheapAsAMove // Add support for bytes replication feature, so it could be GAS compatible. // E.g. 
instructions below: diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index df6f24306354..5b1f9a06442e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -1366,6 +1366,14 @@ def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src), (ARMWrapper tglobaladdr:$src))]>, Requires<[IsThumb, DontUseMovt]>; +// TLS globals +def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr), + (tLDRLIT_ga_pcrel tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; +def : Pat<(ARMWrapper tglobaltlsaddr:$addr), + (tLDRLIT_ga_abs tglobaltlsaddr:$addr)>, + Requires<[IsThumb, DontUseMovt]>; + // JumpTable def : T1Pat<(ARMWrapperJT tjumptable:$dst), diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index d460d33fa0a3..f42f4569b2f8 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -3875,6 +3875,13 @@ def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), } +def : T2Pat<(ARMWrapperPIC tglobaltlsaddr :$dst), + (t2MOV_ga_pcrel tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; +def : T2Pat<(ARMWrapper tglobaltlsaddr:$dst), + (t2MOVi32imm tglobaltlsaddr:$dst)>, + Requires<[IsThumb2, UseMovt]>; + // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index 050cd1a445ad..63e7940bb14e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -930,10 +930,10 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, // and could enable the conversion to float to be removed completely. def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, - Requires<[IsARM]>; + Requires<[IsARM, HasV6T2]>; def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, - Requires<[IsThumb2]>; + Requires<[IsThumb2, HasV6T2]>; def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, Requires<[IsARM]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index ac0330fbcb34..71ad7a4a732a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -20,4 +20,5 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {} + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), + IsSplitCSR(false) {} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index d6447978ef2c..68f9aec8cae5 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -118,6 +118,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// coalesced weights. 
DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: ARMFunctionInfo() : isThumb(false), @@ -128,7 +132,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false) {} + VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -199,6 +203,9 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) llvm_unreachable("Duplicate entries!"); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 6084f22c8470..57577dc834b7 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -388,6 +388,9 @@ private: size_t calculateContentSize() const; + // Reset state between object emissions + void reset() override; + public: ARMTargetELFStreamer(MCStreamer &S) : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID), @@ -415,7 +418,7 @@ public: MCCodeEmitter *Emitter, bool IsThumb) : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { - Reset(); + EHReset(); } ~ARMELFStreamer() {} @@ -579,7 +582,10 @@ private: } // Helper functions for ARM exception handling directives - void Reset(); + void EHReset(); + + // Reset state between object emissions + void reset() override; void EmitPersonalityFixup(StringRef Name); void FlushPendingOffset(); @@ -1040,6 +1046,8 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { getStreamer().emitInst(Inst, Suffix); } +void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } + void ARMELFStreamer::FinishImpl() { MCTargetStreamer &TS = *getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); @@ -1048,6 +1056,18 @@ void ARMELFStreamer::FinishImpl() { MCELFStreamer::FinishImpl(); } +void ARMELFStreamer::reset() { + MCTargetStreamer &TS = *getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + ATS.reset(); + MappingSymbolCounter = 0; + MCELFStreamer::reset(); + // MCELFStreamer clear's the assembler's e_flags. 
However, for + // arm we manually set the ABI version on streamer creation, so + // do the same here + getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); +} + inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags, @@ -1094,7 +1114,7 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { Kind)); } -void ARMELFStreamer::Reset() { +void ARMELFStreamer::EHReset() { ExTab = nullptr; FnStart = nullptr; Personality = nullptr; @@ -1164,7 +1184,7 @@ void ARMELFStreamer::emitFnEnd() { SwitchSection(&FnStart->getSection()); // Clean exception handling frame information - Reset(); + EHReset(); } void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index dad50f2834ee..c0d10c896354 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -38,6 +38,9 @@ void ARMTargetStreamer::emitCurrentConstantPool() { // finish() - write out any non-empty assembler constant pools. void ARMTargetStreamer::finish() { ConstantPools->emitAll(Streamer); } +// reset() - Reset any state +void ARMTargetStreamer::reset() {} + // The remaining callbacks should be handled separately by each // streamer. void ARMTargetStreamer::emitFnStart() {} diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h index 812f9830824d..27faac63683a 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -53,6 +53,11 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + /// Disable shrink wrap as tBfar/BL will be used to adjust for long jumps. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return false; + } + private: /// Check if the frame lowering of \p MF needs a special fixup /// code sequence for the epilogue. diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h new file mode 100644 index 000000000000..4c1667ed341c --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.h @@ -0,0 +1,54 @@ +//===-- AVR.h - Top-level interface for AVR representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AVR back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_H +#define LLVM_AVR_H + +#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" + +namespace llvm { + +class AVRTargetMachine; +class FunctionPass; + +FunctionPass *createAVRISelDag(AVRTargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAVRExpandPseudoPass(); +FunctionPass *createAVRFrameAnalyzerPass(); +FunctionPass *createAVRDynAllocaSRPass(); +FunctionPass *createAVRBranchSelectionPass(); + +/** + * Contains the AVR backend. 
+ */ +namespace AVR { + +enum AddressSpace { DataMemory, ProgramMemory }; + +template <typename T> bool isProgramMemoryAddress(T *V) { + return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory; +} + +inline bool isProgramMemoryAccess(MemSDNode const *N) { + auto V = N->getMemOperand()->getValue(); + + return (V != nullptr) ? isProgramMemoryAddress(V) : false; +} + +} // end of namespace AVR + +} // end namespace llvm + +#endif // LLVM_AVR_H diff --git a/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h new file mode 100644 index 000000000000..ee832ad1bc75 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRSelectionDAGInfo.h @@ -0,0 +1,29 @@ +//===-- AVRSelectionDAGInfo.h - AVR SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AVR subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_SELECTION_DAG_INFO_H +#define LLVM_AVR_SELECTION_DAG_INFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { +/** + * Holds information about the AVR instruction selection DAG. + */ +class AVRSelectionDAGInfo : public TargetSelectionDAGInfo { +public: +}; + +} // end namespace llvm + +#endif // LLVM_AVR_SELECTION_DAG_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp new file mode 100644 index 000000000000..85f03e818e83 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp @@ -0,0 +1,40 @@ +//===-- AVRTargetObjectFile.cpp - AVR Object Files ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AVRTargetObjectFile.h" + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +#include "AVR.h" + +namespace llvm { +void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { + Base::Initialize(Ctx, TM); + ProgmemDataSection = + Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC); +} + +MCSection * +AVRTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, Mangler &Mang, + const TargetMachine &TM) const { + // Global values in flash memory are placed in the progmem.data section + // unless they already have a user assigned section. + if (AVR::isProgramMemoryAddress(GV) && !GV->hasSection()) + return ProgmemDataSection; + + // Otherwise, we work the same way as ELF. 
+ return Base::SelectSectionForGlobal(GV, Kind, Mang, TM); +} +} // end of namespace llvm diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h new file mode 100644 index 000000000000..bdda35b34993 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.h @@ -0,0 +1,35 @@ +//===-- AVRTargetObjectFile.h - AVR Object Info -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_TARGET_OBJECT_FILE_H +#define LLVM_AVR_TARGET_OBJECT_FILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +namespace llvm { +/** + * Lowering for an AVR ELF32 object file. + */ +class AVRTargetObjectFile : public TargetLoweringObjectFileELF { + typedef TargetLoweringObjectFileELF Base; + +public: + void Initialize(MCContext &ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; + +private: + MCSection *ProgmemDataSection; +}; + +} // end namespace llvm + +#endif // LLVM_AVR_TARGET_OBJECT_FILE_H diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index e213089687e8..4c7c0392a132 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -190,9 +190,9 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } -MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, - MCStreamer &OutStreamer, - const MCOperand &Imm, int AlignSize) { +static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, + MCStreamer &OutStreamer, const MCOperand &Imm, + int AlignSize) { MCSymbol *Sym; int64_t Value; if (Imm.getExpr()->evaluateAsAbsolute(Value)) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 77907b054d54..4d2b54521e83 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1275,6 +1275,8 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, if (!BT.has(RD.Reg)) continue; const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); // Find a source operand that is equal to the result. for (auto &Op : MI->uses()) { @@ -1298,7 +1300,7 @@ bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, DebugLoc DL = MI->getDebugLoc(); const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); unsigned NewR = MRI.createVirtualRegister(FRC); - BuildMI(B, I, DL, HII.get(TargetOpcode::COPY), NewR) + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) .addReg(RS.Reg, 0, RS.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); BT.put(BitTracker::RegisterRef(NewR), SC); @@ -1925,7 +1927,9 @@ bool BitSimplification::genPackhl(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); DebugLoc DL = MI->getDebugLoc(); - BuildMI(B, MI, DL, HII.get(Hexagon::S2_packhl), NewR) + auto At = MI->isPHI() ? 
B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(Hexagon::S2_packhl), NewR) .addReg(Rs.Reg, 0, Rs.Sub) .addReg(Rt.Reg, 0, Rt.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); @@ -1950,9 +1954,11 @@ bool BitSimplification::genExtractHalf(MachineInstr *MI, // Prefer zxth, since zxth can go in any slot, while extractu only in // slots 2 and 3. unsigned NewR = 0; + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); if (L.Low && Opc != Hexagon::A2_zxth) { NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - BuildMI(B, MI, DL, HII.get(Hexagon::A2_zxth), NewR) + BuildMI(B, At, DL, HII.get(Hexagon::A2_zxth), NewR) .addReg(L.Reg, 0, L.Sub); } else if (!L.Low && Opc != Hexagon::S2_extractu) { NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); @@ -1989,7 +1995,9 @@ bool BitSimplification::genCombineHalf(MachineInstr *MI, MachineBasicBlock &B = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - BuildMI(B, MI, DL, HII.get(COpc), NewR) + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + BuildMI(B, At, DL, HII.get(COpc), NewR) .addReg(H.Reg, 0, H.Sub) .addReg(L.Reg, 0, L.Sub); HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); @@ -2043,7 +2051,9 @@ bool BitSimplification::genExtractLow(MachineInstr *MI, continue; unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); - auto MIB = BuildMI(B, MI, DL, HII.get(NewOpc), NewR) + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); + auto MIB = BuildMI(B, At, DL, HII.get(NewOpc), NewR) .addReg(RS.Reg, 0, RS.Sub); if (NewOpc == Hexagon::A2_andir) MIB.addImm((1 << W) - 1); @@ -2076,6 +2086,8 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI)) return false; MachineBasicBlock &B = *MI->getParent(); + auto At = MI->isPHI() ? B.getFirstNonPHI() + : MachineBasicBlock::iterator(MI); const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); const BitTracker::BitValue &V = SC[F+BN]; @@ -2098,7 +2110,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, } if (P != UINT_MAX) { unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); - BuildMI(B, MI, DL, HII.get(Hexagon::S2_tstbit_i), NewR) + BuildMI(B, At, DL, HII.get(Hexagon::S2_tstbit_i), NewR) .addReg(RR.Reg, 0, RR.Sub) .addImm(P); HBS::replaceReg(RD.Reg, NewR, MRI); @@ -2108,7 +2120,7 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI, } else if (V.is(0) || V.is(1)) { unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); unsigned NewOpc = V.is(0) ? Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue; - BuildMI(B, MI, DL, HII.get(NewOpc), NewR); + BuildMI(B, At, DL, HII.get(NewOpc), NewR); HBS::replaceReg(RD.Reg, NewR, MRI); return true; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td index 87d6b359f5fb..37c2042a2ccd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -3320,6 +3320,7 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, /* u16_0Imm */ addr{15-0}))); // Store upper-half and store doubleword cannot be NV. 
let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3425,6 +3426,7 @@ class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp, bit isAbs> !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + let Uses = !if (isAbs, [], [GP]); let IClass = 0b0100; let Inst{27} = 1; @@ -3736,7 +3738,7 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>; // if ([!]Pv[.new]) Rx=mem[bhwd](##global) //===----------------------------------------------------------------------===// -let isAsmParserOnly = 1 in +let isAsmParserOnly = 1, Uses = [GP] in class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<3> MajOp> : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp new file mode 100644 index 000000000000..06719cddf4b6 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.cpp @@ -0,0 +1,60 @@ +//===--- HexagonRDF.cpp ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonRDF.h" +#include "HexagonInstrInfo.h" +#include "HexagonRegisterInfo.h" + +#include "llvm/CodeGen/MachineInstr.h" + +using namespace llvm; +using namespace rdf; + +bool HexagonRegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + + if (TargetRegisterInfo::isVirtualRegister(RA.Reg) && + TargetRegisterInfo::isVirtualRegister(RB.Reg)) { + // Hexagon-specific cases. + if (RA.Reg == RB.Reg) { + if (RA.Sub == 0) + return true; + if (RB.Sub == 0) + return false; + } + } + + return RegisterAliasInfo::covers(RA, RB); +} + +bool HexagonRegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) + const { + if (RRs.count(RR)) + return true; + + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RR.Reg)); + // Check if both covering subregisters are present. + bool HasLo = RRs.count({RR.Reg, Hexagon::subreg_loreg}); + bool HasHi = RRs.count({RR.Reg, Hexagon::subreg_hireg}); + if (HasLo && HasHi) + return true; + } + + if (RR.Sub == 0) { + // Check if both covering subregisters are present. + unsigned Lo = TRI.getSubReg(RR.Reg, Hexagon::subreg_loreg); + unsigned Hi = TRI.getSubReg(RR.Reg, Hexagon::subreg_hireg); + if (RRs.count({Lo, 0}) && RRs.count({Hi, 0})) + return true; + } + + return RegisterAliasInfo::covers(RRs, RR); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h new file mode 100644 index 000000000000..00c1889e8eb5 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDF.h @@ -0,0 +1,28 @@ +//===--- HexagonRDF.h -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGON_RDF_H +#define HEXAGON_RDF_H +#include "RDFGraph.h" + +namespace llvm { + class TargetRegisterInfo; +} + +namespace rdf { + struct HexagonRegisterAliasInfo : public RegisterAliasInfo { + HexagonRegisterAliasInfo(const TargetRegisterInfo &TRI) + : RegisterAliasInfo(TRI) {} + bool covers(RegisterRef RA, RegisterRef RR) const override; + bool covers(const RegisterSet &RRs, RegisterRef RR) const override; + }; +} + +#endif + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp new file mode 100644 index 000000000000..3fcda984d265 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp @@ -0,0 +1,272 @@ +//===--- HexagonRDFOpt.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "HexagonInstrInfo.h" +#include "HexagonRDF.h" +#include "HexagonSubtarget.h" +#include "RDFCopy.h" +#include "RDFDeadCode.h" +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace llvm { + void initializeHexagonRDFOptPass(PassRegistry&); + FunctionPass *createHexagonRDFOpt(); +} + +namespace { + cl::opt<unsigned> RDFLimit("rdf-limit", cl::init(UINT_MAX)); + unsigned RDFCount = 0; + cl::opt<bool> RDFDump("rdf-dump", cl::init(false)); + + class HexagonRDFOpt : public MachineFunctionPass { + public: + HexagonRDFOpt() : MachineFunctionPass(ID) { + initializeHexagonRDFOptPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachineDominanceFrontier>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + const char *getPassName() const override { + return "Hexagon RDF optimizations"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + + static char ID; + + private: + MachineDominatorTree *MDT; + MachineRegisterInfo *MRI; + }; + + char HexagonRDFOpt::ID = 0; +} + +INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_END(HexagonRDFOpt, "rdfopt", "Hexagon RDF opt", false, false) + + +struct HexagonDCE : public DeadCodeElimination { + HexagonDCE(DataFlowGraph &G, MachineRegisterInfo &MRI) + : DeadCodeElimination(G, MRI) {} + bool rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove); + void removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum); + + bool run(); +}; + + +bool HexagonDCE::run() { + bool Collected = collect(); + if (!Collected) + return false; + + const SetVector<NodeId> &DeadNodes = getDeadNodes(); + const SetVector<NodeId> &DeadInstrs = getDeadInstrs(); + + typedef 
DenseMap<NodeId,NodeId> RefToInstrMap; + RefToInstrMap R2I; + SetVector<NodeId> PartlyDead; + DataFlowGraph &DFG = getDFG(); + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (auto TA : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Stmt>, DFG)) { + NodeAddr<StmtNode*> SA = TA; + for (NodeAddr<RefNode*> RA : SA.Addr->members(DFG)) { + R2I.insert(std::make_pair(RA.Id, SA.Id)); + if (DFG.IsDef(RA) && DeadNodes.count(RA.Id)) + if (!DeadInstrs.count(SA.Id)) + PartlyDead.insert(SA.Id); + } + } + } + + // Nodes to remove. + SetVector<NodeId> Remove = DeadInstrs; + + bool Changed = false; + for (NodeId N : PartlyDead) { + auto SA = DFG.addr<StmtNode*>(N); + if (trace()) + dbgs() << "Partly dead: " << *SA.Addr->getCode(); + Changed |= rewrite(SA, Remove); + } + + return erase(Remove) || Changed; +} + + +void HexagonDCE::removeOperand(NodeAddr<InstrNode*> IA, unsigned OpNum) { + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + + auto getOpNum = [MI] (MachineOperand &Op) -> unsigned { + for (unsigned i = 0, n = MI->getNumOperands(); i != n; ++i) + if (&MI->getOperand(i) == &Op) + return i; + llvm_unreachable("Invalid operand"); + }; + DenseMap<NodeId,unsigned> OpMap; + NodeList Refs = IA.Addr->members(getDFG()); + for (NodeAddr<RefNode*> RA : Refs) + OpMap.insert(std::make_pair(RA.Id, getOpNum(RA.Addr->getOp()))); + + MI->RemoveOperand(OpNum); + + for (NodeAddr<RefNode*> RA : Refs) { + unsigned N = OpMap[RA.Id]; + if (N < OpNum) + RA.Addr->setRegRef(&MI->getOperand(N)); + else if (N > OpNum) + RA.Addr->setRegRef(&MI->getOperand(N-1)); + } +} + + +bool HexagonDCE::rewrite(NodeAddr<InstrNode*> IA, SetVector<NodeId> &Remove) { + if (!getDFG().IsCode<NodeAttrs::Stmt>(IA)) + return false; + DataFlowGraph &DFG = getDFG(); + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + auto &HII = static_cast<const HexagonInstrInfo&>(DFG.getTII()); + if (HII.getAddrMode(MI) != HexagonII::PostInc) + return false; + unsigned Opc = MI->getOpcode(); + unsigned OpNum, NewOpc; + switch (Opc) { + case Hexagon::L2_loadri_pi: + NewOpc = Hexagon::L2_loadri_io; + OpNum = 1; + break; + case Hexagon::L2_loadrd_pi: + NewOpc = Hexagon::L2_loadrd_io; + OpNum = 1; + break; + case Hexagon::V6_vL32b_pi: + NewOpc = Hexagon::V6_vL32b_ai; + OpNum = 1; + break; + case Hexagon::S2_storeri_pi: + NewOpc = Hexagon::S2_storeri_io; + OpNum = 0; + break; + case Hexagon::S2_storerd_pi: + NewOpc = Hexagon::S2_storerd_io; + OpNum = 0; + break; + case Hexagon::V6_vS32b_pi: + NewOpc = Hexagon::V6_vS32b_ai; + OpNum = 0; + break; + default: + return false; + } + auto IsDead = [this] (NodeAddr<DefNode*> DA) -> bool { + return getDeadNodes().count(DA.Id); + }; + NodeList Defs; + MachineOperand &Op = MI->getOperand(OpNum); + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) { + if (&DA.Addr->getOp() != &Op) + continue; + Defs = DFG.getRelatedRefs(IA, DA); + if (!std::all_of(Defs.begin(), Defs.end(), IsDead)) + return false; + break; + } + + // Mark all nodes in Defs for removal. 
+ for (auto D : Defs) + Remove.insert(D.Id); + + if (trace()) + dbgs() << "Rewriting: " << *MI; + MI->setDesc(HII.get(NewOpc)); + MI->getOperand(OpNum+2).setImm(0); + removeOperand(IA, OpNum); + if (trace()) + dbgs() << " to: " << *MI; + + return true; +} + + +bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) { + if (RDFLimit.getPosition()) { + if (RDFCount >= RDFLimit) + return false; + RDFCount++; + } + + MDT = &getAnalysis<MachineDominatorTree>(); + const auto &MDF = getAnalysis<MachineDominanceFrontier>(); + const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + MRI = &MF.getRegInfo(); + + HexagonRegisterAliasInfo HAI(HRI); + TargetOperandInfo TOI(HII); + + if (RDFDump) + MF.print(dbgs() << "Before " << getPassName() << "\n", nullptr); + DataFlowGraph G(MF, HII, HRI, *MDT, MDF, HAI, TOI); + G.build(); + if (RDFDump) { + dbgs() << PrintNode<FuncNode*>(G.getFunc(), G) << '\n'; + dbgs() << MF.getName() << '\n'; + } + + bool Changed; + CopyPropagation CP(G); + CP.trace(RDFDump); + Changed = CP.run(); + if (Changed) + G.build(); + + HexagonDCE DCE(G, *MRI); + DCE.trace(RDFDump); + Changed |= DCE.run(); + + if (Changed) { + Liveness LV(*MRI, G); + LV.trace(RDFDump); + LV.computeLiveIns(); + LV.resetLiveIns(); + LV.resetKills(); + } + + if (RDFDump) + MF.print(dbgs() << "After " << getPassName() << "\n", nullptr); + return false; +} + + +FunctionPass *llvm::createHexagonRDFOpt() { + return new HexagonRDFOpt(); +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 61c0589fb5bf..6e5f7324aca8 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -103,6 +103,8 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::R30); Reserved.set(Hexagon::R31); Reserved.set(Hexagon::PC); + Reserved.set(Hexagon::GP); + Reserved.set(Hexagon::D14); Reserved.set(Hexagon::D15); Reserved.set(Hexagon::LC0); Reserved.set(Hexagon::LC1); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 9dccd696c989..34b03fb74cef 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -26,7 +26,11 @@ using namespace llvm; -static cl:: opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", + +static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore, + cl::init(true), cl::desc("Enable RDF-based optimizations")); + +static cl::opt<bool> DisableHardwareLoops("disable-hexagon-hwloops", cl::Hidden, cl::desc("Disable Hardware Loops for Hexagon target")); static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", @@ -111,6 +115,7 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(); FunctionPass *createHexagonPeephole(); + FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); @@ -262,9 +267,12 @@ void HexagonPassConfig::addPreRegAlloc() { } void HexagonPassConfig::addPostRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (EnableRDFOpt) + addPass(createHexagonRDFOpt()); if (!DisableHexagonCFGOpt) 
addPass(createHexagonCFGOptimizer(), false); + } } void HexagonPassConfig::addPreSched2() { diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index c2c6275e7e8d..4b07ca7490a8 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -334,21 +334,21 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; - ++ImpUses) { - if (*ImpUses == Hexagon::GP) { - switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { - case HexagonII::MemAccessSize::ByteAccess: - return fixup_Hexagon_GPREL16_0; - case HexagonII::MemAccessSize::HalfWordAccess: - return fixup_Hexagon_GPREL16_1; - case HexagonII::MemAccessSize::WordAccess: - return fixup_Hexagon_GPREL16_2; - case HexagonII::MemAccessSize::DoubleWordAccess: - return fixup_Hexagon_GPREL16_3; - default: - llvm_unreachable("unhandled fixup"); - } + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); + ImpUses && *ImpUses; ++ImpUses) { + if (*ImpUses != Hexagon::GP) + continue; + switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { + case HexagonII::MemAccessSize::ByteAccess: + return fixup_Hexagon_GPREL16_0; + case HexagonII::MemAccessSize::HalfWordAccess: + return fixup_Hexagon_GPREL16_1; + case HexagonII::MemAccessSize::WordAccess: + return fixup_Hexagon_GPREL16_2; + case HexagonII::MemAccessSize::DoubleWordAccess: + return fixup_Hexagon_GPREL16_3; + default: + llvm_unreachable("unhandled fixup"); } } } else diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 6ceb848ba20c..4e1cce3bd7d1 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -95,14 +95,7 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } -HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL; - -bool HexagonCVIResource::SetUp = HexagonCVIResource::setup(); - -bool HexagonCVIResource::setup() { - assert(!TUL); - TUL = new (TypeUnitsAndLanes); - +void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) { (*TUL)[HexagonII::TypeCVI_VA] = UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); @@ -123,13 +116,12 @@ bool HexagonCVIResource::setup() { (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); - - return true; } -HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, +HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL, + MCInstrInfo const &MCII, unsigned s, MCInst const *id) - : HexagonResource(s) { + : HexagonResource(s), TUL(TUL) { unsigned T = HexagonMCInstrInfo::getType(MCII, *id); if (TUL->count(T)) { @@ -153,6 +145,7 @@ HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { reset(); + HexagonCVIResource::SetupTUL(&TUL, STI.getCPU()); } void HexagonShuffler::reset() { @@ -163,7 +156,7 @@ void HexagonShuffler::reset() { void 
HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(MCII, ID, Extender, S, X); + HexagonInstr PI(&TUL, MCII, ID, Extender, S, X); Packet.push_back(PI); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 174f10fb2580..a093f8545132 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" using namespace llvm; @@ -53,9 +54,11 @@ public: // HVX insn resources. class HexagonCVIResource : public HexagonResource { +public: typedef std::pair<unsigned, unsigned> UnitsAndLanes; typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; +private: // Available HVX slots. enum { CVI_NONE = 0, @@ -65,9 +68,7 @@ class HexagonCVIResource : public HexagonResource { CVI_MPY1 = 1 << 3 }; - static bool SetUp; - static bool setup(); - static TypeUnitsAndLanes *TUL; + TypeUnitsAndLanes *TUL; // Count of adjacent slots that the insn requires to be executed. unsigned Lanes; @@ -81,7 +82,9 @@ class HexagonCVIResource : public HexagonResource { void setStore(bool f = true) { Store = f; }; public: - HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id); + HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII, + unsigned s, MCInst const *id); + static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU); bool isValid() const { return (Valid); }; unsigned getLanes() const { return (Lanes); }; @@ -100,10 +103,11 @@ class HexagonInstr { bool SoloException; public: - HexagonInstr(MCInstrInfo const &MCII, MCInst const *id, + HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T, + MCInstrInfo const &MCII, MCInst const *id, MCInst const *Extender, unsigned s, bool x = false) - : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id), - SoloException(x){}; + : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id), + SoloException(x) {}; MCInst const *getDesc() const { return (ID); }; @@ -136,6 +140,8 @@ class HexagonShuffler { // Shuffling error code. unsigned Error; + HexagonCVIResource::TypeUnitsAndLanes TUL; + protected: int64_t BundleFlags; MCInstrInfo const &MCII; diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp new file mode 100644 index 000000000000..c547c7195075 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -0,0 +1,180 @@ +//===--- RDFCopy.cpp ------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simplistic RDF-based copy propagation. 
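The core idea of this pass, stated independently of the RDF machinery: for a copy y = COPY x, a later use of y may be rewritten to use x only if the definition of x that reached the copy is still the reaching definition at that use (this is what the RDefMap lookups further down check). Below is a minimal, self-contained sketch of that test on a toy three-address program; the Instr struct and all names are illustrative only, not part of the patch.

#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy IR: each instruction defines one name from (at most) one used name.
struct Instr { std::string Def, Use; bool IsCopy; };

int main() {
  std::vector<Instr> Prog = {
    {"x", "",  false},   // d1: x = ...
    {"y", "x", true},    // c1: y = COPY x
    {"z", "y", false},   // u1: use of y -> may be rewritten to x (x still has def d1)
    {"x", "",  false},   // d2: x = ...  (the copy source is redefined)
    {"w", "y", false},   // u2: use of y -> must not be rewritten (reaching def of x changed)
  };

  std::map<std::string, int> CurDef;                          // name -> id of reaching def
  std::map<std::string, std::pair<std::string, int>> CopyOf;  // copy dst -> (src, src def id at copy)
  int DefId = 0;

  for (const Instr &I : Prog) {
    if (!I.Use.empty() && CopyOf.count(I.Use)) {
      const auto &C = CopyOf[I.Use];
      bool Ok = (CurDef[C.first] == C.second);  // same reaching def of the copy source?
      std::printf("use of %s: %s be replaced by %s\n", I.Use.c_str(),
                  Ok ? "can" : "cannot", C.first.c_str());
    }
    CurDef[I.Def] = ++DefId;
    if (I.IsCopy)
      CopyOf[I.Def] = {I.Use, CurDef[I.Use]};
    else
      CopyOf.erase(I.Def);
  }
  return 0;
}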
+ +#include "RDFCopy.h" +#include "RDFGraph.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +#include <atomic> + +#ifndef NDEBUG +static cl::opt<unsigned> CpLimit("rdf-cp-limit", cl::init(0), cl::Hidden); +static unsigned CpCount = 0; +#endif + +using namespace llvm; +using namespace rdf; + +void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI) { + assert(MI->getOpcode() == TargetOpcode::COPY); + const MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + RegisterRef DstR = { Op0.getReg(), Op0.getSubReg() }; + RegisterRef SrcR = { Op1.getReg(), Op1.getSubReg() }; + auto FS = DefM.find(SrcR); + if (FS == DefM.end() || FS->second.empty()) + return; + Copies.push_back(SA.Id); + RDefMap[SrcR][SA.Id] = FS->second.top()->Id; + // Insert DstR into the map. + RDefMap[DstR]; +} + + +void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) { + RegisterSet RRs; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + RRs.insert(RA.Addr->getRegRef()); + bool Common = false; + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + Common = true; + break; + } + if (!Common) + return; + + for (auto &R : RDefMap) { + if (!RRs.count(R.first)) + continue; + auto F = DefM.find(R.first); + if (F == DefM.end() || F->second.empty()) + continue; + R.second[IA.Id] = F->second.top()->Id; + } +} + + +bool CopyPropagation::scanBlock(MachineBasicBlock *B) { + bool Changed = false; + auto BA = DFG.getFunc().Addr->findBlock(B, DFG); + DFG.markBlock(BA.Id, DefM); + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) { + NodeAddr<StmtNode*> SA = IA; + MachineInstr *MI = SA.Addr->getCode(); + if (MI->isCopy()) + recordCopy(SA, MI); + } + + updateMap(IA); + DFG.pushDefs(IA, DefM); + } + + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) + Changed |= scanBlock(I->getBlock()); + + DFG.releaseBlock(BA.Id, DefM); + return Changed; +} + + +bool CopyPropagation::run() { + scanBlock(&DFG.getMF().front()); + + if (trace()) { + dbgs() << "Copies:\n"; + for (auto I : Copies) + dbgs() << *DFG.addr<StmtNode*>(I).Addr->getCode(); + dbgs() << "\nRDef map:\n"; + for (auto R : RDefMap) { + dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {"; + for (auto &M : R.second) + dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':' + << Print<NodeId>(M.second, DFG); + dbgs() << " }\n"; + } + } + + bool Changed = false; + NodeSet Deleted; +#ifndef NDEBUG + bool HasLimit = CpLimit.getNumOccurrences() > 0; +#endif + + for (auto I : Copies) { +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; +#endif + if (Deleted.count(I)) + continue; + auto SA = DFG.addr<InstrNode*>(I); + NodeList Ds = SA.Addr->members_if(DFG.IsDef, DFG); + if (Ds.size() != 1) + continue; + NodeAddr<DefNode*> DA = Ds[0]; + RegisterRef DR0 = DA.Addr->getRegRef(); + NodeList Us = SA.Addr->members_if(DFG.IsUse, DFG); + if (Us.size() != 1) + continue; + NodeAddr<UseNode*> UA0 = Us[0]; + RegisterRef UR0 = UA0.Addr->getRegRef(); + NodeId RD0 = UA0.Addr->getReachingDef(); + + for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) { + auto UA = DFG.addr<UseNode*>(N); + NextN = UA.Addr->getSibling(); + uint16_t F = UA.Addr->getFlags(); + if ((F & NodeAttrs::PhiRef) || (F & NodeAttrs::Fixed)) + continue; + if (UA.Addr->getRegRef() != DR0) + continue; + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG); + assert(DFG.IsCode<NodeAttrs::Stmt>(IA)); + 
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (RDefMap[UR0][IA.Id] != RD0) + continue; + MachineOperand &Op = UA.Addr->getOp(); + if (Op.isTied()) + continue; + if (trace()) { + dbgs() << "can replace " << Print<RegisterRef>(DR0, DFG) + << " with " << Print<RegisterRef>(UR0, DFG) << " in " + << *NodeAddr<StmtNode*>(IA).Addr->getCode(); + } + + Op.setReg(UR0.Reg); + Op.setSubReg(UR0.Sub); + Changed = true; +#ifndef NDEBUG + if (HasLimit && CpCount >= CpLimit) + break; + CpCount++; +#endif + + if (MI->isCopy()) { + MachineOperand &Op0 = MI->getOperand(0), &Op1 = MI->getOperand(1); + if (Op0.getReg() == Op1.getReg() && Op0.getSubReg() == Op1.getSubReg()) + MI->eraseFromParent(); + Deleted.insert(IA.Id); + } + } + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h new file mode 100644 index 000000000000..02531b94c9b0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h @@ -0,0 +1,48 @@ +//===--- RDFCopy.h --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef RDF_COPY_H +#define RDF_COPY_H + +#include "RDFGraph.h" +#include <map> +#include <vector> + +namespace llvm { + class MachineBasicBlock; + class MachineDominatorTree; + class MachineInstr; +} + +namespace rdf { + struct CopyPropagation { + CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), + Trace(false) {} + + bool run(); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + private: + const MachineDominatorTree &MDT; + DataFlowGraph &DFG; + DataFlowGraph::DefStackMap DefM; + bool Trace; + + // map: register -> (map: stmt -> reaching def) + std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap; + std::vector<NodeId> Copies; + + void recordCopy(NodeAddr<StmtNode*> SA, MachineInstr *MI); + void updateMap(NodeAddr<InstrNode*> IA); + bool scanBlock(MachineBasicBlock *B); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp new file mode 100644 index 000000000000..95668577bd50 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp @@ -0,0 +1,204 @@ +//===--- RDFDeadCode.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "RDFDeadCode.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Check if the given instruction has observable side-effects, i.e. if +// it should be considered "live". It is safe for this function to be +// overly conservative (i.e. return "true" for all instructions), but it +// is not safe to return "false" for an instruction that should not be +// considered removable. 
+bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const { + if (MI->mayStore() || MI->isBranch() || MI->isCall() || MI->isReturn()) + return true; + if (MI->hasOrderedMemoryRef() || MI->hasUnmodeledSideEffects()) + return true; + if (MI->isPHI()) + return false; + for (auto &Op : MI->operands()) + if (Op.isReg() && MRI.isReserved(Op.getReg())) + return true; + return false; +} + +void DeadCodeElimination::scanInstr(NodeAddr<InstrNode*> IA, + SetVector<NodeId> &WorkQ) { + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + return; + if (!isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + return; + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) { + if (!LiveNodes.count(RA.Id)) + WorkQ.insert(RA.Id); + } +} + +void DeadCodeElimination::processDef(NodeAddr<DefNode*> DA, + SetVector<NodeId> &WorkQ) { + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + if (!LiveNodes.count(UA.Id)) + WorkQ.insert(UA.Id); + } + for (NodeAddr<DefNode*> TA : DFG.getRelatedRefs(IA, DA)) + LiveNodes.insert(TA.Id); +} + +void DeadCodeElimination::processUse(NodeAddr<UseNode*> UA, + SetVector<NodeId> &WorkQ) { + for (NodeAddr<DefNode*> DA : LV.getAllReachingDefs(UA)) { + if (!LiveNodes.count(DA.Id)) + WorkQ.insert(DA.Id); + } +} + +// Traverse the DFG and collect the set dead RefNodes and the set of +// dead instructions. Return "true" if any of these sets is non-empty, +// "false" otherwise. +bool DeadCodeElimination::collect() { + // This function works by first finding all live nodes. The dead nodes + // are then the complement of the set of live nodes. + // + // Assume that all nodes are dead. Identify instructions which must be + // considered live, i.e. instructions with observable side-effects, such + // as calls and stores. All arguments of such instructions are considered + // live. For each live def, all operands used in the corresponding + // instruction are considered live. For each live use, all its reaching + // defs are considered live. + LiveNodes.clear(); + SetVector<NodeId> WorkQ; + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) + scanInstr(IA, WorkQ); + + while (!WorkQ.empty()) { + NodeId N = *WorkQ.begin(); + WorkQ.remove(N); + LiveNodes.insert(N); + auto RA = DFG.addr<RefNode*>(N); + if (DFG.IsDef(RA)) + processDef(RA, WorkQ); + else + processUse(RA, WorkQ); + } + + if (trace()) { + dbgs() << "Live nodes:\n"; + for (NodeId N : LiveNodes) { + auto RA = DFG.addr<RefNode*>(N); + dbgs() << PrintNode<RefNode*>(RA, DFG) << "\n"; + } + } + + auto IsDead = [this] (NodeAddr<InstrNode*> IA) -> bool { + for (NodeAddr<DefNode*> DA : IA.Addr->members_if(DFG.IsDef, DFG)) + if (LiveNodes.count(DA.Id)) + return false; + return true; + }; + + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + if (!LiveNodes.count(RA.Id)) + DeadNodes.insert(RA.Id); + if (DFG.IsCode<NodeAttrs::Stmt>(IA)) + if (isLiveInstr(NodeAddr<StmtNode*>(IA).Addr->getCode())) + continue; + if (IsDead(IA)) { + DeadInstrs.insert(IA.Id); + if (trace()) + dbgs() << "Dead instr: " << PrintNode<InstrNode*>(IA, DFG) << "\n"; + } + } + } + + return !DeadNodes.empty(); +} + +// Erase the nodes given in the Nodes set from DFG. 
In addition to removing +// them from the DFG, if a node corresponds to a statement, the corresponding +// machine instruction is erased from the function. +bool DeadCodeElimination::erase(const SetVector<NodeId> &Nodes) { + if (Nodes.empty()) + return false; + + // Prepare the actual set of ref nodes to remove: ref nodes from Nodes + // are included directly, for each InstrNode in Nodes, include the set + // of all RefNodes from it. + NodeList DRNs, DINs; + for (auto I : Nodes) { + auto BA = DFG.addr<NodeBase*>(I); + uint16_t Type = BA.Addr->getType(); + if (Type == NodeAttrs::Ref) { + DRNs.push_back(DFG.addr<RefNode*>(I)); + continue; + } + + // If it's a code node, add all ref nodes from it. + uint16_t Kind = BA.Addr->getKind(); + if (Kind == NodeAttrs::Stmt || Kind == NodeAttrs::Phi) { + for (auto N : NodeAddr<CodeNode*>(BA).Addr->members(DFG)) + DRNs.push_back(N); + DINs.push_back(DFG.addr<InstrNode*>(I)); + } else { + llvm_unreachable("Unexpected code node"); + return false; + } + } + + // Sort the list so that use nodes are removed first. This makes the + // "unlink" functions a bit faster. + auto UsesFirst = [] (NodeAddr<RefNode*> A, NodeAddr<RefNode*> B) -> bool { + uint16_t KindA = A.Addr->getKind(), KindB = B.Addr->getKind(); + if (KindA == NodeAttrs::Use && KindB == NodeAttrs::Def) + return true; + if (KindA == NodeAttrs::Def && KindB == NodeAttrs::Use) + return false; + return A.Id < B.Id; + }; + std::sort(DRNs.begin(), DRNs.end(), UsesFirst); + + if (trace()) + dbgs() << "Removing dead ref nodes:\n"; + for (NodeAddr<RefNode*> RA : DRNs) { + if (trace()) + dbgs() << " " << PrintNode<RefNode*>(RA, DFG) << '\n'; + if (DFG.IsUse(RA)) + DFG.unlinkUse(RA); + else if (DFG.IsDef(RA)) + DFG.unlinkDef(RA); + } + + // Now, remove all dead instruction nodes. + for (NodeAddr<InstrNode*> IA : DINs) { + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + BA.Addr->removeMember(IA, DFG); + if (!DFG.IsCode<NodeAttrs::Stmt>(IA)) + continue; + + MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + if (trace()) + dbgs() << "erasing: " << *MI; + MI->eraseFromParent(); + } + return true; +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h new file mode 100644 index 000000000000..f4373fb5007d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.h @@ -0,0 +1,65 @@ +//===--- RDFDeadCode.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// RDF-based generic dead code elimination. +// +// The main interface of this class are functions "collect" and "erase". +// This allows custom processing of the function being optimized by a +// particular consumer. The simplest way to use this class would be to +// instantiate an object, and then simply call "collect" and "erase", +// passing the result of "getDeadInstrs()" to it. +// A more complex scenario would be to call "collect" first, then visit +// all post-increment instructions to see if the address update is dead +// or not, and if it is, convert the instruction to a non-updating form. +// After that "erase" can be called with the set of nodes including both, +// dead defs from the updating instructions and the nodes corresponding +// to the dead instructions. 
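Concretely, the simplest client described in the comment above looks roughly like the following sketch. It assumes a DataFlowGraph G and a MachineRegisterInfo MRI are already set up, as HexagonRDFOpt::runOnMachineFunction does earlier in this patch; it is an illustration of the comment, not code from the patch.

// Sketch of the simple usage pattern from the comment above; G and MRI
// are assumed to exist (see HexagonRDFOpt::runOnMachineFunction).
rdf::DeadCodeElimination DCE(G, MRI);
DCE.trace(false);                    // set to true for the dbgs() trace
if (DCE.collect())                   // mark live nodes, gather dead ones
  DCE.erase(DCE.getDeadInstrs());    // erase only whole dead instructions

HexagonDCE::run earlier in this patch is an instance of the more involved scenario: it calls collect, rewrites dead post-increment address updates into non-updating forms, and only then calls erase with the combined set of nodes.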
+ +#ifndef RDF_DEADCODE_H +#define RDF_DEADCODE_H + +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" + +namespace llvm { + class MachineRegisterInfo; +} + +namespace rdf { + struct DeadCodeElimination { + DeadCodeElimination(DataFlowGraph &dfg, MachineRegisterInfo &mri) + : Trace(false), DFG(dfg), MRI(mri), LV(mri, dfg) {} + + bool collect(); + bool erase(const SetVector<NodeId> &Nodes); + void trace(bool On) { Trace = On; } + bool trace() const { return Trace; } + + SetVector<NodeId> getDeadNodes() { return DeadNodes; } + SetVector<NodeId> getDeadInstrs() { return DeadInstrs; } + DataFlowGraph &getDFG() { return DFG; } + + private: + bool Trace; + SetVector<NodeId> LiveNodes; + SetVector<NodeId> DeadNodes; + SetVector<NodeId> DeadInstrs; + DataFlowGraph &DFG; + MachineRegisterInfo &MRI; + Liveness LV; + + bool isLiveInstr(const MachineInstr *MI) const; + void scanInstr(NodeAddr<InstrNode*> IA, SetVector<NodeId> &WorkQ); + void processDef(NodeAddr<DefNode*> DA, SetVector<NodeId> &WorkQ); + void processUse(NodeAddr<UseNode*> UA, SetVector<NodeId> &WorkQ); + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp new file mode 100644 index 000000000000..9b47422153bb --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp @@ -0,0 +1,1716 @@ +//===--- RDFGraph.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF). +// +#include "RDFGraph.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +// Printing functions. Have them here first, so that the rest of the code +// can use them. 
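The printing scheme that follows relies on a small wrapper idiom: Print<T> bundles the object with the graph it belongs to, so one operator<< overload per node type can render ids and register references with full context. A self-contained toy version of the idiom (the Ctx struct and all names are illustrative only, not the RDF classes):

#include <iostream>

// Toy context standing in for the DataFlowGraph reference carried by Print<T>.
struct Ctx { const char *Prefix; };

template <typename T> struct Print {
  Print(const T &V, const Ctx &C) : Obj(V), G(C) {}
  const T &Obj;
  const Ctx &G;
};

// One overload per wrapped type selects how the value is rendered in context.
template <typename T>
std::ostream &operator<<(std::ostream &OS, const Print<T> &P) {
  return OS << P.G.Prefix << P.Obj;
}

int main() {
  Ctx G{"node #"};
  std::cout << Print<unsigned>(42, G) << '\n';  // prints "node #42"
  return 0;
}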
+namespace rdf { + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) { + auto &TRI = P.G.getTRI(); + if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) + OS << TRI.getName(P.Obj.Reg); + else + OS << '#' << P.Obj.Reg; + if (P.Obj.Sub > 0) { + OS << ':'; + if (P.Obj.Sub < TRI.getNumSubRegIndices()) + OS << TRI.getSubRegIndexName(P.Obj.Sub); + else + OS << '#' << P.Obj.Sub; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { + auto NA = P.G.addr<NodeBase*>(P.Obj); + uint16_t Attrs = NA.Addr->getAttrs(); + uint16_t Kind = NodeAttrs::kind(Attrs); + uint16_t Flags = NodeAttrs::flags(Attrs); + switch (NodeAttrs::type(Attrs)) { + case NodeAttrs::Code: + switch (Kind) { + case NodeAttrs::Func: OS << 'f'; break; + case NodeAttrs::Block: OS << 'b'; break; + case NodeAttrs::Stmt: OS << 's'; break; + case NodeAttrs::Phi: OS << 'p'; break; + default: OS << "c?"; break; + } + break; + case NodeAttrs::Ref: + if (Flags & NodeAttrs::Preserving) + OS << '+'; + if (Flags & NodeAttrs::Clobbering) + OS << '~'; + switch (Kind) { + case NodeAttrs::Use: OS << 'u'; break; + case NodeAttrs::Def: OS << 'd'; break; + case NodeAttrs::Block: OS << 'b'; break; + default: OS << "r?"; break; + } + break; + default: + OS << '?'; + break; + } + OS << P.Obj; + if (Flags & NodeAttrs::Shadow) + OS << '"'; + return OS; +} + +namespace { + void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA, + const DataFlowGraph &G) { + OS << Print<NodeId>(RA.Id, G) << '<' + << Print<RegisterRef>(RA.Addr->getRegRef(), G) << '>'; + if (RA.Addr->getFlags() & NodeAttrs::Fixed) + OS << '!'; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getReachedUse()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<PhiUseNode*>> &P) { + printRefHeader(OS, P.Obj, P.G); + OS << '('; + if (NodeId N = P.Obj.Addr->getReachingDef()) + OS << Print<NodeId>(N, P.G); + OS << ','; + if (NodeId N = P.Obj.Addr->getPredecessor()) + OS << Print<NodeId>(N, P.G); + OS << "):"; + if (NodeId N = P.Obj.Addr->getSibling()) + OS << Print<NodeId>(N, P.G); + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Def: + OS << PrintNode<DefNode*>(P.Obj, P.G); + break; + case NodeAttrs::Use: + if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) + OS << PrintNode<PhiUseNode*>(P.Obj, P.G); + else + OS << PrintNode<UseNode*>(P.Obj, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I.Id, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +template<> +raw_ostream 
&operator<< (raw_ostream &OS, const Print<NodeSet> &P) { + unsigned N = P.Obj.size(); + for (auto I : P.Obj) { + OS << Print<NodeId>(I, P.G); + if (--N) + OS << ' '; + } + return OS; +} + +namespace { + template <typename T> + struct PrintListV { + PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} + typedef T Type; + const NodeList &List; + const DataFlowGraph &G; + }; + + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) { + unsigned N = P.List.size(); + for (NodeAddr<T> A : P.List) { + OS << PrintNode<T>(A, P.G); + if (--N) + OS << ", "; + } + return OS; + } +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) { + OS << Print<NodeId>(P.Obj.Id, P.G) << ": phi [" + << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<StmtNode*>> &P) { + unsigned Opc = P.Obj.Addr->getCode()->getOpcode(); + OS << Print<NodeId>(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc) + << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<InstrNode*>> &P) { + switch (P.Obj.Addr->getKind()) { + case NodeAttrs::Phi: + OS << PrintNode<PhiNode*>(P.Obj, P.G); + break; + case NodeAttrs::Stmt: + OS << PrintNode<StmtNode*>(P.Obj, P.G); + break; + default: + OS << "instr? " << Print<NodeId>(P.Obj.Id, P.G); + break; + } + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<BlockNode*>> &P) { + auto *BB = P.Obj.Addr->getCode(); + unsigned NP = BB->pred_size(); + std::vector<int> Ns; + auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void { + unsigned N = Ns.size(); + for (auto I : Ns) { + OS << "BB#" << I; + if (--N) + OS << ", "; + } + }; + + OS << Print<NodeId>(P.Obj.Id, P.G) << ": === BB#" << BB->getNumber() + << " === preds(" << NP << "): "; + for (auto I : BB->predecessors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + + unsigned NS = BB->succ_size(); + OS << " succs(" << NS << "): "; + Ns.clear(); + for (auto I : BB->successors()) + Ns.push_back(I->getNumber()); + PrintBBs(Ns); + OS << '\n'; + + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<InstrNode*>(I, P.G) << '\n'; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<NodeAddr<FuncNode*>> &P) { + OS << "DFG dump:[\n" << Print<NodeId>(P.Obj.Id, P.G) << ": Function: " + << P.Obj.Addr->getCode()->getName() << '\n'; + for (auto I : P.Obj.Addr->members(P.G)) + OS << PrintNode<BlockNode*>(I, P.G) << '\n'; + OS << "]\n"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { + OS << '{'; + for (auto I : P.Obj) + OS << ' ' << Print<RegisterRef>(I, P.G); + OS << " }"; + return OS; +} + +template<> +raw_ostream &operator<< (raw_ostream &OS, + const Print<DataFlowGraph::DefStack> &P) { + for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { + OS << Print<NodeId>(I->Id, P.G) + << '<' << Print<RegisterRef>(I->Addr->getRegRef(), P.G) << '>'; + I.down(); + if (I != E) + OS << ' '; + } + return OS; +} + +} // namespace rdf + +// Node allocation functions. +// +// Node allocator is like a slab memory allocator: it allocates blocks of +// memory in sizes that are multiples of the size of a node. Each block has +// the same size. 
Nodes are allocated from the currently active block, and +// when it becomes full, a new one is created. +// There is a mapping scheme between node id and its location in a block, +// and within that block is described in the header file. +// +void NodeAllocator::startNewBlock() { + void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize); + char *P = static_cast<char*>(T); + Blocks.push_back(P); + // Check if the block index is still within the allowed range, i.e. less + // than 2^N, where N is the number of bits in NodeId for the block index. + // BitsPerIndex is the number of bits per node index. + assert((Blocks.size() < (1U << (8*sizeof(NodeId)-BitsPerIndex))) && + "Out of bits for block index"); + ActiveEnd = P; +} + +bool NodeAllocator::needNewBlock() { + if (Blocks.empty()) + return true; + + char *ActiveBegin = Blocks.back(); + uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize; + return Index >= NodesPerBlock; +} + +NodeAddr<NodeBase*> NodeAllocator::New() { + if (needNewBlock()) + startNewBlock(); + + uint32_t ActiveB = Blocks.size()-1; + uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize; + NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd), + makeId(ActiveB, Index) }; + ActiveEnd += NodeMemSize; + return NA; +} + +NodeId NodeAllocator::id(const NodeBase *P) const { + uintptr_t A = reinterpret_cast<uintptr_t>(P); + for (unsigned i = 0, n = Blocks.size(); i != n; ++i) { + uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]); + if (A < B || A >= B + NodesPerBlock*NodeMemSize) + continue; + uint32_t Idx = (A-B)/NodeMemSize; + return makeId(i, Idx); + } + llvm_unreachable("Invalid node address"); +} + +void NodeAllocator::clear() { + MemPool.Reset(); + Blocks.clear(); + ActiveEnd = nullptr; +} + + +// Insert node NA after "this" in the circular chain. +void NodeBase::append(NodeAddr<NodeBase*> NA) { + NodeId Nx = Next; + // If NA is already "next", do nothing. + if (Next != NA.Id) { + Next = NA.Id; + NA.Addr->Next = Nx; + } +} + + +// Fundamental node manipulator functions. + +// Obtain the register reference from a reference node. +RegisterRef RefNode::getRegRef() const { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef) + return Ref.RR; + assert(Ref.Op != nullptr); + return { Ref.Op->getReg(), Ref.Op->getSubReg() }; +} + +// Set the register reference in the reference node directly (for references +// in phi nodes). +void RefNode::setRegRef(RegisterRef RR) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef); + Ref.RR = RR; +} + +// Set the register reference in the reference node based on a machine +// operand (for references in statement nodes). +void RefNode::setRegRef(MachineOperand *Op) { + assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); + assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)); + Ref.Op = Op; +} + +// Get the owner of a given reference node. +NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Code) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Connect the def node to the reaching def node. +void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedDef(); + DA.Addr->setReachedDef(Self); +} + +// Connect the use node to the reaching def node. 
+void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { + Ref.RD = DA.Id; + Ref.Sib = DA.Addr->getReachedUse(); + DA.Addr->setReachedUse(Self); +} + +// Get the first member of the code node. +NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const { + if (Code.FirstM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.FirstM); +} + +// Get the last member of the code node. +NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const { + if (Code.LastM == 0) + return NodeAddr<NodeBase*>(); + return G.addr<NodeBase*>(Code.LastM); +} + +// Add node NA at the end of the member list of the given code node. +void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto ML = getLastMember(G); + if (ML.Id != 0) { + ML.Addr->append(NA); + } else { + Code.FirstM = NA.Id; + NodeId Self = G.id(this); + NA.Addr->setNext(Self); + } + Code.LastM = NA.Id; +} + +// Add node NA after member node MA in the given code node. +void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G) { + MA.Addr->append(NA); + if (Code.LastM == MA.Id) + Code.LastM = NA.Id; +} + +// Remove member node NA from the given code node. +void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { + auto MA = getFirstMember(G); + assert(MA.Id != 0); + + // Special handling if the member to remove is the first member. + if (MA.Id == NA.Id) { + if (Code.LastM == MA.Id) { + // If it is the only member, set both first and last to 0. + Code.FirstM = Code.LastM = 0; + } else { + // Otherwise, advance the first member. + Code.FirstM = MA.Addr->getNext(); + } + return; + } + + while (MA.Addr != this) { + NodeId MX = MA.Addr->getNext(); + if (MX == NA.Id) { + MA.Addr->setNext(NA.Addr->getNext()); + // If the member to remove happens to be the last one, update the + // LastM indicator. + if (Code.LastM == NA.Id) + Code.LastM = MA.Id; + return; + } + MA = G.addr<NodeBase*>(MX); + } + llvm_unreachable("No such member"); +} + +// Return the list of all members of the code node. +NodeList CodeNode::members(const DataFlowGraph &G) const { + static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; }; + return members_if(True, G); +} + +// Return the owner of the given instr node. +NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) { + NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + assert(NA.Addr->getType() == NodeAttrs::Code); + if (NA.Addr->getKind() == NodeAttrs::Block) + return NA; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } + llvm_unreachable("No owner in circular list"); +} + +// Add the phi node PA to the given block node. +void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { + auto M = getFirstMember(G); + if (M.Id == 0) { + addMember(PA, G); + return; + } + + assert(M.Addr->getType() == NodeAttrs::Code); + if (M.Addr->getKind() == NodeAttrs::Stmt) { + // If the first member of the block is a statement, insert the phi as + // the first member. + Code.FirstM = PA.Id; + PA.Addr->setNext(M.Id); + } else { + // If the first member is a phi, find the last phi, and append PA to it. + assert(M.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<NodeBase*> MN = M; + do { + M = MN; + MN = G.addr<NodeBase*>(M.Addr->getNext()); + assert(MN.Addr->getType() == NodeAttrs::Code); + } while (MN.Addr->getKind() == NodeAttrs::Phi); + + // M is the last phi. 
+ addMemberAfter(M, PA, G); + } +} + +// Find the block node corresponding to the machine basic block BB in the +// given func node. +NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const { + auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool { + return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB; + }; + NodeList Ms = members_if(EqBB, G); + if (!Ms.empty()) + return Ms[0]; + return NodeAddr<BlockNode*>(); +} + +// Get the block node for the entry block in the given function. +NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { + MachineBasicBlock *EntryB = &getCode()->front(); + return findBlock(EntryB, G); +} + + +// Register aliasing information. +// +// In theory, the lane information could be used to determine register +// covering (and aliasing), but depending on the sub-register structure, +// the lane mask information may be missing. The covering information +// must be available for this framework to work, so relying solely on +// the lane data is not sufficient. + +// Determine whether RA covers RB. +bool RegisterAliasInfo::covers(RegisterRef RA, RegisterRef RB) const { + if (RA == RB) + return true; + if (TargetRegisterInfo::isVirtualRegister(RA.Reg)) { + assert(TargetRegisterInfo::isVirtualRegister(RB.Reg)); + if (RA.Reg != RB.Reg) + return false; + if (RA.Sub == 0) + return true; + return TRI.composeSubRegIndices(RA.Sub, RB.Sub) == RA.Sub; + } + + assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg) && + TargetRegisterInfo::isPhysicalRegister(RB.Reg)); + unsigned A = RA.Sub != 0 ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub != 0 ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + return TRI.isSubRegister(A, B); +} + +// Determine whether RR is covered by the set of references RRs. +bool RegisterAliasInfo::covers(const RegisterSet &RRs, RegisterRef RR) const { + if (RRs.count(RR)) + return true; + + // For virtual registers, we cannot accurately determine covering based + // on subregisters. If RR itself is not present in RRs, but it has a sub- + // register reference, check for the super-register alone. Otherwise, + // assume non-covering. + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) { + if (RR.Sub != 0) + return RRs.count({RR.Reg, 0}); + return false; + } + + // If any super-register of RR is present, then RR is covered. + unsigned Reg = RR.Sub == 0 ? RR.Reg : TRI.getSubReg(RR.Reg, RR.Sub); + for (MCSuperRegIterator SR(Reg, &TRI); SR.isValid(); ++SR) + if (RRs.count({*SR, 0})) + return true; + + return false; +} + +// Get the list of references aliased to RR. +std::vector<RegisterRef> RegisterAliasInfo::getAliasSet(RegisterRef RR) const { + // Do not include RR in the alias set. For virtual registers return an + // empty set. + std::vector<RegisterRef> AS; + if (TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return AS; + assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg)); + unsigned R = RR.Reg; + if (RR.Sub) + R = TRI.getSubReg(RR.Reg, RR.Sub); + + for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI) + AS.push_back(RegisterRef({*AI, 0})); + return AS; +} + +// Check whether RA and RB are aliased. 
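For virtual registers, the alias test in the function below reduces to an interval-overlap check on the (offset, size) pairs returned by getSubRegIdxOffset/getSubRegIdxSize. As a standalone illustration of just that check (the numbers are made up):

#include <cassert>

// Two ranges [O, O+S) overlap iff one of them starts inside the other.
// (Offsets/sizes stand in for TRI.getSubRegIdxOffset/Size values.)
static bool rangesOverlap(unsigned OA, unsigned SA, unsigned OB, unsigned SB) {
  return (OA <= OB && OA + SA > OB) || (OB <= OA && OB + SB > OA);
}

int main() {
  assert(rangesOverlap(0, 32, 0, 64));    // low half vs. full 64-bit register
  assert(!rangesOverlap(0, 32, 32, 32));  // low half vs. high half: disjoint
  return 0;
}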
+bool RegisterAliasInfo::alias(RegisterRef RA, RegisterRef RB) const { + bool VirtA = TargetRegisterInfo::isVirtualRegister(RA.Reg); + bool VirtB = TargetRegisterInfo::isVirtualRegister(RB.Reg); + bool PhysA = TargetRegisterInfo::isPhysicalRegister(RA.Reg); + bool PhysB = TargetRegisterInfo::isPhysicalRegister(RB.Reg); + + if (VirtA != VirtB) + return false; + + if (VirtA) { + if (RA.Reg != RB.Reg) + return false; + // RA and RB refer to the same register. If any of them refer to the + // whole register, they must be aliased. + if (RA.Sub == 0 || RB.Sub == 0) + return true; + unsigned SA = TRI.getSubRegIdxSize(RA.Sub); + unsigned OA = TRI.getSubRegIdxOffset(RA.Sub); + unsigned SB = TRI.getSubRegIdxSize(RB.Sub); + unsigned OB = TRI.getSubRegIdxOffset(RB.Sub); + if (OA <= OB && OA+SA > OB) + return true; + if (OB <= OA && OB+SB > OA) + return true; + return false; + } + + assert(PhysA && PhysB); + (void)PhysA, (void)PhysB; + unsigned A = RA.Sub ? TRI.getSubReg(RA.Reg, RA.Sub) : RA.Reg; + unsigned B = RB.Sub ? TRI.getSubReg(RB.Reg, RB.Sub) : RB.Reg; + for (MCRegAliasIterator I(A, &TRI, true); I.isValid(); ++I) + if (B == *I) + return true; + return false; +} + + +// Target operand information. +// + +// For a given instruction, check if there are any bits of RR that can remain +// unchanged across this def. +bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum) + const { + return TII.isPredicated(&In); +} + +// Check if the definition of RR produces an unspecified value. +bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall()) + if (In.getOperand(OpNum).isImplicit()) + return true; + return false; +} + +// Check if the given instruction specifically requires +bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) + const { + if (In.isCall() || In.isReturn()) + return true; + const MCInstrDesc &D = In.getDesc(); + if (!D.getImplicitDefs() && !D.getImplicitUses()) + return false; + const MachineOperand &Op = In.getOperand(OpNum); + // If there is a sub-register, treat the operand as non-fixed. Currently, + // fixed registers are those that are listed in the descriptor as implicit + // uses or defs, and those lists do not allow sub-registers. + if (Op.getSubReg() != 0) + return false; + unsigned Reg = Op.getReg(); + const MCPhysReg *ImpR = Op.isDef() ? D.getImplicitDefs() + : D.getImplicitUses(); + if (!ImpR) + return false; + while (*ImpR) + if (*ImpR++ == Reg) + return true; + return false; +} + + +// +// The data flow graph construction. +// + +DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi) + : TimeG("rdf"), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), RAI(rai), + TOI(toi) { +} + + +// The implementation of the definition stack. +// Each register reference has its own definition stack. In particular, +// for a register references "Reg" and "Reg:subreg" will each have their +// own definition stacks. + +// Construct a stack iterator. +DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, + bool Top) : DS(S) { + if (!Top) { + // Initialize to bottom. + Pos = 0; + return; + } + // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty). 
+ Pos = DS.Stack.size(); + while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1])) + Pos--; +} + +// Return the size of the stack, including block delimiters. +unsigned DataFlowGraph::DefStack::size() const { + unsigned S = 0; + for (auto I = top(), E = bottom(); I != E; I.down()) + S++; + return S; +} + +// Remove the top entry from the stack. Remove all intervening delimiters +// so that after this, the stack is either empty, or the top of the stack +// is a non-delimiter. +void DataFlowGraph::DefStack::pop() { + assert(!empty()); + unsigned P = nextDown(Stack.size()); + Stack.resize(P); +} + +// Push a delimiter for block node N on the stack. +void DataFlowGraph::DefStack::start_block(NodeId N) { + assert(N != 0); + Stack.push_back(NodeAddr<DefNode*>(nullptr, N)); +} + +// Remove all nodes from the top of the stack, until the delimited for +// block node N is encountered. Remove the delimiter as well. In effect, +// this will remove from the stack all definitions from block N. +void DataFlowGraph::DefStack::clear_block(NodeId N) { + assert(N != 0); + unsigned P = Stack.size(); + while (P > 0) { + bool Found = isDelimiter(Stack[P-1], N); + P--; + if (Found) + break; + } + // This will also remove the delimiter, if found. + Stack.resize(P); +} + +// Move the stack iterator up by one. +unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const { + // Get the next valid position after P (skipping all delimiters). + // The input position P does not have to point to a non-delimiter. + unsigned SS = Stack.size(); + bool IsDelim; + assert(P < SS); + do { + P++; + IsDelim = isDelimiter(Stack[P-1]); + } while (P < SS && IsDelim); + assert(!IsDelim); + return P; +} + +// Move the stack iterator down by one. +unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const { + // Get the preceding valid position before P (skipping all delimiters). + // The input position P does not have to point to a non-delimiter. + assert(P > 0 && P <= Stack.size()); + bool IsDelim = isDelimiter(Stack[P-1]); + do { + if (--P == 0) + break; + IsDelim = isDelimiter(Stack[P-1]); + } while (P > 0 && IsDelim); + assert(!IsDelim); + return P; +} + +// Node management functions. + +// Get the pointer to the node with the id N. +NodeBase *DataFlowGraph::ptr(NodeId N) const { + if (N == 0) + return nullptr; + return Memory.ptr(N); +} + +// Get the id of the node at the address P. +NodeId DataFlowGraph::id(const NodeBase *P) const { + if (P == nullptr) + return 0; + return Memory.id(P); +} + +// Allocate a new node and set the attributes to Attrs. +NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) { + NodeAddr<NodeBase*> P = Memory.New(); + P.Addr->init(); + P.Addr->setAttrs(Attrs); + return P; +} + +// Make a copy of the given node B, except for the data-flow links, which +// are set to 0. +NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) { + NodeAddr<NodeBase*> NA = newNode(0); + memcpy(NA.Addr, B.Addr, sizeof(NodeBase)); + // Ref nodes need to have the data-flow links reset. + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + RA.Addr->setReachingDef(0); + RA.Addr->setSibling(0); + if (NA.Addr->getKind() == NodeAttrs::Def) { + NodeAddr<DefNode*> DA = NA; + DA.Addr->setReachedDef(0); + DA.Addr->setReachedUse(0); + } + } + return NA; +} + + +// Allocation routines for specific node types/kinds. 
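Before those routines, one remark on the DefStack implemented above: the delimiter discipline is what markBlock, releaseBlock and pushDefs (further below) rely on when the builder walks the dominator tree. Entering a block pushes a delimiter on every active stack; leaving it pops everything back to, and including, that delimiter. A self-contained toy model of that discipline (ToyDefStack and its names are illustrative, not the real class):

#include <cassert>
#include <string>
#include <vector>

// Toy per-register definition stack with block delimiters, so that leaving
// a block removes exactly the defs pushed while that block was visited.
struct ToyDefStack {
  struct Entry { bool IsDelim; unsigned Block; std::string Def; };
  std::vector<Entry> S;

  void startBlock(unsigned B)        { S.push_back({true, B, ""}); }
  void pushDef(const std::string &D) { S.push_back({false, 0, D}); }
  void clearBlock(unsigned B) {        // pop defs up to and including B's delimiter
    while (!S.empty()) {
      Entry E = S.back();
      S.pop_back();
      if (E.IsDelim && E.Block == B)
        break;
    }
  }
  std::string top() const {            // top-most non-delimiter, or "" if none
    for (auto I = S.rbegin(); I != S.rend(); ++I)
      if (!I->IsDelim)
        return I->Def;
    return "";
  }
};

int main() {
  ToyDefStack RegR0;
  RegR0.startBlock(1); RegR0.pushDef("d1");   // defs from block 1
  RegR0.startBlock(2); RegR0.pushDef("d2");   // defs from a dominated block 2
  assert(RegR0.top() == "d2");
  RegR0.clearBlock(2);                        // leaving block 2
  assert(RegR0.top() == "d1");
  RegR0.clearBlock(1);                        // leaving block 1
  assert(RegR0.top().empty());
  return 0;
}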
+ +NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + UA.Addr->setRegRef(&Op); + return UA; +} + +NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) { + NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); + assert(Flags & NodeAttrs::PhiRef); + PUA.Addr->setRegRef(RR); + PUA.Addr->setPredecessor(PredB.Id); + return PUA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + DA.Addr->setRegRef(&Op); + return DA; +} + +NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags) { + NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); + assert(Flags & NodeAttrs::PhiRef); + DA.Addr->setRegRef(RR); + return DA; +} + +NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) { + NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); + Owner.Addr->addPhi(PA, *this); + return PA; +} + +NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI) { + NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); + SA.Addr->setCode(MI); + Owner.Addr->addMember(SA, *this); + return SA; +} + +NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB) { + NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block); + BA.Addr->setCode(BB); + Owner.Addr->addMember(BA, *this); + return BA; +} + +NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) { + NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func); + FA.Addr->setCode(MF); + return FA; +} + +// Build the data flow graph. +void DataFlowGraph::build() { + reset(); + Func = newFunc(&MF); + + if (MF.empty()) + return; + + for (auto &B : MF) { + auto BA = newBlock(Func, &B); + for (auto &I : B) { + if (I.isDebugValue()) + continue; + buildStmt(BA, I); + } + } + + // Collect information about block references. + NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this); + BlockRefsMap RefM; + buildBlockRefs(EA, RefM); + + // Add function-entry phi nodes. + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) { + NodeAddr<PhiNode*> PA = newPhi(EA); + RegisterRef RR = { I->first, 0 }; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + + // Build a map "PhiM" which will contain, for each block, the set + // of references that will require phi definitions in that block. + BlockRefsMap PhiM; + auto Blocks = Func.Addr->members(*this); + for (NodeAddr<BlockNode*> BA : Blocks) + recordDefsForDF(PhiM, RefM, BA); + for (NodeAddr<BlockNode*> BA : Blocks) + buildPhis(PhiM, RefM, BA); + + // Link all the refs. This will recursively traverse the dominator tree. + DefStackMap DM; + linkBlockRefs(DM, EA); + + // Finally, remove all unused phi nodes. + removeUnusedPhis(); +} + +// For each stack in the map DefM, push the delimiter for block B on it. +void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) { + // Push block delimiters. 
+ for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.start_block(B); +} + +// Remove all definitions coming from block B from each stack in DefM. +void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) { + // Pop all defs from this block from the definition stack. Defs that were + // added to the map during the traversal of instructions will not have a + // delimiter, but for those, the whole stack will be emptied. + for (auto I = DefM.begin(), E = DefM.end(); I != E; ++I) + I->second.clear_block(B); + + // Finally, remove empty stacks from the map. + for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) { + NextI = std::next(I); + // This preserves the validity of iterators other than I. + if (I->second.empty()) + DefM.erase(I); + } +} + +// Push all definitions from the instruction node IA to an appropriate +// stack in DefM. +void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { + NodeList Defs = IA.Addr->members_if(IsDef, *this); + NodeSet Visited; +#ifndef NDEBUG + RegisterSet Defined; +#endif + + // The important objectives of this function are: + // - to be able to handle instructions both while the graph is being + // constructed, and after the graph has been constructed, and + // - maintain proper ordering of definitions on the stack for each + // register reference: + // - if there are two or more related defs in IA (i.e. coming from + // the same machine operand), then only push one def on the stack, + // - if there are multiple unrelated defs of non-overlapping + // subregisters of S, then the stack for S will have both (in an + // unspecified order), but the order does not matter from the data- + // -flow perspective. + + for (NodeAddr<DefNode*> DA : Defs) { + if (Visited.count(DA.Id)) + continue; + NodeList Rel = getRelatedRefs(IA, DA); + NodeAddr<DefNode*> PDA = Rel.front(); + // Push the definition on the stack for the register and all aliases. + RegisterRef RR = PDA.Addr->getRegRef(); +#ifndef NDEBUG + // Assert if the register is defined in two or more unrelated defs. + // This could happen if there are two or more def operands defining it. + if (!Defined.insert(RR).second) { + auto *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); + dbgs() << "Multiple definitions of register: " + << Print<RegisterRef>(RR, *this) << " in\n " << *MI + << "in BB#" << MI->getParent()->getNumber() << '\n'; + llvm_unreachable(nullptr); + } +#endif + DefM[RR].push(DA); + for (auto A : RAI.getAliasSet(RR)) { + assert(A != RR); + DefM[A].push(DA); + } + // Mark all the related defs as visited. + for (auto T : Rel) + Visited.insert(T.Id); + } +} + +// Return the list of all reference nodes related to RA, including RA itself. +// See "getNextRelated" for the meaning of a "related reference". +NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeList Refs; + NodeId Start = RA.Id; + do { + Refs.push_back(RA); + RA = getNextRelated(IA, RA); + } while (RA.Id != 0 && RA.Id != Start); + return Refs; +} + + +// Clear all information in the graph. +void DataFlowGraph::reset() { + Memory.clear(); + Func = NodeAddr<FuncNode*>(); +} + + +// Return the next reference node in the instruction node IA that is related +// to RA. Conceptually, two reference nodes are related if they refer to the +// same instance of a register access, but differ in flags or other minor +// characteristics. Specific examples of related nodes are shadow reference +// nodes. 
+// Return the equivalent of nullptr if there are no more related references. +NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + + auto Related = [RA](NodeAddr<RefNode*> TA) -> bool { + if (TA.Addr->getKind() != RA.Addr->getKind()) + return false; + if (TA.Addr->getRegRef() != RA.Addr->getRegRef()) + return false; + return true; + }; + auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { + return Related(TA) && + &RA.Addr->getOp() == &TA.Addr->getOp(); + }; + auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { + if (!Related(TA)) + return false; + if (TA.Addr->getKind() != NodeAttrs::Use) + return true; + // For phi uses, compare predecessor blocks. + const NodeAddr<const PhiUseNode*> TUA = TA; + const NodeAddr<const PhiUseNode*> RUA = RA; + return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor(); + }; + + RegisterRef RR = RA.Addr->getRegRef(); + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return RA.Addr->getNextRef(RR, RelatedStmt, true, *this); + return RA.Addr->getNextRef(RR, RelatedPhi, true, *this); +} + +// Find the next node related to RA in IA that satisfies condition P. +// If such a node was found, return a pair where the second element is the +// located node. If such a node does not exist, return a pair where the +// first element is the element after which such a node should be inserted, +// and the second element is a null-address. +template <typename Predicate> +std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> +DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const { + assert(IA.Id != 0 && RA.Id != 0); + + NodeAddr<RefNode*> NA; + NodeId Start = RA.Id; + while (true) { + NA = getNextRelated(IA, RA); + if (NA.Id == 0 || NA.Id == Start) + break; + if (P(NA)) + break; + RA = NA; + } + + if (NA.Id != 0 && NA.Id != Start) + return std::make_pair(RA, NA); + return std::make_pair(RA, NodeAddr<RefNode*>()); +} + +// Get the next shadow node in IA corresponding to RA, and optionally create +// such a node if it does not exist. +NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create) { + assert(IA.Id != 0 && RA.Id != 0); + + uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; + auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getFlags() == Flags; + }; + auto Loc = locateNextRef(IA, RA, IsShadow); + if (Loc.second.Id != 0 || !Create) + return Loc.second; + + // Create a copy of RA and mark is as shadow. + NodeAddr<RefNode*> NA = cloneNode(RA); + NA.Addr->setFlags(Flags | NodeAttrs::Shadow); + IA.Addr->addMemberAfter(Loc.first, NA, *this); + return NA; +} + +// Get the next shadow node in IA corresponding to RA. Return null-address +// if such a node does not exist. +NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const { + assert(IA.Id != 0 && RA.Id != 0); + uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; + auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getFlags() == Flags; + }; + return locateNextRef(IA, RA, IsShadow).second; +} + +// Create a new statement node in the block node BA that corresponds to +// the machine instruction MI. +void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { + auto SA = newStmt(BA, &In); + + // Collect a set of registers that this instruction implicitly uses + // or defines. 
Implicit operands from an instruction will be ignored + // unless they are listed here. + RegisterSet ImpUses, ImpDefs; + if (const uint16_t *ImpD = In.getDesc().getImplicitDefs()) + while (uint16_t R = *ImpD++) + ImpDefs.insert({R, 0}); + if (const uint16_t *ImpU = In.getDesc().getImplicitUses()) + while (uint16_t R = *ImpU++) + ImpUses.insert({R, 0}); + + bool IsCall = In.isCall(), IsReturn = In.isReturn(); + bool IsPredicated = TII.isPredicated(&In); + unsigned NumOps = In.getNumOperands(); + + // Avoid duplicate implicit defs. This will not detect cases of implicit + // defs that define registers that overlap, but it is not clear how to + // interpret that in the absence of explicit defs. Overlapping explicit + // defs are likely illegal already. + RegisterSet DoneDefs; + // Process explicit defs first. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + // Process implicit defs, skipping those that have already been added + // as explicit. + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + if (!IsCall && !ImpDefs.count(RR)) + continue; + if (DoneDefs.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isPreserving(In, OpN)) + Flags |= NodeAttrs::Preserving; + if (TOI.isClobbering(In, OpN)) + Flags |= NodeAttrs::Clobbering; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + SA.Addr->addMember(DA, *this); + DoneDefs.insert(RR); + } + + for (unsigned OpN = 0; OpN < NumOps; ++OpN) { + MachineOperand &Op = In.getOperand(OpN); + if (!Op.isReg() || !Op.isUse()) + continue; + RegisterRef RR = { Op.getReg(), Op.getSubReg() }; + // Add implicit uses on return and call instructions, and on predicated + // instructions regardless of whether or not they appear in the instruction + // descriptor's list. + bool Implicit = Op.isImplicit(); + bool TakeImplicit = IsReturn || IsCall || IsPredicated; + if (Implicit && !TakeImplicit && !ImpUses.count(RR)) + continue; + uint16_t Flags = NodeAttrs::None; + if (TOI.isFixedReg(In, OpN)) + Flags |= NodeAttrs::Fixed; + NodeAddr<UseNode*> UA = newUse(SA, Op, Flags); + SA.Addr->addMember(UA, *this); + } +} + +// Build a map that for each block will have the set of all references from +// that block, and from all blocks dominated by it. 
+void DataFlowGraph::buildBlockRefs(NodeAddr<BlockNode*> BA, + BlockRefsMap &RefM) { + auto &Refs = RefM[BA.Id]; + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + assert(N); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + buildBlockRefs(SBA, RefM); + const auto &SRs = RefM[SBA.Id]; + Refs.insert(SRs.begin(), SRs.end()); + } + + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) + for (NodeAddr<RefNode*> RA : IA.Addr->members(*this)) + Refs.insert(RA.Addr->getRegRef()); +} + +// Scan all defs in the block node BA and record in PhiM the locations of +// phi nodes corresponding to these defs. +void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA) { + // Check all defs from block BA and record them in each block in BA's + // iterated dominance frontier. This information will later be used to + // create phi nodes. + MachineBasicBlock *BB = BA.Addr->getCode(); + assert(BB); + auto DFLoc = MDF.find(BB); + if (DFLoc == MDF.end() || DFLoc->second.empty()) + return; + + // Traverse all instructions in the block and collect the set of all + // defined references. For each reference there will be a phi created + // in the block's iterated dominance frontier. + // This is done to make sure that each defined reference gets only one + // phi node, even if it is defined multiple times. + RegisterSet Defs; + for (auto I : BA.Addr->members(*this)) { + assert(I.Addr->getType() == NodeAttrs::Code); + assert(I.Addr->getKind() == NodeAttrs::Phi || + I.Addr->getKind() == NodeAttrs::Stmt); + NodeAddr<InstrNode*> IA = I; + for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this)) + Defs.insert(RA.Addr->getRegRef()); + } + + // Finally, add the set of defs to each block in the iterated dominance + // frontier. + const MachineDominanceFrontier::DomSetType &DF = DFLoc->second; + SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end()); + for (unsigned i = 0; i < IDF.size(); ++i) { + auto F = MDF.find(IDF[i]); + if (F != MDF.end()) + IDF.insert(F->second.begin(), F->second.end()); + } + + // Get the register references that are reachable from this block. + RegisterSet &Refs = RefM[BA.Id]; + for (auto DB : IDF) { + auto DBA = Func.Addr->findBlock(DB, *this); + const auto &Rs = RefM[DBA.Id]; + Refs.insert(Rs.begin(), Rs.end()); + } + + for (auto DB : IDF) { + auto DBA = Func.Addr->findBlock(DB, *this); + PhiM[DBA.Id].insert(Defs.begin(), Defs.end()); + } +} + +// Given the locations of phi nodes in the map PhiM, create the phi nodes +// that are located in the block node BA. +void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA) { + // Check if this blocks has any DF defs, i.e. if there are any defs + // that this block is in the iterated dominance frontier of. + auto HasDF = PhiM.find(BA.Id); + if (HasDF == PhiM.end() || HasDF->second.empty()) + return; + + // First, remove all R in Refs in such that there exists T in Refs + // such that T covers R. In other words, only leave those refs that + // are not covered by another ref (i.e. maximal with respect to covering). 
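+  // (For example, a super-register covers each of its sub-registers, so if
+  // both appear in Refs, only the super-register is kept.)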
+ + auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef { + for (auto I : RRs) + if (I != RR && RAI.covers(I, RR)) + RR = I; + return RR; + }; + + RegisterSet MaxDF; + for (auto I : HasDF->second) + MaxDF.insert(MaxCoverIn(I, HasDF->second)); + + std::vector<RegisterRef> MaxRefs; + auto &RefB = RefM[BA.Id]; + for (auto I : MaxDF) + MaxRefs.push_back(MaxCoverIn(I, RefB)); + + // Now, for each R in MaxRefs, get the alias closure of R. If the closure + // only has R in it, create a phi a def for R. Otherwise, create a phi, + // and add a def for each S in the closure. + + // Sort the refs so that the phis will be created in a deterministic order. + std::sort(MaxRefs.begin(), MaxRefs.end()); + // Remove duplicates. + auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end()); + MaxRefs.erase(NewEnd, MaxRefs.end()); + + auto Aliased = [this,&MaxRefs](RegisterRef RR, + std::vector<unsigned> &Closure) -> bool { + for (auto I : Closure) + if (RAI.alias(RR, MaxRefs[I])) + return true; + return false; + }; + + // Prepare a list of NodeIds of the block's predecessors. + std::vector<NodeId> PredList; + const MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto PB : MBB->predecessors()) { + auto B = Func.Addr->findBlock(PB, *this); + PredList.push_back(B.Id); + } + + while (!MaxRefs.empty()) { + // Put the first element in the closure, and then add all subsequent + // elements from MaxRefs to it, if they alias at least one element + // already in the closure. + // ClosureIdx: vector of indices in MaxRefs of members of the closure. + std::vector<unsigned> ClosureIdx = { 0 }; + for (unsigned i = 1; i != MaxRefs.size(); ++i) + if (Aliased(MaxRefs[i], ClosureIdx)) + ClosureIdx.push_back(i); + + // Build a phi for the closure. + unsigned CS = ClosureIdx.size(); + NodeAddr<PhiNode*> PA = newPhi(BA); + + // Add defs. + for (unsigned X = 0; X != CS; ++X) { + RegisterRef RR = MaxRefs[ClosureIdx[X]]; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + PA.Addr->addMember(DA, *this); + } + // Add phi uses. + for (auto P : PredList) { + auto PBA = addr<BlockNode*>(P); + for (unsigned X = 0; X != CS; ++X) { + RegisterRef RR = MaxRefs[ClosureIdx[X]]; + NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA); + PA.Addr->addMember(PUA, *this); + } + } + + // Erase from MaxRefs all elements in the closure. + auto Begin = MaxRefs.begin(); + for (unsigned i = ClosureIdx.size(); i != 0; --i) + MaxRefs.erase(Begin + ClosureIdx[i-1]); + } +} + +// Remove any unneeded phi nodes that were created during the build process. +void DataFlowGraph::removeUnusedPhis() { + // This will remove unused phis, i.e. phis where each def does not reach + // any uses or other defs. This will not detect or remove circular phi + // chains that are otherwise dead. Unused/dead phis are created during + // the build process and this function is intended to remove these cases + // that are easily determinable to be unnecessary. + + SetVector<NodeId> PhiQ; + for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) { + for (auto P : BA.Addr->members_if(IsPhi, *this)) + PhiQ.insert(P.Id); + } + + static auto HasUsedDef = [](NodeList &Ms) -> bool { + for (auto M : Ms) { + if (M.Addr->getKind() != NodeAttrs::Def) + continue; + NodeAddr<DefNode*> DA = M; + if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0) + return true; + } + return false; + }; + + // Any phi, if it is removed, may affect other phis (make them dead). 
+ // For each removed phi, collect the potentially affected phis and add + // them back to the queue. + while (!PhiQ.empty()) { + auto PA = addr<PhiNode*>(PhiQ[0]); + PhiQ.remove(PA.Id); + NodeList Refs = PA.Addr->members(*this); + if (HasUsedDef(Refs)) + continue; + for (NodeAddr<RefNode*> RA : Refs) { + if (NodeId RD = RA.Addr->getReachingDef()) { + auto RDA = addr<DefNode*>(RD); + NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this); + if (IsPhi(OA)) + PhiQ.insert(OA.Id); + } + if (RA.Addr->isDef()) + unlinkDef(RA); + else + unlinkUse(RA); + } + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this); + BA.Addr->removeMember(PA, *this); + } +} + +// For a given reference node TA in an instruction node IA, connect the +// reaching def of TA to the appropriate def node. Create any shadow nodes +// as appropriate. +template <typename T> +void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, + DefStack &DS) { + if (DS.empty()) + return; + RegisterRef RR = TA.Addr->getRegRef(); + NodeAddr<T> TAP; + + // References from the def stack that have been examined so far. + RegisterSet Defs; + + for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) { + RegisterRef QR = I->Addr->getRegRef(); + auto AliasQR = [QR,this] (RegisterRef RR) -> bool { + return RAI.alias(QR, RR); + }; + bool PrecUp = RAI.covers(QR, RR); + // Skip all defs that are aliased to any of the defs that we have already + // seen. If we encounter a covering def, stop the stack traversal early. + if (std::any_of(Defs.begin(), Defs.end(), AliasQR)) { + if (PrecUp) + break; + continue; + } + // The reaching def. + NodeAddr<DefNode*> RDA = *I; + + // Pick the reached node. + if (TAP.Id == 0) { + TAP = TA; + } else { + // Mark the existing ref as "shadow" and create a new shadow. + TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow); + TAP = getNextShadow(IA, TAP, true); + } + + // Create the link. + TAP.Addr->linkToDef(TAP.Id, RDA); + + if (PrecUp) + break; + Defs.insert(QR); + } +} + +// Create data-flow links for all reference nodes in the statement node SA. +void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) { + RegisterSet Defs; + + // Link all nodes (upwards in the data-flow) with their reaching defs. + for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) { + uint16_t Kind = RA.Addr->getKind(); + assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use); + RegisterRef RR = RA.Addr->getRegRef(); + // Do not process multiple defs of the same reference. + if (Kind == NodeAttrs::Def && Defs.count(RR)) + continue; + Defs.insert(RR); + + auto F = DefM.find(RR); + if (F == DefM.end()) + continue; + DefStack &DS = F->second; + if (Kind == NodeAttrs::Use) + linkRefUp<UseNode*>(SA, RA, DS); + else if (Kind == NodeAttrs::Def) + linkRefUp<DefNode*>(SA, RA, DS); + else + llvm_unreachable("Unexpected node in instruction"); + } +} + +// Create data-flow links for all instructions in the block node BA. This +// will include updating any phi nodes in BA. +void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { + // Push block delimiters. + markBlock(BA.Id, DefM); + + // For each non-phi instruction in the block, link all the defs and uses + // to their reaching defs. For any member of the block (including phis), + // push the defs on the corresponding stacks. + for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) { + // Ignore phi nodes here. They will be linked part by part from the + // predecessors. 
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) + linkStmtRefs(DefM, IA); + + // Push the definitions on the stack. + pushDefs(IA, DefM); + } + + // Recursively process all children in the dominator tree. + MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); + for (auto I : *N) { + MachineBasicBlock *SB = I->getBlock(); + auto SBA = Func.Addr->findBlock(SB, *this); + linkBlockRefs(DefM, SBA); + } + + // Link the phi uses from the successor blocks. + auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool { + if (NA.Addr->getKind() != NodeAttrs::Use) + return false; + assert(NA.Addr->getFlags() & NodeAttrs::PhiRef); + NodeAddr<PhiUseNode*> PUA = NA; + return PUA.Addr->getPredecessor() == BA.Id; + }; + MachineBasicBlock *MBB = BA.Addr->getCode(); + for (auto SB : MBB->successors()) { + auto SBA = Func.Addr->findBlock(SB, *this); + for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) { + // Go over each phi use associated with MBB, and link it. + for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { + NodeAddr<PhiUseNode*> PUA = U; + RegisterRef RR = PUA.Addr->getRegRef(); + linkRefUp<UseNode*>(IA, PUA, DefM[RR]); + } + } + } + + // Pop all defs from this block from the definition stacks. + releaseBlock(BA.Id, DefM); +} + +// Remove the use node UA from any data-flow and structural links. +void DataFlowGraph::unlinkUse(NodeAddr<UseNode*> UA) { + NodeId RD = UA.Addr->getReachingDef(); + NodeId Sib = UA.Addr->getSibling(); + + NodeAddr<InstrNode*> IA = UA.Addr->getOwner(*this); + IA.Addr->removeMember(UA, *this); + + if (RD == 0) { + assert(Sib == 0); + return; + } + + auto RDA = addr<DefNode*>(RD); + auto TA = addr<UseNode*>(RDA.Addr->getReachedUse()); + if (TA.Id == UA.Id) { + RDA.Addr->setReachedUse(Sib); + return; + } + + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == UA.Id) { + TA.Addr->setSibling(UA.Addr->getSibling()); + return; + } + TA = addr<UseNode*>(S); + } +} + +// Remove the def node DA from any data-flow and structural links. +void DataFlowGraph::unlinkDef(NodeAddr<DefNode*> DA) { + // + // RD + // | reached + // | def + // : + // . + // +----+ + // ... -- | DA | -- ... -- 0 : sibling chain of DA + // +----+ + // | | reached + // | : def + // | . + // | ... : Siblings (defs) + // | + // : reached + // . use + // ... : sibling chain of reached uses + + NodeId RD = DA.Addr->getReachingDef(); + + // Visit all siblings of the reached def and reset their reaching defs. + // Also, defs reached by DA are now "promoted" to being reached by RD, + // so all of them will need to be spliced into the sibling chain where + // DA belongs. + auto getAllNodes = [this] (NodeId N) -> NodeList { + NodeList Res; + while (N) { + auto RA = addr<RefNode*>(N); + // Keep the nodes in the exact sibling order. + Res.push_back(RA); + N = RA.Addr->getSibling(); + } + return Res; + }; + NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef()); + NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse()); + + if (RD == 0) { + for (NodeAddr<RefNode*> I : ReachedDefs) + I.Addr->setSibling(0); + for (NodeAddr<RefNode*> I : ReachedUses) + I.Addr->setSibling(0); + } + for (NodeAddr<DefNode*> I : ReachedDefs) + I.Addr->setReachingDef(RD); + for (NodeAddr<UseNode*> I : ReachedUses) + I.Addr->setReachingDef(RD); + + NodeId Sib = DA.Addr->getSibling(); + if (RD == 0) { + assert(Sib == 0); + return; + } + + // Update the reaching def node and remove DA from the sibling list. 
+ auto RDA = addr<DefNode*>(RD); + auto TA = addr<DefNode*>(RDA.Addr->getReachedDef()); + if (TA.Id == DA.Id) { + // If DA is the first reached def, just update the RD's reached def + // to the DA's sibling. + RDA.Addr->setReachedDef(Sib); + } else { + // Otherwise, traverse the sibling list of the reached defs and remove + // DA from it. + while (TA.Id != 0) { + NodeId S = TA.Addr->getSibling(); + if (S == DA.Id) { + TA.Addr->setSibling(Sib); + break; + } + TA = addr<DefNode*>(S); + } + } + + // Splice the DA's reached defs into the RDA's reached def chain. + if (!ReachedDefs.empty()) { + auto Last = NodeAddr<DefNode*>(ReachedDefs.back()); + Last.Addr->setSibling(RDA.Addr->getReachedDef()); + RDA.Addr->setReachedDef(ReachedDefs.front().Id); + } + // Splice the DA's reached uses into the RDA's reached use chain. + if (!ReachedUses.empty()) { + auto Last = NodeAddr<UseNode*>(ReachedUses.back()); + Last.Addr->setSibling(RDA.Addr->getReachedUse()); + RDA.Addr->setReachedUse(ReachedUses.front().Id); + } + + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(*this); + IA.Addr->removeMember(DA, *this); +} diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h new file mode 100644 index 000000000000..7da7bb5973cf --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h @@ -0,0 +1,841 @@ +//===--- RDFGraph.h -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Target-independent, SSA-based data flow graph for register data flow (RDF) +// for a non-SSA program representation (e.g. post-RA machine code). +// +// +// *** Introduction +// +// The RDF graph is a collection of nodes, each of which denotes some element +// of the program. There are two main types of such elements: code and refe- +// rences. Conceptually, "code" is something that represents the structure +// of the program, e.g. basic block or a statement, while "reference" is an +// instance of accessing a register, e.g. a definition or a use. Nodes are +// connected with each other based on the structure of the program (such as +// blocks, instructions, etc.), and based on the data flow (e.g. reaching +// definitions, reached uses, etc.). The single-reaching-definition principle +// of SSA is generally observed, although, due to the non-SSA representation +// of the program, there are some differences between the graph and a "pure" +// SSA representation. +// +// +// *** Implementation remarks +// +// Since the graph can contain a large number of nodes, memory consumption +// was one of the major design considerations. As a result, there is a single +// base class NodeBase which defines all members used by all possible derived +// classes. The members are arranged in a union, and a derived class cannot +// add any data members of its own. Each derived class only defines the +// functional interface, i.e. member functions. NodeBase must be a POD, +// which implies that all of its members must also be PODs. +// Since nodes need to be connected with other nodes, pointers have been +// replaced with 32-bit identifiers: each node has an id of type NodeId. +// There are mapping functions in the graph that translate between actual +// memory addresses and the corresponding identifiers. +// A node id of 0 is equivalent to nullptr. 
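+// As an illustration (a sketch in terms of the DataFlowGraph interface
+// declared below), translating between the two representations for a graph
+// G and a node address NA obtained from it looks roughly like:
+//   NodeId Id = G.id(NA.Addr);                        // address -> id
+//   NodeAddr<NodeBase*> Same = G.addr<NodeBase*>(Id); // id -> address
+// and G.ptr(0) returns nullptr, i.e. id 0 plays the role of a null pointer.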
+// +// +// *** Structure of the graph +// +// A code node is always a collection of other nodes. For example, a code +// node corresponding to a basic block will contain code nodes corresponding +// to instructions. In turn, a code node corresponding to an instruction will +// contain a list of reference nodes that correspond to the definitions and +// uses of registers in that instruction. The members are arranged into a +// circular list, which is yet another consequence of the effort to save +// memory: for each member node it should be possible to obtain its owner, +// and it should be possible to access all other members. There are other +// ways to accomplish that, but the circular list seemed the most natural. +// +// +- CodeNode -+ +// | | <---------------------------------------------------+ +// +-+--------+-+ | +// |FirstM |LastM | +// | +-------------------------------------+ | +// | | | +// V V | +// +----------+ Next +----------+ Next Next +----------+ Next | +// | |----->| |-----> ... ----->| |----->-+ +// +- Member -+ +- Member -+ +- Member -+ +// +// The order of members is such that related reference nodes (see below) +// should be contiguous on the member list. +// +// A reference node is a node that encapsulates an access to a register, +// in other words, data flowing into or out of a register. There are two +// major kinds of reference nodes: defs and uses. A def node will contain +// the id of the first reached use, and the id of the first reached def. +// Each def and use will contain the id of the reaching def, and also the +// id of the next reached def (for def nodes) or use (for use nodes). +// The "next node sharing the same reaching def" is denoted as "sibling". +// In summary: +// - Def node contains: reaching def, sibling, first reached def, and first +// reached use. +// - Use node contains: reaching def and sibling. +// +// +-- DefNode --+ +// | R2 = ... | <---+--------------------+ +// ++---------+--+ | | +// |Reached |Reached | | +// |Def |Use | | +// | | |Reaching |Reaching +// | V |Def |Def +// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib +// | | ... = R2 |----->| ... = R2 |----> ... ----> 0 +// | +-------------+ +-------------+ +// V +// +-- DefNode --+ Sib +// | R2 = ... |----> ... +// ++---------+--+ +// | | +// | | +// ... ... +// +// To get a full picture, the circular lists connecting blocks within a +// function, instructions within a block, etc. should be superimposed with +// the def-def, def-use links shown above. +// To illustrate this, consider a small example in a pseudo-assembly: +// foo: +// add r2, r0, r1 ; r2 = r0+r1 +// addi r0, r2, 1 ; r0 = r2+1 +// ret r0 ; return value in r0 +// +// The graph (in a format used by the debugging functions) would look like: +// +// DFG dump:[ +// f1: Function foo +// b2: === BB#0 === preds(0), succs(0): +// p3: phi [d4<r0>(,d12,u9):] +// p5: phi [d6<r1>(,,u10):] +// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):] +// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):] +// s14: ret [u15<r0>(d12):] +// ] +// +// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the +// kind of the node (i.e. f - function, b - basic block, p - phi, s - state- +// ment, d - def, u - use). +// The format of a def node is: +// dN<R>(rd,d,u):sib, +// where +// N - numeric node id, +// R - register being defined +// rd - reaching def, +// d - reached def, +// u - reached use, +// sib - sibling. 
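+// For example, in the dump above, d12<r0>(d4,,u15): is a def of r0 whose
+// reaching def is d4, which reaches no defs, and whose first reached use
+// is u15.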
+// The format of a use node is:
+//   uN<R>[!](rd):sib,
+// where
+//   N   - numeric node id,
+//   R   - register being used,
+//   rd  - reaching def,
+//   sib - sibling.
+// Possible annotations (usually preceding the node id):
+//   +   - preserving def,
+//   ~   - clobbering def,
+//   "   - shadow ref (follows the node id),
+//   !   - fixed register (appears after register name).
+//
+// The circular lists are not explicit in the dump.
+//
+//
+// *** Node attributes
+//
+// NodeBase has a member "Attrs", which is the primary way of determining
+// the node's characteristics. The fields in this member decide whether
+// the node is a code node or a reference node (i.e. node's "type"), then
+// within each type, the "kind" determines what specifically this node
+// represents. The remaining bits, "flags", contain additional information
+// that is even more detailed than the "kind".
+// CodeNode's kinds are:
+// - Phi:   Phi node, members are reference nodes.
+// - Stmt:  Statement, members are reference nodes.
+// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
+// - Func:  The whole function. The members are basic block nodes.
+// RefNode's kinds are:
+// - Use.
+// - Def.
+//
+// Meaning of flags:
+// - Preserving: applies only to defs. A preserving def is one that can
+//   preserve some of the original bits among those that are included in
+//   the register associated with that def. For example, if R0 is a 32-bit
+//   register, but a def can only change the lower 16 bits, then it will
+//   be marked as preserving.
+// - Shadow: a reference that has duplicates holding additional reaching
+//   defs (see more below).
+// - Clobbering: applies only to defs, and indicates that the value generated
+//   by this def is unspecified. A typical example would be volatile registers
+//   after function calls.
+//
+//
+// *** Shadow references
+//
+// It may happen that a super-register can have two (or more) non-overlapping
+// sub-registers. When both of these sub-registers are defined and followed
+// by a use of the super-register, the use of the super-register will not
+// have a unique reaching def: both defs of the sub-registers need to be
+// accounted for. In such cases, a duplicate use of the super-register is
+// added and it points to the extra reaching def. Both uses are marked with
+// a flag "shadow". Example:
+// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
+//   set r0, 1        ; r0 = 1
+//   set r1, 1        ; r1 = 1
+//   addi t1, t0, 1   ; t1 = t0+1
+//
+// The DFG:
+//   s1: set [d2<r0>(,,u9):]
+//   s3: set [d4<r1>(,,u10):]
+//   s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
+//
+// The statement s5 has two use nodes for t0: u7" and u8". The quotation
+// mark " indicates that the node is a shadow.
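+//
+// As a rough usage sketch (in terms of the interface declared further below),
+// the uses reached by a def DA in a graph G can be visited by following the
+// reached-use and sibling links:
+//   NodeId U = DA.Addr->getReachedUse();       // first reached use
+//   while (U != 0) {                           // id 0 terminates the chain
+//     NodeAddr<UseNode*> UA = G.addr<UseNode*>(U);
+//     // ... inspect UA, e.g. UA.Addr->getRegRef() ...
+//     U = UA.Addr->getSibling();               // next use with this reaching def
+//   }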
+// +#ifndef RDF_GRAPH_H +#define RDF_GRAPH_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" + +#include <functional> +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineInstr; + class MachineOperand; + class MachineDominanceFrontier; + class MachineDominatorTree; + class TargetInstrInfo; + class TargetRegisterInfo; +} + +namespace rdf { + typedef uint32_t NodeId; + + struct NodeAttrs { + enum : uint16_t { + None = 0x0000, // Nothing + + // Types: 2 bits + TypeMask = 0x0003, + Code = 0x0001, // 01, Container + Ref = 0x0002, // 10, Reference + + // Kind: 3 bits + KindMask = 0x0007 << 2, + Def = 0x0001 << 2, // 001 + Use = 0x0002 << 2, // 010 + Phi = 0x0003 << 2, // 011 + Stmt = 0x0004 << 2, // 100 + Block = 0x0005 << 2, // 101 + Func = 0x0006 << 2, // 110 + + // Flags: 5 bits for now + FlagMask = 0x001F << 5, + Shadow = 0x0001 << 5, // 00001, Has extra reaching defs. + Clobbering = 0x0002 << 5, // 00010, Produces unspecified values. + PhiRef = 0x0004 << 5, // 00100, Member of PhiNode. + Preserving = 0x0008 << 5, // 01000, Def can keep original bits. + Fixed = 0x0010 << 5, // 10000, Fixed register. + }; + + static uint16_t type(uint16_t T) { return T & TypeMask; } + static uint16_t kind(uint16_t T) { return T & KindMask; } + static uint16_t flags(uint16_t T) { return T & FlagMask; } + + static uint16_t set_type(uint16_t A, uint16_t T) { + return (A & ~TypeMask) | T; + } + static uint16_t set_kind(uint16_t A, uint16_t K) { + return (A & ~KindMask) | K; + } + static uint16_t set_flags(uint16_t A, uint16_t F) { + return (A & ~FlagMask) | F; + } + + // Test if A contains B. + static bool contains(uint16_t A, uint16_t B) { + if (type(A) != Code) + return false; + uint16_t KB = kind(B); + switch (kind(A)) { + case Func: + return KB == Block; + case Block: + return KB == Phi || KB == Stmt; + case Phi: + case Stmt: + return type(B) == Ref; + } + return false; + } + }; + + template <typename T> struct NodeAddr { + NodeAddr() : Addr(nullptr), Id(0) {} + NodeAddr(T A, NodeId I) : Addr(A), Id(I) {} + NodeAddr(const NodeAddr&) = default; + NodeAddr &operator= (const NodeAddr&) = default; + + bool operator== (const NodeAddr<T> &NA) const { + assert((Addr == NA.Addr) == (Id == NA.Id)); + return Addr == NA.Addr; + } + bool operator!= (const NodeAddr<T> &NA) const { + return !operator==(NA); + } + // Type cast (casting constructor). The reason for having this class + // instead of std::pair. + template <typename S> NodeAddr(const NodeAddr<S> &NA) + : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {} + + T Addr; + NodeId Id; + }; + + struct NodeBase; + + // Fast memory allocation and translation between node id and node address. + // This is really the same idea as the one underlying the "bump pointer + // allocator", the difference being in the translation. A node id is + // composed of two components: the index of the block in which it was + // allocated, and the index within the block. With the default settings, + // where the number of nodes per block is 4096, the node id (minus 1) is: + // + // bit position: 11 0 + // +----------------------------+--------------+ + // | Index of the block |Index in block| + // +----------------------------+--------------+ + // + // The actual node id is the above plus 1, to avoid creating a node id of 0. 
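+  // For example, with the default 4096 nodes per block (i.e. 12 index bits),
+  // the node at index 2 of block 2 gets the id ((2 << 12) | 2) + 1 = 8195,
+  // and ptr(8195) recovers the block number and in-block index with the
+  // inverse shift and mask.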
+ // + // This method significantly improved the build time, compared to using maps + // (std::unordered_map or DenseMap) to translate between pointers and ids. + struct NodeAllocator { + // Amount of storage for a single node. + enum { NodeMemSize = 32 }; + NodeAllocator(uint32_t NPB = 4096) + : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)), + IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) { + assert(isPowerOf2_32(NPB)); + } + NodeBase *ptr(NodeId N) const { + uint32_t N1 = N-1; + uint32_t BlockN = N1 >> BitsPerIndex; + uint32_t Offset = (N1 & IndexMask) * NodeMemSize; + return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset); + } + NodeId id(const NodeBase *P) const; + NodeAddr<NodeBase*> New(); + void clear(); + + private: + void startNewBlock(); + bool needNewBlock(); + uint32_t makeId(uint32_t Block, uint32_t Index) const { + // Add 1 to the id, to avoid the id of 0, which is treated as "null". + return ((Block << BitsPerIndex) | Index) + 1; + } + + const uint32_t NodesPerBlock; + const uint32_t BitsPerIndex; + const uint32_t IndexMask; + char *ActiveEnd; + std::vector<char*> Blocks; + typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy; + AllocatorTy MemPool; + }; + + struct RegisterRef { + unsigned Reg, Sub; + + // No non-trivial constructors, since this will be a member of a union. + RegisterRef() = default; + RegisterRef(const RegisterRef &RR) = default; + RegisterRef &operator= (const RegisterRef &RR) = default; + bool operator== (const RegisterRef &RR) const { + return Reg == RR.Reg && Sub == RR.Sub; + } + bool operator!= (const RegisterRef &RR) const { + return !operator==(RR); + } + bool operator< (const RegisterRef &RR) const { + return Reg < RR.Reg || (Reg == RR.Reg && Sub < RR.Sub); + } + }; + typedef std::set<RegisterRef> RegisterSet; + + struct RegisterAliasInfo { + RegisterAliasInfo(const TargetRegisterInfo &tri) : TRI(tri) {} + virtual ~RegisterAliasInfo() {} + + virtual std::vector<RegisterRef> getAliasSet(RegisterRef RR) const; + virtual bool alias(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(RegisterRef RA, RegisterRef RB) const; + virtual bool covers(const RegisterSet &RRs, RegisterRef RR) const; + + const TargetRegisterInfo &TRI; + }; + + struct TargetOperandInfo { + TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} + virtual ~TargetOperandInfo() {} + virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const; + virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const; + virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const; + + const TargetInstrInfo &TII; + }; + + + struct DataFlowGraph; + + struct NodeBase { + public: + // Make sure this is a POD. + NodeBase() = default; + uint16_t getType() const { return NodeAttrs::type(Attrs); } + uint16_t getKind() const { return NodeAttrs::kind(Attrs); } + uint16_t getFlags() const { return NodeAttrs::flags(Attrs); } + NodeId getNext() const { return Next; } + + uint16_t getAttrs() const { return Attrs; } + void setAttrs(uint16_t A) { Attrs = A; } + void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); } + + // Insert node NA after "this" in the circular chain. + void append(NodeAddr<NodeBase*> NA); + // Initialize all members to 0. + void init() { memset(this, 0, sizeof *this); } + void setNext(NodeId N) { Next = N; } + + protected: + uint16_t Attrs; + uint16_t Reserved; + NodeId Next; // Id of the next node in the circular chain. + // Definitions of nested types. 
Using anonymous nested structs would make + // this class definition clearer, but unnamed structs are not a part of + // the standard. + struct Def_struct { + NodeId DD, DU; // Ids of the first reached def and use. + }; + struct PhiU_struct { + NodeId PredB; // Id of the predecessor block for a phi use. + }; + struct Code_struct { + void *CP; // Pointer to the actual code. + NodeId FirstM, LastM; // Id of the first member and last. + }; + struct Ref_struct { + NodeId RD, Sib; // Ids of the reaching def and the sibling. + union { + Def_struct Def; + PhiU_struct PhiU; + }; + union { + MachineOperand *Op; // Non-phi refs point to a machine operand. + RegisterRef RR; // Phi refs store register info directly. + }; + }; + + // The actual payload. + union { + Ref_struct Ref; + Code_struct Code; + }; + }; + // The allocator allocates chunks of 32 bytes for each node. The fact that + // each node takes 32 bytes in memory is used for fast translation between + // the node id and the node address. + static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize, + "NodeBase must be at most NodeAllocator::NodeMemSize bytes"); + + typedef std::vector<NodeAddr<NodeBase*>> NodeList; + typedef std::set<NodeId> NodeSet; + + struct RefNode : public NodeBase { + RefNode() = default; + RegisterRef getRegRef() const; + MachineOperand &getOp() { + assert(!(getFlags() & NodeAttrs::PhiRef)); + return *Ref.Op; + } + void setRegRef(RegisterRef RR); + void setRegRef(MachineOperand *Op); + NodeId getReachingDef() const { + return Ref.RD; + } + void setReachingDef(NodeId RD) { + Ref.RD = RD; + } + NodeId getSibling() const { + return Ref.Sib; + } + void setSibling(NodeId Sib) { + Ref.Sib = Sib; + } + bool isUse() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Use; + } + bool isDef() const { + assert(getType() == NodeAttrs::Ref); + return getKind() == NodeAttrs::Def; + } + + template <typename Predicate> + NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly, + const DataFlowGraph &G); + NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G); + }; + + struct DefNode : public RefNode { + NodeId getReachedDef() const { + return Ref.Def.DD; + } + void setReachedDef(NodeId D) { + Ref.Def.DD = D; + } + NodeId getReachedUse() const { + return Ref.Def.DU; + } + void setReachedUse(NodeId U) { + Ref.Def.DU = U; + } + + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct UseNode : public RefNode { + void linkToDef(NodeId Self, NodeAddr<DefNode*> DA); + }; + + struct PhiUseNode : public UseNode { + NodeId getPredecessor() const { + assert(getFlags() & NodeAttrs::PhiRef); + return Ref.PhiU.PredB; + } + void setPredecessor(NodeId B) { + assert(getFlags() & NodeAttrs::PhiRef); + Ref.PhiU.PredB = B; + } + }; + + struct CodeNode : public NodeBase { + template <typename T> T getCode() const { + return static_cast<T>(Code.CP); + } + void setCode(void *C) { + Code.CP = C; + } + + NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const; + NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const; + void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, + const DataFlowGraph &G); + void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G); + + NodeList members(const DataFlowGraph &G) const; + template <typename Predicate> + NodeList members_if(Predicate P, const DataFlowGraph &G) const; + }; + + struct InstrNode : public CodeNode { + NodeAddr<NodeBase*> getOwner(const 
DataFlowGraph &G); + }; + + struct PhiNode : public InstrNode { + MachineInstr *getCode() const { + return nullptr; + } + }; + + struct StmtNode : public InstrNode { + MachineInstr *getCode() const { + return CodeNode::getCode<MachineInstr*>(); + } + }; + + struct BlockNode : public CodeNode { + MachineBasicBlock *getCode() const { + return CodeNode::getCode<MachineBasicBlock*>(); + } + void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G); + }; + + struct FuncNode : public CodeNode { + MachineFunction *getCode() const { + return CodeNode::getCode<MachineFunction*>(); + } + NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const; + NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G); + }; + + struct DataFlowGraph { + DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, + const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, const RegisterAliasInfo &rai, + const TargetOperandInfo &toi); + + NodeBase *ptr(NodeId N) const; + template <typename T> T ptr(NodeId N) const { + return static_cast<T>(ptr(N)); + } + NodeId id(const NodeBase *P) const; + + template <typename T> NodeAddr<T> addr(NodeId N) const { + return { ptr<T>(N), N }; + } + + NodeAddr<FuncNode*> getFunc() const { + return Func; + } + MachineFunction &getMF() const { + return MF; + } + const TargetInstrInfo &getTII() const { + return TII; + } + const TargetRegisterInfo &getTRI() const { + return TRI; + } + const MachineDominatorTree &getDT() const { + return MDT; + } + const MachineDominanceFrontier &getDF() const { + return MDF; + } + const RegisterAliasInfo &getRAI() const { + return RAI; + } + + struct DefStack { + DefStack() = default; + bool empty() const { return Stack.empty() || top() == bottom(); } + private: + typedef NodeAddr<DefNode*> value_type; + struct Iterator { + typedef DefStack::value_type value_type; + Iterator &up() { Pos = DS.nextUp(Pos); return *this; } + Iterator &down() { Pos = DS.nextDown(Pos); return *this; } + value_type operator*() const { + assert(Pos >= 1); + return DS.Stack[Pos-1]; + } + const value_type *operator->() const { + assert(Pos >= 1); + return &DS.Stack[Pos-1]; + } + bool operator==(const Iterator &It) const { return Pos == It.Pos; } + bool operator!=(const Iterator &It) const { return Pos != It.Pos; } + private: + Iterator(const DefStack &S, bool Top); + // Pos-1 is the index in the StorageType object that corresponds to + // the top of the DefStack. 
+ const DefStack &DS; + unsigned Pos; + friend struct DefStack; + }; + public: + typedef Iterator iterator; + iterator top() const { return Iterator(*this, true); } + iterator bottom() const { return Iterator(*this, false); } + unsigned size() const; + + void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); } + void pop(); + void start_block(NodeId N); + void clear_block(NodeId N); + private: + friend struct Iterator; + typedef std::vector<value_type> StorageType; + bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const { + return (P.Addr == nullptr) && (N == 0 || P.Id == N); + } + unsigned nextUp(unsigned P) const; + unsigned nextDown(unsigned P) const; + StorageType Stack; + }; + + typedef std::map<RegisterRef,DefStack> DefStackMap; + + void build(); + void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM); + void markBlock(NodeId B, DefStackMap &DefM); + void releaseBlock(NodeId B, DefStackMap &DefM); + + NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA, bool Create); + NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + NodeList getRelatedRefs(NodeAddr<InstrNode*> IA, + NodeAddr<RefNode*> RA) const; + + void unlinkUse(NodeAddr<UseNode*> UA); + void unlinkDef(NodeAddr<DefNode*> DA); + + // Some useful filters. + template <uint16_t Kind> + static bool IsRef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == Kind; + } + template <uint16_t Kind> + static bool IsCode(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == Kind; + } + static bool IsDef(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Def; + } + static bool IsUse(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Ref && + BA.Addr->getKind() == NodeAttrs::Use; + } + static bool IsPhi(const NodeAddr<NodeBase*> BA) { + return BA.Addr->getType() == NodeAttrs::Code && + BA.Addr->getKind() == NodeAttrs::Phi; + } + + private: + void reset(); + + NodeAddr<NodeBase*> newNode(uint16_t Attrs); + NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B); + NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner, + RegisterRef RR, NodeAddr<BlockNode*> PredB, + uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + MachineOperand &Op, uint16_t Flags = NodeAttrs::None); + NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner, + RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef); + NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner); + NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner, + MachineInstr *MI); + NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner, + MachineBasicBlock *BB); + NodeAddr<FuncNode*> newFunc(MachineFunction *MF); + + template <typename Predicate> + std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> + locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + Predicate P) const; + + typedef std::map<NodeId,RegisterSet> BlockRefsMap; + + void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In); + 
void buildBlockRefs(NodeAddr<BlockNode*> BA, BlockRefsMap &RefM); + void recordDefsForDF(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM, + NodeAddr<BlockNode*> BA); + void removeUnusedPhis(); + + template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA, + NodeAddr<T> TA, DefStack &DS); + void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA); + void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA); + + TimerGroup TimeG; + NodeAddr<FuncNode*> Func; + NodeAllocator Memory; + + MachineFunction &MF; + const TargetInstrInfo &TII; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + const TargetOperandInfo &TOI; + }; // struct DataFlowGraph + + template <typename Predicate> + NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P, + bool NextOnly, const DataFlowGraph &G) { + // Get the "Next" reference in the circular list that references RR and + // satisfies predicate "Pred". + auto NA = G.addr<NodeBase*>(getNext()); + + while (NA.Addr != this) { + if (NA.Addr->getType() == NodeAttrs::Ref) { + NodeAddr<RefNode*> RA = NA; + if (RA.Addr->getRegRef() == RR && P(NA)) + return NA; + if (NextOnly) + break; + NA = G.addr<NodeBase*>(NA.Addr->getNext()); + } else { + // We've hit the beginning of the chain. + assert(NA.Addr->getType() == NodeAttrs::Code); + NodeAddr<CodeNode*> CA = NA; + NA = CA.Addr->getFirstMember(G); + } + } + // Return the equivalent of "nullptr" if such a node was not found. + return NodeAddr<RefNode*>(); + } + + template <typename Predicate> + NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const { + NodeList MM; + auto M = getFirstMember(G); + if (M.Id == 0) + return MM; + + while (M.Addr != this) { + if (P(M)) + MM.push_back(M); + M = G.addr<NodeBase*>(M.Addr->getNext()); + } + return MM; + } + + + template <typename T> struct Print; + template <typename T> + raw_ostream &operator<< (raw_ostream &OS, const Print<T> &P); + + template <typename T> + struct Print { + Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {} + const T &Obj; + const DataFlowGraph &G; + }; + + template <typename T> + struct PrintNode : Print<NodeAddr<T>> { + PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g) + : Print<NodeAddr<T>>(x, g) {} + }; +} // namespace rdf + +#endif // RDF_GRAPH_H diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp new file mode 100644 index 000000000000..1d9bd372ff4e --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp @@ -0,0 +1,848 @@ +//===--- RDFLiveness.cpp --------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Computation of the liveness information from the data-flow graph. +// +// The main functionality of this code is to compute block live-in +// information. With the live-in information in place, the placement +// of kill flags can also be recalculated. +// +// The block live-in calculation is based on the ideas from the following +// publication: +// +// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin. +// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs." 
+// ACM Transactions on Architecture and Code Optimization, Association for +// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance +// and Embedded Architectures and Compilers", 8 (4), +// <10.1145/2086696.2086706>. <hal-00647369> +// +#include "RDFGraph.h" +#include "RDFLiveness.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominanceFrontier.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; +using namespace rdf; + +namespace rdf { + template<> + raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) { + OS << '{'; + for (auto I : P.Obj) { + OS << ' ' << Print<RegisterRef>(I.first, P.G) << '{'; + for (auto J = I.second.begin(), E = I.second.end(); J != E; ) { + OS << Print<NodeId>(*J, P.G); + if (++J != E) + OS << ','; + } + OS << '}'; + } + OS << " }"; + return OS; + } +} + +// The order in the returned sequence is the order of reaching defs in the +// upward traversal: the first def is the closest to the given reference RefA, +// the next one is further up, and so on. +// The list ends at a reaching phi def, or when the reference from RefA is +// covered by the defs in the list (see FullChain). +// This function provides two modes of operation: +// (1) Returning the sequence of reaching defs for a particular reference +// node. This sequence will terminate at the first phi node [1]. +// (2) Returning a partial sequence of reaching defs, where the final goal +// is to traverse past phi nodes to the actual defs arising from the code +// itself. +// In mode (2), the register reference for which the search was started +// may be different from the reference node RefA, for which this call was +// made, hence the argument RefRR, which holds the original register. +// Also, some definitions may have already been encountered in a previous +// call that will influence register covering. The register references +// already defined are passed in through DefRRs. +// In mode (1), the "continuation" considerations do not apply, and the +// RefRR is the same as the register in RefA, and the set DefRRs is empty. +// +// [1] It is possible for multiple phi nodes to be included in the returned +// sequence: +// SubA = phi ... +// SubB = phi ... +// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB) +// However, these phi nodes are independent from one another in terms of +// the data-flow. + +NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, + NodeAddr<RefNode*> RefA, bool FullChain, const RegisterSet &DefRRs) { + SetVector<NodeId> DefQ; + SetVector<NodeId> Owners; + + // The initial queue should not have reaching defs for shadows. The + // whole point of a shadow is that it will have a reaching def that + // is not aliased to the reaching defs of the related shadows. + NodeId Start = RefA.Id; + auto SNA = DFG.addr<RefNode*>(Start); + if (NodeId RD = SNA.Addr->getReachingDef()) + DefQ.insert(RD); + + // Collect all the reaching defs, going up until a phi node is encountered, + // or there are no more reaching defs. From this set, the actual set of + // reaching defs will be selected. + // The traversal upwards must go on until a covering def is encountered. + // It is possible that a collection of non-covering (individually) defs + // will be sufficient, but keep going until a covering one is found. 
+ for (unsigned i = 0; i < DefQ.size(); ++i) { + auto TA = DFG.addr<DefNode*>(DefQ[i]); + if (TA.Addr->getFlags() & NodeAttrs::PhiRef) + continue; + // Stop at the covering/overwriting def of the initial register reference. + RegisterRef RR = TA.Addr->getRegRef(); + if (RAI.covers(RR, RefRR)) { + uint16_t Flags = TA.Addr->getFlags(); + if (!(Flags & NodeAttrs::Preserving)) + continue; + } + // Get the next level of reaching defs. This will include multiple + // reaching defs for shadows. + for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA)) + if (auto RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + DefQ.insert(RD); + } + + // Remove all non-phi defs that are not aliased to RefRR, and collect + // the owners of the remaining defs. + SetVector<NodeId> Defs; + for (auto N : DefQ) { + auto TA = DFG.addr<DefNode*>(N); + bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef; + if (!IsPhi && !RAI.alias(RefRR, TA.Addr->getRegRef())) + continue; + Defs.insert(TA.Id); + Owners.insert(TA.Addr->getOwner(DFG).Id); + } + + // Return the MachineBasicBlock containing a given instruction. + auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* { + if (IA.Addr->getKind() == NodeAttrs::Stmt) + return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent(); + assert(IA.Addr->getKind() == NodeAttrs::Phi); + NodeAddr<PhiNode*> PA = IA; + NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG); + return BA.Addr->getCode(); + }; + // Less(A,B) iff instruction A is further down in the dominator tree than B. + auto Less = [&Block,this] (NodeId A, NodeId B) -> bool { + if (A == B) + return false; + auto OA = DFG.addr<InstrNode*>(A), OB = DFG.addr<InstrNode*>(B); + MachineBasicBlock *BA = Block(OA), *BB = Block(OB); + if (BA != BB) + return MDT.dominates(BB, BA); + // They are in the same block. + bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt; + bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt; + if (StmtA) { + if (!StmtB) // OB is a phi and phis dominate statements. + return true; + auto CA = NodeAddr<StmtNode*>(OA).Addr->getCode(); + auto CB = NodeAddr<StmtNode*>(OB).Addr->getCode(); + // The order must be linear, so tie-break such equalities. + if (CA == CB) + return A < B; + return MDT.dominates(CB, CA); + } else { + // OA is a phi. + if (StmtB) + return false; + // Both are phis. There is no ordering between phis (in terms of + // the data-flow), so tie-break this via node id comparison. + return A < B; + } + }; + + std::vector<NodeId> Tmp(Owners.begin(), Owners.end()); + std::sort(Tmp.begin(), Tmp.end(), Less); + + // The vector is a list of instructions, so that defs coming from + // the same instruction don't need to be artificially ordered. + // Then, when computing the initial segment, and iterating over an + // instruction, pick the defs that contribute to the covering (i.e. is + // not covered by previously added defs). Check the defs individually, + // i.e. first check each def if is covered or not (without adding them + // to the tracking set), and then add all the selected ones. + + // The reason for this is this example: + // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes). + // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be + // covered if we added A first, and A would be covered + // if we added B first. 
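A compressed sketch of the two-phase "check first, insert after" pattern that the code below applies per instruction (phi-related special cases omitted). It reuses the RAI.covers()/getRegRef() calls and node types of the surrounding function; Candidates and Selected are illustrative names, not variables defined in this file:

    // Phase 1: test every candidate def of this instruction against the
    // tracking set as it stood before the instruction was considered...
    NodeList Selected;
    for (NodeAddr<DefNode*> DA : Candidates)
      if (!RAI.covers(RRs, DA.Addr->getRegRef()))
        Selected.push_back(DA);
    // Phase 2: ...then fold all chosen defs into the tracking set at once, so
    // the outcome does not depend on the order in which aliased defs are seen.
    for (NodeAddr<DefNode*> DA : Selected)
      RRs.insert(DA.Addr->getRegRef());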
+ + NodeList RDefs; + RegisterSet RRs = DefRRs; + + auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool { + return TA.Addr->getKind() == NodeAttrs::Def && + Defs.count(TA.Id); + }; + for (auto T : Tmp) { + if (!FullChain && RAI.covers(RRs, RefRR)) + break; + auto TA = DFG.addr<InstrNode*>(T); + bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA); + NodeList Ds; + for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) { + auto QR = DA.Addr->getRegRef(); + // Add phi defs even if they are covered by subsequent defs. This is + // for cases where the reached use is not covered by any of the defs + // encountered so far: the phi def is needed to expose the liveness + // of that use to the entry of the block. + // Example: + // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2. + // d2<R3>(d1,,u3), ... + // ..., u3<D1>(d2) This use needs to be live on entry. + if (FullChain || IsPhi || !RAI.covers(RRs, QR)) + Ds.push_back(DA); + } + RDefs.insert(RDefs.end(), Ds.begin(), Ds.end()); + for (NodeAddr<DefNode*> DA : Ds) { + // When collecting a full chain of definitions, do not consider phi + // defs to actually define a register. + uint16_t Flags = DA.Addr->getFlags(); + if (!FullChain || !(Flags & NodeAttrs::PhiRef)) + if (!(Flags & NodeAttrs::Preserving)) + RRs.insert(DA.Addr->getRegRef()); + } + } + + return RDefs; +} + + +static const RegisterSet NoRegs; + +NodeList Liveness::getAllReachingDefs(NodeAddr<RefNode*> RefA) { + return getAllReachingDefs(RefA.Addr->getRegRef(), RefA, false, NoRegs); +} + + +void Liveness::computePhiInfo() { + NodeList Phis; + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + Phis.insert(Phis.end(), Ps.begin(), Ps.end()); + } + + // phi use -> (map: reaching phi -> set of registers defined in between) + std::map<NodeId,std::map<NodeId,RegisterSet>> PhiUp; + std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. + + // Go over all phis. + for (NodeAddr<PhiNode*> PhiA : Phis) { + // Go over all defs and collect the reached uses that are non-phi uses + // (i.e. the "real uses"). + auto &RealUses = RealUseMap[PhiA.Id]; + auto PhiRefs = PhiA.Addr->members(DFG); + + // Have a work queue of defs whose reached uses need to be found. + // For each def, add to the queue all reached (non-phi) defs. + SetVector<NodeId> DefQ; + NodeSet PhiDefs; + for (auto R : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Def>(R)) + continue; + DefQ.insert(R.Id); + PhiDefs.insert(R.Id); + } + for (unsigned i = 0; i < DefQ.size(); ++i) { + NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]); + NodeId UN = DA.Addr->getReachedUse(); + while (UN != 0) { + NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN); + if (!(A.Addr->getFlags() & NodeAttrs::PhiRef)) + RealUses[getRestrictedRegRef(A)].insert(A.Id); + UN = A.Addr->getSibling(); + } + NodeId DN = DA.Addr->getReachedDef(); + while (DN != 0) { + NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN); + for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) { + uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags(); + // Must traverse the reached-def chain. Consider: + // def(D0) -> def(R0) -> def(R0) -> use(D0) + // The reachable use of D0 passes through a def of R0. + if (!(Flags & NodeAttrs::PhiRef)) + DefQ.insert(T.Id); + } + DN = A.Addr->getSibling(); + } + } + // Filter out these uses that appear to be reachable, but really + // are not. For example: + // + // R1:0 = d1 + // = R1:0 u2 Reached by d1. 
+ // R0 = d3 + // = R1:0 u4 Still reached by d1: indirectly through + // the def d3. + // R1 = d5 + // = R1:0 u6 Not reached by d1 (covered collectively + // by d3 and d5), but following reached + // defs and uses from d1 will lead here. + auto HasDef = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool { + return PhiDefs.count(DA.Id); + }; + for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) { + // For each reached register UI->first, there is a set UI->second, of + // uses of it. For each such use, check if it is reached by this phi, + // i.e. check if the set of its reaching uses intersects the set of + // this phi's defs. + auto &Uses = UI->second; + for (auto I = Uses.begin(), E = Uses.end(); I != E; ) { + auto UA = DFG.addr<UseNode*>(*I); + NodeList RDs = getAllReachingDefs(UI->first, UA); + if (std::any_of(RDs.begin(), RDs.end(), HasDef)) + ++I; + else + I = Uses.erase(I); + } + if (Uses.empty()) + UI = RealUses.erase(UI); + else + ++UI; + } + + // If this phi reaches some "real" uses, add it to the queue for upward + // propagation. + if (!RealUses.empty()) + PhiUQ.push_back(PhiA.Id); + + // Go over all phi uses and check if the reaching def is another phi. + // Collect the phis that are among the reaching defs of these uses. + // While traversing the list of reaching defs for each phi use, collect + // the set of registers defined between this phi (Phi) and the owner phi + // of the reaching def. + for (auto I : PhiRefs) { + if (!DFG.IsRef<NodeAttrs::Use>(I)) + continue; + NodeAddr<UseNode*> UA = I; + auto &UpMap = PhiUp[UA.Id]; + RegisterSet DefRRs; + for (NodeAddr<DefNode*> DA : getAllReachingDefs(UA)) { + if (DA.Addr->getFlags() & NodeAttrs::PhiRef) + UpMap[DA.Addr->getOwner(DFG).Id] = DefRRs; + else + DefRRs.insert(DA.Addr->getRegRef()); + } + } + } + + if (Trace) { + dbgs() << "Phi-up-to-phi map:\n"; + for (auto I : PhiUp) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG) << " -> {"; + for (auto R : I.second) + dbgs() << ' ' << Print<NodeId>(R.first, DFG) + << Print<RegisterSet>(R.second, DFG); + dbgs() << " }\n"; + } + } + + // Propagate the reached registers up in the phi chain. + // + // The following type of situation needs careful handling: + // + // phi d1<R1:0> (1) + // | + // ... d2<R1> + // | + // phi u3<R1:0> (2) + // | + // ... u4<R1> + // + // The phi node (2) defines a register pair R1:0, and reaches a "real" + // use u4 of just R1. The same phi node is also known to reach (upwards) + // the phi node (1). However, the use u4 is not reached by phi (1), + // because of the intervening definition d2 of R1. The data flow between + // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0. + // + // When propagating uses up the phi chains, get the all reaching defs + // for a given phi use, and traverse the list until the propagated ref + // is covered, or until or until reaching the final phi. Only assume + // that the reference reaches the phi in the latter case. + + for (unsigned i = 0; i < PhiUQ.size(); ++i) { + auto PA = DFG.addr<PhiNode*>(PhiUQ[i]); + auto &RealUses = RealUseMap[PA.Id]; + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<UseNode*> UA = U; + auto &UpPhis = PhiUp[UA.Id]; + for (auto UP : UpPhis) { + bool Changed = false; + auto &MidDefs = UP.second; + // Collect the set UpReached of uses that are reached by the current + // phi PA, and are not covered by any intervening def between PA and + // the upward phi UP. 
+ RegisterSet UpReached; + for (auto T : RealUses) { + if (!isRestricted(PA, UA, T.first)) + continue; + if (!RAI.covers(MidDefs, T.first)) + UpReached.insert(T.first); + } + if (UpReached.empty()) + continue; + // Update the set PRUs of real uses reached by the upward phi UP with + // the actual set of uses (UpReached) that the UP phi reaches. + auto &PRUs = RealUseMap[UP.first]; + for (auto R : UpReached) { + unsigned Z = PRUs[R].size(); + PRUs[R].insert(RealUses[R].begin(), RealUses[R].end()); + Changed |= (PRUs[R].size() != Z); + } + if (Changed) + PhiUQ.push_back(UP.first); + } + } + } + + if (Trace) { + dbgs() << "Real use map:\n"; + for (auto I : RealUseMap) { + dbgs() << "phi " << Print<NodeId>(I.first, DFG); + NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first); + NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG); + if (!Ds.empty()) { + RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(); + dbgs() << '<' << Print<RegisterRef>(RR, DFG) << '>'; + } else { + dbgs() << "<noreg>"; + } + dbgs() << " -> " << Print<RefMap>(I.second, DFG) << '\n'; + } + } +} + + +void Liveness::computeLiveIns() { + // Populate the node-to-block map. This speeds up the calculations + // significantly. + NBMap.clear(); + for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + MachineBasicBlock *BB = BA.Addr->getCode(); + for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + NBMap.insert(std::make_pair(RA.Id, BB)); + NBMap.insert(std::make_pair(IA.Id, BB)); + } + } + + MachineFunction &MF = DFG.getMF(); + + // Compute IDF first, then the inverse. + decltype(IIDF) IDF; + for (auto &B : MF) { + auto F1 = MDF.find(&B); + if (F1 == MDF.end()) + continue; + SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end()); + for (unsigned i = 0; i < IDFB.size(); ++i) { + auto F2 = MDF.find(IDFB[i]); + if (F2 != MDF.end()) + IDFB.insert(F2->second.begin(), F2->second.end()); + } + // Add B to the IDF(B). This will put B in the IIDF(B). + IDFB.insert(&B); + IDF[&B].insert(IDFB.begin(), IDFB.end()); + } + + for (auto I : IDF) + for (auto S : I.second) + IIDF[S].insert(I.first); + + computePhiInfo(); + + NodeAddr<FuncNode*> FA = DFG.getFunc(); + auto Blocks = FA.Addr->members(DFG); + + // Build the phi live-on-entry map. + for (NodeAddr<BlockNode*> BA : Blocks) { + MachineBasicBlock *MB = BA.Addr->getCode(); + auto &LON = PhiLON[MB]; + for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) + for (auto S : RealUseMap[P.Id]) + LON[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << "Phi live-on-entry map:\n"; + for (auto I : PhiLON) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + // Build the phi live-on-exit map. Each phi node has some set of reached + // "real" uses. Propagate this set backwards into the block predecessors + // through the reaching defs of the corresponding phi uses. + for (NodeAddr<BlockNode*> BA : Blocks) { + auto Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); + for (NodeAddr<PhiNode*> PA : Phis) { + auto &RUs = RealUseMap[PA.Id]; + if (RUs.empty()) + continue; + + for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { + NodeAddr<PhiUseNode*> UA = U; + if (UA.Addr->getReachingDef() == 0) + continue; + + // Mark all reached "real" uses of P as live on exit in the + // predecessor. + // Remap all the RUs so that they have a correct reaching def. 
+ auto PrA = DFG.addr<BlockNode*>(UA.Addr->getPredecessor()); + auto &LOX = PhiLOX[PrA.Addr->getCode()]; + for (auto R : RUs) { + RegisterRef RR = R.first; + if (!isRestricted(PA, UA, RR)) + RR = getRestrictedRegRef(UA); + // The restricted ref may be different from the ref that was + // accessed in the "real use". This means that this phi use + // is not the one that carries this reference, so skip it. + if (!RAI.alias(R.first, RR)) + continue; + for (auto D : getAllReachingDefs(RR, UA)) + LOX[RR].insert(D.Id); + } + } // for U : phi uses + } // for P : Phis + } // for B : Blocks + + if (Trace) { + dbgs() << "Phi live-on-exit map:\n"; + for (auto I : PhiLOX) + dbgs() << "block #" << I.first->getNumber() << " -> " + << Print<RefMap>(I.second, DFG) << '\n'; + } + + RefMap LiveIn; + traverse(&MF.front(), LiveIn); + + // Add function live-ins to the live-in set of the function entry block. + auto &EntryIn = LiveMap[&MF.front()]; + for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) + EntryIn.insert({I->first,0}); + + if (Trace) { + // Dump the liveness map + for (auto &B : MF) { + BitVector LV(TRI.getNumRegs()); + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + LV.set(I->PhysReg); + dbgs() << "BB#" << B.getNumber() << "\t rec = {"; + for (int x = LV.find_first(); x >= 0; x = LV.find_next(x)) + dbgs() << ' ' << Print<RegisterRef>({unsigned(x),0}, DFG); + dbgs() << " }\n"; + dbgs() << "\tcomp = " << Print<RegisterSet>(LiveMap[&B], DFG) << '\n'; + } + } +} + + +void Liveness::resetLiveIns() { + for (auto &B : DFG.getMF()) { + // Remove all live-ins. + std::vector<unsigned> T; + for (auto I = B.livein_begin(), E = B.livein_end(); I != E; ++I) + T.push_back(I->PhysReg); + for (auto I : T) + B.removeLiveIn(I); + // Add the newly computed live-ins. + auto &LiveIns = LiveMap[&B]; + for (auto I : LiveIns) { + assert(I.Sub == 0); + B.addLiveIn(I.Reg); + } + } +} + + +void Liveness::resetKills() { + for (auto &B : DFG.getMF()) + resetKills(&B); +} + + +void Liveness::resetKills(MachineBasicBlock *B) { + auto CopyLiveIns = [] (MachineBasicBlock *B, BitVector &LV) -> void { + for (auto I = B->livein_begin(), E = B->livein_end(); I != E; ++I) + LV.set(I->PhysReg); + }; + + BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs()); + CopyLiveIns(B, LiveIn); + for (auto SI : B->successors()) + CopyLiveIns(SI, Live); + + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) { + MachineInstr *MI = &*I; + if (MI->isDebugValue()) + continue; + + MI->clearKillInfo(); + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.reset(*SR); + } + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(R)) + continue; + bool IsLive = false; + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) { + if (!Live[*SR]) + continue; + IsLive = true; + break; + } + if (IsLive) + continue; + Op.setIsKill(true); + for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) + Live.set(*SR); + } + } +} + + +// For shadows, determine if RR is aliased to a reaching def of any other +// shadow associated with RA. If it is not, then RR is "restricted" to RA, +// and so it can be considered a value specific to RA. This is important +// for accurately determining values associated with phi uses. 
+// For non-shadows, this function returns "true". +bool Liveness::isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const { + NodeId Start = RA.Id; + for (NodeAddr<RefNode*> TA = DFG.getNextShadow(IA, RA); + TA.Id != 0 && TA.Id != Start; TA = DFG.getNextShadow(IA, TA)) { + NodeId RD = TA.Addr->getReachingDef(); + if (RD == 0) + continue; + if (RAI.alias(RR, DFG.addr<DefNode*>(RD).Addr->getRegRef())) + return false; + } + return true; +} + + +RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const { + assert(DFG.IsRef<NodeAttrs::Use>(RA)); + if (RA.Addr->getFlags() & NodeAttrs::Shadow) { + NodeId RD = RA.Addr->getReachingDef(); + assert(RD); + RA = DFG.addr<DefNode*>(RD); + } + return RA.Addr->getRegRef(); +} + + +unsigned Liveness::getPhysReg(RegisterRef RR) const { + if (!TargetRegisterInfo::isPhysicalRegister(RR.Reg)) + return 0; + return RR.Sub ? TRI.getSubReg(RR.Reg, RR.Sub) : RR.Reg; +} + + +// Helper function to obtain the basic block containing the reaching def +// of the given use. +MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const { + auto F = NBMap.find(RN); + if (F != NBMap.end()) + return F->second; + llvm_unreachable("Node id not in map"); +} + + +void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { + // The LiveIn map, for each (physical) register, contains the set of live + // reaching defs of that register that are live on entry to the associated + // block. + + // The summary of the traversal algorithm: + // + // R is live-in in B, if there exists a U(R), such that rdef(R) dom B + // and (U \in IDF(B) or B dom U). + // + // for (C : children) { + // LU = {} + // traverse(C, LU) + // LiveUses += LU + // } + // + // LiveUses -= Defs(B); + // LiveUses += UpwardExposedUses(B); + // for (C : IIDF[B]) + // for (U : LiveUses) + // if (Rdef(U) dom C) + // C.addLiveIn(U) + // + + // Go up the dominator tree (depth-first). + MachineDomTreeNode *N = MDT.getNode(B); + for (auto I : *N) { + RefMap L; + MachineBasicBlock *SB = I->getBlock(); + traverse(SB, L); + + for (auto S : L) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + } + + if (Trace) { + dbgs() << LLVM_FUNCTION_NAME << " in BB#" << B->getNumber() + << " after recursion into"; + for (auto I : *N) + dbgs() << ' ' << I->getBlock()->getNumber(); + dbgs() << "\n LiveIn: " << Print<RefMap>(LiveIn, DFG); + dbgs() << "\n Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Add phi uses that are live on exit from this block. + RefMap &PUs = PhiLOX[B]; + for (auto S : PUs) + LiveIn[S.first].insert(S.second.begin(), S.second.end()); + + if (Trace) { + dbgs() << "after LOX\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Stop tracking all uses defined in this block: erase those records + // where the reaching def is located in B and which cover all reached + // uses. + auto Copy = LiveIn; + LiveIn.clear(); + + for (auto I : Copy) { + auto &Defs = LiveIn[I.first]; + NodeSet Rest; + for (auto R : I.second) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DDR = DA.Addr->getRegRef(); + NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Defs from a different block need to be preserved. Defs from this + // block will need to be processed further, except for phi defs, the + // liveness of which is handled through the PhiLON/PhiLOX maps. 
+ if (B != BA.Addr->getCode()) + Defs.insert(R); + else { + bool IsPreserving = DA.Addr->getFlags() & NodeAttrs::Preserving; + if (IA.Addr->getKind() != NodeAttrs::Phi && !IsPreserving) { + bool Covering = RAI.covers(DDR, I.first); + NodeId U = DA.Addr->getReachedUse(); + while (U && Covering) { + auto DUA = DFG.addr<UseNode*>(U); + RegisterRef Q = DUA.Addr->getRegRef(); + Covering = RAI.covers(DA.Addr->getRegRef(), Q); + U = DUA.Addr->getSibling(); + } + if (!Covering) + Rest.insert(R); + } + } + } + + // Non-covering defs from B. + for (auto R : Rest) { + auto DA = DFG.addr<DefNode*>(R); + RegisterRef DRR = DA.Addr->getRegRef(); + RegisterSet RRs; + for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) { + NodeAddr<InstrNode*> IA = TA.Addr->getOwner(DFG); + NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + // Preserving defs do not count towards covering. + if (!(TA.Addr->getFlags() & NodeAttrs::Preserving)) + RRs.insert(TA.Addr->getRegRef()); + if (BA.Addr->getCode() == B) + continue; + if (RAI.covers(RRs, DRR)) + break; + Defs.insert(TA.Id); + } + } + } + + emptify(LiveIn); + + if (Trace) { + dbgs() << "after defs in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Scan the block for upward-exposed uses and add them to the tracking set. + for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) { + NodeAddr<InstrNode*> IA = I; + if (IA.Addr->getKind() != NodeAttrs::Stmt) + continue; + for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + RegisterRef RR = UA.Addr->getRegRef(); + for (auto D : getAllReachingDefs(UA)) + if (getBlockWithRef(D.Id) != B) + LiveIn[RR].insert(D.Id); + } + } + + if (Trace) { + dbgs() << "after uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(LiveMap[B], DFG) << '\n'; + } + + // Phi uses should not be propagated up the dominator tree, since they + // are not dominated by their corresponding reaching defs. + auto &Local = LiveMap[B]; + auto &LON = PhiLON[B]; + for (auto R : LON) + Local.insert(R.first); + + if (Trace) { + dbgs() << "after phi uses in block\n"; + dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n'; + dbgs() << " Local: " << Print<RegisterSet>(Local, DFG) << '\n'; + } + + for (auto C : IIDF[B]) { + auto &LiveC = LiveMap[C]; + for (auto S : LiveIn) + for (auto R : S.second) + if (MDT.properlyDominates(getBlockWithRef(R), C)) + LiveC.insert(S.first); + } +} + + +void Liveness::emptify(RefMap &M) { + for (auto I = M.begin(), E = M.end(); I != E; ) + I = I->second.empty() ? M.erase(I) : std::next(I); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h new file mode 100644 index 000000000000..4c1e8f3ee838 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h @@ -0,0 +1,106 @@ +//===--- RDFLiveness.h ----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Recalculate the liveness information given a data flow graph. +// This includes block live-ins and kill flags. 
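Going by the interface declared below, a typical driver for this class would run its phases in the following order; a hedged sketch only, assuming the pass boilerplate and the rdf::DataFlowGraph construction exist elsewhere:

    rdf::Liveness LV(MF.getRegInfo(), G);   // G: an already-built rdf::DataFlowGraph
    LV.trace(EnableTrace);                  // optional debug dumps
    LV.computeLiveIns();                    // phi info + block live-in sets
    LV.resetLiveIns();                      // rewrite MachineBasicBlock live-ins
    LV.resetKills();                        // recompute kill flags from new live-ins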
+ +#ifndef RDF_LIVENESS_H +#define RDF_LIVENESS_H + +#include "RDFGraph.h" +#include "llvm/ADT/DenseMap.h" +#include <map> + +using namespace llvm; + +namespace llvm { + class MachineBasicBlock; + class MachineFunction; + class MachineRegisterInfo; + class TargetRegisterInfo; + class MachineDominatorTree; + class MachineDominanceFrontier; +} + +namespace rdf { + struct Liveness { + public: + typedef std::map<MachineBasicBlock*,RegisterSet> LiveMapType; + typedef std::map<RegisterRef,NodeSet> RefMap; + + Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g) + : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()), + RAI(g.getRAI()), MRI(mri), Empty(), Trace(false) {} + + NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA, + bool FullChain = false, const RegisterSet &DefRRs = RegisterSet()); + NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA); + + LiveMapType &getLiveMap() { return LiveMap; } + const LiveMapType &getLiveMap() const { return LiveMap; } + const RefMap &getRealUses(NodeId P) const { + auto F = RealUseMap.find(P); + return F == RealUseMap.end() ? Empty : F->second; + } + + void computePhiInfo(); + void computeLiveIns(); + void resetLiveIns(); + void resetKills(); + void resetKills(MachineBasicBlock *B); + + void trace(bool T) { Trace = T; } + + private: + const DataFlowGraph &DFG; + const TargetRegisterInfo &TRI; + const MachineDominatorTree &MDT; + const MachineDominanceFrontier &MDF; + const RegisterAliasInfo &RAI; + MachineRegisterInfo &MRI; + LiveMapType LiveMap; + const RefMap Empty; + bool Trace; + + // Cache of mapping from node ids (for RefNodes) to the containing + // basic blocks. Not computing it each time for each node reduces + // the liveness calculation time by a large fraction. + typedef DenseMap<NodeId,MachineBasicBlock*> NodeBlockMap; + NodeBlockMap NBMap; + + // Phi information: + // + // map: NodeId -> (map: RegisterRef -> NodeSet) + // phi id -> (map: register -> set of reached non-phi uses) + std::map<NodeId, RefMap> RealUseMap; + + // Inverse iterated dominance frontier. + std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF; + + // Live on entry. + std::map<MachineBasicBlock*,RefMap> PhiLON; + + // Phi uses are considered to be located at the end of the block that + // they are associated with. The reaching def of a phi use dominates the + // block that the use corresponds to, but not the block that contains + // the phi itself. To include these uses in the liveness propagation (up + // the dominator tree), create a map: block -> set of uses live on exit. 
+ std::map<MachineBasicBlock*,RefMap> PhiLOX; + + bool isRestricted(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, + RegisterRef RR) const; + RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const; + unsigned getPhysReg(RegisterRef RR) const; + MachineBasicBlock *getBlockWithRef(NodeId RN) const; + void traverse(MachineBasicBlock *B, RefMap &LiveIn); + void emptify(RefMap &M); + }; +} + +#endif // RDF_LIVENESS_H diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index 6756c1702f76..5680130b91b2 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -277,8 +277,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); @@ -327,6 +325,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand); @@ -872,7 +872,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::GlobalTLSAddress: return lowerGlobalTLSAddress(Op, DAG); case ISD::JumpTable: return lowerJumpTable(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); - case ISD::SELECT_CC: return lowerSELECT_CC(Op, DAG); case ISD::SETCC: return lowerSETCC(Op, DAG); case ISD::VASTART: return lowerVASTART(Op, DAG); case ISD::VAARG: return lowerVAARG(Op, DAG); @@ -1648,20 +1647,6 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const SDLoc(Op)); } -SDValue MipsTargetLowering:: -lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const -{ - SDLoc DL(Op); - EVT Ty = Op.getOperand(0).getValueType(); - SDValue Cond = - DAG.getNode(ISD::SETCC, DL, getSetCCResultType(DAG.getDataLayout(), - *DAG.getContext(), Ty), - Op.getOperand(0), Op.getOperand(1), Op.getOperand(4)); - - return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2), - Op.getOperand(3)); -} - SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget.hasMips32r6() && !Subtarget.hasMips64r6()); SDValue Cond = createFPCmp(DAG, Op); diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b33e125b81b7..0dc683e3df27 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -430,7 +430,6 @@ namespace llvm { SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVAARG(SDValue Op, SelectionDAG &DAG) 
const; diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index d9fb8c890739..ffda491f0c86 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -1003,7 +1003,7 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(MipsJmpLink imm:$target)], II_JAL, FrmJ, opstr> { + [(MipsJmpLink tglobaladdr:$target)], II_JAL, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTarget"; } @@ -2075,8 +2075,6 @@ def : MipsPat<(MipsSync (i32 immz)), (SYNC 0)>, ISA_MIPS2; // Call -def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)), - (JAL tglobaladdr:$dst)>; def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)), (JAL texternalsym:$dst)>; //def : MipsPat<(MipsJmpLink GPR32:$dst), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index e6f7fe9aae1d..d4aeaf928655 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -544,8 +544,6 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2); MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc)); MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc)); - LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); - HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); // Add lo/hi registers if the mtlo/hi instructions created have explicit // def registers. @@ -556,6 +554,9 @@ void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB, LoInst.addReg(DstLo, RegState::Define); HiInst.addReg(DstHi, RegState::Define); } + + LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill())); + HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill())); } void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 766369631e14..be735f6c1bce 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4549,6 +4549,7 @@ NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { delete static_cast<NVPTXSection *>(DwarfLocSection); delete static_cast<NVPTXSection *>(DwarfARangesSection); delete static_cast<NVPTXSection *>(DwarfRangesSection); + delete static_cast<NVPTXSection *>(DwarfMacinfoSection); } MCSection * diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 0f88ddfaa934..683b9a3f49f7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -41,6 +41,7 @@ public: DwarfLocSection = nullptr; DwarfARangesSection = nullptr; DwarfRangesSection = nullptr; + DwarfMacinfoSection = nullptr; } virtual ~NVPTXTargetObjectFile(); @@ -81,6 +82,8 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); DwarfRangesSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); + DwarfMacinfoSection = + new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, diff --git 
a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 9a63c14b5053..ec354c209ca0 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1092,8 +1092,28 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { } // ELFv2 ABI - Normal entry label. - if (Subtarget->isELFv2ABI()) + if (Subtarget->isELFv2ABI()) { + // In the Large code model, we allow arbitrary displacements between + // the text section and its associated TOC section. We place the + // full 8-byte offset to the TOC in memory immediatedly preceding + // the function global entry point. + if (TM.getCodeModel() == CodeModel::Large + && !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); + + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol(); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + MCSymbolRefExpr::create(GlobalEPSymbol, + OutContext), + OutContext); + + OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol()); + OutStreamer->EmitValue(TOCDeltaExpr, 8); + } return AsmPrinter::EmitFunctionEntryLabel(); + } // Emit an official procedure descriptor. MCSectionSubPair Current = OutStreamer->getCurrentSection(); @@ -1160,10 +1180,25 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { // thus emit a prefix sequence along the following lines: // // func: + // .Lfunc_gepNN: + // # global entry point + // addis r2,r12,(.TOC.-.Lfunc_gepNN)@ha + // addi r2,r2,(.TOC.-.Lfunc_gepNN)@l + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN + // # local entry point, followed by function body + // + // For the Large code model, we create + // + // .Lfunc_tocNN: + // .quad .TOC.-.Lfunc_gepNN # done by EmitFunctionEntryLabel + // func: + // .Lfunc_gepNN: // # global entry point - // addis r2,r12,(.TOC.-func)@ha - // addi r2,r2,(.TOC.-func)@l - // .localentry func, .-func + // ld r2,.Lfunc_tocNN-.Lfunc_gepNN(r12) + // add r2,r2,r12 + // .Lfunc_lepNN: + // .localentry func, .Lfunc_lepNN-.Lfunc_gepNN // # local entry point, followed by function body // // This ensures we have r2 set up correctly while executing the function @@ -1171,32 +1206,49 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() { if (Subtarget->isELFv2ABI() // Only do all that if the function uses r2 in the first place. 
&& !MF->getRegInfo().use_empty(PPC::X2)) { + const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - MCSymbol *GlobalEntryLabel = OutContext.createTempSymbol(); + MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol(); OutStreamer->EmitLabel(GlobalEntryLabel); const MCSymbolRefExpr *GlobalEntryLabelExp = MCSymbolRefExpr::create(GlobalEntryLabel, OutContext); - MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); - const MCExpr *TOCDeltaExpr = - MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), - GlobalEntryLabelExp, OutContext); + if (TM.getCodeModel() != CodeModel::Large) { + MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC.")); + const MCExpr *TOCDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext), + GlobalEntryLabelExp, OutContext); - const MCExpr *TOCDeltaHi = - PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) - .addReg(PPC::X2) - .addReg(PPC::X12) - .addExpr(TOCDeltaHi)); - - const MCExpr *TOCDeltaLo = - PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) - .addReg(PPC::X2) - .addReg(PPC::X2) - .addExpr(TOCDeltaLo)); - - MCSymbol *LocalEntryLabel = OutContext.createTempSymbol(); + const MCExpr *TOCDeltaHi = + PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS) + .addReg(PPC::X2) + .addReg(PPC::X12) + .addExpr(TOCDeltaHi)); + + const MCExpr *TOCDeltaLo = + PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addExpr(TOCDeltaLo)); + } else { + MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol(); + const MCExpr *TOCOffsetDeltaExpr = + MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext), + GlobalEntryLabelExp, OutContext); + + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) + .addReg(PPC::X2) + .addExpr(TOCOffsetDeltaExpr) + .addReg(PPC::X12)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADD8) + .addReg(PPC::X2) + .addReg(PPC::X2) + .addReg(PPC::X12)); + } + + MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol(); OutStreamer->EmitLabel(LocalEntryLabel); const MCSymbolRefExpr *LocalEntryLabelExp = MCSymbolRefExpr::create(LocalEntryLabel, OutContext); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 075e093e41a1..79e4fe379c2d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -299,22 +299,35 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm), // 64-bit CR instructions let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. 
def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index c17603a7718a..dcff6ad2486f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -744,20 +744,43 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB, "isel is for regular integer GPRs only"); unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL; - unsigned SelectPred = Cond[0].getImm(); + auto SelectPred = static_cast<PPC::Predicate>(Cond[0].getImm()); unsigned SubIdx; bool SwapOps; switch (SelectPred) { - default: llvm_unreachable("invalid predicate for isel"); - case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break; - case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break; - case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break; - case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break; - case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break; - case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break; - case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break; - case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break; + case PPC::PRED_EQ: + case PPC::PRED_EQ_MINUS: + case PPC::PRED_EQ_PLUS: + SubIdx = PPC::sub_eq; SwapOps = false; break; + case PPC::PRED_NE: + case PPC::PRED_NE_MINUS: + case PPC::PRED_NE_PLUS: + SubIdx = PPC::sub_eq; SwapOps = true; break; + case PPC::PRED_LT: + case PPC::PRED_LT_MINUS: + case PPC::PRED_LT_PLUS: + SubIdx = PPC::sub_lt; SwapOps = false; break; + case PPC::PRED_GE: + case PPC::PRED_GE_MINUS: + case PPC::PRED_GE_PLUS: + SubIdx = PPC::sub_lt; SwapOps = true; break; + case PPC::PRED_GT: + case PPC::PRED_GT_MINUS: + case PPC::PRED_GT_PLUS: + SubIdx = PPC::sub_gt; SwapOps = false; break; + case PPC::PRED_LE: + case PPC::PRED_LE_MINUS: + case PPC::PRED_LE_PLUS: + SubIdx = PPC::sub_gt; SwapOps = true; break; + case PPC::PRED_UN: + case PPC::PRED_UN_MINUS: + case PPC::PRED_UN_PLUS: + SubIdx = PPC::sub_un; SwapOps = false; break; + case PPC::PRED_NU: + case PPC::PRED_NU_MINUS: + case PPC::PRED_NU_PLUS: + SubIdx = PPC::sub_un; SwapOps = true; break; case PPC::PRED_BIT_SET: SubIdx = 0; SwapOps = false; break; case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6c4364aad331..ce0f9e6f52a7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -2299,22 +2299,35 @@ def 
RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F), "#RESTORE_VRSAVE", []>; let hasSideEffects = 0 in { +// mtocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraDefRegAllocReq = 1 in { def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST), "mtocrf $FXM, $ST", IIC_BrMCRX>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mtocrf, the mask for mtcrf must be prepared in a way that +// is dependent on the cr fields being set. def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS), "mtcrf $FXM, $rS", IIC_BrMCRX>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraDefRegAllocReq = 1 -let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking. +// mfocrf's input needs to be prepared by shifting by an amount dependent +// on the cr register selected. Thus, post-ra anti-dep breaking must not +// later change that register assignment. +let hasExtraSrcRegAllocReq = 1 in { def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM), "mfocrf $rT, $FXM", IIC_SprMFCRF>, PPC970_DGroup_First, PPC970_Unit_CRU; +// Similarly to mfocrf, the mask for mfcrf must be prepared in a way that +// is dependent on the cr fields being copied. def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins), "mfcr $rT", IIC_SprMFCR>, PPC970_MicroCode, PPC970_Unit_CRU; +} // hasExtraSrcRegAllocReq = 1 } // hasSideEffects = 0 // Pseudo instruction to perform FADD in round-to-zero mode. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index 95f163153c74..9d91e31165de 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -23,3 +23,24 @@ MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { Twine(MF.getFunctionNumber()) + "$poff"); } + +MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_gep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_lep" + + Twine(MF.getFunctionNumber())); +} + +MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const { + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + + "func_toc" + + Twine(MF.getFunctionNumber())); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 607cdf612eef..10a8ce068d40 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -197,6 +197,10 @@ public: bool usesPICBase() const { return UsesPICBase; } MCSymbol *getPICOffsetSymbol() const; + + MCSymbol *getGlobalEPSymbol() const; + MCSymbol *getLocalEPSymbol() const; + MCSymbol *getTOCOffsetSymbol() const; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 2dc0d825c80d..a9d2e888f4b7 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,6 
+99,11 @@ protected: break; } + // Don't really need to save data to the stack - the clobbered + // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) + // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + // Expand into two ops built prior to the existing instruction. MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) .addReg(InReg); @@ -113,6 +118,8 @@ protected: .addReg(GPR3)); Call->addOperand(MI->getOperand(3)); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg) .addReg(GPR3); diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 733027a5d2be..05006ac5772b 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -83,7 +83,6 @@ static bool IsIntegerCC(unsigned CC) return (CC <= SPCC::ICC_VC); } - static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) { switch(CC) { @@ -124,106 +123,103 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC) llvm_unreachable("Invalid cond code"); } -bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const -{ - - MachineBasicBlock::iterator I = MBB.end(); - MachineBasicBlock::iterator UnCondBrIter = MBB.end(); - while (I != MBB.begin()) { - --I; +static bool isUncondBranchOpcode(int Opc) { return Opc == SP::BA; } - if (I->isDebugValue()) - continue; +static bool isCondBranchOpcode(int Opc) { + return Opc == SP::FBCOND || Opc == SP::BCOND; +} - // When we see a non-terminator, we are done. - if (!isUnpredicatedTerminator(I)) - break; +static bool isIndirectBranchOpcode(int Opc) { + return Opc == SP::BINDrr || Opc == SP::BINDri; +} - // Terminator is not a branch. - if (!I->isBranch()) - return true; +static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, + SmallVectorImpl<MachineOperand> &Cond) { + Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(1).getImm())); + Target = LastInst->getOperand(0).getMBB(); +} - // Handle Unconditional branches. - if (I->getOpcode() == SP::BA) { - UnCondBrIter = I; +bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); + if (I == MBB.end()) + return false; + + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + unsigned LastOpc = LastInst->getOpcode(); + + // If there is only one terminator instruction, process it. + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } - if (!AllowModify) { - TBB = I->getOperand(0).getMBB(); - continue; + // Get the instruction before it if it is a terminator. 
+ MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); } + } + } - while (std::next(I) != MBB.end()) - std::next(I)->eraseFromParent(); - - Cond.clear(); - FBB = nullptr; + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; - if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { - TBB = nullptr; - I->eraseFromParent(); - I = MBB.end(); - UnCondBrIter = MBB.end(); - continue; - } + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } - TBB = I->getOperand(0).getMBB(); - continue; - } + // If the block ends with two unconditional branches, handle it. The second + // one is not executed. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + return false; + } - unsigned Opcode = I->getOpcode(); - if (Opcode != SP::BCOND && Opcode != SP::FBCOND) - return true; // Unknown Opcode. - - SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm(); - - if (Cond.empty()) { - MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); - if (AllowModify && UnCondBrIter != MBB.end() && - MBB.isLayoutSuccessor(TargetBB)) { - - // Transform the code - // - // brCC L1 - // ba L2 - // L1: - // .. - // L2: - // - // into - // - // brnCC L2 - // L1: - // ... - // L2: - // - BranchCode = GetOppositeBranchCondition(BranchCode); - MachineBasicBlock::iterator OldInst = I; - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode)) - .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA)) - .addMBB(TargetBB); - - OldInst->eraseFromParent(); - UnCondBrIter->eraseFromParent(); - - UnCondBrIter = MBB.end(); - I = MBB.end(); - continue; - } - FBB = TBB; - TBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(BranchCode)); - continue; - } - // FIXME: Handle subsequent conditional branches. - // For now, we can't handle multiple conditional branches. + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. + if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); return true; } - return false; + + // Otherwise, can't handle this. 
+ return true; } unsigned @@ -277,6 +273,14 @@ unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const return Count; } +bool SparcInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1); + SPCC::CondCodes CC = static_cast<SPCC::CondCodes>(Cond[0].getImm()); + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h index 15673f134d80..9de624cc9582 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -76,6 +76,9 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index b9f2eb5514a5..d5dabc2cd6ab 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -1219,6 +1219,9 @@ def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>; // Atomic operations //===----------------------------------------------------------------------===// +// A serialization instruction that acts as a barrier for all memory +// accesses, which expands to "bcr 14, 0". +let hasSideEffects = 1 in def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>; let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt new file mode 100644 index 000000000000..5e55e2958aeb --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMWebAssemblyDisassembler + WebAssemblyDisassembler.cpp + ) diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt new file mode 100644 index 000000000000..a452ca1acd04 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===-- ./lib/Target/WebAssembly/Disassembler/LLVMBuild.txt -----*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = WebAssemblyDisassembler +parent = WebAssembly +required_libraries = MCDisassembler WebAssemblyInfo Support +add_to_library_groups = WebAssembly diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile new file mode 100644 index 000000000000..bcd36ba6f01f --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===-- lib/Target/WebAssembly/Disassembler/Makefile -------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMWebAssemblyDisassembler + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp new file mode 100644 index 000000000000..0143b10c0ab1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -0,0 +1,148 @@ +//==- WebAssemblyDisassembler.cpp - Disassembler for WebAssembly -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file is part of the WebAssembly Disassembler. +/// +/// It contains code to translate the data produced by the decoder into +/// MCInsts. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-disassembler" + +namespace { +class WebAssemblyDisassembler final : public MCDisassembler { + std::unique_ptr<const MCInstrInfo> MCII; + + DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, + raw_ostream &CStream) const override; + +public: + WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + std::unique_ptr<const MCInstrInfo> MCII) + : MCDisassembler(STI, Ctx), MCII(std::move(MCII)) {} +}; +} // end anonymous namespace + +static MCDisassembler *createWebAssemblyDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + std::unique_ptr<const MCInstrInfo> MCII(T.createMCInstrInfo()); + return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII)); +} + +extern "C" void LLVMInitializeWebAssemblyDisassembler() { + // Register the disassembler for each target. 
+ TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget32, + createWebAssemblyDisassembler); + TargetRegistry::RegisterMCDisassembler(TheWebAssemblyTarget64, + createWebAssemblyDisassembler); +} + +MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( + MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/, + raw_ostream &OS, raw_ostream &CS) const { + Size = 0; + uint64_t Pos = 0; + + // Read the opcode. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + + if (Opcode >= WebAssembly::INSTRUCTION_LIST_END) + return MCDisassembler::Fail; + + MI.setOpcode(Opcode); + const MCInstrDesc &Desc = MCII->get(Opcode); + unsigned NumFixedOperands = Desc.NumOperands; + + // If it's variadic, read the number of extra operands. + unsigned NumExtraOperands = 0; + if (Desc.isVariadic()) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + NumExtraOperands = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + } + + // Read the fixed operands. These are described by the MCInstrDesc. + for (unsigned i = 0; i < NumFixedOperands; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + switch (Info.OperandType) { + case MCOI::OPERAND_IMMEDIATE: + case WebAssembly::OPERAND_BASIC_BLOCK: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createImm(Imm)); + break; + } + case MCOI::OPERAND_REGISTER: { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + MI.addOperand(MCOperand::createReg(Reg)); + break; + } + case WebAssembly::OPERAND_FPIMM: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + uint64_t Bits = support::endian::read64le(Bytes.data() + Pos); + Pos += sizeof(uint64_t); + double Imm; + memcpy(&Imm, &Bits, sizeof(Imm)); + MI.addOperand(MCOperand::createFPImm(Imm)); + break; + } + default: + llvm_unreachable("unimplemented operand kind"); + } + } + + // Read the extra operands. + assert(NumExtraOperands == 0 || Desc.isVariadic()); + for (unsigned i = 0; i < NumExtraOperands; ++i) { + if (Pos + sizeof(uint64_t) > Bytes.size()) + return MCDisassembler::Fail; + if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) { + // Decode extra immediate operands. + uint64_t Imm = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createImm(Imm)); + } else { + // Decode extra register operands. 
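For reference, the interim binary format this disassembler consumes (and which the MCCodeEmitter change further down produces) is a flat sequence of little-endian 64-bit words: the opcode, then an extra-operand count for variadic instructions, then one word per operand, with FP immediates carried as the bit pattern of a double. A stand-alone reader for that layout over a made-up buffer, independent of the MC classes:

#include <cstdint>
#include <cstdio>
#include <vector>

// Read one little-endian 64-bit word, as the disassembler above does with
// support::endian::read64le.
static bool readWord(const std::vector<uint8_t> &Bytes, size_t &Pos,
                     uint64_t &Out) {
  if (Pos + sizeof(uint64_t) > Bytes.size())
    return false;
  Out = 0;
  for (unsigned i = 0; i != 8; ++i)
    Out |= uint64_t(Bytes[Pos + i]) << (8 * i);
  Pos += sizeof(uint64_t);
  return true;
}

int main() {
  // Hypothetical record: opcode 7 with two immediate operands, 42 and 5.
  std::vector<uint8_t> Buf;
  for (uint64_t W : {uint64_t(7), uint64_t(42), uint64_t(5)})
    for (unsigned i = 0; i != 8; ++i)
      Buf.push_back(uint8_t(W >> (8 * i)));

  size_t Pos = 0;
  uint64_t Opcode, Op0, Op1;
  if (readWord(Buf, Pos, Opcode) && readWord(Buf, Pos, Op0) &&
      readWord(Buf, Pos, Op1))
    std::printf("opcode %llu, operands %llu and %llu\n",
                (unsigned long long)Opcode, (unsigned long long)Op0,
                (unsigned long long)Op1);
  return 0;
}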
+ uint64_t Reg = support::endian::read64le(Bytes.data() + Pos); + MI.addOperand(MCOperand::createReg(Reg)); + } + Pos += sizeof(uint64_t); + } + + Size = Pos; + return MCDisassembler::Success; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index 7ce3a00ae360..9a95150cb557 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -16,6 +16,8 @@ #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -33,7 +35,7 @@ using namespace llvm; WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {} void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { @@ -59,6 +61,52 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, // Print any added annotation. printAnnotation(OS, Annot); + + if (CommentStream) { + // Observe any effects on the control flow stack, for use in annotating + // control flow label references. + switch (MI->getOpcode()) { + default: + break; + case WebAssembly::LOOP: { + // Grab the TopLabel value first so that labels print in numeric order. + uint64_t TopLabel = ControlFlowCounter++; + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + printAnnotation(OS, "label" + utostr(TopLabel) + ':'); + ControlFlowStack.push_back(std::make_pair(TopLabel, true)); + break; + } + case WebAssembly::BLOCK: + ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false)); + break; + case WebAssembly::END_LOOP: + ControlFlowStack.pop_back(); + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + case WebAssembly::END_BLOCK: + printAnnotation( + OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':'); + break; + } + + // Annotate any control flow label references. + unsigned NumFixedOperands = Desc.NumOperands; + SmallSet<uint64_t, 8> Printed; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + const MCOperandInfo &Info = Desc.OpInfo[i]; + if (!(i < NumFixedOperands + ? (Info.OperandType == WebAssembly::OPERAND_BASIC_BLOCK) + : (Desc.TSFlags & WebAssemblyII::VariableOpImmediateIsLabel))) + continue; + uint64_t Depth = MI->getOperand(i).getImm(); + if (!Printed.insert(Depth).second) + continue; + const auto &Pair = ControlFlowStack.rbegin()[Depth]; + printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? 
"up" : "down") + + " to label" + utostr(Pair.first)); + } + } } static std::string toString(const APFloat &FP) { @@ -82,6 +130,9 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops register ops don't use TSFlags"); unsigned WAReg = Op.getReg(); if (int(WAReg) >= 0) printRegName(O, WAReg); @@ -95,19 +146,27 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) O << '='; } else if (Op.isImm()) { - switch (MI->getOpcode()) { - case WebAssembly::PARAM: - case WebAssembly::RESULT: - case WebAssembly::LOCAL: - O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm())); - break; - default: - O << Op.getImm(); - break; - } - } else if (Op.isFPImm()) + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops immediate ops"); + // TODO: (MII.get(MI->getOpcode()).TSFlags & + // WebAssemblyII::VariableOpImmediateIsLabel) + // can tell us whether this is an immediate referencing a label in the + // control flow stack, and it may be nice to pretty-print. + O << Op.getImm(); + } else if (Op.isFPImm()) { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + MII.get(MI->getOpcode()).TSFlags == 0) && + "WebAssembly variable_ops floating point ops don't use TSFlags"); O << toString(APFloat(Op.getFPImm())); - else { + } else { + assert((OpNo < MII.get(MI->getOpcode()).getNumOperands() || + (MII.get(MI->getOpcode()).TSFlags & + WebAssemblyII::VariableOpIsImmediate)) && + "WebAssemblyII::VariableOpIsImmediate should be set for " + "variable_ops expr ops"); assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 39a16f59fd78..cd6c59a41c33 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -23,6 +23,9 @@ namespace llvm { class MCSubtargetInfo; class WebAssemblyInstPrinter final : public MCInstPrinter { + uint64_t ControlFlowCounter; + SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack; + public: WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI); diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp index b158ccb46f99..bba06f65e169 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -95,9 +95,6 @@ WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { } } // end anonymous namespace -MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, - StringRef CPU) { +MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) { return new WebAssemblyAsmBackend(TT.isArch64Bit()); } diff --git 
a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp index c47a3d9094e5..2bb58b33934e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp @@ -30,19 +30,31 @@ protected: }; } // end anonymous namespace -// FIXME: Use EM_NONE as a temporary hack. Should we decide to pursue ELF -// writing seriously, we should email generic-abi@googlegroups.com and ask -// for our own ELF code. WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI) - : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_NONE, - /*HasRelocationAddend=*/true) {} + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_WEBASSEMBLY, + /*HasRelocationAddend=*/false) {} unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - // FIXME: Do we need our own relocs? - return Fixup.getKind(); + // WebAssembly functions are not allocated in the address space. To resolve a + // pointer to a function, we must use a special relocation type. + if (const MCSymbolRefExpr *SyExp = + dyn_cast<MCSymbolRefExpr>(Fixup.getValue())) + if (SyExp->getKind() == MCSymbolRefExpr::VK_WebAssembly_FUNCTION) + return ELF::R_WEBASSEMBLY_FUNCTION; + + switch (Fixup.getKind()) { + case FK_Data_4: + assert(!is64Bit() && "4-byte relocations only supported on wasm32"); + return ELF::R_WEBASSEMBLY_DATA; + case FK_Data_8: + assert(is64Bit() && "8-byte relocations only supported on wasm64"); + return ELF::R_WEBASSEMBLY_DATA; + default: + llvm_unreachable("unimplemented fixup kind"); + } } MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index d2617796ca99..02c717a92101 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -27,11 +27,12 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { // TODO: What should MaxInstLength be? - PrivateGlobalPrefix = ""; - PrivateLabelPrefix = ""; - UseDataRegionDirectives = true; + // Use .skip instead of .zero because .zero is confusing when used with two + // arguments (it doesn't actually zero things out). 
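The new GetRelocType above decides between exactly two relocation kinds: references to function symbols get the function-table relocation, and plain data fixups get the data relocation, with the fixup width asserted to match the wasm32/wasm64 pointer size. A condensed sketch of that decision, using stand-in enumerators rather than the real ELF::R_WEBASSEMBLY_* constants:

#include <cassert>

// Stand-ins for ELF::R_WEBASSEMBLY_FUNCTION / ELF::R_WEBASSEMBLY_DATA.
enum WasmReloc { RelocFunction, RelocData };

// IsFunctionRef models "the fixup's symbol expression carries the
// VK_WebAssembly_FUNCTION modifier"; FixupBytes is 4 for FK_Data_4 and
// 8 for FK_Data_8.
static WasmReloc selectReloc(bool IsFunctionRef, unsigned FixupBytes,
                             bool Is64Bit) {
  if (IsFunctionRef)
    return RelocFunction;
  if (FixupBytes == 4) {
    assert(!Is64Bit && "4-byte relocations only supported on wasm32");
    return RelocData;
  }
  assert(FixupBytes == 8 && Is64Bit &&
         "8-byte relocations only supported on wasm64");
  return RelocData;
}

int main() {
  assert(selectReloc(true, 4, false) == RelocFunction);
  assert(selectReloc(false, 8, true) == RelocData);
  return 0;
}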
+ ZeroDirective = "\t.skip\t"; + Data8bitsDirective = "\t.int8\t"; Data16bitsDirective = "\t.int16\t"; Data32bitsDirective = "\t.int32\t"; diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 7c6c79eb5db2..f409bd77442c 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCFixup.h" @@ -26,75 +27,66 @@ using namespace llvm; #define DEBUG_TYPE "mccodeemitter" +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + namespace { class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { - const MCRegisterInfo &MRI; - -public: - WebAssemblyMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri, - MCContext &) - : MRI(mri) {} + const MCInstrInfo &MCII; + const MCContext &Ctx; - ~WebAssemblyMCCodeEmitter() override {} - - /// TableGen'erated function for getting the binary encoding for an - /// instruction. + // Implementation generated by tablegen. uint64_t getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - /// Return binary encoding of operand. If the machine operand requires - /// relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; + +public: + WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) + : MCII(mcii), Ctx(ctx) {} }; } // end anonymous namespace MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx) { - return new WebAssemblyMCCodeEmitter(MCII, MRI, Ctx); -} - -unsigned WebAssemblyMCCodeEmitter::getMachineOpValue( - const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return MRI.getEncodingValue(MO.getReg()); - if (MO.isImm()) - return static_cast<unsigned>(MO.getImm()); - - assert(MO.isExpr()); - - assert(MO.getExpr()->getKind() == MCExpr::SymbolRef); - - assert(false && "FIXME: not implemented yet"); - - return 0; + return new WebAssemblyMCCodeEmitter(MCII, Ctx); } void WebAssemblyMCCodeEmitter::encodeInstruction( const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - assert(false && "FIXME: not implemented yet"); -} - -// Encode WebAssembly Memory Operand -uint64_t -WebAssemblyMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - assert(false && "FIXME: not implemented yet"); - return 0; + // FIXME: This is not the real binary encoding. This is an extremely + // over-simplified encoding where we just use uint64_t for everything. 
This + // is a temporary measure. + support::endian::Writer<support::little>(OS).write<uint64_t>(MI.getOpcode()); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (Desc.isVariadic()) + support::endian::Writer<support::little>(OS).write<uint64_t>( + MI.getNumOperands() - Desc.NumOperands); + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getReg()); + } else if (MO.isImm()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(MO.getImm()); + } else if (MO.isFPImm()) { + support::endian::Writer<support::little>(OS).write<double>(MO.getFPImm()); + } else if (MO.isExpr()) { + support::endian::Writer<support::little>(OS).write<uint64_t>(0); + Fixups.push_back(MCFixup::create( + (1 + MCII.get(MI.getOpcode()).isVariadic() + i) * sizeof(uint64_t), + MO.getExpr(), STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4, + MI.getLoc())); + ++MCNumFixups; + } else { + llvm_unreachable("unexpected operand kind"); + } + } + + ++MCNumEmitted; // Keep track of the # of mi's emitted. } #include "WebAssemblyGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 14cd295353d5..37000f1cd571 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -15,10 +15,10 @@ #include "WebAssemblyMCTargetDesc.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "WebAssemblyMCAsmInfo.h" +#include "WebAssemblyTargetStreamer.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -35,52 +35,89 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "WebAssemblyGenRegisterInfo.inc" -static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo & /*MRI*/, - const Triple &TT) { +static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/, + const Triple &TT) { return new WebAssemblyMCAsmInfo(TT); } -static MCInstrInfo *createWebAssemblyMCInstrInfo() { +static MCInstrInfo *createMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitWebAssemblyMCInstrInfo(X); return X; } -static MCStreamer *createWebAssemblyMCStreamer(const Triple &T, MCContext &Ctx, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); +static MCRegisterInfo *createMCRegisterInfo(const Triple & /*T*/) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitWebAssemblyMCRegisterInfo(X, 0); + return X; } -static MCInstPrinter * -createWebAssemblyMCInstPrinter(const Triple & /*T*/, unsigned SyntaxVariant, - const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) { +static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI) { assert(SyntaxVariant == 0); return new WebAssemblyInstPrinter(MAI, MII, MRI); } +static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo & /*MRI*/, + MCContext &Ctx) { + return createWebAssemblyMCCodeEmitter(MCII, Ctx); +} + +static MCAsmBackend 
*createAsmBackend(const Target & /*T*/, + const MCRegisterInfo & /*MRI*/, + const Triple &TT, StringRef /*CPU*/) { + return createWebAssemblyAsmBackend(TT); +} + +static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU, + StringRef FS) { + return createWebAssemblyMCSubtargetInfoImpl(TT, CPU, FS); +} + +static MCTargetStreamer * +createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) { + return new WebAssemblyTargetELFStreamer(S); +} + +static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter * /*InstPrint*/, + bool /*isVerboseAsm*/) { + return new WebAssemblyTargetAsmStreamer(S, OS); +} + // Force static initialization. extern "C" void LLVMInitializeWebAssemblyTargetMC() { for (Target *T : {&TheWebAssemblyTarget32, &TheWebAssemblyTarget64}) { // Register the MC asm info. - RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo); + RegisterMCAsmInfoFn X(*T, createMCAsmInfo); // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(*T, createWebAssemblyMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(*T, createMCInstrInfo); - // Register the object streamer - TargetRegistry::RegisterELFStreamer(*T, createWebAssemblyMCStreamer); + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(*T, createMCRegisterInfo); // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(*T, createMCInstPrinter); + + // Register the MC code emitter. + TargetRegistry::RegisterMCCodeEmitter(*T, createCodeEmitter); + + // Register the ASM Backend. + TargetRegistry::RegisterMCAsmBackend(*T, createAsmBackend); - // Register the MC code emitter - TargetRegistry::RegisterMCCodeEmitter(*T, createWebAssemblyMCCodeEmitter); + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(*T, createMCSubtargetInfo); - // Register the ASM Backend - TargetRegistry::RegisterMCAsmBackend(*T, createWebAssemblyAsmBackend); + // Register the object target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createObjectTargetStreamer); + // Register the asm target streamer. 
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createAsmTargetStreamer); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index e78f73e3da95..9bac4f82822a 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -15,40 +15,62 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" namespace llvm { -class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCRegisterInfo; class MCObjectWriter; -class MCStreamer; class MCSubtargetInfo; -class MCTargetStreamer; -class StringRef; class Target; class Triple; -class raw_ostream; class raw_pwrite_stream; extern Target TheWebAssemblyTarget32; extern Target TheWebAssemblyTarget64; MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU); +MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT); MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, uint8_t OSABI); +namespace WebAssembly { +enum OperandType { + /// Basic block label in a branch construct. + OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET, + /// Floating-point immediate. + OPERAND_FPIMM +}; + +/// WebAssembly-specific directive identifiers. +enum Directive { + // FIXME: This is not the real binary encoding. + DotParam = UINT64_MAX - 0, ///< .param + DotResult = UINT64_MAX - 1, ///< .result + DotLocal = UINT64_MAX - 2, ///< .local + DotEndFunc = UINT64_MAX - 3, ///< .endfunc +}; + +} // end namespace WebAssembly + +namespace WebAssemblyII { +enum { + // For variadic instructions, this flag indicates whether an operand + // in the variable_ops range is an immediate value. + VariableOpIsImmediate = (1 << 0), + // For immediate values in the variable_ops range, this flag indicates + // whether the value represents a control-flow label. + VariableOpImmediateIsLabel = (1 << 1), +}; +} // end namespace WebAssemblyII + } // end namespace llvm // Defines symbolic names for WebAssembly registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp new file mode 100644 index 000000000000..1d2822869a15 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -0,0 +1,94 @@ +//==-- WebAssemblyTargetStreamer.cpp - WebAssembly Target Streamer Methods --=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetStreamer.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "WebAssemblyMCTargetDesc.h" +#include "WebAssemblyTargetObjectFile.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +using namespace llvm; + +WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S) {} + +WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer( + MCStreamer &S, formatted_raw_ostream &OS) + : WebAssemblyTargetStreamer(S), OS(OS) {} + +WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S) + : WebAssemblyTargetStreamer(S) {} + +static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) { + bool First = true; + for (MVT Type : Types) { + if (First) + First = false; + else + OS << ", "; + OS << WebAssembly::TypeToString(Type); + } + OS << '\n'; +} + +void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) { + OS << "\t.param \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) { + OS << "\t.result \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) { + OS << "\t.local \t"; + PrintTypes(OS, Types); +} + +void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } + +// FIXME: What follows is not the real binary encoding. + +static void EncodeTypes(MCStreamer &Streamer, ArrayRef<MVT> Types) { + Streamer.EmitIntValue(Types.size(), sizeof(uint64_t)); + for (MVT Type : Types) + Streamer.EmitIntValue(Type.SimpleTy, sizeof(uint64_t)); +} + +void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotParam, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotResult, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) { + Streamer.EmitIntValue(WebAssembly::DotLocal, sizeof(uint64_t)); + EncodeTypes(Streamer, Types); +} + +void WebAssemblyTargetELFStreamer::emitEndFunc() { + Streamer.EmitIntValue(WebAssembly::DotEndFunc, sizeof(uint64_t)); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h new file mode 100644 index 000000000000..c66a51574efb --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -0,0 +1,68 @@ +//==-- WebAssemblyTargetStreamer.h - WebAssembly Target Streamer -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares WebAssembly-specific target streamer classes. +/// These are for implementing support for target-specific assembly directives. 
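Both streamer flavors above emit the same logical record for a directive such as .param: the assembly streamer prints the types as text, while the ELF streamer writes the interim binary form used elsewhere in this patch: a 64-bit directive marker, a 64-bit count, then one 64-bit word per type. A small stand-alone writer for that binary layout; the marker and type codes below are illustrative only (the real values are WebAssembly::DotParam and MVT::SimpleValueType):

#include <cstdint>
#include <vector>

static void writeWord(std::vector<uint8_t> &Out, uint64_t W) {
  for (unsigned i = 0; i != 8; ++i)
    Out.push_back(uint8_t(W >> (8 * i)));  // little-endian, one byte at a time
}

// Mirrors WebAssemblyTargetELFStreamer::emitParam: marker, count, then types.
static void emitParamRecord(std::vector<uint8_t> &Out,
                            const std::vector<uint64_t> &TypeCodes) {
  const uint64_t DotParamMarker = ~uint64_t(0);  // stand-in for DotParam
  writeWord(Out, DotParamMarker);
  writeWord(Out, TypeCodes.size());
  for (uint64_t T : TypeCodes)
    writeWord(Out, T);
}

int main() {
  std::vector<uint8_t> Out;
  emitParamRecord(Out, {1, 3});  // e.g. ".param i32, f64" with made-up codes
  return Out.size() == 4 * sizeof(uint64_t) ? 0 : 1;
}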
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H + +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class MCELFStreamer; + +/// WebAssembly-specific streamer interface, to implement support +/// WebAssembly-specific assembly directives. +class WebAssemblyTargetStreamer : public MCTargetStreamer { +public: + explicit WebAssemblyTargetStreamer(MCStreamer &S); + + /// .param + virtual void emitParam(ArrayRef<MVT> Types) = 0; + /// .result + virtual void emitResult(ArrayRef<MVT> Types) = 0; + /// .local + virtual void emitLocal(ArrayRef<MVT> Types) = 0; + /// .endfunc + virtual void emitEndFunc() = 0; +}; + +/// This part is for ascii assembly output +class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer { + formatted_raw_ostream &OS; + +public: + WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +/// This part is for ELF object output +class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer { +public: + explicit WebAssemblyTargetELFStreamer(MCStreamer &S); + + void emitParam(ArrayRef<MVT> Types) override; + void emitResult(ArrayRef<MVT> Types) override; + void emitLocal(ArrayRef<MVT> Types) override; + void emitEndFunc() override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 0d2b4d9debb9..45ac99d90ed9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -17,6 +17,7 @@ #include "WebAssembly.h" #include "InstPrinter/WebAssemblyInstPrinter.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "MCTargetDesc/WebAssemblyTargetStreamer.h" #include "WebAssemblyMCInstLower.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyRegisterInfo.h" @@ -69,7 +70,9 @@ private: void EmitJumpTableInfo() override; void EmitConstantPool() override; void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; void EmitInstruction(const MachineInstr *MI) override; + const MCExpr *lowerConstant(const Constant *CV) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; @@ -80,6 +83,7 @@ private: MVT getRegType(unsigned RegNo) const; const char *toString(MVT VT) const; std::string regToString(const MachineOperand &MO); + WebAssemblyTargetStreamer *getTargetStreamer(); }; } // end anonymous namespace @@ -90,9 +94,9 @@ private: MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { const TargetRegisterClass *TRC = - TargetRegisterInfo::isVirtualRegister(RegNo) ? - MRI->getRegClass(RegNo) : - MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); + TargetRegisterInfo::isVirtualRegister(RegNo) + ? 
MRI->getRegClass(RegNo) + : MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) if (TRC->hasType(T)) return T; @@ -101,6 +105,10 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { return MVT::Other; } +const char *WebAssemblyAsmPrinter::toString(MVT VT) const { + return WebAssembly::TypeToString(VT); +} + std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { unsigned RegNo = MO.getReg(); assert(TargetRegisterInfo::isVirtualRegister(RegNo) && @@ -111,8 +119,10 @@ std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { return '$' + utostr(WAReg); } -const char *WebAssemblyAsmPrinter::toString(MVT VT) const { - return WebAssembly::TypeToString(VT); +WebAssemblyTargetStreamer * +WebAssemblyAsmPrinter::getTargetStreamer() { + MCTargetStreamer *TS = OutStreamer->getTargetStreamer(); + return static_cast<WebAssemblyTargetStreamer *>(TS); } //===----------------------------------------------------------------------===// @@ -145,29 +155,20 @@ static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, } void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { - if (!MFI->getParams().empty()) { - MCInst Param; - Param.setOpcode(WebAssembly::PARAM); - for (MVT VT : MFI->getParams()) - Param.addOperand(MCOperand::createImm(VT.SimpleTy)); - EmitToStreamer(*OutStreamer, Param); - } + if (!MFI->getParams().empty()) + getTargetStreamer()->emitParam(MFI->getParams()); SmallVector<MVT, 4> ResultVTs; const Function &F(*MF->getFunction()); ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs); + // If the return type needs to be legalized it will get converted into // passing a pointer. - if (ResultVTs.size() == 1) { - MCInst Result; - Result.setOpcode(WebAssembly::RESULT); - Result.addOperand(MCOperand::createImm(ResultVTs.front().SimpleTy)); - EmitToStreamer(*OutStreamer, Result); - } + if (ResultVTs.size() == 1) + getTargetStreamer()->emitResult(ResultVTs); bool AnyWARegs = false; - MCInst Local; - Local.setOpcode(WebAssembly::LOCAL); + SmallVector<MVT, 16> LocalTypes; for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx); unsigned WAReg = MFI->getWAReg(VReg); @@ -180,22 +181,26 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { // Don't declare stackified registers. if (int(WAReg) < 0) continue; - Local.addOperand(MCOperand::createImm(getRegType(VReg).SimpleTy)); + LocalTypes.push_back(getRegType(VReg)); AnyWARegs = true; } auto &PhysRegs = MFI->getPhysRegs(); for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) { if (PhysRegs[PReg] == -1U) continue; - Local.addOperand(MCOperand::createImm(getRegType(PReg).SimpleTy)); + LocalTypes.push_back(getRegType(PReg)); AnyWARegs = true; } if (AnyWARegs) - EmitToStreamer(*OutStreamer, Local); + getTargetStreamer()->emitLocal(LocalTypes); AsmPrinter::EmitFunctionBodyStart(); } +void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() { + getTargetStreamer()->emitEndFunc(); +} + void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); @@ -207,10 +212,6 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { // These represent values which are live into the function entry, so there's // no instruction to emit. 
break; - case WebAssembly::LOOP_END: - // This is a no-op which just exists to tell AsmPrinter.cpp that there's a - // fallthrough which nevertheless requires a label for the destination here. - break; default: { WebAssemblyMCInstLower MCInstLowering(OutContext, *this); MCInst TmpInst; @@ -221,6 +222,14 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } +const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { + if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) + if (GV->getValueType()->isFunctionTy()) + return MCSymbolRefExpr::create( + getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); + return AsmPrinter::lowerConstant(CV); +} + bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index e9671ee07e69..a39349c562fd 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -34,6 +34,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -256,7 +257,8 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { /// code) for a branch instruction to both branch to a block and fallthrough /// to it, so we check the actual branch operands to see if there are any /// explicit mentions. -static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, MachineBasicBlock *MBB) { +static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, + MachineBasicBlock *MBB) { for (MachineInstr &MI : Pred->terminators()) for (MachineOperand &MO : MI.explicit_operands()) if (MO.isMBB() && MO.getMBB() == MBB) @@ -325,13 +327,21 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, InsertPos = Header->getFirstTerminator(); while (InsertPos != Header->begin() && prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) && - prev(InsertPos)->getOpcode() != WebAssembly::LOOP) + prev(InsertPos)->getOpcode() != WebAssembly::LOOP && + prev(InsertPos)->getOpcode() != WebAssembly::END_BLOCK && + prev(InsertPos)->getOpcode() != WebAssembly::END_LOOP) --InsertPos; } // Add the BLOCK. - BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)) - .addMBB(&MBB); + BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)); + + // Mark the end of the block. + InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::END_BLOCK)); // Track the farthest-spanning scope that ends at this point. int Number = MBB.getNumber(); @@ -341,10 +351,11 @@ static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, } /// Insert a LOOP marker for a loop starting at MBB (if it's a loop header). 
-static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, - SmallVectorImpl<MachineBasicBlock *> &ScopeTops, - const WebAssemblyInstrInfo &TII, - const MachineLoopInfo &MLI) { +static void PlaceLoopMarker( + MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + DenseMap<const MachineInstr *, const MachineBasicBlock *> &LoopTops, + const WebAssemblyInstrInfo &TII, const MachineLoopInfo &MLI) { MachineLoop *Loop = MLI.getLoopFor(&MBB); if (!Loop || Loop->getHeader() != &MBB) return; @@ -361,14 +372,19 @@ static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, Iter = next(MachineFunction::iterator(Bottom)); } MachineBasicBlock *AfterLoop = &*Iter; - BuildMI(MBB, MBB.begin(), DebugLoc(), TII.get(WebAssembly::LOOP)) - .addMBB(AfterLoop); - // Emit a special no-op telling the asm printer that we need a label to close - // the loop scope, even though the destination is only reachable by - // fallthrough. - if (!Bottom->back().isBarrier()) - BuildMI(*Bottom, Bottom->end(), DebugLoc(), TII.get(WebAssembly::LOOP_END)); + // Mark the beginning of the loop (after the end of any existing loop that + // ends here). + auto InsertPos = MBB.begin(); + while (InsertPos != MBB.end() && + InsertPos->getOpcode() == WebAssembly::END_LOOP) + ++InsertPos; + BuildMI(MBB, InsertPos, DebugLoc(), TII.get(WebAssembly::LOOP)); + + // Mark the end of the loop. + MachineInstr *End = BuildMI(*AfterLoop, AfterLoop->begin(), DebugLoc(), + TII.get(WebAssembly::END_LOOP)); + LoopTops[End] = &MBB; assert((!ScopeTops[AfterLoop->getNumber()] || ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) && @@ -377,6 +393,19 @@ static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, ScopeTops[AfterLoop->getNumber()] = &MBB; } +static unsigned +GetDepth(const SmallVectorImpl<const MachineBasicBlock *> &Stack, + const MachineBasicBlock *MBB) { + unsigned Depth = 0; + for (auto X : reverse(Stack)) { + if (X == MBB) + break; + ++Depth; + } + assert(Depth < Stack.size() && "Branch destination should be in scope"); + return Depth; +} + /// Insert LOOP and BLOCK markers at appropriate places. static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, const WebAssemblyInstrInfo &TII, @@ -388,25 +417,57 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, // we may insert at the end. SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1); + // For eacn LOOP_END, the corresponding LOOP. + DenseMap<const MachineInstr *, const MachineBasicBlock *> LoopTops; + for (auto &MBB : MF) { // Place the LOOP for MBB if MBB is the header of a loop. - PlaceLoopMarker(MBB, MF, ScopeTops, TII, MLI); + PlaceLoopMarker(MBB, MF, ScopeTops, LoopTops, TII, MLI); // Place the BLOCK for MBB if MBB is branched to from above. PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT); } -} -#ifndef NDEBUG -static bool -IsOnStack(const SmallVectorImpl<std::pair<MachineBasicBlock *, bool>> &Stack, - const MachineBasicBlock *MBB) { - for (const auto &Pair : Stack) - if (Pair.first == MBB) - return true; - return false; + // Now rewrite references to basic blocks to be depth immediates. 
+ SmallVector<const MachineBasicBlock *, 8> Stack; + for (auto &MBB : reverse(MF)) { + for (auto &MI : reverse(MBB)) { + switch (MI.getOpcode()) { + case WebAssembly::BLOCK: + assert(ScopeTops[Stack.back()->getNumber()] == &MBB && + "Block should be balanced"); + Stack.pop_back(); + break; + case WebAssembly::LOOP: + assert(Stack.back() == &MBB && "Loop top should be balanced"); + Stack.pop_back(); + Stack.pop_back(); + break; + case WebAssembly::END_BLOCK: + Stack.push_back(&MBB); + break; + case WebAssembly::END_LOOP: + Stack.push_back(&MBB); + Stack.push_back(LoopTops[&MI]); + break; + default: + if (MI.isTerminator()) { + // Rewrite MBB operands to be depth immediates. + SmallVector<MachineOperand, 4> Ops(MI.operands()); + while (MI.getNumOperands() > 0) + MI.RemoveOperand(MI.getNumOperands() - 1); + for (auto MO : Ops) { + if (MO.isMBB()) + MO = MachineOperand::CreateImm(GetDepth(Stack, MO.getMBB())); + MI.addOperand(MF, MO); + } + } + break; + } + } + } + assert(Stack.empty() && "Control flow should be balanced"); } -#endif bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** CFG Stackifying **********\n" @@ -415,7 +476,9 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { const auto &MLI = getAnalysis<MachineLoopInfo>(); auto &MDT = getAnalysis<MachineDominatorTree>(); + // Liveness is not tracked for EXPR_STACK physreg. const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + MF.getRegInfo().invalidateLiveness(); // RPO sorting needs all loops to be single-entry. EliminateMultipleEntryLoops(MF, MLI); @@ -426,43 +489,5 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { // Place the BLOCK and LOOP markers to indicate the beginnings of scopes. PlaceMarkers(MF, MLI, TII, MDT); -#ifndef NDEBUG - // Verify that block and loop beginnings and endings are in LIFO order, and - // that all references to blocks are to blocks on the stack at the point of - // the reference. - SmallVector<std::pair<MachineBasicBlock *, bool>, 0> Stack; - for (auto &MBB : MF) { - while (!Stack.empty() && Stack.back().first == &MBB) - if (Stack.back().second) { - assert(Stack.size() >= 2); - Stack.pop_back(); - Stack.pop_back(); - } else { - assert(Stack.size() >= 1); - Stack.pop_back(); - } - for (auto &MI : MBB) - switch (MI.getOpcode()) { - case WebAssembly::LOOP: - Stack.push_back(std::make_pair(&MBB, false)); - Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), true)); - break; - case WebAssembly::BLOCK: - Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), false)); - break; - default: - // Verify that all referenced blocks are in scope. A reference to a - // block with a negative number is invalid, but can happen with inline - // asm, so we shouldn't assert on it, but instead let CodeGen properly - // fail on it. 
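The rewriting loop above is the heart of this change: once the BLOCK/LOOP/END_* markers are placed, every branch's basic-block operand is replaced by an immediate giving how many enclosing scopes separate the branch from its target, counted from the innermost scope outward. A toy model of GetDepth over a plain vector of scope names, assuming scopes Outer, Middle, and Inner were pushed in that order:

#include <cassert>
#include <string>
#include <vector>

// Depth 0 is the innermost (most recently pushed) scope, as in GetDepth above.
static unsigned getDepth(const std::vector<std::string> &Stack,
                         const std::string &Target) {
  unsigned Depth = 0;
  for (auto I = Stack.rbegin(), E = Stack.rend(); I != E; ++I) {
    if (*I == Target)
      return Depth;
    ++Depth;
  }
  assert(false && "branch destination should be in scope");
  return Depth;
}

int main() {
  std::vector<std::string> Stack{"Outer", "Middle", "Inner"};
  assert(getDepth(Stack, "Inner") == 0);   // branch targets the current scope
  assert(getDepth(Stack, "Middle") == 1);
  assert(getDepth(Stack, "Outer") == 2);   // branch exits all three scopes
  return 0;
}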
- for (const MachineOperand &MO : MI.explicit_operands()) - if (MO.isMBB() && MO.getMBB()->getNumber() >= 0) - assert(IsOnStack(Stack, MO.getMBB())); - break; - } - } - assert(Stack.empty()); -#endif - return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7a89f788c1ad..e9933b092988 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -573,7 +573,8 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, SDLoc DL(Op); const auto *GA = cast<GlobalAddressSDNode>(Op); EVT VT = Op.getValueType(); - assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + assert(GA->getTargetFlags() == 0 && + "Unexpected target flags on generic GlobalAddressSDNode"); if (GA->getAddressSpace() != 0) fail(DL, DAG, "WebAssembly only expects the 0 address space"); return DAG.getNode( @@ -587,9 +588,16 @@ WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, SDLoc DL(Op); const auto *ES = cast<ExternalSymbolSDNode>(Op); EVT VT = Op.getValueType(); - assert(ES->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + assert(ES->getTargetFlags() == 0 && + "Unexpected target flags on generic ExternalSymbolSDNode"); + // Set the TargetFlags to 0x1 which indicates that this is a "function" + // symbol rather than a data symbol. We do this unconditionally even though + // we don't know anything about the symbol other than its name, because all + // external symbols used in target-independent SelectionDAG code are for + // functions. return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, - DAG.getTargetExternalSymbol(ES->getSymbol(), VT)); + DAG.getTargetExternalSymbol(ES->getSymbol(), VT, + /*TargetFlags=*/0x1)); } SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td index 05efe8903413..fda95953db81 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -41,28 +41,33 @@ let Defs = [ARGUMENTS] in { // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode // currently. +// Set TSFlags{0} to 1 to indicate that the variable_ops are immediates. +// Set TSFlags{1} to 1 to indicate that the immediates represent labels. let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), [(WebAssemblytableswitch I32:$index, bb:$default)], - "tableswitch\t$index, $default">; + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), [(WebAssemblytableswitch I64:$index, bb:$default)], - "tableswitch\t$index, $default">; + "tableswitch\t$index, $default"> { + let TSFlags{0} = 1; + let TSFlags{1} = 1; +} } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 -// Placemarkers to indicate the start of a block or loop scope. These +// Placemarkers to indicate the start or end of a block or loop scope. These // use/clobber EXPR_STACK to prevent them from being moved into the middle of // an expression tree. 
let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in { -def BLOCK : I<(outs), (ins bb_op:$dst), [], "block \t$dst">; -def LOOP : I<(outs), (ins bb_op:$dst), [], "loop \t$dst">; +def BLOCK : I<(outs), (ins), [], "block">; +def LOOP : I<(outs), (ins), [], "loop">; +def END_BLOCK : I<(outs), (ins), [], "end_block">; +def END_LOOP : I<(outs), (ins), [], "end_loop">; } // Uses = [EXPR_STACK], Defs = [EXPR_STACK] -// No-op to indicate to the AsmPrinter that a loop ends here, so a -// basic block label is needed even if it wouldn't otherwise appear so. -let isTerminator = 1, hasCtrlDep = 1 in -def LOOP_END : I<(outs), (ins), []>; - multiclass RETURN<WebAssemblyRegClass vt> { def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)], "return \t$val">; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 5e7663cdb506..028e9af0834f 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -74,6 +74,9 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_IF: if (HaveCond) return true; + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(1).isMBB()) + return true; Cond.push_back(MachineOperand::CreateImm(true)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); @@ -82,12 +85,18 @@ bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, case WebAssembly::BR_UNLESS: if (HaveCond) return true; + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(1).isMBB()) + return true; Cond.push_back(MachineOperand::CreateImm(false)); Cond.push_back(MI.getOperand(0)); TBB = MI.getOperand(1).getMBB(); HaveCond = true; break; case WebAssembly::BR: + // If we're running after CFGStackify, we can't optimize further. + if (!MI.getOperand(0).isMBB()) + return true; if (!HaveCond) TBB = MI.getOperand(0).getMBB(); else diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index f0b4ce7caf51..2e682a475471 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -66,8 +66,18 @@ def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// +let OperandNamespace = "WebAssembly" in { + +let OperandType = "OPERAND_BASIC_BLOCK" in def bb_op : Operand<OtherVT>; +let OperandType = "OPERAND_FPIMM" in { +def f32imm_op : Operand<f32>; +def f64imm_op : Operand<f64>; +} // OperandType = "OPERAND_FPIMM" + +} // OperandNamespace = "WebAssembly" + //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. 
//===----------------------------------------------------------------------===// @@ -120,31 +130,20 @@ def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm), def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm), [(set I64:$res, imm:$imm)], "i64.const\t$res, $imm">; -def CONST_F32 : I<(outs F32:$res), (ins f32imm:$imm), +def CONST_F32 : I<(outs F32:$res), (ins f32imm_op:$imm), [(set F32:$res, fpimm:$imm)], "f32.const\t$res, $imm">; -def CONST_F64 : I<(outs F64:$res), (ins f64imm:$imm), +def CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), [(set F64:$res, fpimm:$imm)], "f64.const\t$res, $imm">; } // isMoveImm = 1 } // Defs = [ARGUMENTS] -def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)), - (CONST_I32 tglobaladdr:$dst)>; -def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)), - (CONST_I32 texternalsym:$dst)>; -def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)), - (CONST_I32 tjumptable:$dst)>; - -let Defs = [ARGUMENTS] in { - -// Function signature and local variable declaration "instructions". -def PARAM : I<(outs), (ins variable_ops), [], ".param \t">; -def RESULT : I<(outs), (ins variable_ops), [], ".result \t">; -def LOCAL : I<(outs), (ins variable_ops), [], ".local \t">; - -} // Defs = [ARGUMENTS] +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), + (CONST_I32 tglobaladdr:$addr)>; +def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), + (CONST_I32 texternalsym:$addr)>; //===----------------------------------------------------------------------===// // Additional sets of instructions. diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 74ec45d58644..b39ac5212f87 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -24,10 +24,25 @@ // WebAssembly constant offsets are performed as unsigned with infinite // precision, so we need to check for NoUnsignedWrap so that we don't fold an // offset for an add that needs wrapping. -def regPlusImm : PatFrag<(ops node:$off, node:$addr), +def regPlusImm : PatFrag<(ops node:$addr, node:$off), (add node:$addr, node:$off), [{ return N->getFlags()->hasNoUnsignedWrap(); }]>; +// GlobalAddresses are conceptually unsigned values, so we can also fold them +// into immediate values as long as their offsets are non-negative. +def regPlusGA : PatFrag<(ops node:$addr, node:$off), + (add node:$addr, node:$off), + [{ + return N->getFlags()->hasNoUnsignedWrap() || + (N->getOperand(1)->getOpcode() == WebAssemblyISD::Wrapper && + isa<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) && + cast<GlobalAddressSDNode>(N->getOperand(1)->getOperand(0)) + ->getOffset() >= 0); +}]>; + +// We don't need a regPlusES because external symbols never have constant +// offsets folded into them, so we can just use add. + let Defs = [ARGUMENTS] in { // Basic load. @@ -49,29 +64,33 @@ def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; // Select loads with a constant offset. 
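The NoUnsignedWrap and non-negative-offset requirements in the regPlusImm/regPlusGA fragments above guard a real semantic difference: a wasm load adds its constant offset to the base address as an unbounded unsigned value, so a negative offset cannot simply be folded out of a wrapping 32-bit add. A small arithmetic illustration with made-up values:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Base = 8;
  int32_t Off = -4;

  // What the 32-bit 'add' node computes: it wraps, yielding address 4.
  uint32_t WrappedAddr = Base + uint32_t(Off);

  // What folding Off into the load's unsigned offset field would mean:
  // 8 + 0xFFFFFFFC evaluated without wrapping, i.e. 4294967300.
  uint64_t FoldedAddr = uint64_t(Base) + uint64_t(uint32_t(Off));

  std::printf("wrapped add: %u, folded offset: %llu\n", WrappedAddr,
              (unsigned long long)FoldedAddr);
  return WrappedAddr == 4 && FoldedAddr == 4294967300ULL ? 0 : 1;
}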
-def : Pat<(i32 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_I32 imm:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_I64 imm:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(f32 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_F32 imm:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm imm:$off, I32:$addr))), +def : Pat<(f64 (load (regPlusImm I32:$addr, imm:$off))), (LOAD_F64 imm:$off, $addr)>; -def : Pat<(i32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_I64 tglobaladdr:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(f32 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_F32 tglobaladdr:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(f64 (load (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD_F64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_I64 texternalsym:$off, $addr)>; -def : Pat<(f32 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(f32 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_F32 texternalsym:$off, $addr)>; -def : Pat<(f64 (load (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(f64 (load (add I32:$addr, (WebAssemblywrapper texternalsym:$off)))), (LOAD_F64 texternalsym:$off, $addr)>; // Select loads with just a constant offset. @@ -135,65 +154,85 @@ def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; // Select extending loads with a constant offset. 
-def : Pat<(i32 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_S_I32 imm:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I32 imm:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_S_I32 imm:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I32 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_S_I64 imm:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (zextloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_U_I64 imm:$off, $addr)>; -def : Pat<(i32 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_S_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_S_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_S_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm tglobaladdr:$off, 
I32:$addr))), +def : Pat<(i64 (zextloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_S_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_S_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (sextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (sextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_S_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (zextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (zextloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_U_I64 texternalsym:$off, $addr)>; // Select extending loads with just a constant offset. @@ -259,35 +298,45 @@ def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; // Select "don't care" extending loads with a constant offset. 
-def : Pat<(i32 (extloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I32 imm:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I32 imm:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (regPlusImm I32:$addr, imm:$off))), (LOAD8_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (regPlusImm I32:$addr, imm:$off))), (LOAD16_U_I64 imm:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm imm:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (regPlusImm I32:$addr, imm:$off))), (LOAD32_U_I64 imm:$off, $addr)>; -def : Pat<(i32 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I32 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD8_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD16_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off)))), (LOAD32_U_I64 tglobaladdr:$off, $addr)>; -def : Pat<(i32 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i32 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i32 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I32 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi8 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD8_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi16 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD16_U_I64 texternalsym:$off, $addr)>; -def : Pat<(i64 (extloadi32 (regPlusImm texternalsym:$off, I32:$addr))), +def : Pat<(i64 (extloadi32 (add I32:$addr, + (WebAssemblywrapper texternalsym:$off)))), (LOAD32_U_I64 texternalsym:$off, $addr)>; // Select "don't care" extending loads with just a constant offset. @@ -343,29 +392,37 @@ def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>; def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>; // Select stores with a constant offset. 
-def : Pat<(store I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store F32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_F32 imm:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(store F64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE_F64 imm:$off, I32:$addr, F64:$val)>; -def : Pat<(store I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store I32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store I64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store F32:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(store F64:$val, (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>; -def : Pat<(store I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(store I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store I64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(store F32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store F32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>; -def : Pat<(store F64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(store F64:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>; // Select stores with just a constant offset. @@ -423,35 +480,54 @@ def : Pat<(truncstorei32 I64:$val, I32:$addr), (STORE32_I64 0, I32:$addr, I64:$val)>; // Select truncating stores with a constant offset. 
-def : Pat<(truncstorei8 I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE8_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, (regPlusImm I32:$addr, imm:$off)), (STORE16_I32 imm:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE8_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE16_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm imm:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, (regPlusImm I32:$addr, imm:$off)), (STORE32_I64 imm:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei8 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, + (regPlusGA I32:$addr, + (WebAssemblywrapper tglobaladdr:$off))), (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei8 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei8 I32:$val, (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei16 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei16 I32:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>; -def : Pat<(truncstorei8 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei8 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei16 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei16 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>; -def : Pat<(truncstorei32 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), +def : Pat<(truncstorei32 I64:$val, + (add I32:$addr, + (WebAssemblywrapper texternalsym:$off))), (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>; // Select truncating stores with just a constant offset. 
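For context, a minimal C++-level sketch (not part of this diff; the function and array names are purely illustrative) of the kind of address computation the rewritten WebAssembly patterns fold. An access to a global array element computes index register plus symbolic base, and the regPlusGA/add forms above let the wrapped global address land in the load or store instruction's offset field instead of a separate i32.add:

extern int table[256];   // hypothetical global; its address is the tglobaladdr offset

int load_elem(int i) {
  // The address is (index register) + (address of 'table'); with the patterns
  // above, the symbolic part folds into the offset operand of the load.
  return table[i];
}

The same shape applies to the texternalsym and truncating-store patterns; only the matched node (regPlusGA vs. plain add) and the selected instruction differ.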
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index a953f8247006..022a448590ec 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -36,15 +36,17 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); } -MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags"); +MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, + int64_t Offset, + bool IsFunc) const { + MCSymbolRefExpr::VariantKind VK = + IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION + : MCSymbolRefExpr::VK_None; + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); - const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - - int64_t Offset = MO.getOffset(); if (Offset != 0) { - assert(!MO.isJTI() && "Unexpected offset with jump table index"); + if (IsFunc) + report_fatal_error("Function addresses with offsets not supported"); Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); } @@ -64,6 +66,9 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, default: MI->dump(); llvm_unreachable("unknown operand type"); + case MachineOperand::MO_MachineBasicBlock: + MI->dump(); + llvm_unreachable("MachineBasicBlock operand should have been rewritten"); case MachineOperand::MO_Register: { // Ignore all implicit register operands. if (MO.isImplicit()) @@ -89,15 +94,19 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, llvm_unreachable("unknown floating point immediate type"); break; } - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::createExpr( - MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); - break; case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + assert(MO.getTargetFlags() == 0 && + "WebAssembly does not use target flags on GlobalAddresses"); + MCOp = LowerSymbolOperand(GetGlobalAddressSymbol(MO), MO.getOffset(), + MO.getGlobal()->getValueType()->isFunctionTy()); break; case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + // The target flag indicates whether this is a symbol for a + // variable or a function. 
+ assert((MO.getTargetFlags() & -2) == 0 && + "WebAssembly uses only one target flag bit on ExternalSymbols"); + MCOp = LowerSymbolOperand(GetExternalSymbolSymbol(MO), /*Offset=*/0, + MO.getTargetFlags() & 1); break; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h index 6d704704f576..ab4ba1c28d53 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -31,9 +31,10 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { MCContext &Ctx; AsmPrinter &Printer; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(MCSymbol *Sym, int64_t Offset, + bool IsFunc) const; public: WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer) diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 89ef5cdb2bef..537c147e6142 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -147,8 +147,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : reverse(MBB)) { - MachineInstr *Insert = &MI; + // Don't use a range-based for loop, because we modify the list as we're + // iterating over it and the end iterator may change. + for (auto MII = MBB.rbegin(); MII != MBB.rend(); ++MII) { + MachineInstr *Insert = &*MII; // Don't nest anything inside a phi. if (Insert->getOpcode() == TargetOpcode::PHI) break; @@ -221,7 +223,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Insert = Def; } if (AnyStackified) - ImposeStackOrdering(&MI); + ImposeStackOrdering(&*MII); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index dcada45f96d1..90d8dda530ba 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -61,17 +61,23 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex( MachineFunction &MF = *MBB.getParent(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); const MachineFrameInfo& MFI = *MF.getFrameInfo(); - int FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); + int64_t FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); if (MI.mayLoadOrStore()) { // If this is a load or store, make it relative to SP and fold the frame - // offset directly in - assert(MI.getOperand(1).getImm() == 0 && - "Can't eliminate FI yet if offset is already set"); - MI.getOperand(1).setImm(FrameOffset); + // offset directly in. + assert(FrameOffset >= 0 && MI.getOperand(1).getImm() >= 0); + int64_t Offset = MI.getOperand(1).getImm() + FrameOffset; + + if (static_cast<uint64_t>(Offset) > std::numeric_limits<uint32_t>::max()) { + // If this happens the program is invalid, but better to error here than + // generate broken code. 
+ report_fatal_error("Memory offset field overflow"); + } + MI.getOperand(1).setImm(Offset); MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false); } else { - // Otherwise create an i32.add SP, offset and make it the operand + // Otherwise create an i32.add SP, offset and make it the operand. auto &MRI = MF.getRegInfo(); const auto *TII = MF.getSubtarget().getInstrInfo(); diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index e31ea46de9f5..b290b4bf7440 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -45,8 +45,9 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT.isArch64Bit() ? "e-p:64:64-i64:64-n32:64-S128" - : "e-p:32:32-i64:64-n32:64-S128", + : LLVMTargetMachine(T, + TT.isArch64Bit() ? "e-m:e-p:64:64-i64:64-n32:64-S128" + : "e-m:e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<WebAssemblyTargetObjectFile>()) { // WebAssembly type-checks expressions, but a noreturn function with a return diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt index 92ecde3f90d6..91b3fff05dca 100644 --- a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt +++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -5,23 +5,6 @@ pr38151.c va-arg-22.c -# WebAssemblyRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator, int, unsigned int, llvm::RegScavenger *) const: Assertion `MI.getOperand(1).getImm() == 0 && "Can't eliminate FI yet if offset is already set"' -20030313-1.c -20030916-1.c -20031012-1.c -20041126-1.c -20060420-1.c -20071202-1.c -20120808-1.c -pr20527-1.c -pr27073.c -pr36339.c -pr37573.c -pr43236.c -pr43835.c -pr45070.c -pr51933.c - # TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. struct-ret-1.c va-arg-11.c @@ -140,8 +123,6 @@ pr38051.c pr39100.c pr39339.c -pr40022.c -pr40657.c pr43987.c diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index fbec6626d99d..01e65b89f480 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -29,7 +29,7 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); /// This pass initializes a global base register for PIC on x86-32. -FunctionPass* createX86GlobalBaseRegPass(); +FunctionPass *createX86GlobalBaseRegPass(); /// This pass combines multiple accesses to local-dynamic TLS variables so that /// the TLS base address for the module is only fetched once per execution path @@ -49,12 +49,13 @@ FunctionPass *createX86IssueVZeroUpperPass(); /// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// Return a a pass that selectively replaces certain instructions (like add, +/// Return a pass that selectively replaces certain instructions (like add, /// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA /// instructions, in order to eliminate execution delays in some processors. 
FunctionPass *createX86FixupLEAs(); -/// Return a pass that removes redundant address recalculations. +/// Return a pass that removes redundant LEA instructions and redundant address +/// recalculations. FunctionPass *createX86OptimizeLEAs(); /// Return a pass that optimizes the code-size of x86 call sequences. This is diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 54d88cbb244e..e8b96e74a7af 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -831,6 +831,12 @@ def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, R8, R9, R10, R11)>; +// CSRs that are handled by prologue, epilogue. +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>; + +// CSRs that are handled explicitly via copies. +def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 629d4d3565f2..f48b47934e03 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1002,6 +1002,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index d31aab0fa141..1ec93b5f2d23 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -265,7 +265,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } - } + } else if (!Subtarget->is64Bit()) + setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes @@ -2310,6 +2311,18 @@ X86TargetLowering::LowerReturn(SDValue Chain, DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (X86::GR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + RetOps[0] = Chain; // Update chain. // Add the flag if we have it. 
@@ -3907,6 +3920,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: + case X86ISD::INSERTPS: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: @@ -4157,6 +4171,35 @@ static bool hasFPCMov(unsigned X86CC) { } } + +bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + + const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); + if (!IntrData) + return false; + + switch (IntrData->Type) { + case LOADA: + case LOADU: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(I.getType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1); + Info.vol = false; + Info.readMem = true; + Info.writeMem = false; + return true; + } + default: + break; + } + + return false; +} + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -4743,8 +4786,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, /// uses one source. Note that this will set IsUnary for shuffles which use a /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. -/// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. -static bool getTargetShuffleMask(SDNode *N, MVT VT, +static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; @@ -4761,6 +4803,11 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::INSERTPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -4870,10 +4917,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); - // Mask only contains negative index if an element is zero. - if (std::any_of(Mask.begin(), Mask.end(), - [](int M){ return M == SM_SentinelZero; })) - return false; + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); @@ -5008,6 +5052,12 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (Mask.empty()) return false; + // Check if we're getting a shuffle mask with zero'd elements. + if (!AllowSentinelZero) + if (std::any_of(Mask.begin(), Mask.end(), + [](int M){ return M == SM_SentinelZero; })) + return false; + // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. @@ -5046,19 +5096,19 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, // Recurse into target specific vector shuffles to find scalars. 
if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); - unsigned NumElems = ShufVT.getVectorNumElements(); + int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, false, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; - if (Elt < 0) + if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufVT.getVectorElementType()); - SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) - : N->getOperand(1); + assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); + SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -8165,6 +8215,13 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; + MVT BroadcastVT = VT; + + // Peek through any bitcast (only useful for loads). + SDValue BC = V; + while (BC.getOpcode() == ISD::BITCAST) + BC = BC.getOperand(0); + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { @@ -8174,13 +8231,17 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) { + // 32-bit targets need to load i64 as a f64 and then bitcast the result. + if (!Subtarget->is64Bit() && VT.getScalarType() == MVT::i64) + BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); + // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. - LoadSDNode *Ld = cast<LoadSDNode>(V); + LoadSDNode *Ld = cast<LoadSDNode>(BC); SDValue BaseAddr = Ld->getOperand(1); EVT AddrVT = BaseAddr.getValueType(); - EVT SVT = VT.getScalarType(); + EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getNode( ISD::ADD, DL, AddrVT, BaseAddr, @@ -8194,7 +8255,8 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, return SDValue(); } - return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); + V = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, V); + return DAG.getBitcast(VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -12474,8 +12536,12 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); + Chain = + DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), + DAG.getIntPtrConstant(0, DL, true), SDValue(), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -12648,13 +12714,21 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } + SDValue ValueToStore = Op.getOperand(0); + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( - DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); @@ -13027,7 +13101,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + SDValue ValueToStore = Op.getOperand(0); + if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget->is64Bit()) + // Bitcasting to f64 here allows us to do a single 64-bit store from + // an SSE register, avoiding the store forwarding penalty that would come + // with two 32-bit stores. + ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo(), false, false, 0); // For i64 source, we need to add the appropriate power of 2 if the input @@ -17487,7 +17567,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, dl); } case COMPRESS_TO_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue DataToCompress = Op.getOperand(3); SDValue Addr = Op.getOperand(2); @@ -17513,7 +17592,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case TRUNCATE_TO_MEM_VI32: return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { - SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); @@ -17533,6 +17611,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } + case LOADU: + case LOADA: { + SDValue Mask = Op.getOperand(4); + SDValue PassThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + MVT VT = Op.getSimpleValueType(); + + MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); + assert(MemIntr && "Expected MemIntrinsicSDNode!"); + + if (isAllOnesConstant(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand()); + + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT, + MemIntr->getMemOperand(), ISD::NON_EXTLOAD); + } } } @@ -19512,24 +19609,37 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, MVT SrcVT = Op.getOperand(0).getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); - if (SrcVT 
== MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) { + if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || + SrcVT == MVT::i64) { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64) // This conversion needs to be expanded. return SDValue(); - SDValue InVec = Op->getOperand(0); - SDLoc dl(Op); - unsigned NumElts = SrcVT.getVectorNumElements(); - MVT SVT = SrcVT.getVectorElementType(); - - // Widen the vector in input in the case of MVT::v2i32. - // Example: from MVT::v2i32 to MVT::v4i32. + SDValue Op0 = Op->getOperand(0); SmallVector<SDValue, 16> Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec, - DAG.getIntPtrConstant(i, dl))); - + SDLoc dl(Op); + unsigned NumElts; + MVT SVT; + if (SrcVT.isVector()) { + NumElts = SrcVT.getVectorNumElements(); + SVT = SrcVT.getVectorElementType(); + + // Widen the vector in input in the case of MVT::v2i32. + // Example: from MVT::v2i32 to MVT::v4i32. + for (unsigned i = 0, e = NumElts; i != e; ++i) + Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0, + DAG.getIntPtrConstant(i, dl))); + } else { + assert(SrcVT == MVT::i64 && !Subtarget->is64Bit() && + "Unexpected source type in LowerBITCAST"); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(0, dl))); + Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0, + DAG.getIntPtrConstant(1, dl))); + NumElts = 2; + SVT = MVT::i32; + } // Explicitly mark the extra elements as Undef. Elts.append(NumElts, DAG.getUNDEF(SVT)); @@ -20685,6 +20795,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; + case X86ISD::VROTLI: return "X86ISD::VROTLI"; + case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; @@ -23184,7 +23296,7 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, return false; SmallVector<int, 16> OpMask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); // We only can combine unary shuffles which we can decode the mask for. if (!HaveMask || !IsUnary) return false; @@ -23281,7 +23393,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, Mask, IsUnary); + bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -23854,6 +23966,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); + EVT EltVT = N->getValueType(0); if (!isa<ConstantSDNode>(EltNo)) return SDValue(); @@ -23882,14 +23995,22 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. 
unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); - int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; + int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; + + if (Idx == SM_SentinelZero) + return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) + : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); + if (Idx == SM_SentinelUndef) + return DAG.getUNDEF(EltVT); + + assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); @@ -23914,7 +24035,6 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - EVT EltVT = N->getValueType(0); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); @@ -27233,6 +27353,32 @@ static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); } +/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) -> +/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y) +/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly +/// extends from AH (which we otherwise need to do contortions to access). +static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + auto OpcodeN = N->getOpcode(); + auto OpcodeN0 = N0.getOpcode(); + if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) || + (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM))) + return SDValue(); + + EVT VT = N->getValueType(0); + EVT InVT = N0.getValueType(); + if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + return SDValue(); + + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG + : X86ISD::UDIVREM8_ZEXT_HREG; + SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), + N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -27243,18 +27389,8 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, EVT InSVT = InVT.getScalarType(); SDLoc DL(N); - // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> - // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) - // This exposes the sext to the sdivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). 
- if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && - InVT == MVT::i8 && VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, DL, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { @@ -27413,19 +27549,8 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - // (i8,i32 zext (udivrem (i8 x, i8 y)) -> - // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) - // This exposes the zext to the udivrem lowering, so that it directly extends - // from AH (which we otherwise need to do contortions to access). - if (N0.getOpcode() == ISD::UDIVREM && - N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && - VT == MVT::i32) { - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); - SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, - N0.getOperand(0), N0.getOperand(1)); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); - return R.getValue(1); - } + if (SDValue DivRem8 = getDivRem8(N, DAG)) + return DivRem8; return SDValue(); } @@ -27923,7 +28048,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); -// TODO: refactor the [SU]DIVREM8_[SZ]EXT_HREG code so that it's not duplicated. case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); @@ -28763,3 +28887,51 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { Attribute::MinSize); return OptSize && !VT.isVector(); } + +void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + if (!Subtarget->is64Bit()) + return; + + // Update IsSplitCSR in X86MachineFunctionInfo. + X86MachineFunctionInfo *AFI = + Entry->getParent()->getInfo<X86MachineFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void X86TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const X86RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (X86::GR64RegClass.contains(*I)) + RC = &X86::GR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 8bb0e5f8bd36..0ab786e08e02 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -316,6 +316,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Bit rotate by immediate + VROTLI, VROTRI, + // Vector packed double/float comparison. CMPP, @@ -837,6 +840,13 @@ namespace llvm { /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + /// Given an intrinsic, checks if on the target the intrinsic will need to map + /// to a MemIntrinsicNode (touches memory). If this is the case, it returns + /// true and stores the intrinsic information into the IntrinsicInfo that was + /// passed to the function. + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. @@ -1057,6 +1067,15 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index 0a27c33f033e..49be64883939 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -188,7 +188,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# - "$dst , "#IntelSrcAsm#"}", + "$dst, "#IntelSrcAsm#"}", Pattern, itin>; // Prefer over VMOV*rrk Pat<> @@ -323,18 +323,16 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, - list<dag> MaskingPattern, - string Round = "", - InstrItinClass itin = NoItinerary> { + list<dag> MaskingPattern> { def NAME: AVX512<O, F, Outs, Ins, - OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# - "$dst "#Round#", "#IntelSrcAsm#"}", - Pattern, itin>; + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern, NoItinerary>; def NAME#k: AVX512<O, F, Outs, MaskingIns, - OpcodeStr#"\t{"#Round#AttSrcAsm#", $dst {${mask}}|"# - "$dst {${mask}}, "#IntelSrcAsm#Round#"}", - MaskingPattern, itin>, EVEX_K; + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, NoItinerary>, EVEX_K; } 
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -342,33 +340,27 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Ins, dag MaskingIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS, - string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS, dag MaskingRHS> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)], - Round, NoItinerary>; + [(set _.KRC:$dst, MaskingRHS)]>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, string Round = "", - InstrItinClass itin = NoItinerary> : + dag RHS> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), - Round, itin>; + (and _.KRCWM:$mask, RHS)>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, - [],[],"", NoItinerary>; + AttSrcAsm, IntelSrcAsm, [],[]>; // Bitcasts between 512-bit vector types. Return the original type since // no instruction is needed for the conversion @@ -1294,7 +1286,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -1311,7 +1303,7 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), @@ -1426,7 +1418,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (OpNodeRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1449,7 +1441,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, EVEX_4V, EVEX_B; }// let isAsmParserOnly = 1, hasSideEffects = 0 @@ -1831,7 +1823,7 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), "vcmp${cc}"#_.Suffix, - "{sae}, $src2, $src1", "$src1, $src2,{sae}", + "{sae}, $src2, $src1", "$src1, $src2, {sae}", (X86cmpmRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, @@ -1842,8 +1834,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc,{sae}, $src2, 
$src1", - "$src1, $src2,{sae}, $cc">, EVEX_B; + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc">, EVEX_B; } } @@ -1889,13 +1881,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [prd] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), (i32 imm:$src2)))], NoItinerary>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1903,14 +1895,14 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2)))], NoItinerary>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1925,13 +1917,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string mem, string broadcast>{ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), (i32 imm:$src2)))], NoItinerary>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# - "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (i32 imm:$src2))))], NoItinerary>, EVEX_K; @@ -1939,21 +1931,21 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##mem# - "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2)))], NoItinerary>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##mem# - "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i32 imm:$src2))))], NoItinerary>, EVEX_K; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst | $dst, ${src1}" + 
_.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(OpNode (_.VT (X86VBroadcast @@ -1962,7 +1954,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT (X86VBroadcast @@ -2715,30 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; -def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)), - (VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; - -def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VMOVAPDZrm addr:$ptr)>; - -def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr, - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VMOVAPSZrm addr:$ptr)>; - def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src), GR16:$mask), (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), @@ -4088,8 +4056,8 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", rotr>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", rotl>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; @@ -6057,12 +6025,12 @@ multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayStore = 1 in { def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX; def mrk : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, EVEX, EVEX_K; }//mayStore = 1 } @@ -6666,12 +6634,12 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, let mayStore = 1 in { def mr : AVX5128I<opc, MRMDestMem, (outs), (ins 
_.MemOp:$dst, _.RC:$src), - OpcodeStr # "\t{$src, $dst |$dst, $src}", + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, EVEX_CD8<_.EltSize, CD8VT1>; def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, @@ -6766,7 +6734,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2,{sae}, $src1", + OpcodeStr##_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), (i32 imm:$src2), @@ -6895,8 +6863,8 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), @@ -6907,8 +6875,8 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), - OpcodeStr, "$src3,{sae}, $src2, $src1", - "$src1, $src2,{sae}, $src3", + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index c4b2d6d3bb75..af43d9f53325 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -98,22 +98,22 @@ let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALU]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVZX>, TB, OpSize32, Sched<[WriteALULd]>; def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALU]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALU]>; let mayLoad = 1 in def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", - [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; + [], IIC_MOVSX>, TB, OpSize32, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register @@ -146,18 +146,22 @@ def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), Sched<[WriteALULd]>, Requires<[In64BitMode]>; // movzbq and movzwq encodings for the disassembler -def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), - "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), - 
"movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; -def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALU]>; -def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), - "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, - TB, Sched<[WriteALULd]>; +let hasSideEffects = 0 in { +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>, + TB, Sched<[WriteALULd]>; +} // 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a // 32-bit register. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 829cedd55fb3..643286324e25 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -225,6 +225,9 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; +def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; + def X86vprot : SDNode<"X86ISD::VPROT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index ea8e56206ce6..9c8339a841c9 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -1273,7 +1273,7 @@ def STOSW : I<0xAB, RawFrmDst, (outs dstidx16:$dst), (ins), let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in def STOSL : I<0xAB, RawFrmDst, (outs dstidx32:$dst), (ins), "stos{l|d}\t{%eax, $dst|$dst, eax}", [], IIC_STOS>, OpSize32; -let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in +let Defs = [RDI], Uses = [RAX,RDI,EFLAGS] in def STOSQ : RI<0xAB, RawFrmDst, (outs dstidx64:$dst), (ins), "stosq\t{%rax, $dst|$dst, rax}", [], IIC_STOS>; @@ -2755,56 +2755,56 @@ def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. -def : InstAlias<"bt {$imm, $mem|$mem, $imm}", +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btc {$imm, $mem|$mem, $imm}", +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"btr {$imm, $mem|$mem, $imm}", +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>; -def : InstAlias<"bts {$imm, $mem|$mem, $imm}", +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>; // clr aliases. 
-def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; -def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; -def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; -def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; +def : InstAlias<"clrb\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clrw\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clrl\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clrq\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; // lods aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. -def : InstAlias<"lodsb $src", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lodsw $src", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods{l|d} $src", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lodsq $src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; -def : InstAlias<"lods {$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; -def : InstAlias<"lods {$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; -def : InstAlias<"lods {$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; -def : InstAlias<"lods {$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; // stos aliases. Accept the source being omitted because it's implicit in // the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the source. -def : InstAlias<"stosb $dst", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stosw $dst", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos{l|d} $dst", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stosq $dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"stos {%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; -def : InstAlias<"stos {%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; -def : InstAlias<"stos {%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; -def : InstAlias<"stos {%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // scas aliases. Accept the destination being omitted because it's implicit // in the mnemonic, or the mnemonic suffix being omitted because it's implicit // in the destination. 
-def : InstAlias<"scasb $dst", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scasw $dst", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas{l|d} $dst", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scasq $dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"scas {$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; -def : InstAlias<"scas {$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; -def : InstAlias<"scas {$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; -def : InstAlias<"scas {$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; // div and idiv aliases for explicit A register. def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; @@ -2892,30 +2892,30 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but // this is compatible with what GAS does. -def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; - -def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaque32mem:$dst), 0>, 
Requires<[In16BitMode]>; + +def : InstAlias<"call\t{*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call\t{*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; @@ -2927,46 +2927,46 @@ def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp -def : InstAlias<"call $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; -def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; -def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; -def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; -def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. 
All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq {$src, $dst|$dst, $src}", +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
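The alias hunks above all make the same change: a literal '\t' now separates the mnemonic from its operand list, and stray spaces around the '|' inside the braces are dropped. The braces carry both operand orders at once, the AT&T form to the left of the '|' and the Intel form to the right. As a rough standalone illustration of how such a template collapses to a single dialect, here is a minimal C++ sketch under simplified assumptions: resolveDialect is a made-up helper, not LLVM's AsmWriter, and it assumes every brace group contains exactly one '|'.

    #include <cassert>
    #include <iostream>
    #include <string>

    // Resolve an asm template such as "movzx\t{$src, $dst|$dst, $src}" into one
    // dialect: inside {a|b}, keep 'a' for AT&T syntax and 'b' for Intel syntax.
    // Simplified sketch: braces are assumed non-nested and to contain a '|'.
    static std::string resolveDialect(const std::string &Tmpl, bool Intel) {
      std::string Out;
      for (size_t i = 0; i < Tmpl.size(); ++i) {
        if (Tmpl[i] != '{') {
          Out += Tmpl[i];
          continue;
        }
        size_t Bar = Tmpl.find('|', i);
        size_t End = Tmpl.find('}', i);
        assert(Bar != std::string::npos && End != std::string::npos && Bar < End);
        Out += Intel ? Tmpl.substr(Bar + 1, End - Bar - 1)
                     : Tmpl.substr(i + 1, Bar - i - 1);
        i = End; // continue after the closing brace
      }
      return Out;
    }

    int main() {
      std::string Tmpl = "movzx\t{$src, $dst|$dst, $src}";
      std::cout << resolveDialect(Tmpl, /*Intel=*/false) << "\n"; // "movzx\t$src, $dst"
      std::cout << resolveDialect(Tmpl, /*Intel=*/true) << "\n";  // "movzx\t$dst, $src"
      return 0;
    }

With the normalized strings, both dialects come out with the same tab as the mnemonic/operand separator, which is the consistency the whitespace cleanups above are after.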
// outb %dx -> outb %al, %dx diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td index 31608cd4c128..71ab97374dd6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td @@ -15,10 +15,10 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - OpcodeStr#" \t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, Requires<[HasMPX, In64BitMode]>; } @@ -26,16 +26,16 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, Not64BitMode]>; def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), - OpcodeStr#" \t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, Requires<[HasMPX, In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS; @@ -43,28 +43,28 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD; defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX]>; def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs i64mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs i128mem:$dst), (ins BNDR:$src), - "bndmov \t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx \t{$src, $dst|$dst, $src}", []>, PS, + "bndstx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx \t{$src, $dst|$dst, $src}", []>, PS, + "bndldx\t{$src, $dst|$dst, $src}", []>, PS, Requires<[HasMPX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 624b9316e6fd..6a7c45665e9c 100644 --- 
a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -1808,7 +1808,7 @@ def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rm GR64:$dst, sdmem:$src)>; + (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; /// SSE 2 Only @@ -7838,9 +7838,7 @@ class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, - Sched<[Sched]>, VEX { - let mayLoad = 1; -} + Sched<[Sched]>, VEX; // AVX2 adds register forms class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -7871,7 +7869,7 @@ let ExeDomain = SSEPackedDouble in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; -let mayLoad = 1, Predicates = [HasAVX2] in +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, @@ -8259,6 +8257,9 @@ let Predicates = [HasF16C] in { (VCVTPH2PSrm addr:$src)>; def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; + def : Pat<(int_x86_vcvtph2ps_128 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VCVTPH2PSrm addr:$src)>; def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 646b556faa8f..b525d5eb60a7 100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -29,7 +29,7 @@ enum IntrinsicType { INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; @@ -143,6 +143,18 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), 
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, @@ -1129,6 +1141,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovsxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, @@ -1165,6 +1213,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxb_w_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxd_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_d_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_128, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_256, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovzxw_q_512, INTR_TYPE_1OP_MASK, + X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -1201,12 +1285,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_MASK, X86ISD::VROTRI, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), + X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_d_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufh_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_128, 
INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(avx512_mask_pshufl_w_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFLW, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), @@ -1219,8 +1345,21 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psll_qi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_qi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_psll_qi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv16_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv2_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv32hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_di, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv4_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_hi, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv8_si, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), @@ -1243,8 +1382,15 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psra_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav16_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav32_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav4_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_hi, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav8_si, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_128, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q_256, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h 
b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 3a7a98db50f4..00515dde5568 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -92,6 +92,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -160,6 +164,9 @@ public: SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } + + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 58020d909a43..45cc0aef1d93 100644 --- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -9,8 +9,10 @@ // // This file defines the pass that performs some optimizations with LEA // instructions in order to improve code size. -// Currently, it does one thing: -// 1) Address calculations in load and store instructions are replaced by +// Currently, it does two things: +// 1) If there are two LEA instructions calculating addresses which only differ +// by displacement inside a basic block, one of them is removed. +// 2) Address calculations in load and store instructions are replaced by // existing LEA def registers where possible. // //===----------------------------------------------------------------------===// @@ -38,6 +40,7 @@ static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, cl::init(false)); STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); +STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); namespace { class OptimizeLEAPass : public MachineFunctionPass { @@ -71,6 +74,13 @@ private: /// \brief Returns true if the instruction is LEA. bool isLEA(const MachineInstr &MI); + /// \brief Returns true if the \p Last LEA instruction can be replaced by the + /// \p First. The difference between displacements of the addresses calculated + /// by these LEAs is returned in \p AddrDispShift. It'll be used for proper + /// replacement of the \p Last LEA's uses with the \p First's def register. + bool isReplaceable(const MachineInstr &First, const MachineInstr &Last, + int64_t &AddrDispShift); + /// \brief Returns true if two instructions have memory operands that only /// differ by displacement. The numbers of the first memory operands for both /// instructions are specified through \p N1 and \p N2. The address @@ -79,13 +89,20 @@ private: const MachineInstr &MI2, unsigned N2, int64_t &AddrDispShift); - /// \brief Find all LEA instructions in the basic block. + /// \brief Find all LEA instructions in the basic block. Also, assign position + /// numbers to all instructions in the basic block to speed up calculation of + /// distance between them. void findLEAs(const MachineBasicBlock &MBB, SmallVectorImpl<MachineInstr *> &List); /// \brief Removes redundant address calculations. bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + /// \brief Removes LEAs which calculate similar addresses. 
+ bool removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List); + + DenseMap<const MachineInstr *, unsigned> InstrPos; + MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; @@ -99,14 +116,15 @@ FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, const MachineInstr &Last) { - const MachineBasicBlock *MBB = First.getParent(); - - // Both instructions must be in the same basic block. - assert(Last.getParent() == MBB && + // Both instructions must be in the same basic block and they must be + // presented in InstrPos. + assert(Last.getParent() == First.getParent() && "Instructions are in different basic blocks"); + assert(InstrPos.find(&First) != InstrPos.end() && + InstrPos.find(&Last) != InstrPos.end() && + "Instructions' positions are undefined"); - return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - - std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); + return InstrPos[&Last] - InstrPos[&First]; } // Find the best LEA instruction in the List to replace address recalculation in @@ -189,6 +207,69 @@ bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; } +// Check that the Last LEA can be replaced by the First LEA. To be so, +// these requirements must be met: +// 1) Addresses calculated by LEAs differ only by displacement. +// 2) Def registers of LEAs belong to the same class. +// 3) All uses of the Last LEA def register are replaceable, thus the +// register is used only as address base. +bool OptimizeLEAPass::isReplaceable(const MachineInstr &First, + const MachineInstr &Last, + int64_t &AddrDispShift) { + assert(isLEA(First) && isLEA(Last) && + "The function works only with LEA instructions"); + + // Compare instructions' memory operands. + if (!isSimilarMemOp(Last, 1, First, 1, AddrDispShift)) + return false; + + // Make sure that LEA def registers belong to the same class. There may be + // instructions (like MOV8mr_NOREX) which allow a limited set of registers to + // be used as their operands, so we must be sure that replacing one LEA + // with another won't lead to putting a wrong register in the instruction. + if (MRI->getRegClass(First.getOperand(0).getReg()) != + MRI->getRegClass(Last.getOperand(0).getReg())) + return false; + + // Loop over all uses of the Last LEA to check that its def register is + // used only as address base for memory accesses. If so, it can be + // replaced, otherwise - no. + for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) { + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()); + + // If the use instruction has no memory operand - the LEA is not + // replaceable. + if (MemOpNo < 0) + return false; + + MemOpNo += X86II::getOperandBias(Desc); + + // If the address base of the use instruction is not the LEA def register - + // the LEA is not replaceable. + if (!isIdenticalOp(MI.getOperand(MemOpNo + X86::AddrBaseReg), MO)) + return false; + + // If the LEA def register is used as any other operand of the use + // instruction - the LEA is not replaceable. + for (unsigned i = 0; i < MI.getNumOperands(); i++) + if (i != (unsigned)(MemOpNo + X86::AddrBaseReg) && + isIdenticalOp(MI.getOperand(i), MO)) + return false; + + // Check that the new address displacement will fit 4 bytes. 
+ if (MI.getOperand(MemOpNo + X86::AddrDisp).isImm() && + !isInt<32>(MI.getOperand(MemOpNo + X86::AddrDisp).getImm() + + AddrDispShift)) + return false; + } + + return true; +} + // Check if MI1 and MI2 have memory operands which represent addresses that // differ only by displacement. bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, @@ -219,7 +300,15 @@ bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, SmallVectorImpl<MachineInstr *> &List) { + unsigned Pos = 0; for (auto &MI : MBB) { + // Assign the position number to the instruction. Note that we are going to + // move some instructions during the optimization however there will never + // be a need to move two instructions before any selected instruction. So to + // avoid multiple positions' updates during moves we just increase position + // counter by two leaving a free space for instructions which will be moved. + InstrPos[&MI] = Pos += 2; + if (isLEA(MI)) List.push_back(const_cast<MachineInstr *>(&MI)); } @@ -270,6 +359,13 @@ bool OptimizeLEAPass::removeRedundantAddrCalc( if (Dist < 0) { DefMI->removeFromParent(); MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + InstrPos[DefMI] = InstrPos[&MI] - 1; + + // Make sure the instructions' position numbers are sane. + assert(((InstrPos[DefMI] == 1 && DefMI == MBB->begin()) || + InstrPos[DefMI] > + InstrPos[std::prev(MachineBasicBlock::iterator(DefMI))]) && + "Instruction positioning is broken"); } // Since we can possibly extend register lifetime, clear kill flags. @@ -296,6 +392,81 @@ bool OptimizeLEAPass::removeRedundantAddrCalc( return Changed; } +// Try to find similar LEAs in the list and replace one with another. +bool +OptimizeLEAPass::removeRedundantLEAs(SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + // Loop over all LEA pairs. + auto I1 = List.begin(); + while (I1 != List.end()) { + MachineInstr &First = **I1; + auto I2 = std::next(I1); + while (I2 != List.end()) { + MachineInstr &Last = **I2; + int64_t AddrDispShift; + + // LEAs should be in occurence order in the list, so we can freely + // replace later LEAs with earlier ones. + assert(calcInstrDist(First, Last) > 0 && + "LEAs must be in occurence order in the list"); + + // Check that the Last LEA instruction can be replaced by the First. + if (!isReplaceable(First, Last, AddrDispShift)) { + ++I2; + continue; + } + + // Loop over all uses of the Last LEA and update their operands. Note that + // the correctness of this has already been checked in the isReplaceable + // function. + for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()), + UE = MRI->use_end(); + UI != UE;) { + MachineOperand &MO = *UI++; + MachineInstr &MI = *MO.getParent(); + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + // Update address base. + MO.setReg(First.getOperand(0).getReg()); + + // Update address disp. + MachineOperand *Op = &MI.getOperand(MemOpNo + X86::AddrDisp); + if (Op->isImm()) + Op->setImm(Op->getImm() + AddrDispShift); + else if (Op->isGlobal()) + Op->setOffset(Op->getOffset() + AddrDispShift); + else + llvm_unreachable("Invalid address displacement operand"); + } + + // Since we can possibly extend register lifetime, clear kill flags. 
+ MRI->clearKillFlags(First.getOperand(0).getReg()); + + ++NumRedundantLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump();); + + // By this moment, all of the Last LEA's uses must be replaced. So we can + // freely remove it. + assert(MRI->use_empty(Last.getOperand(0).getReg()) && + "The LEA's def register must have no uses"); + Last.eraseFromParent(); + + // Erase removed LEA from the list. + I2 = List.erase(I2); + + Changed = true; + } + ++I1; + } + + return Changed; +} + bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -310,6 +481,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Process all basic blocks. for (auto &MBB : MF) { SmallVector<MachineInstr *, 16> LEAs; + InstrPos.clear(); // Find all LEA instructions in basic block. findLEAs(MBB, LEAs); @@ -318,6 +490,11 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { if (LEAs.empty()) continue; + // Remove redundant LEA instructions. The optimization may have a negative + // effect on performance, so do it only for -Oz. + if (MF.getFunction()->optForMinSize()) + Changed |= removeRedundantLEAs(LEAs); + // Remove redundant address calculations. Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 58404433e1ae..274b56688558 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -250,7 +250,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_RT_AllRegs_SaveList; case CallingConv::CXX_FAST_TLS: if (Is64Bit) - return CSR_64_TLS_Darwin_SaveList; + return MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR() ? + CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) @@ -305,6 +306,15 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_32_SaveList; } +const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) + return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index f014c8f6ff61..8d0094cbf3d6 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -99,6 +99,8 @@ public: /// callee-save registers on this target. const MCPhysReg * getCalleeSavedRegs(const MachineFunction* MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; |
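The removeRedundantLEAs addition above works pair-wise inside a basic block: if two LEAs compute addresses that differ only by a constant displacement and their def registers share a register class, every use of the later LEA's def (which must be used purely as an address base) is rewritten to the earlier def with the displacement delta folded in, provided each adjusted displacement still fits in a signed 32-bit field, and the later LEA is erased; the InstrPos map replaces the old std::distance walk so instruction distances become O(1) lookups. The standalone C++ sketch below mirrors that idea on a toy address model; the Lea and MemUse structs, removeRedundantLea, and the register numbers are illustrative stand-ins, not LLVM's MachineInstr/MachineOperand API.

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    // Toy stand-ins for an LEA-style address computation and a memory use of it.
    struct Lea {
      int Def;          // virtual register defined by this LEA
      int Base, Index;  // address components other than the displacement
      int Scale;
      int64_t Disp;     // constant displacement
    };
    struct MemUse {
      int BaseReg;      // register used as the address base
      int64_t Disp;     // displacement folded into the memory operand
    };

    // Two LEAs are "similar" if everything but the displacement matches;
    // AddrDispShift is the amount later uses must be adjusted by.
    static bool isSimilar(const Lea &A, const Lea &B, int64_t &AddrDispShift) {
      if (A.Base != B.Base || A.Index != B.Index || A.Scale != B.Scale)
        return false;
      AddrDispShift = B.Disp - A.Disp;
      return true;
    }

    // Replace uses of Last.Def with First.Def, folding in the displacement delta,
    // as long as every adjusted displacement still fits in a signed 32-bit field.
    static bool removeRedundantLea(const Lea &First, const Lea &Last,
                                   std::vector<MemUse> &Uses) {
      int64_t Shift;
      if (!isSimilar(First, Last, Shift))
        return false;
      for (const MemUse &U : Uses)
        if (U.BaseReg == Last.Def &&
            (U.Disp + Shift > std::numeric_limits<int32_t>::max() ||
             U.Disp + Shift < std::numeric_limits<int32_t>::min()))
          return false; // new displacement would not fit 4 bytes
      for (MemUse &U : Uses)
        if (U.BaseReg == Last.Def) {
          U.BaseReg = First.Def;
          U.Disp += Shift;
        }
      return true; // caller would now erase Last
    }

    int main() {
      Lea L1{/*Def=*/1, /*Base=*/10, /*Index=*/0, /*Scale=*/1, /*Disp=*/8};
      Lea L2{/*Def=*/2, /*Base=*/10, /*Index=*/0, /*Scale=*/1, /*Disp=*/24};
      std::vector<MemUse> Uses = {{2, 0}, {2, 4}};
      if (removeRedundantLea(L1, L2, Uses))
        for (const MemUse &U : Uses)
          std::cout << "use base=%" << U.BaseReg << " disp=" << U.Disp << "\n";
      // prints: use base=%1 disp=16, then use base=%1 disp=20
      return 0;
    }

As the pass comments note, the real transformation is only attempted when the function is built for minimum size (optForMinSize), since funneling several displacements through one LEA can cost performance elsewhere.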