author     Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:03:47 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2023-07-26 19:04:23 +0000
commit     7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree       27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/CodeGen
parent     e3b557809604d036af6e00c60f012c2025b59a5e (diff)
Vendor import of llvm-project main llvmorg-17-init-19304-gd0b54bb50e51, the
last commit before the upstream release/17.x branch was created. (Tagged as
vendor/llvm-project/llvmorg-17-init-19304-gd0b54bb50e51.)
Diffstat (limited to 'llvm/lib/CodeGen')
223 files changed, 15404 insertions, 7469 deletions
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index bb71d72256d8..886c4db069f1 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -31,7 +32,6 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <utility>
@@ -200,7 +200,7 @@ void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
   LLVM_DEBUG(dbgs() << "\tRegs:");
 
   std::vector<unsigned> &DefIndices = State->GetDefIndices();
-  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+  for (unsigned Reg = 1; Reg != TRI->getNumRegs(); ++Reg) {
     // If Reg is current live, then mark that it can't be renamed as
     // we don't know the extent of its live-range anymore (now that it
     // has been scheduled). If it is not live but was defined in the
@@ -246,9 +246,8 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(
     if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) ||
         IsImplicitDefUse(MI, MO)) {
       const Register Reg = MO.getReg();
-      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
-           SubRegs.isValid(); ++SubRegs)
-        PassthruRegs.insert(*SubRegs);
+      for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+        PassthruRegs.insert(SubReg);
     }
   }
 }
@@ -322,8 +321,7 @@ void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
   // was not live because otherwise, regardless whether we have an explicit
   // use of the subregister, the subregister's contents are needed for the
   // uses of the superregister.
-  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
-    unsigned SubregReg = *SubRegs;
+  for (MCPhysReg SubregReg : TRI->subregs(Reg)) {
     if (!State->IsLive(SubregReg)) {
       KillIndices[SubregReg] = KillIdx;
       DefIndices[SubregReg] = ~0u;
@@ -353,8 +351,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
   // dead, or because only a subregister is live at the def. If we
   // don't do this the dead def will be incorrectly merged into the
   // previous def.
-  for (const MachineOperand &MO : MI.operands()) {
-    if (!MO.isReg() || !MO.isDef()) continue;
+  for (const MachineOperand &MO : MI.all_defs()) {
     Register Reg = MO.getReg();
     if (Reg == 0) continue;
 
@@ -778,7 +775,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
 #ifndef NDEBUG
   LLVM_DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
   LLVM_DEBUG(dbgs() << "Available regs:");
-  for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+  for (unsigned Reg = 1; Reg < TRI->getNumRegs(); ++Reg) {
     if (!State->IsLive(Reg))
       LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
   }
@@ -922,7 +919,6 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
       }
 
       assert(AntiDepReg != 0);
-      if (AntiDepReg == 0) continue;
 
       // Determine AntiDepReg's register group.
       const unsigned GroupIndex = State->GetGroup(AntiDepReg);
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
index 419cb7626945..cece217e645c 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -132,6 +132,9 @@ class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepState {
     AggressiveAntiDepBreaker(MachineFunction &MFi, const RegisterClassInfo &RCI,
                              TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
+    AggressiveAntiDepBreaker &
+    operator=(const AggressiveAntiDepBreaker &other) = delete;
+    AggressiveAntiDepBreaker(const AggressiveAntiDepBreaker &other) = delete;
     ~AggressiveAntiDepBreaker() override;
 
     /// Initialize anti-dep breaking for a new basic block.
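The header hunk above deletes the copy constructor and copy assignment of AggressiveAntiDepBreaker, a class that owns per-block renaming state which must never be duplicated. A minimal standalone sketch of the `= delete` idiom, with a hypothetical class name:

```cpp
#include <vector>

// A resource-owning class that is unsafe to copy: deleting the copy
// operations turns accidental copies into compile-time errors.
class AntiDepState {
public:
  explicit AntiDepState(unsigned NumRegs) : DefIndices(NumRegs) {}

  // Deleted copy operations, same shape as the patch adds.
  AntiDepState(const AntiDepState &) = delete;
  AntiDepState &operator=(const AntiDepState &) = delete;

private:
  std::vector<unsigned> DefIndices;
};

int main() {
  AntiDepState S(32);
  // AntiDepState T = S;  // would not compile: call to deleted constructor
  (void)S;
  return 0;
}
```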
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index b9579441a0ba..2065bfbd1c44 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -79,8 +79,8 @@ unsigned llvm::ComputeLinearIndex(Type *Ty,
 void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                            Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                            SmallVectorImpl<EVT> *MemVTs,
-                           SmallVectorImpl<uint64_t> *Offsets,
-                           uint64_t StartingOffset) {
+                           SmallVectorImpl<TypeSize> *Offsets,
+                           TypeSize StartingOffset) {
   // Given a struct type, recursively traverse the elements.
   if (StructType *STy = dyn_cast<StructType>(Ty)) {
     // If the Offsets aren't needed, don't query the struct layout. This allows
@@ -92,7 +92,8 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
          EE = STy->element_end();
          EI != EE; ++EI) {
       // Don't compute the element offset if we didn't get a StructLayout above.
-      uint64_t EltOffset = SL ? SL->getElementOffset(EI - EB) : 0;
+      TypeSize EltOffset = SL ? SL->getElementOffset(EI - EB)
+                              : TypeSize::get(0, StartingOffset.isScalable());
       ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets,
                       StartingOffset + EltOffset);
     }
@@ -101,7 +102,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
   // Given an array type, recursively traverse the elements.
   if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     Type *EltTy = ATy->getElementType();
-    uint64_t EltSize = DL.getTypeAllocSize(EltTy).getFixedValue();
+    TypeSize EltSize = DL.getTypeAllocSize(EltTy);
     for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
       ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets,
                       StartingOffset + i * EltSize);
@@ -120,12 +121,62 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
 
 void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                            Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
-                           SmallVectorImpl<uint64_t> *Offsets,
-                           uint64_t StartingOffset) {
+                           SmallVectorImpl<TypeSize> *Offsets,
+                           TypeSize StartingOffset) {
   return ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, Offsets,
                          StartingOffset);
 }
 
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<TypeSize> *Offsets,
+                           uint64_t StartingOffset) {
+  TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+  return ComputeValueVTs(TLI, DL, Ty, ValueVTs, Offsets, Offset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<uint64_t> *FixedOffsets,
+                           uint64_t StartingOffset) {
+  TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+  SmallVector<TypeSize, 4> Offsets;
+  if (FixedOffsets)
+    ComputeValueVTs(TLI, DL, Ty, ValueVTs, &Offsets, Offset);
+  else
+    ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offset);
+
+  if (FixedOffsets)
+    for (TypeSize Offset : Offsets)
+      FixedOffsets->push_back(Offset.getKnownMinValue());
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<EVT> *MemVTs,
+                           SmallVectorImpl<TypeSize> *Offsets,
+                           uint64_t StartingOffset) {
+  TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+  return ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, Offsets, Offset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+                           Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<EVT> *MemVTs,
+                           SmallVectorImpl<uint64_t> *FixedOffsets,
+                           uint64_t StartingOffset) {
+  TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+  SmallVector<TypeSize, 4> Offsets;
+  if (FixedOffsets)
+    ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, &Offsets, Offset);
+  else
+    ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, nullptr, Offset);
+
+  if (FixedOffsets)
+    for (TypeSize Offset : Offsets)
+      FixedOffsets->push_back(Offset.getKnownMinValue());
+}
+
 void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
                             SmallVectorImpl<LLT> &ValueTys,
                             SmallVectorImpl<uint64_t> *Offsets,
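The Analysis.cpp change threads TypeSize (a byte count that may be scaled by the runtime vector length, vscale) through ComputeValueVTs instead of plain uint64_t offsets, and the new FixedOffsets overloads lower a TypeSize back to its known minimum. A standalone sketch of the idea, independent of the real llvm::TypeSize class:

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// A quantity that is either a fixed byte count or a multiple of the runtime
// vector length (vscale), mirroring the idea behind llvm::TypeSize.
struct ScalableSize {
  uint64_t MinValue; // known-minimum byte count
  bool Scalable;     // true => actual size is MinValue * vscale

  static ScalableSize get(uint64_t Min, bool Scalable) {
    return {Min, Scalable};
  }
  uint64_t getKnownMinValue() const { return MinValue; }

  ScalableSize operator+(ScalableSize RHS) const {
    // Mixing fixed and scalable sizes has no single byte value; the real
    // TypeSize arithmetic rejects that case too.
    assert(Scalable == RHS.Scalable && "cannot mix fixed and scalable");
    return {MinValue + RHS.MinValue, Scalable};
  }
};

int main() {
  ScalableSize FieldA = ScalableSize::get(16, /*Scalable=*/true);
  ScalableSize FieldB = ScalableSize::get(16, /*Scalable=*/true);
  ScalableSize Off = FieldA + FieldB;
  // Lowering to a plain integer keeps only the known minimum, exactly what
  // the new FixedOffsets overloads in the patch do.
  std::cout << "known min offset: " << Off.getKnownMinValue() << '\n';
}
```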
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index 22ecc5199742..aab3c2681339 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -194,8 +194,8 @@ class Dwarf5AccelTableWriter : public AccelTableWriter {
     uint32_t CompUnitCount;
     uint32_t LocalTypeUnitCount = 0;
     uint32_t ForeignTypeUnitCount = 0;
-    uint32_t BucketCount;
-    uint32_t NameCount;
+    uint32_t BucketCount = 0;
+    uint32_t NameCount = 0;
     uint32_t AbbrevTableSize = 0;
     uint32_t AugmentationStringSize = sizeof(AugmentationString);
     char AugmentationString[8] = {'L', 'L', 'V', 'M', '0', '7', '0', '0'};
@@ -549,9 +549,13 @@ void llvm::emitDWARF5AccelTable(
   SmallVector<unsigned, 1> CUIndex(CUs.size());
   int Count = 0;
   for (const auto &CU : enumerate(CUs)) {
-    if (CU.value()->getCUNode()->getNameTableKind() !=
-        DICompileUnit::DebugNameTableKind::Default)
+    switch (CU.value()->getCUNode()->getNameTableKind()) {
+    case DICompileUnit::DebugNameTableKind::Default:
+    case DICompileUnit::DebugNameTableKind::Apple:
+      break;
+    default:
       continue;
+    }
     CUIndex[CU.index()] = Count++;
     assert(CU.index() == CU.value()->getUniqueID());
     const DwarfCompileUnit *MainCU =
@@ -660,9 +664,9 @@ void AccelTableBase::HashData::print(raw_ostream &OS) const {
 void AccelTableBase::print(raw_ostream &OS) const {
   // Print Content.
   OS << "Entries: \n";
-  for (const auto &Entry : Entries) {
-    OS << "Name: " << Entry.first() << "\n";
-    for (auto *V : Entry.second.Values)
+  for (const auto &[Name, Data] : Entries) {
+    OS << "Name: " << Name << "\n";
+    for (auto *V : Data.Values)
       V->print(OS);
   }
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
index 32d8dc793510..00ee4e1b47a8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -10,6 +10,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include <utility>
@@ -24,7 +25,7 @@ unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) {
 }
 
 MCSymbol *AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) {
-  static const uint8_t AddrSize = Asm.getDataLayout().getPointerSize();
+  static const uint8_t AddrSize = Asm.MAI->getCodePointerSize();
 
   MCSymbol *EndLabel =
       Asm.emitDwarfUnitLength("debug_addr", "Length of contribution");
@@ -65,7 +66,7 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
             : MCSymbolRefExpr::create(I.first, Asm.OutContext);
 
   for (const MCExpr *Entry : Entries)
-    Asm.OutStreamer->emitValue(Entry, Asm.getDataLayout().getPointerSize());
+    Asm.OutStreamer->emitValue(Entry, Asm.MAI->getCodePointerSize());
 
   if (EndLabel)
     Asm.OutStreamer->emitLabel(EndLabel);
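The AccelTable.cpp hunk gives BucketCount and NameCount in-class initializers so they can never be read uninitialized if a constructor path forgets to set them. A minimal sketch of the pattern, with hypothetical field names:

```cpp
#include <cstdint>
#include <iostream>

// Without the "= 0" defaults, a constructor that forgets to assign these
// fields would leave them indeterminate; reading them would be undefined
// behavior. In-class initializers make the zero state the guaranteed default.
struct TableHeader {
  uint32_t CompUnitCount = 0;
  uint32_t BucketCount = 0; // the patch adds defaults for these two
  uint32_t NameCount = 0;
  uint32_t AbbrevTableSize = 0;
};

int main() {
  TableHeader H; // all counts start at a well-defined zero
  std::cout << H.BucketCount << ' ' << H.NameCount << '\n';
}
```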
"llvm/MC/SectionKind.h" +#include "llvm/Object/ELFTypes.h" #include "llvm/Pass.h" #include "llvm/Remarks/RemarkStreamer.h" #include "llvm/Support/Casting.h" @@ -113,6 +115,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" #include <algorithm> #include <cassert> #include <cinttypes> @@ -128,6 +131,13 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" +static cl::opt<std::string> BasicBlockProfileDump( + "mbb-profile-dump", cl::Hidden, + cl::desc("Basic block profile dump for external cost modelling. If " + "matching up BBs with afterwards, the compilation must be " + "performed with -basic-block-sections=labels. Enabling this " + "flag during in-process ThinLTO is not supported.")); + const char DWARFGroupName[] = "dwarf"; const char DWARFGroupDescription[] = "DWARF Emission"; const char DbgTimerName[] = "emit"; @@ -414,6 +424,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<MachineOptimizationRemarkEmitterPass>(); AU.addRequired<GCModuleInfo>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); } bool AsmPrinter::doInitialization(Module &M) { @@ -475,6 +486,11 @@ bool AsmPrinter::doInitialization(Module &M) { } } + // On AIX, emit bytes for llvm.commandline metadata after .file so that the + // C_INFO symbol is preserved if any csect is kept by the linker. + if (TM.getTargetTriple().isOSBinFormatXCOFF()) + emitModuleCommandLines(M); + GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>(); assert(MI && "AsmPrinter didn't require GCModuleInfo?"); for (const auto &I : *MI) @@ -531,7 +547,7 @@ bool AsmPrinter::doInitialization(Module &M) { break; } assert(MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI || - ModuleCFISection != CFISection::EH); + usesCFIWithoutEH() || ModuleCFISection != CFISection::EH); break; default: break; @@ -540,7 +556,7 @@ bool AsmPrinter::doInitialization(Module &M) { EHStreamer *ES = nullptr; switch (MAI->getExceptionHandlingType()) { case ExceptionHandling::None: - if (!needsCFIForDebug()) + if (!usesCFIWithoutEH()) break; [[fallthrough]]; case ExceptionHandling::SjLj: @@ -585,6 +601,16 @@ bool AsmPrinter::doInitialization(Module &M) { HI.Handler->beginModule(&M); } + if (!BasicBlockProfileDump.empty()) { + std::error_code PossibleFileError; + MBBProfileDumpFileOutput = std::make_unique<raw_fd_ostream>( + BasicBlockProfileDump, PossibleFileError); + if (PossibleFileError) { + M.getContext().emitError("Failed to open file for MBB Profile Dump: " + + PossibleFileError.message() + "\n"); + } + } + return false; } @@ -704,8 +730,8 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (T.getArch() != Triple::aarch64 || !T.isAndroid()) OutContext.reportError(SMLoc(), - "Tagged symbols (-fsanitize=memtag-globals) are " - "only supported on aarch64 + Android."); + "tagged symbols (-fsanitize=memtag-globals) are " + "only supported on AArch64 Android"); OutStreamer->emitSymbolAttribute(EmittedSym, MAI->getMemtagAttr()); } @@ -908,13 +934,6 @@ void AsmPrinter::emitFunctionHeader() { if (F.hasFnAttribute(Attribute::Cold)) OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_Cold); - if (isVerbose()) { - F.printAsOperand(OutStreamer->getCommentOS(), - /*PrintType=*/false, F.getParent()); - emitFunctionHeaderComment(); - OutStreamer->getCommentOS() << '\n'; - } - // Emit the prefix data. 
@@ -414,6 +424,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
 }
 
 bool AsmPrinter::doInitialization(Module &M) {
@@ -475,6 +486,11 @@ bool AsmPrinter::doInitialization(Module &M) {
     }
   }
 
+  // On AIX, emit bytes for llvm.commandline metadata after .file so that the
+  // C_INFO symbol is preserved if any csect is kept by the linker.
+  if (TM.getTargetTriple().isOSBinFormatXCOFF())
+    emitModuleCommandLines(M);
+
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (const auto &I : *MI)
@@ -531,7 +547,7 @@ bool AsmPrinter::doInitialization(Module &M) {
       break;
     }
     assert(MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI ||
           usesCFIWithoutEH() || ModuleCFISection != CFISection::EH);
    break;
  default:
    break;
@@ -540,7 +556,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   EHStreamer *ES = nullptr;
   switch (MAI->getExceptionHandlingType()) {
   case ExceptionHandling::None:
-    if (!needsCFIForDebug())
+    if (!usesCFIWithoutEH())
       break;
     [[fallthrough]];
   case ExceptionHandling::SjLj:
@@ -585,6 +601,16 @@ bool AsmPrinter::doInitialization(Module &M) {
     HI.Handler->beginModule(&M);
   }
 
+  if (!BasicBlockProfileDump.empty()) {
+    std::error_code PossibleFileError;
+    MBBProfileDumpFileOutput = std::make_unique<raw_fd_ostream>(
+        BasicBlockProfileDump, PossibleFileError);
+    if (PossibleFileError) {
+      M.getContext().emitError("Failed to open file for MBB Profile Dump: " +
+                               PossibleFileError.message() + "\n");
+    }
+  }
+
   return false;
 }
 
@@ -704,8 +730,8 @@ void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
     if (T.getArch() != Triple::aarch64 || !T.isAndroid())
       OutContext.reportError(SMLoc(),
-                             "Tagged symbols (-fsanitize=memtag-globals) are "
-                             "only supported on aarch64 + Android.");
+                             "tagged symbols (-fsanitize=memtag-globals) are "
+                             "only supported on AArch64 Android");
     OutStreamer->emitSymbolAttribute(EmittedSym, MAI->getMemtagAttr());
   }
 
@@ -908,13 +934,6 @@ void AsmPrinter::emitFunctionHeader() {
   if (F.hasFnAttribute(Attribute::Cold))
     OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_Cold);
 
-  if (isVerbose()) {
-    F.printAsOperand(OutStreamer->getCommentOS(),
-                     /*PrintType=*/false, F.getParent());
-    emitFunctionHeaderComment();
-    OutStreamer->getCommentOS() << '\n';
-  }
-
   // Emit the prefix data.
   if (F.hasPrefixData()) {
     if (MAI->hasSubsectionsViaSymbols()) {
@@ -958,6 +977,23 @@ void AsmPrinter::emitFunctionHeader() {
     CurrentPatchableFunctionEntrySym = CurrentFnBegin;
   }
 
+  // Emit the function prologue data for the indirect call sanitizer.
+  if (const MDNode *MD = F.getMetadata(LLVMContext::MD_func_sanitize)) {
+    assert(MD->getNumOperands() == 2);
+
+    auto *PrologueSig = mdconst::extract<Constant>(MD->getOperand(0));
+    auto *TypeHash = mdconst::extract<Constant>(MD->getOperand(1));
+    emitGlobalConstant(F.getParent()->getDataLayout(), PrologueSig);
+    emitGlobalConstant(F.getParent()->getDataLayout(), TypeHash);
+  }
+
+  if (isVerbose()) {
+    F.printAsOperand(OutStreamer->getCommentOS(),
+                     /*PrintType=*/false, F.getParent());
+    emitFunctionHeaderComment();
+    OutStreamer->getCommentOS() << '\n';
+  }
+
   // Emit the function descriptor. This is a virtual function to allow targets
   // to emit their specific function descriptor. Right now it is only used by
   // the AIX target. The PowerPC 64-bit V1 ELF target also uses function
@@ -1005,24 +1041,6 @@ void AsmPrinter::emitFunctionHeader() {
   // Emit the prologue data.
   if (F.hasPrologueData())
     emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrologueData());
-
-  // Emit the function prologue data for the indirect call sanitizer.
-  if (const MDNode *MD = F.getMetadata(LLVMContext::MD_func_sanitize)) {
-    assert(TM.getTargetTriple().getArch() == Triple::x86 ||
-           TM.getTargetTriple().getArch() == Triple::x86_64);
-    assert(MD->getNumOperands() == 2);
-
-    auto *PrologueSig = mdconst::extract<Constant>(MD->getOperand(0));
-    auto *FTRTTIProxy = mdconst::extract<Constant>(MD->getOperand(1));
-    assert(PrologueSig && FTRTTIProxy);
-    emitGlobalConstant(F.getParent()->getDataLayout(), PrologueSig);
-
-    const MCExpr *Proxy = lowerConstant(FTRTTIProxy);
-    const MCExpr *FnExp = MCSymbolRefExpr::create(CurrentFnSym, OutContext);
-    const MCExpr *PCRel = MCBinaryExpr::createSub(Proxy, FnExp, OutContext);
-    // Use 32 bit since only small code model is supported.
-    OutStreamer->emitValue(PCRel, 4u);
-  }
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
@@ -1254,6 +1272,10 @@ AsmPrinter::getFunctionCFISectionType(const Function &F) const {
       F.needsUnwindTableEntry())
     return CFISection::EH;
 
+  if (MAI->usesCFIWithoutEH() && F.hasUWTable())
+    return CFISection::EH;
+
+  assert(MMI != nullptr && "Invalid machine module info");
   if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
     return CFISection::Debug;
 
@@ -1269,14 +1291,13 @@ bool AsmPrinter::needsSEHMoves() {
   return MAI->usesWindowsCFI() && MF->getFunction().needsUnwindTableEntry();
 }
 
-bool AsmPrinter::needsCFIForDebug() const {
-  return MAI->getExceptionHandlingType() == ExceptionHandling::None &&
-         MAI->doesUseCFIForDebug() && ModuleCFISection == CFISection::Debug;
+bool AsmPrinter::usesCFIWithoutEH() const {
+  return MAI->usesCFIWithoutEH() && ModuleCFISection != CFISection::None;
 }
 
 void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
   ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
-  if (!needsCFIForDebug() &&
+  if (!usesCFIWithoutEH() &&
       ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
       ExceptionHandlingType != ExceptionHandling::ARM)
     return;
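The relocated block above emits a signature constant plus a function-type hash directly before the function entry, so an indirect call can verify the callee's type at run time. A standalone sketch of the concept using a simple FNV-1a hash; the real -fsanitize=function scheme uses its own encoding and constants, so treat every value here as illustrative:

```cpp
#include <cstdint>
#include <iostream>
#include <string_view>

// Illustrative only: hash a function-type string to a 32-bit id. Caller and
// callee derive the hash from the same type description.
static uint32_t typeHash(std::string_view MangledType) {
  uint32_t H = 2166136261u;  // FNV-1a offset basis
  for (unsigned char C : MangledType) {
    H ^= C;
    H *= 16777619u;  // FNV-1a prime
  }
  return H;
}

int main() {
  // An indirect call is allowed only if the hash stored before the callee's
  // entry point matches the hash the call site expects.
  uint32_t Expected = typeHash("i32(i32,i32)");
  uint32_t Found = typeHash("i32(i32,i32)");
  std::cout << (Expected == Found ? "type check ok" : "mismatch") << '\n';
}
```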
@@ -1310,21 +1331,16 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
                        MCConstantExpr::create(FrameOffset, OutContext));
 }
 
-/// Returns the BB metadata to be emitted in the .llvm_bb_addr_map section for a
-/// given basic block. This can be used to capture more precise profile
-/// information. We use the last 4 bits (LSBs) to encode the following
-/// information:
-///  * (1): set if return block (ret or tail call).
-///  * (2): set if ends with a tail call.
-///  * (3): set if exception handling (EH) landing pad.
-///  * (4): set if the block can fall through to its next.
-/// The remaining bits are zero.
-static unsigned getBBAddrMapMetadata(const MachineBasicBlock &MBB) {
+/// Returns the BB metadata to be emitted in the SHT_LLVM_BB_ADDR_MAP section
+/// for a given basic block. This can be used to capture more precise profile
+/// information.
+static uint32_t getBBAddrMapMetadata(const MachineBasicBlock &MBB) {
   const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-  return ((unsigned)MBB.isReturnBlock()) |
-         ((!MBB.empty() && TII->isTailCall(MBB.back())) << 1) |
-         (MBB.isEHPad() << 2) |
-         (const_cast<MachineBasicBlock &>(MBB).canFallThrough() << 3);
+  return object::BBAddrMap::BBEntry::Metadata{
+             MBB.isReturnBlock(), !MBB.empty() && TII->isTailCall(MBB.back()),
+             MBB.isEHPad(), const_cast<MachineBasicBlock &>(MBB).canFallThrough(),
+             !MBB.empty() && MBB.rbegin()->isIndirectBranch()}
+      .encode();
 }
 
 void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
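getBBAddrMapMetadata now packs the per-block flags through a Metadata struct with an encode() method (the in-tree definition lives behind the new llvm/Object/ELFTypes.h include) rather than hand-shifted bits; the patch also adds a fifth flag for indirect branches. A standalone sketch of the same bit packing:

```cpp
#include <cstdint>
#include <iostream>

// One bit per block property, packed from bit 0 upward, mirroring the
// hand-written shifts that the patch replaces.
struct BBMetadata {
  bool HasReturn;         // bit 0: ends in a return (or tail call)
  bool HasTailCall;       // bit 1: ends with a tail call
  bool IsEHPad;           // bit 2: exception-handling landing pad
  bool CanFallThrough;    // bit 3: may fall through to the next block
  bool HasIndirectBranch; // bit 4: the flag this patch adds

  uint32_t encode() const {
    return (uint32_t)HasReturn | ((uint32_t)HasTailCall << 1) |
           ((uint32_t)IsEHPad << 2) | ((uint32_t)CanFallThrough << 3) |
           ((uint32_t)HasIndirectBranch << 4);
  }
};

int main() {
  BBMetadata M{/*ret=*/true, /*tail=*/false, /*ehpad=*/false,
               /*fallthrough=*/false, /*indirect=*/true};
  std::cout << "encoded: 0x" << std::hex << M.encode() << '\n'; // 0x11
}
```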
@@ -1346,7 +1362,7 @@ void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
   OutStreamer->AddComment("number of basic blocks");
   OutStreamer->emitULEB128IntValue(MF.size());
   const MCSymbol *PrevMBBEndSymbol = FunctionSymbol;
-  // Emit BB Information for each basic block in the funciton.
+  // Emit BB Information for each basic block in the function.
   for (const MachineBasicBlock &MBB : MF) {
     const MCSymbol *MBBSymbol =
         MBB.isEntryBlock() ? FunctionSymbol : MBB.getSymbol();
@@ -1496,9 +1512,22 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
   // constants may appear, which will simply be emitted into the current
   // section (the user of MD_pcsections decides the format of encoded data).
   assert(isa<MDString>(MD.getOperand(0)) && "first operand not a string");
+  bool ConstULEB128 = false;
   for (const MDOperand &MDO : MD.operands()) {
     if (auto *S = dyn_cast<MDString>(MDO)) {
-      SwitchSection(S->getString());
+      // Found string, start of new section!
+      // Find options for this section "<section>!<opts>" - supported options:
+      //   C = Compress constant integers of size 2-8 bytes as ULEB128.
+      const StringRef SecWithOpt = S->getString();
+      const size_t OptStart = SecWithOpt.find('!'); // likely npos
+      const StringRef Sec = SecWithOpt.substr(0, OptStart);
+      const StringRef Opts = SecWithOpt.substr(OptStart); // likely empty
+      ConstULEB128 = Opts.find('C') != StringRef::npos;
+#ifndef NDEBUG
+      for (char O : Opts)
+        assert((O == '!' || O == 'C') && "Invalid !pcsections options");
+#endif
+      SwitchSection(Sec);
       const MCSymbol *Prev = Syms.front();
       for (const MCSymbol *Sym : Syms) {
         if (Sym == Prev || !Deltas) {
@@ -1510,17 +1539,30 @@ void AsmPrinter::emitPCSections(const MachineFunction &MF) {
           // `base + addr`.
           emitLabelDifference(Sym, Base, RelativeRelocSize);
         } else {
-          emitLabelDifference(Sym, Prev, 4);
+          // Emit delta between symbol and previous symbol.
+          if (ConstULEB128)
+            emitLabelDifferenceAsULEB128(Sym, Prev);
+          else
+            emitLabelDifference(Sym, Prev, 4);
         }
         Prev = Sym;
       }
     } else {
+      // Emit auxiliary data after PC.
       assert(isa<MDNode>(MDO) && "expecting either string or tuple");
       const auto *AuxMDs = cast<MDNode>(MDO);
       for (const MDOperand &AuxMDO : AuxMDs->operands()) {
         assert(isa<ConstantAsMetadata>(AuxMDO) && "expecting a constant");
-        const auto *C = cast<ConstantAsMetadata>(AuxMDO);
-        emitGlobalConstant(F.getParent()->getDataLayout(), C->getValue());
+        const Constant *C = cast<ConstantAsMetadata>(AuxMDO)->getValue();
+        const DataLayout &DL = F.getParent()->getDataLayout();
+        const uint64_t Size = DL.getTypeStoreSize(C->getType());
+
+        if (auto *CI = dyn_cast<ConstantInt>(C);
+            CI && ConstULEB128 && Size > 1 && Size <= 8) {
+          emitULEB128(CI->getZExtValue());
+        } else {
+          emitGlobalConstant(DL, C);
+        }
       }
     }
   }
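The new "C" pcsections option compresses constants as ULEB128, the variable-length encoding that emitULEB128 ultimately produces. A standalone encoder sketch of that format:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// ULEB128: 7 data bits per byte, least significant group first; the high bit
// of each byte says whether another byte follows.
static std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
  return Out;
}

int main() {
  // 624485 encodes as e5 8e 26, the classic example from the DWARF spec.
  for (uint8_t B : encodeULEB128(624485))
    std::cout << std::hex << (unsigned)B << ' ';
  std::cout << '\n';
}
```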
@@ -1582,6 +1624,7 @@ void AsmPrinter::emitFunctionBody() {
   // Print out code for the function.
   bool HasAnyRealCode = false;
   int NumInstsInFunction = 0;
+  bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch");
 
   bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
   for (auto &MBB : *MF) {
@@ -1620,10 +1663,25 @@ void AsmPrinter::emitFunctionBody() {
         emitFrameAlloc(MI);
         break;
       case TargetOpcode::ANNOTATION_LABEL:
-      case TargetOpcode::EH_LABEL:
       case TargetOpcode::GC_LABEL:
         OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
         break;
+      case TargetOpcode::EH_LABEL:
+        OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
+        // For AsynchEH, insert a Nop if followed by a trap inst
+        //   Or the exception won't be caught.
+        // (see MCConstantExpr::create(1,..) in WinException.cpp)
+        //  Ignore SDiv/UDiv because a DIV with Const-0 divisor
+        //  must have being turned into an UndefValue.
+        //  Div with variable opnds won't be the first instruction in
+        //  an EH region as it must be led by at least a Load
+        {
+          auto MI2 = std::next(MI.getIterator());
+          if (IsEHa && MI2 != MBB.end() &&
+              (MI2->mayLoadOrStore() || MI2->mayRaiseFPException()))
+            emitNops(1);
+        }
+        break;
       case TargetOpcode::INLINEASM:
       case TargetOpcode::INLINEASM_BR:
         emitInlineAsm(&MI);
@@ -1862,6 +1920,23 @@ void AsmPrinter::emitFunctionBody() {
     OutStreamer->getCommentOS() << "-- End function\n";
 
   OutStreamer->addBlankLine();
+
+  // Output MBB ids, function names, and frequencies if the flag to dump
+  // MBB profile information has been set
+  if (MBBProfileDumpFileOutput) {
+    if (!MF->hasBBLabels())
+      MF->getContext().reportError(
+          SMLoc(),
+          "Unable to find BB labels for MBB profile dump. -mbb-profile-dump "
+          "must be called with -basic-block-sections=labels");
+    MachineBlockFrequencyInfo &MBFI =
+        getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+    for (const auto &MBB : *MF) {
+      *MBBProfileDumpFileOutput.get()
+          << MF->getName() << "," << MBB.getBBID() << ","
+          << MBFI.getBlockFreqRelativeToEntryBlock(&MBB) << "\n";
+    }
+  }
 }
 
 /// Compute the number of Global Variables that uses a Constant.
@@ -2235,6 +2310,8 @@ bool AsmPrinter::doFinalization(Module &M) {
   SmallVector<const GlobalAlias *, 16> AliasStack;
   SmallPtrSet<const GlobalAlias *, 16> AliasVisited;
   for (const auto &Alias : M.aliases()) {
+    if (Alias.hasAvailableExternallyLinkage())
+      continue;
     for (const GlobalAlias *Cur = &Alias; Cur;
          Cur = dyn_cast<GlobalAlias>(Cur->getAliasee())) {
       if (!AliasVisited.insert(Cur).second)
@@ -2258,7 +2335,9 @@ bool AsmPrinter::doFinalization(Module &M) {
   emitModuleIdents(M);
 
   // Emit bytes for llvm.commandline metadata.
-  emitModuleCommandLines(M);
+  // The command line metadata is emitted earlier on XCOFF.
+  if (!TM.getTargetTriple().isOSBinFormatXCOFF())
+    emitModuleCommandLines(M);
 
   // Emit .note.GNU-split-stack and .note.GNU-no-split-stack sections if
   // split-stack is used.
@@ -2786,6 +2865,22 @@ void AsmPrinter::emitInt16(int Value) const { OutStreamer->emitInt16(Value); }
 /// Emit a long directive and value.
 void AsmPrinter::emitInt32(int Value) const { OutStreamer->emitInt32(Value); }
 
+/// EmitSLEB128 - emit the specified signed leb128 value.
+void AsmPrinter::emitSLEB128(int64_t Value, const char *Desc) const {
+  if (isVerbose() && Desc)
+    OutStreamer->AddComment(Desc);
+
+  OutStreamer->emitSLEB128IntValue(Value);
+}
+
+void AsmPrinter::emitULEB128(uint64_t Value, const char *Desc,
+                             unsigned PadTo) const {
+  if (isVerbose() && Desc)
+    OutStreamer->AddComment(Desc);
+
+  OutStreamer->emitULEB128IntValue(Value, PadTo);
+}
+
 /// Emit a long long directive and value.
 void AsmPrinter::emitInt64(uint64_t Value) const {
   OutStreamer->emitInt64(Value);
@@ -2799,6 +2894,12 @@ void AsmPrinter::emitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
   OutStreamer->emitAbsoluteSymbolDiff(Hi, Lo, Size);
 }
 
+/// Emit something like ".uleb128 Hi-Lo".
+void AsmPrinter::emitLabelDifferenceAsULEB128(const MCSymbol *Hi,
+                                              const MCSymbol *Lo) const {
+  OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
+}
+
 /// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
 /// where the size in bytes of the directive is specified by Size and Label
 /// specifies the label. This implicitly uses .set if it is available.
@@ -3288,7 +3389,8 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
       ExtraBitsSize = alignTo(ExtraBitsSize, 8);
       ExtraBits = Realigned.getRawData()[0] &
                   (((uint64_t)-1) >> (64 - ExtraBitsSize));
-      Realigned.lshrInPlace(ExtraBitsSize);
+      if (BitWidth >= 64)
+        Realigned.lshrInPlace(ExtraBitsSize);
     } else
       ExtraBits = Realigned.getRawData()[BitWidth / 64];
   }
@@ -3917,16 +4019,18 @@ void AsmPrinter::emitXRayTable() {
         Flags, 0, GroupName, F.hasComdat(), MCSection::NonUniqueID,
         LinkedToSym);
 
-    if (!TM.Options.XRayOmitFunctionIndex)
+    if (TM.Options.XRayFunctionIndex)
       FnSledIndex = OutContext.getELFSection(
-          "xray_fn_idx", ELF::SHT_PROGBITS, Flags | ELF::SHF_WRITE, 0,
-          GroupName, F.hasComdat(), MCSection::NonUniqueID, LinkedToSym);
+          "xray_fn_idx", ELF::SHT_PROGBITS, Flags, 0, GroupName, F.hasComdat(),
+          MCSection::NonUniqueID, LinkedToSym);
   } else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
-    InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+    InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map",
+                                         MachO::S_ATTR_LIVE_SUPPORT,
                                          SectionKind::getReadOnlyWithRel());
-    if (!TM.Options.XRayOmitFunctionIndex)
-      FnSledIndex = OutContext.getMachOSection(
-          "__DATA", "xray_fn_idx", 0, SectionKind::getReadOnlyWithRel());
+    if (TM.Options.XRayFunctionIndex)
+      FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx",
+                                               MachO::S_ATTR_LIVE_SUPPORT,
+                                               SectionKind::getReadOnly());
  } else {
    llvm_unreachable("Unsupported target");
  }
@@ -3937,7 +4041,8 @@ void AsmPrinter::emitXRayTable() {
   // per-function, we are able to create an index entry that will represent the
   // range of sleds associated with a function.
   auto &Ctx = OutContext;
-  MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true);
+  MCSymbol *SledsStart =
+      OutContext.createLinkerPrivateSymbol("xray_sleds_start");
   OutStreamer->switchSection(InstMap);
   OutStreamer->emitLabel(SledsStart);
   for (const auto &Sled : Sleds) {
@@ -3968,8 +4073,17 @@ void AsmPrinter::emitXRayTable() {
     OutStreamer->switchSection(FnSledIndex);
     OutStreamer->emitCodeAlignment(Align(2 * WordSizeBytes),
                                    &getSubtargetInfo());
-    OutStreamer->emitSymbolValue(SledsStart, WordSizeBytes, false);
-    OutStreamer->emitSymbolValue(SledsEnd, WordSizeBytes, false);
+    // For Mach-O, use an "l" symbol as the atom of this subsection. The label
+    // difference uses a SUBTRACTOR external relocation which references the
+    // symbol.
+    MCSymbol *Dot = Ctx.createLinkerPrivateSymbol("xray_fn_idx");
+    OutStreamer->emitLabel(Dot);
+    OutStreamer->emitValueImpl(
+        MCBinaryExpr::createSub(MCSymbolRefExpr::create(SledsStart, Ctx),
+                                MCSymbolRefExpr::create(Dot, Ctx), Ctx),
+        WordSizeBytes);
+    OutStreamer->emitValueImpl(MCConstantExpr::create(Sleds.size(), Ctx),
+                               WordSizeBytes);
     OutStreamer->switchSection(PrevSection);
   }
   Sleds.clear();
@@ -4041,7 +4155,7 @@ unsigned int AsmPrinter::getDwarfOffsetByteSize() const {
 }
 
 dwarf::FormParams AsmPrinter::getDwarfFormParams() const {
-  return {getDwarfVersion(), uint8_t(getPointerSize()),
+  return {getDwarfVersion(), uint8_t(MAI->getCodePointerSize()),
          OutStreamer->getContext().getDwarfFormat(),
          doesDwarfUseRelocationsAcrossSections()};
 }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index ecaa64afab4d..21d0d070c247 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -32,28 +32,6 @@ using namespace llvm;
 // Dwarf Emission Helper Routines
 //===----------------------------------------------------------------------===//
 
-/// EmitSLEB128 - emit the specified signed leb128 value.
-void AsmPrinter::emitSLEB128(int64_t Value, const char *Desc) const {
-  if (isVerbose() && Desc)
-    OutStreamer->AddComment(Desc);
-
-  OutStreamer->emitSLEB128IntValue(Value);
-}
-
-void AsmPrinter::emitULEB128(uint64_t Value, const char *Desc,
-                             unsigned PadTo) const {
-  if (isVerbose() && Desc)
-    OutStreamer->AddComment(Desc);
-
-  OutStreamer->emitULEB128IntValue(Value, PadTo);
-}
-
-/// Emit something like ".uleb128 Hi-Lo".
-void AsmPrinter::emitLabelDifferenceAsULEB128(const MCSymbol *Hi,
-                                              const MCSymbol *Lo) const {
-  OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
-}
-
 static const char *DecodeDWARFEncoding(unsigned Encoding) {
   switch (Encoding) {
   case dwarf::DW_EH_PE_absptr:
@@ -130,7 +108,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
   default:
     llvm_unreachable("Invalid encoded value.");
   case dwarf::DW_EH_PE_absptr:
-    return MF->getDataLayout().getPointerSize();
+    return MAI->getCodePointerSize();
   case dwarf::DW_EH_PE_udata2:
     return 2;
   case dwarf::DW_EH_PE_udata4:
@@ -226,58 +204,59 @@ void AsmPrinter::emitCallSiteValue(uint64_t Value, unsigned Encoding) const {
 //===----------------------------------------------------------------------===//
 
 void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
+  SMLoc Loc = Inst.getLoc();
   switch (Inst.getOperation()) {
   default:
     llvm_unreachable("Unexpected instruction");
   case MCCFIInstruction::OpDefCfaOffset:
-    OutStreamer->emitCFIDefCfaOffset(Inst.getOffset());
+    OutStreamer->emitCFIDefCfaOffset(Inst.getOffset(), Loc);
     break;
   case MCCFIInstruction::OpAdjustCfaOffset:
-    OutStreamer->emitCFIAdjustCfaOffset(Inst.getOffset());
+    OutStreamer->emitCFIAdjustCfaOffset(Inst.getOffset(), Loc);
     break;
   case MCCFIInstruction::OpDefCfa:
-    OutStreamer->emitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
+    OutStreamer->emitCFIDefCfa(Inst.getRegister(), Inst.getOffset(), Loc);
     break;
   case MCCFIInstruction::OpDefCfaRegister:
-    OutStreamer->emitCFIDefCfaRegister(Inst.getRegister());
+    OutStreamer->emitCFIDefCfaRegister(Inst.getRegister(), Loc);
     break;
   case MCCFIInstruction::OpLLVMDefAspaceCfa:
     OutStreamer->emitCFILLVMDefAspaceCfa(Inst.getRegister(), Inst.getOffset(),
-                                         Inst.getAddressSpace());
+                                         Inst.getAddressSpace(), Loc);
     break;
   case MCCFIInstruction::OpOffset:
-    OutStreamer->emitCFIOffset(Inst.getRegister(), Inst.getOffset());
+    OutStreamer->emitCFIOffset(Inst.getRegister(), Inst.getOffset(), Loc);
     break;
   case MCCFIInstruction::OpRegister:
-    OutStreamer->emitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+    OutStreamer->emitCFIRegister(Inst.getRegister(), Inst.getRegister2(), Loc);
     break;
   case MCCFIInstruction::OpWindowSave:
-    OutStreamer->emitCFIWindowSave();
+    OutStreamer->emitCFIWindowSave(Loc);
     break;
   case MCCFIInstruction::OpNegateRAState:
-    OutStreamer->emitCFINegateRAState();
+    OutStreamer->emitCFINegateRAState(Loc);
     break;
   case MCCFIInstruction::OpSameValue:
-    OutStreamer->emitCFISameValue(Inst.getRegister());
+    OutStreamer->emitCFISameValue(Inst.getRegister(), Loc);
     break;
   case MCCFIInstruction::OpGnuArgsSize:
-    OutStreamer->emitCFIGnuArgsSize(Inst.getOffset());
+    OutStreamer->emitCFIGnuArgsSize(Inst.getOffset(), Loc);
     break;
   case MCCFIInstruction::OpEscape:
     OutStreamer->AddComment(Inst.getComment());
-    OutStreamer->emitCFIEscape(Inst.getValues());
+    OutStreamer->emitCFIEscape(Inst.getValues(), Loc);
     break;
   case MCCFIInstruction::OpRestore:
-    OutStreamer->emitCFIRestore(Inst.getRegister());
+    OutStreamer->emitCFIRestore(Inst.getRegister(), Loc);
     break;
   case MCCFIInstruction::OpUndefined:
-    OutStreamer->emitCFIUndefined(Inst.getRegister());
+    OutStreamer->emitCFIUndefined(Inst.getRegister(), Loc);
     break;
   case MCCFIInstruction::OpRememberState:
-    OutStreamer->emitCFIRememberState();
+    OutStreamer->emitCFIRememberState(Loc);
     break;
   case MCCFIInstruction::OpRestoreState:
-    OutStreamer->emitCFIRestoreState();
+    OutStreamer->emitCFIRestoreState(Loc);
     break;
   }
 }
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index c1588aaea05e..32674bbeb061 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 0a67c4b6beb6..8161de57b58e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/BinaryFormat/Dwarf.h"
@@ -65,6 +64,7 @@
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
 #include <algorithm>
 #include <cassert>
 #include <cctype>
@@ -488,10 +488,10 @@ void CodeViewDebug::recordLocalVariable(LocalVariable &&Var,
     // This variable was inlined. Associate it with the InlineSite.
     const DISubprogram *Inlinee = Var.DIVar->getScope()->getSubprogram();
     InlineSite &Site = getInlineSite(InlinedAt, Inlinee);
-    Site.InlinedLocals.emplace_back(Var);
+    Site.InlinedLocals.emplace_back(std::move(Var));
   } else {
     // This variable goes into the corresponding lexical scope.
-    ScopeVariables[LS].emplace_back(Var);
+    ScopeVariables[LS].emplace_back(std::move(Var));
   }
 }
 
@@ -569,7 +569,6 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
   case dwarf::DW_LANG_C89:
   case dwarf::DW_LANG_C99:
   case dwarf::DW_LANG_C11:
-  case dwarf::DW_LANG_ObjC:
     return SourceLanguage::C;
   case dwarf::DW_LANG_C_plus_plus:
   case dwarf::DW_LANG_C_plus_plus_03:
@@ -595,6 +594,10 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
     return SourceLanguage::Swift;
   case dwarf::DW_LANG_Rust:
     return SourceLanguage::Rust;
+  case dwarf::DW_LANG_ObjC:
+    return SourceLanguage::ObjC;
+  case dwarf::DW_LANG_ObjC_plus_plus:
+    return SourceLanguage::ObjCpp;
   default:
     // There's no CodeView representation for this language, and CV doesn't
     // have an "unknown" option for the language field, so we'll use MASM,
@@ -788,7 +791,6 @@ void CodeViewDebug::emitObjName() {
     // Don't emit the filename if we're writing to stdout or to /dev/null.
     PathRef = {};
   } else {
-    llvm::sys::path::remove_dots(PathStore, /*remove_dot_dot=*/true);
     PathRef = PathStore;
   }
 
@@ -1158,7 +1160,14 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
   OS.AddComment("Function section index");
   OS.emitCOFFSectionIndex(Fn);
   OS.AddComment("Flags");
-  OS.emitInt8(0);
+  ProcSymFlags ProcFlags = ProcSymFlags::HasOptimizedDebugInfo;
+  if (FI.HasFramePointer)
+    ProcFlags |= ProcSymFlags::HasFP;
+  if (GV->hasFnAttribute(Attribute::NoReturn))
+    ProcFlags |= ProcSymFlags::IsNoReturn;
+  if (GV->hasFnAttribute(Attribute::NoInline))
+    ProcFlags |= ProcSymFlags::IsNoInline;
+  OS.emitInt8(static_cast<uint8_t>(ProcFlags));
   // Emit the function display name as a null-terminated string.
   OS.AddComment("Function name");
   // Truncate the name so we won't overflow the record length field.
@@ -1262,7 +1271,8 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
   const TargetFrameLowering *TFI = TSI.getFrameLowering();
   const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
 
-  for (const MachineFunction::VariableDbgInfo &VI : MF.getVariableDbgInfo()) {
+  for (const MachineFunction::VariableDbgInfo &VI :
+       MF.getInStackSlotVariableDbgInfo()) {
     if (!VI.Var)
       continue;
     assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
@@ -1290,7 +1300,8 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
 
     // Get the frame register used and the offset.
     Register FrameReg;
-    StackOffset FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
+    StackOffset FrameOffset =
+        TFI->getFrameIndexReference(*Asm->MF, VI.getStackSlot(), FrameReg);
     uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
 
     assert(!FrameOffset.getScalable() &&
@@ -1476,6 +1487,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
     CurFn->EncodedLocalFramePtrReg = EncodedFramePtrReg::StackPtr;
     CurFn->EncodedParamFramePtrReg = EncodedFramePtrReg::StackPtr;
   } else {
+    CurFn->HasFramePointer = true;
     // If there is an FP, parameters are always relative to it.
     CurFn->EncodedParamFramePtrReg = EncodedFramePtrReg::FramePtr;
     if (CurFn->HasStackRealignment) {
@@ -1717,12 +1729,13 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
     // Otherwise, if it has an upperboud, use (upperbound - lowerbound + 1),
     // where lowerbound is from the LowerBound field of the Subrange,
     // or the language default lowerbound if that field is unspecified.
-    if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt *>())
+    if (auto *CI = dyn_cast_if_present<ConstantInt *>(Subrange->getCount()))
       Count = CI->getSExtValue();
-    else if (auto *UI = Subrange->getUpperBound().dyn_cast<ConstantInt *>()) {
+    else if (auto *UI = dyn_cast_if_present<ConstantInt *>(
+                 Subrange->getUpperBound())) {
       // Fortran uses 1 as the default lowerbound; other languages use 0.
       int64_t Lowerbound = (moduleIsInFortran()) ? 1 : 0;
-      auto *LI = Subrange->getLowerBound().dyn_cast<ConstantInt *>();
+      auto *LI = dyn_cast_if_present<ConstantInt *>(Subrange->getLowerBound());
       Lowerbound = (LI) ? LI->getSExtValue() : Lowerbound;
       Count = UI->getSExtValue() - Lowerbound + 1;
     }
@@ -1793,12 +1806,14 @@ TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
     }
     break;
   case dwarf::DW_ATE_complex_float:
+    // The CodeView size for a complex represents the size of
+    // an individual component.
     switch (ByteSize) {
-    case 2:  STK = SimpleTypeKind::Complex16;  break;
-    case 4:  STK = SimpleTypeKind::Complex32;  break;
-    case 8:  STK = SimpleTypeKind::Complex64;  break;
-    case 10: STK = SimpleTypeKind::Complex80;  break;
-    case 16: STK = SimpleTypeKind::Complex128; break;
+    case 4:  STK = SimpleTypeKind::Complex16;  break;
+    case 8:  STK = SimpleTypeKind::Complex32;  break;
+    case 16: STK = SimpleTypeKind::Complex64;  break;
+    case 20: STK = SimpleTypeKind::Complex80;  break;
+    case 32: STK = SimpleTypeKind::Complex128; break;
     }
     break;
   case dwarf::DW_ATE_float:
@@ -3279,7 +3294,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
 
   // Second, emit each global that is in a comdat into its own .debug$S
   // section along with its own symbol substream.
   for (const CVGlobalVariable &CVGV : ComdatVariables) {
-    const GlobalVariable *GV = CVGV.GVInfo.get<const GlobalVariable *>();
+    const GlobalVariable *GV = cast<const GlobalVariable *>(CVGV.GVInfo);
     MCSymbol *GVSym = Asm->getSymbol(GV);
     OS.AddComment("Symbol subsection for " +
                   Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
@@ -3388,7 +3403,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
           : getFullyQualifiedName(Scope, DIGV->getName());
 
   if (const GlobalVariable *GV =
-          CVGV.GVInfo.dyn_cast<const GlobalVariable *>()) {
+          dyn_cast_if_present<const GlobalVariable *>(CVGV.GVInfo)) {
     // DataSym record, see SymbolRecord.h for more info. Thread local data
     // happens to have the same format as global data.
     MCSymbol *GVSym = Asm->getSymbol(GV);
@@ -3403,7 +3418,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
     OS.AddComment("DataOffset");
 
     uint64_t Offset = 0;
-    if (CVGlobalVariableOffsets.find(DIGV) != CVGlobalVariableOffsets.end())
+    if (CVGlobalVariableOffsets.contains(DIGV))
       // Use the offset seen while collecting info on globals.
       Offset = CVGlobalVariableOffsets[DIGV];
     OS.emitCOFFSecRel32(GVSym, Offset);
@@ -3415,7 +3430,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
     emitNullTerminatedSymbolName(OS, QualifiedName, LengthOfDataRecord);
     endSymbolRecord(DataEnd);
   } else {
-    const DIExpression *DIE = CVGV.GVInfo.get<const DIExpression *>();
+    const DIExpression *DIE = cast<const DIExpression *>(CVGV.GVInfo);
     assert(DIE->isConstant() &&
            "Global constant variables must contain a constant expression.");
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 495822a6e653..1455ac417824 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
 
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
@@ -191,6 +192,8 @@ private:
     bool HasStackRealignment = false;
 
     bool HaveLineInfo = false;
+
+    bool HasFramePointer = false;
   };
   FunctionInfo *CurFn = nullptr;
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
index 308d4b1b5d61..619155cafe92 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -173,9 +173,7 @@ void DIEAbbrevSet::Emit(const AsmPrinter *AP, MCSection *Section) const {
 // DIE Implementation
 //===----------------------------------------------------------------------===//
 
-DIE *DIE::getParent() const {
-  return Owner.dyn_cast<DIE*>();
-}
+DIE *DIE::getParent() const { return dyn_cast_if_present<DIE *>(Owner); }
 
 DIEAbbrev DIE::generateAbbrev() const {
   DIEAbbrev Abbrev(Tag, hasChildren());
@@ -209,7 +207,7 @@ const DIE *DIE::getUnitDie() const {
 DIEUnit *DIE::getUnit() const {
   const DIE *UnitDie = getUnitDie();
   if (UnitDie)
-    return UnitDie->Owner.dyn_cast<DIEUnit*>();
+    return dyn_cast_if_present<DIEUnit *>(UnitDie->Owner);
   return nullptr;
 }
 
@@ -385,6 +383,7 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
   case dwarf::DW_FORM_strx2:
   case dwarf::DW_FORM_addrx2:
   case dwarf::DW_FORM_strx3:
+  case dwarf::DW_FORM_addrx3:
   case dwarf::DW_FORM_strp:
   case dwarf::DW_FORM_ref4:
   case dwarf::DW_FORM_data4:
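Several hunks above replace the member functions PointerUnion::get/dyn_cast with the free functions cast and dyn_cast_if_present, the casting interface LLVM now prefers for tagged pointer unions. A minimal sketch assuming LLVM's ADT headers are available:

```cpp
#include "llvm/ADT/PointerUnion.h"
#include "llvm/Support/Casting.h"
#include <iostream>

// Stand-ins for the real GlobalVariable/DIExpression alternatives.
struct GlobalVariable { int Value = 42; };
struct Expression { int Op = 7; };

int main() {
  // A tagged pointer that holds either a GlobalVariable* or an Expression*.
  GlobalVariable GV;
  llvm::PointerUnion<GlobalVariable *, Expression *> Info = &GV;

  // dyn_cast_if_present returns null both when the union holds the other
  // alternative and when it is empty, which is why the patch prefers it.
  if (auto *G = llvm::dyn_cast_if_present<GlobalVariable *>(Info))
    std::cout << "global with value " << G->Value << '\n';
  if (llvm::dyn_cast_if_present<Expression *>(Info) == nullptr)
    std::cout << "not an expression\n";
}
```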
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 0b40cdb0c3cc..55a0afcf7a33 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -138,6 +138,9 @@ void DbgValueHistoryMap::trimLocationRanges(
   // references if any entries are removed.
   SmallVector<size_t, 4> Offsets;
 
+  LLVM_DEBUG(dbgs() << "Trimming location ranges for function '" << MF.getName()
+                    << "'\n");
+
   for (auto &Record : VarEntries) {
     auto &HistoryMapEntries = Record.second;
     if (HistoryMapEntries.empty())
@@ -213,6 +216,8 @@ void DbgValueHistoryMap::trimLocationRanges(
         // count of the closing entry, if one exists.
         if (EndIndex != NoEntry)
           ReferenceCount[EndIndex] -= 1;
+        LLVM_DEBUG(dbgs() << "Dropping value outside scope range of variable: ";
+                   StartMI->print(llvm::dbgs()););
       }
     }
 
@@ -253,6 +258,8 @@ void DbgValueHistoryMap::trimLocationRanges(
       // ToRemove indices are valid after each erase.
       for (EntryIndex Idx : llvm::reverse(ToRemove))
         HistoryMapEntries.erase(HistoryMapEntries.begin() + Idx);
+      LLVM_DEBUG(llvm::dbgs() << "New HistoryMap('" << LocalVar->getName()
+                              << "') size: " << HistoryMapEntries.size() << "\n");
     }
   }
 
@@ -555,8 +562,8 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void DbgValueHistoryMap::dump() const {
-  dbgs() << "DbgValueHistoryMap:\n";
+LLVM_DUMP_METHOD void DbgValueHistoryMap::dump(StringRef FuncName) const {
+  dbgs() << "DbgValueHistoryMap('" << FuncName << "'):\n";
   for (const auto &VarRangePair : *this) {
     const InlinedEntity &Var = VarRangePair.first;
     const Entries &Entries = VarRangePair.second;
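The new trace lines above use LLVM's LLVM_DEBUG/DEBUG_TYPE machinery: the statements compile away in release builds and, in asserts builds, print only when the tool runs with -debug or -debug-only=<type>. A minimal sketch assuming LLVM's Support headers and a hypothetical debug type name:

```cpp
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// Messages below appear only in asserts builds and only when the driver is
// invoked with -debug or -debug-only=history-sketch.
#define DEBUG_TYPE "history-sketch"

using namespace llvm;

static void trimRanges(int NumEntries) {
  LLVM_DEBUG(dbgs() << "Trimming location ranges, entries: " << NumEntries
                    << '\n');
}

int main() {
  trimRanges(3); // silent unless the debug flag for this type is enabled
  return 0;
}
```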
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 858a3e75e515..eb2d992c7e75 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -223,6 +223,7 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
           Encoding == dwarf::DW_ATE_signed_char ||
           Encoding == dwarf::DW_ATE_float || Encoding == dwarf::DW_ATE_UTF ||
           Encoding == dwarf::DW_ATE_boolean ||
+          Encoding == dwarf::DW_ATE_complex_float ||
           (Ty->getTag() == dwarf::DW_TAG_unspecified_type &&
            Ty->getName() == "decltype(nullptr)")) &&
          "Unsupported encoding");
@@ -273,7 +274,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
   InstOrdering.initialize(*MF);
   if (TrimVarLocs)
     DbgValues.trimLocationRanges(*MF, LScopes, InstOrdering);
-  LLVM_DEBUG(DbgValues.dump());
+  LLVM_DEBUG(DbgValues.dump(MF->getName()));
 
   // Request labels for the full history.
   for (const auto &I : DbgValues) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 2008aa39ff87..726aba18bb80 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -126,7 +126,7 @@ public:
       : Expression(Expr), ValueLocEntries(Locs.begin(), Locs.end()),
         IsVariadic(IsVariadic) {
 #ifndef NDEBUG
-    assert(cast<DIExpression>(Expr)->isValid() ||
+    assert(Expr->isValid() ||
           !any_of(Locs, [](auto LE) { return LE.isLocation(); }));
    if (!IsVariadic) {
      assert(ValueLocEntries.size() == 1);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
index 0515173b4a24..a96bdd034918 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -49,7 +49,7 @@ private:
   SmallVector<Entry, 32> Entries;
   SmallString<256> DWARFBytes;
   std::vector<std::string> Comments;
-  MCSymbol *Sym;
+  MCSymbol *Sym = nullptr;
 
   /// Only verbose textual output needs comments. This will be set to
   /// true for that case, and false otherwise.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index df4fe8d49806..10c844ddb14a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -94,7 +94,7 @@ void DwarfCFIException::beginFunction(const MachineFunction *MF) {
     shouldEmitCFI =
         MAI.usesCFIForEH() && (shouldEmitPersonality || shouldEmitMoves);
   else
-    shouldEmitCFI = Asm->needsCFIForDebug() && shouldEmitMoves;
+    shouldEmitCFI = Asm->usesCFIWithoutEH() && shouldEmitMoves;
 }
 
 void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 6dde50375a60..58ed21379d29 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -266,7 +267,7 @@ void DwarfCompileUnit::addLocationAttribute(
   // 16-bit platforms like MSP430 and AVR take this path, so sink this
   // assert to platforms that use it.
   auto GetPointerSizedFormAndOp = [this]() {
-    unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+    unsigned PointerSize = Asm->MAI->getCodePointerSize();
     assert((PointerSize == 4 || PointerSize == 8) &&
            "Add support for other sizes if necessary");
     struct FormAndOp {
@@ -278,7 +279,16 @@ void DwarfCompileUnit::addLocationAttribute(
         : FormAndOp{dwarf::DW_FORM_data8, dwarf::DW_OP_const8u};
   };
   if (Global->isThreadLocal()) {
-    if (Asm->TM.useEmulatedTLS()) {
+    if (Asm->TM.getTargetTriple().isWasm()) {
+      // FIXME This is not guaranteed, but in practice, in static linking,
+      // if present, __tls_base's index is 1. This doesn't hold for dynamic
+      // linking, so TLS variables used in dynamic linking won't have
+      // correct debug info for now. See
+      // https://github.com/llvm/llvm-project/blob/19afbfe33156d211fa959dadeea46cd17b9c723c/lld/wasm/Driver.cpp#L786-L823
+      addWasmRelocBaseGlobal(Loc, "__tls_base", 1);
+      addOpAddress(*Loc, Sym);
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    } else if (Asm->TM.useEmulatedTLS()) {
       // TODO: add debug info for emulated thread local mode.
     } else {
       // FIXME: Make this work with -gsplit-dwarf.
@@ -301,6 +311,14 @@ void DwarfCompileUnit::addLocationAttribute(
                 DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
                                       : dwarf::DW_OP_form_tls_address);
     }
+  } else if (Asm->TM.getTargetTriple().isWasm() &&
+             Asm->TM.getRelocationModel() == Reloc::PIC_) {
+    // FIXME This is not guaranteed, but in practice, if present,
+    // __memory_base's index is 1. See
+    // https://github.com/llvm/llvm-project/blob/19afbfe33156d211fa959dadeea46cd17b9c723c/lld/wasm/Driver.cpp#L786-L823
+    addWasmRelocBaseGlobal(Loc, "__memory_base", 1);
+    addOpAddress(*Loc, Sym);
+    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
   } else if ((Asm->TM.getRelocationModel() == Reloc::RWPI ||
               Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) &&
              !Asm->getObjFileLowering()
@@ -449,6 +467,39 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
   return ContextCU->updateSubprogramScopeDIEImpl(SP, SPDie);
 }
 
+// Add info for Wasm-global-based relocation.
+// 'GlobalIndex' is used for split dwarf, which currently relies on a few
+// assumptions that are not guaranteed in a formal way but work in practice.
+void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
+                                              uint64_t GlobalIndex) {
+  // FIXME: duplicated from Target/WebAssembly/WebAssembly.h
+  // don't want to depend on target specific headers in this code?
+  const unsigned TI_GLOBAL_RELOC = 3;
+  unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+  auto *Sym = cast<MCSymbolWasm>(Asm->GetExternalSymbolSymbol(GlobalName));
+  // FIXME: this repeats what WebAssemblyMCInstLower::
+  // GetExternalSymbolSymbol does, since if there's no code that
+  // refers to this symbol, we have to set it here.
+  Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+  Sym->setGlobalType(wasm::WasmGlobalType{
+      static_cast<uint8_t>(PointerSize == 4 ? wasm::WASM_TYPE_I32
+                                            : wasm::WASM_TYPE_I64),
+      true});
+  addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_WASM_location);
+  addSInt(*Loc, dwarf::DW_FORM_sdata, TI_GLOBAL_RELOC);
+  if (!isDwoUnit()) {
+    addLabel(*Loc, dwarf::DW_FORM_data4, Sym);
+  } else {
+    // FIXME: when writing dwo, we need to avoid relocations. Probably
+    // the "right" solution is to treat globals the way func and data
+    // symbols are (with entries in .debug_addr).
+    // For now we hardcode the indices in the callsites. Global indices are not
+    // fixed, but in practice a few are fixed; for example, __stack_pointer is
+    // always index 0.
+    addUInt(*Loc, dwarf::DW_FORM_data4, GlobalIndex);
+  }
+}
+
 DIE &DwarfCompileUnit::updateSubprogramScopeDIEImpl(const DISubprogram *SP,
                                                     DIE *SPDie) {
   SmallVector<RangeSpan, 2> BB_List;
@@ -480,40 +531,24 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIEImpl(const DISubprogram *SP,
   case TargetFrameLowering::DwarfFrameBase::CFA: {
     DIELoc *Loc = new (DIEValueAllocator) DIELoc;
     addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_call_frame_cfa);
+    if (FrameBase.Location.Offset != 0) {
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_consts);
+      addSInt(*Loc, dwarf::DW_FORM_sdata, FrameBase.Location.Offset);
+      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    }
     addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
     break;
   }
   case TargetFrameLowering::DwarfFrameBase::WasmFrameBase: {
     // FIXME: duplicated from Target/WebAssembly/WebAssembly.h
-    // don't want to depend on target specific headers in this code?
     const unsigned TI_GLOBAL_RELOC = 3;
     if (FrameBase.Location.WasmLoc.Kind == TI_GLOBAL_RELOC) {
       // These need to be relocatable.
-      assert(FrameBase.Location.WasmLoc.Index == 0); // Only SP so far.
-      auto SPSym = cast<MCSymbolWasm>(
-          Asm->GetExternalSymbolSymbol("__stack_pointer"));
-      // FIXME: this repeats what WebAssemblyMCInstLower::
-      // GetExternalSymbolSymbol does, since if there's no code that
-      // refers to this symbol, we have to set it here.
-      SPSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
-      SPSym->setGlobalType(wasm::WasmGlobalType{
-          uint8_t(Asm->getSubtargetInfo().getTargetTriple().getArch() ==
-                          Triple::wasm64
-                      ? wasm::WASM_TYPE_I64
-                      : wasm::WASM_TYPE_I32),
-          true});
       DIELoc *Loc = new (DIEValueAllocator) DIELoc;
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_WASM_location);
-      addSInt(*Loc, dwarf::DW_FORM_sdata, TI_GLOBAL_RELOC);
-      if (!isDwoUnit()) {
-        addLabel(*Loc, dwarf::DW_FORM_data4, SPSym);
-      } else {
-        // FIXME: when writing dwo, we need to avoid relocations. Probably
-        // the "right" solution is to treat globals the way func and data
-        // symbols are (with entries in .debug_addr).
-        // For now, since we only ever use index 0, this should work as-is.
-        addUInt(*Loc, dwarf::DW_FORM_data4, FrameBase.Location.WasmLoc.Index);
-      }
+      assert(FrameBase.Location.WasmLoc.Index == 0); // Only SP so far.
+      // For now, since we only ever use index 0, this should work as-is.
+      addWasmRelocBaseGlobal(Loc, "__stack_pointer",
+                             FrameBase.Location.WasmLoc.Index);
       addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
       addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
     } else {
@@ -608,7 +643,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
   assert(!Ranges.empty());
   if (!DD->useRangesSection() ||
       (Ranges.size() == 1 &&
-       (!DD->alwaysUseRanges() ||
+       (!DD->alwaysUseRanges(*this) ||
        DD->getSectionLabel(&Ranges.front().Begin->getSection()) ==
            Ranges.front().Begin))) {
    const RangeSpan &Front = Ranges.front();
@@ -659,7 +694,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope,
   auto *InlinedSP = getDISubprogram(DS);
   // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
   // was inlined from another compile unit.
- DIE *OriginDIE = getAbstractSPDies()[InlinedSP]; + DIE *OriginDIE = getAbstractScopeDIEs()[InlinedSP]; assert(OriginDIE && "Unable to find original DIE for an inlined subprogram."); auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine); @@ -691,10 +726,20 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope, DIE *DwarfCompileUnit::constructLexicalScopeDIE(LexicalScope *Scope) { if (DD->isLexicalScopeDIENull(Scope)) return nullptr; + const auto *DS = Scope->getScopeNode(); auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_lexical_block); - if (Scope->isAbstractScope()) + if (Scope->isAbstractScope()) { + assert(!getAbstractScopeDIEs().count(DS) && + "Abstract DIE for this scope exists!"); + getAbstractScopeDIEs()[DS] = ScopeDIE; return ScopeDIE; + } + if (!Scope->getInlinedAt()) { + assert(!LexicalBlockDIEs.count(DS) && + "Concrete out-of-line DIE for this scope exists!"); + LexicalBlockDIEs[DS] = ScopeDIE; + } attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges()); @@ -929,29 +974,29 @@ static SmallVector<const DIVariable *, 2> dependencies(DbgVariable *Var) { for (auto *El : Array->getElements()) { if (auto *Subrange = dyn_cast<DISubrange>(El)) { if (auto Count = Subrange->getCount()) - if (auto *Dependency = Count.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(Count)) Result.push_back(Dependency); if (auto LB = Subrange->getLowerBound()) - if (auto *Dependency = LB.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(LB)) Result.push_back(Dependency); if (auto UB = Subrange->getUpperBound()) - if (auto *Dependency = UB.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(UB)) Result.push_back(Dependency); if (auto ST = Subrange->getStride()) - if (auto *Dependency = ST.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(ST)) Result.push_back(Dependency); } else if (auto *GenericSubrange = dyn_cast<DIGenericSubrange>(El)) { if (auto Count = GenericSubrange->getCount()) - if (auto *Dependency = Count.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(Count)) Result.push_back(Dependency); if (auto LB = GenericSubrange->getLowerBound()) - if (auto *Dependency = LB.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(LB)) Result.push_back(Dependency); if (auto UB = GenericSubrange->getUpperBound()) - if (auto *Dependency = UB.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(UB)) Result.push_back(Dependency); if (auto ST = GenericSubrange->getStride()) - if (auto *Dependency = ST.dyn_cast<DIVariable *>()) + if (auto *Dependency = dyn_cast_if_present<DIVariable *>(ST)) Result.push_back(Dependency); } } @@ -1062,35 +1107,35 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, for (DbgVariable *DV : Locals) ScopeDIE.addChild(constructVariableDIE(*DV, *Scope, ObjectPointer)); - // Emit imported entities (skipped in gmlt-like data). - if (!includeMinimalInlineScopes()) { - for (const auto *IE : ImportedEntities[Scope->getScopeNode()]) - ScopeDIE.addChild(constructImportedEntityDIE(cast<DIImportedEntity>(IE))); - } - // Emit labels. for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope)) ScopeDIE.addChild(constructLabelDIE(*DL, *Scope)); + // Track other local entities (skipped in gmlt-like data). 
+ // This creates mapping between CU and a set of local declarations that + // should be emitted for subprograms in this CU. + if (!includeMinimalInlineScopes() && !Scope->getInlinedAt()) { + auto &LocalDecls = DD->getLocalDeclsForScope(Scope->getScopeNode()); + DeferredLocalDecls.insert(LocalDecls.begin(), LocalDecls.end()); + } + // Emit inner lexical scopes. - auto needToEmitLexicalScope = [this](LexicalScope *LS) { - if (isa<DISubprogram>(LS->getScopeNode())) - return true; - auto Vars = DU->getScopeVariables().lookup(LS); + auto skipLexicalScope = [this](LexicalScope *S) -> bool { + if (isa<DISubprogram>(S->getScopeNode())) + return false; + auto Vars = DU->getScopeVariables().lookup(S); if (!Vars.Args.empty() || !Vars.Locals.empty()) - return true; - if (!includeMinimalInlineScopes() && - !ImportedEntities[LS->getScopeNode()].empty()) - return true; - return false; + return false; + return includeMinimalInlineScopes() || + DD->getLocalDeclsForScope(S->getScopeNode()).empty(); }; for (LexicalScope *LS : Scope->getChildren()) { // If the lexical block doesn't have non-scope children, skip // its emission and put its children directly to the parent scope. - if (needToEmitLexicalScope(LS)) - constructScopeDIE(LS, ScopeDIE); - else + if (skipLexicalScope(LS)) createAndAddScopeChildren(LS, ScopeDIE); + else + constructScopeDIE(LS, ScopeDIE); } return ObjectPointer; @@ -1098,11 +1143,9 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope, void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( LexicalScope *Scope) { - DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()]; - if (AbsDef) - return; - auto *SP = cast<DISubprogram>(Scope->getScopeNode()); + if (getAbstractScopeDIEs().count(SP)) + return; DIE *ContextDIE; DwarfCompileUnit *ContextCU = this; @@ -1126,14 +1169,19 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( // Passing null as the associated node because the abstract definition // shouldn't be found by lookup. - AbsDef = &ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, nullptr); - ContextCU->applySubprogramAttributesToDefinition(SP, *AbsDef); - ContextCU->addSInt(*AbsDef, dwarf::DW_AT_inline, + DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram, + *ContextDIE, nullptr); + + // Store the DIE before creating children. + ContextCU->getAbstractScopeDIEs()[SP] = &AbsDef; + + ContextCU->applySubprogramAttributesToDefinition(SP, AbsDef); + ContextCU->addSInt(AbsDef, dwarf::DW_AT_inline, DD->getDwarfVersion() <= 4 ? std::optional<dwarf::Form>() : dwarf::DW_FORM_implicit_const, dwarf::DW_INL_inlined); - if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef)) - ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer); + if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef)) + ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer); } bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const { @@ -1277,21 +1325,37 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( EntityDie = getOrCreateNameSpace(NS); else if (auto *M = dyn_cast<DIModule>(Entity)) EntityDie = getOrCreateModule(M); - else if (auto *SP = dyn_cast<DISubprogram>(Entity)) - EntityDie = getOrCreateSubprogramDIE(SP); - else if (auto *T = dyn_cast<DIType>(Entity)) + else if (auto *SP = dyn_cast<DISubprogram>(Entity)) { + // If there is an abstract subprogram, refer to it. 
Note that this assumes + // that all the abstract subprograms have been already created (which is + // correct until imported entities get emitted in DwarfDebug::endModule()). + if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP)) + EntityDie = AbsSPDie; + else + EntityDie = getOrCreateSubprogramDIE(SP); + } else if (auto *T = dyn_cast<DIType>(Entity)) EntityDie = getOrCreateTypeDIE(T); else if (auto *GV = dyn_cast<DIGlobalVariable>(Entity)) EntityDie = getOrCreateGlobalVariableDIE(GV, {}); + else if (auto *IE = dyn_cast<DIImportedEntity>(Entity)) + EntityDie = getOrCreateImportedEntityDIE(IE); else EntityDie = getDIE(Entity); assert(EntityDie); addSourceLine(*IMDie, Module->getLine(), Module->getFile()); addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie); StringRef Name = Module->getName(); - if (!Name.empty()) + if (!Name.empty()) { addString(*IMDie, dwarf::DW_AT_name, Name); + // FIXME: if consumers ever start caring about handling + // unnamed import declarations such as `using ::nullptr_t` + // or `using namespace std::ranges`, we could add the + // import declaration into the accelerator table with the + // name being the one of the entity being imported. + DD->addAccelNamespace(*CUNode, Name, *IMDie); + } + // This is for imported module with renamed entities (such as variables and // subprograms). DINodeArray Elements = Module->getElements(); @@ -1305,9 +1369,24 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( return IMDie; } +DIE *DwarfCompileUnit::getOrCreateImportedEntityDIE( + const DIImportedEntity *IE) { + + // Check for pre-existence. + if (DIE *Die = getDIE(IE)) + return Die; + + DIE *ContextDIE = getOrCreateContextDIE(IE->getScope()); + assert(ContextDIE && "Empty scope for the imported entity!"); + + DIE *IMDie = constructImportedEntityDIE(IE); + ContextDIE->addChild(IMDie); + return IMDie; +} + void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) { DIE *D = getDIE(SP); - if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) { + if (DIE *AbsSPDIE = getAbstractScopeDIEs().lookup(SP)) { if (D) // If this subprogram has an abstract definition, reference that addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE); @@ -1356,8 +1435,8 @@ void DwarfCompileUnit::createAbstractEntity(const DINode *Node, assert(Scope && Scope->isAbstractScope()); auto &Entity = getAbstractEntities()[Node]; if (isa<const DILocalVariable>(Node)) { - Entity = std::make_unique<DbgVariable>( - cast<const DILocalVariable>(Node), nullptr /* IA */);; + Entity = std::make_unique<DbgVariable>(cast<const DILocalVariable>(Node), + nullptr /* IA */); DU->addScopeVariable(Scope, cast<DbgVariable>(Entity.get())); } else if (isa<const DILabel>(Node)) { Entity = std::make_unique<DbgLabel>( @@ -1389,6 +1468,8 @@ bool DwarfCompileUnit::hasDwarfPubSections() const { // generated for things like Gold's gdb_index generation. case DICompileUnit::DebugNameTableKind::GNU: return true; + case DICompileUnit::DebugNameTableKind::Apple: + return false; case DICompileUnit::DebugNameTableKind::Default: return DD->tuneForGDB() && !includeMinimalInlineScopes() && !CUNode->isDebugDirectivesOnly() && @@ -1599,3 +1680,29 @@ void DwarfCompileUnit::createBaseTypeDIEs() { Btr.Die = &Die; } } + +DIE *DwarfCompileUnit::getLexicalBlockDIE(const DILexicalBlock *LB) { + // Assume if there is an abstract tree all the DIEs are already emitted. 
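// The DebugNameTableKind switch in hasDwarfPubSections above stays
// default-free so -Wswitch flags any enumerator a future patch adds (the
// new Apple case being the instance at hand). Minimal sketch of the idiom,
// heavily simplified versus the real predicate:
enum class NameTableKind { Default, GNU, None, Apple };

bool emitPubSections(NameTableKind K, bool TuneForGDB) {
  switch (K) {
  case NameTableKind::None:
    return false;
  case NameTableKind::Apple: // .apple_* accelerator tables instead
    return false;
  case NameTableKind::GNU:   // consumed by gold's gdb_index generation
    return true;
  case NameTableKind::Default:
    return TuneForGDB;
  }
  return false; // unreachable: all enumerators handled above
}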
+ bool isAbstract = getAbstractScopeDIEs().count(LB->getSubprogram()); + if (isAbstract && getAbstractScopeDIEs().count(LB)) + return getAbstractScopeDIEs()[LB]; + assert(!isAbstract && "Missed lexical block DIE in abstract tree!"); + + // Return a concrete DIE if it exists or nullptr otherwise. + return LexicalBlockDIEs.lookup(LB); +} + +DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) { + if (isa_and_nonnull<DILocalScope>(Context)) { + if (auto *LFScope = dyn_cast<DILexicalBlockFile>(Context)) + Context = LFScope->getNonLexicalBlockFileScope(); + if (auto *LScope = dyn_cast<DILexicalBlock>(Context)) + return getLexicalBlockDIE(LScope); + + // Otherwise the context must be a DISubprogram. + auto *SPScope = cast<DISubprogram>(Context); + if (getAbstractScopeDIEs().count(SPScope)) + return getAbstractScopeDIEs()[SPScope]; + } + return DwarfUnit::getOrCreateContextDIE(Context); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 7d87f35021bb..6ef73ebd4f7f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -61,11 +61,6 @@ class DwarfCompileUnit final : public DwarfUnit { /// The start of the unit macro info within macro section. MCSymbol *MacroLabelBegin; - using ImportedEntityList = SmallVector<const MDNode *, 8>; - using ImportedEntityMap = DenseMap<const MDNode *, ImportedEntityList>; - - ImportedEntityMap ImportedEntities; - /// GlobalNames - A map of globally visible named entities for this unit. StringMap<const DIE *> GlobalNames; @@ -79,7 +74,20 @@ class DwarfCompileUnit final : public DwarfUnit { // ranges/locs. const MCSymbol *BaseAddress = nullptr; - DenseMap<const MDNode *, DIE *> AbstractSPDies; + using MDNodeSetVector = + SetVector<const MDNode *, SmallVector<const MDNode *, 4>, + SmallPtrSet<const MDNode *, 4>>; + + // List of entities (either static locals, types or imports) that + // belong to subprograms within this CU. + MDNodeSetVector DeferredLocalDecls; + + // List of concrete lexical block scopes belong to subprograms within this CU. + DenseMap<const DILocalScope *, DIE *> LexicalBlockDIEs; + + // List of abstract local scopes (either DISubprogram or DILexicalBlock). + DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs; + DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities; /// DWO ID for correlating skeleton and split units. @@ -94,10 +102,10 @@ class DwarfCompileUnit final : public DwarfUnit { bool isDwoUnit() const override; - DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { + DenseMap<const DILocalScope *, DIE *> &getAbstractScopeDIEs() { if (isDwoUnit() && !DD->shareAcrossDWOCUs()) - return AbstractSPDies; - return DU->getAbstractSPDies(); + return AbstractLocalScopeDIEs; + return DU->getAbstractScopeDIEs(); } DenseMap<const DINode *, std::unique_ptr<DbgEntity>> &getAbstractEntities() { @@ -108,6 +116,10 @@ class DwarfCompileUnit final : public DwarfUnit { void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override; + /// Add info for Wasm-global-based relocation. 
+ void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName, + uint64_t GlobalIndex); + public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, @@ -171,17 +183,6 @@ public: unsigned getOrCreateSourceID(const DIFile *File) override; - void addImportedEntity(const DIImportedEntity* IE) { - DIScope *Scope = IE->getScope(); - assert(Scope && "Invalid Scope encoding!"); - if (!isa<DILocalScope>(Scope)) - // No need to add imported enities that are not local declaration. - return; - - auto *LocalScope = cast<DILocalScope>(Scope)->getNonLexicalBlockFileScope(); - ImportedEntities[LocalScope].push_back(IE); - } - /// addRange - Add an address range to the list of ranges for this unit. void addRange(RangeSpan Range); @@ -213,6 +214,11 @@ public: /// attach DW_AT_low_pc/DW_AT_high_pc labels. DIE *constructLexicalScopeDIE(LexicalScope *Scope); + /// Get a DIE for the given DILexicalBlock. + /// Note that this function assumes that the DIE has been already created + /// and it's an error, if it hasn't. + DIE *getLexicalBlockDIE(const DILexicalBlock *LB); + /// constructVariableDIE - Construct a DIE for the given DbgVariable. DIE *constructVariableDIE(DbgVariable &DV, bool Abstract = false); @@ -224,6 +230,10 @@ public: void createBaseTypeDIEs(); + /// Construct a DIE for a given scope. + /// This instance of 'getOrCreateContextDIE()' can handle DILocalScope. + DIE *getOrCreateContextDIE(const DIScope *Ty) override; + /// Construct a DIE for this subprogram scope. DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope); @@ -262,8 +272,9 @@ public: void constructCallSiteParmEntryDIEs(DIE &CallSiteDIE, SmallVector<DbgCallSiteParam, 4> &Params); - /// Construct import_module DIE. - DIE *constructImportedEntityDIE(const DIImportedEntity *Module); + /// Get or create a DIE for an imported entity. + DIE *getOrCreateImportedEntityDIE(const DIImportedEntity *IE); + DIE *constructImportedEntityDIE(const DIImportedEntity *IE); void finishSubprogramDefinition(const DISubprogram *SP); void finishEntityDefinition(const DbgEntity *Entity); @@ -360,6 +371,8 @@ public: bool hasDwarfPubSections() const; void addBaseTypeRef(DIEValueList &Die, int64_t Idx); + + MDNodeSetVector &getDeferredLocalDecls() { return DeferredLocalDecls; } }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index cde790cc77fb..1ae17ec9b874 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -18,7 +18,7 @@ #include "DwarfUnit.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/Triple.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" @@ -53,6 +53,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" #include <algorithm> #include <cstddef> #include <iterator> @@ -452,14 +453,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A) // Split DWARF would benefit object size significantly by trading reductions // in address pool usage for slightly increased range list encodings. 
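// DeferredLocalDecls (declared above) is a SetVector: the SmallPtrSet gives
// O(1) dedup while the SmallVector preserves insertion order, so endModule()
// emits the deferred declarations deterministically. Quick demonstration
// using the convenience alias (assumes llvm/ADT/SetVector.h):
#include "llvm/ADT/SetVector.h"
#include <cassert>

void demoSetVector() {
  llvm::SmallSetVector<int, 4> Decls;
  Decls.insert(3);
  Decls.insert(1);
  Decls.insert(3); // duplicate: ignored
  assert(Decls.size() == 2);
  assert(Decls[0] == 3 && Decls[1] == 1); // insertion order, not sorted
}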
- if (DwarfVersion >= 5) { + if (DwarfVersion >= 5) MinimizeAddr = MinimizeAddrInV5Option; - // FIXME: In the future, enable this by default for Split DWARF where the - // tradeoff is more pronounced due to being able to offload the range - // lists to the dwo file and shrink object files/reduce relocations there. - if (MinimizeAddr == MinimizeAddrInV5::Default) - MinimizeAddr = MinimizeAddrInV5::Disabled; - } Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); Asm->OutStreamer->getContext().setDwarfFormat(Dwarf64 ? dwarf::DWARF64 @@ -500,6 +495,7 @@ static StringRef getObjCMethodName(StringRef In) { void DwarfDebug::addSubprogramNames(const DICompileUnit &CU, const DISubprogram *SP, DIE &Die) { if (getAccelTableKind() != AccelTableKind::Apple && + CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Apple && CU.getNameTableKind() == DICompileUnit::DebugNameTableKind::None) return; @@ -513,7 +509,7 @@ void DwarfDebug::addSubprogramNames(const DICompileUnit &CU, // well into the name table. Only do that if we are going to actually emit // that name. if (SP->getLinkageName() != "" && SP->getName() != SP->getLinkageName() && - (useAllLinkageNames() || InfoHolder.getAbstractSPDies().lookup(SP))) + (useAllLinkageNames() || InfoHolder.getAbstractScopeDIEs().lookup(SP))) addAccelName(CU, SP->getLinkageName(), Die); // If this is an Objective-C selector name add it to the ObjC accelerator @@ -710,13 +706,13 @@ static void interpretValues(const MachineInstr *CurMI, if (MI.isDebugInstr()) return; - for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical()) { + for (const MachineOperand &MO : MI.all_defs()) { + if (MO.getReg().isPhysical()) { for (auto &FwdReg : ForwardedRegWorklist) if (TRI.regsOverlap(FwdReg.first, MO.getReg())) Defs.insert(FwdReg.first); - for (MCRegUnitIterator Units(MO.getReg(), &TRI); Units.isValid(); ++Units) - NewClobberedRegUnits.insert(*Units); + for (MCRegUnit Unit : TRI.regunits(MO.getReg())) + NewClobberedRegUnits.insert(Unit); } } }; @@ -1050,11 +1046,11 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit, if (!SDK.empty()) NewCU.addString(Die, dwarf::DW_AT_APPLE_sdk, SDK); - // Add DW_str_offsets_base to the unit DIE, except for split units. - if (useSegmentedStringOffsetsTable() && !useSplitDwarf()) - NewCU.addStringOffsetsStart(); - if (!useSplitDwarf()) { + // Add DW_str_offsets_base to the unit DIE, except for split units. + if (useSegmentedStringOffsetsTable()) + NewCU.addStringOffsetsStart(); + NewCU.initStmtList(); // If we're using split dwarf the compilation dir is going to be in the @@ -1097,6 +1093,13 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { if (auto *CU = CUMap.lookup(DIUnit)) return *CU; + if (useSplitDwarf() && + !shareAcrossDWOCUs() && + (!DIUnit->getSplitDebugInlining() || + DIUnit->getEmissionKind() == DICompileUnit::FullDebug) && + !CUMap.empty()) { + return *CUMap.begin()->second; + } CompilationDir = DIUnit->getDirectory(); auto OwnedUnit = std::make_unique<DwarfCompileUnit>( @@ -1104,9 +1107,6 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { DwarfCompileUnit &NewCU = *OwnedUnit; InfoHolder.addUnit(std::move(OwnedUnit)); - for (auto *IE : DIUnit->getImportedEntities()) - NewCU.addImportedEntity(IE); - // LTO with assembly output shares a single line table amongst multiple CUs. 
// To avoid the compilation directory being ambiguous, let the line table // explicitly describe the directory of all files, never relying on the @@ -1129,14 +1129,6 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { return NewCU; } -void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, - const DIImportedEntity *N) { - if (isa<DILocalScope>(N->getScope())) - return; - if (DIE *D = TheCU.getOrCreateContextDIE(N->getScope())) - D->addChild(TheCU.constructImportedEntityDIE(N)); -} - /// Sort and unique GVEs by comparing their fragment offset. static SmallVectorImpl<DwarfCompileUnit::GlobalExpr> & sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) { @@ -1214,16 +1206,8 @@ void DwarfDebug::beginModule(Module *M) { DebugLocs.setSym(Asm->createTempSymbol("loclists_table_base")); for (DICompileUnit *CUNode : M->debug_compile_units()) { - // FIXME: Move local imported entities into a list attached to the - // subprogram, then this search won't be needed and a - // getImportedEntities().empty() test should go below with the rest. - bool HasNonLocalImportedEntities = llvm::any_of( - CUNode->getImportedEntities(), [](const DIImportedEntity *IE) { - return !isa<DILocalScope>(IE->getScope()); - }); - - if (!HasNonLocalImportedEntities && CUNode->getEnumTypes().empty() && - CUNode->getRetainedTypes().empty() && + if (CUNode->getImportedEntities().empty() && + CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() && CUNode->getGlobalVariables().empty() && CUNode->getMacros().empty()) continue; @@ -1257,10 +1241,6 @@ void DwarfDebug::beginModule(Module *M) { // There is no point in force-emitting a forward declaration. CU.getOrCreateTypeDIE(RT); } - // Emit imported_modules last so that the relevant context is already - // available. - for (auto *IE : CUNode->getImportedEntities()) - constructAndAddImportedEntityDIE(CU, IE); } } @@ -1300,6 +1280,8 @@ void DwarfDebug::finalizeModuleInfo() { if (CUMap.size() > 1) DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile; + bool HasEmittedSplitCU = false; + // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -1318,6 +1300,10 @@ void DwarfDebug::finalizeModuleInfo() { bool HasSplitUnit = SkCU && !TheCU.getUnitDie().children().empty(); if (HasSplitUnit) { + (void)HasEmittedSplitCU; + assert((shareAcrossDWOCUs() || !HasEmittedSplitCU) && + "Multiple CUs emitted into a single dwo file"); + HasEmittedSplitCU = true; dwarf::Attribute attrDWOName = getDwarfVersion() >= 5 ? dwarf::DW_AT_dwo_name : dwarf::DW_AT_GNU_dwo_name; @@ -1377,11 +1363,10 @@ void DwarfDebug::finalizeModuleInfo() { if (U.hasRangeLists()) U.addRnglistsBase(); - if (!DebugLocs.getLists().empty()) { - if (!useSplitDwarf()) - U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_loclists_base, - DebugLocs.getSym(), - TLOF.getDwarfLoclistsSection()->getBeginSymbol()); + if (!DebugLocs.getLists().empty() && !useSplitDwarf()) { + U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_loclists_base, + DebugLocs.getSym(), + TLOF.getDwarfLoclistsSection()->getBeginSymbol()); } } @@ -1436,8 +1421,24 @@ void DwarfDebug::endModule() { assert(CurMI == nullptr); for (const auto &P : CUMap) { - auto &CU = *P.second; - CU.createBaseTypeDIEs(); + const auto *CUNode = cast<DICompileUnit>(P.first); + DwarfCompileUnit *CU = &*P.second; + + // Emit imported entities. 
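// The (void)HasEmittedSplitCU in finalizeModuleInfo above keeps release
// builds warning-clean: with NDEBUG the assert vanishes and the flag would
// otherwise be set but never read. The pattern in isolation (illustrative):
#include <cassert>

void emitSplitUnits(int NumUnits, bool ShareAcrossDWOCUs) {
  bool EmittedOne = false;
  for (int I = 0; I < NumUnits; ++I) {
    (void)EmittedOne; // counts as a use when NDEBUG strips the assert
    assert((ShareAcrossDWOCUs || !EmittedOne) &&
           "Multiple CUs emitted into a single dwo file");
    EmittedOne = true;
    // ... emit unit I ...
  }
}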
+ for (auto *IE : CUNode->getImportedEntities()) { + assert(!isa_and_nonnull<DILocalScope>(IE->getScope()) && + "Unexpected function-local entity in 'imports' CU field."); + CU->getOrCreateImportedEntityDIE(IE); + } + for (const auto *D : CU->getDeferredLocalDecls()) { + if (auto *IE = dyn_cast<DIImportedEntity>(D)) + CU->getOrCreateImportedEntityDIE(IE); + else + llvm_unreachable("Unexpected local retained node!"); + } + + // Emit base types. + CU->createBaseTypeDIEs(); } // If we aren't actually generating debug info (check beginModule - @@ -1511,16 +1512,6 @@ void DwarfDebug::endModule() { // FIXME: AbstractVariables.clear(); } -void DwarfDebug::ensureAbstractEntityIsCreated(DwarfCompileUnit &CU, - const DINode *Node, - const MDNode *ScopeNode) { - if (CU.getExistingAbstractEntity(Node)) - return; - - CU.createAbstractEntity(Node, LScopes.getOrCreateAbstractScope( - cast<DILocalScope>(ScopeNode))); -} - void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, const DINode *Node, const MDNode *ScopeNode) { if (CU.getExistingAbstractEntity(Node)) @@ -1531,6 +1522,21 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, CU.createAbstractEntity(Node, Scope); } +static const DILocalScope *getRetainedNodeScope(const MDNode *N) { + const DIScope *S; + if (const auto *LV = dyn_cast<DILocalVariable>(N)) + S = LV->getScope(); + else if (const auto *L = dyn_cast<DILabel>(N)) + S = L->getScope(); + else if (const auto *IE = dyn_cast<DIImportedEntity>(N)) + S = IE->getScope(); + else + llvm_unreachable("Unexpected retained node!"); + + // Ensure the scope is not a DILexicalBlockFile. + return cast<DILocalScope>(S)->getNonLexicalBlockFileScope(); +} + // Collect variable information from side table maintained by MF. void DwarfDebug::collectVariableInfoFromMFTable( DwarfCompileUnit &TheCU, DenseSet<InlinedEntity> &Processed) { @@ -1556,13 +1562,24 @@ void DwarfDebug::collectVariableInfoFromMFTable( ensureAbstractEntityIsCreatedIfScoped(TheCU, Var.first, Scope->getScopeNode()); auto RegVar = std::make_unique<DbgVariable>( cast<DILocalVariable>(Var.first), Var.second); - RegVar->initializeMMI(VI.Expr, VI.Slot); + if (VI.inStackSlot()) + RegVar->initializeMMI(VI.Expr, VI.getStackSlot()); + else { + MachineLocation MLoc(VI.getEntryValueRegister(), /*IsIndirect*/ true); + auto LocEntry = DbgValueLocEntry(MLoc); + RegVar->initializeDbgValue(DbgValueLoc(VI.Expr, LocEntry)); + } LLVM_DEBUG(dbgs() << "Created DbgVariable for " << VI.Var->getName() << "\n"); - if (DbgVariable *DbgVar = MFVars.lookup(Var)) - DbgVar->addMMIEntry(*RegVar); - else if (InfoHolder.addScopeVariable(Scope, RegVar.get())) { + if (DbgVariable *DbgVar = MFVars.lookup(Var)) { + if (DbgVar->getValueLoc()) + LLVM_DEBUG(dbgs() << "Dropping repeated entry value debug info for " + "variable " + << VI.Var->getName() << "\n"); + else + DbgVar->addMMIEntry(*RegVar); + } else if (InfoHolder.addScopeVariable(Scope, RegVar.get())) { MFVars.insert({Var, RegVar.get()}); ConcreteEntities.push_back(std::move(RegVar)); } @@ -1964,19 +1981,18 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, createConcreteEntity(TheCU, *Scope, Label, IL.second, Sym); } - // Collect info for variables/labels that were optimized out. + // Collect info for retained nodes. 
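// getRetainedNodeScope (added above, used in the loop below) funnels every
// retained-node kind to a canonical DILocalScope. The same dispatch shape
// with LLVM's casting utilities; assumes llvm/IR/DebugInfoMetadata.h:
#include "llvm/IR/DebugInfoMetadata.h"

const llvm::DILocalScope *scopeOf(const llvm::MDNode *N) {
  const llvm::DIScope *S = nullptr;
  if (const auto *LV = llvm::dyn_cast<llvm::DILocalVariable>(N))
    S = LV->getScope();
  else if (const auto *L = llvm::dyn_cast<llvm::DILabel>(N))
    S = L->getScope();
  else if (const auto *IE = llvm::dyn_cast<llvm::DIImportedEntity>(N))
    S = IE->getScope();
  else
    return nullptr; // the real helper hits llvm_unreachable instead
  // Strip DILexicalBlockFile wrappers so map keys are canonical scopes.
  return llvm::cast<llvm::DILocalScope>(S)->getNonLexicalBlockFileScope();
}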
for (const DINode *DN : SP->getRetainedNodes()) { - if (!Processed.insert(InlinedEntity(DN, nullptr)).second) - continue; - LexicalScope *Scope = nullptr; - if (auto *DV = dyn_cast<DILocalVariable>(DN)) { - Scope = LScopes.findLexicalScope(DV->getScope()); - } else if (auto *DL = dyn_cast<DILabel>(DN)) { - Scope = LScopes.findLexicalScope(DL->getScope()); + const auto *LS = getRetainedNodeScope(DN); + if (isa<DILocalVariable>(DN) || isa<DILabel>(DN)) { + if (!Processed.insert(InlinedEntity(DN, nullptr)).second) + continue; + LexicalScope *LexS = LScopes.findLexicalScope(LS); + if (LexS) + createConcreteEntity(TheCU, *LexS, DN, nullptr); + } else { + LocalDeclsPerLS[LS].insert(DN); } - - if (Scope) - createConcreteEntity(TheCU, *Scope, DN, nullptr); } } @@ -2046,7 +2062,10 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { unsigned LastAsmLine = Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine(); - if (DL == PrevInstLoc) { + bool PrevInstInSameSection = + (!PrevInstBB || + PrevInstBB->getSectionIDNum() == MI->getParent()->getSectionIDNum()); + if (DL == PrevInstLoc && PrevInstInSameSection) { // If we have an ongoing unspecified location, nothing to do here. if (!DL) return; @@ -2114,25 +2133,35 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { PrevInstLoc = DL; } -static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { +static std::pair<DebugLoc, bool> findPrologueEndLoc(const MachineFunction *MF) { // First known non-DBG_VALUE and non-frame setup location marks // the beginning of the function body. DebugLoc LineZeroLoc; + const Function &F = MF->getFunction(); + + // Some instructions may be inserted into prologue after this function. Must + // keep prologue for these cases. + bool IsEmptyPrologue = + !(F.hasPrologueData() || F.getMetadata(LLVMContext::MD_func_sanitize)); for (const auto &MBB : *MF) { for (const auto &MI : MBB) { - if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && - MI.getDebugLoc()) { - // Scan forward to try to find a non-zero line number. The prologue_end - // marks the first breakpoint in the function after the frame setup, and - // a compiler-generated line 0 location is not a meaningful breakpoint. - // If none is found, return the first location after the frame setup. - if (MI.getDebugLoc().getLine()) - return MI.getDebugLoc(); - LineZeroLoc = MI.getDebugLoc(); + if (!MI.isMetaInstruction()) { + if (!MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) { + // Scan forward to try to find a non-zero line number. The + // prologue_end marks the first breakpoint in the function after the + // frame setup, and a compiler-generated line 0 location is not a + // meaningful breakpoint. If none is found, return the first + // location after the frame setup. + if (MI.getDebugLoc().getLine()) + return std::make_pair(MI.getDebugLoc(), IsEmptyPrologue); + + LineZeroLoc = MI.getDebugLoc(); + } + IsEmptyPrologue = false; } } } - return LineZeroLoc; + return std::make_pair(LineZeroLoc, IsEmptyPrologue); } /// Register a source line with debug info. Returns the unique label that was @@ -2159,8 +2188,16 @@ static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col, DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF, unsigned CUID) { + std::pair<DebugLoc, bool> PrologEnd = findPrologueEndLoc(&MF); + DebugLoc PrologEndLoc = PrologEnd.first; + bool IsEmptyPrologue = PrologEnd.second; + // Get beginning of function. 
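// findPrologueEndLoc (above) now also reports whether the prologue is empty,
// so emitInitialLocDirective (below) can skip the scope line entirely. A
// plain-C++ reduction of the scan, with illustrative types:
#include <optional>
#include <utility>
#include <vector>

struct InstStub { bool Meta, FrameSetup; int Line = -1; }; // -1: no DebugLoc

std::pair<std::optional<int>, bool>
prologueEnd(const std::vector<InstStub> &Body, bool HasPrologueData) {
  std::optional<int> LineZero;
  // Prologue/sanitizer metadata means instructions may still be inserted
  // ahead of the body, so the prologue cannot be considered empty.
  bool IsEmptyPrologue = !HasPrologueData;
  for (const InstStub &I : Body) {
    if (I.Meta)
      continue; // DBG_VALUE and friends never end the prologue
    if (!I.FrameSetup && I.Line >= 0) {
      if (I.Line > 0) // first meaningful breakpoint location
        return {I.Line, IsEmptyPrologue};
      LineZero = 0; // compiler-generated line 0: keep only as fallback
    }
    IsEmptyPrologue = false;
  }
  return {LineZero, IsEmptyPrologue};
}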
- if (DebugLoc PrologEndLoc = findPrologueEndLoc(&MF)) { + if (PrologEndLoc) { + // If the prolog is empty, no need to generate scope line for the proc. + if (IsEmptyPrologue) + return PrologEndLoc; + // Ensure the compile unit is created if the function is called before // beginFunction(). (void)getOrCreateDwarfCompileUnit( @@ -2239,7 +2276,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); assert(!FnScope || SP == FnScope->getScopeNode()); - DwarfCompileUnit &TheCU = *CUMap.lookup(SP->getUnit()); + DwarfCompileUnit &TheCU = getOrCreateDwarfCompileUnit(SP->getUnit()); if (TheCU.getCUNode()->isDebugDirectivesOnly()) { PrevLabel = nullptr; CurFn = nullptr; @@ -2260,6 +2297,9 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { if (!TheCU.getCUNode()->getDebugInfoForProfiling() && TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly && LScopes.getAbstractScopesList().empty() && !IsDarwin) { + for (const auto &R : Asm->MBBSectionRanges) + addArangeLabel(SymbolCU(&TheCU, R.second.BeginLabel)); + assert(InfoHolder.getScopeVariables().empty()); PrevLabel = nullptr; CurFn = nullptr; @@ -2267,27 +2307,28 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { } #ifndef NDEBUG - size_t NumAbstractScopes = LScopes.getAbstractScopesList().size(); + size_t NumAbstractSubprograms = LScopes.getAbstractScopesList().size(); #endif - // Construct abstract scopes. for (LexicalScope *AScope : LScopes.getAbstractScopesList()) { const auto *SP = cast<DISubprogram>(AScope->getScopeNode()); for (const DINode *DN : SP->getRetainedNodes()) { - if (!Processed.insert(InlinedEntity(DN, nullptr)).second) - continue; - - const MDNode *Scope = nullptr; - if (auto *DV = dyn_cast<DILocalVariable>(DN)) - Scope = DV->getScope(); - else if (auto *DL = dyn_cast<DILabel>(DN)) - Scope = DL->getScope(); - else - llvm_unreachable("Unexpected DI type!"); - - // Collect info for variables/labels that were optimized out. - ensureAbstractEntityIsCreated(TheCU, DN, Scope); - assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes - && "ensureAbstractEntityIsCreated inserted abstract scopes"); + const auto *LS = getRetainedNodeScope(DN); + // Ensure LexicalScope is created for the scope of this node. + auto *LexS = LScopes.getOrCreateAbstractScope(LS); + assert(LexS && "Expected the LexicalScope to be created."); + if (isa<DILocalVariable>(DN) || isa<DILabel>(DN)) { + // Collect info for variables/labels that were optimized out. + if (!Processed.insert(InlinedEntity(DN, nullptr)).second || + TheCU.getExistingAbstractEntity(DN)) + continue; + TheCU.createAbstractEntity(DN, LexS); + } else { + // Remember the node if this is a local declarations. + LocalDeclsPerLS[LS].insert(DN); + } + assert( + LScopes.getAbstractScopesList().size() == NumAbstractSubprograms && + "getOrCreateAbstractScope() inserted an abstract subprogram scope"); } constructAbstractSubprogramScopeDIE(TheCU, AScope); } @@ -2308,6 +2349,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { // can be used cross-function) InfoHolder.getScopeVariables().clear(); InfoHolder.getScopeLabels().clear(); + LocalDeclsPerLS.clear(); PrevLabel = nullptr; CurFn = nullptr; } @@ -2507,10 +2549,13 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, Asm->emitDwarfLengthOrOffset(TheU->getLength()); // Emit the pubnames for this compilation unit. 
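// The abstract retained-nodes loop above splits work into two buckets:
// variables and labels become abstract entities immediately, while other
// local declarations land in LocalDeclsPerLS for endModule(). Generic
// sketch of the bucketing, with illustrative types:
#include <map>
#include <set>
#include <vector>

enum class NodeKind { Variable, Label, Import, Type };
struct NodeStub { NodeKind K; int Scope, Id; };

void bucketRetained(const std::vector<NodeStub> &Retained,
                    std::map<int, std::set<int>> &DeclsPerScope) {
  for (const NodeStub &N : Retained) {
    if (N.K == NodeKind::Variable || N.K == NodeKind::Label)
      continue; // handled eagerly via createAbstractEntity
    DeclsPerScope[N.Scope].insert(N.Id); // deferred local declaration
  }
}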
- for (const auto &GI : Globals) { - const char *Name = GI.getKeyData(); - const DIE *Entity = GI.second; - + SmallVector<std::pair<StringRef, const DIE *>, 0> Vec; + for (const auto &GI : Globals) + Vec.emplace_back(GI.first(), GI.second); + llvm::sort(Vec, [](auto &A, auto &B) { + return A.second->getOffset() < B.second->getOffset(); + }); + for (const auto &[Name, Entity] : Vec) { Asm->OutStreamer->AddComment("DIE offset"); Asm->emitDwarfLengthOrOffset(Entity->getOffset()); @@ -2523,7 +2568,7 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name, } Asm->OutStreamer->AddComment("External Name"); - Asm->OutStreamer->emitBytes(StringRef(Name, GI.getKeyLength() + 1)); + Asm->OutStreamer->emitBytes(StringRef(Name.data(), Name.size() + 1)); } Asm->OutStreamer->AddComment("End Mark"); @@ -2566,11 +2611,10 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer, for (const auto &Op : Expr) { assert(Op.getCode() != dwarf::DW_OP_const_type && "3 operand ops not yet supported"); + assert(!Op.getSubCode() && "SubOps not yet supported"); Streamer.emitInt8(Op.getCode(), Comment != End ? *(Comment++) : ""); Offset++; - for (unsigned I = 0; I < 2; ++I) { - if (Op.getDescription().Op[I] == Encoding::SizeNA) - continue; + for (unsigned I = 0; I < Op.getDescription().Op.size(); ++I) { if (Op.getDescription().Op[I] == Encoding::BaseTypeRef) { unsigned Length = Streamer.emitDIERef(*CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die); @@ -3495,10 +3539,11 @@ template <typename DataT> void DwarfDebug::addAccelNameImpl(const DICompileUnit &CU, AccelTable<DataT> &AppleAccel, StringRef Name, const DIE &Die) { - if (getAccelTableKind() == AccelTableKind::None) + if (getAccelTableKind() == AccelTableKind::None || Name.empty()) return; if (getAccelTableKind() != AccelTableKind::Apple && + CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Apple && CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Default) return; @@ -3555,11 +3600,9 @@ dwarf::Form DwarfDebug::getDwarfSectionOffsetForm() const { } const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) { - auto I = SectionLabels.find(S); - if (I == SectionLabels.end()) - return nullptr; - return I->second; + return SectionLabels.lookup(S); } + void DwarfDebug::insertSectionLabel(const MCSymbol *S) { if (SectionLabels.insert(std::make_pair(&S->getSection(), S)).second) if (useSplitDwarf() || getDwarfVersion() >= 5) @@ -3583,3 +3626,13 @@ DwarfDebug::getMD5AsBytes(const DIFile *File) const { std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.data()); return CKMem; } + +bool DwarfDebug::alwaysUseRanges(const DwarfCompileUnit &CU) const { + if (MinimizeAddr == MinimizeAddrInV5::Ranges) + return true; + if (MinimizeAddr != MinimizeAddrInV5::Default) + return false; + if (useSplitDwarf()) + return true; + return false; +} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 5d2ef8ee79a7..1af4b643eb17 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -318,9 +318,14 @@ class DwarfDebug : public DebugHandlerBase { /// This is a collection of subprogram MDNodes that are processed to /// create DIEs. 
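// The pubnames hunk above snapshots the StringMap into a vector and sorts
// by DIE offset, making section output independent of hash-table iteration
// order. Same shape with the standard library (illustrative types):
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

struct DIEStub { unsigned Offset; };

void emitInOffsetOrder(
    std::vector<std::pair<std::string, const DIEStub *>> Entries) {
  std::sort(Entries.begin(), Entries.end(),
            [](const auto &A, const auto &B) {
              return A.second->Offset < B.second->Offset; // deterministic
            });
  for (const auto &[Name, Entity] : Entries) {
    // emit Entity->Offset, then Name plus its terminating NUL byte,
    // exactly as the loop above does with emitBytes(size() + 1).
    (void)Name;
    (void)Entity;
  }
}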
- SetVector<const DISubprogram *, SmallVector<const DISubprogram *, 16>, - SmallPtrSet<const DISubprogram *, 16>> - ProcessedSPNodes; + SmallSetVector<const DISubprogram *, 16> ProcessedSPNodes; + + /// Map function-local imported entities to their parent local scope + /// (either DILexicalBlock or DISubprogram) for a processed function + /// (including inlined subprograms). + using MDNodeSet = SetVector<const MDNode *, SmallVector<const MDNode *, 2>, + SmallPtrSet<const MDNode *, 2>>; + DenseMap<const DILocalScope *, MDNodeSet> LocalDeclsPerLS; /// If nonnull, stores the current machine function we're processing. const MachineFunction *CurFn = nullptr; @@ -456,9 +461,6 @@ private: using InlinedEntity = DbgValueHistoryMap::InlinedEntity; - void ensureAbstractEntityIsCreated(DwarfCompileUnit &CU, - const DINode *Node, - const MDNode *Scope); void ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, const DINode *Node, const MDNode *Scope); @@ -598,10 +600,6 @@ private: void finishUnitAttributes(const DICompileUnit *DIUnit, DwarfCompileUnit &NewCU); - /// Construct imported_module or imported_declaration DIE. - void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, - const DIImportedEntity *N); - /// Register a source line with debug info. Returns the unique /// label that was emitted and which provides correspondence to the /// source line list. @@ -696,9 +694,7 @@ public: /// Returns whether range encodings should be used for single entry range /// lists. - bool alwaysUseRanges() const { - return MinimizeAddr == MinimizeAddrInV5::Ranges; - } + bool alwaysUseRanges(const DwarfCompileUnit &) const; // Returns whether novel exprloc addrx+offset encodings should be used to // reduce debug_addr size. @@ -842,6 +838,10 @@ public: /// If the \p File has an MD5 checksum, return it as an MD5Result /// allocated in the MCContext. std::optional<MD5::MD5Result> getMD5AsBytes(const DIFile *File) const; + + MDNodeSet &getLocalDeclsForScope(const DILocalScope *S) { + return LocalDeclsPerLS[S]; + } }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index ab6967f50e30..7623b7fb7c5d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -117,10 +117,10 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // Walk up the super-register chain until we find a valid number. // For example, EAX on x86_64 is a 32-bit fragment of RAX with offset 0. - for (MCSuperRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) { - Reg = TRI.getDwarfRegNum(*SR, false); + for (MCPhysReg SR : TRI.superregs(MachineReg)) { + Reg = TRI.getDwarfRegNum(SR, false); if (Reg >= 0) { - unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg); + unsigned Idx = TRI.getSubRegIndex(SR, MachineReg); unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned RegOffset = TRI.getSubRegIdxOffset(Idx); DwarfRegs.push_back(Register::createRegister(Reg, "super-register")); @@ -142,11 +142,11 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // this doesn't find a combination of subregisters that fully cover // the register (even though one may exist). 
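// The DwarfExpression hunks above swap the MCSuperRegIterator and
// MCSubRegIterator loops for the range helpers. Reduced form of the
// super-register walk, using the same calls as the diff (assumes LLVM
// CodeGen headers):
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Return MachineReg's DWARF number, or that of the first super-register
// carrying one, or -1. The caller then describes the original register as
// a sub-piece of that super-register, as addMachineReg does above.
int findDwarfReg(const llvm::TargetRegisterInfo &TRI, llvm::MCRegister Reg) {
  int Dwarf = TRI.getDwarfRegNum(Reg, /*isEH=*/false);
  if (Dwarf >= 0)
    return Dwarf;
  for (llvm::MCPhysReg SR : TRI.superregs(Reg))
    if ((Dwarf = TRI.getDwarfRegNum(SR, /*isEH=*/false)) >= 0)
      return Dwarf;
  return -1;
}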
SmallBitVector Coverage(RegSize, false); - for (MCSubRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) { - unsigned Idx = TRI.getSubRegIndex(MachineReg, *SR); + for (MCPhysReg SR : TRI.subregs(MachineReg)) { + unsigned Idx = TRI.getSubRegIndex(MachineReg, SR); unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned Offset = TRI.getSubRegIdxOffset(Idx); - Reg = TRI.getDwarfRegNum(*SR, false); + Reg = TRI.getDwarfRegNum(SR, false); if (Reg < 0) continue; @@ -566,6 +566,12 @@ bool DwarfExpression::addExpression( case dwarf::DW_OP_dup: case dwarf::DW_OP_push_object_address: case dwarf::DW_OP_over: + case dwarf::DW_OP_eq: + case dwarf::DW_OP_ne: + case dwarf::DW_OP_gt: + case dwarf::DW_OP_ge: + case dwarf::DW_OP_lt: + case dwarf::DW_OP_le: emitOp(OpNum); break; case dwarf::DW_OP_deref: diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index 79a6ce7801b7..464f4f048016 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -26,6 +26,7 @@ class DbgEntity; class DbgVariable; class DbgLabel; class DINode; +class DILocalScope; class DwarfCompileUnit; class DwarfUnit; class LexicalScope; @@ -87,7 +88,7 @@ class DwarfFile { DenseMap<LexicalScope *, LabelList> ScopeLabels; // Collection of abstract subprogram DIEs. - DenseMap<const MDNode *, DIE *> AbstractSPDies; + DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs; DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities; /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can @@ -162,8 +163,8 @@ public: return ScopeLabels; } - DenseMap<const MDNode *, DIE *> &getAbstractSPDies() { - return AbstractSPDies; + DenseMap<const DILocalScope *, DIE *> &getAbstractScopeDIEs() { + return AbstractLocalScopeDIEs; } DenseMap<const DINode *, std::unique_ptr<DbgEntity>> &getAbstractEntities() { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index c2ff899c04ab..d30f0ef7af34 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -543,7 +543,7 @@ void DwarfUnit::addAccess(DIE &Die, DINode::DIFlags Flags) { } DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { - if (!Context || isa<DIFile>(Context)) + if (!Context || isa<DIFile>(Context) || isa<DICompileUnit>(Context)) return &getUnitDie(); if (auto *T = dyn_cast<DIType>(Context)) return getOrCreateTypeDIE(T); @@ -1223,7 +1223,7 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP, "decl has a linkage name and it is different"); if (DeclLinkageName.empty() && // Always emit it for abstract subprograms. 
- (DD->useAllLinkageNames() || DU->getAbstractSPDies().lookup(SP))) + (DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP))) addLinkageName(SPDie, LinkageName); if (!DeclDie) @@ -1362,16 +1362,16 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR, auto AddBoundTypeEntry = [&](dwarf::Attribute Attr, DISubrange::BoundType Bound) -> void { - if (auto *BV = Bound.dyn_cast<DIVariable *>()) { + if (auto *BV = dyn_cast_if_present<DIVariable *>(Bound)) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); - } else if (auto *BE = Bound.dyn_cast<DIExpression *>()) { + } else if (auto *BE = dyn_cast_if_present<DIExpression *>(Bound)) { DIELoc *Loc = new (DIEValueAllocator) DIELoc; DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); DwarfExpr.setMemoryLocationKind(); DwarfExpr.addExpression(BE); addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); - } else if (auto *BI = Bound.dyn_cast<ConstantInt *>()) { + } else if (auto *BI = dyn_cast_if_present<ConstantInt *>(Bound)) { if (Attr == dwarf::DW_AT_count) { if (BI->getSExtValue() != -1) addUInt(DW_Subrange, Attr, std::nullopt, BI->getSExtValue()); @@ -1401,10 +1401,10 @@ void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer, auto AddBoundTypeEntry = [&](dwarf::Attribute Attr, DIGenericSubrange::BoundType Bound) -> void { - if (auto *BV = Bound.dyn_cast<DIVariable *>()) { + if (auto *BV = dyn_cast_if_present<DIVariable *>(Bound)) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DwGenericSubrange, Attr, *VarDIE); - } else if (auto *BE = Bound.dyn_cast<DIExpression *>()) { + } else if (auto *BE = dyn_cast_if_present<DIExpression *>(Bound)) { if (BE->isConstant() && DIExpression::SignedOrUnsignedConstant::SignedConstant == *BE->isConstant()) { @@ -1463,7 +1463,7 @@ static bool hasVectorBeenPadded(const DICompositeType *CTy) { const auto Subrange = cast<DISubrange>(Elements[0]); const auto NumVecElements = Subrange->getCount() - ? Subrange->getCount().get<ConstantInt *>()->getSExtValue() + ? cast<ConstantInt *>(Subrange->getCount())->getSExtValue() : 0; // Ensure we found the element count and that the actual size is wide diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 0caa6adbfa62..8f17e94c2d1c 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -245,10 +245,10 @@ public: DIE *createTypeDIE(const DIScope *Context, DIE &ContextDIE, const DIType *Ty); /// Find existing DIE or create new DIE for the given type. - DIE *getOrCreateTypeDIE(const MDNode *TyNode); + virtual DIE *getOrCreateTypeDIE(const MDNode *TyNode); /// Get context owner's DIE. - DIE *getOrCreateContextDIE(const DIScope *Context); + virtual DIE *getOrCreateContextDIE(const DIScope *Context); /// Construct DIEs for types that contain vtables. void constructContainingTypeDIEs(); diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 67e2c0e07095..eef6b1d93f36 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -410,7 +410,7 @@ MCSymbol *EHStreamer::emitExceptionTable() { computeActionsTable(LandingPads, Actions, FirstActions); // Compute the call-site table and call-site ranges. Normally, there is only - // one call-site-range which covers the whole funciton. With + // one call-site-range which covers the whole function. With // -basic-block-sections, there is one call-site-range per basic block // section. 
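// DwarfUnit::getOrCreateTypeDIE/getOrCreateContextDIE turn virtual above so
// that DwarfCompileUnit can intercept function-local scopes and reuse the
// DIEs recorded during scope construction. The override shape in plain C++,
// with illustrative types:
#include <map>
#include <string>

struct DIEStub {};

struct UnitBase {
  virtual ~UnitBase() = default;
  virtual DIEStub *getOrCreateContextDIE(const std::string &Ctx) {
    return &Globals[Ctx]; // create-on-demand at unit scope
  }
  std::map<std::string, DIEStub> Globals;
};

struct CompileUnitStub : UnitBase {
  std::map<std::string, DIEStub *> LocalScopes; // filled while emitting scopes
  DIEStub *getOrCreateContextDIE(const std::string &Ctx) override {
    auto It = LocalScopes.find(Ctx);
    if (It != LocalScopes.end())
      return It->second; // function-local: must reuse the recorded DIE
    return UnitBase::getOrCreateContextDIE(Ctx); // otherwise base behavior
  }
};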
SmallVector<CallSiteEntry, 64> CallSites; diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp index 3e75b4371033..59c3fa15885e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp @@ -32,11 +32,7 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, SmallVector<InlineSite, 8> ReversedInlineStack; auto *InlinedAt = DebugLoc ? DebugLoc->getInlinedAt() : nullptr; while (InlinedAt) { - const DISubprogram *SP = InlinedAt->getScope()->getSubprogram(); - // Use linkage name for C++ if possible. - auto Name = SP->getLinkageName(); - if (Name.empty()) - Name = SP->getName(); + auto Name = InlinedAt->getSubprogramLinkageName(); // Use caching to avoid redundant md5 computation for build speed. uint64_t &CallerGuid = NameGuidMap[Name]; if (!CallerGuid) @@ -46,8 +42,15 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId); InlinedAt = InlinedAt->getInlinedAt(); } - + uint64_t Discriminator = 0; + // For now only block probes have FS discriminators. See + // MIRFSDiscriminator.cpp for more details. + if (EnableFSDiscriminator && DebugLoc && + (Type == (uint64_t)PseudoProbeType::Block)) + Discriminator = DebugLoc->getDiscriminator(); + assert((EnableFSDiscriminator || Discriminator == 0) && + "Discriminator should not be set in non-FSAFDO mode"); SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack)); - Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, InlineStack, - Asm->CurrentFnSym); + Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, Discriminator, + InlineStack, Asm->CurrentFnSym); } diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 7a800438592c..6d6432b61f2d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -638,7 +638,7 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo, const SEHUnwindMapEntry &UME = FuncInfo.SEHUnwindMap[State]; const MCExpr *FilterOrFinally; const MCExpr *ExceptOrNull; - auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + auto *Handler = cast<MachineBasicBlock *>(UME.Handler); if (UME.IsFinally) { FilterOrFinally = create32bitRef(getMCSymbolForMBB(Asm, Handler)); ExceptOrNull = MCConstantExpr::create(0, Ctx); @@ -762,7 +762,11 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { OS.emitInt32(0); AddComment("EHFlags"); - OS.emitInt32(1); + if (MMI->getModule()->getModuleFlag("eh-asynch")) { + OS.emitInt32(0); + } else { + OS.emitInt32(1); + } // UnwindMapEntry { // int32_t ToState; @@ -771,8 +775,8 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { if (UnwindMapXData) { OS.emitLabel(UnwindMapXData); for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) { - MCSymbol *CleanupSym = - getMCSymbolForMBB(Asm, UME.Cleanup.dyn_cast<MachineBasicBlock *>()); + MCSymbol *CleanupSym = getMCSymbolForMBB( + Asm, dyn_cast_if_present<MachineBasicBlock *>(UME.Cleanup)); AddComment("ToState"); OS.emitInt32(UME.ToState); @@ -859,8 +863,8 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext); } - MCSymbol *HandlerSym = - getMCSymbolForMBB(Asm, HT.Handler.dyn_cast<MachineBasicBlock *>()); + MCSymbol *HandlerSym = getMCSymbolForMBB( + Asm, 
dyn_cast_if_present<MachineBasicBlock *>(HT.Handler)); AddComment("Adjectives"); OS.emitInt32(HT.Adjectives); @@ -1065,7 +1069,7 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) { assert(!FuncInfo.SEHUnwindMap.empty()); for (const SEHUnwindMapEntry &UME : FuncInfo.SEHUnwindMap) { - auto *Handler = UME.Handler.get<MachineBasicBlock *>(); + auto *Handler = cast<MachineBasicBlock *>(UME.Handler); const MCSymbol *ExceptOrFinally = UME.IsFinally ? getMCSymbolForMBB(Asm, Handler) : Handler->getSymbol(); // -1 is usually the base state for "unwind to caller", but for @@ -1136,7 +1140,7 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) { DenseMap<const MachineBasicBlock *, int> HandlerStates; for (int State = 0; State < NumStates; ++State) { MachineBasicBlock *HandlerBlock = - FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>(); + cast<MachineBasicBlock *>(FuncInfo.ClrEHUnwindMap[State].Handler); HandlerStates[HandlerBlock] = State; // Use this loop through all handlers to verify our assumption (used in // the MinEnclosingState computation) that enclosing funclets have lower @@ -1297,7 +1301,7 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) { const MCExpr *ClauseEnd = getOffsetPlusOne(Clause.EndLabel, FuncBeginSym); const ClrEHUnwindMapEntry &Entry = FuncInfo.ClrEHUnwindMap[Clause.State]; - MachineBasicBlock *HandlerBlock = Entry.Handler.get<MachineBasicBlock *>(); + MachineBasicBlock *HandlerBlock = cast<MachineBasicBlock *>(Entry.Handler); MCSymbol *BeginSym = getMCSymbolForMBB(Asm, HandlerBlock); const MCExpr *HandlerBegin = getOffset(BeginSym, FuncBeginSym); MCSymbol *EndSym = EndSymbolMap[Clause.State]; diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 7098824dbe4b..5ef850d09d92 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -1,4 +1,6 @@ #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" +#include "LiveDebugValues/LiveDebugValues.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/IntervalMap.h" #include "llvm/ADT/PostOrderIterator.h" @@ -47,6 +49,12 @@ static cl::opt<bool> EnableMemLocFragFill("mem-loc-frag-fill", cl::init(true), static cl::opt<bool> PrintResults("print-debug-ata", cl::init(false), cl::Hidden); +/// Coalesce adjacent dbg locs describing memory locations that have contiguous +/// fragments. This reduces the cost of LiveDebugValues which does SSA +/// construction for each explicitly stated variable fragment. +static cl::opt<cl::boolOrDefault> + CoalesceAdjacentFragmentsOpt("debug-ata-coalesce-frags", cl::Hidden); + // Implicit conversions are disabled for enum class types, so unfortunately we // need to create a DenseMapInfo wrapper around the specified underlying type. template <> struct llvm::DenseMapInfo<VariableID> { @@ -79,6 +87,8 @@ class FunctionVarLocsBuilder { SmallVector<VarLocInfo> SingleLocVars; public: + unsigned getNumVariables() const { return Variables.size(); } + /// Find or insert \p V and return the ID. VariableID insertVariable(DebugVariable V) { return static_cast<VariableID>(Variables.insert(V)); @@ -105,23 +115,23 @@ public: /// Add a def for a variable that is valid for its lifetime. 
void addSingleLocVar(DebugVariable Var, DIExpression *Expr, DebugLoc DL, - Value *V) { + RawLocationWrapper R) { VarLocInfo VarLoc; VarLoc.VariableID = insertVariable(Var); VarLoc.Expr = Expr; VarLoc.DL = DL; - VarLoc.V = V; + VarLoc.Values = R; SingleLocVars.emplace_back(VarLoc); } /// Add a def to the wedge of defs just before /p Before. void addVarLoc(Instruction *Before, DebugVariable Var, DIExpression *Expr, - DebugLoc DL, Value *V) { + DebugLoc DL, RawLocationWrapper R) { VarLocInfo VarLoc; VarLoc.VariableID = insertVariable(Var); VarLoc.Expr = Expr; VarLoc.DL = DL; - VarLoc.V = V; + VarLoc.Values = R; VarLocsBeforeInst[Before].emplace_back(VarLoc); } }; @@ -148,7 +158,11 @@ void FunctionVarLocs::print(raw_ostream &OS, const Function &Fn) const { auto PrintLoc = [&OS](const VarLocInfo &Loc) { OS << "DEF Var=[" << (unsigned)Loc.VariableID << "]" - << " Expr=" << *Loc.Expr << " V=" << *Loc.V << "\n"; + << " Expr=" << *Loc.Expr << " Values=("; + for (auto *Op : Loc.Values.location_ops()) { + errs() << Op->getName() << " "; + } + errs() << ")\n"; }; // Print the single location variables. @@ -234,13 +248,13 @@ getDerefOffsetInBytes(const DIExpression *DIExpr) { int64_t Offset = 0; const unsigned NumElements = DIExpr->getNumElements(); const auto Elements = DIExpr->getElements(); - unsigned NextElement = 0; + unsigned ExpectedDerefIdx = 0; // Extract the offset. if (NumElements > 2 && Elements[0] == dwarf::DW_OP_plus_uconst) { Offset = Elements[1]; - NextElement = 2; + ExpectedDerefIdx = 2; } else if (NumElements > 3 && Elements[0] == dwarf::DW_OP_constu) { - NextElement = 3; + ExpectedDerefIdx = 3; if (Elements[2] == dwarf::DW_OP_plus) Offset = Elements[1]; else if (Elements[2] == dwarf::DW_OP_minus) @@ -250,19 +264,21 @@ getDerefOffsetInBytes(const DIExpression *DIExpr) { } // If that's all there is it means there's no deref. - if (NextElement >= NumElements) + if (ExpectedDerefIdx >= NumElements) return std::nullopt; // Check the next element is DW_OP_deref - otherwise this is too complex or // isn't a deref expression. - if (Elements[NextElement] != dwarf::DW_OP_deref) + if (Elements[ExpectedDerefIdx] != dwarf::DW_OP_deref) return std::nullopt; // Check the final operation is either the DW_OP_deref or is a fragment. - if (NumElements == NextElement + 1) + if (NumElements == ExpectedDerefIdx + 1) return Offset; // Ends with deref. - else if (NumElements == NextElement + 3 && - Elements[NextElement] == dwarf::DW_OP_LLVM_fragment) + unsigned ExpectedFragFirstIdx = ExpectedDerefIdx + 1; + unsigned ExpectedFragFinalIdx = ExpectedFragFirstIdx + 2; + if (NumElements == ExpectedFragFinalIdx + 1 && + Elements[ExpectedFragFirstIdx] == dwarf::DW_OP_LLVM_fragment) return Offset; // Ends with deref + fragment. // Don't bother trying to interpret anything more complex. @@ -278,6 +294,24 @@ static DebugAggregate getAggregate(const DebugVariable &Var) { return DebugAggregate(Var.getVariable(), Var.getInlinedAt()); } +static bool shouldCoalesceFragments(Function &F) { + // Enabling fragment coalescing reduces compiler run time when instruction + // referencing is enabled. However, it may cause LiveDebugVariables to create + // incorrect locations. Since instruction-referencing mode effectively + // bypasses LiveDebugVariables we only enable coalescing if the cl::opt flag + // has not been explicitly set and instruction-referencing is turned on. 
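// shouldCoalesceFragments (the switch just below) reads a tri-state flag:
// cl::boolOrDefault distinguishes an explicit true/false from "unset, use
// the heuristic". Stand-alone sketch of the pattern; the flag name here is
// illustrative (assumes LLVM's CommandLine/ErrorHandling headers):
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"

static llvm::cl::opt<llvm::cl::boolOrDefault>
    ExampleCoalesce("example-coalesce-frags", llvm::cl::Hidden,
                    llvm::cl::desc("Illustrative tri-state flag"));

bool shouldEnable(bool HeuristicSaysYes) {
  switch (ExampleCoalesce) {
  case llvm::cl::boolOrDefault::BOU_UNSET:
    return HeuristicSaysYes; // flag not given on the command line
  case llvm::cl::boolOrDefault::BOU_TRUE:
    return true;
  case llvm::cl::boolOrDefault::BOU_FALSE:
    return false;
  }
  llvm_unreachable("covered switch");
}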
+ switch (CoalesceAdjacentFragmentsOpt) { + case cl::boolOrDefault::BOU_UNSET: + return debuginfoShouldUseDebugInstrRef( + Triple(F.getParent()->getTargetTriple())); + case cl::boolOrDefault::BOU_TRUE: + return true; + case cl::boolOrDefault::BOU_FALSE: + return false; + } + llvm_unreachable("Unknown boolOrDefault value"); +} + namespace { /// In dwarf emission, the following sequence /// 1. dbg.value ... Fragment(0, 64) @@ -301,6 +335,7 @@ class MemLocFragmentFill { Function &Fn; FunctionVarLocsBuilder *FnVarLocs; const DenseSet<DebugAggregate> *VarsWithStackSlot; + bool CoalesceAdjacentFragments; // 0 = no memory location. using BaseAddress = unsigned; @@ -315,7 +350,7 @@ class MemLocFragmentFill { /// IDs for memory location base addresses in maps. Use 0 to indicate that /// there's no memory location. - UniqueVector<Value *> Bases; + UniqueVector<RawLocationWrapper> Bases; UniqueVector<DebugAggregate> Aggregates; DenseMap<const BasicBlock *, VarFragMap> LiveIn; DenseMap<const BasicBlock *, VarFragMap> LiveOut; @@ -368,7 +403,7 @@ class MemLocFragmentFill { /// Return a string for the value that \p BaseID represents. std::string toString(unsigned BaseID) { if (BaseID) - return Bases[BaseID]->getName().str(); + return Bases[BaseID].getVariableLocationOp(0)->getName().str(); else return "None"; } @@ -565,6 +600,31 @@ class MemLocFragmentFill { << " bits [" << StartBit << ", " << EndBit << ")\n"); } + /// Inserts a new dbg def if the interval found when looking up \p StartBit + /// in \p FragMap starts before \p StartBit or ends after \p EndBit (which + /// indicates - assuming StartBit->EndBit has just been inserted - that the + /// slice has been coalesced in the map). + void coalesceFragments(BasicBlock &BB, Instruction &Before, unsigned Var, + unsigned StartBit, unsigned EndBit, unsigned Base, + DebugLoc DL, const FragsInMemMap &FragMap) { + if (!CoalesceAdjacentFragments) + return; + // We've inserted the location into the map. The map will have coalesced + // adjacent intervals (variable fragments) that describe the same memory + // location. Use this knowledge to insert a debug location that describes + // that coalesced fragment. This may eclipse other locs we've just + // inserted. This is okay as redundant locs will be cleaned up later. + auto CoalescedFrag = FragMap.find(StartBit); + // Bail if no coalescing has taken place. + if (CoalescedFrag.start() == StartBit && CoalescedFrag.stop() == EndBit) + return; + + LLVM_DEBUG(dbgs() << "- Insert loc for bits " << CoalescedFrag.start() + << " to " << CoalescedFrag.stop() << "\n"); + insertMemLoc(BB, Before, Var, CoalescedFrag.start(), CoalescedFrag.stop(), + Base, DL); + } + void addDef(const VarLocInfo &VarLoc, Instruction &Before, BasicBlock &BB, VarFragMap &LiveSet) { DebugVariable DbgVar = FnVarLocs->getVariable(VarLoc.VariableID); @@ -601,7 +661,7 @@ class MemLocFragmentFill { const auto DerefOffsetInBytes = getDerefOffsetInBytes(DIExpr); const unsigned Base = DerefOffsetInBytes && *DerefOffsetInBytes * 8 == StartBit - ? Bases.insert(VarLoc.V) + ? Bases.insert(VarLoc.Values) : 0; LLVM_DEBUG(dbgs() << "DEF " << DbgVar.getVariable()->getName() << " [" << StartBit << ", " << EndBit << "): " << toString(Base) @@ -630,6 +690,8 @@ class MemLocFragmentFill { if (!FragMap.overlaps(StartBit, EndBit)) { LLVM_DEBUG(dbgs() << "- No overlaps\n"); FragMap.insert(StartBit, EndBit, Base); + coalesceFragments(BB, Before, Var, StartBit, EndBit, Base, VarLoc.DL, + FragMap); return; } // There is at least one overlap. 
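// coalesceFragments (above) relies on llvm::IntervalMap merging an inserted
// interval with an adjacent one that maps to an equal value; a find() after
// insert() then reveals the widened extent, which is the signal to emit a
// broader location. Minimal demonstration with the default closed-interval
// traits - the pass itself works over half-open bit ranges:
#include "llvm/ADT/IntervalMap.h"
#include <cassert>

void demoCoalescing() {
  using FragMap = llvm::IntervalMap<unsigned, unsigned>;
  FragMap::Allocator Alloc;
  FragMap Frags(Alloc);
  Frags.insert(0, 31, /*Base=*/7);  // bits [0,31] located at base 7
  Frags.insert(32, 63, /*Base=*/7); // adjacent + equal value => merged
  auto It = Frags.find(0);
  assert(It.start() == 0 && It.stop() == 63); // one coalesced interval
  // start()/stop() differing from what was just inserted is exactly how
  // coalesceFragments detects that a merge happened.
}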
@@ -720,6 +782,9 @@ class MemLocFragmentFill { LLVM_DEBUG(dbgs() << "- Insert DEF into now-empty space\n"); FragMap.insert(StartBit, EndBit, Base); } + + coalesceFragments(BB, Before, Var, StartBit, EndBit, Base, VarLoc.DL, + FragMap); } bool skipVariable(const DILocalVariable *V) { return !V->getSizeInBits(); } @@ -737,8 +802,10 @@ class MemLocFragmentFill { public: MemLocFragmentFill(Function &Fn, - const DenseSet<DebugAggregate> *VarsWithStackSlot) - : Fn(Fn), VarsWithStackSlot(VarsWithStackSlot) {} + const DenseSet<DebugAggregate> *VarsWithStackSlot, + bool CoalesceAdjacentFragments) + : Fn(Fn), VarsWithStackSlot(VarsWithStackSlot), + CoalesceAdjacentFragments(CoalesceAdjacentFragments) {} /// Add variable locations to \p FnVarLocs so that any bits of a variable /// with a memory location have that location explicitly reinstated at each @@ -845,18 +912,20 @@ public: } // Insert new location defs. - for (auto Pair : BBInsertBeforeMap) { + for (auto &Pair : BBInsertBeforeMap) { InsertMap &Map = Pair.second; - for (auto Pair : Map) { + for (auto &Pair : Map) { Instruction *InsertBefore = Pair.first; assert(InsertBefore && "should never be null"); auto FragMemLocs = Pair.second; auto &Ctx = Fn.getContext(); - for (auto FragMemLoc : FragMemLocs) { + for (auto &FragMemLoc : FragMemLocs) { DIExpression *Expr = DIExpression::get(Ctx, std::nullopt); - Expr = *DIExpression::createFragmentExpression( - Expr, FragMemLoc.OffsetInBits, FragMemLoc.SizeInBits); + if (FragMemLoc.SizeInBits != + *Aggregates[FragMemLoc.Var].first->getSizeInBits()) + Expr = *DIExpression::createFragmentExpression( + Expr, FragMemLoc.OffsetInBits, FragMemLoc.SizeInBits); Expr = DIExpression::prepend(Expr, DIExpression::DerefAfter, FragMemLoc.OffsetInBits / 8); DebugVariable Var(Aggregates[FragMemLoc.Var].first, Expr, @@ -961,14 +1030,17 @@ public: } }; - using AssignmentMap = DenseMap<VariableID, Assignment>; - using LocMap = DenseMap<VariableID, LocKind>; - using OverlapMap = DenseMap<VariableID, SmallVector<VariableID, 4>>; + using AssignmentMap = SmallVector<Assignment>; + using LocMap = SmallVector<LocKind>; + using OverlapMap = DenseMap<VariableID, SmallVector<VariableID>>; using UntaggedStoreAssignmentMap = DenseMap<const Instruction *, SmallVector<std::pair<VariableID, at::AssignmentInfo>>>; private: + /// The highest numbered VariableID for partially promoted variables plus 1, + /// the values for which start at 1. + unsigned TrackedVariablesVectorSize = 0; /// Map a variable to the set of variables that it fully contains. OverlapMap VarContains; /// Map untagged stores to the variable fragments they assign to. Used by @@ -984,30 +1056,23 @@ private: void emitDbgValue(LocKind Kind, const DbgVariableIntrinsic *Source, Instruction *After); - static bool mapsAreEqual(const AssignmentMap &A, const AssignmentMap &B) { - if (A.size() != B.size()) - return false; - for (const auto &Pair : A) { - VariableID Var = Pair.first; - const Assignment &AV = Pair.second; - auto R = B.find(Var); - // Check if this entry exists in B, otherwise ret false. - if (R == B.end()) - return false; - // Check that the assignment value is the same. - if (!AV.isSameSourceAssignment(R->second)) - return false; - } - return true; + static bool mapsAreEqual(const BitVector &Mask, const AssignmentMap &A, + const AssignmentMap &B) { + return llvm::all_of(Mask.set_bits(), [&](unsigned VarID) { + return A[VarID].isSameSourceAssignment(B[VarID]); + }); } /// Represents the stack and debug assignments in a block. 
Used to describe /// the live-in and live-out values for blocks, as well as the "current" /// value as we process each instruction in a block. struct BlockInfo { - /// Dominating assignment to memory for each variable. + /// The set of variables (VariableID) being tracked in this block. + BitVector VariableIDsInBlock; + /// Dominating assignment to memory for each variable, indexed by + /// VariableID. AssignmentMap StackHomeValue; - /// Dominating assignemnt to each variable. + /// Dominating assignment to each variable, indexed by VariableID. AssignmentMap DebugValue; /// Location kind for each variable. LiveLoc indicates whether the /// dominating assignment in StackHomeValue (LocKind::Mem), DebugValue @@ -1018,20 +1083,138 @@ private: /// merge of multiple assignments (both are Status::NoneOrPhi). In other /// words, the memory location may well be valid while both DebugValue and /// StackHomeValue contain Assignments that have a Status of NoneOrPhi. + /// Indexed by VariableID. LocMap LiveLoc; + public: + enum AssignmentKind { Stack, Debug }; + const AssignmentMap &getAssignmentMap(AssignmentKind Kind) const { + switch (Kind) { + case Stack: + return StackHomeValue; + case Debug: + return DebugValue; + } + llvm_unreachable("Unknown AssignmentKind"); + } + AssignmentMap &getAssignmentMap(AssignmentKind Kind) { + return const_cast<AssignmentMap &>( + const_cast<const BlockInfo *>(this)->getAssignmentMap(Kind)); + } + + bool isVariableTracked(VariableID Var) const { + return VariableIDsInBlock[static_cast<unsigned>(Var)]; + } + + const Assignment &getAssignment(AssignmentKind Kind, VariableID Var) const { + assert(isVariableTracked(Var) && "Var not tracked in block"); + return getAssignmentMap(Kind)[static_cast<unsigned>(Var)]; + } + + LocKind getLocKind(VariableID Var) const { + assert(isVariableTracked(Var) && "Var not tracked in block"); + return LiveLoc[static_cast<unsigned>(Var)]; + } + + /// Set LocKind for \p Var only: does not set LocKind for VariableIDs of + /// fragments contained within \p Var. + void setLocKind(VariableID Var, LocKind K) { + VariableIDsInBlock.set(static_cast<unsigned>(Var)); + LiveLoc[static_cast<unsigned>(Var)] = K; + } + + /// Set the assignment in the \p Kind assignment map for \p Var only: does + /// not set the assignment for VariableIDs of fragments contained within \p + /// Var. + void setAssignment(AssignmentKind Kind, VariableID Var, + const Assignment &AV) { + VariableIDsInBlock.set(static_cast<unsigned>(Var)); + getAssignmentMap(Kind)[static_cast<unsigned>(Var)] = AV; + } + + /// Return true if there is an assignment matching \p AV in the \p Kind + /// assignment map. Does not consider assignments for VariableIDs of + /// fragments contained within \p Var. + bool hasAssignment(AssignmentKind Kind, VariableID Var, + const Assignment &AV) const { + if (!isVariableTracked(Var)) + return false; + return AV.isSameSourceAssignment(getAssignment(Kind, Var)); + } + /// Compare every element in each map to determine structural equality /// (slow).
bool operator==(const BlockInfo &Other) const { - return LiveLoc == Other.LiveLoc && - mapsAreEqual(StackHomeValue, Other.StackHomeValue) && - mapsAreEqual(DebugValue, Other.DebugValue); + return VariableIDsInBlock == Other.VariableIDsInBlock && + LiveLoc == Other.LiveLoc && + mapsAreEqual(VariableIDsInBlock, StackHomeValue, + Other.StackHomeValue) && + mapsAreEqual(VariableIDsInBlock, DebugValue, Other.DebugValue); } bool operator!=(const BlockInfo &Other) const { return !(*this == Other); } bool isValid() { return LiveLoc.size() == DebugValue.size() && LiveLoc.size() == StackHomeValue.size(); } + + /// Clear everything and initialise with ⊤-values for all variables. + void init(int NumVars) { + StackHomeValue.clear(); + DebugValue.clear(); + LiveLoc.clear(); + VariableIDsInBlock = BitVector(NumVars); + StackHomeValue.insert(StackHomeValue.begin(), NumVars, + Assignment::makeNoneOrPhi()); + DebugValue.insert(DebugValue.begin(), NumVars, + Assignment::makeNoneOrPhi()); + LiveLoc.insert(LiveLoc.begin(), NumVars, LocKind::None); + } + + /// Helper for join. + template <typename ElmtType, typename FnInputType> + static void joinElmt(int Index, SmallVector<ElmtType> &Target, + const SmallVector<ElmtType> &A, + const SmallVector<ElmtType> &B, + ElmtType (*Fn)(FnInputType, FnInputType)) { + Target[Index] = Fn(A[Index], B[Index]); + } + + /// See comment for AssignmentTrackingLowering::joinBlockInfo. + static BlockInfo join(const BlockInfo &A, const BlockInfo &B, int NumVars) { + // Join A and B. + // + // Intersect = join(a, b) for a in A, b in B where Var(a) == Var(b) + // Difference = join(x, ⊤) for x where Var(x) is in A xor B + // Join = Intersect ∪ Difference + // + // This is achieved by performing a join on elements from A and B with + // variables common to both A and B (join elements indexed by var + // intersect), then adding ⊤-value elements for vars in A xor B. The + // latter part is equivalent to performing join on elements with variables + // in A xor B with the ⊤-value for the map element since join(x, ⊤) = ⊤. + // BlockInfo::init initializes all variable entries to the ⊤ value so we + // don't need to explicitly perform that step as Join.VariableIDsInBlock + // is set to the union of the variables in A and B at the end of this + // function. + BlockInfo Join; + Join.init(NumVars); + + BitVector Intersect = A.VariableIDsInBlock; + Intersect &= B.VariableIDsInBlock; + + for (auto VarID : Intersect.set_bits()) { + joinElmt(VarID, Join.LiveLoc, A.LiveLoc, B.LiveLoc, joinKind); + joinElmt(VarID, Join.DebugValue, A.DebugValue, B.DebugValue, + joinAssignment); + joinElmt(VarID, Join.StackHomeValue, A.StackHomeValue, B.StackHomeValue, + joinAssignment); + } + + Join.VariableIDsInBlock = A.VariableIDsInBlock; + Join.VariableIDsInBlock |= B.VariableIDsInBlock; + assert(Join.isValid()); + return Join; + } }; Function &Fn; @@ -1076,11 +1259,8 @@ private: /// (⊤) in this case (unknown location / assignment). ///@{ static LocKind joinKind(LocKind A, LocKind B); - static LocMap joinLocMap(const LocMap &A, const LocMap &B); static Assignment joinAssignment(const Assignment &A, const Assignment &B); - static AssignmentMap joinAssignmentMap(const AssignmentMap &A, - const AssignmentMap &B); - static BlockInfo joinBlockInfo(const BlockInfo &A, const BlockInfo &B); + BlockInfo joinBlockInfo(const BlockInfo &A, const BlockInfo &B); ///@} /// Process the instructions in \p BB updating \p LiveSet along the way. \p @@ -1092,7 +1272,7 @@ private: /// location information). 
///@{ void processNonDbgInstruction(Instruction &I, BlockInfo *LiveSet); - void processDbgInstruction(Instruction &I, BlockInfo *LiveSet); + void processDbgInstruction(DbgInfoIntrinsic &I, BlockInfo *LiveSet); /// Update \p LiveSet after encountering an instruction with a DIAssignID /// attachment, \p I. void processTaggedInstruction(Instruction &I, BlockInfo *LiveSet); @@ -1113,8 +1293,15 @@ private: /// have been called for \p Var first. LocKind getLocKind(BlockInfo *LiveSet, VariableID Var); /// Return true if \p Var has an assignment matching \p AV in the \p Kind /// assignment map of \p LiveSet. - bool hasVarWithAssignment(VariableID Var, const Assignment &AV, - const AssignmentMap &M); + bool hasVarWithAssignment(BlockInfo *LiveSet, BlockInfo::AssignmentKind Kind, + VariableID Var, const Assignment &AV); + /// Return the set of VariableIDs corresponding to the fragments contained + /// fully within the variable/fragment \p Var. + ArrayRef<VariableID> getContainedFragments(VariableID Var) const; + + /// Mark \p Var as having been touched this frame. Note that this applies + /// only to the exact fragment \p Var and not to any fragments contained + /// within. + void touchFragment(VariableID Var); /// Emit info for variables that are fully promoted. bool emitPromotedVarLocs(FunctionVarLocsBuilder *FnVarLocs); @@ -1129,66 +1316,60 @@ public: }; } // namespace +ArrayRef<VariableID> +AssignmentTrackingLowering::getContainedFragments(VariableID Var) const { + auto R = VarContains.find(Var); + if (R == VarContains.end()) + return std::nullopt; + return R->second; +} + +void AssignmentTrackingLowering::touchFragment(VariableID Var) { + VarsTouchedThisFrame.insert(Var); +} + void AssignmentTrackingLowering::setLocKind(BlockInfo *LiveSet, VariableID Var, LocKind K) { auto SetKind = [this](BlockInfo *LiveSet, VariableID Var, LocKind K) { - VarsTouchedThisFrame.insert(Var); - LiveSet->LiveLoc[Var] = K; + LiveSet->setLocKind(Var, K); + touchFragment(Var); }; SetKind(LiveSet, Var, K); // Update the LocKind for all fragments contained within Var. - for (VariableID Frag : VarContains[Var]) + for (VariableID Frag : getContainedFragments(Var)) SetKind(LiveSet, Frag, K); } AssignmentTrackingLowering::LocKind AssignmentTrackingLowering::getLocKind(BlockInfo *LiveSet, VariableID Var) { - auto Pair = LiveSet->LiveLoc.find(Var); - assert(Pair != LiveSet->LiveLoc.end()); - return Pair->second; + return LiveSet->getLocKind(Var); } void AssignmentTrackingLowering::addMemDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV) { - auto AddDef = [](BlockInfo *LiveSet, VariableID Var, Assignment AV) { - LiveSet->StackHomeValue[Var] = AV; - // Add default (Var -> ⊤) to DebugValue if Var isn't in DebugValue yet. - LiveSet->DebugValue.insert({Var, Assignment::makeNoneOrPhi()}); - // Add default (Var -> ⊤) to LiveLocs if Var isn't in LiveLocs yet. Callers - // of addMemDef will call setLocKind to override. - LiveSet->LiveLoc.insert({Var, LocKind::None}); - }; - AddDef(LiveSet, Var, AV); + LiveSet->setAssignment(BlockInfo::Stack, Var, AV); // Use this assignment for all fragments contained within Var, but do not // provide a Source because we cannot convert Var's value to a value for the // fragment.
Assignment FragAV = AV; FragAV.Source = nullptr; - for (VariableID Frag : VarContains[Var]) - AddDef(LiveSet, Frag, FragAV); + for (VariableID Frag : getContainedFragments(Var)) + LiveSet->setAssignment(BlockInfo::Stack, Frag, FragAV); } void AssignmentTrackingLowering::addDbgDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV) { - auto AddDef = [](BlockInfo *LiveSet, VariableID Var, Assignment AV) { - LiveSet->DebugValue[Var] = AV; - // Add default (Var -> ⊤) to StackHome if Var isn't in StackHome yet. - LiveSet->StackHomeValue.insert({Var, Assignment::makeNoneOrPhi()}); - // Add default (Var -> ⊤) to LiveLocs if Var isn't in LiveLocs yet. Callers - // of addDbgDef will call setLocKind to override. - LiveSet->LiveLoc.insert({Var, LocKind::None}); - }; - AddDef(LiveSet, Var, AV); + LiveSet->setAssignment(BlockInfo::Debug, Var, AV); // Use this assignment for all fragments contained within Var, but do not // provide a Source because we cannot convert Var's value to a value for the // fragment. Assignment FragAV = AV; FragAV.Source = nullptr; - for (VariableID Frag : VarContains[Var]) - AddDef(LiveSet, Frag, FragAV); + for (VariableID Frag : getContainedFragments(Var)) + LiveSet->setAssignment(BlockInfo::Debug, Frag, FragAV); } static DIAssignID *getIDFromInst(const Instruction &I) { @@ -1200,24 +1381,16 @@ static DIAssignID *getIDFromMarker(const DbgAssignIntrinsic &DAI) { } /// Return true if \p Var has an assignment matching \p AV in the \p Kind /// assignment map of \p LiveSet. -bool AssignmentTrackingLowering::hasVarWithAssignment(VariableID Var, - const Assignment &AV, - const AssignmentMap &M) { - auto AssignmentIsMapped = [](VariableID Var, const Assignment &AV, - const AssignmentMap &M) { - auto R = M.find(Var); - if (R == M.end()) - return false; - return AV.isSameSourceAssignment(R->second); - }; - - if (!AssignmentIsMapped(Var, AV, M)) +bool AssignmentTrackingLowering::hasVarWithAssignment( + BlockInfo *LiveSet, BlockInfo::AssignmentKind Kind, VariableID Var, + const Assignment &AV) { + if (!LiveSet->hasAssignment(Kind, Var, AV)) return false; // Check all the frags contained within Var as these will have all been // mapped to AV at the last store to Var. - for (VariableID Frag : VarContains[Var]) - if (!AssignmentIsMapped(Frag, AV, M)) + for (VariableID Frag : getContainedFragments(Var)) + if (!LiveSet->hasAssignment(Kind, Frag, AV)) return false; return true; } @@ -1242,10 +1415,11 @@ void AssignmentTrackingLowering::emitDbgValue( const DbgVariableIntrinsic *Source, Instruction *After) { DILocation *DL = Source->getDebugLoc(); - auto Emit = [this, Source, After, DL](Value *Val, DIExpression *Expr) { + auto Emit = [this, Source, After, DL](Metadata *Val, DIExpression *Expr) { assert(Expr); if (!Val) - Val = PoisonValue::get(Type::getInt1Ty(Source->getContext())); + Val = ValueAsMetadata::get( + PoisonValue::get(Type::getInt1Ty(Source->getContext()))); // Find a suitable insert point. Instruction *InsertBefore = After->getNextNode(); @@ -1255,7 +1429,7 @@ void AssignmentTrackingLowering::emitDbgValue( VarLocInfo VarLoc; VarLoc.VariableID = static_cast<VariableID>(Var); VarLoc.Expr = Expr; - VarLoc.V = Val; + VarLoc.Values = RawLocationWrapper(Val); VarLoc.DL = DL; // Insert it into the map for later. InsertBeforeMap[InsertBefore].push_back(VarLoc); @@ -1284,16 +1458,13 @@ void AssignmentTrackingLowering::emitDbgValue( // The address-expression has an implicit deref, add it now.
std::tie(Val, Expr) = walkToAllocaAndPrependOffsetDeref(Layout, Val, Expr); - Emit(Val, Expr); + Emit(ValueAsMetadata::get(Val), Expr); return; } } if (Kind == LocKind::Val) { - /// Get the value component, converting to Undef if it is variadic. - Value *Val = - Source->hasArgList() ? nullptr : Source->getVariableLocationOp(0); - Emit(Val, Source->getExpression()); + Emit(Source->getRawLocation(), Source->getExpression()); return; } @@ -1371,7 +1542,8 @@ void AssignmentTrackingLowering::processUntaggedInstruction( VarLocInfo VarLoc; VarLoc.VariableID = static_cast<VariableID>(Var); VarLoc.Expr = DIE; - VarLoc.V = const_cast<AllocaInst *>(Info.Base); + VarLoc.Values = RawLocationWrapper( + ValueAsMetadata::get(const_cast<AllocaInst *>(Info.Base))); VarLoc.DL = DILoc; // 3. Insert it into the map for later. InsertBeforeMap[InsertBefore].push_back(VarLoc); @@ -1405,13 +1577,14 @@ void AssignmentTrackingLowering::processTaggedInstruction( // The last assignment to the stack is now AV. Check if the last debug // assignment has a matching Assignment. - if (hasVarWithAssignment(Var, AV, LiveSet->DebugValue)) { + if (hasVarWithAssignment(LiveSet, BlockInfo::Debug, Var, AV)) { // The StackHomeValue and DebugValue for this variable match so we can // emit a stack home location here. LLVM_DEBUG(dbgs() << "Mem, Stack matches Debug program\n";); LLVM_DEBUG(dbgs() << " Stack val: "; AV.dump(dbgs()); dbgs() << "\n"); LLVM_DEBUG(dbgs() << " Debug val: "; - LiveSet->DebugValue[Var].dump(dbgs()); dbgs() << "\n"); + LiveSet->DebugValue[static_cast<unsigned>(Var)].dump(dbgs()); + dbgs() << "\n"); setLocKind(LiveSet, Var, LocKind::Mem); emitDbgValue(LocKind::Mem, DAI, &I); continue; @@ -1434,7 +1607,8 @@ void AssignmentTrackingLowering::processTaggedInstruction( // There's been an assignment to memory that we were using as a // location for this variable, and the Assignment doesn't match what // we'd expect to see in memory. - if (LiveSet->DebugValue[Var].Status == Assignment::NoneOrPhi) { + Assignment DbgAV = LiveSet->getAssignment(BlockInfo::Debug, Var); + if (DbgAV.Status == Assignment::NoneOrPhi) { // We need to terminate any previously open location now. LLVM_DEBUG(dbgs() << "None, No Debug value available\n";); setLocKind(LiveSet, Var, LocKind::None); @@ -1443,9 +1617,8 @@ void AssignmentTrackingLowering::processTaggedInstruction( // The previous DebugValue Value can be used here. LLVM_DEBUG(dbgs() << "Val, Debug value is Known\n";); setLocKind(LiveSet, Var, LocKind::Val); - Assignment PrevAV = LiveSet->DebugValue.lookup(Var); - if (PrevAV.Source) { - emitDbgValue(LocKind::Val, PrevAV.Source, &I); + if (DbgAV.Source) { + emitDbgValue(LocKind::Val, DbgAV.Source, &I); } else { // DbgAV.Source is nullptr so we must emit undef here. emitDbgValue(LocKind::None, DAI, &I); } @@ -1479,7 +1652,7 @@ void AssignmentTrackingLowering::processDbgAssign(DbgAssignIntrinsic &DAI, // Check if the DebugValue and StackHomeValue both hold the same // Assignment. - if (hasVarWithAssignment(Var, AV, LiveSet->StackHomeValue)) { + if (hasVarWithAssignment(LiveSet, BlockInfo::Stack, Var, AV)) { // They match. We can use the stack home because the debug intrinsics state // that an assignment happened here, and we know that specific assignment // was the last one to take place in memory for this variable.
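The Mem/Val/None choices made in processTaggedInstruction and processDbgAssign above reduce to one rule per variable: prefer the stack home when it provably holds the latest assignment, otherwise fall back to the last known debug value, otherwise terminate the location. A minimal standalone C++ sketch of that rule, using hypothetical simplified types rather than the pass's real interface:

  enum class LocKind { Mem, Val, None };

  // Hypothetical distillation of the rule above: StackMatchesDebug means the
  // last dbg.assign and the last tagged store agree; DebugValueKnown means
  // the dominating debug value is not NoneOrPhi.
  LocKind chooseLocKind(bool StackMatchesDebug, bool DebugValueKnown) {
    if (StackMatchesDebug)
      return LocKind::Mem;  // Memory is up to date: emit a memory location.
    if (DebugValueKnown)
      return LocKind::Val;  // Fall back to the last known debug value.
    return LocKind::None;   // Nothing trustworthy: terminate the location.
  }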
@@ -1529,9 +1702,22 @@ void AssignmentTrackingLowering::processDbgValue(DbgValueInst &DVI, emitDbgValue(LocKind::Val, &DVI, &DVI); } +static bool hasZeroSizedFragment(DbgVariableIntrinsic &DVI) { + if (auto F = DVI.getExpression()->getFragmentInfo()) + return F->SizeInBits == 0; + return false; +} + void AssignmentTrackingLowering::processDbgInstruction( - Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) { - assert(!isa<DbgAddrIntrinsic>(&I) && "unexpected dbg.addr"); + DbgInfoIntrinsic &I, AssignmentTrackingLowering::BlockInfo *LiveSet) { + auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I); + if (!DVI) + return; + + // Ignore assignments to zero bits of the variable. + if (hasZeroSizedFragment(*DVI)) + return; + if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) processDbgAssign(*DAI, LiveSet); else if (auto *DVI = dyn_cast<DbgValueInst>(&I)) @@ -1561,10 +1747,11 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) { ++II; } while (II != EI) { - if (!isa<DbgInfoIntrinsic>(&*II)) + auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II); + if (!Dbg) break; resetInsertionPoint(*II); - processDbgInstruction(*II, LiveSet); + processDbgInstruction(*Dbg, LiveSet); assert(LiveSet->isValid()); ++II; } @@ -1597,54 +1784,6 @@ AssignmentTrackingLowering::joinKind(LocKind A, LocKind B) { return A == B ? A : LocKind::None; } -AssignmentTrackingLowering::LocMap -AssignmentTrackingLowering::joinLocMap(const LocMap &A, const LocMap &B) { - // Join A and B. - // - // U = join(a, b) for a in A, b in B where Var(a) == Var(b) - // D = join(x, ⊤) for x where Var(x) is in A xor B - // Join = U ∪ D - // - // This is achieved by performing a join on elements from A and B with - // variables common to both A and B (join elements indexed by var intersect), - // then adding LocKind::None elements for vars in A xor B. The latter part is - // equivalent to performing join on elements with variables in A xor B with - // LocKind::None (⊤) since join(x, ⊤) = ⊤. - LocMap Join; - SmallVector<VariableID, 16> SymmetricDifference; - // Insert the join of the elements with common vars into Join. Add the - // remaining elements to into SymmetricDifference. - for (const auto &[Var, Loc] : A) { - // If this Var doesn't exist in B then add it to the symmetric difference - // set. - auto R = B.find(Var); - if (R == B.end()) { - SymmetricDifference.push_back(Var); - continue; - } - // There is an entry for Var in both, join it. - Join[Var] = joinKind(Loc, R->second); - } - unsigned IntersectSize = Join.size(); - (void)IntersectSize; - - // Add the elements in B with variables that are not in A into - // SymmetricDifference. - for (const auto &Pair : B) { - VariableID Var = Pair.first; - if (A.count(Var) == 0) - SymmetricDifference.push_back(Var); - } - - // Add SymmetricDifference elements to Join and return the result. - for (const auto &Var : SymmetricDifference) - Join.insert({Var, LocKind::None}); - - assert(Join.size() == (IntersectSize + SymmetricDifference.size())); - assert(Join.size() >= A.size() && Join.size() >= B.size()); - return Join; -} - AssignmentTrackingLowering::Assignment AssignmentTrackingLowering::joinAssignment(const Assignment &A, const Assignment &B) { @@ -1687,107 +1826,80 @@ AssignmentTrackingLowering::joinAssignment(const Assignment &A, return Assignment::make(A.ID, Source); } -AssignmentTrackingLowering::AssignmentMap -AssignmentTrackingLowering::joinAssignmentMap(const AssignmentMap &A, - const AssignmentMap &B) { - // Join A and B. 
- // - // U = join(a, b) for a in A, b in B where Var(a) == Var(b) - // D = join(x, ⊤) for x where Var(x) is in A xor B - // Join = U ∪ D - // - // This is achieved by performing a join on elements from A and B with - // variables common to both A and B (join elements indexed by var intersect), - // then adding LocKind::None elements for vars in A xor B. The latter part is - // equivalent to performing join on elements with variables in A xor B with - // Status::NoneOrPhi (⊤) since join(x, ⊤) = ⊤. - AssignmentMap Join; - SmallVector<VariableID, 16> SymmetricDifference; - // Insert the join of the elements with common vars into Join. Add the - // remaining elements to into SymmetricDifference. - for (const auto &[Var, AV] : A) { - // If this Var doesn't exist in B then add it to the symmetric difference - // set. - auto R = B.find(Var); - if (R == B.end()) { - SymmetricDifference.push_back(Var); - continue; - } - // There is an entry for Var in both, join it. - Join[Var] = joinAssignment(AV, R->second); - } - unsigned IntersectSize = Join.size(); - (void)IntersectSize; - - // Add the elements in B with variables that are not in A into - // SymmetricDifference. - for (const auto &Pair : B) { - VariableID Var = Pair.first; - if (A.count(Var) == 0) - SymmetricDifference.push_back(Var); - } - - // Add SymmetricDifference elements to Join and return the result. - for (auto Var : SymmetricDifference) - Join.insert({Var, Assignment::makeNoneOrPhi()}); - - assert(Join.size() == (IntersectSize + SymmetricDifference.size())); - assert(Join.size() >= A.size() && Join.size() >= B.size()); - return Join; -} - AssignmentTrackingLowering::BlockInfo AssignmentTrackingLowering::joinBlockInfo(const BlockInfo &A, const BlockInfo &B) { - BlockInfo Join; - Join.LiveLoc = joinLocMap(A.LiveLoc, B.LiveLoc); - Join.StackHomeValue = joinAssignmentMap(A.StackHomeValue, B.StackHomeValue); - Join.DebugValue = joinAssignmentMap(A.DebugValue, B.DebugValue); - assert(Join.isValid()); - return Join; + return BlockInfo::join(A, B, TrackedVariablesVectorSize); } bool AssignmentTrackingLowering::join( const BasicBlock &BB, const SmallPtrSet<BasicBlock *, 16> &Visited) { - BlockInfo BBLiveIn; - bool FirstJoin = true; - // LiveIn locs for BB is the join of the already-processed preds' LiveOut - // locs. + + SmallVector<const BasicBlock *> VisitedPreds; + // Ignore backedges if we have not visited the predecessor yet. As the + // predecessor hasn't yet had locations propagated into it, most locations + // will not yet be valid, so treat them as all being uninitialized and + // potentially valid. If a location guessed to be correct here is + // invalidated later, we will remove it when we revisit this block. This + // is essentially the same as initialising all LocKinds and Assignments to + // an implicit ⊥ value which is the identity value for the join operation. for (auto I = pred_begin(&BB), E = pred_end(&BB); I != E; I++) { - // Ignore backedges if we have not visited the predecessor yet. As the - // predecessor hasn't yet had locations propagated into it, most locations - // will not yet be valid, so treat them as all being uninitialized and - // potentially valid. If a location guessed to be correct here is - // invalidated later, we will remove it when we revisit this block. This - // is essentially the same as initialising all LocKinds and Assignments to - // an implicit ⊥ value which is the identity value for the join operation. 
const BasicBlock *Pred = *I; - if (!Visited.count(Pred)) - continue; + if (Visited.count(Pred)) + VisitedPreds.push_back(Pred); + } + + // No preds visited yet. + if (VisitedPreds.empty()) { + auto It = LiveIn.try_emplace(&BB, BlockInfo()); + bool DidInsert = It.second; + if (DidInsert) + It.first->second.init(TrackedVariablesVectorSize); + return /*Changed*/ DidInsert; + } - auto PredLiveOut = LiveOut.find(Pred); - // Pred must have been processed already. See comment at start of this loop. - assert(PredLiveOut != LiveOut.end()); + // Exactly one visited pred. Copy the LiveOut from that pred into BB LiveIn. + if (VisitedPreds.size() == 1) { + const BlockInfo &PredLiveOut = LiveOut.find(VisitedPreds[0])->second; + auto CurrentLiveInEntry = LiveIn.find(&BB); - // Perform the join of BBLiveIn (current live-in info) and PrevLiveOut. - if (FirstJoin) - BBLiveIn = PredLiveOut->second; + // Check if there isn't an entry, or there is but the LiveIn set has + // changed (expensive check). + if (CurrentLiveInEntry == LiveIn.end()) + LiveIn.insert(std::make_pair(&BB, PredLiveOut)); + else if (PredLiveOut != CurrentLiveInEntry->second) + CurrentLiveInEntry->second = PredLiveOut; else - BBLiveIn = joinBlockInfo(std::move(BBLiveIn), PredLiveOut->second); - FirstJoin = false; + return /*Changed*/ false; + return /*Changed*/ true; + } + + // More than one visited pred. Join the LiveOuts of the first two. + assert(VisitedPreds.size() > 1); + const BlockInfo &PredLiveOut0 = LiveOut.find(VisitedPreds[0])->second; + const BlockInfo &PredLiveOut1 = LiveOut.find(VisitedPreds[1])->second; + BlockInfo BBLiveIn = joinBlockInfo(PredLiveOut0, PredLiveOut1); + + // Join the LiveOuts of subsequent blocks. + ArrayRef Tail = ArrayRef(VisitedPreds).drop_front(2); + for (const BasicBlock *Pred : Tail) { + const auto &PredLiveOut = LiveOut.find(Pred); + assert(PredLiveOut != LiveOut.end() && + "block should have been processed already"); + BBLiveIn = joinBlockInfo(std::move(BBLiveIn), PredLiveOut->second); } + // Save the joined result for BB. auto CurrentLiveInEntry = LiveIn.find(&BB); // Check if there isn't an entry, or there is but the LiveIn set has changed // (expensive check). - if (CurrentLiveInEntry == LiveIn.end() || - BBLiveIn != CurrentLiveInEntry->second) { - LiveIn[&BB] = std::move(BBLiveIn); - // A change has occured. - return true; - } - // No change. - return false; + if (CurrentLiveInEntry == LiveIn.end()) + LiveIn.try_emplace(&BB, std::move(BBLiveIn)); + else if (BBLiveIn != CurrentLiveInEntry->second) + CurrentLiveInEntry->second = std::move(BBLiveIn); + else + return /*Changed*/ false; + return /*Changed*/ true; } /// Return true if A fully contains B. @@ -1823,7 +1935,13 @@ getUntaggedStoreAssignmentInfo(const Instruction &I, const DataLayout &Layout) { /// y does not contain all overlaps because partial overlaps are excluded. /// /// While we're iterating over the function, add single location defs for -/// dbg.declares to \p FnVarLocs +/// dbg.declares to \p FnVarLocs. +/// +/// Variables that are interesting to this pass are added to +/// FnVarLocs->Variables first. TrackedVariablesVectorSize is set to the ID of +/// the last interesting variable plus 1, meaning variables with ID 1 +/// (inclusive) to TrackedVariablesVectorSize (exclusive) are interesting. The +/// subsequent variables are either stack homed or fully promoted. /// /// Finally, populate UntaggedStoreVars with a mapping of untagged stores to /// the stored-to variable fragments.
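The overlap map recorded below rests on a plain interval check over fragment bit ranges: one fragment fully contains another exactly when its [start, end) bit range covers the other's. A self-contained sketch of that check, using a hypothetical stand-in for DIExpression::FragmentInfo:

  #include <cstdint>

  // Hypothetical stand-in for DIExpression::FragmentInfo: a bit range within
  // the parent variable, given as a size and an offset in bits.
  struct FragmentInfo {
    uint64_t SizeInBits;
    uint64_t OffsetInBits;
    uint64_t startInBits() const { return OffsetInBits; }
    uint64_t endInBits() const { return OffsetInBits + SizeInBits; }
  };

  // True if fragment A covers every bit of fragment B ("A fully contains B").
  bool fullyContains(const FragmentInfo &A, const FragmentInfo &B) {
    return A.startInBits() <= B.startInBits() &&
           A.endInBits() >= B.endInBits();
  }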
@@ -1832,7 +1950,9 @@ getUntaggedStoreAssignmentInfo(const Instruction &I, const DataLayout &Layout) { /// to iterate over the function as they can be achieved together in one pass. static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares( Function &Fn, FunctionVarLocsBuilder *FnVarLocs, - AssignmentTrackingLowering::UntaggedStoreAssignmentMap &UntaggedStoreVars) { + const DenseSet<DebugAggregate> &VarsWithStackSlot, + AssignmentTrackingLowering::UntaggedStoreAssignmentMap &UntaggedStoreVars, + unsigned &TrackedVariablesVectorSize) { DenseSet<DebugVariable> Seen; // Map of Variable: [Fragments]. DenseMap<DebugAggregate, SmallVector<DebugVariable, 8>> FragmentMap; @@ -1843,14 +1963,16 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares( // UntaggedStoreVars. // We need to add fragments for untagged stores too so that we can correctly // clobber overlapped fragment locations later. + SmallVector<DbgDeclareInst *> Declares; for (auto &BB : Fn) { for (auto &I : BB) { if (auto *DDI = dyn_cast<DbgDeclareInst>(&I)) { - FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(), - DDI->getDebugLoc(), DDI->getAddress()); + Declares.push_back(DDI); } else if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) { DebugVariable DV = DebugVariable(DII); DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()}; + if (!VarsWithStackSlot.contains(DA)) + continue; if (Seen.insert(DV).second) FragmentMap[DA].push_back(DV); } else if (auto Info = getUntaggedStoreAssignmentInfo( @@ -1875,6 +1997,8 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares( DebugVariable DV = DebugVariable(DAI->getVariable(), FragInfo, DAI->getDebugLoc().getInlinedAt()); DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()}; + if (!VarsWithStackSlot.contains(DA)) + continue; // Cache this info for later. UntaggedStoreVars[&I].push_back( @@ -1887,21 +2011,22 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares( } } - // Sort the fragment map for each DebugAggregate in non-descending - // order of fragment size. Assert no entries are duplicates. + // Sort the fragment map for each DebugAggregate in ascending + // order of fragment size - there should be no duplicates. for (auto &Pair : FragmentMap) { SmallVector<DebugVariable, 8> &Frags = Pair.second; - std::sort( - Frags.begin(), Frags.end(), [](DebugVariable Next, DebugVariable Elmt) { - assert(!(Elmt.getFragmentOrDefault() == Next.getFragmentOrDefault())); - return Elmt.getFragmentOrDefault().SizeInBits > - Next.getFragmentOrDefault().SizeInBits; - }); + std::sort(Frags.begin(), Frags.end(), + [](const DebugVariable &Next, const DebugVariable &Elmt) { + return Elmt.getFragmentOrDefault().SizeInBits > + Next.getFragmentOrDefault().SizeInBits; + }); + // Check for duplicates. + assert(std::adjacent_find(Frags.begin(), Frags.end()) == Frags.end()); } // Build the map. AssignmentTrackingLowering::OverlapMap Map; - for (auto Pair : FragmentMap) { + for (auto &Pair : FragmentMap) { auto &Frags = Pair.second; for (auto It = Frags.begin(), IEnd = Frags.end(); It != IEnd; ++It) { DIExpression::FragmentInfo Frag = It->getFragmentOrDefault(); @@ -1922,6 +2047,15 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares( } } + // VariableIDs are 1-based so the variable-tracking bitvector needs + // NumVariables plus 1 bits. 
+ TrackedVariablesVectorSize = FnVarLocs->getNumVariables() + 1; + + // Finally, insert the declares afterwards, so the first IDs are all + // partially stack homed vars. + for (auto *DDI : Declares) + FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(), + DDI->getDebugLoc(), DDI->getWrappedLocation()); return Map; } @@ -1942,8 +2076,9 @@ bool AssignmentTrackingLowering::run(FunctionVarLocsBuilder *FnVarLocsBuilder) { // Note that this pass doesn't handle partial overlaps correctly (FWIW // neither does LiveDebugVariables) because that is difficult to do and // appears to be a rare occurrence. - VarContains = - buildOverlapMapAndRecordDeclares(Fn, FnVarLocs, UntaggedStoreVars); + VarContains = buildOverlapMapAndRecordDeclares( + Fn, FnVarLocs, *VarsWithStackSlot, UntaggedStoreVars, + TrackedVariablesVectorSize); // Prepare for traversal. ReversePostOrderTraversal<Function *> RPOT(&Fn); @@ -2059,14 +2194,14 @@ bool AssignmentTrackingLowering::run(FunctionVarLocsBuilder *FnVarLocsBuilder) { // // Unless we've already done so, create the single location def now. if (AlwaysStackHomed.insert(Aggr).second) { - assert(isa<AllocaInst>(VarLoc.V)); + assert(!VarLoc.Values.hasArgList()); // TODO: When more complex cases are handled VarLoc.Expr should be // built appropriately rather than always using an empty DIExpression. // The assert below is a reminder. assert(Simple); VarLoc.Expr = DIExpression::get(Fn.getContext(), std::nullopt); DebugVariable Var = FnVarLocs->getVariable(VarLoc.VariableID); - FnVarLocs->addSingleLocVar(Var, VarLoc.Expr, VarLoc.DL, VarLoc.V); + FnVarLocs->addSingleLocVar(Var, VarLoc.Expr, VarLoc.DL, VarLoc.Values); InsertedAnyIntrinsics = true; } } @@ -2109,20 +2244,11 @@ bool AssignmentTrackingLowering::emitPromotedVarLocs( // already. if (VarsWithStackSlot->contains(getAggregate(DVI))) continue; - // Wrapper to get a single value (or undef) from DVI. - auto GetValue = [DVI]() -> Value * { - // We can't handle variadic DIExpressions yet so treat those as - // kill locations. - if (DVI->isKillLocation() || DVI->getValue() == nullptr || - DVI->hasArgList()) - return PoisonValue::get(Type::getInt32Ty(DVI->getContext())); - return DVI->getValue(); - }; Instruction *InsertBefore = I.getNextNode(); assert(InsertBefore && "Unexpected: debug intrinsics after a terminator"); FnVarLocs->addVarLoc(InsertBefore, DebugVariable(DVI), DVI->getExpression(), DVI->getDebugLoc(), - GetValue()); + DVI->getWrappedLocation()); InsertedAnyIntrinsics = true; } } @@ -2140,15 +2266,14 @@ static bool removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB, FunctionVarLocsBuilder &FnVarLocs) { bool Changed = false; - SmallDenseSet<DebugVariable> VariableSet; - + SmallDenseMap<DebugAggregate, BitVector> VariableDefinedBits; // Scan over the entire block, not just over the instructions mapped by // FnVarLocs, because wedges in FnVarLocs may only be separated by debug // instructions. for (const Instruction &I : reverse(*BB)) { if (!isa<DbgVariableIntrinsic>(I)) { // Sequence of consecutive defs ended. Clear map for the next one. - VariableSet.clear(); + VariableDefinedBits.clear(); } // Get the location defs that start just before this instruction. @@ -2164,21 +2289,44 @@ removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB, // Iterate over the existing defs in reverse.
for (auto RIt = Locs->rbegin(), REnd = Locs->rend(); RIt != REnd; ++RIt) { NumDefsScanned++; - const DebugVariable &Key = FnVarLocs.getVariable(RIt->VariableID); - bool FirstDefOfFragment = VariableSet.insert(Key).second; + DebugAggregate Aggr = + getAggregate(FnVarLocs.getVariable(RIt->VariableID)); + uint64_t SizeInBits = Aggr.first->getSizeInBits().value_or(0); - // If the same variable fragment is described more than once it is enough - // to keep the last one (i.e. the first found in this reverse iteration). - if (FirstDefOfFragment) { - // New def found: keep it. + if (SizeInBits == 0) { + // If the size is unknown (0) then keep this location def to be safe. NewDefsReversed.push_back(*RIt); - } else { - // Redundant def found: throw it away. Since the wedge of defs is being - // rebuilt, doing nothing is the same as deleting an entry. - ChangedThisWedge = true; - NumDefsRemoved++; + continue; } - continue; + + // Only keep this location definition if it is not fully eclipsed by + // other definitions in this wedge that come after it. + + // Insert the bits the location definition defines. + auto InsertResult = + VariableDefinedBits.try_emplace(Aggr, BitVector(SizeInBits)); + bool FirstDefinition = InsertResult.second; + BitVector &DefinedBits = InsertResult.first->second; + + DIExpression::FragmentInfo Fragment = + RIt->Expr->getFragmentInfo().value_or( + DIExpression::FragmentInfo(SizeInBits, 0)); + bool InvalidFragment = Fragment.endInBits() > SizeInBits; + + // If this defines any previously undefined bits, keep it. + if (FirstDefinition || InvalidFragment || + DefinedBits.find_first_unset_in(Fragment.startInBits(), + Fragment.endInBits()) != -1) { + if (!InvalidFragment) + DefinedBits.set(Fragment.startInBits(), Fragment.endInBits()); + NewDefsReversed.push_back(*RIt); + continue; + } + + // Redundant def found: throw it away. Since the wedge of defs is being + // rebuilt, doing nothing is the same as deleting an entry. + ChangedThisWedge = true; + NumDefsRemoved++; } // Un-reverse the defs and replace the wedge with the pruned version. @@ -2204,7 +2352,8 @@ static bool removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB, FunctionVarLocsBuilder &FnVarLocs) { bool Changed = false; - DenseMap<DebugVariable, std::pair<Value *, DIExpression *>> VariableMap; + DenseMap<DebugVariable, std::pair<RawLocationWrapper, DIExpression *>> + VariableMap; // Scan over the entire block, not just over the instructions mapped by // FnVarLocs, because wedges in FnVarLocs may only be separated by debug @@ -2229,9 +2378,9 @@ removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB, // Update the map if we found a new value/expression describing the // variable, or if the variable wasn't mapped already. - if (VMI == VariableMap.end() || VMI->second.first != Loc.V || + if (VMI == VariableMap.end() || VMI->second.first != Loc.Values || VMI->second.second != Loc.Expr) { - VariableMap[Key] = {Loc.V, Loc.Expr}; + VariableMap[Key] = {Loc.Values, Loc.Expr}; NewDefs.push_back(Loc); continue; } @@ -2311,7 +2460,7 @@ removeUndefDbgLocsFromEntryBlock(const BasicBlock *BB, // Remove undef entries that are encountered before any non-undef // intrinsics from the entry block. - if (isa<UndefValue>(Loc.V) && !HasDefinedBits(Aggr, Var)) { + if (Loc.Values.isKillLocation(Loc.Expr) && !HasDefinedBits(Aggr, Var)) { // Did not insert this Loc, which is the same as removing it.
NumDefsRemoved++; ChangedThisWedge = true; @@ -2381,7 +2530,8 @@ static void analyzeFunction(Function &Fn, const DataLayout &Layout, } if (Changed) { - MemLocFragmentFill Pass(Fn, &VarsWithStackSlot); + MemLocFragmentFill Pass(Fn, &VarsWithStackSlot, + shouldCoalesceFragments(Fn)); Pass.run(FnVarLocs); // Remove redundant entries. As well as reducing memory consumption and diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 8f71ec2b490c..80a0bb957cfc 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -1576,6 +1576,11 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg) { ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout()); + Builder.setIsFPConstrained( + AI->getFunction()->hasFnAttribute(Attribute::StrictFP)); + + // FIXME: If FP exceptions are observable, we should force them off for the + // loop for the FP atomics. Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(), AI->getOrdering(), AI->getSyncScopeID(), diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e7e73606de07..6967ca5160c0 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -91,7 +91,7 @@ cl::opt<std::string> llvm::BBSectionsColdTextPrefix( cl::desc("The text prefix to use for cold basic block clusters"), cl::init(".text.split."), cl::Hidden); -cl::opt<bool> BBSectionsDetectSourceDrift( +static cl::opt<bool> BBSectionsDetectSourceDrift( "bbsections-detect-source-drift", cl::desc("This checks if there is a fdo instr. profile hash " "mismatch for this function"), @@ -123,10 +123,16 @@ public: } // end anonymous namespace char BasicBlockSections::ID = 0; -INITIALIZE_PASS(BasicBlockSections, "bbsections-prepare", - "Prepares for basic block sections, by splitting functions " - "into clusters of basic blocks.", - false, false) +INITIALIZE_PASS_BEGIN( + BasicBlockSections, "bbsections-prepare", + "Prepares for basic block sections, by splitting functions " + "into clusters of basic blocks.", + false, false) +INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader) +INITIALIZE_PASS_END(BasicBlockSections, "bbsections-prepare", + "Prepares for basic block sections, by splitting functions " + "into clusters of basic blocks.", + false, false) // This function updates and optimizes the branching instructions of every basic // block in a given function to account for changes in the layout. 
@@ -300,7 +306,7 @@ static bool hasInstrProfHashMismatch(MachineFunction &MF) { if (Existing) { MDTuple *Tuple = cast<MDTuple>(Existing); for (const auto &N : Tuple->operands()) - if (cast<MDString>(N.get())->getString() == MetadataName) + if (N.equalsStr(MetadataName)) return true; } diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 5bc8d82debc3..5dede452ec34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -14,12 +14,17 @@ #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/Support/Error.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include <llvm/ADT/STLExtras.h> using namespace llvm; @@ -35,13 +40,10 @@ bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { std::pair<bool, SmallVector<BBClusterInfo>> BasicBlockSectionsProfileReader::getBBClusterInfoForFunction( StringRef FuncName) const { - std::pair<bool, SmallVector<BBClusterInfo>> cluster_info(false, {}); auto R = ProgramBBClusterInfo.find(getAliasName(FuncName)); - if (R != ProgramBBClusterInfo.end()) { - cluster_info.second = R->second; - cluster_info.first = true; - } - return cluster_info; + return R != ProgramBBClusterInfo.end() + ? std::pair(true, R->second) + : std::pair(false, SmallVector<BBClusterInfo>{}); } // Basic Block Sections can be enabled for a subset of machine basic blocks. @@ -49,17 +51,19 @@ BasicBlockSectionsProfileReader::getBBClusterInfoForFunction( // block sections are desired. Additionally, machine basic block ids of the // functions can also be specified for a finer granularity. Moreover, a cluster // of basic blocks could be assigned to the same section. +// Optionally, a debug-info filename can be specified for each function to allow +// distinguishing internal-linkage functions of the same name. // A file with basic block sections for all of function main and three blocks // for function foo (of which 1 and 2 are placed in a cluster) looks like this: +// (Profile for function foo is only loaded when its debug-info filename +// matches 'path/to/foo_file.cc'). // ---------------------------- // list.txt: // !main -// !foo +// !foo M=path/to/foo_file.cc // !!1 2 // !!4 -static Error getBBClusterInfo(const MemoryBuffer *MBuf, - ProgramBBClusterInfoMapTy &ProgramBBClusterInfo, - StringMap<StringRef> &FuncAliasMap) { +Error BasicBlockSectionsProfileReader::ReadProfile() { assert(MBuf); line_iterator LineIt(*MBuf, /*SkipBlanks=*/true, /*CommentMarker=*/'#'); @@ -90,9 +94,10 @@ static Error getBBClusterInfo(const MemoryBuffer *MBuf, break; // Check for second "!" which indicates a cluster of basic blocks. if (S.consume_front("!")) { + // Skip the profile when the profile iterator (FI) refers to the + // past-the-end element. if (FI == ProgramBBClusterInfo.end()) - return invalidProfileError( - "Cluster list does not follow a function name specifier."); + continue; SmallVector<StringRef, 4> BBIDs; S.split(BBIDs, ' '); // Reset current cluster position.
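Each cluster line in the profile (once its '!!' prefix is consumed) is just a space-separated list of basic-block IDs that receive consecutive positions within the current cluster. A hedged, self-contained sketch of that parsing step, with hypothetical types rather than the reader's actual ones:

  #include <sstream>
  #include <string>
  #include <vector>

  // Hypothetical mirror of the reader's per-block record.
  struct BBClusterInfo {
    unsigned BBID;
    unsigned ClusterID;
    unsigned PositionInCluster;
  };

  // Parse one cluster line such as "1 2": each listed block joins cluster
  // `Cluster` at the next position.
  void parseClusterLine(const std::string &Line, unsigned Cluster,
                        std::vector<BBClusterInfo> &Out) {
    std::istringstream SS(Line);
    unsigned BBID = 0, Position = 0;
    while (SS >> BBID)
      Out.push_back({BBID, Cluster, Position++});
  }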
@@ -112,18 +117,52 @@ static Error getBBClusterInfo(const MemoryBuffer *MBuf, BBClusterInfo{((unsigned)BBID), CurrentCluster, CurrentPosition++}); } CurrentCluster++; - } else { // This is a function name specifier. + } else { + // This is a function name specifier. It may include a debug info filename + // specifier starting with `M=`. + auto [AliasesStr, DIFilenameStr] = S.split(' '); + SmallString<128> DIFilename; + if (DIFilenameStr.startswith("M=")) { + DIFilename = + sys::path::remove_leading_dotslash(DIFilenameStr.substr(2)); + if (DIFilename.empty()) + return invalidProfileError("Empty module name specifier."); + } else if (!DIFilenameStr.empty()) { + return invalidProfileError("Unknown string found: '" + DIFilenameStr + + "'."); + } // Function aliases are separated using '/'. We use the first function // name for the cluster info mapping and delegate all other aliases to // this one. SmallVector<StringRef, 4> Aliases; - S.split(Aliases, '/'); + AliasesStr.split(Aliases, '/'); + bool FunctionFound = any_of(Aliases, [&](StringRef Alias) { + auto It = FunctionNameToDIFilename.find(Alias); + // No match if this function name is not found in this module. + if (It == FunctionNameToDIFilename.end()) + return false; + // Return a match if debug-info-filename is not specified. Otherwise, + // check for equality. + return DIFilename.empty() || It->second.equals(DIFilename); + }); + if (!FunctionFound) { + // Skip the following profile by setting the profile iterator (FI) to + // the past-the-end element. + FI = ProgramBBClusterInfo.end(); + continue; + } for (size_t i = 1; i < Aliases.size(); ++i) FuncAliasMap.try_emplace(Aliases[i], Aliases.front()); // Prepare for parsing clusters of this function name. // Start a new cluster map for this function name. - FI = ProgramBBClusterInfo.try_emplace(Aliases.front()).first; + auto R = ProgramBBClusterInfo.try_emplace(Aliases.front()); + // Report error when multiple profiles have been specified for the same + // function. + if (!R.second) + return invalidProfileError("Duplicate profile for function '" + + Aliases.front() + "'."); + FI = R.first; CurrentCluster = 0; FuncBBIDs.clear(); } @@ -131,11 +170,28 @@ static Error getBBClusterInfo(const MemoryBuffer *MBuf, return Error::success(); } -void BasicBlockSectionsProfileReader::initializePass() { +bool BasicBlockSectionsProfileReader::doInitialization(Module &M) { if (!MBuf) - return; - if (auto Err = getBBClusterInfo(MBuf, ProgramBBClusterInfo, FuncAliasMap)) + return false; + // Get the function name to debug info filename mapping. 
+ FunctionNameToDIFilename.clear(); + for (const Function &F : M) { + SmallString<128> DIFilename; + if (F.isDeclaration()) + continue; + DISubprogram *Subprogram = F.getSubprogram(); + if (Subprogram) { + llvm::DICompileUnit *CU = Subprogram->getUnit(); + if (CU) + DIFilename = sys::path::remove_leading_dotslash(CU->getFilename()); + } + [[maybe_unused]] bool inserted = + FunctionNameToDIFilename.try_emplace(F.getName(), DIFilename).second; + assert(inserted); + } + if (auto Err = ReadProfile()) report_fatal_error(std::move(Err)); + return false; } ImmutablePass * diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index d491691135dc..3830f25debaf 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -860,6 +860,14 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) { for (Register Reg : NewLiveIns) { if (!LiveRegs.available(*MRI, Reg)) continue; + + // Skip the register if we are about to add one of its super registers. + // TODO: Common this up with the same logic in addLiveIns(). + if (any_of(TRI->superregs(Reg), [&](MCPhysReg SReg) { + return NewLiveIns.contains(SReg) && !MRI->isReserved(SReg); + })) + continue; + DebugLoc DL; BuildMI(*Pred, InsertBefore, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Reg); @@ -1207,7 +1215,7 @@ bool BranchFolder::OptimizeBranches(MachineFunction &MF) { MadeChange |= OptimizeBlock(&MBB); // If it is dead, remove it. - if (MBB.pred_empty()) { + if (MBB.pred_empty() && !MBB.isMachineBlockAddressTaken()) { RemoveDeadBlock(&MBB); MadeChange = true; ++NumDeadBlocks; @@ -1507,42 +1515,43 @@ ReoptimizeBlock: } } - bool OptForSize = - MF.getFunction().hasOptSize() || - llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo); - if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) { - // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch - // direction, thereby defeating careful block placement and regressing - // performance. Therefore, only consider this for optsize functions. + if (!IsEmptyBlock(MBB)) { MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); if (TII->isUnconditionalTailCall(TailCall)) { - MachineBasicBlock *Pred = *MBB->pred_begin(); - MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; - SmallVector<MachineOperand, 4> PredCond; - bool PredAnalyzable = - !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); - - if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB && - PredTBB != PredFBB) { - // The predecessor has a conditional branch to this block which consists - // of only a tail call. Try to fold the tail call into the conditional - // branch. - if (TII->canMakeTailCallConditional(PredCond, TailCall)) { - // TODO: It would be nice if analyzeBranch() could provide a pointer - // to the branch instruction so replaceBranchWithTailCall() doesn't - // have to search for it.
- TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); - ++NumTailCalls; - Pred->removeSuccessor(MBB); - MadeChange = true; - return MadeChange; + SmallVector<MachineBasicBlock *> PredsChanged; + for (auto &Pred : MBB->predecessors()) { + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector<MachineOperand, 4> PredCond; + bool PredAnalyzable = + !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + + // Only eliminate if MBB == TBB (Taken Basic Block) + if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB && + PredTBB != PredFBB) { + // The predecessor has a conditional branch to this block which + // consists of only a tail call. Try to fold the tail call into the + // conditional branch. + if (TII->canMakeTailCallConditional(PredCond, TailCall)) { + // TODO: It would be nice if analyzeBranch() could provide a pointer + // to the branch instruction so replaceBranchWithTailCall() doesn't + // have to search for it. + TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); + PredsChanged.push_back(Pred); + } } + // If the predecessor is falling through to this block, we could reverse + // the branch condition and fold the tail call into that. However, after + // that we might have to re-arrange the CFG to fall through to the other + // block and there is a high risk of regressing code size rather than + // improving it. + } + if (!PredsChanged.empty()) { + NumTailCalls += PredsChanged.size(); + for (auto &Pred : PredsChanged) + Pred->removeSuccessor(MBB); + + return true; } - // If the predecessor is falling through to this block, we could reverse - // the branch condition and fold the tail call into that. However, after - // that we might have to re-arrange the CFG to fall through to the other - // block and there is a high risk of regressing code size rather than - // improving it. } } @@ -1876,8 +1885,8 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB, } else { if (Uses.erase(Reg)) { if (Reg.isPhysical()) { - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - Uses.erase(*SubRegs); // Use sub-registers to be conservative + for (MCPhysReg SubReg : TRI->subregs(Reg)) + Uses.erase(SubReg); // Use sub-registers to be conservative } } addRegAndItsAliases(Reg, TRI, Defs); @@ -1988,8 +1997,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { break; // Remove kills from ActiveDefsSet, these registers had short live ranges. - for (const MachineOperand &MO : TIB->operands()) { - if (!MO.isReg() || !MO.isUse() || !MO.isKill()) + for (const MachineOperand &MO : TIB->all_uses()) { + if (!MO.isKill()) continue; Register Reg = MO.getReg(); if (!Reg) @@ -2006,8 +2015,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { } // Track local defs so we can update liveins. 
- for (const MachineOperand &MO : TIB->operands()) { - if (!MO.isReg() || !MO.isDef() || MO.isDead()) + for (const MachineOperand &MO : TIB->all_defs()) { + if (MO.isDead()) continue; Register Reg = MO.getReg(); if (!Reg || Reg.isVirtual()) diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h index d0b6ed5ebe05..63b2ef04b21b 100644 --- a/llvm/lib/CodeGen/BranchFolding.h +++ b/llvm/lib/CodeGen/BranchFolding.h @@ -113,15 +113,15 @@ class TargetRegisterInfo; }; std::vector<SameTailElt> SameTails; - bool AfterBlockPlacement; - bool EnableTailMerge; - bool EnableHoistCommonCode; - bool UpdateLiveIns; + bool AfterBlockPlacement = false; + bool EnableTailMerge = false; + bool EnableHoistCommonCode = false; + bool UpdateLiveIns = false; unsigned MinCommonTailLength; - const TargetInstrInfo *TII; - const MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; - MachineLoopInfo *MLI; + const TargetInstrInfo *TII = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineLoopInfo *MLI = nullptr; LivePhysRegs LiveRegs; private: diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp index 016c81dc5aa4..05494f1ddc67 100644 --- a/llvm/lib/CodeGen/BranchRelaxation.cpp +++ b/llvm/lib/CodeGen/BranchRelaxation.cpp @@ -81,9 +81,9 @@ class BranchRelaxation : public MachineFunctionPass { std::unique_ptr<RegScavenger> RS; LivePhysRegs LiveRegs; - MachineFunction *MF; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; + MachineFunction *MF = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; bool relaxBranchInstructions(); void scanFunction(); @@ -132,6 +132,19 @@ void BranchRelaxation::verify() { assert(BlockInfo[Num].Size == computeBlockSize(MBB)); PrevNum = Num; } + + for (MachineBasicBlock &MBB : *MF) { + for (MachineBasicBlock::iterator J = MBB.getFirstTerminator(); + J != MBB.end(); J = std::next(J)) { + MachineInstr &MI = *J; + if (!MI.isConditionalBranch() && !MI.isUnconditionalBranch()) + continue; + if (MI.getOpcode() == TargetOpcode::FAULTING_OP) + continue; + MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI); + assert(isBlockInRange(MI, *DestBB)); + } + } #endif } diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 310273173647..618e41894b29 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp @@ -17,6 +17,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" @@ -34,9 +35,9 @@ namespace llvm { class BreakFalseDeps : public MachineFunctionPass { private: - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; + MachineFunction *MF = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; RegisterClassInfo RegClassInfo; /// List of undefined register reads in this block in forward order. @@ -45,7 +46,7 @@ private: /// Storage for register unit liveness. 
LivePhysRegs LiveRegSet; - ReachingDefAnalysis *RDA; + ReachingDefAnalysis *RDA = nullptr; public: static char ID; // Pass identification, replacement for typeid @@ -123,9 +124,9 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, MCRegister OriginalReg = MO.getReg().asMCReg(); // Update only undef operands that have reg units that are mapped to one root. - for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { + for (MCRegUnit Unit : TRI->regunits(OriginalReg)) { unsigned NumRoots = 0; - for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) { + for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { NumRoots++; if (NumRoots > 1) return false; @@ -139,9 +140,8 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, // If the instruction has a true dependency, we can hide the false depdency // behind it. - for (MachineOperand &CurrMO : MI->operands()) { - if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || - !OpRC->contains(CurrMO.getReg())) + for (MachineOperand &CurrMO : MI->all_uses()) { + if (CurrMO.isUndef() || !OpRC->contains(CurrMO.getReg())) continue; // We found a true dependency - replace the undef register with the true // dependency. @@ -290,10 +290,16 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); + // Skip Dead blocks due to ReachingDefAnalysis has no idea about instructions + // in them. + df_iterator_default_set<MachineBasicBlock *> Reachable; + for (MachineBasicBlock *MBB : depth_first_ext(&mf, Reachable)) + (void)MBB /* Mark all reachable blocks */; + // Traverse the basic blocks. - for (MachineBasicBlock &MBB : mf) { - processBasicBlock(&MBB); - } + for (MachineBasicBlock &MBB : mf) + if (Reachable.count(&MBB)) + processBasicBlock(&MBB); return false; } diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp index 25741686a829..6a024287f002 100644 --- a/llvm/lib/CodeGen/CFIInstrInserter.cpp +++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp @@ -141,6 +141,7 @@ INITIALIZE_PASS(CFIInstrInserter, "cfi-instr-inserter", FunctionPass *llvm::createCFIInstrInserter() { return new CFIInstrInserter(); } void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // Initial CFA offset value i.e. the one valid at the beginning of the // function. int InitialOffset = @@ -149,7 +150,7 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) { // function. Register InitialRegister = MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF); - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + InitialRegister = TRI.getDwarfRegNum(InitialRegister, true); unsigned NumRegs = TRI.getNumRegs(); // Initialize MBBMap. diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 615687abad81..0377bc002067 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -97,7 +97,7 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, // Trace copies introduced by live range splitting. The inline // spiller can rematerialize through these copies, so the spill // weight must reflect this. - while (MI->isFullCopy()) { + while (TII.isFullCopyInstr(*MI)) { // The copy destination must match the interval register. 
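The dead-block guard added to BreakFalseDeps above relies on a property of depth_first_ext: it records every node it visits in the external set passed to it, so draining the iterator leaves the set holding exactly the blocks reachable from the entry. The same idiom in isolation on the IR-level CFG (hypothetical helper name):

    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/Function.h"

    // Returns true if BB is reachable from the entry block of F.
    bool reachableFromEntry(llvm::Function &F, llvm::BasicBlock *BB) {
      llvm::df_iterator_default_set<llvm::BasicBlock *> Reachable;
      for (llvm::BasicBlock *Cur : llvm::depth_first_ext(&F, Reachable))
        (void)Cur; // the traversal itself populates Reachable
      return Reachable.count(BB);
    }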
if (MI->getOperand(0).getReg() != Reg) return false; @@ -133,7 +133,7 @@ bool VirtRegAuxInfo::isLiveAtStatepointVarArg(LiveInterval &LI) { MachineInstr *MI = MO.getParent(); if (MI->getOpcode() != TargetOpcode::STATEPOINT) return false; - return StatepointOpers(MI).getVarIdx() <= MI->getOperandNo(&MO); + return StatepointOpers(MI).getVarIdx() <= MO.getOperandNo(); }); } @@ -157,7 +157,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, unsigned NumInstr = 0; // Number of instructions using LI SmallPtrSet<MachineInstr *, 8> Visited; - std::pair<Register, Register> TargetHint = MRI.getRegAllocationHint(LI.reg()); + std::pair<unsigned, Register> TargetHint = MRI.getRegAllocationHint(LI.reg()); if (LI.isSpillable()) { Register Reg = LI.reg(); @@ -224,7 +224,16 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, continue; NumInstr++; - if (MI->isIdentityCopy() || MI->isImplicitDef()) + bool identityCopy = false; + auto DestSrc = TII.isCopyInstr(*MI); + if (DestSrc) { + const MachineOperand *DestRegOp = DestSrc->Destination; + const MachineOperand *SrcRegOp = DestSrc->Source; + identityCopy = DestRegOp->getReg() == SrcRegOp->getReg() && + DestRegOp->getSubReg() == SrcRegOp->getSubReg(); + } + + if (identityCopy || MI->isImplicitDef()) continue; if (!Visited.insert(MI).second) continue; @@ -258,7 +267,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, } // Get allocation hints from copies. - if (!MI->isCopy()) + if (!TII.isCopyInstr(*MI)) continue; Register HintReg = copyHint(MI, LI.reg(), TRI, MRI); if (!HintReg) diff --git a/llvm/lib/CodeGen/CallBrPrepare.cpp b/llvm/lib/CodeGen/CallBrPrepare.cpp new file mode 100644 index 000000000000..db243a0bfebe --- /dev/null +++ b/llvm/lib/CodeGen/CallBrPrepare.cpp @@ -0,0 +1,231 @@ +//===-- CallBrPrepare - Prepare callbr for code generation ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass lowers callbrs in LLVM IR in order to to assist SelectionDAG's +// codegen. +// +// In particular, this pass assists in inserting register copies for the output +// values of a callbr along the edges leading to the indirect target blocks. +// Though the output SSA value is defined by the callbr instruction itself in +// the IR representation, the value cannot be copied to the appropriate virtual +// registers prior to jumping to an indirect label, since the jump occurs +// within the user-provided assembly blob. +// +// Instead, those copies must occur separately at the beginning of each +// indirect target. That requires that we create a separate SSA definition in +// each of them (via llvm.callbr.landingpad), and may require splitting +// critical edges so we have a location to place the intrinsic. Finally, we +// remap users of the original callbr output SSA value to instead point to the +// appropriate llvm.callbr.landingpad value. +// +// Ideally, this could be done inside SelectionDAG, or in the +// MachineInstruction representation, without the use of an IR-level intrinsic. +// But, within the current framework, it’s simpler to implement as an IR pass. +// (If support for callbr in GlobalISel is implemented, it’s worth considering +// whether this is still required.) 
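The remapping described at the end of this header comment is implemented further down with SSAUpdater. Stripped of the callbr specifics, the machinery is: register one reaching definition per block, then ask the updater to rewrite a use, and it materializes PHIs at merge points as needed. A minimal sketch, assuming two blocks that each define the value:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Transforms/Utils/SSAUpdater.h"

    void rewriteUseViaSSAUpdater(llvm::Value *DefInBB1, llvm::BasicBlock *BB1,
                                 llvm::Value *DefInBB2, llvm::BasicBlock *BB2,
                                 llvm::Use &U) {
      llvm::SSAUpdater SSA;
      SSA.Initialize(DefInBB1->getType(), "example");
      SSA.AddAvailableValue(BB1, DefInBB1);
      SSA.AddAvailableValue(BB2, DefInBB2);
      // Rewires U to the definition reaching its user, inserting PHI nodes
      // where the two definitions meet.
      SSA.RewriteUse(U);
    }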
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +#define DEBUG_TYPE "callbrprepare" + +namespace { + +class CallBrPrepare : public FunctionPass { + bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT); + bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, + DominatorTree &DT) const; + void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic, + SSAUpdater &SSAUpdate) const; + +public: + CallBrPrepare() : FunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &Fn) override; + static char ID; +}; + +} // end anonymous namespace + +char CallBrPrepare::ID = 0; +INITIALIZE_PASS_BEGIN(CallBrPrepare, DEBUG_TYPE, "Prepare callbr", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(CallBrPrepare, DEBUG_TYPE, "Prepare callbr", false, false) + +FunctionPass *llvm::createCallBrPass() { return new CallBrPrepare(); } + +void CallBrPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTreeWrapperPass>(); +} + +static SmallVector<CallBrInst *, 2> FindCallBrs(Function &Fn) { + SmallVector<CallBrInst *, 2> CBRs; + for (BasicBlock &BB : Fn) + if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator())) + if (!CBR->getType()->isVoidTy() && !CBR->use_empty()) + CBRs.push_back(CBR); + return CBRs; +} + +bool CallBrPrepare::SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, + DominatorTree &DT) { + bool Changed = false; + CriticalEdgeSplittingOptions Options(&DT); + Options.setMergeIdenticalEdges(); + + // The indirect destination might be duplicated between another parameter... + // %0 = callbr ... [label %x, label %x] + // ...hence MergeIdenticalEdges and AllowIndentical edges, but we don't need + // to split the default destination if it's duplicated between an indirect + // destination... + // %1 = callbr ... to label %x [label %x] + // ...hence starting at 1 and checking against successor 0 (aka the default + // destination). 
+ for (CallBrInst *CBR : CBRs) + for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i) + if (CBR->getSuccessor(i) == CBR->getSuccessor(0) || + isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true)) + if (SplitKnownCriticalEdge(CBR, i, Options)) + Changed = true; + return Changed; +} + +bool CallBrPrepare::InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs, + DominatorTree &DT) const { + bool Changed = false; + SmallPtrSet<const BasicBlock *, 4> Visited; + IRBuilder<> Builder(CBRs[0]->getContext()); + for (CallBrInst *CBR : CBRs) { + if (!CBR->getNumIndirectDests()) + continue; + + SSAUpdater SSAUpdate; + SSAUpdate.Initialize(CBR->getType(), CBR->getName()); + SSAUpdate.AddAvailableValue(CBR->getParent(), CBR); + SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR); + + for (BasicBlock *IndDest : CBR->getIndirectDests()) { + if (!Visited.insert(IndDest).second) + continue; + Builder.SetInsertPoint(&*IndDest->begin()); + CallInst *Intrinsic = Builder.CreateIntrinsic( + CBR->getType(), Intrinsic::callbr_landingpad, {CBR}); + SSAUpdate.AddAvailableValue(IndDest, Intrinsic); + UpdateSSA(DT, CBR, Intrinsic, SSAUpdate); + Changed = true; + } + } + return Changed; +} + +static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) { + const auto *I = dyn_cast<Instruction>(U.getUser()); + return I && I->getParent() == BB; +} + +#ifndef NDEBUG +static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U, + const BasicBlock *BB, bool IsDefaultDest) { + if (!isa<Instruction>(U.getUser())) + return; + LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block " + << cast<Instruction>(U.getUser())->getParent()->getName() + << ", is " << (DT.dominates(BB, U) ? "" : "NOT ") + << "dominated by " << BB->getName() << " (" + << (IsDefaultDest ? "in" : "") << "direct)\n"); +} +#endif + +void CallBrPrepare::UpdateSSA(DominatorTree &DT, CallBrInst *CBR, + CallInst *Intrinsic, + SSAUpdater &SSAUpdate) const { + + SmallPtrSet<Use *, 4> Visited; + BasicBlock *DefaultDest = CBR->getDefaultDest(); + BasicBlock *LandingPad = Intrinsic->getParent(); + + SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses())); + for (Use *U : Uses) { + if (!Visited.insert(U).second) + continue; + +#ifndef NDEBUG + PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false); + PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true); +#endif + + // Don't rewrite the use in the newly inserted intrinsic. + if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser())) + if (II->getIntrinsicID() == Intrinsic::callbr_landingpad) + continue; + + // If the Use is in the same BasicBlock as the Intrinsic call, replace + // the Use with the value of the Intrinsic call. + if (IsInSameBasicBlock(*U, LandingPad)) { + U->set(Intrinsic); + continue; + } + + // If the Use is dominated by the default dest, do not touch it. + if (DT.dominates(DefaultDest, *U)) + continue; + + SSAUpdate.RewriteUse(*U); + } +} + +bool CallBrPrepare::runOnFunction(Function &Fn) { + bool Changed = false; + SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(Fn); + + if (CBRs.empty()) + return Changed; + + // It's highly likely that most programs do not contain CallBrInsts. Follow a + // similar pattern from SafeStackLegacyPass::runOnFunction to reuse previous + // domtree analysis if available, otherwise compute it lazily. This avoids + // forcing Dominator Tree Construction at -O0 for programs that likely do not + // contain CallBrInsts. 
It does pessimize programs with callbr at higher + // optimization levels, as the DominatorTree created here is not reused by + // subsequent passes. + DominatorTree *DT; + std::optional<DominatorTree> LazilyComputedDomTree; + if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) + DT = &DTWP->getDomTree(); + else { + LazilyComputedDomTree.emplace(Fn); + DT = &*LazilyComputedDomTree; + } + + if (SplitCriticalEdges(CBRs, *DT)) + Changed = true; + + if (InsertIntrinsicCalls(CBRs, *DT)) + Changed = true; + + return Changed; +} diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp index ce1ef571c9df..b7152587a9fa 100644 --- a/llvm/lib/CodeGen/CallingConvLower.cpp +++ b/llvm/lib/CodeGen/CallingConvLower.cpp @@ -25,12 +25,15 @@ using namespace llvm; -CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, - SmallVectorImpl<CCValAssign> &locs, LLVMContext &C) - : CallingConv(CC), IsVarArg(isVarArg), MF(mf), - TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) { +CCState::CCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF, + SmallVectorImpl<CCValAssign> &Locs, LLVMContext &Context, + bool NegativeOffsets) + : CallingConv(CC), IsVarArg(IsVarArg), MF(MF), + TRI(*MF.getSubtarget().getRegisterInfo()), Locs(Locs), Context(Context), + NegativeOffsets(NegativeOffsets) { + // No stack is used. - StackOffset = 0; + StackSize = 0; clearByValRegsInfo(); UsedRegs.resize((TRI.getNumRegs()+31)/32); @@ -51,7 +54,7 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT, ensureMaxAlignment(Alignment); MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Alignment); Size = unsigned(alignTo(Size, MinAlign)); - unsigned Offset = AllocateStack(Size, Alignment); + uint64_t Offset = AllocateStack(Size, Alignment); addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); } @@ -129,7 +132,7 @@ void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " - << EVT(ArgVT).getEVTString() << '\n'; + << ArgVT << '\n'; #endif llvm_unreachable(nullptr); } @@ -147,7 +150,7 @@ void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs, if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) { #ifndef NDEBUG dbgs() << "Call operand #" << i << " has unhandled type " - << EVT(ArgVT).getEVTString() << '\n'; + << ArgVT << '\n'; #endif llvm_unreachable(nullptr); } @@ -164,7 +167,7 @@ void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) { #ifndef NDEBUG dbgs() << "Call result #" << i << " has unhandled type " - << EVT(VT).getEVTString() << '\n'; + << VT << '\n'; #endif llvm_unreachable(nullptr); } @@ -176,7 +179,7 @@ void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) { if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) { #ifndef NDEBUG dbgs() << "Call result has unhandled type " - << EVT(VT).getEVTString() << '\n'; + << VT << '\n'; #endif llvm_unreachable(nullptr); } @@ -197,7 +200,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) { void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, MVT VT, CCAssignFn Fn) { - unsigned SavedStackOffset = StackOffset; + uint64_t SavedStackSize = StackSize; Align SavedMaxStackArgAlign = MaxStackArgAlign; unsigned NumLocs = Locs.size(); @@ -212,7 +215,7 @@ void 
CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, do { if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) { #ifndef NDEBUG - dbgs() << "Call has unhandled type " << EVT(VT).getEVTString() + dbgs() << "Call has unhandled type " << VT << " while computing remaining regparms\n"; #endif llvm_unreachable(nullptr); @@ -229,7 +232,7 @@ void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs, // Clear the assigned values and stack memory. We leave the registers marked // as allocated so that future queries don't return the same registers, i.e. // when i64 and f64 are both passed in GPRs. - StackOffset = SavedStackOffset; + StackSize = SavedStackSize; MaxStackArgAlign = SavedMaxStackArgAlign; Locs.truncate(NumLocs); } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 398ff56f737c..6272b654b329 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -11,7 +11,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm-c/Initialization.h" #include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" @@ -24,6 +23,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeBasicBlockSectionsPass(Registry); initializeBranchFolderPassPass(Registry); initializeBranchRelaxationPass(Registry); + initializeBreakFalseDepsPass(Registry); + initializeCallBrPreparePass(Registry); initializeCFGuardLongjmpPass(Registry); initializeCFIFixupPass(Registry); initializeCFIInstrInserterPass(Registry); @@ -48,7 +49,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeFuncletLayoutPass(Registry); initializeGCMachineCodeAnalysisPass(Registry); initializeGCModuleInfoPass(Registry); - initializeHardwareLoopsPass(Registry); + initializeHardwareLoopsLegacyPass(Registry); initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); initializeIndirectBrExpandPassPass(Registry); @@ -140,7 +141,3 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeWinEHPreparePass(Registry); initializeXRayInstrumentationPass(Registry); } - -void LLVMInitializeCodeGen(LLVMPassRegistryRef R) { - initializeCodeGen(*unwrap(R)); -} diff --git a/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/llvm/lib/CodeGen/CodeGenCommonISel.cpp index a5215969c0dd..577c5dbc8e2d 100644 --- a/llvm/lib/CodeGen/CodeGenCommonISel.cpp +++ b/llvm/lib/CodeGen/CodeGenCommonISel.cpp @@ -173,11 +173,11 @@ llvm::findSplitPointForStackProtector(MachineBasicBlock *BB, return SplitPoint; } -unsigned llvm::getInvertedFPClassTest(unsigned Test) { - unsigned InvertedTest = ~Test & fcAllFlags; - switch (InvertedTest) { - default: - break; +FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) { + FPClassTest InvertedTest = ~Test; + // Pick the direction with fewer tests + // TODO: Handle more combinations of cases that can be handled together + switch (static_cast<unsigned>(InvertedTest)) { case fcNan: case fcSNan: case fcQNan: @@ -196,9 +196,15 @@ unsigned llvm::getInvertedFPClassTest(unsigned Test) { case fcFinite: case fcPosFinite: case fcNegFinite: + case fcZero | fcNan: + case fcSubnormal | fcZero: + case fcSubnormal | fcZero | fcNan: return InvertedTest; + default: + return fcNone; } - return 0; + + llvm_unreachable("covered FPClassTest"); } static MachineOperand *getSalvageOpsForCopy(const MachineRegisterInfo &MRI, diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index dd431cc6f4f5..b00df0b6c6cb 100644 --- 
a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -82,7 +83,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -257,13 +257,17 @@ static cl::opt<bool> "CodeGenPrepare.")); static cl::opt<bool> - OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(false), + OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(true), cl::desc("Enable converting phi types in CodeGenPrepare")); static cl::opt<unsigned> HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(10000), cl::Hidden, cl::desc("Least BB number of huge function.")); +static cl::opt<unsigned> + MaxAddressUsersToScan("cgp-max-address-users-to-scan", cl::init(100), + cl::Hidden, + cl::desc("Max number of address users to look at")); namespace { enum ExtType { @@ -294,16 +298,16 @@ class TypePromotionTransaction; class CodeGenPrepare : public FunctionPass { const TargetMachine *TM = nullptr; - const TargetSubtargetInfo *SubtargetInfo; + const TargetSubtargetInfo *SubtargetInfo = nullptr; const TargetLowering *TLI = nullptr; - const TargetRegisterInfo *TRI; + const TargetRegisterInfo *TRI = nullptr; const TargetTransformInfo *TTI = nullptr; const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr; - const TargetLibraryInfo *TLInfo; - const LoopInfo *LI; + const TargetLibraryInfo *TLInfo = nullptr; + LoopInfo *LI = nullptr; std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - ProfileSummaryInfo *PSI; + ProfileSummaryInfo *PSI = nullptr; /// As we scan instructions optimizing them, this is the next instruction /// to optimize. Transforms that can invalidate this should update it. @@ -373,6 +377,15 @@ public: bool runOnFunction(Function &F) override; + void releaseMemory() override { + // Clear per function information. + InsertedInsts.clear(); + PromotedInsts.clear(); + FreshBBs.clear(); + BPI.reset(); + BFI.reset(); + } + StringRef getPassName() const override { return "CodeGen Prepare"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -413,7 +426,7 @@ private: void removeAllAssertingVHReferences(Value *V); bool eliminateAssumptions(Function &F); - bool eliminateFallThrough(Function &F); + bool eliminateFallThrough(Function &F, DominatorTree *DT = nullptr); bool eliminateMostlyEmptyBlocks(Function &F); BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB); bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; @@ -494,10 +507,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) { DL = &F.getParent()->getDataLayout(); bool EverMadeChange = false; - // Clear per function information. - InsertedInsts.clear(); - PromotedInsts.clear(); - FreshBBs.clear(); TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); SubtargetInfo = TM->getSubtargetImpl(F); @@ -574,11 +583,15 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Because the basic algorithm's complex is near O(N!). 
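On the new MaxAddressUsersToScan option above: a static cl::opt declares a command-line flag that reads like a plain value at its use site (here, the budget check in FindAllMemoryUses later in this file). The shape of such a definition, with a hypothetical flag name:

    #include "llvm/Support/CommandLine.h"

    // -example-scan-budget=<n>; cl::Hidden keeps it out of the default -help.
    static llvm::cl::opt<unsigned> ExampleScanBudget(
        "example-scan-budget", llvm::cl::init(100), llvm::cl::Hidden,
        llvm::cl::desc("Hypothetical cap on users scanned per address"));

    bool overBudget(unsigned Seen) { return Seen >= ExampleScanBudget; }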
IsHugeFunc = F.size() > HugeFuncThresholdInCGPP; + // Transformations above may invalidate dominator tree and/or loop info. + DT.reset(); + LI->releaseMemory(); + LI->analyze(getDT(F)); + bool MadeChange = true; bool FuncIterated = false; while (MadeChange) { MadeChange = false; - DT.reset(); for (BasicBlock &BB : llvm::make_early_inc_range(F)) { if (FuncIterated && !FreshBBs.contains(&BB)) @@ -587,6 +600,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT; bool Changed = optimizeBlock(BB, ModifiedDTOnIteration); + if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT) + DT.reset(); + MadeChange |= Changed; if (IsHugeFunc) { // If the BB is updated, it may still has chance to be optimized. @@ -602,9 +618,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) { FreshBBs.insert(&BB); else if (FuncIterated) FreshBBs.erase(&BB); - - if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT) - DT.reset(); } else { // For small/normal functions, we restart BB iteration if the dominator // tree of the Function was changed. @@ -622,7 +635,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) { MadeChange |= optimizePhiTypes(F); if (MadeChange) - eliminateFallThrough(F); + eliminateFallThrough(F, DT.get()); + +#ifndef NDEBUG + if (MadeChange && VerifyLoopInfo) + LI->verify(getDT(F)); +#endif // Really free removed instructions during promotion. for (Instruction *I : RemovedInsts) @@ -755,7 +773,7 @@ void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) { /// Merge basic blocks which are connected by a single edge, where one of the /// basic blocks has a single successor pointing to the other basic block, /// which has a single predecessor. -bool CodeGenPrepare::eliminateFallThrough(Function &F) { +bool CodeGenPrepare::eliminateFallThrough(Function &F, DominatorTree *DT) { bool Changed = false; // Scan all of the blocks in the function, except for the entry block. // Use a temporary array to avoid iterator being invalidated when @@ -777,13 +795,19 @@ bool CodeGenPrepare::eliminateFallThrough(Function &F) { if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; + // Make an effort to skip unreachable blocks. + if (DT && !DT->isReachableFromEntry(BB)) + continue; + BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { Changed = true; LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n"); // Merge BB into SinglePred and delete it. - MergeBlockIntoPredecessor(BB); + MergeBlockIntoPredecessor(BB, /* DTU */ nullptr, LI, /* MSSAU */ nullptr, + /* MemDep */ nullptr, + /* PredecessorWithTwoSuccessors */ false, DT); Preds.insert(SinglePred); if (IsHugeFunc) { @@ -1579,6 +1603,7 @@ static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp, /// intrinsic. Return true if any changes were made. bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT) { + bool EdgeCase = false; Value *A, *B; BinaryOperator *Add; if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) { @@ -1587,11 +1612,12 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases. A = Add->getOperand(0); B = Add->getOperand(1); + EdgeCase = true; } if (!TLI->shouldFormOverflowOp(ISD::UADDO, TLI->getValueType(*DL, Add->getType()), - Add->hasNUsesOrMore(2))) + Add->hasNUsesOrMore(EdgeCase ? 
1 : 2))) return false; // We don't want to move around uses of condition values this late, so we @@ -1660,7 +1686,7 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, if (!TLI->shouldFormOverflowOp(ISD::USUBO, TLI->getValueType(*DL, Sub->getType()), - Sub->hasNUsesOrMore(2))) + Sub->hasNUsesOrMore(1))) return false; if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1), @@ -1825,6 +1851,37 @@ static bool foldICmpWithDominatingICmp(CmpInst *Cmp, return true; } +/// Many architectures use the same instruction for both subtract and cmp. Try +/// to swap cmp operands to match subtract operations to allow for CSE. +static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) { + Value *Op0 = Cmp->getOperand(0); + Value *Op1 = Cmp->getOperand(1); + if (!Op0->getType()->isIntegerTy() || isa<Constant>(Op0) || + isa<Constant>(Op1) || Op0 == Op1) + return false; + + // If a subtract already has the same operands as a compare, swapping would be + // bad. If a subtract has the same operands as a compare but in reverse order, + // then swapping is good. + int GoodToSwap = 0; + unsigned NumInspected = 0; + for (const User *U : Op0->users()) { + // Avoid walking many users. + if (++NumInspected > 128) + return false; + if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0)))) + GoodToSwap++; + else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1)))) + GoodToSwap--; + } + + if (GoodToSwap > 0) { + Cmp->swapOperands(); + return true; + } + return false; +} + bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (sinkCmpExpression(Cmp, *TLI)) return true; @@ -1838,6 +1895,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { if (foldICmpWithDominatingICmp(Cmp, *TLI)) return true; + if (swapICmpOperandsToExposeCSEOpportunities(Cmp)) + return true; + return false; } @@ -2129,6 +2189,7 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, /// /// If the transform is performed, return true and set ModifiedDT to true. static bool despeculateCountZeros(IntrinsicInst *CountZeros, + LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, SmallSet<BasicBlock *, 32> &FreshBBs, @@ -2168,6 +2229,13 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, if (IsHugeFunc) FreshBBs.insert(EndBlock); + // Update the LoopInfo. The new blocks are in the same loop as the start + // block. + if (Loop *L = LI.getLoopFor(StartBlock)) { + L->addBasicBlockToLoop(CallBlock, LI); + L->addBasicBlockToLoop(EndBlock, LI); + } + // Set up a builder to create a compare, conditional branch, and PHI. IRBuilder<> Builder(CountZeros->getContext()); Builder.SetInsertPoint(StartBlock->getTerminator()); @@ -2279,7 +2347,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { if (!Arg->getType()->isPointerTy()) continue; unsigned AS = Arg->getType()->getPointerAddressSpace(); - return optimizeMemoryInst(CI, Arg, Arg->getType(), AS); + if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS)) + return true; } IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); @@ -2341,7 +2410,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { case Intrinsic::cttz: case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. 
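The new swapICmpOperandsToExposeCSEOpportunities above is built on PatternMatch: m_Specific pins a matcher to one already-known value, and m_Sub matches a sub instruction whose operands satisfy the given sub-patterns in order. The user-classification step, distilled into a self-contained helper (hypothetical name):

    #include "llvm/IR/PatternMatch.h"

    using namespace llvm::PatternMatch;

    // +1: U computes Op1 - Op0, so swapping a compare of (Op0, Op1) exposes CSE.
    // -1: U computes Op0 - Op1, so swapping would lose an existing match.
    int subDirection(const llvm::User *U, llvm::Value *Op0, llvm::Value *Op1) {
      if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
        return 1;
      if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
        return -1;
      return 0;
    }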
- return despeculateCountZeros(II, TLI, DL, ModifiedDT, FreshBBs, + return despeculateCountZeros(II, *LI, TLI, DL, ModifiedDT, FreshBBs, IsHugeFunc); case Intrinsic::fshl: case Intrinsic::fshr: @@ -2349,24 +2418,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { case Intrinsic::dbg_assign: case Intrinsic::dbg_value: return fixupDbgValue(II); - case Intrinsic::vscale: { - // If datalayout has no special restrictions on vector data layout, - // replace `llvm.vscale` by an equivalent constant expression - // to benefit from cheap constant propagation. - Type *ScalableVectorTy = - VectorType::get(Type::getInt8Ty(II->getContext()), 1, true); - if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinValue() == 8) { - auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo()); - auto *One = ConstantInt::getSigned(II->getType(), 1); - auto *CGep = - ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One); - replaceAllUsesWith(II, ConstantExpr::getPtrToInt(CGep, II->getType()), - FreshBBs, IsHugeFunc); - II->eraseFromParent(); - return true; - } - break; - } case Intrinsic::masked_gather: return optimizeGatherScatterInst(II, II->getArgOperand(0)); case Intrinsic::masked_scatter: @@ -2442,6 +2493,8 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, if (!RetI) return false; + assert(LI->getLoopFor(BB) == nullptr && "A return block cannot be in a loop"); + PHINode *PN = nullptr; ExtractValueInst *EVI = nullptr; BitCastInst *BCI = nullptr; @@ -2687,7 +2740,7 @@ void ExtAddrMode::print(raw_ostream &OS) const { if (InBounds) OS << "inbounds "; if (BaseGV) { - OS << (NeedPlus ? " + " : "") << "GV:"; + OS << "GV:"; BaseGV->printAsOperand(OS, /*PrintType=*/false); NeedPlus = true; } @@ -3073,6 +3126,9 @@ class TypePromotionTransaction { ~InstructionRemover() override { delete Replacer; } + InstructionRemover &operator=(const InstructionRemover &other) = delete; + InstructionRemover(const InstructionRemover &other) = delete; + /// Resurrect the instruction and reassign it to the proper uses if /// new value was provided when build this action. void undo() override { @@ -3258,7 +3314,7 @@ class AddressingModeMatcher { bool IgnoreProfitability; /// True if we are optimizing for size. - bool OptSize; + bool OptSize = false; ProfileSummaryInfo *PSI; BlockFrequencyInfo *BFI; @@ -3574,10 +3630,15 @@ private: /// Original Address. Value *Original; + /// Common value among addresses + Value *CommonValue = nullptr; + public: AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue) : SQ(_SQ), Original(OriginalValue) {} + ~AddressingModeCombiner() { eraseCommonValueIfDead(); } + /// Get the combined AddrMode const ExtAddrMode &getAddrMode() const { return AddrModes[0]; } @@ -3662,13 +3723,21 @@ public: if (!initializeMap(Map)) return false; - Value *CommonValue = findCommon(Map); + CommonValue = findCommon(Map); if (CommonValue) AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes); return CommonValue != nullptr; } private: + /// `CommonValue` may be a placeholder inserted by us. + /// If the placeholder is not used, we should remove this dead instruction. + void eraseCommonValueIfDead() { + if (CommonValue && CommonValue->getNumUses() == 0) + if (Instruction *CommonInst = dyn_cast<Instruction>(CommonValue)) + CommonInst->eraseFromParent(); + } + /// Initialize Map with anchor values. For address seen /// we set the value of different field saw in this address. 
/// At the same time we find a common type for different field we will @@ -3866,17 +3935,17 @@ private: SimplificationTracker &ST) { while (!TraverseOrder.empty()) { Value *Current = TraverseOrder.pop_back_val(); - assert(Map.find(Current) != Map.end() && "No node to fill!!!"); + assert(Map.contains(Current) && "No node to fill!!!"); Value *V = Map[Current]; if (SelectInst *Select = dyn_cast<SelectInst>(V)) { // CurrentValue also must be Select. auto *CurrentSelect = cast<SelectInst>(Current); auto *TrueValue = CurrentSelect->getTrueValue(); - assert(Map.find(TrueValue) != Map.end() && "No True Value!"); + assert(Map.contains(TrueValue) && "No True Value!"); Select->setTrueValue(ST.Get(Map[TrueValue])); auto *FalseValue = CurrentSelect->getFalseValue(); - assert(Map.find(FalseValue) != Map.end() && "No False Value!"); + assert(Map.contains(FalseValue) && "No False Value!"); Select->setFalseValue(ST.Get(Map[FalseValue])); } else { // Must be a Phi node then. @@ -3884,7 +3953,7 @@ private: // Fill the Phi node with values from predecessors. for (auto *B : predecessors(PHI->getParent())) { Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B); - assert(Map.find(PV) != Map.end() && "No predecessor Value!"); + assert(Map.contains(PV) && "No predecessor Value!"); PHI->addIncoming(ST.Get(Map[PV]), B); } } @@ -3908,7 +3977,7 @@ private: while (!Worklist.empty()) { Value *Current = Worklist.pop_back_val(); // if it is already visited or it is an ending value then skip it. - if (Map.find(Current) != Map.end()) + if (Map.contains(Current)) continue; TraverseOrder.push_back(Current); @@ -4627,7 +4696,8 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, return false; } case Instruction::Add: { - // Check to see if we can merge in the RHS then the LHS. If so, we win. + // Check to see if we can merge in one operand, then the other. If so, we + // win. ExtAddrMode BackupAddrMode = AddrMode; unsigned OldSize = AddrModeInsts.size(); // Start a transaction at this point. @@ -4637,9 +4707,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); + // Try to match an integer constant second to increase its chance of ending + // up in `BaseOffs`, resp. decrease its chance of ending up in `BaseReg`. + int First = 0, Second = 1; + if (isa<ConstantInt>(AddrInst->getOperand(First)) + && !isa<ConstantInt>(AddrInst->getOperand(Second))) + std::swap(First, Second); AddrMode.InBounds = false; - if (matchAddr(AddrInst->getOperand(1), Depth + 1) && - matchAddr(AddrInst->getOperand(0), Depth + 1)) + if (matchAddr(AddrInst->getOperand(First), Depth + 1) && + matchAddr(AddrInst->getOperand(Second), Depth + 1)) return true; // Restore the old addr mode info. @@ -4647,9 +4723,10 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, AddrModeInsts.resize(OldSize); TPT.rollback(LastKnownGood); - // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. - if (matchAddr(AddrInst->getOperand(0), Depth + 1) && - matchAddr(AddrInst->getOperand(1), Depth + 1)) + // Otherwise this was over-aggressive. Try merging operands in the opposite + // order. + if (matchAddr(AddrInst->getOperand(Second), Depth + 1) && + matchAddr(AddrInst->getOperand(First), Depth + 1)) return true; // Otherwise we definitely can't merge the ADD in. 
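The assert cleanups above swap find() != end() comparisons for the contains() member available on LLVM's DenseMap-family containers; identical semantics, with no iterator spelled out. For illustration:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/IR/Value.h"

    bool hasEntry(const llvm::DenseMap<llvm::Value *, llvm::Value *> &Map,
                  llvm::Value *Key) {
      return Map.contains(Key); // same as Map.find(Key) != Map.end()
    }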
@@ -4698,7 +4775,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { const APInt &CVal = CI->getValue(); - if (CVal.getMinSignedBits() <= 64) { + if (CVal.getSignificantBits() <= 64) { ConstantOffset += CVal.getSExtValue() * TypeSize; continue; } @@ -4718,36 +4795,35 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, // just add it to the disp field and check validity. if (VariableOperand == -1) { AddrMode.BaseOffs += ConstantOffset; - if (ConstantOffset == 0 || - TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) { - // Check to see if we can fold the base pointer in too. - if (matchAddr(AddrInst->getOperand(0), Depth + 1)) { + if (matchAddr(AddrInst->getOperand(0), Depth + 1)) { if (!cast<GEPOperator>(AddrInst)->isInBounds()) AddrMode.InBounds = false; return true; - } - } else if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) && - TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 && - ConstantOffset > 0) { - // Record GEPs with non-zero offsets as candidates for splitting in the - // event that the offset cannot fit into the r+i addressing mode. - // Simple and common case that only one GEP is used in calculating the - // address for the memory access. - Value *Base = AddrInst->getOperand(0); - auto *BaseI = dyn_cast<Instruction>(Base); - auto *GEP = cast<GetElementPtrInst>(AddrInst); - if (isa<Argument>(Base) || isa<GlobalValue>(Base) || - (BaseI && !isa<CastInst>(BaseI) && - !isa<GetElementPtrInst>(BaseI))) { - // Make sure the parent block allows inserting non-PHI instructions - // before the terminator. - BasicBlock *Parent = - BaseI ? BaseI->getParent() : &GEP->getFunction()->getEntryBlock(); - if (!Parent->getTerminator()->isEHPad()) - LargeOffsetGEP = std::make_pair(GEP, ConstantOffset); - } } AddrMode.BaseOffs -= ConstantOffset; + + if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) && + TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 && + ConstantOffset > 0) { + // Record GEPs with non-zero offsets as candidates for splitting in + // the event that the offset cannot fit into the r+i addressing mode. + // Simple and common case that only one GEP is used in calculating the + // address for the memory access. + Value *Base = AddrInst->getOperand(0); + auto *BaseI = dyn_cast<Instruction>(Base); + auto *GEP = cast<GetElementPtrInst>(AddrInst); + if (isa<Argument>(Base) || isa<GlobalValue>(Base) || + (BaseI && !isa<CastInst>(BaseI) && + !isa<GetElementPtrInst>(BaseI))) { + // Make sure the parent block allows inserting non-PHI instructions + // before the terminator. + BasicBlock *Parent = BaseI ? BaseI->getParent() + : &GEP->getFunction()->getEntryBlock(); + if (!Parent->getTerminator()->isEHPad()) + LargeOffsetGEP = std::make_pair(GEP, ConstantOffset); + } + } + return false; } @@ -4963,18 +5039,14 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, return true; } -// Max number of memory uses to look at before aborting the search to conserve -// compile time. -static constexpr int MaxMemoryUsesToScan = 20; - /// Recursively walk all the uses of I until we find a memory use. /// If we find an obviously non-foldable instruction, return true. /// Add accessed addresses and types to MemoryUses. 
static bool FindAllMemoryUses( - Instruction *I, SmallVectorImpl<std::pair<Value *, Type *>> &MemoryUses, + Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses, SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI, const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI, int SeenInsts = 0) { + BlockFrequencyInfo *BFI, unsigned &SeenInsts) { // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I).second) return false; @@ -4987,33 +5059,33 @@ static bool FindAllMemoryUses( for (Use &U : I->uses()) { // Conservatively return true if we're seeing a large number or a deep chain // of users. This avoids excessive compilation times in pathological cases. - if (SeenInsts++ >= MaxMemoryUsesToScan) + if (SeenInsts++ >= MaxAddressUsersToScan) return true; Instruction *UserI = cast<Instruction>(U.getUser()); if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) { - MemoryUses.push_back({U.get(), LI->getType()}); + MemoryUses.push_back({&U, LI->getType()}); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) { if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) return true; // Storing addr, not into addr. - MemoryUses.push_back({U.get(), SI->getValueOperand()->getType()}); + MemoryUses.push_back({&U, SI->getValueOperand()->getType()}); continue; } if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) { if (U.getOperandNo() != AtomicRMWInst::getPointerOperandIndex()) return true; // Storing addr, not into addr. - MemoryUses.push_back({U.get(), RMW->getValOperand()->getType()}); + MemoryUses.push_back({&U, RMW->getValOperand()->getType()}); continue; } if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) { if (U.getOperandNo() != AtomicCmpXchgInst::getPointerOperandIndex()) return true; // Storing addr, not into addr. - MemoryUses.push_back({U.get(), CmpX->getCompareOperand()->getType()}); + MemoryUses.push_back({&U, CmpX->getCompareOperand()->getType()}); continue; } @@ -5045,6 +5117,17 @@ static bool FindAllMemoryUses( return false; } +static bool FindAllMemoryUses( + Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses, + const TargetLowering &TLI, const TargetRegisterInfo &TRI, bool OptSize, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { + unsigned SeenInsts = 0; + SmallPtrSet<Instruction *, 16> ConsideredInsts; + return FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, + PSI, BFI, SeenInsts); +} + + /// Return true if Val is already known to be live at the use site that we're /// folding it into. If so, there is no cost to include it in the addressing /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the @@ -5126,10 +5209,8 @@ bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode( // we can remove the addressing mode and effectively trade one live register // for another (at worst.) In this context, folding an addressing mode into // the use is just a particularly nice way of sinking it. - SmallVector<std::pair<Value *, Type *>, 16> MemoryUses; - SmallPtrSet<Instruction *, 16> ConsideredInsts; - if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize, PSI, - BFI)) + SmallVector<std::pair<Use *, Type *>, 16> MemoryUses; + if (FindAllMemoryUses(I, MemoryUses, TLI, TRI, OptSize, PSI, BFI)) return false; // Has a non-memory, non-foldable use! 
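Motivating the Value* to Use* switch in FindAllMemoryUses above: a Use records not only the operand value but also which instruction uses it and in which operand slot, which the profitability check below needs when it re-runs the matcher per user. The accessors involved, in a short sketch:

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Use.h"
    #include "llvm/Support/Casting.h"

    void describeUse(const llvm::Use &U) {
      llvm::Value *Operand = U.get();                           // value being used
      auto *UserI = llvm::cast<llvm::Instruction>(U.getUser()); // the consumer
      unsigned SlotNo = U.getOperandNo();   // operand index within the consumer
      (void)Operand; (void)UserI; (void)SlotNo;
    }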
// Now that we know that all uses of this instruction are part of a chain of @@ -5142,8 +5223,9 @@ bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode( // growth since most architectures have some reasonable small and fast way to // compute an effective address. (i.e LEA on x86) SmallVector<Instruction *, 32> MatchedAddrModeInsts; - for (const std::pair<Value *, Type *> &Pair : MemoryUses) { - Value *Address = Pair.first; + for (const std::pair<Use *, Type *> &Pair : MemoryUses) { + Value *Address = Pair.first->get(); + Instruction *UserI = cast<Instruction>(Pair.first->getUser()); Type *AddressAccessTy = Pair.second; unsigned AS = Address->getType()->getPointerAddressSpace(); @@ -5156,7 +5238,7 @@ bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode( TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn, - AddressAccessTy, AS, MemoryInst, Result, + AddressAccessTy, AS, UserI, Result, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI, BFI); Matcher.IgnoreProfitability = true; @@ -5693,7 +5775,8 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, // Create a scalar GEP if there are more than 2 operands. if (Ops.size() != 2) { // Replace the last index with 0. - Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy); + Ops[FinalIndex] = + Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType()); Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front()); SourceTy = GetElementPtrInst::getIndexedType( SourceTy, ArrayRef(Ops).drop_front()); @@ -6027,6 +6110,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { int64_t Offset = LargeOffsetGEP->second; if (Offset != BaseOffset) { TargetLowering::AddrMode AddrMode; + AddrMode.HasBaseReg = true; AddrMode.BaseOffs = Offset - BaseOffset; // The result type of the GEP might not be the type of the memory // access. @@ -6044,7 +6128,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { // Generate a new GEP to replace the current one. LLVMContext &Ctx = GEP->getContext(); - Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); + Type *PtrIdxTy = DL->getIndexType(GEP->getType()); Type *I8PtrTy = Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace()); Type *I8Ty = Type::getInt8Ty(Ctx); @@ -6062,7 +6146,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) { NewBaseInsertBB = - SplitEdge(NewBaseInsertBB, Invoke->getNormalDest()); + SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI); NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt(); } else NewBaseInsertPt = std::next(BaseI->getIterator()); @@ -6074,7 +6158,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { } IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt); // Create a new base. - Value *BaseIndex = ConstantInt::get(IntPtrTy, BaseOffset); + Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset); NewBaseGEP = OldBase; if (NewBaseGEP->getType() != I8PtrTy) NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy); @@ -6090,7 +6174,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType()); } else { // Calculate the new offset for the new GEP. 
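On the IntPtrTy to PtrIdxTy change in splitLargeGEPOffsets above: DataLayout distinguishes the integer type that can hold a pointer from the type used for index (GEP offset) arithmetic on it, and the two can differ on targets with fat or tagged pointers. A sketch of the two queries:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"

    void pointerWidths(const llvm::DataLayout &DL, llvm::Type *PtrTy) {
      llvm::Type *IntPtrTy = DL.getIntPtrType(PtrTy); // pointer-sized integer
      llvm::Type *IdxTy = DL.getIndexType(PtrTy);     // width of offset arithmetic
      (void)IntPtrTy; (void)IdxTy;
    }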
- Value *Index = ConstantInt::get(IntPtrTy, Offset - BaseOffset); + Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset); NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index); if (GEP->getType() != I8PtrTy) @@ -6872,9 +6956,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { return false; TargetLowering::SelectSupportKind SelectKind; - if (VectorCond) - SelectKind = TargetLowering::VectorMaskSelect; - else if (SI->getType()->isVectorTy()) + if (SI->getType()->isVectorTy()) SelectKind = TargetLowering::ScalarCondVectorVal; else SelectKind = TargetLowering::ScalarValSelect; @@ -6915,88 +6997,88 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // first branch will point directly to select.end, and the corresponding PHI // predecessor block will be the start block. - // First, we split the block containing the select into 2 blocks. + // Collect values that go on the true side and the values that go on the false + // side. + SmallVector<Instruction *> TrueInstrs, FalseInstrs; + for (SelectInst *SI : ASI) { + if (Value *V = SI->getTrueValue(); sinkSelectOperand(TTI, V)) + TrueInstrs.push_back(cast<Instruction>(V)); + if (Value *V = SI->getFalseValue(); sinkSelectOperand(TTI, V)) + FalseInstrs.push_back(cast<Instruction>(V)); + } + + // Split the select block, according to how many (if any) values go on each + // side. BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); - BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); - if (IsHugeFunc) - FreshBBs.insert(EndBlock); - BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); - // Delete the unconditional branch that was just created by the split. - StartBlock->getTerminator()->eraseFromParent(); + IRBuilder<> IB(SI); + auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); - // These are the new basic blocks for the conditional branch. - // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr; BasicBlock *FalseBlock = nullptr; + BasicBlock *EndBlock = nullptr; BranchInst *TrueBranch = nullptr; BranchInst *FalseBranch = nullptr; - - // Sink expensive instructions into the conditional blocks to avoid executing - // them speculatively. 
- for (SelectInst *SI : ASI) { - if (sinkSelectOperand(TTI, SI->getTrueValue())) { - if (TrueBlock == nullptr) { - TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink", - EndBlock->getParent(), EndBlock); - TrueBranch = BranchInst::Create(EndBlock, TrueBlock); - if (IsHugeFunc) - FreshBBs.insert(TrueBlock); - TrueBranch->setDebugLoc(SI->getDebugLoc()); - } - auto *TrueInst = cast<Instruction>(SI->getTrueValue()); - TrueInst->moveBefore(TrueBranch); - } - if (sinkSelectOperand(TTI, SI->getFalseValue())) { - if (FalseBlock == nullptr) { - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink", - EndBlock->getParent(), EndBlock); - if (IsHugeFunc) - FreshBBs.insert(FalseBlock); - FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); - } - auto *FalseInst = cast<Instruction>(SI->getFalseValue()); - FalseInst->moveBefore(FalseBranch); - } + if (TrueInstrs.size() == 0) { + FalseBranch = cast<BranchInst>(SplitBlockAndInsertIfElse( + CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + FalseBlock = FalseBranch->getParent(); + EndBlock = cast<BasicBlock>(FalseBranch->getOperand(0)); + } else if (FalseInstrs.size() == 0) { + TrueBranch = cast<BranchInst>(SplitBlockAndInsertIfThen( + CondFr, &*SplitPt, false, nullptr, nullptr, LI)); + TrueBlock = TrueBranch->getParent(); + EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0)); + } else { + Instruction *ThenTerm = nullptr; + Instruction *ElseTerm = nullptr; + SplitBlockAndInsertIfThenElse(CondFr, &*SplitPt, &ThenTerm, &ElseTerm, + nullptr, nullptr, LI); + TrueBranch = cast<BranchInst>(ThenTerm); + FalseBranch = cast<BranchInst>(ElseTerm); + TrueBlock = TrueBranch->getParent(); + FalseBlock = FalseBranch->getParent(); + EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0)); + } + + EndBlock->setName("select.end"); + if (TrueBlock) + TrueBlock->setName("select.true.sink"); + if (FalseBlock) + FalseBlock->setName(FalseInstrs.size() == 0 ? "select.false" + : "select.false.sink"); + + if (IsHugeFunc) { + if (TrueBlock) + FreshBBs.insert(TrueBlock); + if (FalseBlock) + FreshBBs.insert(FalseBlock); + FreshBBs.insert(EndBlock); } - // If there was nothing to sink, then arbitrarily choose the 'false' side - // for a new input value to the PHI. - if (TrueBlock == FalseBlock) { - assert(TrueBlock == nullptr && - "Unexpected basic block transform while optimizing select"); + BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", - EndBlock->getParent(), EndBlock); - if (IsHugeFunc) - FreshBBs.insert(FalseBlock); - auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); - } + static const unsigned MD[] = { + LLVMContext::MD_prof, LLVMContext::MD_unpredictable, + LLVMContext::MD_make_implicit, LLVMContext::MD_dbg}; + StartBlock->getTerminator()->copyMetadata(*SI, MD); + + // Sink expensive instructions into the conditional blocks to avoid executing + // them speculatively. + for (Instruction *I : TrueInstrs) + I->moveBefore(TrueBranch); + for (Instruction *I : FalseInstrs) + I->moveBefore(FalseBranch); - // Insert the real conditional branch based on the original condition. // If we did not create a new block for one of the 'true' or 'false' paths // of the condition, it means that side of the branch goes to the end block // directly and the path originates from the start block from the point of // view of the new PHI. 
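The select-lowering rewrite above replaces hand-built sink blocks with the BasicBlockUtils helpers, and passing LoopInfo through keeps that analysis valid as blocks are created. Their call shape, mirrored in a minimal sketch:

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    void splitOnCondition(llvm::Value *Cond, llvm::Instruction *SplitBefore,
                          llvm::LoopInfo *LI) {
      llvm::Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
      // Splits SplitBefore's block and wires in then/else blocks that branch
      // to the tail; new blocks are registered with LI when it is non-null.
      llvm::SplitBlockAndInsertIfThenElse(Cond, SplitBefore, &ThenTerm,
                                          &ElseTerm, /*BranchWeights=*/nullptr,
                                          /*DTU=*/nullptr, LI);
    }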
- BasicBlock *TT, *FT; - if (TrueBlock == nullptr) { - TT = EndBlock; - FT = FalseBlock; + if (TrueBlock == nullptr) TrueBlock = StartBlock; - } else if (FalseBlock == nullptr) { - TT = TrueBlock; - FT = EndBlock; + else if (FalseBlock == nullptr) FalseBlock = StartBlock; - } else { - TT = TrueBlock; - FT = FalseBlock; - } - IRBuilder<> IB(SI); - auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); - IB.CreateCondBr(CondFr, TT, FT, SI); SmallPtrSet<const Instruction *, 2> INS; INS.insert(ASI.begin(), ASI.end()); @@ -7105,7 +7187,7 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { if (IsHugeFunc) { // Now we clone an instruction, its operands' defs may sink to this BB - // now. So we put the operands defs' BBs into FreshBBs to do optmization. + // now. So we put the operands defs' BBs into FreshBBs to do optimization. for (unsigned I = 0; I < NI->getNumOperands(); ++I) { auto *OpDef = dyn_cast<Instruction>(NI->getOperand(I)); if (!OpDef) @@ -7696,7 +7778,7 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, // whereas scalable vectors would have to be shifted by // <2log(vscale) + number of bits> in order to store the // low/high parts. Bailing out for now. - if (isa<ScalableVectorType>(StoreType)) + if (StoreType->isScalableTy()) return false; if (!DL.typeSizeEqualsStoreSize(StoreType) || @@ -8051,8 +8133,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) { return true; if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) && - TLI->optimizeExtendOrTruncateConversion(I, - LI->getLoopFor(I->getParent()))) + TLI->optimizeExtendOrTruncateConversion( + I, LI->getLoopFor(I->getParent()), *TTI)) return true; if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { @@ -8064,7 +8146,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) { return SinkCast(CI); } else { if (TLI->optimizeExtendOrTruncateConversion( - I, LI->getLoopFor(I->getParent()))) + I, LI->getLoopFor(I->getParent()), *TTI)) return true; bool MadeChange = optimizeExt(I); @@ -8128,7 +8210,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) { GEPI->getName(), GEPI); NC->setDebugLoc(GEPI->getDebugLoc()); replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc); - GEPI->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions( + GEPI, TLInfo, nullptr, + [&](Value *V) { removeAllAssertingVHReferences(V); }); ++NumGEPsElim; optimizeInst(NC, ModifiedDT); return true; diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 48cd8e998ec9..c34a52a6f2de 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -14,15 +14,15 @@ #include "llvm/CodeGen/CommandFlags.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/Triple.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCTargetOptionsCommandFlags.h" -#include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Host.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/SubtargetFeature.h" +#include "llvm/TargetParser/Triple.h" #include <optional> using namespace llvm; @@ -81,7 +81,7 @@ CGOPT(bool, StackSymbolOrdering) CGOPT(bool, StackRealign) CGOPT(std::string, TrapFuncName) CGOPT(bool, UseCtors) -CGOPT(bool, LowerGlobalDtorsViaCxaAtExit) +CGOPT(bool, DisableIntegratedAS) CGOPT(bool, RelaxELFRelocations) CGOPT_EXP(bool, DataSections) 
CGOPT_EXP(bool, FunctionSections) @@ -89,7 +89,7 @@ CGOPT(bool, IgnoreXCOFFVisibility) CGOPT(bool, XCOFFTracebackTable) CGOPT(std::string, BBSections) CGOPT(unsigned, TLSSize) -CGOPT(bool, EmulatedTLS) +CGOPT_EXP(bool, EmulatedTLS) CGOPT(bool, UniqueSectionNames) CGOPT(bool, UniqueBasicBlockSectionNames) CGOPT(EABI, EABIVersion) @@ -100,10 +100,11 @@ CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableDebugEntryValues) CGOPT(bool, ForceDwarfFrameSection) -CGOPT(bool, XRayOmitFunctionIndex) +CGOPT(bool, XRayFunctionIndex) CGOPT(bool, DebugStrictDwarf) CGOPT(unsigned, AlignLoops) CGOPT(bool, JMCInstrument) +CGOPT(bool, XCOFFReadOnlyPointers) codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { #define CGBINDOPT(NAME) \ @@ -241,14 +242,15 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableNoTrappingFPMath); - static const auto DenormFlagEnumOptions = - cl::values(clEnumValN(DenormalMode::IEEE, "ieee", - "IEEE 754 denormal numbers"), - clEnumValN(DenormalMode::PreserveSign, "preserve-sign", - "the sign of a flushed-to-zero number is preserved " - "in the sign of 0"), - clEnumValN(DenormalMode::PositiveZero, "positive-zero", - "denormals are flushed to positive zero")); + static const auto DenormFlagEnumOptions = cl::values( + clEnumValN(DenormalMode::IEEE, "ieee", "IEEE 754 denormal numbers"), + clEnumValN(DenormalMode::PreserveSign, "preserve-sign", + "the sign of a flushed-to-zero number is preserved " + "in the sign of 0"), + clEnumValN(DenormalMode::PositiveZero, "positive-zero", + "denormals are flushed to positive zero"), + clEnumValN(DenormalMode::Dynamic, "dynamic", + "denormals have unknown treatment")); // FIXME: Doesn't have way to specify separate input and output modes. 
static cl::opt<DenormalMode::DenormalModeKind> DenormalFPMath( @@ -349,12 +351,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(UseCtors); - static cl::opt<bool> LowerGlobalDtorsViaCxaAtExit( - "lower-global-dtors-via-cxa-atexit", - cl::desc("Lower llvm.global_dtors (global destructors) via __cxa_atexit"), - cl::init(true)); - CGBINDOPT(LowerGlobalDtorsViaCxaAtExit); - static cl::opt<bool> RelaxELFRelocations( "relax-elf-relocations", cl::desc( @@ -466,10 +462,10 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::desc("Always emit a debug frame section."), cl::init(false)); CGBINDOPT(ForceDwarfFrameSection); - static cl::opt<bool> XRayOmitFunctionIndex( - "no-xray-index", cl::desc("Don't emit xray_fn_idx section"), - cl::init(false)); - CGBINDOPT(XRayOmitFunctionIndex); + static cl::opt<bool> XRayFunctionIndex("xray-function-index", + cl::desc("Emit xray_fn_idx section"), + cl::init(true)); + CGBINDOPT(XRayFunctionIndex); static cl::opt<bool> DebugStrictDwarf( "strict-dwarf", cl::desc("use strict dwarf"), cl::init(false)); @@ -485,6 +481,18 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(JMCInstrument); + static cl::opt<bool> XCOFFReadOnlyPointers( + "mxcoff-roptr", + cl::desc("When set to true, const objects with relocatable address " + "values are put into the RO data section."), + cl::init(false)); + CGBINDOPT(XCOFFReadOnlyPointers); + + static cl::opt<bool> DisableIntegratedAS( + "no-integrated-as", cl::desc("Disable integrated assembler"), + cl::init(false)); + CGBINDOPT(DisableIntegratedAS); + #undef CGBINDOPT mc::RegisterMCTargetOptionsFlags(); @@ -538,7 +546,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.GuaranteedTailCallOpt = getEnableGuaranteedTailCallOpt(); Options.StackSymbolOrdering = getStackSymbolOrdering(); Options.UseInitArray = !getUseCtors(); - Options.LowerGlobalDtorsViaCxaAtExit = getLowerGlobalDtorsViaCxaAtExit(); + Options.DisableIntegratedAS = getDisableIntegratedAS(); Options.RelaxELFRelocations = getRelaxELFRelocations(); Options.DataSections = getExplicitDataSections().value_or(TheTriple.hasDefaultDataSections()); @@ -549,8 +557,8 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.UniqueSectionNames = getUniqueSectionNames(); Options.UniqueBasicBlockSectionNames = getUniqueBasicBlockSectionNames(); Options.TLSSize = getTLSSize(); - Options.EmulatedTLS = getEmulatedTLS(); - Options.ExplicitEmulatedTLS = EmulatedTLSView->getNumOccurrences() > 0; + Options.EmulatedTLS = + getExplicitEmulatedTLS().value_or(TheTriple.hasDefaultEmulatedTLS()); Options.ExceptionModel = getExceptionModel(); Options.EmitStackSizeSection = getEnableStackSizeSection(); Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter(); @@ -558,10 +566,11 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); - Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex(); + Options.XRayFunctionIndex = getXRayFunctionIndex(); Options.DebugStrictDwarf = getDebugStrictDwarf(); Options.LoopAlignment = getAlignLoops(); Options.JMCInstrument = getJMCInstrument(); + Options.XCOFFReadOnlyPointers = getXCOFFReadOnlyPointers(); Options.MCOptions = mc::InitMCTargetOptionsFromFlags(); diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp 
b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 9b1f7117fa57..02c67e500bdc 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -18,6 +18,11 @@ // pairs. Validity of each node is expected to be done upon creation, and any // validation errors should halt traversal and prevent further graph // construction. +// Instead of relying on Shuffle operations, vector interleaving and +// deinterleaving can be represented by vector.interleave2 and +// vector.deinterleave2 intrinsics. Scalable vectors can be represented only by +// these intrinsics, whereas fixed-width vectors are recognized for both the +// shufflevector instruction and the intrinsics. // // Replacement: // This step traverses the graph built up by identification, delegating to the @@ -62,6 +67,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Local.h" @@ -94,6 +100,13 @@ static bool isInterleavingMask(ArrayRef<int> Mask); /// <1, 3, 5, 7>). static bool isDeinterleavingMask(ArrayRef<int> Mask); +/// Returns true if the operation is a negation of V; it works for both +/// integers and floats. +static bool isNeg(Value *V); + +/// Returns the operand of a negation operation. +static Value *getNegOperand(Value *V); + namespace { class ComplexDeinterleavingLegacyPass : public FunctionPass { @@ -124,7 +137,7 @@ class ComplexDeinterleavingGraph; struct ComplexDeinterleavingCompositeNode { ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, - Instruction *R, Instruction *I) + Value *R, Value *I) : Operation(Op), Real(R), Imag(I) {} private: @@ -134,22 +147,21 @@ private: public: ComplexDeinterleavingOperation Operation; - Instruction *Real; - Instruction *Imag; + Value *Real; + Value *Imag; - // Instructions that should only exist within this node, there should be no - // users of these instructions outside the node. An example of these would be - // the multiply instructions of a partial multiply operation. - SmallVector<Instruction *> InternalInstructions; - ComplexDeinterleavingRotation Rotation; + // These two members are required exclusively for generating + // ComplexDeinterleavingOperation::Symmetric operations.
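+  // Illustrative example (not from this patch): a matching pair such as
+  //   %real = fadd fast <4 x float> %a, %b
+  //   %imag = fadd fast <4 x float> %c, %d
+  // becomes a single Symmetric node recording Opcode == Instruction::FAdd and
+  // Flags == 'fast', so replacement can re-emit one double-width fadd with
+  // identical semantics.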
+ unsigned Opcode; + std::optional<FastMathFlags> Flags; + + ComplexDeinterleavingRotation Rotation = + ComplexDeinterleavingRotation::Rotation_0; SmallVector<RawNodePtr> Operands; Value *ReplacementNode = nullptr; - void addInstruction(Instruction *I) { InternalInstructions.push_back(I); } void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } - bool hasAllInternalUses(SmallPtrSet<Instruction *, 16> &AllInstructions); - void dump() { dump(dbgs()); } void dump(raw_ostream &OS) { auto PrintValue = [&](Value *V) { @@ -181,40 +193,105 @@ public: OS << " - "; PrintNodeRef(Op); } - OS << " InternalInstructions:\n"; - for (const auto &I : InternalInstructions) { - OS << " - \""; - I->print(OS, true); - OS << "\"\n"; - } } }; class ComplexDeinterleavingGraph { public: + struct Product { + Value *Multiplier; + Value *Multiplicand; + bool IsPositive; + }; + + using Addend = std::pair<Value *, bool>; using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; - explicit ComplexDeinterleavingGraph(const TargetLowering *tl) : TL(tl) {} + + // Helper struct for holding info about potential partial multiplication + // candidates + struct PartialMulCandidate { + Value *Common; + NodePtr Node; + unsigned RealIdx; + unsigned ImagIdx; + bool IsNodeInverted; + }; + + explicit ComplexDeinterleavingGraph(const TargetLowering *TL, + const TargetLibraryInfo *TLI) + : TL(TL), TLI(TLI) {} private: - const TargetLowering *TL; - Instruction *RootValue; - NodePtr RootNode; + const TargetLowering *TL = nullptr; + const TargetLibraryInfo *TLI = nullptr; SmallVector<NodePtr> CompositeNodes; - SmallPtrSet<Instruction *, 16> AllInstructions; + + SmallPtrSet<Instruction *, 16> FinalInstructions; + + /// Root instructions are instructions from which complex computation starts + std::map<Instruction *, NodePtr> RootToNode; + + /// Topologically sorted root instructions + SmallVector<Instruction *, 1> OrderedRoots; + + /// When examining a basic block for complex deinterleaving, if it is a simple + /// one-block loop, then the only incoming block is 'Incoming' and the + /// 'BackEdge' block is the block itself. + BasicBlock *BackEdge = nullptr; + BasicBlock *Incoming = nullptr; + + /// ReductionInfo maps from %ReductionOp to %PHInode and Instruction + /// %OutsideUser as it is shown in the IR: + /// + /// vector.body: + /// %PHInode = phi <vector type> [ zeroinitializer, %entry ], + /// [ %ReductionOp, %vector.body ] + /// ... + /// %ReductionOp = fadd i64 ... + /// ... + /// br i1 %condition, label %vector.body, %middle.block + /// + /// middle.block: + /// %OutsideUser = llvm.vector.reduce.fadd(..., %ReductionOp) + /// + /// %OutsideUser can be `llvm.vector.reduce.fadd` or `fadd` preceding + /// `llvm.vector.reduce.fadd` when the unroll factor isn't one. + std::map<Instruction *, std::pair<PHINode *, Instruction *>> ReductionInfo; + + /// In the process of detecting a reduction, we consider a pair of + /// %ReductionOP, which we refer to as real and imag (or vice versa), and + /// traverse the use-tree to detect complex operations. As this is a reduction + /// operation, it will eventually reach RealPHI and ImagPHI, which correspond + /// to the %ReductionOPs that we suspect to be complex. + /// RealPHI and ImagPHI are used by the identifyPHINode method. + PHINode *RealPHI = nullptr; + PHINode *ImagPHI = nullptr; + + /// Set this flag to true if RealPHI and ImagPHI were reached during reduction + /// detection.
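+  /// For instance (hypothetical loop-carried pair), RealPHI and ImagPHI would
+  /// point at PHIs such as
+  ///   %sum.r = phi <4 x double> [ zeroinitializer, %entry ], [ %red.r, %vector.body ]
+  ///   %sum.i = phi <4 x double> [ zeroinitializer, %entry ], [ %red.i, %vector.body ]
+  /// and the flag below is set once identifyNode reaches both of them.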
+ bool PHIsFound = false; + + /// OldToNewPHI maps the original real PHINode to a new, double-sized PHINode. + /// The new PHINode corresponds to a vector of deinterleaved complex numbers. + /// This mapping is populated during + /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then + /// used in the ComplexDeinterleavingOperation::ReductionOperation node + /// replacement process. + std::map<PHINode *, PHINode *> OldToNewPHI; NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, - Instruction *R, Instruction *I) { + Value *R, Value *I) { + assert(((Operation != ComplexDeinterleavingOperation::ReductionPHI && + Operation != ComplexDeinterleavingOperation::ReductionOperation) || + (R && I)) && + "Reduction related nodes must have Real and Imaginary parts"); return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation, R, I); } NodePtr submitCompositeNode(NodePtr Node) { CompositeNodes.push_back(Node); - AllInstructions.insert(Node->Real); - AllInstructions.insert(Node->Imag); - for (auto *I : Node->InternalInstructions) - AllInstructions.insert(I); return Node; } @@ -242,9 +319,9 @@ private: /// Identify the other branch of a Partial Mul, taking the CommonOperandI that /// is partially known from identifyPartialMul, filling in the other half of /// the complex pair. - NodePtr identifyNodeWithImplicitAdd( - Instruction *I, Instruction *J, - std::pair<Instruction *, Instruction *> &CommonOperandI); + NodePtr + identifyNodeWithImplicitAdd(Instruction *I, Instruction *J, + std::pair<Value *, Value *> &CommonOperandI); /// Identifies a complex add pattern and its rotation, based on the following /// patterns. @@ -254,10 +331,76 @@ private: /// 270: r: ar + bi /// i: ai - br NodePtr identifyAdd(Instruction *Real, Instruction *Imag); - - NodePtr identifyNode(Instruction *I, Instruction *J); - - Value *replaceNode(RawNodePtr Node); + NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag); + + NodePtr identifyNode(Value *R, Value *I); + + /// Determine if a sum of complex numbers can be formed from \p RealAddends + /// and \p ImagAddends. If \p Accumulator is not null, add the result to it. + /// Return nullptr if it is not possible to construct a complex number. + /// \p Flags are needed to generate symmetric Add and Sub operations. + NodePtr identifyAdditions(std::list<Addend> &RealAddends, + std::list<Addend> &ImagAddends, + std::optional<FastMathFlags> Flags, + NodePtr Accumulator); + + /// Extract one addend that has both real and imaginary parts positive. + NodePtr extractPositiveAddend(std::list<Addend> &RealAddends, + std::list<Addend> &ImagAddends); + + /// Determine if a sum of multiplications of complex numbers can be formed from + /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result + /// to it. Return nullptr if it is not possible to construct a complex number. + NodePtr identifyMultiplications(std::vector<Product> &RealMuls, + std::vector<Product> &ImagMuls, + NodePtr Accumulator); + + /// Go through pairs of multiplication (one Real and one Imag) and find all + /// possible candidates for partial multiplication and put them into \p + /// Candidates.
Returns true if every Product has a pair with a common operand. + bool collectPartialMuls(const std::vector<Product> &RealMuls, + const std::vector<Product> &ImagMuls, + std::vector<PartialMulCandidate> &Candidates); + + /// If the code is compiled with -Ofast or expressions have the `reassoc` flag, + /// the order of complex computation operations may be significantly altered, + /// and the real and imaginary parts may not be executed in parallel. This + /// function takes this into consideration and employs a more general approach + /// to identify complex computations. Initially, it gathers all the addends + /// and multiplicands and then constructs a complex expression from them. + NodePtr identifyReassocNodes(Instruction *I, Instruction *J); + + NodePtr identifyRoot(Instruction *I); + + /// Identifies the Deinterleave operation applied to a vector containing + /// complex numbers. There are two ways to represent the Deinterleave + /// operation: + /// * Using two shufflevectors with even indices for the \p Real instruction and + /// odd indices for the \p Imag instructions (only for fixed-width vectors) + /// * Using two extractvalue instructions applied to `vector.deinterleave2` + /// intrinsic (for both fixed and scalable vectors) + NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); + + /// Identifies the operation that represents a complex number repeated in a + /// Splat vector. There are two possible types of splats: a ConstantExpr with + /// the opcode ShuffleVector and a ShuffleVectorInst. Both should have an + /// initialization mask with all values set to zero. + NodePtr identifySplat(Value *Real, Value *Imag); + + NodePtr identifyPHINode(Instruction *Real, Instruction *Imag); + + /// Identifies SelectInsts in a loop that has a reduction with predication masks + /// and/or predicated tail folding. + NodePtr identifySelectNode(Instruction *Real, Instruction *Imag); + + Value *replaceNode(IRBuilderBase &Builder, RawNodePtr Node); + + /// Complete IR modifications after producing a new reduction operation: + /// * Populate the PHINode generated for + /// ComplexDeinterleavingOperation::ReductionPHI + /// * Deinterleave the final value outside of the loop and repurpose original + /// reduction users + void processReductionOperation(Value *OperationReplacement, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -270,9 +413,18 @@ public: /// current graph. bool identifyNodes(Instruction *RootI); + /// In case \p B is a one-block loop, this function seeks potential reductions + /// and populates ReductionInfo. Returns true if any reductions were + /// identified. + bool collectPotentialReductions(BasicBlock *B); + + void identifyReductionNodes(); + + /// Check that every instruction, from the roots to the leaves, has internal + /// uses. + bool checkNodes(); + /// Perform the actual replacement of the underlying instruction graph. - /// Returns false if the deinterleaving operation should be cancelled for the - /// current graph.
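/// Replacement is attempted only for roots that survive checkNodes(); the
/// expected call sequence, as in evaluateBasicBlock below, is
///   if (Graph.checkNodes())
///     Graph.replaceNodes();
/// rather than inspecting a return value here.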
void replaceNodes(); }; @@ -368,43 +520,39 @@ static bool isDeinterleavingMask(ArrayRef<int> Mask) { return true; } -bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { - bool Changed = false; +bool isNeg(Value *V) { + return match(V, m_FNeg(m_Value())) || match(V, m_Neg(m_Value())); +} - SmallVector<Instruction *> DeadInstrRoots; +Value *getNegOperand(Value *V) { + assert(isNeg(V)); + auto *I = cast<Instruction>(V); + if (I->getOpcode() == Instruction::FNeg) + return I->getOperand(0); - for (auto &I : *B) { - auto *SVI = dyn_cast<ShuffleVectorInst>(&I); - if (!SVI) - continue; + return I->getOperand(1); +} - // Look for a shufflevector that takes separate vectors of the real and - // imaginary components and recombines them into a single vector. - if (!isInterleavingMask(SVI->getShuffleMask())) - continue; +bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { + ComplexDeinterleavingGraph Graph(TL, TLI); + if (Graph.collectPotentialReductions(B)) + Graph.identifyReductionNodes(); - ComplexDeinterleavingGraph Graph(TL); - if (!Graph.identifyNodes(SVI)) - continue; + for (auto &I : *B) + Graph.identifyNodes(&I); + if (Graph.checkNodes()) { Graph.replaceNodes(); - DeadInstrRoots.push_back(SVI); - Changed = true; - } - - for (const auto &I : DeadInstrRoots) { - if (!I || I->getParent() == nullptr) - continue; - llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + return true; } - return Changed; + return false; } ComplexDeinterleavingGraph::NodePtr ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( Instruction *Real, Instruction *Imag, - std::pair<Instruction *, Instruction *> &PartialMatch) { + std::pair<Value *, Value *> &PartialMatch) { LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag << "\n"); @@ -413,58 +561,47 @@ ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( return nullptr; } - if (Real->getOpcode() != Instruction::FMul || - Imag->getOpcode() != Instruction::FMul) { - LLVM_DEBUG(dbgs() << " - Real or imaginary instruction is not fmul\n"); + if ((Real->getOpcode() != Instruction::FMul && + Real->getOpcode() != Instruction::Mul) || + (Imag->getOpcode() != Instruction::FMul && + Imag->getOpcode() != Instruction::Mul)) { + LLVM_DEBUG( + dbgs() << " - Real or imaginary instruction is not fmul or mul\n"); return nullptr; } - Instruction *R0 = dyn_cast<Instruction>(Real->getOperand(0)); - Instruction *R1 = dyn_cast<Instruction>(Real->getOperand(1)); - Instruction *I0 = dyn_cast<Instruction>(Imag->getOperand(0)); - Instruction *I1 = dyn_cast<Instruction>(Imag->getOperand(1)); - if (!R0 || !R1 || !I0 || !I1) { - LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); - return nullptr; - } + Value *R0 = Real->getOperand(0); + Value *R1 = Real->getOperand(1); + Value *I0 = Imag->getOperand(0); + Value *I1 = Imag->getOperand(1); // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the // rotations and use the operand. 
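// Worked example of the encoding below: the enum value ends up equal to the
// rotation divided by 90 degrees. A negated real operand alone gives
// Negs == 1 (Rotation_90); a negated imaginary operand alone gives
// Negs == 3 (Rotation_270); negating both gives Negs == 2 (Rotation_180).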
unsigned Negs = 0; - SmallVector<Instruction *> FNegs; - if (R0->getOpcode() == Instruction::FNeg || - R1->getOpcode() == Instruction::FNeg) { + Value *Op; + if (match(R0, m_Neg(m_Value(Op)))) { Negs |= 1; - if (R0->getOpcode() == Instruction::FNeg) { - FNegs.push_back(R0); - R0 = dyn_cast<Instruction>(R0->getOperand(0)); - } else { - FNegs.push_back(R1); - R1 = dyn_cast<Instruction>(R1->getOperand(0)); - } - if (!R0 || !R1) - return nullptr; + R0 = Op; + } else if (match(R1, m_Neg(m_Value(Op)))) { + Negs |= 1; + R1 = Op; } - if (I0->getOpcode() == Instruction::FNeg || - I1->getOpcode() == Instruction::FNeg) { + + if (isNeg(I0)) { Negs |= 2; Negs ^= 1; - if (I0->getOpcode() == Instruction::FNeg) { - FNegs.push_back(I0); - I0 = dyn_cast<Instruction>(I0->getOperand(0)); - } else { - FNegs.push_back(I1); - I1 = dyn_cast<Instruction>(I1->getOperand(0)); - } - if (!I0 || !I1) - return nullptr; + I0 = getNegOperand(I0); + } else if (match(I1, m_Neg(m_Value(Op)))) { + Negs |= 2; + Negs ^= 1; + I1 = Op; } ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)Negs; - Instruction *CommonOperand; - Instruction *UncommonRealOp; - Instruction *UncommonImagOp; + Value *CommonOperand; + Value *UncommonRealOp; + Value *UncommonImagOp; if (R0 == I0 || R0 == I1) { CommonOperand = R0; @@ -512,7 +649,6 @@ ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( Node->Rotation = Rotation; Node->addOperand(CommonNode); Node->addOperand(UncommonNode); - Node->InternalInstructions.append(FNegs); return submitCompositeNode(Node); } @@ -522,26 +658,29 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag << "\n"); // Determine rotation + auto IsAdd = [](unsigned Op) { + return Op == Instruction::FAdd || Op == Instruction::Add; + }; + auto IsSub = [](unsigned Op) { + return Op == Instruction::FSub || Op == Instruction::Sub; + }; ComplexDeinterleavingRotation Rotation; - if (Real->getOpcode() == Instruction::FAdd && - Imag->getOpcode() == Instruction::FAdd) + if (IsAdd(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_0; - else if (Real->getOpcode() == Instruction::FSub && - Imag->getOpcode() == Instruction::FAdd) + else if (IsSub(Real->getOpcode()) && IsAdd(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_90; - else if (Real->getOpcode() == Instruction::FSub && - Imag->getOpcode() == Instruction::FSub) + else if (IsSub(Real->getOpcode()) && IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_180; - else if (Real->getOpcode() == Instruction::FAdd && - Imag->getOpcode() == Instruction::FSub) + else if (IsAdd(Real->getOpcode()) && IsSub(Imag->getOpcode())) Rotation = ComplexDeinterleavingRotation::Rotation_270; else { LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n"); return nullptr; } - if (!Real->getFastMathFlags().allowContract() || - !Imag->getFastMathFlags().allowContract()) { + if (isa<FPMathOperator>(Real) && + (!Real->getFastMathFlags().allowContract() || + !Imag->getFastMathFlags().allowContract())) { LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n"); return nullptr; } @@ -560,18 +699,14 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, return nullptr; } - Instruction *R0 = dyn_cast<Instruction>(RealMulI->getOperand(0)); - Instruction *R1 = dyn_cast<Instruction>(RealMulI->getOperand(1)); - Instruction *I0 = dyn_cast<Instruction>(ImagMulI->getOperand(0)); - Instruction *I1 =
dyn_cast<Instruction>(ImagMulI->getOperand(1)); - if (!R0 || !R1 || !I0 || !I1) { - LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); - return nullptr; - } + Value *R0 = RealMulI->getOperand(0); + Value *R1 = RealMulI->getOperand(1); + Value *I0 = ImagMulI->getOperand(0); + Value *I1 = ImagMulI->getOperand(1); - Instruction *CommonOperand; - Instruction *UncommonRealOp; - Instruction *UncommonImagOp; + Value *CommonOperand; + Value *UncommonRealOp; + Value *UncommonImagOp; if (R0 == I0 || R0 == I1) { CommonOperand = R0; @@ -589,7 +724,7 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, Rotation == ComplexDeinterleavingRotation::Rotation_270) std::swap(UncommonRealOp, UncommonImagOp); - std::pair<Instruction *, Instruction *> PartialMatch( + std::pair<Value *, Value *> PartialMatch( (Rotation == ComplexDeinterleavingRotation::Rotation_0 || Rotation == ComplexDeinterleavingRotation::Rotation_180) ? CommonOperand @@ -598,8 +733,16 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, Rotation == ComplexDeinterleavingRotation::Rotation_270) ? CommonOperand : nullptr); - NodePtr CNode = identifyNodeWithImplicitAdd( - cast<Instruction>(CR), cast<Instruction>(CI), PartialMatch); + + auto *CRInst = dyn_cast<Instruction>(CR); + auto *CIInst = dyn_cast<Instruction>(CI); + + if (!CRInst || !CIInst) { + LLVM_DEBUG(dbgs() << " - Common operands are not instructions.\n"); + return nullptr; + } + + NodePtr CNode = identifyNodeWithImplicitAdd(CRInst, CIInst, PartialMatch); if (!CNode) { LLVM_DEBUG(dbgs() << " - No cnode identified\n"); return nullptr; @@ -620,8 +763,6 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, NodePtr Node = prepareCompositeNode( ComplexDeinterleavingOperation::CMulPartial, Real, Imag); - Node->addInstruction(RealMulI); - Node->addInstruction(ImagMulI); Node->Rotation = Rotation; Node->addOperand(CommonRes); Node->addOperand(UncommonRes); @@ -696,129 +837,603 @@ static bool isInstructionPairMul(Instruction *A, Instruction *B) { return match(A, Pattern) && match(B, Pattern); } +static bool isInstructionPotentiallySymmetric(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FNeg: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + return true; + default: + return false; + } +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real, + Instruction *Imag) { + if (Real->getOpcode() != Imag->getOpcode()) + return nullptr; + + if (!isInstructionPotentiallySymmetric(Real) || + !isInstructionPotentiallySymmetric(Imag)) + return nullptr; + + auto *R0 = Real->getOperand(0); + auto *I0 = Imag->getOperand(0); + + NodePtr Op0 = identifyNode(R0, I0); + NodePtr Op1 = nullptr; + if (Op0 == nullptr) + return nullptr; + + if (Real->isBinaryOp()) { + auto *R1 = Real->getOperand(1); + auto *I1 = Imag->getOperand(1); + Op1 = identifyNode(R1, I1); + if (Op1 == nullptr) + return nullptr; + } + + if (isa<FPMathOperator>(Real) && + Real->getFastMathFlags() != Imag->getFastMathFlags()) + return nullptr; + + auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, + Real, Imag); + Node->Opcode = Real->getOpcode(); + if (isa<FPMathOperator>(Real)) + Node->Flags = Real->getFastMathFlags(); + + Node->addOperand(Op0); + if (Real->isBinaryOp()) + Node->addOperand(Op1); + + return submitCompositeNode(Node); +} + ComplexDeinterleavingGraph::NodePtr 
-ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) { - LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n"); - if (NodePtr CN = getContainingComposite(Real, Imag)) { +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { + LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n"); + assert(R->getType() == I->getType() && + "Real and imaginary parts should not have different types"); + if (NodePtr CN = getContainingComposite(R, I)) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return CN; } - auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real); - auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag); - if (RealShuffle && ImagShuffle) { - Value *RealOp1 = RealShuffle->getOperand(1); - if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) { - LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); - return nullptr; - } - Value *ImagOp1 = ImagShuffle->getOperand(1); - if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) { - LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); - return nullptr; - } + if (NodePtr CN = identifySplat(R, I)) + return CN; + + auto *Real = dyn_cast<Instruction>(R); + auto *Imag = dyn_cast<Instruction>(I); + if (!Real || !Imag) + return nullptr; + + if (NodePtr CN = identifyDeinterleave(Real, Imag)) + return CN; + + if (NodePtr CN = identifyPHINode(Real, Imag)) + return CN; + + if (NodePtr CN = identifySelectNode(Real, Imag)) + return CN; + + auto *VTy = cast<VectorType>(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy); + bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy); + + if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { + if (NodePtr CN = identifyPartialMul(Real, Imag)) + return CN; + } + + if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { + if (NodePtr CN = identifyAdd(Real, Imag)) + return CN; + } + + if (HasCMulSupport && HasCAddSupport) { + if (NodePtr CN = identifyReassocNodes(Real, Imag)) + return CN; + } + + if (NodePtr CN = identifySymmetricOperation(Real, Imag)) + return CN; - Value *RealOp0 = RealShuffle->getOperand(0); - Value *ImagOp0 = ImagShuffle->getOperand(0); + LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); + return nullptr; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, + Instruction *Imag) { + auto IsOperationSupported = [](unsigned Opcode) -> bool { + return Opcode == Instruction::FAdd || Opcode == Instruction::FSub || + Opcode == Instruction::FNeg || Opcode == Instruction::Add || + Opcode == Instruction::Sub; + }; + + if (!IsOperationSupported(Real->getOpcode()) || + !IsOperationSupported(Imag->getOpcode())) + return nullptr; - if (RealOp0 != ImagOp0) { - LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); + std::optional<FastMathFlags> Flags; + if (isa<FPMathOperator>(Real)) { + if (Real->getFastMathFlags() != Imag->getFastMathFlags()) { + LLVM_DEBUG(dbgs() << "The flags in Real and Imaginary instructions are " + "not identical\n"); return nullptr; } - ArrayRef<int> RealMask = RealShuffle->getShuffleMask(); - ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask(); - if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { - LLVM_DEBUG(dbgs() << " - Masks are not 
deinterleaving.\n"); + Flags = Real->getFastMathFlags(); + if (!Flags->allowReassoc()) { + LLVM_DEBUG( + dbgs() + << "the 'Reassoc' attribute is missing in the FastMath flags\n"); return nullptr; + } - if (RealMask[0] != 0 || ImagMask[0] != 1) { - LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); - return nullptr; + // Collect multiplications and addend instructions from the given instruction + // while traversing its operands. Additionally, verify that all instructions + // have the same fast math flags. + auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls, + std::list<Addend> &Addends) -> bool { + SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}}; + SmallPtrSet<Value *, 8> Visited; + while (!Worklist.empty()) { + auto [V, IsPositive] = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(V).second) + continue; + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) { + Addends.emplace_back(V, IsPositive); + continue; + } + + // If an instruction has more than one user, it indicates that it either + // has an external user, which will be later checked by the checkNodes + // function, or it is a subexpression utilized by multiple expressions. In + // the latter case, we will attempt to separately identify the complex + // operation from here in order to create a shared + // ComplexDeinterleavingCompositeNode. + if (I != Insn && I->getNumUses() > 1) { + LLVM_DEBUG(dbgs() << "Found potential sub-expression: " << *I << "\n"); + Addends.emplace_back(I, IsPositive); + continue; + } + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::Add: + Worklist.emplace_back(I->getOperand(1), IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + break; + case Instruction::FSub: + Worklist.emplace_back(I->getOperand(1), !IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + break; + case Instruction::Sub: + if (isNeg(I)) { + Worklist.emplace_back(getNegOperand(I), !IsPositive); + } else { + Worklist.emplace_back(I->getOperand(1), !IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } + break; + case Instruction::FMul: + case Instruction::Mul: { + Value *A, *B; + if (isNeg(I->getOperand(0))) { + A = getNegOperand(I->getOperand(0)); + IsPositive = !IsPositive; + } else { + A = I->getOperand(0); + } + + if (isNeg(I->getOperand(1))) { + B = getNegOperand(I->getOperand(1)); + IsPositive = !IsPositive; + } else { + B = I->getOperand(1); + } + Muls.push_back(Product{A, B, IsPositive}); + break; + } + case Instruction::FNeg: + Worklist.emplace_back(I->getOperand(0), !IsPositive); + break; + default: + Addends.emplace_back(I, IsPositive); + continue; + } + + if (Flags && I->getFastMathFlags() != *Flags) { + LLVM_DEBUG(dbgs() << "The instruction's fast math flags are " + "inconsistent with the root instructions' flags: " + << *I << "\n"); + return false; + } } return true; }; - // Type checking, the shuffle type should be a vector type of the same - // scalar type, but half the size - auto CheckType = [&](ShuffleVectorInst *Shuffle) { - Value *Op = Shuffle->getOperand(0); - auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType()); - auto *OpTy = cast<FixedVectorType>(Op->getType()); if (OpTy->getScalarType() != ShuffleTy->getScalarType()) return false; if
((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) - return false; + std::vector<Product> RealMuls, ImagMuls; + std::list<Addend> RealAddends, ImagAddends; + if (!Collect(Real, RealMuls, RealAddends) || + !Collect(Imag, ImagMuls, ImagAddends)) + return nullptr; - return true; - }; + if (RealAddends.size() != ImagAddends.size()) + return nullptr; - auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { - if (!CheckType(Shuffle)) - return false; + NodePtr FinalNode; + if (!RealMuls.empty() || !ImagMuls.empty()) { + // If there are multiplicands, extract positive addend and use it as an + // accumulator + FinalNode = extractPositiveAddend(RealAddends, ImagAddends); + FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode); + if (!FinalNode) + return nullptr; + } - ArrayRef<int> Mask = Shuffle->getShuffleMask(); - int Last = *Mask.rbegin(); + // Identify and process remaining additions + if (!RealAddends.empty() || !ImagAddends.empty()) { + FinalNode = identifyAdditions(RealAddends, ImagAddends, Flags, FinalNode); + if (!FinalNode) + return nullptr; + } + assert(FinalNode && "FinalNode cannot be nullptr here"); + // Set the Real and Imag fields of the final node and submit it + FinalNode->Real = Real; + FinalNode->Imag = Imag; + submitCompositeNode(FinalNode); + return FinalNode; +} - Value *Op = Shuffle->getOperand(0); - auto *OpTy = cast<FixedVectorType>(Op->getType()); - int NumElements = OpTy->getNumElements(); +bool ComplexDeinterleavingGraph::collectPartialMuls( + const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls, + std::vector<PartialMulCandidate> &PartialMulCandidates) { + // Helper function to extract a common operand from two products + auto FindCommonInstruction = [](const Product &Real, + const Product &Imag) -> Value * { + if (Real.Multiplicand == Imag.Multiplicand || + Real.Multiplicand == Imag.Multiplier) + return Real.Multiplicand; - // Ensure that the deinterleaving shuffle only pulls from the first - // shuffle operand. - return Last < NumElements; - }; + if (Real.Multiplier == Imag.Multiplicand || + Real.Multiplier == Imag.Multiplier) + return Real.Multiplier; - if (RealShuffle->getType() != ImagShuffle->getType()) { - LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); - return nullptr; + return nullptr; + }; + + // Iterating over real and imaginary multiplications to find common operands. + // If a common operand is found, a partial multiplication candidate is created + // and added to the candidates vector. The function returns false if no common + // operands are found for any product. + for (unsigned i = 0; i < RealMuls.size(); ++i) { + bool FoundCommon = false; + for (unsigned j = 0; j < ImagMuls.size(); ++j) { + auto *Common = FindCommonInstruction(RealMuls[i], ImagMuls[j]); + if (!Common) + continue; + + auto *A = RealMuls[i].Multiplicand == Common ? RealMuls[i].Multiplier + : RealMuls[i].Multiplicand; + auto *B = ImagMuls[j].Multiplicand == Common ?
ImagMuls[j].Multiplier + : ImagMuls[j].Multiplicand; + + auto Node = identifyNode(A, B); + if (Node) { + FoundCommon = true; + PartialMulCandidates.push_back({Common, Node, i, j, false}); + } + + Node = identifyNode(B, A); + if (Node) { + FoundCommon = true; + PartialMulCandidates.push_back({Common, Node, i, j, true}); + } + } + if (!FoundCommon) + return false; + } + return true; +} ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyMultiplications( + std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls, + NodePtr Accumulator = nullptr) { + if (RealMuls.size() != ImagMuls.size()) + return nullptr; + + std::vector<PartialMulCandidate> Info; + if (!collectPartialMuls(RealMuls, ImagMuls, Info)) + return nullptr; + + // Map to store common instruction to node pointers + std::map<Value *, NodePtr> CommonToNode; + std::vector<bool> Processed(Info.size(), false); + for (unsigned I = 0; I < Info.size(); ++I) { + if (Processed[I]) + continue; + + PartialMulCandidate &InfoA = Info[I]; + for (unsigned J = I + 1; J < Info.size(); ++J) { + if (Processed[J]) + continue; + + PartialMulCandidate &InfoB = Info[J]; + auto *InfoReal = &InfoA; + auto *InfoImag = &InfoB; + + auto NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); + if (!NodeFromCommon) { + std::swap(InfoReal, InfoImag); + NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common); + } + if (!NodeFromCommon) + continue; + + CommonToNode[InfoReal->Common] = NodeFromCommon; + CommonToNode[InfoImag->Common] = NodeFromCommon; + Processed[I] = true; + Processed[J] = true; } - if (!CheckDeinterleavingShuffle(RealShuffle)) { - LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); + } + + std::vector<bool> ProcessedReal(RealMuls.size(), false); + std::vector<bool> ProcessedImag(ImagMuls.size(), false); + NodePtr Result = Accumulator; + for (auto &PMI : Info) { + if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx]) + continue; + + auto It = CommonToNode.find(PMI.Common); + // TODO: Process independent complex multiplications. Cases like this: + // A.real() * B where both A and B are complex numbers. + if (It == CommonToNode.end()) { + LLVM_DEBUG({ + dbgs() << "Unprocessed independent partial multiplication:\n"; + for (auto *Mul : {&RealMuls[PMI.RealIdx], &ImagMuls[PMI.ImagIdx]}) + dbgs().indent(4) << (Mul->IsPositive ? "+" : "-") << *Mul->Multiplier + << " multiplied by " << *Mul->Multiplicand << "\n"; + }); return nullptr; } - if (!CheckDeinterleavingShuffle(ImagShuffle)) { - LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); - return nullptr; + + auto &RealMul = RealMuls[PMI.RealIdx]; + auto &ImagMul = ImagMuls[PMI.ImagIdx]; + + auto NodeA = It->second; + auto NodeB = PMI.Node; + auto IsMultiplicandReal = PMI.Common == NodeA->Real; + // The following table illustrates the relationship between multiplications + // and rotations.
If we consider the multiplication (X + iY) * (U + iV), we + // can see: + // + // Rotation | Real | Imag | + // ---------+--------+--------+ + // 0 | x * u | x * v | + // 90 | -y * v | y * u | + // 180 | -x * u | -x * v | + // 270 | y * v | -y * u | + // + // Check if the candidate can indeed be represented by partial + // multiplication + // TODO: Add support for multiplication by complex one + if ((IsMultiplicandReal && PMI.IsNodeInverted) || + (!IsMultiplicandReal && !PMI.IsNodeInverted)) + continue; + + // Determine the rotation based on the multiplications + ComplexDeinterleavingRotation Rotation; + if (IsMultiplicandReal) { + // Detect 0 and 180 degrees rotation + if (RealMul.IsPositive && ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_0; + else if (!RealMul.IsPositive && !ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_180; + else + continue; + + } else { + // Detect 90 and 270 degrees rotation + if (!RealMul.IsPositive && ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_90; + else if (RealMul.IsPositive && !ImagMul.IsPositive) + Rotation = llvm::ComplexDeinterleavingRotation::Rotation_270; + else + continue; } - NodePtr PlaceholderNode = - prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle, - RealShuffle, ImagShuffle); - PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); - return submitCompositeNode(PlaceholderNode); + LLVM_DEBUG({ + dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n"; + dbgs().indent(4) << "X: " << *NodeA->Real << "\n"; + dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n"; + dbgs().indent(4) << "U: " << *NodeB->Real << "\n"; + dbgs().indent(4) << "V: " << *NodeB->Imag << "\n"; + dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; + }); + + NodePtr NodeMul = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr); + NodeMul->Rotation = Rotation; + NodeMul->addOperand(NodeA); + NodeMul->addOperand(NodeB); + if (Result) + NodeMul->addOperand(Result); + submitCompositeNode(NodeMul); + Result = NodeMul; + ProcessedReal[PMI.RealIdx] = true; + ProcessedImag[PMI.ImagIdx] = true; } - if (RealShuffle || ImagShuffle) + + // Ensure all products have been processed, if not return nullptr. + if (!all_of(ProcessedReal, [](bool V) { return V; }) || + !all_of(ProcessedImag, [](bool V) { return V; })) { + + // Dump debug information about which partial multiplications are not + // processed. + LLVM_DEBUG({ + dbgs() << "Unprocessed products (Real):\n"; + for (size_t i = 0; i < ProcessedReal.size(); ++i) { + if (!ProcessedReal[i]) + dbgs().indent(4) << (RealMuls[i].IsPositive ? "+" : "-") + << *RealMuls[i].Multiplier << " multiplied by " + << *RealMuls[i].Multiplicand << "\n"; + } + dbgs() << "Unprocessed products (Imag):\n"; + for (size_t i = 0; i < ProcessedImag.size(); ++i) { + if (!ProcessedImag[i]) + dbgs().indent(4) << (ImagMuls[i].IsPositive ? 
"+" : "-") + << *ImagMuls[i].Multiplier << " multiplied by " + << *ImagMuls[i].Multiplicand << "\n"; + } + }); return nullptr; + } - auto *VTy = cast<FixedVectorType>(Real->getType()); - auto *NewVTy = - FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + return Result; +} - if (TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CMulPartial, NewVTy) && - isInstructionPairMul(Real, Imag)) { - return identifyPartialMul(Real, Imag); - } +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyAdditions( + std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends, + std::optional<FastMathFlags> Flags, NodePtr Accumulator = nullptr) { + if (RealAddends.size() != ImagAddends.size()) + return nullptr; - if (TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CAdd, NewVTy) && - isInstructionPairAdd(Real, Imag)) { - return identifyAdd(Real, Imag); + NodePtr Result; + // If we have accumulator use it as first addend + if (Accumulator) + Result = Accumulator; + // Otherwise find an element with both positive real and imaginary parts. + else + Result = extractPositiveAddend(RealAddends, ImagAddends); + + if (!Result) + return nullptr; + + while (!RealAddends.empty()) { + auto ItR = RealAddends.begin(); + auto [R, IsPositiveR] = *ItR; + + bool FoundImag = false; + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [I, IsPositiveI] = *ItI; + ComplexDeinterleavingRotation Rotation; + if (IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_0; + else if (!IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (!IsPositiveR && !IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_180; + else + Rotation = ComplexDeinterleavingRotation::Rotation_270; + + NodePtr AddNode; + if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) { + AddNode = identifyNode(R, I); + } else { + AddNode = identifyNode(I, R); + } + if (AddNode) { + LLVM_DEBUG({ + dbgs() << "Identified addition:\n"; + dbgs().indent(4) << "X: " << *R << "\n"; + dbgs().indent(4) << "Y: " << *I << "\n"; + dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n"; + }); + + NodePtr TmpNode; + if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) { + TmpNode = prepareCompositeNode( + ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); + if (Flags) { + TmpNode->Opcode = Instruction::FAdd; + TmpNode->Flags = *Flags; + } else { + TmpNode->Opcode = Instruction::Add; + } + } else if (Rotation == + llvm::ComplexDeinterleavingRotation::Rotation_180) { + TmpNode = prepareCompositeNode( + ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr); + if (Flags) { + TmpNode->Opcode = Instruction::FSub; + TmpNode->Flags = *Flags; + } else { + TmpNode->Opcode = Instruction::Sub; + } + } else { + TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, + nullptr, nullptr); + TmpNode->Rotation = Rotation; + } + + TmpNode->addOperand(Result); + TmpNode->addOperand(AddNode); + submitCompositeNode(TmpNode); + Result = TmpNode; + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + FoundImag = true; + break; + } + } + if (!FoundImag) + return nullptr; } + return Result; +} +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::extractPositiveAddend( + std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) { + for (auto ItR = 
RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [R, IsPositiveR] = *ItR; + auto [I, IsPositiveI] = *ItI; + if (IsPositiveR && IsPositiveI) { + auto Result = identifyNode(R, I); + if (Result) { + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + return Result; + } + } + } + } return nullptr; } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { - Instruction *Real; - Instruction *Imag; - if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) - return false; + // This potential root instruction might already have been recognized as a + // reduction. Because RootToNode maps both the Real and Imaginary parts to a + // CompositeNode, we should choose only one of them, either the Real or the + // Imag instruction, to use as an anchor for generating the complex + // instruction. + auto It = RootToNode.find(RootI); + if (It != RootToNode.end() && It->second->Real == RootI) { + OrderedRoots.push_back(RootI); + return true; + } - RootValue = RootI; - AllInstructions.insert(RootI); - RootNode = identifyNode(Real, Imag); + auto RootNode = identifyRoot(RootI); + if (!RootNode) + return false; LLVM_DEBUG({ Function *F = RootI->getFunction(); @@ -828,62 +1443,627 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { dump(dbgs()); dbgs() << "\n"; }); + RootToNode[RootI] = RootNode; + OrderedRoots.push_back(RootI); + return true; +} - // Check all instructions have internal uses - for (const auto &Node : CompositeNodes) { - if (!Node->hasAllInternalUses(AllInstructions)) { - LLVM_DEBUG(dbgs() << " - Invalid internal uses\n"); - return false; +bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { + bool FoundPotentialReduction = false; + + auto *Br = dyn_cast<BranchInst>(B->getTerminator()); + if (!Br || Br->getNumSuccessors() != 2) + return false; + + // Identify simple one-block loop + if (Br->getSuccessor(0) != B && Br->getSuccessor(1) != B) + return false; + + SmallVector<PHINode *> PHIs; + for (auto &PHI : B->phis()) { + if (PHI.getNumIncomingValues() != 2) + continue; + + if (!PHI.getType()->isVectorTy()) + continue; + + auto *ReductionOp = dyn_cast<Instruction>(PHI.getIncomingValueForBlock(B)); + if (!ReductionOp) + continue; + + // Check if the final instruction is reduced outside of the current block + Instruction *FinalReduction = nullptr; + auto NumUsers = 0u; + for (auto *U : ReductionOp->users()) { + ++NumUsers; + if (U == &PHI) + continue; + FinalReduction = dyn_cast<Instruction>(U); + } + + if (NumUsers != 2 || !FinalReduction || FinalReduction->getParent() == B || + isa<PHINode>(FinalReduction)) + continue; + + ReductionInfo[ReductionOp] = {&PHI, FinalReduction}; + BackEdge = B; + auto BackEdgeIdx = PHI.getBasicBlockIndex(B); + auto IncomingIdx = BackEdgeIdx == 0 ? 1 : 0; + Incoming = PHI.getIncomingBlock(IncomingIdx); + FoundPotentialReduction = true; + + // If the initial value of PHINode is an Instruction, consider it a leaf + // value of a complex deinterleaving graph.
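+    // For example, if the accumulator is seeded in the preheader by an
+    // instruction (say, a splat of a starting value) rather than by a plain
+    // constant, traversal must stop there; recording it as a final
+    // instruction keeps identification and checkNodes inside the loop body.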
+ if (auto *InitPHI = + dyn_cast<Instruction>(PHI.getIncomingValueForBlock(Incoming))) + FinalInstructions.insert(InitPHI); + } + return FoundPotentialReduction; +} + +void ComplexDeinterleavingGraph::identifyReductionNodes() { + SmallVector<bool> Processed(ReductionInfo.size(), false); + SmallVector<Instruction *> OperationInstruction; + for (auto &P : ReductionInfo) + OperationInstruction.push_back(P.first); + + // Identify a complex computation by evaluating two reduction operations that + // potentially could be involved + for (size_t i = 0; i < OperationInstruction.size(); ++i) { + if (Processed[i]) + continue; + for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { + if (Processed[j]) + continue; + + auto *Real = OperationInstruction[i]; + auto *Imag = OperationInstruction[j]; + if (Real->getType() != Imag->getType()) + continue; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = ReductionInfo[Imag].first; + PHIsFound = false; + auto Node = identifyNode(Real, Imag); + if (!Node) { + std::swap(Real, Imag); + std::swap(RealPHI, ImagPHI); + Node = identifyNode(Real, Imag); + } + + // If a node is identified and reduction PHINode is used in the chain of + // operations, mark its operation instructions as used to prevent + // re-identification and attach the node to the real part + if (Node && PHIsFound) { + LLVM_DEBUG(dbgs() << "Identified reduction starting from instructions: " + << *Real << " / " << *Imag << "\n"); + Processed[i] = true; + Processed[j] = true; + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionOperation, Real, Imag); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + RootToNode[Imag] = RootNode; + submitCompositeNode(RootNode); + break; + } } } - return RootNode != nullptr; + + RealPHI = nullptr; + ImagPHI = nullptr; } -Value *ComplexDeinterleavingGraph::replaceNode( - ComplexDeinterleavingGraph::RawNodePtr Node) { - if (Node->ReplacementNode) - return Node->ReplacementNode; +bool ComplexDeinterleavingGraph::checkNodes() { + // Collect all instructions from roots to leaves + SmallPtrSet<Instruction *, 16> AllInstructions; + SmallVector<Instruction *, 8> Worklist; + for (auto &Pair : RootToNode) + Worklist.push_back(Pair.first); - Value *Input0 = replaceNode(Node->Operands[0]); - Value *Input1 = replaceNode(Node->Operands[1]); - Value *Accumulator = - Node->Operands.size() > 2 ? 
replaceNode(Node->Operands[2]) : nullptr; + // Extract all instructions that are used by all XCMLA/XCADD/ADD/SUB/NEG + // chains + while (!Worklist.empty()) { + auto *I = Worklist.back(); + Worklist.pop_back(); - assert(Input0->getType() == Input1->getType() && - "Node inputs need to be of the same type"); + if (!AllInstructions.insert(I).second) + continue; - Node->ReplacementNode = TL->createComplexDeinterleavingIR( - Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); + for (Value *Op : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(Op)) { + if (!FinalInstructions.count(I)) + Worklist.emplace_back(OpI); + } + } + } - assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); - NumComplexTransformations += 1; - return Node->ReplacementNode; + // Find instructions that have users outside of the chain + SmallVector<Instruction *, 2> OuterInstructions; + for (auto *I : AllInstructions) { + // Skip root nodes + if (RootToNode.count(I)) + continue; + + for (User *U : I->users()) { + if (AllInstructions.count(cast<Instruction>(U))) + continue; + + // Found an instruction that is not used by XCMLA/XCADD chain + Worklist.emplace_back(I); + break; + } + } + + // If any instructions are found to be used outside, find and remove roots + // that somehow connect to those instructions. + SmallPtrSet<Instruction *, 16> Visited; + while (!Worklist.empty()) { + auto *I = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(I).second) + continue; + + // Found an impacted root node. Removing it from the nodes to be + // deinterleaved + if (RootToNode.count(I)) { + LLVM_DEBUG(dbgs() << "Instruction " << *I + << " could be deinterleaved but its chain of complex " + "operations has an outside user\n"); + RootToNode.erase(I); + } + + if (!AllInstructions.count(I) || FinalInstructions.count(I)) + continue; + + for (User *U : I->users()) + Worklist.emplace_back(cast<Instruction>(U)); + + for (Value *Op : I->operands()) { + if (auto *OpI = dyn_cast<Instruction>(Op)) + Worklist.emplace_back(OpI); + } + } + return !RootToNode.empty(); } -void ComplexDeinterleavingGraph::replaceNodes() { - Value *R = replaceNode(RootNode.get()); - assert(R && "Unable to find replacement for RootValue"); - RootValue->replaceAllUsesWith(R); +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) { + if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) { + if (Intrinsic->getIntrinsicID() != + Intrinsic::experimental_vector_interleave2) + return nullptr; + + auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0)); + auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1)); + if (!Real || !Imag) + return nullptr; + + return identifyNode(Real, Imag); + } + + auto *SVI = dyn_cast<ShuffleVectorInst>(RootI); + if (!SVI) + return nullptr; + + // Look for a shufflevector that takes separate vectors of the real and + // imaginary components and recombines them into a single vector.
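+  // A typical root (illustrative IR) interleaves two 4-element vectors:
+  //   %vec = shufflevector <4 x float> %real, <4 x float> %imag,
+  //          <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  // i.e. a mask of the form <0, n, 1, n+1, ...> for operands of length n.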
+ if (!isInterleavingMask(SVI->getShuffleMask())) + return nullptr; + + Instruction *Real; + Instruction *Imag; + if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) + return nullptr; + + return identifyNode(Real, Imag); } -bool ComplexDeinterleavingCompositeNode::hasAllInternalUses( - SmallPtrSet<Instruction *, 16> &AllInstructions) { - if (Operation == ComplexDeinterleavingOperation::Shuffle) +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, + Instruction *Imag) { + Instruction *I = nullptr; + Value *FinalValue = nullptr; + if (match(Real, m_ExtractValue<0>(m_Instruction(I))) && + match(Imag, m_ExtractValue<1>(m_Specific(I))) && + match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>( + m_Value(FinalValue)))) { + NodePtr PlaceholderNode = prepareCompositeNode( + llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag); + PlaceholderNode->ReplacementNode = FinalValue; + FinalInstructions.insert(Real); + FinalInstructions.insert(Imag); + return submitCompositeNode(PlaceholderNode); + } + + auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real); + auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag); + if (!RealShuffle || !ImagShuffle) { + if (RealShuffle || ImagShuffle) + LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n"); + return nullptr; + } + + Value *RealOp1 = RealShuffle->getOperand(1); + if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) { + LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); + return nullptr; + } + Value *ImagOp1 = ImagShuffle->getOperand(1); + if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) { + LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); + return nullptr; + } + + Value *RealOp0 = RealShuffle->getOperand(0); + Value *ImagOp0 = ImagShuffle->getOperand(0); + + if (RealOp0 != ImagOp0) { + LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); + return nullptr; + } + + ArrayRef<int> RealMask = RealShuffle->getShuffleMask(); + ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask(); + if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { + LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); + return nullptr; + } + + if (RealMask[0] != 0 || ImagMask[0] != 1) { + LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); + return nullptr; + } + + // Type checking, the shuffle type should be a vector type of the same + // scalar type, but half the size + auto CheckType = [&](ShuffleVectorInst *Shuffle) { + Value *Op = Shuffle->getOperand(0); + auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType()); + auto *OpTy = cast<FixedVectorType>(Op->getType()); + + if (OpTy->getScalarType() != ShuffleTy->getScalarType()) + return false; + if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) + return false; + return true; + }; - for (auto *User : Real->users()) { - if (!AllInstructions.contains(cast<Instruction>(User))) + auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { + if (!CheckType(Shuffle)) return false; + + ArrayRef<int> Mask = Shuffle->getShuffleMask(); + int Last = *Mask.rbegin(); + + Value *Op = Shuffle->getOperand(0); + auto *OpTy = cast<FixedVectorType>(Op->getType()); + int NumElements = OpTy->getNumElements(); + + // Ensure that the deinterleaving shuffle only pulls from the first + // shuffle operand. 
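+    // E.g. a deinterleaving mask <0, 2, 4, 6> over a <8 x float> operand has
+    // Last == 6 < NumElements == 8 and is accepted; any index of 8 or above
+    // would pull from the second shuffle operand and is rejected below.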
+ return Last < NumElements; + }; + + if (RealShuffle->getType() != ImagShuffle->getType()) { + LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); + return nullptr; } - for (auto *User : Imag->users()) { - if (!AllInstructions.contains(cast<Instruction>(User))) - return false; + if (!CheckDeinterleavingShuffle(RealShuffle)) { + LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); + return nullptr; } - for (auto *I : InternalInstructions) { - for (auto *User : I->users()) { - if (!AllInstructions.contains(cast<Instruction>(User))) + if (!CheckDeinterleavingShuffle(ImagShuffle)) { + LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); + return nullptr; + } + + NodePtr PlaceholderNode = + prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Deinterleave, + RealShuffle, ImagShuffle); + PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); + FinalInstructions.insert(RealShuffle); + FinalInstructions.insert(ImagShuffle); + return submitCompositeNode(PlaceholderNode); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) { + auto IsSplat = [](Value *V) -> bool { + // Fixed-width vector with constants + if (isa<ConstantDataVector>(V)) + return true; + + VectorType *VTy; + ArrayRef<int> Mask; + // Splats are represented differently depending on whether the repeated + // value is a constant or an Instruction + if (auto *Const = dyn_cast<ConstantExpr>(V)) { + if (Const->getOpcode() != Instruction::ShuffleVector) return false; + VTy = cast<VectorType>(Const->getType()); + Mask = Const->getShuffleMask(); + } else if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) { + VTy = Shuf->getType(); + Mask = Shuf->getShuffleMask(); + } else { + return false; } + + // When the data type is <1 x Type>, it's not possible to differentiate + // between the ComplexDeinterleaving::Deinterleave and + // ComplexDeinterleaving::Splat operations. 
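// The splat test being built up here, reduced to a standalone toy: every
// mask element must broadcast lane 0, and single-lane fixed vectors are
// rejected because of the <1 x Type> ambiguity just described. Names are
// invented for illustration.
#include <algorithm>
#include <cassert>
#include <vector>

static bool looksLikeSplatMask(const std::vector<int> &Mask, bool IsScalable) {
  // A fixed <1 x T> "splat" is indistinguishable from a deinterleave.
  if (!IsScalable && Mask.size() == 1)
    return false;
  return !Mask.empty() &&
         std::all_of(Mask.begin(), Mask.end(), [](int M) { return M == 0; });
}

int main() {
  assert(looksLikeSplatMask({0, 0, 0, 0}, /*IsScalable=*/false));
  assert(!looksLikeSplatMask({0}, /*IsScalable=*/false)); // ambiguous <1 x T>
  assert(!looksLikeSplatMask({0, 1, 0, 1}, /*IsScalable=*/false));
  return 0;
}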
+ if (!VTy->isScalableTy() && VTy->getElementCount().getKnownMinValue() == 1) + return false; + + return all_equal(Mask) && Mask[0] == 0; + }; + + if (!IsSplat(R) || !IsSplat(I)) + return nullptr; + + auto *Real = dyn_cast<Instruction>(R); + auto *Imag = dyn_cast<Instruction>(I); + if ((!Real && Imag) || (Real && !Imag)) + return nullptr; + + if (Real && Imag) { + // Non-constant splats should be in the same basic block + if (Real->getParent() != Imag->getParent()) + return nullptr; + + FinalInstructions.insert(Real); + FinalInstructions.insert(Imag); } - return true; + NodePtr PlaceholderNode = + prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I); + return submitCompositeNode(PlaceholderNode); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, + Instruction *Imag) { + if (Real != RealPHI || Imag != ImagPHI) + return nullptr; + + PHIsFound = true; + NodePtr PlaceholderNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionPHI, Real, Imag); + return submitCompositeNode(PlaceholderNode); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifySelectNode(Instruction *Real, + Instruction *Imag) { + auto *SelectReal = dyn_cast<SelectInst>(Real); + auto *SelectImag = dyn_cast<SelectInst>(Imag); + if (!SelectReal || !SelectImag) + return nullptr; + + Instruction *MaskA, *MaskB; + Instruction *AR, *AI, *RA, *BI; + if (!match(Real, m_Select(m_Instruction(MaskA), m_Instruction(AR), + m_Instruction(RA))) || + !match(Imag, m_Select(m_Instruction(MaskB), m_Instruction(AI), + m_Instruction(BI)))) + return nullptr; + + if (MaskA != MaskB && !MaskA->isIdenticalTo(MaskB)) + return nullptr; + + if (!MaskA->getType()->isVectorTy()) + return nullptr; + + auto NodeA = identifyNode(AR, AI); + if (!NodeA) + return nullptr; + + auto NodeB = identifyNode(RA, BI); + if (!NodeB) + return nullptr; + + NodePtr PlaceholderNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionSelect, Real, Imag); + PlaceholderNode->addOperand(NodeA); + PlaceholderNode->addOperand(NodeB); + FinalInstructions.insert(MaskA); + FinalInstructions.insert(MaskB); + return submitCompositeNode(PlaceholderNode); +} + +static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode, + std::optional<FastMathFlags> Flags, + Value *InputA, Value *InputB) { + Value *I; + switch (Opcode) { + case Instruction::FNeg: + I = B.CreateFNeg(InputA); + break; + case Instruction::FAdd: + I = B.CreateFAdd(InputA, InputB); + break; + case Instruction::Add: + I = B.CreateAdd(InputA, InputB); + break; + case Instruction::FSub: + I = B.CreateFSub(InputA, InputB); + break; + case Instruction::Sub: + I = B.CreateSub(InputA, InputB); + break; + case Instruction::FMul: + I = B.CreateFMul(InputA, InputB); + break; + case Instruction::Mul: + I = B.CreateMul(InputA, InputB); + break; + default: + llvm_unreachable("Incorrect symmetric opcode"); + } + if (Flags) + cast<Instruction>(I)->setFastMathFlags(*Flags); + return I; +} + +Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, + RawNodePtr Node) { + if (Node->ReplacementNode) + return Node->ReplacementNode; + + auto ReplaceOperandIfExist = [&](RawNodePtr &Node, unsigned Idx) -> Value * { + return Node->Operands.size() > Idx + ? 
replaceNode(Builder, Node->Operands[Idx]) + : nullptr; + }; + + Value *ReplacementNode; + switch (Node->Operation) { + case ComplexDeinterleavingOperation::CAdd: + case ComplexDeinterleavingOperation::CMulPartial: + case ComplexDeinterleavingOperation::Symmetric: { + Value *Input0 = ReplaceOperandIfExist(Node, 0); + Value *Input1 = ReplaceOperandIfExist(Node, 1); + Value *Accumulator = ReplaceOperandIfExist(Node, 2); + assert(!Input1 || (Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type")); + assert(!Accumulator || + (Input0->getType() == Accumulator->getType() && + "Accumulator and input need to be of the same type")); + if (Node->Operation == ComplexDeinterleavingOperation::Symmetric) + ReplacementNode = replaceSymmetricNode(Builder, Node->Opcode, Node->Flags, + Input0, Input1); + else + ReplacementNode = TL->createComplexDeinterleavingIR( + Builder, Node->Operation, Node->Rotation, Input0, Input1, + Accumulator); + break; + } + case ComplexDeinterleavingOperation::Deinterleave: + llvm_unreachable("Deinterleave node should already have ReplacementNode"); + break; + case ComplexDeinterleavingOperation::Splat: { + auto *NewTy = VectorType::getDoubleElementsVectorType( + cast<VectorType>(Node->Real->getType())); + auto *R = dyn_cast<Instruction>(Node->Real); + auto *I = dyn_cast<Instruction>(Node->Imag); + if (R && I) { + // Splats that are not constant are interleaved where they are located + Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode(); + IRBuilder<> IRB(InsertPoint); + ReplacementNode = + IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy, + {Node->Real, Node->Imag}); + } else { + ReplacementNode = + Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, + NewTy, {Node->Real, Node->Imag}); + } + break; + } + case ComplexDeinterleavingOperation::ReductionPHI: { + // If Operation is ReductionPHI, a new empty PHINode is created. + // It is filled later when the ReductionOperation is processed. 
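// The ReductionPHI case below is a two-phase protocol: phase 1 creates an
// empty PHI of the doubled vector type and records the old-to-new mapping;
// phase 2 (processReductionOperation, further down) wires in the incoming
// values once the rewritten reduction exists. The same shape in toy C++,
// with hypothetical types:
#include <cassert>
#include <map>
#include <utility>
#include <vector>

struct Phi {
  std::vector<std::pair<const char *, int>> Incoming; // (block, value)
};

int main() {
  std::map<int, Phi> OldToNewPHI;
  OldToNewPHI[7] = Phi{}; // phase 1: empty placeholder, mapping recorded
  // ... the loop body is rewritten here, producing NewLoopVal ...
  int InitVal = 0, NewLoopVal = 42;
  Phi &P = OldToNewPHI[7]; // phase 2: fill in both incoming edges
  P.Incoming.push_back({"preheader", InitVal});
  P.Incoming.push_back({"backedge", NewLoopVal});
  assert(P.Incoming.size() == 2);
  return 0;
}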
+ auto *VTy = cast<VectorType>(Node->Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHI()); + OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI; + ReplacementNode = NewPHI; + break; + } + case ComplexDeinterleavingOperation::ReductionOperation: + ReplacementNode = replaceNode(Builder, Node->Operands[0]); + processReductionOperation(ReplacementNode, Node); + break; + case ComplexDeinterleavingOperation::ReductionSelect: { + auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0); + auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0); + auto *A = replaceNode(Builder, Node->Operands[0]); + auto *B = replaceNode(Builder, Node->Operands[1]); + auto *NewMaskTy = VectorType::getDoubleElementsVectorType( + cast<VectorType>(MaskReal->getType())); + auto *NewMask = + Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, + NewMaskTy, {MaskReal, MaskImag}); + ReplacementNode = Builder.CreateSelect(NewMask, A, B); + break; + } + } + + assert(ReplacementNode && "Target failed to create Intrinsic call."); + NumComplexTransformations += 1; + Node->ReplacementNode = ReplacementNode; + return ReplacementNode; +} + +void ComplexDeinterleavingGraph::processReductionOperation( + Value *OperationReplacement, RawNodePtr Node) { + auto *Real = cast<Instruction>(Node->Real); + auto *Imag = cast<Instruction>(Node->Imag); + auto *OldPHIReal = ReductionInfo[Real].first; + auto *OldPHIImag = ReductionInfo[Imag].first; + auto *NewPHI = OldToNewPHI[OldPHIReal]; + + auto *VTy = cast<VectorType>(Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + // We have to interleave initial origin values coming from IncomingBlock + Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); + Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + auto *NewInit = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(OperationReplacement, BackEdge); + + // Deinterleave complex vector outside of loop so that it can be finally + // reduced + auto *FinalReductionReal = ReductionInfo[Real].second; + auto *FinalReductionImag = ReductionInfo[Imag].second; + + Builder.SetInsertPoint( + &*FinalReductionReal->getParent()->getFirstInsertionPt()); + auto *Deinterleave = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_deinterleave2, + OperationReplacement->getType(), OperationReplacement); + + auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0); + FinalReductionReal->replaceUsesOfWith(Real, NewReal); + + Builder.SetInsertPoint(FinalReductionImag); + auto *NewImag = Builder.CreateExtractValue(Deinterleave, 1); + FinalReductionImag->replaceUsesOfWith(Imag, NewImag); +} + +void ComplexDeinterleavingGraph::replaceNodes() { + SmallVector<Instruction *, 16> DeadInstrRoots; + for (auto *RootInstruction : OrderedRoots) { + // Check if this potential root went through check process and we can + // deinterleave it + if (!RootToNode.count(RootInstruction)) + continue; + + IRBuilder<> Builder(RootInstruction); + auto RootNode = RootToNode[RootInstruction]; + Value *R = replaceNode(Builder, RootNode.get()); + + if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionOperation) { + auto *RootReal = cast<Instruction>(RootNode->Real); + auto *RootImag = 
cast<Instruction>(RootNode->Imag); + ReductionInfo[RootReal].first->removeIncomingValue(BackEdge); + ReductionInfo[RootImag].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(cast<Instruction>(RootReal)); + DeadInstrRoots.push_back(cast<Instruction>(RootImag)); + } else { + assert(R && "Unable to find replacement for RootInstruction"); + DeadInstrRoots.push_back(RootInstruction); + RootInstruction->replaceAllUsesWith(R); + } + } + + for (auto *I : DeadInstrRoots) + RecursivelyDeleteTriviallyDeadInstructions(I, TLI); } diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index eb2d449bc4af..106db7c51f27 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -49,7 +49,7 @@ CriticalAntiDepBreaker::~CriticalAntiDepBreaker() = default; void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { const unsigned BBSize = BB->size(); - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { + for (unsigned i = 1, e = TRI->getNumRegs(); i != e; ++i) { // Clear out the register class data. Classes[i] = nullptr; @@ -111,7 +111,7 @@ void CriticalAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count, return; assert(Count < InsertPosIndex && "Instruction index out of expected range!"); - for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) { + for (unsigned Reg = 1; Reg != TRI->getNumRegs(); ++Reg) { if (KillIndices[Reg] != ~0u) { // If Reg is currently live, then mark that it can't be renamed as // we don't know the extent of its live-range anymore (now that it @@ -213,9 +213,8 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { if (MO.isUse() && Special) { if (!KeepRegs.test(Reg)) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - KeepRegs.set(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + KeepRegs.set(SubReg); } } } @@ -238,13 +237,11 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { // itself can't be changed. if (MI.isRegTiedToUseOperand(I) && Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) { - KeepRegs.set(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) { + KeepRegs.set(SubReg); } - for (MCSuperRegIterator SuperRegs(Reg, TRI); - SuperRegs.isValid(); ++SuperRegs) { - KeepRegs.set(*SuperRegs); + for (MCPhysReg SuperReg : TRI->superregs(Reg)) { + KeepRegs.set(SuperReg); } } } @@ -264,14 +261,11 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { if (MO.isRegMask()) { auto ClobbersPhysRegAndSubRegs = [&](unsigned PhysReg) { - for (MCSubRegIterator SRI(PhysReg, TRI, true); SRI.isValid(); ++SRI) - if (!MO.clobbersPhysReg(*SRI)) - return false; - - return true; + return all_of(TRI->subregs_inclusive(PhysReg), + [&](MCPhysReg SR) { return MO.clobbersPhysReg(SR); }); }; - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { + for (unsigned i = 1, e = TRI->getNumRegs(); i != e; ++i) { if (ClobbersPhysRegAndSubRegs(i)) { DefIndices[i] = Count; KillIndices[i] = ~0u; @@ -297,8 +291,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { // For the reg itself and all subregs: update the def to current; // reset the kill state, any restrictions, and references. 
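// The hunks that follow apply one mechanical refactor: explicit
// MCSubRegIterator/MCSuperRegIterator loops become range-based for loops over
// TRI->subregs_inclusive() and TRI->superregs(). A toy version of the same
// iterator-to-range change, independent of LLVM's types:
#include <cassert>
#include <vector>

struct SubRegRange { // stand-in for the iterator_range the TRI API returns
  const std::vector<unsigned> *Regs;
  const unsigned *begin() const { return Regs->data(); }
  const unsigned *end() const { return Regs->data() + Regs->size(); }
};

int main() {
  std::vector<unsigned> Subs = {1, 2, 3};
  SubRegRange R{&Subs};
  unsigned SumOld = 0, SumNew = 0;
  for (const unsigned *I = R.begin(); I != R.end(); ++I)
    SumOld += *I; // old style: manual iterator stepping
  for (unsigned SubReg : R)
    SumNew += SubReg; // new style: range-based for, as in the diff
  assert(SumOld == SumNew);
  return 0;
}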
- for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI) { - unsigned SubregReg = *SRI; + for (MCPhysReg SubregReg : TRI->subregs_inclusive(Reg)) { DefIndices[SubregReg] = Count; KillIndices[SubregReg] = ~0u; Classes[SubregReg] = nullptr; @@ -307,8 +300,8 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { KeepRegs.reset(SubregReg); } // Conservatively mark super-registers as unusable. - for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) - Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1); + for (MCPhysReg SR : TRI->superregs(Reg)) + Classes[SR] = reinterpret_cast<TargetRegisterClass *>(-1); } } for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { @@ -470,7 +463,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits, LLVM_DEBUG(dbgs() << "Critical path has total latency " << (Max->getDepth() + Max->Latency) << "\n"); LLVM_DEBUG(dbgs() << "Available regs:"); - for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) { + for (unsigned Reg = 1; Reg < TRI->getNumRegs(); ++Reg) { if (KillIndices[Reg] == ~0u) LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI)); } diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp index 34fb1d286a58..48bb4a07662e 100644 --- a/llvm/lib/CodeGen/DFAPacketizer.cpp +++ b/llvm/lib/CodeGen/DFAPacketizer.cpp @@ -29,8 +29,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/CodeGen/ScheduleDAGMutation.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCInstrDesc.h" @@ -98,34 +96,6 @@ unsigned DFAPacketizer::getUsedResources(unsigned InstIdx) { return RS[InstIdx] ^ RS[InstIdx - 1]; } -namespace llvm { - -// This class extends ScheduleDAGInstrs and overrides the schedule method -// to build the dependence graph. -class DefaultVLIWScheduler : public ScheduleDAGInstrs { -private: - AAResults *AA; - /// Ordered list of DAG postprocessing steps. - std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations; - -public: - DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, - AAResults *AA); - - // Actual scheduling work. - void schedule() override; - - /// DefaultVLIWScheduler takes ownership of the Mutation object. - void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) { - Mutations.push_back(std::move(Mutation)); - } - -protected: - void postprocessDAG(); -}; - -} // end namespace llvm - DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI, AAResults *AA) @@ -134,7 +104,7 @@ DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF, } /// Apply each ScheduleDAGMutation step in order. -void DefaultVLIWScheduler::postprocessDAG() { +void DefaultVLIWScheduler::postProcessDAG() { for (auto &M : Mutations) M->apply(this); } @@ -142,7 +112,7 @@ void DefaultVLIWScheduler::postprocessDAG() { void DefaultVLIWScheduler::schedule() { // Build the scheduling graph. buildSchedGraph(AA); - postprocessDAG(); + postProcessDAG(); } VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf, @@ -264,7 +234,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB, "added to packet\n " << MI); // End the packet if resource is not available, or if the instruction - // shoud not be added to the current packet. + // should not be added to the current packet. 
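// Toy model of the greedy packetizing loop the comment above describes: keep
// filling the current VLIW packet while resources allow, otherwise end it and
// start a new one. Purely illustrative; the real pass consults a DFA, not a
// slot counter.
#include <cassert>
#include <vector>

int main() {
  std::vector<int> InstrCost = {1, 1, 2, 1, 1, 1}; // resource need per instr
  const int SlotsPerPacket = 2;
  int Used = 0, Packets = 1;
  for (int Cost : InstrCost) {
    if (Used + Cost > SlotsPerPacket) { // resource not available:
      ++Packets;                        // end the packet, begin a new one
      Used = 0;
    }
    Used += Cost;
  }
  assert(Packets == 4); // {1,1} {2} {1,1} {1}
  return 0;
}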
endPacket(MBB, MI); } diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index e36db43567c5..6a7de3b241fe 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -31,8 +31,8 @@ namespace { class DeadMachineInstructionElim : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; - const MachineRegisterInfo *MRI; - const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI = nullptr; + const TargetInstrInfo *TII = nullptr; LiveRegUnits LivePhysRegs; public: @@ -75,27 +75,25 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { return false; // Examine each operand. - for (const MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.isDef()) { - Register Reg = MO.getReg(); - if (Reg.isPhysical()) { - // Don't delete live physreg defs, or any reserved register defs. - if (!LivePhysRegs.available(Reg) || MRI->isReserved(Reg)) - return false; - } else { - if (MO.isDead()) { + for (const MachineOperand &MO : MI->all_defs()) { + Register Reg = MO.getReg(); + if (Reg.isPhysical()) { + // Don't delete live physreg defs, or any reserved register defs. + if (!LivePhysRegs.available(Reg) || MRI->isReserved(Reg)) + return false; + } else { + if (MO.isDead()) { #ifndef NDEBUG - // Basic check on the register. All of them should be 'undef'. - for (auto &U : MRI->use_nodbg_operands(Reg)) - assert(U.isUndef() && "'Undef' use on a 'dead' register is found!"); + // Basic check on the register. All of them should be 'undef'. + for (auto &U : MRI->use_nodbg_operands(Reg)) + assert(U.isUndef() && "'Undef' use on a 'dead' register is found!"); #endif - continue; - } - for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) { - if (&Use != MI) - // This def has a non-debug use. Don't delete the instruction! - return false; - } + continue; + } + for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) { + if (&Use != MI) + // This def has a non-debug use. Don't delete the instruction! + return false; } } } diff --git a/llvm/lib/CodeGen/DetectDeadLanes.cpp b/llvm/lib/CodeGen/DetectDeadLanes.cpp index bbb89855cfff..86e9f3abe010 100644 --- a/llvm/lib/CodeGen/DetectDeadLanes.cpp +++ b/llvm/lib/CodeGen/DetectDeadLanes.cpp @@ -25,7 +25,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/DetectDeadLanes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -33,98 +33,19 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <deque> using namespace llvm; #define DEBUG_TYPE "detect-dead-lanes" -namespace { - -/// Contains a bitmask of which lanes of a given virtual register are -/// defined and which ones are actually used. -struct VRegInfo { - LaneBitmask UsedLanes; - LaneBitmask DefinedLanes; -}; - -class DetectDeadLanes : public MachineFunctionPass { -public: - bool runOnMachineFunction(MachineFunction &MF) override; - - static char ID; - DetectDeadLanes() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { return "Detect Dead Lanes"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - /// Add used lane bits on the register used by operand \p MO. 
This translates - /// the bitmask based on the operands subregister, and puts the register into - /// the worklist if any new bits were added. - void addUsedLanesOnOperand(const MachineOperand &MO, LaneBitmask UsedLanes); - - /// Given a bitmask \p UsedLanes for the used lanes on a def output of a - /// COPY-like instruction determine the lanes used on the use operands - /// and call addUsedLanesOnOperand() for them. - void transferUsedLanesStep(const MachineInstr &MI, LaneBitmask UsedLanes); - - /// Given a use regiser operand \p Use and a mask of defined lanes, check - /// if the operand belongs to a lowersToCopies() instruction, transfer the - /// mask to the def and put the instruction into the worklist. - void transferDefinedLanesStep(const MachineOperand &Use, - LaneBitmask DefinedLanes); - - /// Given a mask \p DefinedLanes of lanes defined at operand \p OpNum - /// of COPY-like instruction, determine which lanes are defined at the output - /// operand \p Def. - LaneBitmask transferDefinedLanes(const MachineOperand &Def, unsigned OpNum, - LaneBitmask DefinedLanes) const; - - /// Given a mask \p UsedLanes used from the output of instruction \p MI - /// determine which lanes are used from operand \p MO of this instruction. - LaneBitmask transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes, - const MachineOperand &MO) const; - - std::pair<bool, bool> runOnce(MachineFunction &MF); - - LaneBitmask determineInitialDefinedLanes(unsigned Reg); - LaneBitmask determineInitialUsedLanes(unsigned Reg); - - bool isUndefRegAtInput(const MachineOperand &MO, - const VRegInfo &RegInfo) const; - - bool isUndefInput(const MachineOperand &MO, bool *CrossCopy) const; - - const MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; - - void PutInWorklist(unsigned RegIdx) { - if (WorklistMembers.test(RegIdx)) - return; - WorklistMembers.set(RegIdx); - Worklist.push_back(RegIdx); - } - - VRegInfo *VRegInfos; - /// Worklist containing virtreg indexes. - std::deque<unsigned> Worklist; - BitVector WorklistMembers; - /// This bitvector is set for each vreg index where the vreg is defined - /// by an instruction where lowersToCopies()==true. - BitVector DefinedByCopy; -}; - -} // end anonymous namespace - -char DetectDeadLanes::ID = 0; -char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; - -INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) +DeadLaneDetector::DeadLaneDetector(const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI) + : MRI(MRI), TRI(TRI) { + unsigned NumVirtRegs = MRI->getNumVirtRegs(); + VRegInfos = std::unique_ptr<VRegInfo[]>(new VRegInfo[NumVirtRegs]); + WorklistMembers.resize(NumVirtRegs); + DefinedByCopy.resize(NumVirtRegs); +} /// Returns true if \p MI will get lowered to a series of COPY instructions. /// We call this a COPY-like instruction. 
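// The DetectDeadLanes change whose pieces appear above and below pulls the
// analysis out of the pass into a reusable DeadLaneDetector object (note the
// unique_ptr<VRegInfo[]> replacing a bare new[]/delete[] pair, and runOnce()
// being split up). A compact sketch of that extract-an-analysis-class
// refactor, with toy names:
#include <cassert>
#include <memory>

class LaneAnalysis { // was: state embedded directly in the pass
public:
  explicit LaneAnalysis(unsigned NumVRegs)
      : Info(std::make_unique<int[]>(NumVRegs)), Num(NumVRegs) {}
  void compute() { // was: the analysis half of runOnce()
    for (unsigned I = 0; I < Num; ++I)
      Info[I] = static_cast<int>(I) % 2; // placeholder "lane info"
  }
  int getInfo(unsigned Idx) const { return Info[Idx]; }

private:
  std::unique_ptr<int[]> Info; // RAII: no manual delete[] in the pass
  unsigned Num;
};

int main() {
  LaneAnalysis A(4); // the pass now just instantiates and queries it
  A.compute();
  assert(A.getInfo(3) == 1);
  return 0;
}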
@@ -159,11 +80,11 @@ static bool isCrossCopy(const MachineRegisterInfo &MRI, unsigned DstSubIdx = 0; switch (MI.getOpcode()) { case TargetOpcode::INSERT_SUBREG: - if (MI.getOperandNo(&MO) == 2) + if (MO.getOperandNo() == 2) DstSubIdx = MI.getOperand(3).getImm(); break; case TargetOpcode::REG_SEQUENCE: { - unsigned OpNum = MI.getOperandNo(&MO); + unsigned OpNum = MO.getOperandNo(); DstSubIdx = MI.getOperand(OpNum+1).getImm(); break; } @@ -184,8 +105,8 @@ static bool isCrossCopy(const MachineRegisterInfo &MRI, return !TRI.getCommonSubClass(SrcRC, DstRC); } -void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, - LaneBitmask UsedLanes) { +void DeadLaneDetector::addUsedLanesOnOperand(const MachineOperand &MO, + LaneBitmask UsedLanes) { if (!MO.readsReg()) return; Register MOReg = MO.getReg(); @@ -198,7 +119,7 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg); unsigned MORegIdx = Register::virtReg2Index(MOReg); - VRegInfo &MORegInfo = VRegInfos[MORegIdx]; + DeadLaneDetector::VRegInfo &MORegInfo = VRegInfos[MORegIdx]; LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes; // Any change at all? if ((UsedLanes & ~PrevUsedLanes).none()) @@ -210,8 +131,8 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO, PutInWorklist(MORegIdx); } -void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI, - LaneBitmask UsedLanes) { +void DeadLaneDetector::transferUsedLanesStep(const MachineInstr &MI, + LaneBitmask UsedLanes) { for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.getReg().isVirtual()) continue; @@ -220,10 +141,11 @@ void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI, } } -LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI, - LaneBitmask UsedLanes, - const MachineOperand &MO) const { - unsigned OpNum = MI.getOperandNo(&MO); +LaneBitmask +DeadLaneDetector::transferUsedLanes(const MachineInstr &MI, + LaneBitmask UsedLanes, + const MachineOperand &MO) const { + unsigned OpNum = MO.getOperandNo(); assert(lowersToCopies(MI) && DefinedByCopy[Register::virtReg2Index(MI.getOperand(0).getReg())]); @@ -265,8 +187,8 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI, } } -void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use, - LaneBitmask DefinedLanes) { +void DeadLaneDetector::transferDefinedLanesStep(const MachineOperand &Use, + LaneBitmask DefinedLanes) { if (!Use.readsReg()) return; // Check whether the operand writes a vreg and is part of a COPY-like @@ -286,7 +208,7 @@ void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use, if (!DefinedByCopy.test(DefRegIdx)) return; - unsigned OpNum = MI.getOperandNo(&Use); + unsigned OpNum = Use.getOperandNo(); DefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(Use.getSubReg(), DefinedLanes); DefinedLanes = transferDefinedLanes(Def, OpNum, DefinedLanes); @@ -301,8 +223,8 @@ void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use, PutInWorklist(DefRegIdx); } -LaneBitmask DetectDeadLanes::transferDefinedLanes(const MachineOperand &Def, - unsigned OpNum, LaneBitmask DefinedLanes) const { +LaneBitmask DeadLaneDetector::transferDefinedLanes( + const MachineOperand &Def, unsigned OpNum, LaneBitmask DefinedLanes) const { const MachineInstr &MI = *Def.getParent(); // Translate DefinedLanes if necessary. 
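// The used/defined-lane transfer around here is a fixed-point dataflow over
// bitmasks: a register re-enters the worklist only when a merge actually adds
// bits, which is what the "(UsedLanes & ~PrevUsedLanes).none()" early-out
// above checks. Minimal standalone version of that change-detection idiom:
#include <cassert>
#include <cstdint>
#include <deque>

int main() {
  uint64_t UsedLanes[2] = {0b0011, 0b0000};
  std::deque<unsigned> Worklist = {0};
  auto AddLanes = [&](unsigned Reg, uint64_t NewLanes) {
    if ((NewLanes & ~UsedLanes[Reg]) == 0)
      return; // no new bits: do not requeue
    UsedLanes[Reg] |= NewLanes;
    Worklist.push_back(Reg); // reprocess only on a real change
  };
  AddLanes(1, 0b0001); // adds a bit, enqueues reg 1
  AddLanes(1, 0b0001); // idempotent, no enqueue
  assert(UsedLanes[1] == 0b0001 && Worklist.size() == 2);
  return 0;
}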
switch (MI.getOpcode()) {
@@ -343,7 +265,7 @@ LaneBitmask DetectDeadLanes::transferDefinedLanes(const MachineOperand &Def,
 return DefinedLanes;
 }
-LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
+LaneBitmask DeadLaneDetector::determineInitialDefinedLanes(unsigned Reg) {
 // Live-In or unused registers have no definition but are considered fully
 // defined.
 if (!MRI->hasOneDef(Reg))
@@ -395,7 +317,7 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
 MOSubReg, MODefinedLanes);
 }
-    unsigned OpNum = DefMI.getOperandNo(&MO);
+    unsigned OpNum = MO.getOperandNo();
 DefinedLanes |= transferDefinedLanes(Def, OpNum, MODefinedLanes);
 }
 return DefinedLanes;
@@ -408,7 +330,7 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
 return MRI->getMaxLaneMaskForVReg(Reg);
 }
-LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
+LaneBitmask DeadLaneDetector::determineInitialUsedLanes(unsigned Reg) {
 LaneBitmask UsedLanes = LaneBitmask::getNone();
 for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
 if (!MO.readsReg())
@@ -449,14 +371,58 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
 return UsedLanes;
 }
-bool DetectDeadLanes::isUndefRegAtInput(const MachineOperand &MO,
-                                        const VRegInfo &RegInfo) const {
+namespace {
+
+class DetectDeadLanes : public MachineFunctionPass {
+public:
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  static char ID;
+  DetectDeadLanes() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Detect Dead Lanes"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  /// Update the operand status.
+  /// The first return value shows whether MF has been changed.
+  /// The second return value indicates we need to call
+  /// DeadLaneDetector::computeSubRegisterLaneBitInfo and this function again
+  /// to propagate changes.
+ std::pair<bool, bool> + modifySubRegisterOperandStatus(const DeadLaneDetector &DLD, + MachineFunction &MF); + + bool isUndefRegAtInput(const MachineOperand &MO, + const DeadLaneDetector::VRegInfo &RegInfo) const; + + bool isUndefInput(const DeadLaneDetector &DLD, const MachineOperand &MO, + bool *CrossCopy) const; + + const MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; +}; + +} // end anonymous namespace + +char DetectDeadLanes::ID = 0; +char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; + +INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) + +bool DetectDeadLanes::isUndefRegAtInput( + const MachineOperand &MO, const DeadLaneDetector::VRegInfo &RegInfo) const { unsigned SubReg = MO.getSubReg(); LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); return (RegInfo.DefinedLanes & RegInfo.UsedLanes & Mask).none(); } -bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, +bool DetectDeadLanes::isUndefInput(const DeadLaneDetector &DLD, + const MachineOperand &MO, bool *CrossCopy) const { if (!MO.isUse()) return false; @@ -468,11 +434,11 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, if (!DefReg.isVirtual()) return false; unsigned DefRegIdx = Register::virtReg2Index(DefReg); - if (!DefinedByCopy.test(DefRegIdx)) + if (!DLD.isDefinedByCopy(DefRegIdx)) return false; - const VRegInfo &DefRegInfo = VRegInfos[DefRegIdx]; - LaneBitmask UsedLanes = transferUsedLanes(MI, DefRegInfo.UsedLanes, MO); + const DeadLaneDetector::VRegInfo &DefRegInfo = DLD.getVRegInfo(DefRegIdx); + LaneBitmask UsedLanes = DLD.transferUsedLanes(MI, DefRegInfo.UsedLanes, MO); if (UsedLanes.any()) return false; @@ -484,7 +450,7 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO, return true; } -std::pair<bool, bool> DetectDeadLanes::runOnce(MachineFunction &MF) { +void DeadLaneDetector::computeSubRegisterLaneBitInfo() { // First pass: Populate defs/uses of vregs with initial values unsigned NumVirtRegs = MRI->getNumVirtRegs(); for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) { @@ -524,7 +490,11 @@ std::pair<bool, bool> DetectDeadLanes::runOnce(MachineFunction &MF) { } dbgs() << "\n"; }); +} +std::pair<bool, bool> +DetectDeadLanes::modifySubRegisterOperandStatus(const DeadLaneDetector &DLD, + MachineFunction &MF) { bool Changed = false; bool Again = false; // Mark operands as dead/unused. 
@@ -537,7 +507,7 @@ std::pair<bool, bool> DetectDeadLanes::runOnce(MachineFunction &MF) { if (!Reg.isVirtual()) continue; unsigned RegIdx = Register::virtReg2Index(Reg); - const VRegInfo &RegInfo = VRegInfos[RegIdx]; + const DeadLaneDetector::VRegInfo &RegInfo = DLD.getVRegInfo(RegIdx); if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes.none()) { LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI); @@ -551,7 +521,7 @@ std::pair<bool, bool> DetectDeadLanes::runOnce(MachineFunction &MF) { << "Marking operand '" << MO << "' as undef in " << MI); MO.setIsUndef(); Changed = true; - } else if (isUndefInput(MO, &CrossCopy)) { + } else if (isUndefInput(DLD, MO, &CrossCopy)) { LLVM_DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in " << MI); MO.setIsUndef(); @@ -581,21 +551,16 @@ bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) { TRI = MRI->getTargetRegisterInfo(); - unsigned NumVirtRegs = MRI->getNumVirtRegs(); - VRegInfos = new VRegInfo[NumVirtRegs]; - WorklistMembers.resize(NumVirtRegs); - DefinedByCopy.resize(NumVirtRegs); + DeadLaneDetector DLD(MRI, TRI); bool Changed = false; bool Again; do { + DLD.computeSubRegisterLaneBitInfo(); bool LocalChanged; - std::tie(LocalChanged, Again) = runOnce(MF); + std::tie(LocalChanged, Again) = modifySubRegisterOperandStatus(DLD, MF); Changed |= LocalChanged; - } while(Again); + } while (Again); - DefinedByCopy.clear(); - WorklistMembers.clear(); - delete[] VRegInfos; return Changed; } diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp index aa81f618dc59..32c94de7280c 100644 --- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp +++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp @@ -14,10 +14,8 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/Triple.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -28,6 +26,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -36,6 +35,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/Local.h" #include <cstddef> diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 00626604d81c..61867d74bfa2 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -119,10 +119,10 @@ public: SmallVector<PHIInfo, 8> PHIs; -private: /// The branch condition determined by analyzeBranch. SmallVector<MachineOperand, 4> Cond; +private: /// Instructions in Head that define values used by the conditional blocks. /// The hoisted instructions must be inserted after these instructions. SmallPtrSet<MachineInstr*, 8> InsertAfter; @@ -263,9 +263,8 @@ bool SSAIfConv::InstrDependenciesAllowIfConv(MachineInstr *I) { // Remember clobbered regunits. 
if (MO.isDef() && Reg.isPhysical())
-      for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
-           ++Units)
-        ClobberedRegUnits.set(*Units);
+      for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+        ClobberedRegUnits.set(Unit);
 if (!MO.readsReg() || !Reg.isVirtual())
 continue;
@@ -343,8 +342,11 @@ bool SSAIfConv::canPredicateInstrs(MachineBasicBlock *MBB) {
 // Apply predicate to all instructions in the machine block.
 void SSAIfConv::PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate) {
 auto Condition = Cond;
-  if (ReversePredicate)
-    TII->reverseBranchCondition(Condition);
+  if (ReversePredicate) {
+    bool CanRevCond = !TII->reverseBranchCondition(Condition);
+    assert(CanRevCond && "Reversed predicate is not supported");
+    (void)CanRevCond;
+  }
 // Terminators don't need to be predicated as they will be removed.
 for (MachineBasicBlock::iterator I = MBB->begin(),
 E = MBB->getFirstTerminator();
@@ -391,19 +393,17 @@ bool SSAIfConv::findInsertionPoint() {
 continue;
 // I clobbers Reg, so it isn't live before I.
 if (MO.isDef())
-        for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
-             ++Units)
-          LiveRegUnits.erase(*Units);
+        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+          LiveRegUnits.erase(Unit);
 // Unless I reads Reg.
 if (MO.readsReg())
 Reads.push_back(Reg.asMCReg());
 }
 // Anything read by I is live before I.
 while (!Reads.empty())
-      for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid();
-           ++Units)
-        if (ClobberedRegUnits.test(*Units))
-          LiveRegUnits.insert(*Units);
+      for (MCRegUnit Unit : TRI->regunits(Reads.pop_back_val()))
+        if (ClobberedRegUnits.test(Unit))
+          LiveRegUnits.insert(Unit);
 // We can't insert before a terminator.
 if (I != FirstTerm && I->isTerminator())
@@ -760,14 +760,14 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
 namespace {
 class EarlyIfConverter : public MachineFunctionPass {
-  const TargetInstrInfo *TII;
-  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 MCSchedModel SchedModel;
-  MachineRegisterInfo *MRI;
-  MachineDominatorTree *DomTree;
-  MachineLoopInfo *Loops;
-  MachineTraceMetrics *Traces;
-  MachineTraceMetrics::Ensemble *MinInstr;
+  MachineRegisterInfo *MRI = nullptr;
+  MachineDominatorTree *DomTree = nullptr;
+  MachineLoopInfo *Loops = nullptr;
+  MachineTraceMetrics *Traces = nullptr;
+  MachineTraceMetrics::Ensemble *MinInstr = nullptr;
 SSAIfConv IfConv;
 public:
@@ -873,8 +873,40 @@ bool EarlyIfConverter::shouldConvertIf() {
 if (Stress)
 return true;
+  // Do not try to if-convert if the condition has a high chance of being
+  // predictable.
+  MachineLoop *CurrentLoop = Loops->getLoopFor(IfConv.Head);
+  // If the condition is in a loop, consider it predictable if the condition
+  // itself or all its operands are loop-invariant. E.g. this considers a load
+  // from a loop-invariant address predictable; we were unable to prove that it
+  // doesn't alias any of the memory-writes in the loop, but it is likely to
+  // read the same value multiple times.
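// Sketch of the predictability heuristic introduced above, in plain C++: a
// branch condition is treated as predictable when the value it reads is
// loop-invariant itself or computed entirely from loop-invariant inputs.
// Toy IR; the real check walks MachineOperands and MachineLoopInfo.
#include <cassert>
#include <vector>

struct Instr {
  bool LoopInvariant;
  std::vector<const Instr *> Ops;
};

static bool isPredictable(const Instr &Cond) {
  if (Cond.LoopInvariant)
    return true;
  for (const Instr *Op : Cond.Ops) // else require all operands invariant
    if (!Op->LoopInvariant)
      return false;
  return !Cond.Ops.empty();
}

int main() {
  Instr A{true, {}}, B{true, {}};
  Instr Load{false, {&A, &B}};   // varying load from an invariant address:
  Instr Phi{false, {&A, &Load}}; // likely rereads the same value
  assert(isPredictable(Load));
  assert(!isPredictable(Phi));
  return 0;
}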
+  if (CurrentLoop && any_of(IfConv.Cond, [&](MachineOperand &MO) {
+        if (!MO.isReg() || !MO.isUse())
+          return false;
+        Register Reg = MO.getReg();
+        if (Register::isPhysicalRegister(Reg))
+          return false;
+
+        MachineInstr *Def = MRI->getVRegDef(Reg);
+        return CurrentLoop->isLoopInvariant(*Def) ||
+               all_of(Def->operands(), [&](MachineOperand &Op) {
+                 if (Op.isImm())
+                   return true;
+                 if (!Op.isReg() || !Op.isUse())
+                   return false;
+                 Register Reg = Op.getReg();
+                 if (Register::isPhysicalRegister(Reg))
+                   return false;
+
+                 MachineInstr *Def = MRI->getVRegDef(Reg);
+                 return CurrentLoop->isLoopInvariant(*Def);
+               });
+      }))
+    return false;
+
 if (!MinInstr)
-    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+    MinInstr = Traces->getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
 MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
 MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
@@ -1084,13 +1116,13 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
 namespace {
 class EarlyIfPredicator : public MachineFunctionPass {
-  const TargetInstrInfo *TII;
-  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 TargetSchedModel SchedModel;
-  MachineRegisterInfo *MRI;
-  MachineDominatorTree *DomTree;
-  MachineBranchProbabilityInfo *MBPI;
-  MachineLoopInfo *Loops;
+  MachineRegisterInfo *MRI = nullptr;
+  MachineDominatorTree *DomTree = nullptr;
+  MachineBranchProbabilityInfo *MBPI = nullptr;
+  MachineLoopInfo *Loops = nullptr;
 SSAIfConv IfConv;
 public:
diff --git a/llvm/lib/CodeGen/ExecutionDomainFix.cpp b/llvm/lib/CodeGen/ExecutionDomainFix.cpp
index 9621ad4b1248..21a7d02a320c 100644
--- a/llvm/lib/CodeGen/ExecutionDomainFix.cpp
+++ b/llvm/lib/CodeGen/ExecutionDomainFix.cpp
@@ -318,7 +318,7 @@ void ExecutionDomainFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
 // If the collapsed operands force a single domain, propagate the collapse.
 if (isPowerOf2_32(available)) {
-    unsigned domain = countTrailingZeros(available);
+    unsigned domain = llvm::countr_zero(available);
 TII->setExecutionDomain(*mi, domain);
 visitHardInstr(mi, domain);
 return;
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 3838eaadd1d2..500f31bd8e89 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -71,18 +71,18 @@ class MemCmpExpansion {
 ResultBlock() = default;
 };
-  CallInst *const CI;
+  CallInst *const CI = nullptr;
 ResultBlock ResBlock;
 const uint64_t Size;
 unsigned MaxLoadSize = 0;
 uint64_t NumLoadsNonOneByte = 0;
 const uint64_t NumLoadsPerBlockForZeroCmp;
 std::vector<BasicBlock *> LoadCmpBlocks;
-  BasicBlock *EndBlock;
-  PHINode *PhiRes;
+  BasicBlock *EndBlock = nullptr;
+  PHINode *PhiRes = nullptr;
 const bool IsUsedForZeroCmp;
 const DataLayout &DL;
-  DomTreeUpdater *DTU;
+  DomTreeUpdater *DTU = nullptr;
 IRBuilder<> Builder;
 // Represents the decomposition in blocks of the expansion.
For example, // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and @@ -288,17 +288,11 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType, Align RhsAlign = RhsSource->getPointerAlignment(DL); if (OffsetBytes > 0) { auto *ByteType = Type::getInt8Ty(CI->getContext()); - LhsSource = Builder.CreateConstGEP1_64( - ByteType, Builder.CreateBitCast(LhsSource, ByteType->getPointerTo()), - OffsetBytes); - RhsSource = Builder.CreateConstGEP1_64( - ByteType, Builder.CreateBitCast(RhsSource, ByteType->getPointerTo()), - OffsetBytes); + LhsSource = Builder.CreateConstGEP1_64(ByteType, LhsSource, OffsetBytes); + RhsSource = Builder.CreateConstGEP1_64(ByteType, RhsSource, OffsetBytes); LhsAlign = commonAlignment(LhsAlign, OffsetBytes); RhsAlign = commonAlignment(RhsAlign, OffsetBytes); } - LhsSource = Builder.CreateBitCast(LhsSource, LoadSizeType->getPointerTo()); - RhsSource = Builder.CreateBitCast(RhsSource, LoadSizeType->getPointerTo()); // Create a constant or a load from the source. Value *Lhs = nullptr; diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp index cc63984158c8..3a79f20f4732 100644 --- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -28,8 +28,8 @@ using namespace llvm; namespace { struct ExpandPostRA : public MachineFunctionPass { private: - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; public: static char ID; // Pass identification, replacement for typeid @@ -47,9 +47,6 @@ public: private: bool LowerSubregToReg(MachineInstr *MI); - bool LowerCopy(MachineInstr *MI); - - void TransferImplicitOperands(MachineInstr *MI); }; } // end anonymous namespace @@ -59,25 +56,6 @@ char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE, "Post-RA pseudo instruction expansion pass", false, false) -/// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered -/// replacement instructions immediately precede it. Copy any implicit -/// operands from MI to the replacement instruction. -void ExpandPostRA::TransferImplicitOperands(MachineInstr *MI) { - MachineBasicBlock::iterator CopyMI = MI; - --CopyMI; - - Register DstReg = MI->getOperand(0).getReg(); - for (const MachineOperand &MO : MI->implicit_operands()) { - CopyMI->addOperand(MO); - - // Be conservative about preserving kills when subregister defs are - // involved. If there was implicit kill of a super-register overlapping the - // copy result, we would kill the subregisters previous copies defined. - if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg())) - CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false); - } -} - bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { MachineBasicBlock *MBB = MI->getParent(); assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) && @@ -137,50 +115,6 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { return true; } -bool ExpandPostRA::LowerCopy(MachineInstr *MI) { - - if (MI->allDefsAreDead()) { - LLVM_DEBUG(dbgs() << "dead copy: " << *MI); - MI->setDesc(TII->get(TargetOpcode::KILL)); - LLVM_DEBUG(dbgs() << "replaced by: " << *MI); - return true; - } - - MachineOperand &DstMO = MI->getOperand(0); - MachineOperand &SrcMO = MI->getOperand(1); - - bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg()); - if (IdentityCopy || SrcMO.isUndef()) { - LLVM_DEBUG(dbgs() << (IdentityCopy ? 
"identity copy: " : "undef copy: ") - << *MI); - // No need to insert an identity copy instruction, but replace with a KILL - // if liveness is changed. - if (SrcMO.isUndef() || MI->getNumOperands() > 2) { - // We must make sure the super-register gets killed. Replace the - // instruction with KILL. - MI->setDesc(TII->get(TargetOpcode::KILL)); - LLVM_DEBUG(dbgs() << "replaced by: " << *MI); - return true; - } - // Vanilla identity copy. - MI->eraseFromParent(); - return true; - } - - LLVM_DEBUG(dbgs() << "real copy: " << *MI); - TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), - DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill()); - - if (MI->getNumOperands() > 2) - TransferImplicitOperands(MI); - LLVM_DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "replaced by: " << *(--dMI); - }); - MI->eraseFromParent(); - return true; -} - /// runOnMachineFunction - Reduce subregister inserts and extracts to register /// copies. /// @@ -211,7 +145,8 @@ bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) { MadeChange |= LowerSubregToReg(&MI); break; case TargetOpcode::COPY: - MadeChange |= LowerCopy(&MI); + TII->lowerCopy(&MI, TRI); + MadeChange = true; break; case TargetOpcode::DBG_VALUE: continue; diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index f08c47d220ea..79b6dc9154b3 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -1,4 +1,4 @@ -//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===// +//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -133,10 +133,38 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) { } break; } + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: { + // Canonicalize logical or/and reductions: + // Or reduction for i1 is represented as: + // %val = bitcast <ReduxWidth x i1> to iReduxWidth + // %res = cmp ne iReduxWidth %val, 0 + // And reduction for i1 is represented as: + // %val = bitcast <ReduxWidth x i1> to iReduxWidth + // %res = cmp eq iReduxWidth %val, 11111 + Value *Vec = II->getArgOperand(0); + auto *FTy = cast<FixedVectorType>(Vec->getType()); + unsigned NumElts = FTy->getNumElements(); + if (!isPowerOf2_32(NumElts)) + continue; + + if (FTy->getElementType() == Builder.getInt1Ty()) { + Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts)); + if (ID == Intrinsic::vector_reduce_and) { + Rdx = Builder.CreateICmpEQ( + Rdx, ConstantInt::getAllOnesValue(Rdx->getType())); + } else { + assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction."); + Rdx = Builder.CreateIsNotNull(Rdx); + } + break; + } + + Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK); + break; + } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: - case Intrinsic::vector_reduce_and: - case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 5ee76ff567fb..9807be0bea39 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -171,6 +171,10 @@ struct CachingVPExpander { Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, VPIntrinsic &PI); + /// Lower 
this VP fp call to a unpredicated fp call. + Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI, + unsigned UnpredicatedIntrinsicID); + /// Lower this VP reduction to a call to an unpredicated reduction intrinsic. Value *expandPredicationInReduction(IRBuilder<> &Builder, VPReductionIntrinsic &PI); @@ -271,6 +275,38 @@ CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder, return NewBinOp; } +Value *CachingVPExpander::expandPredicationToFPCall( + IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) { + assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) && + "Implicitly dropping %evl in non-speculatable operator!"); + + switch (UnpredicatedIntrinsicID) { + case Intrinsic::fabs: + case Intrinsic::sqrt: { + Value *Op0 = VPI.getOperand(0); + Function *Fn = Intrinsic::getDeclaration( + VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); + Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); + replaceOperation(*NewOp, VPI); + return NewOp; + } + case Intrinsic::experimental_constrained_fma: + case Intrinsic::experimental_constrained_fmuladd: { + Value *Op0 = VPI.getOperand(0); + Value *Op1 = VPI.getOperand(1); + Value *Op2 = VPI.getOperand(2); + Function *Fn = Intrinsic::getDeclaration( + VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); + Value *NewOp = + Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + replaceOperation(*NewOp, VPI); + return NewOp; + } + } + + return nullptr; +} + static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI, Type *EltTy) { bool Negative = false; @@ -565,6 +601,15 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { switch (VPI.getIntrinsicID()) { default: break; + case Intrinsic::vp_fneg: { + Value *NewNegOp = Builder.CreateFNeg(VPI.getOperand(0), VPI.getName()); + replaceOperation(*NewNegOp, VPI); + return NewNegOp; + } + case Intrinsic::vp_fabs: + return expandPredicationToFPCall(Builder, VPI, Intrinsic::fabs); + case Intrinsic::vp_sqrt: + return expandPredicationToFPCall(Builder, VPI, Intrinsic::sqrt); case Intrinsic::vp_load: case Intrinsic::vp_store: case Intrinsic::vp_gather: @@ -572,6 +617,10 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { return expandPredicationInMemoryIntrinsic(Builder, VPI); } + if (auto CID = VPI.getConstrainedIntrinsicID()) + if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID)) + return Call; + return &VPI; } diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 55d939de426e..75504ef32250 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -388,7 +388,7 @@ public: Register Reg = MO.getReg(); assert(Reg.isPhysical() && "Only physical regs are expected"); - if (isCalleeSaved(Reg) && (AllowGCPtrInCSR || !is_contained(GCRegs, Reg))) + if (isCalleeSaved(Reg) && (AllowGCPtrInCSR || !GCRegs.contains(Reg))) continue; LLVM_DEBUG(dbgs() << "Will spill " << printReg(Reg, &TRI) << " at index " @@ -407,7 +407,6 @@ public: void spillRegisters() { for (Register Reg : RegsToSpill) { int FI = CacheFI.getFrameIndex(Reg, EHPad); - const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); NumSpilledRegisters++; RegToSlotIdx[Reg] = FI; @@ -419,6 +418,7 @@ public: bool IsKill = true; MachineBasicBlock::iterator InsertBefore(MI); Reg = performCopyPropagation(Reg, InsertBefore, IsKill, TII, TRI); + const TargetRegisterClass *RC = 
TRI.getMinimalPhysRegClass(Reg); LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI, diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp index 80feb0045406..c0ce37091933 100644 --- a/llvm/lib/CodeGen/GCRootLowering.cpp +++ b/llvm/lib/CodeGen/GCRootLowering.cpp @@ -52,8 +52,8 @@ public: /// in the machine code. It inserts labels at safe points and populates a /// GCMetadata record for each function. class GCMachineCodeAnalysis : public MachineFunctionPass { - GCFunctionInfo *FI; - const TargetInstrInfo *TII; + GCFunctionInfo *FI = nullptr; + const TargetInstrInfo *TII = nullptr; void FindSafePoints(MachineFunction &MF); void VisitCallPoint(MachineBasicBlock::iterator CI); diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp index 356d208fc881..e047996f9aa8 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -217,10 +217,14 @@ void GISelCSEInfo::handleRemoveInst(MachineInstr *MI) { } void GISelCSEInfo::handleRecordedInsts() { + if (HandlingRecordedInstrs) + return; + HandlingRecordedInstrs = true; while (!TemporaryInsts.empty()) { auto *MI = TemporaryInsts.pop_back_val(); handleRecordedInst(MI); } + HandlingRecordedInstrs = false; } bool GISelCSEInfo::shouldCSE(unsigned Opc) const { @@ -392,9 +396,10 @@ GISelInstProfileBuilder::addNodeIDReg(Register Reg) const { addNodeIDRegType(Ty); if (const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg)) { - if (const auto *RB = RCOrRB.dyn_cast<const RegisterBank *>()) + if (const auto *RB = dyn_cast_if_present<const RegisterBank *>(RCOrRB)) addNodeIDRegType(RB); - else if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>()) + else if (const auto *RC = + dyn_cast_if_present<const TargetRegisterClass *>(RCOrRB)) addNodeIDRegType(RC); } return *this; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 89872259cfca..28c33e2038e4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -846,7 +846,7 @@ void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy, unsigned NumValues = SplitVTs.size(); Align BaseAlign = DL.getPrefTypeAlign(RetTy); Type *RetPtrTy = RetTy->getPointerTo(DL.getAllocaAddrSpace()); - LLT OffsetLLTy = getLLTForType(*DL.getIntPtrType(RetPtrTy), DL); + LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -876,8 +876,7 @@ void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy, unsigned NumValues = SplitVTs.size(); Align BaseAlign = DL.getPrefTypeAlign(RetTy); unsigned AS = DL.getAllocaAddrSpace(); - LLT OffsetLLTy = - getLLTForType(*DL.getIntPtrType(RetTy->getPointerTo(AS)), DL); + LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getPointerTo(AS)), DL); MachinePointerInfo PtrInfo(AS); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index af4bb1634746..cc7fb3ee1109 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -16,7 +16,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/LowLevelType.h" +#include 
"llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineInstr.h" @@ -399,7 +399,8 @@ namespace { /// Select a preference between two uses. CurrentUse is the current preference /// while *ForCandidate is attributes of the candidate under consideration. -PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse, +PreferredTuple ChoosePreferredUse(MachineInstr &LoadMI, + PreferredTuple &CurrentUse, const LLT TyForCandidate, unsigned OpcodeForCandidate, MachineInstr *MIForCandidate) { @@ -425,8 +426,10 @@ PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse, return {TyForCandidate, OpcodeForCandidate, MIForCandidate}; // Prefer sign extensions to zero extensions as sign-extensions tend to be - // more expensive. - if (CurrentUse.Ty == TyForCandidate) { + // more expensive. Don't do this if the load is already a zero-extend load + // though, otherwise we'll rewrite a zero-extend load into a sign-extend + // later. + if (!isa<GZExtLoad>(LoadMI) && CurrentUse.Ty == TyForCandidate) { if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT && OpcodeForCandidate == TargetOpcode::G_ZEXT) return CurrentUse; @@ -535,7 +538,7 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI, // For non power-of-2 types, they will very likely be legalized into multiple // loads. Don't bother trying to match them into extending loads. - if (!isPowerOf2_32(LoadValueTy.getSizeInBits())) + if (!llvm::has_single_bit<uint32_t>(LoadValueTy.getSizeInBits())) return false; // Find the preferred type aside from the any-extends (unless it's the only @@ -566,7 +569,7 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI, .Action != LegalizeActions::Legal) continue; } - Preferred = ChoosePreferredUse(Preferred, + Preferred = ChoosePreferredUse(MI, Preferred, MRI.getType(UseMI.getOperand(0).getReg()), UseMI.getOpcode(), &UseMI); } @@ -727,7 +730,7 @@ bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI, Register PtrReg = LoadMI->getPointerReg(); unsigned RegSize = RegTy.getSizeInBits(); uint64_t LoadSizeBits = LoadMI->getMemSizeInBits(); - unsigned MaskSizeBits = MaskVal.countTrailingOnes(); + unsigned MaskSizeBits = MaskVal.countr_one(); // The mask may not be larger than the in-memory type, as it might cover sign // extended bits @@ -1189,16 +1192,22 @@ void CombinerHelper::applyCombineDivRem(MachineInstr &MI, Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM; // Check which instruction is first in the block so we don't break def-use - // deps by "moving" the instruction incorrectly. - if (dominates(MI, *OtherMI)) + // deps by "moving" the instruction incorrectly. Also keep track of which + // instruction is first so we pick it's operands, avoiding use-before-def + // bugs. + MachineInstr *FirstInst; + if (dominates(MI, *OtherMI)) { Builder.setInstrAndDebugLoc(MI); - else + FirstInst = &MI; + } else { Builder.setInstrAndDebugLoc(*OtherMI); + FirstInst = OtherMI; + } Builder.buildInstr(IsSigned ? 
TargetOpcode::G_SDIVREM : TargetOpcode::G_UDIVREM, {DestDivReg, DestRemReg}, - {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()}); + { FirstInst->getOperand(1), FirstInst->getOperand(2) }); MI.eraseFromParent(); OtherMI->eraseFromParent(); } @@ -1285,65 +1294,57 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { LegalizerHelper::LegalizeResult::Legalized; } -static std::optional<APFloat> -constantFoldFpUnary(unsigned Opcode, LLT DstTy, const Register Op, - const MachineRegisterInfo &MRI) { - const ConstantFP *MaybeCst = getConstantFPVRegVal(Op, MRI); - if (!MaybeCst) - return std::nullopt; - - APFloat V = MaybeCst->getValueAPF(); - switch (Opcode) { +static APFloat constantFoldFpUnary(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const APFloat &Val) { + APFloat Result(Val); + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case TargetOpcode::G_FNEG: { - V.changeSign(); - return V; + Result.changeSign(); + return Result; } case TargetOpcode::G_FABS: { - V.clearSign(); - return V; + Result.clearSign(); + return Result; + } + case TargetOpcode::G_FPTRUNC: { + bool Unused; + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + Result.convert(getFltSemanticForLLT(DstTy), APFloat::rmNearestTiesToEven, + &Unused); + return Result; } - case TargetOpcode::G_FPTRUNC: - break; case TargetOpcode::G_FSQRT: { bool Unused; - V.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Unused); - V = APFloat(sqrt(V.convertToDouble())); + Result.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, + &Unused); + Result = APFloat(sqrt(Result.convertToDouble())); break; } case TargetOpcode::G_FLOG2: { bool Unused; - V.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &Unused); - V = APFloat(log2(V.convertToDouble())); + Result.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, + &Unused); + Result = APFloat(log2(Result.convertToDouble())); break; } } // Convert `APFloat` to appropriate IEEE type depending on `DstTy`. Otherwise, - // `buildFConstant` will assert on size mismatch. Only `G_FPTRUNC`, `G_FSQRT`, - // and `G_FLOG2` reach here. + // `buildFConstant` will assert on size mismatch. Only `G_FSQRT`, and + // `G_FLOG2` reach here. 
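// Illustrative walk-through (not part of this change): folding G_FSQRT on an
// s32 constant 2.0f converts the value to IEEEdouble, computes sqrt(2.0) via
// libm above, and the convert below restores the input's IEEEsingle
// semantics; for these unary ops the source and destination types match, so
// Val.getSemantics() is the right target format.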
bool Unused; - V.convert(getFltSemanticForLLT(DstTy), APFloat::rmNearestTiesToEven, &Unused); - return V; + Result.convert(Val.getSemantics(), APFloat::rmNearestTiesToEven, &Unused); + return Result; } -bool CombinerHelper::matchCombineConstantFoldFpUnary( - MachineInstr &MI, std::optional<APFloat> &Cst) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - Cst = constantFoldFpUnary(MI.getOpcode(), DstTy, SrcReg, MRI); - return Cst.has_value(); -} - -void CombinerHelper::applyCombineConstantFoldFpUnary( - MachineInstr &MI, std::optional<APFloat> &Cst) { - assert(Cst && "Optional is unexpectedly empty!"); +void CombinerHelper::applyCombineConstantFoldFpUnary(MachineInstr &MI, + const ConstantFP *Cst) { Builder.setInstrAndDebugLoc(MI); - MachineFunction &MF = Builder.getMF(); - auto *FPVal = ConstantFP::get(MF.getFunction().getContext(), *Cst); - Register DstReg = MI.getOperand(0).getReg(); - Builder.buildFConstant(DstReg, *FPVal); + APFloat Folded = constantFoldFpUnary(MI, MRI, Cst->getValue()); + const ConstantFP *NewCst = ConstantFP::get(Builder.getContext(), Folded); + Builder.buildFConstant(MI.getOperand(0), *NewCst); MI.eraseFromParent(); } @@ -1621,6 +1622,41 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI, MI.eraseFromParent(); } +bool CombinerHelper::matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected G_SHL"); + // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) + // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) + auto &Shl = cast<GenericMachineInstr>(MI); + Register DstReg = Shl.getReg(0); + Register SrcReg = Shl.getReg(1); + Register ShiftReg = Shl.getReg(2); + Register X, C1; + + if (!getTargetLowering().isDesirableToCommuteWithShift(MI, !isPreLegalize())) + return false; + + if (!mi_match(SrcReg, MRI, + m_OneNonDBGUse(m_any_of(m_GAdd(m_Reg(X), m_Reg(C1)), + m_GOr(m_Reg(X), m_Reg(C1)))))) + return false; + + APInt C1Val, C2Val; + if (!mi_match(C1, MRI, m_ICstOrSplat(C1Val)) || + !mi_match(ShiftReg, MRI, m_ICstOrSplat(C2Val))) + return false; + + auto *SrcDef = MRI.getVRegDef(SrcReg); + assert((SrcDef->getOpcode() == TargetOpcode::G_ADD || + SrcDef->getOpcode() == TargetOpcode::G_OR) && "Unexpected op"); + LLT SrcTy = MRI.getType(SrcReg); + MatchInfo = [=](MachineIRBuilder &B) { + auto S1 = B.buildShl(SrcTy, X, ShiftReg); + auto S2 = B.buildShl(SrcTy, C1, ShiftReg); + B.buildInstr(SrcDef->getOpcode(), {DstReg}, {S1, S2}); + }; + return true; +} + bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); @@ -1658,9 +1694,9 @@ bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI, !mi_match(LHS, MRI, m_GSExt(m_Reg(ExtSrc)))) return false; - // TODO: Should handle vector splat. 
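// Illustrative MIR (invented names): the shift amount may now be either a
// scalar G_CONSTANT or a splat, e.g.
//   %c:_(s32) = G_CONSTANT i32 8
//   %amt:_(<2 x s32>) = G_BUILD_VECTOR %c(s32), %c(s32)
// since isConstantOrConstantSplatVector below recognizes both forms, where
// getIConstantVRegValWithLookThrough only handled the scalar case.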
Register RHS = MI.getOperand(2).getReg(); - auto MaybeShiftAmtVal = getIConstantVRegValWithLookThrough(RHS, MRI); + MachineInstr *MIShiftAmt = MRI.getVRegDef(RHS); + auto MaybeShiftAmtVal = isConstantOrConstantSplatVector(*MIShiftAmt, MRI); if (!MaybeShiftAmtVal) return false; @@ -1675,12 +1711,13 @@ bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI, return false; } - int64_t ShiftAmt = MaybeShiftAmtVal->Value.getSExtValue(); + int64_t ShiftAmt = MaybeShiftAmtVal->getSExtValue(); MatchData.Reg = ExtSrc; MatchData.Imm = ShiftAmt; - unsigned MinLeadingZeros = KB->getKnownZeroes(ExtSrc).countLeadingOnes(); - return MinLeadingZeros >= ShiftAmt; + unsigned MinLeadingZeros = KB->getKnownZeroes(ExtSrc).countl_one(); + unsigned SrcTySize = MRI.getType(ExtSrc).getScalarSizeInBits(); + return MinLeadingZeros >= ShiftAmt && ShiftAmt < SrcTySize; } void CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI, @@ -1763,6 +1800,15 @@ void CombinerHelper::applyCombineUnmergeMergeToPlainValues( for (unsigned Idx = 0; Idx < NumElems; ++Idx) { Register DstReg = MI.getOperand(Idx).getReg(); Register SrcReg = Operands[Idx]; + + // This combine may run after RegBankSelect, so we need to be aware of + // register banks. + const auto &DstCB = MRI.getRegClassOrRegBank(DstReg); + if (!DstCB.isNull() && DstCB != MRI.getRegClassOrRegBank(SrcReg)) { + SrcReg = Builder.buildCopy(MRI.getType(SrcReg), SrcReg).getReg(0); + MRI.setRegClassOrRegBank(SrcReg, DstCB); + } + if (CanReuseInputDirectly) replaceRegWith(MRI, DstReg, SrcReg); else @@ -2426,10 +2472,7 @@ bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) { return true; } -bool CombinerHelper::eraseInst(MachineInstr &MI) { - MI.eraseFromParent(); - return true; -} +void CombinerHelper::eraseInst(MachineInstr &MI) { MI.eraseFromParent(); } bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1, const MachineOperand &MOP2) { @@ -2537,7 +2580,7 @@ bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) { MaybeCst->getSExtValue() == C; } -bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, +void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, unsigned OpIdx) { assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?"); Register OldReg = MI.getOperand(0).getReg(); @@ -2545,17 +2588,15 @@ bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); - return true; } -bool CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, +void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, Register Replacement) { assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?"); Register OldReg = MI.getOperand(0).getReg(); assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); - return true; } bool CombinerHelper::matchSelectSameVal(MachineInstr &MI) { @@ -2590,36 +2631,32 @@ bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB); } -bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { +void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); Builder.buildFConstant(MI.getOperand(0), C); MI.eraseFromParent(); - return true; 
} -bool CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { +void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); Builder.buildConstant(MI.getOperand(0), C); MI.eraseFromParent(); - return true; } -bool CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) { +void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); Builder.buildConstant(MI.getOperand(0), C); MI.eraseFromParent(); - return true; } -bool CombinerHelper::replaceInstWithUndef(MachineInstr &MI) { +void CombinerHelper::replaceInstWithUndef(MachineInstr &MI) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); Builder.buildUndef(MI.getOperand(0)); MI.eraseFromParent(); - return true; } bool CombinerHelper::matchSimplifyAddToSub( @@ -2750,9 +2787,7 @@ bool CombinerHelper::matchHoistLogicOpWithSameOpcodeHands( Register Y = RightHandInst->getOperand(1).getReg(); LLT XTy = MRI.getType(X); LLT YTy = MRI.getType(Y); - if (XTy != YTy) - return false; - if (!isLegalOrBeforeLegalizer({LogicOpcode, {XTy, YTy}})) + if (!XTy.isValid() || XTy != YTy) return false; // Optional extra source register. @@ -2779,6 +2814,9 @@ bool CombinerHelper::matchHoistLogicOpWithSameOpcodeHands( } } + if (!isLegalOrBeforeLegalizer({LogicOpcode, {XTy, YTy}})) + return false; + // Record the steps to build the new instructions. // // Steps to build (logic x, y) @@ -3227,7 +3265,7 @@ bool CombinerHelper::matchFoldBinOpIntoSelect(MachineInstr &MI, /// \p SelectOperand is the operand in binary operator \p MI that is the select /// to fold. -bool CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI, +void CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI, const unsigned &SelectOperand) { Builder.setInstrAndDebugLoc(MI); @@ -3263,8 +3301,6 @@ bool CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI, Builder.buildSelect(Dst, SelectCond, FoldTrue, FoldFalse, MI.getFlags()); MI.eraseFromParent(); - - return true; } std::optional<SmallVector<Register, 8>> @@ -3612,275 +3648,6 @@ bool CombinerHelper::matchLoadOrCombine( return true; } -/// Check if the store \p Store is a truncstore that can be merged. That is, -/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty -/// Register then it does not need to match and SrcVal is set to the source -/// value found. -/// On match, returns the start byte offset of the \p SrcVal that is being -/// stored. -static std::optional<int64_t> -getTruncStoreByteOffset(GStore &Store, Register &SrcVal, - MachineRegisterInfo &MRI) { - Register TruncVal; - if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal)))) - return std::nullopt; - - // The shift amount must be a constant multiple of the narrow type. - // It is translated to the offset address in the wide source value "y". - // - // x = G_LSHR y, ShiftAmtC - // s8 z = G_TRUNC x - // store z, ... - Register FoundSrcVal; - int64_t ShiftAmt; - if (!mi_match(TruncVal, MRI, - m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)), - m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) { - if (!SrcVal.isValid() || TruncVal == SrcVal) { - if (!SrcVal.isValid()) - SrcVal = TruncVal; - return 0; // If it's the lowest index store. 
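// (A sketch of the offset logic: a G_TRUNC with no preceding shift stores
// bits [0, NarrowBits) of the wide value, i.e. byte offset 0; a shifted
// value such as byte 2 of an s32 matches G_LSHR by 16 and yields
// 16 / 8 = 2 from the division below.)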
- } - return std::nullopt; - } - - unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits(); - if (ShiftAmt % NarrowBits!= 0) - return std::nullopt; - const unsigned Offset = ShiftAmt / NarrowBits; - - if (SrcVal.isValid() && FoundSrcVal != SrcVal) - return std::nullopt; - - if (!SrcVal.isValid()) - SrcVal = FoundSrcVal; - else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal)) - return std::nullopt; - return Offset; -} - -/// Match a pattern where a wide type scalar value is stored by several narrow -/// stores. Fold it into a single store or a BSWAP and a store if the targets -/// supports it. -/// -/// Assuming little endian target: -/// i8 *p = ... -/// i32 val = ... -/// p[0] = (val >> 0) & 0xFF; -/// p[1] = (val >> 8) & 0xFF; -/// p[2] = (val >> 16) & 0xFF; -/// p[3] = (val >> 24) & 0xFF; -/// => -/// *((i32)p) = val; -/// -/// i8 *p = ... -/// i32 val = ... -/// p[0] = (val >> 24) & 0xFF; -/// p[1] = (val >> 16) & 0xFF; -/// p[2] = (val >> 8) & 0xFF; -/// p[3] = (val >> 0) & 0xFF; -/// => -/// *((i32)p) = BSWAP(val); -bool CombinerHelper::matchTruncStoreMerge(MachineInstr &MI, - MergeTruncStoresInfo &MatchInfo) { - auto &StoreMI = cast<GStore>(MI); - LLT MemTy = StoreMI.getMMO().getMemoryType(); - - // We only handle merging simple stores of 1-4 bytes. - if (!MemTy.isScalar()) - return false; - switch (MemTy.getSizeInBits()) { - case 8: - case 16: - case 32: - break; - default: - return false; - } - if (!StoreMI.isSimple()) - return false; - - // We do a simple search for mergeable stores prior to this one. - // Any potential alias hazard along the way terminates the search. - SmallVector<GStore *> FoundStores; - - // We're looking for: - // 1) a (store(trunc(...))) - // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get - // the partial value stored. - // 3) where the offsets form either a little or big-endian sequence. - - auto &LastStore = StoreMI; - - // The single base pointer that all stores must use. - Register BaseReg; - int64_t LastOffset; - if (!mi_match(LastStore.getPointerReg(), MRI, - m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) { - BaseReg = LastStore.getPointerReg(); - LastOffset = 0; - } - - GStore *LowestIdxStore = &LastStore; - int64_t LowestIdxOffset = LastOffset; - - Register WideSrcVal; - auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, MRI); - if (!LowestShiftAmt) - return false; // Didn't match a trunc. - assert(WideSrcVal.isValid()); - - LLT WideStoreTy = MRI.getType(WideSrcVal); - // The wide type might not be a multiple of the memory type, e.g. s48 and s32. - if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0) - return false; - const unsigned NumStoresRequired = - WideStoreTy.getSizeInBits() / MemTy.getSizeInBits(); - - SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX); - OffsetMap[*LowestShiftAmt] = LastOffset; - FoundStores.emplace_back(&LastStore); - - // Search the block up for more stores. - // We use a search threshold of 10 instructions here because the combiner - // works top-down within a block, and we don't want to search an unbounded - // number of predecessor instructions trying to find matching stores. - // If we moved this optimization into a separate pass then we could probably - // use a more efficient search without having a hard-coded threshold. 
- const int MaxInstsToCheck = 10; - int NumInstsChecked = 0; - for (auto II = ++LastStore.getReverseIterator(); - II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck; - ++II) { - NumInstsChecked++; - GStore *NewStore; - if ((NewStore = dyn_cast<GStore>(&*II))) { - if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple()) - break; - } else if (II->isLoadFoldBarrier() || II->mayLoad()) { - break; - } else { - continue; // This is a safe instruction we can look past. - } - - Register NewBaseReg; - int64_t MemOffset; - // Check we're storing to the same base + some offset. - if (!mi_match(NewStore->getPointerReg(), MRI, - m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) { - NewBaseReg = NewStore->getPointerReg(); - MemOffset = 0; - } - if (BaseReg != NewBaseReg) - break; - - auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, MRI); - if (!ShiftByteOffset) - break; - if (MemOffset < LowestIdxOffset) { - LowestIdxOffset = MemOffset; - LowestIdxStore = NewStore; - } - - // Map the offset in the store and the offset in the combined value, and - // early return if it has been set before. - if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired || - OffsetMap[*ShiftByteOffset] != INT64_MAX) - break; - OffsetMap[*ShiftByteOffset] = MemOffset; - - FoundStores.emplace_back(NewStore); - // Reset counter since we've found a matching inst. - NumInstsChecked = 0; - if (FoundStores.size() == NumStoresRequired) - break; - } - - if (FoundStores.size() != NumStoresRequired) { - return false; - } - - const auto &DL = LastStore.getMF()->getDataLayout(); - auto &C = LastStore.getMF()->getFunction().getContext(); - // Check that a store of the wide type is both allowed and fast on the target - unsigned Fast = 0; - bool Allowed = getTargetLowering().allowsMemoryAccess( - C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast); - if (!Allowed || !Fast) - return false; - - // Check if the pieces of the value are going to the expected places in memory - // to merge the stores. - unsigned NarrowBits = MemTy.getScalarSizeInBits(); - auto checkOffsets = [&](bool MatchLittleEndian) { - if (MatchLittleEndian) { - for (unsigned i = 0; i != NumStoresRequired; ++i) - if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset) - return false; - } else { // MatchBigEndian by reversing loop counter. - for (unsigned i = 0, j = NumStoresRequired - 1; i != NumStoresRequired; - ++i, --j) - if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset) - return false; - } - return true; - }; - - // Check if the offsets line up for the native data layout of this target. - bool NeedBswap = false; - bool NeedRotate = false; - if (!checkOffsets(DL.isLittleEndian())) { - // Special-case: check if byte offsets line up for the opposite endian. 
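// For example, on a little-endian target the four i8 stores
//   p[0] = val >> 24; ... p[3] = val >> 0;
// line up in big-endian order, so they merge into one i32 store of
// G_BSWAP(val); a two-part merge in the wrong order is instead handled
// with a rotate by half the width.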
- if (NarrowBits == 8 && checkOffsets(DL.isBigEndian())) - NeedBswap = true; - else if (NumStoresRequired == 2 && checkOffsets(DL.isBigEndian())) - NeedRotate = true; - else - return false; - } - - if (NeedBswap && - !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}})) - return false; - if (NeedRotate && - !isLegalOrBeforeLegalizer({TargetOpcode::G_ROTR, {WideStoreTy}})) - return false; - - MatchInfo.NeedBSwap = NeedBswap; - MatchInfo.NeedRotate = NeedRotate; - MatchInfo.LowestIdxStore = LowestIdxStore; - MatchInfo.WideSrcVal = WideSrcVal; - MatchInfo.FoundStores = std::move(FoundStores); - return true; -} - -void CombinerHelper::applyTruncStoreMerge(MachineInstr &MI, - MergeTruncStoresInfo &MatchInfo) { - - Builder.setInstrAndDebugLoc(MI); - Register WideSrcVal = MatchInfo.WideSrcVal; - LLT WideStoreTy = MRI.getType(WideSrcVal); - - if (MatchInfo.NeedBSwap) { - WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0); - } else if (MatchInfo.NeedRotate) { - assert(WideStoreTy.getSizeInBits() % 2 == 0 && - "Unexpected type for rotate"); - auto RotAmt = - Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2); - WideSrcVal = - Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0); - } - - Builder.buildStore(WideSrcVal, MatchInfo.LowestIdxStore->getPointerReg(), - MatchInfo.LowestIdxStore->getMMO().getPointerInfo(), - MatchInfo.LowestIdxStore->getMMO().getAlign()); - - // Erase the old stores. - for (auto *ST : MatchInfo.FoundStores) - ST->eraseFromParent(); -} - bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI, MachineInstr *&ExtMI) { assert(MI.getOpcode() == TargetOpcode::G_PHI); @@ -4395,7 +4162,7 @@ bool CombinerHelper::matchBitfieldExtractFromAnd( if (static_cast<uint64_t>(LSBImm) >= Size) return false; - uint64_t Width = APInt(Size, AndImm).countTrailingOnes(); + uint64_t Width = APInt(Size, AndImm).countr_one(); MatchInfo = [=](MachineIRBuilder &B) { auto WidthCst = B.buildConstant(ExtractTy, Width); auto LSBCst = B.buildConstant(ExtractTy, LSBImm); @@ -4496,7 +4263,7 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd( // Calculate start position and width of the extract. const int64_t Pos = ShrAmt; - const int64_t Width = countTrailingOnes(UMask) - ShrAmt; + const int64_t Width = llvm::countr_one(UMask) - ShrAmt; // It's preferable to keep the shift, rather than form G_SBFX. // TODO: remove the G_AND via demanded bits analysis. @@ -4695,6 +4462,62 @@ bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI, return false; } +bool CombinerHelper::tryReassocBinOp(unsigned Opc, Register DstReg, + Register OpLHS, Register OpRHS, + BuildFnTy &MatchInfo) { + LLT OpRHSTy = MRI.getType(OpRHS); + MachineInstr *OpLHSDef = MRI.getVRegDef(OpLHS); + + if (OpLHSDef->getOpcode() != Opc) + return false; + + MachineInstr *OpRHSDef = MRI.getVRegDef(OpRHS); + Register OpLHSLHS = OpLHSDef->getOperand(1).getReg(); + Register OpLHSRHS = OpLHSDef->getOperand(2).getReg(); + + // If the inner op is (X op C), pull the constant out so it can be folded with + // other constants in the expression tree. Folding is not guaranteed so we + // might have (C1 op C2). In that case do not pull a constant out because it + // won't help and can lead to infinite loops. 
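// Illustrative MIR for the first rewrite below, with Opc = G_ADD and
// invented register names:
//   %t:_(s32) = G_ADD %x, %c1
//   %d:_(s32) = G_ADD %t, %c2
// becomes
//   %nc:_(s32) = G_ADD %c1, %c2   // now constant-foldable
//   %d:_(s32) = G_ADD %x, %nc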
+ if (isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSRHS), MRI) && + !isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSLHS), MRI)) { + if (isConstantOrConstantSplatVector(*OpRHSDef, MRI)) { + // (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2)) + MatchInfo = [=](MachineIRBuilder &B) { + auto NewCst = B.buildInstr(Opc, {OpRHSTy}, {OpLHSRHS, OpRHS}); + B.buildInstr(Opc, {DstReg}, {OpLHSLHS, NewCst}); + }; + return true; + } + if (getTargetLowering().isReassocProfitable(MRI, OpLHS, OpRHS)) { + // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) + // iff (op x, c1) has one use + MatchInfo = [=](MachineIRBuilder &B) { + auto NewLHSLHS = B.buildInstr(Opc, {OpRHSTy}, {OpLHSLHS, OpRHS}); + B.buildInstr(Opc, {DstReg}, {NewLHSLHS, OpLHSRHS}); + }; + return true; + } + } + + return false; +} + +bool CombinerHelper::matchReassocCommBinOp(MachineInstr &MI, + BuildFnTy &MatchInfo) { + // We don't check if the reassociation will break a legal addressing mode + // here since pointer arithmetic is handled by G_PTR_ADD. + unsigned Opc = MI.getOpcode(); + Register DstReg = MI.getOperand(0).getReg(); + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + + if (tryReassocBinOp(Opc, DstReg, LHSReg, RHSReg, MatchInfo)) + return true; + if (tryReassocBinOp(Opc, DstReg, RHSReg, LHSReg, MatchInfo)) + return true; + return false; +} bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) { Register Op1 = MI.getOperand(1).getReg(); @@ -4766,7 +4589,7 @@ bool CombinerHelper::matchNarrowBinopFeedingAnd( return false; // No point in combining if there's nothing to truncate. - unsigned NarrowWidth = Mask.countTrailingOnes(); + unsigned NarrowWidth = Mask.countr_one(); if (NarrowWidth == WideTy.getSizeInBits()) return false; LLT NarrowTy = LLT::scalar(NarrowWidth); @@ -4956,7 +4779,7 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) { // Magic algorithm doesn't work for division by 1. We need to emit a select // at the end. // TODO: Use undef values for divisor of 1. - if (!Divisor.isOneValue()) { + if (!Divisor.isOne()) { UnsignedDivisionByConstantInfo magics = UnsignedDivisionByConstantInfo::get(Divisor); @@ -5144,7 +4967,7 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) { auto *CI = cast<ConstantInt>(C); APInt Divisor = CI->getValue(); - unsigned Shift = Divisor.countTrailingZeros(); + unsigned Shift = Divisor.countr_zero(); if (Shift) { Divisor.ashrInPlace(Shift); UseSRA = true; @@ -6185,6 +6008,16 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI, return CmpInst::isEquality(Pred) && Y.isValid(); } +bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { + Register ShiftReg = MI.getOperand(2).getReg(); + LLT ResTy = MRI.getType(MI.getOperand(0).getReg()); + auto IsShiftTooBig = [&](const Constant *C) { + auto *CI = dyn_cast<ConstantInt>(C); + return CI && CI->uge(ResTy.getScalarSizeInBits()); + }; + return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig); +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp new file mode 100644 index 000000000000..d747cbf5aadc --- /dev/null +++ b/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp @@ -0,0 +1,68 @@ +//===- llvm/CodeGen/GlobalISel/GIMatchTableExecutor.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file implements the GIMatchTableExecutor class. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define DEBUG_TYPE "gi-match-table-executor" + +using namespace llvm; + +GIMatchTableExecutor::MatcherState::MatcherState(unsigned MaxRenderers) + : Renderers(MaxRenderers) {} + +GIMatchTableExecutor::GIMatchTableExecutor() = default; + +bool GIMatchTableExecutor::isOperandImmEqual( + const MachineOperand &MO, int64_t Value, + const MachineRegisterInfo &MRI) const { + if (MO.isReg() && MO.getReg()) + if (auto VRegVal = getIConstantVRegValWithLookThrough(MO.getReg(), MRI)) + return VRegVal->Value.getSExtValue() == Value; + return false; +} + +bool GIMatchTableExecutor::isBaseWithConstantOffset( + const MachineOperand &Root, const MachineRegisterInfo &MRI) const { + if (!Root.isReg()) + return false; + + MachineInstr *RootI = MRI.getVRegDef(Root.getReg()); + if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) + return false; + + MachineOperand &RHS = RootI->getOperand(2); + MachineInstr *RHSI = MRI.getVRegDef(RHS.getReg()); + if (RHSI->getOpcode() != TargetOpcode::G_CONSTANT) + return false; + + return true; +} + +bool GIMatchTableExecutor::isObviouslySafeToFold(MachineInstr &MI, + MachineInstr &IntoMI) const { + // Immediate neighbours are already folded. + if (MI.getParent() == IntoMI.getParent() && + std::next(MI.getIterator()) == IntoMI.getIterator()) + return true; + + // Convergent instructions cannot be moved in the CFG. + if (MI.isConvergent() && MI.getParent() != IntoMI.getParent()) + return false; + + return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() && + !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty(); +} diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index bfbe7e1c3e55..363ffbfa90b5 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -115,7 +116,7 @@ void GISelKnownBits::computeKnownBitsMin(Register Src0, Register Src1, computeKnownBitsImpl(Src0, Known2, DemandedElts, Depth); // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } // Bitfield extract is computed as (Src >> Offset) & Mask, where Mask is @@ -191,7 +192,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, Depth + 1); // Known bits are the values that are shared by every demanded element. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); // If we don't know any bits, early out. 
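// (intersectWith keeps only the bit values both sides agree on, ANDing the
// Zero masks and the One masks, so once nothing is known, further operands
// cannot add information and the loop can stop early.)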
if (Known.isUnknown()) @@ -235,10 +236,10 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, // For COPYs we don't do anything, don't increase the depth. computeKnownBitsImpl(SrcReg, Known2, DemandedElts, Depth + (Opcode != TargetOpcode::COPY)); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); // If we reach a point where we don't know anything // just stop looking through the operands. - if (Known.One == 0 && Known.Zero == 0) + if (Known.isUnknown()) break; } else { // We know nothing. @@ -750,7 +751,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R, // Okay, we know that the sign bit in Mask is set. Use CLO to determine // the number of identical bits in the top of the input value. Mask <<= Mask.getBitWidth() - TyBits; - return std::max(FirstAnswer, Mask.countLeadingOnes()); + return std::max(FirstAnswer, Mask.countl_one()); } unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Depth) { diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7d811dc0ad8f..9a67a8d05a4d 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -43,6 +44,7 @@ #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -74,7 +76,6 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -300,7 +301,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, Register Op0 = getOrCreateVReg(*U.getOperand(0)); Register Op1 = getOrCreateVReg(*U.getOperand(1)); Register Res = getOrCreateVReg(U); - uint16_t Flags = 0; + uint32_t Flags = 0; if (isa<Instruction>(U)) { const Instruction &I = cast<Instruction>(U); Flags = MachineInstr::copyFlagsFromInstruction(I); @@ -314,7 +315,7 @@ bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { Register Op0 = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); - uint16_t Flags = 0; + uint32_t Flags = 0; if (isa<Instruction>(U)) { const Instruction &I = cast<Instruction>(U); Flags = MachineInstr::copyFlagsFromInstruction(I); @@ -345,7 +346,7 @@ bool IRTranslator::translateCompare(const User &U, MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType()))); else { - uint16_t Flags = 0; + uint32_t Flags = 0; if (CI) Flags = MachineInstr::copyFlagsFromInstruction(*CI); MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, Flags); @@ -844,8 +845,8 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB, // For conditional branch lowering, we might try to do something silly like // emit an G_ICMP to compare an existing G_ICMP i1 result with true. If so, // just re-use the existing condition vreg. 
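// Illustrative MIR: given %c:_(s1) = G_ICMP intpred(eq), %a, %b and a case
// value of 1, the check below reuses %c as the branch condition instead of
// emitting G_ICMP intpred(eq), %c, 1 on top of it.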
- if (MRI->getType(CondLHS).getSizeInBits() == 1 && CI && - CI->getZExtValue() == 1 && CB.PredInfo.Pred == CmpInst::ICMP_EQ) { + if (MRI->getType(CondLHS).getSizeInBits() == 1 && CI && CI->isOne() && + CB.PredInfo.Pred == CmpInst::ICMP_EQ) { Cond = CondLHS; } else { Register CondRHS = getOrCreateVReg(*CB.CmpRHS); @@ -1018,7 +1019,7 @@ void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B, LLT MaskTy = SwitchOpTy; if (MaskTy.getSizeInBits() > PtrTy.getSizeInBits() || - !isPowerOf2_32(MaskTy.getSizeInBits())) + !llvm::has_single_bit<uint32_t>(MaskTy.getSizeInBits())) MaskTy = LLT::scalar(PtrTy.getSizeInBits()); else { // Ensure that the type will fit the mask value. @@ -1074,14 +1075,14 @@ void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB, // Testing for a single bit; just compare the shift count with what it // would need to be to shift a 1 bit in that position. auto MaskTrailingZeros = - MIB.buildConstant(SwitchTy, countTrailingZeros(B.Mask)); + MIB.buildConstant(SwitchTy, llvm::countr_zero(B.Mask)); Cmp = MIB.buildICmp(ICmpInst::ICMP_EQ, LLT::scalar(1), Reg, MaskTrailingZeros) .getReg(0); } else if (PopCount == BB.Range) { // There is only one zero bit in the range, test for it directly. auto MaskTrailingOnes = - MIB.buildConstant(SwitchTy, countTrailingOnes(B.Mask)); + MIB.buildConstant(SwitchTy, llvm::countr_one(B.Mask)); Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Reg, MaskTrailingOnes) .getReg(0); } else { @@ -1294,7 +1295,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { AAMDNodes AAInfo = LI.getAAMetadata(); const Value *Ptr = LI.getPointerOperand(); - Type *OffsetIRTy = DL->getIntPtrType(Ptr->getType()); + Type *OffsetIRTy = DL->getIndexType(Ptr->getType()); LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); if (CLI->supportSwiftError() && isSwiftError(Ptr)) { @@ -1342,7 +1343,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { ArrayRef<uint64_t> Offsets = *VMap.getOffsets(*SI.getValueOperand()); Register Base = getOrCreateVReg(*SI.getPointerOperand()); - Type *OffsetIRTy = DL->getIntPtrType(SI.getPointerOperandType()); + Type *OffsetIRTy = DL->getIndexType(SI.getPointerOperandType()); LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); if (CLI->supportSwiftError() && isSwiftError(SI.getPointerOperand())) { @@ -1438,7 +1439,7 @@ bool IRTranslator::translateSelect(const User &U, ArrayRef<Register> Op0Regs = getOrCreateVRegs(*U.getOperand(1)); ArrayRef<Register> Op1Regs = getOrCreateVRegs(*U.getOperand(2)); - uint16_t Flags = 0; + uint32_t Flags = 0; if (const SelectInst *SI = dyn_cast<SelectInst>(&U)) Flags = MachineInstr::copyFlagsFromInstruction(*SI); @@ -1468,8 +1469,14 @@ bool IRTranslator::translateBitCast(const User &U, MachineIRBuilder &MIRBuilder) { // If we're bitcasting to the source type, we can reuse the source vreg. if (getLLTForType(*U.getOperand(0)->getType(), *DL) == - getLLTForType(*U.getType(), *DL)) + getLLTForType(*U.getType(), *DL)) { + // If the source is a ConstantInt then it was probably created by + // ConstantHoisting and we should leave it alone. 
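// The barrier keeps later combines from folding the hoisted constant back
// into its users; it is erased again during instruction selection, where it
// is treated like the pre-ISel optimization hints (see the
// InstructionSelect.cpp hunk further down).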
+ if (isa<ConstantInt>(U.getOperand(0))) + return translateCast(TargetOpcode::G_CONSTANT_FOLD_BARRIER, U, + MIRBuilder); return translateCopy(U, *U.getOperand(0), MIRBuilder); + } return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); } @@ -1488,7 +1495,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, Register BaseReg = getOrCreateVReg(Op0); Type *PtrIRTy = Op0.getType(); LLT PtrTy = getLLTForType(*PtrIRTy, *DL); - Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); + Type *OffsetIRTy = DL->getIndexType(PtrIRTy); LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); // Normalize Vector GEP - all scalar operands should be converted to the @@ -1513,7 +1520,7 @@ bool IRTranslator::translateGetElementPtr(const User &U, .getReg(0); PtrIRTy = FixedVectorType::get(PtrIRTy, VectorWidth); PtrTy = getLLTForType(*PtrIRTy, *DL); - OffsetIRTy = DL->getIntPtrType(PtrIRTy); + OffsetIRTy = DL->getIndexType(PtrIRTy); OffsetTy = getLLTForType(*OffsetIRTy, *DL); } @@ -1759,6 +1766,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_FLOG2; case Intrinsic::log10: return TargetOpcode::G_FLOG10; + case Intrinsic::ldexp: + return TargetOpcode::G_FLDEXP; case Intrinsic::nearbyint: return TargetOpcode::G_FNEARBYINT; case Intrinsic::pow: @@ -1851,6 +1860,8 @@ static unsigned getConstrainedOpcode(Intrinsic::ID ID) { return TargetOpcode::G_STRICT_FMA; case Intrinsic::experimental_constrained_sqrt: return TargetOpcode::G_STRICT_FSQRT; + case Intrinsic::experimental_constrained_ldexp: + return TargetOpcode::G_STRICT_FLDEXP; default: return 0; } @@ -1864,7 +1875,7 @@ bool IRTranslator::translateConstrainedFPIntrinsic( if (!Opcode) return false; - unsigned Flags = MachineInstr::copyFlagsFromInstruction(FPI); + uint32_t Flags = MachineInstr::copyFlagsFromInstruction(FPI); if (EB == fp::ExceptionBehavior::ebIgnore) Flags |= MachineInstr::NoFPExcept; @@ -1879,6 +1890,60 @@ bool IRTranslator::translateConstrainedFPIntrinsic( return true; } +std::optional<MCRegister> IRTranslator::getArgPhysReg(Argument &Arg) { + auto VRegs = getOrCreateVRegs(Arg); + if (VRegs.size() != 1) + return std::nullopt; + + // Arguments are lowered as a copy of a livein physical register. 
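// e.g. an AArch64 entry block contains %0:_(s64) = COPY $x0 (illustrative
// names), so the physical register is operand 1 of the defining COPY.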
+ auto *VRegDef = MF->getRegInfo().getVRegDef(VRegs[0]); + if (!VRegDef || !VRegDef->isCopy()) + return std::nullopt; + return VRegDef->getOperand(1).getReg().asMCReg(); +} + +bool IRTranslator::translateIfEntryValueArgument(const DbgValueInst &DebugInst, + MachineIRBuilder &MIRBuilder) { + auto *Arg = dyn_cast<Argument>(DebugInst.getValue()); + if (!Arg) + return false; + + const DIExpression *Expr = DebugInst.getExpression(); + if (!Expr->isEntryValue()) + return false; + + std::optional<MCRegister> PhysReg = getArgPhysReg(*Arg); + if (!PhysReg) { + LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but " + "couldn't find a physical register\n" + << DebugInst << "\n"); + return true; + } + + MIRBuilder.buildDirectDbgValue(*PhysReg, DebugInst.getVariable(), + DebugInst.getExpression()); + return true; +} + +bool IRTranslator::translateIfEntryValueArgument( + const DbgDeclareInst &DebugInst) { + auto *Arg = dyn_cast<Argument>(DebugInst.getAddress()); + if (!Arg) + return false; + + const DIExpression *Expr = DebugInst.getExpression(); + if (!Expr->isEntryValue()) + return false; + + std::optional<MCRegister> PhysReg = getArgPhysReg(*Arg); + if (!PhysReg) + return false; + + MF->setVariableDbgInfo(DebugInst.getVariable(), Expr, *PhysReg, + DebugInst.getDebugLoc()); + return true; +} + bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder) { if (auto *MI = dyn_cast<AnyMemIntrinsic>(&CI)) { @@ -1945,12 +2010,16 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, // instructions (in fact, they get ignored if they *do* exist). MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), getOrCreateFrameIndex(*AI), DI.getDebugLoc()); - } else { - // A dbg.declare describes the address of a source variable, so lower it - // into an indirect DBG_VALUE. - MIRBuilder.buildIndirectDbgValue(getOrCreateVReg(*Address), - DI.getVariable(), DI.getExpression()); + return true; } + + if (translateIfEntryValueArgument(DI)) + return true; + + // A dbg.declare describes the address of a source variable, so lower it + // into an indirect DBG_VALUE. + MIRBuilder.buildIndirectDbgValue(getOrCreateVReg(*Address), + DI.getVariable(), DI.getExpression()); return true; } case Intrinsic::dbg_label: { @@ -1991,16 +2060,32 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, // DI cannot produce a valid DBG_VALUE, so produce an undef DBG_VALUE to // terminate any prior location. MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression()); - } else if (const auto *CI = dyn_cast<Constant>(V)) { + return true; + } + if (const auto *CI = dyn_cast<Constant>(V)) { MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression()); - } else { - for (Register Reg : getOrCreateVRegs(*V)) { - // FIXME: This does not handle register-indirect values at offset 0. The - // direct/indirect thing shouldn't really be handled by something as - // implicit as reg+noreg vs reg+imm in the first place, but it seems - // pretty baked in right now. - MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); - } + return true; + } + if (auto *AI = dyn_cast<AllocaInst>(V); + AI && AI->isStaticAlloca() && DI.getExpression()->startsWithDeref()) { + // If the value is an alloca and the expression starts with a + // dereference, track a stack slot instead of a register, as registers + // may be clobbered. 
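// e.g. dbg.value(%slot, !var, !DIExpression(DW_OP_deref)) on a static
// alloca %slot becomes a frame-index DBG_VALUE with the leading DW_OP_deref
// dropped, since a frame index already names the memory location
// (illustrative names).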
+ auto ExprOperands = DI.getExpression()->getElements(); + auto *ExprDerefRemoved = + DIExpression::get(AI->getContext(), ExprOperands.drop_front()); + MIRBuilder.buildFIDbgValue(getOrCreateFrameIndex(*AI), DI.getVariable(), + ExprDerefRemoved); + return true; + } + if (translateIfEntryValueArgument(DI, MIRBuilder)) + return true; + for (Register Reg : getOrCreateVRegs(*V)) { + // FIXME: This does not handle register-indirect values at offset 0. The + // direct/indirect thing shouldn't really be handled by something as + // implicit as reg+noreg vs reg+imm in the first place, but it seems + // pretty baked in right now. + MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); } return true; } @@ -2090,6 +2175,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getOrCreateVReg(*CI.getArgOperand(0)), MachineInstr::copyFlagsFromInstruction(CI)); return true; + case Intrinsic::frexp: { + ArrayRef<Register> VRegs = getOrCreateVRegs(CI); + MIRBuilder.buildFFrexp(VRegs[0], VRegs[1], + getOrCreateVReg(*CI.getArgOperand(0)), + MachineInstr::copyFlagsFromInstruction(CI)); + return true; + } case Intrinsic::memcpy_inline: return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMCPY_INLINE); case Intrinsic::memcpy: @@ -2296,7 +2388,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return CLI->lowerCall(MIRBuilder, Info); } case Intrinsic::fptrunc_round: { - unsigned Flags = MachineInstr::copyFlagsFromInstruction(CI); + uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI); // Convert the metadata argument to a constant integer Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(1))->getMetadata(); diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp index e0357c50e555..3925611f1485 100644 --- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp @@ -391,10 +391,12 @@ bool InlineAsmLowering::lowerInlineAsm( Inst.addReg(SourceRegs[0]); } else { // Otherwise, this outputs to a register (directly for C_Register / - // C_RegisterClass. Find a register that we can use. + // C_RegisterClass/C_Other. assert(OpInfo.ConstraintType == TargetLowering::C_Register || - OpInfo.ConstraintType == TargetLowering::C_RegisterClass); + OpInfo.ConstraintType == TargetLowering::C_RegisterClass || + OpInfo.ConstraintType == TargetLowering::C_Other); + // Find a register that we can use. if (OpInfo.Regs.empty()) { LLVM_DEBUG(dbgs() << "Couldn't allocate output register for constraint\n"); diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index f780050ca3f1..9bbef11067ae 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Config/config.h" @@ -104,7 +105,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { CodeGenCoverage CoverageInfo; assert(ISel && "Cannot work without InstructionSelector"); - ISel->setupMF(MF, KB, CoverageInfo, PSI, BFI); + ISel->setupMF(MF, KB, &CoverageInfo, PSI, BFI); // An optimization remark emitter. Used to report failures. 
MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); @@ -165,12 +166,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { continue; } - // Eliminate hints. - if (isPreISelGenericOptimizationHint(MI.getOpcode())) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); + // Eliminate hints or G_CONSTANT_FOLD_BARRIER. + if (isPreISelGenericOptimizationHint(MI.getOpcode()) || + MI.getOpcode() == TargetOpcode::G_CONSTANT_FOLD_BARRIER) { + auto [DstReg, SrcReg] = MI.getFirst2Regs(); - // At this point, the destination register class of the hint may have + // At this point, the destination register class of the op may have // been decided. // // Propagate that through to the source register. diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp index 8959d215ecd1..c48591cc2f02 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp @@ -5,64 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -/// \file -/// This file implements the InstructionSelector class. -// -//===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -#define DEBUG_TYPE "instructionselector" - -using namespace llvm; - -InstructionSelector::MatcherState::MatcherState(unsigned MaxRenderers) - : Renderers(MaxRenderers) {} - -InstructionSelector::InstructionSelector() = default; - -bool InstructionSelector::isOperandImmEqual( - const MachineOperand &MO, int64_t Value, - const MachineRegisterInfo &MRI) const { - if (MO.isReg() && MO.getReg()) - if (auto VRegVal = getIConstantVRegValWithLookThrough(MO.getReg(), MRI)) - return VRegVal->Value.getSExtValue() == Value; - return false; -} - -bool InstructionSelector::isBaseWithConstantOffset( - const MachineOperand &Root, const MachineRegisterInfo &MRI) const { - if (!Root.isReg()) - return false; - - MachineInstr *RootI = MRI.getVRegDef(Root.getReg()); - if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD) - return false; - - MachineOperand &RHS = RootI->getOperand(2); - MachineInstr *RHSI = MRI.getVRegDef(RHS.getReg()); - if (RHSI->getOpcode() != TargetOpcode::G_CONSTANT) - return false; - - return true; -} -bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI, - MachineInstr &IntoMI) const { - // Immediate neighbours are already folded. - if (MI.getParent() == IntoMI.getParent() && - std::next(MI.getIterator()) == IntoMI.getIterator()) - return true; +namespace llvm { - // Convergent instructions cannot be moved in the CFG. 
- if (MI.isConvergent() && MI.getParent() != IntoMI.getParent()) - return false; +// vtable anchor +InstructionSelector::~InstructionSelector() = default; - return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() && - !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty(); -} +} // namespace llvm diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp index 54a82cac95d5..2c77ed8b0600 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp @@ -164,7 +164,8 @@ LegalityPredicate LegalityPredicates::sizeNotMultipleOf(unsigned TypeIdx, LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; - return QueryTy.isScalar() && !isPowerOf2_32(QueryTy.getSizeInBits()); + return QueryTy.isScalar() && + !llvm::has_single_bit<uint32_t>(QueryTy.getSizeInBits()); }; } @@ -184,14 +185,16 @@ LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0, LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) { return [=](const LegalityQuery &Query) { - return !isPowerOf2_32(Query.MMODescrs[MMOIdx].MemoryTy.getSizeInBytes()); + return !llvm::has_single_bit<uint32_t>( + Query.MMODescrs[MMOIdx].MemoryTy.getSizeInBytes()); }; } LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) { return [=](const LegalityQuery &Query) { const LLT MemTy = Query.MMODescrs[MMOIdx].MemoryTy; - return !MemTy.isByteSized() || !isPowerOf2_32(MemTy.getSizeInBytes()); + return !MemTy.isByteSized() || + !llvm::has_single_bit<uint32_t>(MemTy.getSizeInBytes()); }; } diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index 1a13f39c100c..aecbe0b7604c 100644 --- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GISelWorkList.h" #include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" @@ -75,6 +76,7 @@ INITIALIZE_PASS_BEGIN(Legalizer, DEBUG_TYPE, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(Legalizer, DEBUG_TYPE, "Legalize the Machine IR a function's Machine IR", false, false) @@ -85,6 +87,8 @@ void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetPassConfig>(); AU.addRequired<GISelCSEAnalysisWrapperPass>(); AU.addPreserved<GISelCSEAnalysisWrapperPass>(); + AU.addRequired<GISelKnownBitsAnalysis>(); + AU.addPreserved<GISelKnownBitsAnalysis>(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -173,7 +177,8 @@ Legalizer::MFResult Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI, ArrayRef<GISelChangeObserver *> AuxObservers, LostDebugLocObserver &LocObserver, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + GISelKnownBits *KB) { MIRBuilder.setMF(MF); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -212,7 +217,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI, // Now install the observer 
as the delegate to MF. // This will keep all the observers notified about new insertions/deletions. RAIIMFObsDelInstaller Installer(MF, WrapperObserver); - LegalizerHelper Helper(MF, LI, WrapperObserver, MIRBuilder); + LegalizerHelper Helper(MF, LI, WrapperObserver, MIRBuilder, KB); LegalizationArtifactCombiner ArtCombiner(MIRBuilder, MRI, LI); bool Changed = false; SmallVector<MachineInstr *, 128> RetryList; @@ -314,8 +319,6 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr); - const size_t NumBlocks = MF.size(); - std::unique_ptr<MachineIRBuilder> MIRBuilder; GISelCSEInfo *CSEInfo = nullptr; bool EnableCSE = EnableCSEInLegalizer.getNumOccurrences() @@ -338,25 +341,18 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) { if (VerifyDebugLocs > DebugLocVerifyLevel::None) AuxObservers.push_back(&LocObserver); + // This allows Known Bits Analysis in the legalizer. + GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); + const LegalizerInfo &LI = *MF.getSubtarget().getLegalizerInfo(); - MFResult Result = - legalizeMachineFunction(MF, LI, AuxObservers, LocObserver, *MIRBuilder); + MFResult Result = legalizeMachineFunction(MF, LI, AuxObservers, LocObserver, + *MIRBuilder, KB); if (Result.FailedOn) { reportGISelFailure(MF, TPC, MORE, "gisel-legalize", "unable to legalize instruction", *Result.FailedOn); return false; } - // For now don't support if new blocks are inserted - we would need to fix the - // outer loop for that. - if (MF.size() != NumBlocks) { - MachineOptimizationRemarkMissed R("gisel-legalize", "GISelFailure", - MF.getFunction().getSubprogram(), - /*MBB=*/nullptr); - R << "inserting blocks is not supported yet"; - reportGISelFailure(MF, TPC, MORE, R); - return false; - } if (LocObserver.getNumLostDebugLocs()) { MachineOptimizationRemarkMissed R("gisel-legalize", "LostDebugLoc", diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 8a1fce2d3d65..f0da0d88140f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -15,12 +15,14 @@ #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" @@ -102,13 +104,13 @@ LegalizerHelper::LegalizerHelper(MachineFunction &MF, MachineIRBuilder &Builder) : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()), - TLI(*MF.getSubtarget().getTargetLowering()) { } + TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {} LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI, GISelChangeObserver &Observer, - MachineIRBuilder &B) - : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI), - TLI(*MF.getSubtarget().getTargetLowering()) { } + 
MachineIRBuilder &B, GISelKnownBits *KB) + : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI), + TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {} LegalizerHelper::LegalizeResult LegalizerHelper::legalizeInstrStep(MachineInstr &MI, @@ -540,6 +542,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(LOG_F); case TargetOpcode::G_FLOG2: RTLIBCASE(LOG2_F); + case TargetOpcode::G_FLDEXP: + RTLIBCASE(LDEXP_F); case TargetOpcode::G_FCEIL: RTLIBCASE(CEIL_F); case TargetOpcode::G_FFLOOR: @@ -824,6 +828,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FLDEXP: case TargetOpcode::G_FEXP: case TargetOpcode::G_FEXP2: case TargetOpcode::G_FCEIL: @@ -1411,6 +1416,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FLDEXP: + case TargetOpcode::G_STRICT_FLDEXP: + return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy); } } @@ -1504,13 +1512,11 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 1) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); + auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs(); if (DstTy.isVector()) return UnableToLegalize; - Register Src1 = MI.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(Src1); + LLT SrcTy = MRI.getType(Src1Reg); const int DstSize = DstTy.getSizeInBits(); const int SrcSize = SrcTy.getSizeInBits(); const int WideSize = WideTy.getSizeInBits(); @@ -1522,7 +1528,7 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, if (WideSize >= DstSize) { // Directly pack the bits in the target type. 
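// For example, widening a G_MERGE_VALUES of four s8 parts into an s32
// destination when WideTy is s64: each part is zero-extended to s64,
// shifted left by its bit offset (Offset = (I - 1) * PartSize below), and
// combined into ResultReg.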
- Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0); + Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0); for (unsigned I = 2; I != NumOps; ++I) { const unsigned Offset = (I - 1) * PartSize; @@ -1753,11 +1759,7 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - - LLT DstTy = MRI.getType(DstReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Offset = MI.getOperand(2).getImm(); if (TypeIdx == 0) { @@ -1978,10 +1980,7 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx, } bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO; - Register Result = MI.getOperand(0).getReg(); - Register OriginalOverflow = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); + auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs(); LLT SrcTy = MRI.getType(LHS); LLT OverflowTy = MRI.getType(OriginalOverflow); unsigned SrcBitWidth = SrcTy.getScalarSizeInBits(); @@ -2560,12 +2559,41 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_FPOWI: { - if (TypeIdx != 0) - return UnableToLegalize; + case TargetOpcode::G_FPOWI: + case TargetOpcode::G_FLDEXP: + case TargetOpcode::G_STRICT_FLDEXP: { + if (TypeIdx == 0) { + if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP) + return UnableToLegalize; + + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } + + if (TypeIdx == 1) { + // For some reason SelectionDAG tries to promote to a libcall without + // actually changing the integer type for promotion. 
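// Here only the integer exponent operand (operand 2) is sign-extended to
// the wider type, e.g. s16 -> s32, matching the int parameter of ldexp; the
// floating-point operands keep their type.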
+ Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); + Observer.changedInstr(MI); + return Legalized; + } + + return UnableToLegalize; + } + case TargetOpcode::G_FFREXP: { Observer.changingInstr(MI); - widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT); - widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + + if (TypeIdx == 0) { + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + } else { + widenScalarDst(MI, WideTy, 1); + } + Observer.changedInstr(MI); return Legalized; } @@ -2631,12 +2659,34 @@ static void getUnmergePieces(SmallVectorImpl<Register> &Pieces, } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerBitcast(MachineInstr &MI) { +LegalizerHelper::lowerFConstant(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MIRBuilder.getDataLayout(); + + unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace(); + LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + Align Alignment = Align(DL.getABITypeAlign( + getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst)))); + + auto Addr = MIRBuilder.buildConstantPool( + AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex( + MI.getOperand(1).getFPImm(), Alignment)); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, + MRI.getType(Dst), Alignment); + + MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO); + MI.eraseFromParent(); + + return Legalized; +} + +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerBitcast(MachineInstr &MI) { + auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); if (SrcTy.isVector()) { LLT SrcEltTy = SrcTy.getElementType(); SmallVector<Register, 8> SrcRegs; @@ -2732,11 +2782,7 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 1) return UnableToLegalize; - Register Dst = MI.getOperand(0).getReg(); - Register SrcVec = MI.getOperand(1).getReg(); - Register Idx = MI.getOperand(2).getReg(); - LLT SrcVecTy = MRI.getType(SrcVec); - LLT IdxTy = MRI.getType(Idx); + auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs(); LLT SrcEltTy = SrcVecTy.getElementType(); unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; @@ -2872,13 +2918,9 @@ LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 0) return UnableToLegalize; - Register Dst = MI.getOperand(0).getReg(); - Register SrcVec = MI.getOperand(1).getReg(); - Register Val = MI.getOperand(2).getReg(); - Register Idx = MI.getOperand(3).getReg(); - - LLT VecTy = MRI.getType(Dst); - LLT IdxTy = MRI.getType(Idx); + auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] = + MI.getFirst4RegLLTs(); + LLT VecTy = DstTy; LLT VecEltTy = VecTy.getElementType(); LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy; @@ -3004,7 +3046,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { if (!isPowerOf2_32(MemSizeInBits)) { // This load needs splitting into power of 2 sized loads. - LargeSplitSize = PowerOf2Floor(MemSizeInBits); + LargeSplitSize = llvm::bit_floor(MemSizeInBits); SmallSplitSize = MemSizeInBits - LargeSplitSize; } else { // This is already a power of 2, but we still need to split this in half. 
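The PowerOf2Floor -> llvm::bit_floor change in lowerLoad above is a rename to the C++20 <bit> spelling with identical semantics; std::bit_floor can stand in for it in a standalone sketch of the split computation:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Splitting a 48-bit load the way lowerLoad does above: a large
      // power-of-two piece plus the remainder.
      uint32_t MemSizeInBits = 48;
      uint32_t LargeSplitSize = std::bit_floor(MemSizeInBits);  // 32
      uint32_t SmallSplitSize = MemSizeInBits - LargeSplitSize; // 16
      std::printf("%u = %u + %u\n", MemSizeInBits, LargeSplitSize,
                  SmallSplitSize);
    }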
@@ -3122,7 +3164,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) { uint64_t LargeSplitSize, SmallSplitSize; if (!isPowerOf2_32(MemSizeInBits)) { - LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits()); + LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits()); SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize; } else { auto &Ctx = MF.getFunction().getContext(); @@ -3250,6 +3292,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { switch(MI.getOpcode()) { default: return UnableToLegalize; + case TargetOpcode::G_FCONSTANT: + return lowerFConstant(MI); case TargetOpcode::G_BITCAST: return lowerBitcast(MI); case TargetOpcode::G_SREM: @@ -3274,10 +3318,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { case TargetOpcode::G_UMULO: { // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the // result. - Register Res = MI.getOperand(0).getReg(); - Register Overflow = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); + auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs(); LLT Ty = MRI.getType(Res); unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO @@ -3308,7 +3349,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return Legalized; } case TargetOpcode::G_FNEG: { - Register Res = MI.getOperand(0).getReg(); + auto [Res, SubByReg] = MI.getFirst2Regs(); LLT Ty = MRI.getType(Res); // TODO: Handle vector types once we are able to @@ -3317,23 +3358,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return UnableToLegalize; auto SignMask = MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits())); - Register SubByReg = MI.getOperand(1).getReg(); MIRBuilder.buildXor(Res, SubByReg, SignMask); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FSUB: case TargetOpcode::G_STRICT_FSUB: { - Register Res = MI.getOperand(0).getReg(); + auto [Res, LHS, RHS] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Res); // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). - // First, check if G_FNEG is marked as Lower. If so, we may - // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
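For reference on the two lowerings above: G_FNEG flips the IEEE sign bit with an XOR against APInt::getSignMask, and G_FSUB is then expressed as G_FADD of the negated RHS. A plain C++ sketch of the same bit trick for f32:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    // The G_FNEG lowering above XORs the value with the sign-bit mask;
    // G_FSUB then becomes G_FADD(LHS, G_FNEG(RHS)).
    float fnegViaXor(float X) {
      uint32_t Bits = std::bit_cast<uint32_t>(X);
      return std::bit_cast<float>(Bits ^ 0x80000000u); // flip the sign bit
    }

    int main() {
      float LHS = 5.0f, RHS = 3.0f;
      std::printf("%g\n", LHS + fnegViaXor(RHS)); // 2, same as LHS - RHS
    }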
- if (LI.getAction({G_FNEG, {Ty}}).Action == Lower) - return UnableToLegalize; - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); auto Neg = MIRBuilder.buildFNeg(Ty, RHS); if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB) @@ -3357,11 +3391,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return Legalized; } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { - Register OldValRes = MI.getOperand(0).getReg(); - Register SuccessRes = MI.getOperand(1).getReg(); - Register Addr = MI.getOperand(2).getReg(); - Register CmpVal = MI.getOperand(3).getReg(); - Register NewVal = MI.getOperand(4).getReg(); + auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs(); MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal, **MI.memoperands_begin()); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal); @@ -3381,10 +3411,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { case TargetOpcode::G_CTPOP: return lowerBitCount(MI); case G_UADDO: { - Register Res = MI.getOperand(0).getReg(); - Register CarryOut = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); + auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs(); MIRBuilder.buildAdd(Res, LHS, RHS); MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS); @@ -3393,11 +3420,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return Legalized; } case G_UADDE: { - Register Res = MI.getOperand(0).getReg(); - Register CarryOut = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); - Register CarryIn = MI.getOperand(4).getReg(); + auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs(); LLT Ty = MRI.getType(Res); auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS); @@ -3409,10 +3432,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return Legalized; } case G_USUBO: { - Register Res = MI.getOperand(0).getReg(); - Register BorrowOut = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); + auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs(); MIRBuilder.buildSub(Res, LHS, RHS); MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS); @@ -3421,11 +3441,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return Legalized; } case G_USUBE: { - Register Res = MI.getOperand(0).getReg(); - Register BorrowOut = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); - Register BorrowIn = MI.getOperand(4).getReg(); + auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs(); const LLT CondTy = MRI.getType(BorrowOut); const LLT Ty = MRI.getType(Res); @@ -3470,8 +3486,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { assert(MI.getOperand(2).isImm() && "Expected immediate"); int64_t SizeInBits = MI.getOperand(2).getImm(); - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); + auto [DstReg, SrcReg] = MI.getFirst2Regs(); LLT DstTy = MRI.getType(DstReg); Register TmpRes = MRI.createGenericVirtualRegister(DstTy); @@ -3869,9 +3884,7 @@ LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) 
{ - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); // Requires compatible types. Otherwise user of DstReg did not perform unmerge // that should have been artifact combined. Most likely instruction that uses // DstReg has to do more/fewer elements legalization compatible with NarrowTy. @@ -3958,8 +3971,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, LLT NarrowVecTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcVec = MI.getOperand(1).getReg(); + auto [DstReg, SrcVec] = MI.getFirst2Regs(); Register InsertVal; bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT; @@ -4159,6 +4171,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_FLOG: case G_FLOG2: case G_FLOG10: + case G_FLDEXP: case G_FNEARBYINT: case G_FCEIL: case G_FFLOOR: @@ -4234,6 +4247,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_STRICT_FSUB: case G_STRICT_FMUL: case G_STRICT_FMA: + case G_STRICT_FLDEXP: + case G_FFREXP: return fewerElementsVectorMultiEltType(GMI, NumElts); case G_ICMP: case G_FCMP: @@ -4278,13 +4293,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle( if (TypeIdx != 0) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - Register Src1Reg = MI.getOperand(1).getReg(); - Register Src2Reg = MI.getOperand(2).getReg(); + auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] = + MI.getFirst3RegLLTs(); ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); - LLT DstTy = MRI.getType(DstReg); - LLT Src1Ty = MRI.getType(Src1Reg); - LLT Src2Ty = MRI.getType(Src2Reg); // The shuffle should be canonicalized by now. if (DstTy != Src1Ty) return UnableToLegalize; @@ -4474,10 +4485,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions( // The semantics of the normal non-sequential reductions allow us to freely // re-associate the operation. 
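The comment above leans on the defining property of the non-sequential vector reductions: they may be re-associated freely, so a wide reduction can be split into reductions of the narrow pieces plus one combining op. A sketch with integer addition, where the equivalence is exact:

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // Splitting an 8-lane vecreduce_add into two 4-lane reductions plus a
      // final add; the non-sequential reduction semantics permit this
      // re-association.
      std::vector<int> V{1, 2, 3, 4, 5, 6, 7, 8};
      int Lo = std::accumulate(V.begin(), V.begin() + 4, 0);
      int Hi = std::accumulate(V.begin() + 4, V.end(), 0);
      std::printf("%d == %d\n", Lo + Hi,
                  std::accumulate(V.begin(), V.end(), 0));
    }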
- Register SrcReg = MI.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); if (NarrowTy.isVector() && (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0)) @@ -4865,6 +4873,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_EXTRACT_VECTOR_ELT: case TargetOpcode::G_EXTRACT: if (TypeIdx != 1) return UnableToLegalize; @@ -4873,6 +4882,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_INSERT: + case TargetOpcode::G_INSERT_VECTOR_ELT: case TargetOpcode::G_FREEZE: case TargetOpcode::G_FNEG: case TargetOpcode::G_FABS: @@ -4887,10 +4897,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; case TargetOpcode::G_SELECT: { - Register DstReg = MI.getOperand(0).getReg(); - Register CondReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT CondTy = MRI.getType(CondReg); + auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs(); if (TypeIdx == 1) { if (!CondTy.isScalar() || DstTy.getElementCount() != MoreTy.getElementCount()) @@ -4943,28 +4950,50 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FPEXT: { + if (TypeIdx != 0) + return UnableToLegalize; + Observer.changingInstr(MI); + LLT SrcTy = LLT::fixed_vector( + MoreTy.getNumElements(), + MRI.getType(MI.getOperand(1).getReg()).getElementType()); + moreElementsVectorSrc(MI, SrcTy, 1); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } default: return UnableToLegalize; } } -/// Expand source vectors to the size of destination vector. -static LegalizerHelper::LegalizeResult -equalizeVectorShuffleLengths(MachineInstr &MI, MachineIRBuilder &MIRBuilder) { - MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); - - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); +LegalizerHelper::LegalizeResult +LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) { + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); unsigned MaskNumElts = Mask.size(); unsigned SrcNumElts = SrcTy.getNumElements(); - Register DstReg = MI.getOperand(0).getReg(); LLT DestEltTy = DstTy.getElementType(); - // TODO: Normalize the shuffle vector since mask and vector length don't - // match. - if (MaskNumElts <= SrcNumElts) { - return LegalizerHelper::LegalizeResult::UnableToLegalize; + if (MaskNumElts == SrcNumElts) + return Legalized; + + if (MaskNumElts < SrcNumElts) { + // Extend mask to match new destination vector size with + // undef values. 
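When the mask is shorter than the source vector, the new equalizeVectorShuffleLengths path above widens the destination and pads the mask with -1, the conventional "undef lane" sentinel in shuffle masks. The padding step in isolation:

    #include <cstdio>
    #include <vector>

    int main() {
      // equalizeVectorShuffleLengths above widens a short shuffle mask to
      // the source vector length by appending -1 (undef) lanes.
      unsigned SrcNumElts = 4;
      std::vector<int> Mask{1, 0};   // MaskNumElts < SrcNumElts
      while (Mask.size() < SrcNumElts)
        Mask.push_back(-1);          // undef sentinel, as in the hunk
      for (int M : Mask)
        std::printf("%d ", M);       // 1 0 -1 -1
      std::printf("\n");
    }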
+ SmallVector<int, 16> NewMask(Mask); + for (unsigned I = MaskNumElts; I < SrcNumElts; ++I) + NewMask.push_back(-1); + + moreElementsVectorDst(MI, SrcTy, 0); + MIRBuilder.setInstrAndDebugLoc(MI); + MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(), + MI.getOperand(1).getReg(), + MI.getOperand(2).getReg(), NewMask); + MI.eraseFromParent(); + + return Legalized; } unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts); @@ -5014,19 +5043,14 @@ equalizeVectorShuffleLengths(MachineInstr &MI, MachineIRBuilder &MIRBuilder) { LegalizerHelper::LegalizeResult LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI, unsigned int TypeIdx, LLT MoreTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register Src1Reg = MI.getOperand(1).getReg(); - Register Src2Reg = MI.getOperand(2).getReg(); + auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs(); ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); - LLT DstTy = MRI.getType(DstReg); - LLT Src1Ty = MRI.getType(Src1Reg); - LLT Src2Ty = MRI.getType(Src2Reg); unsigned NumElts = DstTy.getNumElements(); unsigned WidenNumElts = MoreTy.getNumElements(); if (DstTy.isVector() && Src1Ty.isVector() && - DstTy.getNumElements() > Src1Ty.getNumElements()) { - return equalizeVectorShuffleLengths(MI, MIRBuilder); + DstTy.getNumElements() != Src1Ty.getNumElements()) { + return equalizeVectorShuffleLengths(MI); } if (TypeIdx != 0) @@ -5218,9 +5242,7 @@ LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx, LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register Src1 = MI.getOperand(1).getReg(); - Register Src2 = MI.getOperand(2).getReg(); + auto [DstReg, Src1, Src2] = MI.getFirst3Regs(); LLT Ty = MRI.getType(DstReg); if (Ty.isVector()) @@ -5471,8 +5493,7 @@ LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 0) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); + auto [DstReg, SrcReg] = MI.getFirst2Regs(); LLT DstTy = MRI.getType(DstReg); if (DstTy.isVector()) @@ -5539,10 +5560,7 @@ LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 1) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned NarrowSize = NarrowTy.getSizeInBits(); if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { @@ -5575,10 +5593,7 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 1) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned NarrowSize = NarrowTy.getSizeInBits(); if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { @@ -5611,9 +5626,7 @@ LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, if (TypeIdx != 1) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned NarrowSize = NarrowTy.getSizeInBits(); if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) { 
@@ -5631,6 +5644,31 @@ LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, } LegalizerHelper::LegalizeResult +LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy) { + if (TypeIdx != 1) + return UnableToLegalize; + + MachineIRBuilder &B = MIRBuilder; + Register ExpReg = MI.getOperand(2).getReg(); + LLT ExpTy = MRI.getType(ExpReg); + + unsigned ClampSize = NarrowTy.getScalarSizeInBits(); + + // Clamp the exponent to the range of the target type. + auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize)); + auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp); + auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize)); + auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp); + + auto Trunc = B.buildTrunc(NarrowTy, Clamp); + Observer.changingInstr(MI); + MI.getOperand(2).setReg(Trunc.getReg(0)); + Observer.changedInstr(MI); + return Legalized; +} + +LegalizerHelper::LegalizeResult LegalizerHelper::lowerBitCount(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); const auto &TII = MIRBuilder.getTII(); @@ -5649,10 +5687,7 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { return Legalized; } case TargetOpcode::G_CTLZ: { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Len = SrcTy.getSizeInBits(); if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) { @@ -5699,10 +5734,7 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { return Legalized; } case TargetOpcode::G_CTTZ: { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Len = SrcTy.getSizeInBits(); if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) { @@ -5808,10 +5840,7 @@ static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI, LegalizerHelper::LegalizeResult LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register X = MI.getOperand(1).getReg(); - Register Y = MI.getOperand(2).getReg(); - Register Z = MI.getOperand(3).getReg(); + auto [Dst, X, Y, Z] = MI.getFirst4Regs(); LLT Ty = MRI.getType(Dst); LLT ShTy = MRI.getType(Z); @@ -5850,10 +5879,7 @@ LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register X = MI.getOperand(1).getReg(); - Register Y = MI.getOperand(2).getReg(); - Register Z = MI.getOperand(3).getReg(); + auto [Dst, X, Y, Z] = MI.getFirst4Regs(); LLT Ty = MRI.getType(Dst); LLT ShTy = MRI.getType(Z); @@ -5932,10 +5958,7 @@ LegalizerHelper::lowerFunnelShift(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Amt = MI.getOperand(2).getReg(); - LLT AmtTy = MRI.getType(Amt); + auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs(); auto Zero = MIRBuilder.buildConstant(AmtTy, 0); bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL; unsigned RevRot = IsLeft ? 
TargetOpcode::G_ROTR : TargetOpcode::G_ROTL; @@ -5946,12 +5969,7 @@ LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Amt = MI.getOperand(2).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); - LLT AmtTy = MRI.getType(Amt); + auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs(); unsigned EltSizeInBits = DstTy.getScalarSizeInBits(); bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL; @@ -6021,8 +6039,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) { // representation. LegalizerHelper::LegalizeResult LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); + auto [Dst, Src] = MI.getFirst2Regs(); const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); const LLT S1 = LLT::scalar(1); @@ -6077,10 +6094,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); if (SrcTy == LLT::scalar(1)) { auto True = MIRBuilder.buildFConstant(DstTy, 1.0); @@ -6105,10 +6119,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); @@ -6151,10 +6162,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); @@ -6194,10 +6202,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); const LLT S64 = LLT::scalar(64); const LLT S32 = LLT::scalar(32); @@ -6263,17 +6268,27 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) { // f64 -> f16 conversion using round-to-nearest-even rounding mode. LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + + auto [Dst, Src] = MI.getFirst2Regs(); + assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) && + MRI.getType(Src).getScalarType() == LLT::scalar(64)); if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
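Two identities back the rotate lowerings above: a rotate can be built from two shifts and an OR, and G_ROTL can be rewritten as G_ROTR by a negated amount. A plain C++ sketch for a 32-bit value, assuming the amount is reduced modulo the bit width:

    #include <cstdint>
    #include <cstdio>

    // rotr via shifts; the amount is reduced mod 32 so both shifts stay in
    // range (shifting a u32 by 32 would be undefined behavior).
    uint32_t rotrViaShifts(uint32_t X, unsigned Amt) {
      Amt &= 31;
      return Amt == 0 ? X : (X >> Amt) | (X << (32 - Amt));
    }

    // The reverse-rotate identity: rotl(x, a) == rotr(x, -a mod 32).
    uint32_t rotlViaRotr(uint32_t X, unsigned Amt) {
      return rotrViaShifts(X, 0u - Amt);
    }

    int main() {
      std::printf("%08x\n", rotlViaRotr(0x12345678u, 8)); // 34567812
    }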
return UnableToLegalize; + if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { + unsigned Flags = MI.getFlags(); + auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); + MIRBuilder.buildFPTrunc(Dst, Src32, Flags); + MI.eraseFromParent(); + return Legalized; + } + const unsigned ExpMask = 0x7ff; const unsigned ExpBiasf64 = 1023; const unsigned ExpBiasf16 = 15; - const LLT S32 = LLT::scalar(32); - const LLT S1 = LLT::scalar(1); auto Unmerge = MIRBuilder.buildUnmerge(S32, Src); Register U = Unmerge.getReg(0); @@ -6368,11 +6383,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); + auto [DstTy, SrcTy] = MI.getFirst2LLTs(); const LLT S64 = LLT::scalar(64); const LLT S16 = LLT::scalar(16); @@ -6385,9 +6396,7 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) { // TODO: If RHS is a constant SelectionDAGBuilder expands this into a // multiplication tree. LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); + auto [Dst, Src0, Src1] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Dst); auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1); @@ -6412,9 +6421,7 @@ static CmpInst::Predicate minMaxToCompare(unsigned Opc) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); + auto [Dst, Src0, Src1] = MI.getFirst3Regs(); const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode()); LLT CmpType = MRI.getType(Dst).changeElementSize(1); @@ -6428,13 +6435,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerFCopySign(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); - - const LLT Src0Ty = MRI.getType(Src0); - const LLT Src1Ty = MRI.getType(Src1); - + auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs(); const int Src0Size = Src0Ty.getScalarSizeInBits(); const int Src1Size = Src1Ty.getScalarSizeInBits(); @@ -6475,9 +6476,7 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) { unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ? 
TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE; - Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); + auto [Dst, Src0, Src1] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Dst); if (!MI.getFlag(MachineInstr::FmNoNans)) { @@ -6516,8 +6515,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); - Register X = MI.getOperand(1).getReg(); + auto [DstReg, X] = MI.getFirst2Regs(); const unsigned Flags = MI.getFlags(); const LLT Ty = MRI.getType(DstReg); const LLT CondTy = Ty.changeElementSize(1); @@ -6547,10 +6545,8 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) { return Legalized; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerFFloor(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); +LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) { + auto [DstReg, SrcReg] = MI.getFirst2Regs(); unsigned Flags = MI.getFlags(); LLT Ty = MRI.getType(DstReg); const LLT CondTy = Ty.changeElementSize(1); @@ -6577,11 +6573,8 @@ LegalizerHelper::lowerFFloor(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerMergeValues(MachineInstr &MI) { const unsigned NumOps = MI.getNumOperands(); - Register DstReg = MI.getOperand(0).getReg(); - Register Src0Reg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(Src0Reg); - unsigned PartSize = SrcTy.getSizeInBits(); + auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs(); + unsigned PartSize = Src0Ty.getSizeInBits(); LLT WideTy = LLT::scalar(DstTy.getSizeInBits()); Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0); @@ -6729,11 +6722,8 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); - Register Src0Reg = MI.getOperand(1).getReg(); - Register Src1Reg = MI.getOperand(2).getReg(); - LLT Src0Ty = MRI.getType(Src0Reg); - LLT DstTy = MRI.getType(DstReg); + auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] = + MI.getFirst3RegLLTs(); LLT IdxTy = LLT::scalar(32); ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask(); @@ -6822,13 +6812,9 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerExtract(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); unsigned Offset = MI.getOperand(2).getImm(); - LLT DstTy = MRI.getType(Dst); - LLT SrcTy = MRI.getType(Src); - // Extract sub-vector or one element if (SrcTy.isVector()) { unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); @@ -6837,7 +6823,7 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) && (Offset + DstSize <= SrcTy.getSizeInBits())) { // Unmerge and allow access to each Src element for the artifact combiner. - auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src); + auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg); // Take element(s) we need to extract and copy it (merge them). 
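The scalar tail of lowerExtract above reduces G_EXTRACT to a logical shift right by the bit offset followed by a truncate. The same computation on plain integers:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // The scalar G_EXTRACT path above: shift the source right by the bit
      // offset, then truncate to the destination width.
      uint64_t Src = 0xAABBCCDD11223344ull;
      unsigned Offset = 32;                               // bits [32, 63]
      uint32_t Dst = static_cast<uint32_t>(Src >> Offset); // trunc(lshr src)
      std::printf("%08x\n", Dst);                          // aabbccdd
    }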
SmallVector<Register, 8> SubVectorElts; @@ -6846,9 +6832,9 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { SubVectorElts.push_back(Unmerge.getReg(Idx)); } if (SubVectorElts.size() == 1) - MIRBuilder.buildCopy(Dst, SubVectorElts[0]); + MIRBuilder.buildCopy(DstReg, SubVectorElts[0]); else - MIRBuilder.buildMergeLikeInstr(Dst, SubVectorElts); + MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts); MI.eraseFromParent(); return Legalized; @@ -6861,15 +6847,15 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { LLT SrcIntTy = SrcTy; if (!SrcTy.isScalar()) { SrcIntTy = LLT::scalar(SrcTy.getSizeInBits()); - Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0); + SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0); } if (Offset == 0) - MIRBuilder.buildTrunc(Dst, Src); + MIRBuilder.buildTrunc(DstReg, SrcReg); else { auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset); - auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt); - MIRBuilder.buildTrunc(Dst, Shr); + auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt); + MIRBuilder.buildTrunc(DstReg, Shr); } MI.eraseFromParent(); @@ -6880,9 +6866,7 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { } LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register InsertSrc = MI.getOperand(2).getReg(); + auto [Dst, Src, InsertSrc] = MI.getFirst3Regs(); uint64_t Offset = MI.getOperand(3).getImm(); LLT DstTy = MRI.getType(Src); @@ -6972,14 +6956,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { - Register Dst0 = MI.getOperand(0).getReg(); - Register Dst1 = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); + auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] = + MI.getFirst4RegLLTs(); const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO; - LLT Ty = MRI.getType(Dst0); - LLT BoolTy = MRI.getType(Dst1); + LLT Ty = Dst0Ty; + LLT BoolTy = Dst1Ty; if (IsAdd) MIRBuilder.buildAdd(Dst0, LHS, RHS); @@ -7008,9 +6990,7 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) { - Register Res = MI.getOperand(0).getReg(); - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + auto [Res, LHS, RHS] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Res); bool IsSigned; bool IsAdd; @@ -7085,9 +7065,7 @@ LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) { - Register Res = MI.getOperand(0).getReg(); - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + auto [Res, LHS, RHS] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Res); LLT BoolTy = Ty.changeElementSize(1); bool IsSigned; @@ -7157,9 +7135,7 @@ LegalizerHelper::lowerShlSat(MachineInstr &MI) { MI.getOpcode() == TargetOpcode::G_USHLSAT) && "Expected shlsat opcode!"); bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT; - Register Res = MI.getOperand(0).getReg(); - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + auto [Res, LHS, RHS] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Res); LLT BoolTy = Ty.changeElementSize(1); @@ -7185,10 +7161,8 @@ LegalizerHelper::lowerShlSat(MachineInstr &MI) { 
return Legalized; } -LegalizerHelper::LegalizeResult -LegalizerHelper::lowerBswap(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); +LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) { + auto [Dst, Src] = MI.getFirst2Regs(); const LLT Ty = MRI.getType(Src); unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8; unsigned BaseShiftAmt = (SizeInBytes - 1) * 8; @@ -7233,8 +7207,7 @@ static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B, LegalizerHelper::LegalizeResult LegalizerHelper::lowerBitreverse(MachineInstr &MI) { - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); + auto [Dst, Src] = MI.getFirst2Regs(); const LLT Ty = MRI.getType(Src); unsigned Size = Ty.getSizeInBits(); @@ -7312,23 +7285,23 @@ LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - uint64_t Mask = MI.getOperand(2).getImm(); + auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs(); + FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm()); - if (Mask == 0) { + if (Mask == fcNone) { MIRBuilder.buildConstant(DstReg, 0); MI.eraseFromParent(); return Legalized; } - if ((Mask & fcAllFlags) == fcAllFlags) { + if (Mask == fcAllFlags) { MIRBuilder.buildConstant(DstReg, 1); MI.eraseFromParent(); return Legalized; } + // TODO: Try inverting the test with getInvertedFPClassTest like the DAG + // version + unsigned BitSize = SrcTy.getScalarSizeInBits(); const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType()); @@ -7345,7 +7318,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; APInt QNaNBitMask = APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); - APInt InvertionMask = APInt::getAllOnesValue(DstTy.getScalarSizeInBits()); + APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits()); auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit); auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask); @@ -7358,8 +7331,10 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs); auto Res = MIRBuilder.buildConstant(DstTy, 0); + // Clang doesn't support capture of structured bindings: + LLT DstTyCopy = DstTy; const auto appendToRes = [&](MachineInstrBuilder ToAppend) { - Res = MIRBuilder.buildOr(DstTy, Res, ToAppend); + Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend); }; // Tests that involve more than one class should be processed first. @@ -7382,8 +7357,20 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { Mask &= ~fcNegFinite; } + if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + // TODO: Handle inverted case + if (PartialCheck == (fcZero | fcSubnormal)) { + auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC); + appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, + ExpBits, ZeroC)); + Mask &= ~PartialCheck; + } + } + // Check for individual classes. 
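The new fcZero | fcSubnormal fast path above tests a single condition: a value is zero or subnormal exactly when every exponent bit is clear. For IEEE f32 that is one mask-and-compare:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    bool isZeroOrSubnormal(float V) {
      // The fcZero|fcSubnormal fast path above: a float is zero or
      // subnormal exactly when all of its exponent bits are 0.
      uint32_t Bits = std::bit_cast<uint32_t>(V);
      return (Bits & 0x7F800000u) == 0; // f32 exponent mask
    }

    int main() {
      std::printf("%d %d %d %d\n", isZeroOrSubnormal(0.0f),
                  isZeroOrSubnormal(-0.0f), isZeroOrSubnormal(1e-39f),
                  isZeroOrSubnormal(1.0f)); // 1 1 1 0
    }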
- if (unsigned PartialCheck = Mask & fcZero) { + if (FPClassTest PartialCheck = Mask & fcZero) { if (PartialCheck == fcPosZero) appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, AsInt, ZeroC)); @@ -7395,7 +7382,21 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { AsInt, SignBitC)); } - if (unsigned PartialCheck = Mask & fcInf) { + if (FPClassTest PartialCheck = Mask & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) + auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs; + auto OneC = MIRBuilder.buildConstant(IntTy, 1); + auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); + auto SubnormalRes = + MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, + MIRBuilder.buildConstant(IntTy, AllOneMantissa)); + if (PartialCheck == fcNegSubnormal) + SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); + appendToRes(SubnormalRes); + } + + if (FPClassTest PartialCheck = Mask & fcInf) { if (PartialCheck == fcPosInf) appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, AsInt, InfC)); @@ -7410,7 +7411,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { } } - if (unsigned PartialCheck = Mask & fcNan) { + if (FPClassTest PartialCheck = Mask & fcNan) { auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask); if (PartialCheck == fcNan) { // isnan(V) ==> abs(V) u> int(inf) @@ -7431,21 +7432,7 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { } } - if (unsigned PartialCheck = Mask & fcSubnormal) { - // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set) - // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set) - auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs; - auto OneC = MIRBuilder.buildConstant(IntTy, 1); - auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC); - auto SubnormalRes = - MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne, - MIRBuilder.buildConstant(IntTy, AllOneMantissa)); - if (PartialCheck == fcNegSubnormal) - SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign); - appendToRes(SubnormalRes); - } - - if (unsigned PartialCheck = Mask & fcNormal) { + if (FPClassTest PartialCheck = Mask & fcNormal) { // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u< // (max_exp-1)) APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); @@ -7472,12 +7459,8 @@ LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) { // Implement vector G_SELECT in terms of XOR, AND, OR. - Register DstReg = MI.getOperand(0).getReg(); - Register MaskReg = MI.getOperand(1).getReg(); - Register Op1Reg = MI.getOperand(2).getReg(); - Register Op2Reg = MI.getOperand(3).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT MaskTy = MRI.getType(MaskReg); + auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] = + MI.getFirst4RegLLTs(); if (!DstTy.isVector()) return UnableToLegalize; @@ -7591,7 +7574,7 @@ LegalizerHelper::lowerVectorReduction(MachineInstr &MI) { Observer.changedInstr(MI); return Legalized; } - return UnableToLegalize;; + return UnableToLegalize; } static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { @@ -7638,7 +7621,7 @@ static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps, // SDAGisms map cleanly to GISel concepts. if (NewTy.isVector()) NewTy = NewTy.getSizeInBits() > 64 ? 
LLT::scalar(64) : LLT::scalar(32); - NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1)); + NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1)); unsigned NewTySize = NewTy.getSizeInBytes(); assert(NewTySize > 0 && "Could not find appropriate type"); @@ -7826,9 +7809,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE); - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Len = MI.getOperand(2).getReg(); + auto [Dst, Src, Len] = MI.getFirst3Regs(); const auto *MMOIt = MI.memoperands_begin(); const MachineMemOperand *MemOp = *MMOIt; @@ -8091,9 +8072,7 @@ LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) { Align DstAlign = MemOp->getBaseAlign(); Align SrcAlign; - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register Len = MI.getOperand(2).getReg(); + auto [Dst, Src, Len] = MI.getFirst3Regs(); if (Opc != TargetOpcode::G_MEMSET) { assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 4b6c3a156709..1f2e481c63e0 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -21,7 +22,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include <algorithm> using namespace llvm; diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 7c6eac8c8ce0..49f40495d6fc 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -10,6 +10,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" @@ -18,7 +20,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -305,7 +307,7 @@ bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) { const auto &DL = MF->getFunction().getParent()->getDataLayout(); bool AnyMerged = false; do { - unsigned NumPow2 = PowerOf2Floor(StoresToMerge.size()); + unsigned NumPow2 = llvm::bit_floor(StoresToMerge.size()); unsigned MaxSizeBits = NumPow2 * OrigTy.getSizeInBits().getFixedValue(); // Compute the biggest store we can generate to handle the number of stores. 
unsigned MergeSizeBits; @@ -400,7 +402,9 @@ bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) { auto NewStore = Builder.buildStore(WideReg, FirstStore->getPointerReg(), *WideMMO); (void) NewStore; - LLVM_DEBUG(dbgs() << "Created merged store: " << *NewStore); + LLVM_DEBUG(dbgs() << "Merged " << Stores.size() + << " stores into merged store: " << *NewStore); + LLVM_DEBUG(for (auto *MI : Stores) dbgs() << " " << *MI;); NumStoresMerged += Stores.size(); MachineOptimizationRemarkEmitter MORE(*MF, nullptr); @@ -445,20 +449,19 @@ bool LoadStoreOpt::processMergeCandidate(StoreMergeCandidate &C) { for (auto AliasInfo : reverse(C.PotentialAliases)) { MachineInstr *PotentialAliasOp = AliasInfo.first; unsigned PreCheckedIdx = AliasInfo.second; - if (static_cast<unsigned>(Idx) > PreCheckedIdx) { - // Need to check this alias. - if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI, - AA)) { - LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp - << " detected\n"); - return true; - } - } else { + if (static_cast<unsigned>(Idx) < PreCheckedIdx) { // Once our store index is lower than the index associated with the // potential alias, we know that we've already checked for this alias // and all of the earlier potential aliases too. return false; } + // Need to check this alias. + if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI, + AA)) { + LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp + << " detected\n"); + return true; + } } return false; }; @@ -616,11 +619,304 @@ bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) { return Changed; } +/// Check if the store \p Store is a truncstore that can be merged. That is, +/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty +/// Register then it does not need to match and SrcVal is set to the source +/// value found. +/// On match, returns the start byte offset of the \p SrcVal that is being +/// stored. +static std::optional<int64_t> +getTruncStoreByteOffset(GStore &Store, Register &SrcVal, + MachineRegisterInfo &MRI) { + Register TruncVal; + if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal)))) + return std::nullopt; + + // The shift amount must be a constant multiple of the narrow type. + // It is translated to the offset address in the wide source value "y". + // + // x = G_LSHR y, ShiftAmtC + // s8 z = G_TRUNC x + // store z, ... + Register FoundSrcVal; + int64_t ShiftAmt; + if (!mi_match(TruncVal, MRI, + m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)), + m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) { + if (!SrcVal.isValid() || TruncVal == SrcVal) { + if (!SrcVal.isValid()) + SrcVal = TruncVal; + return 0; // If it's the lowest index store. + } + return std::nullopt; + } + + unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits(); + if (ShiftAmt % NarrowBits != 0) + return std::nullopt; + const unsigned Offset = ShiftAmt / NarrowBits; + + if (SrcVal.isValid() && FoundSrcVal != SrcVal) + return std::nullopt; + + if (!SrcVal.isValid()) + SrcVal = FoundSrcVal; + else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal)) + return std::nullopt; + return Offset; +} + +/// Match a pattern where a wide type scalar value is stored by several narrow +/// stores. Fold it into a single store or a BSWAP and a store if the targets +/// supports it. +/// +/// Assuming little endian target: +/// i8 *p = ... +/// i32 val = ... 
+/// p[0] = (val >> 0) & 0xFF; +/// p[1] = (val >> 8) & 0xFF; +/// p[2] = (val >> 16) & 0xFF; +/// p[3] = (val >> 24) & 0xFF; +/// => +/// *((i32)p) = val; +/// +/// i8 *p = ... +/// i32 val = ... +/// p[0] = (val >> 24) & 0xFF; +/// p[1] = (val >> 16) & 0xFF; +/// p[2] = (val >> 8) & 0xFF; +/// p[3] = (val >> 0) & 0xFF; +/// => +/// *((i32)p) = BSWAP(val); +bool LoadStoreOpt::mergeTruncStore(GStore &StoreMI, + SmallPtrSetImpl<GStore *> &DeletedStores) { + LLT MemTy = StoreMI.getMMO().getMemoryType(); + + // We only handle merging simple stores of 1-4 bytes. + if (!MemTy.isScalar()) + return false; + switch (MemTy.getSizeInBits()) { + case 8: + case 16: + case 32: + break; + default: + return false; + } + if (!StoreMI.isSimple()) + return false; + + // We do a simple search for mergeable stores prior to this one. + // Any potential alias hazard along the way terminates the search. + SmallVector<GStore *> FoundStores; + + // We're looking for: + // 1) a (store(trunc(...))) + // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get + // the partial value stored. + // 3) where the offsets form either a little or big-endian sequence. + + auto &LastStore = StoreMI; + + // The single base pointer that all stores must use. + Register BaseReg; + int64_t LastOffset; + if (!mi_match(LastStore.getPointerReg(), *MRI, + m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) { + BaseReg = LastStore.getPointerReg(); + LastOffset = 0; + } + + GStore *LowestIdxStore = &LastStore; + int64_t LowestIdxOffset = LastOffset; + + Register WideSrcVal; + auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, *MRI); + if (!LowestShiftAmt) + return false; // Didn't match a trunc. + assert(WideSrcVal.isValid()); + + LLT WideStoreTy = MRI->getType(WideSrcVal); + // The wide type might not be a multiple of the memory type, e.g. s48 and s32. + if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0) + return false; + const unsigned NumStoresRequired = + WideStoreTy.getSizeInBits() / MemTy.getSizeInBits(); + + SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX); + OffsetMap[*LowestShiftAmt] = LastOffset; + FoundStores.emplace_back(&LastStore); + + const int MaxInstsToCheck = 10; + int NumInstsChecked = 0; + for (auto II = ++LastStore.getReverseIterator(); + II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck; + ++II) { + NumInstsChecked++; + GStore *NewStore; + if ((NewStore = dyn_cast<GStore>(&*II))) { + if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple()) + break; + } else if (II->isLoadFoldBarrier() || II->mayLoad()) { + break; + } else { + continue; // This is a safe instruction we can look past. + } + + Register NewBaseReg; + int64_t MemOffset; + // Check we're storing to the same base + some offset. + if (!mi_match(NewStore->getPointerReg(), *MRI, + m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) { + NewBaseReg = NewStore->getPointerReg(); + MemOffset = 0; + } + if (BaseReg != NewBaseReg) + break; + + auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, *MRI); + if (!ShiftByteOffset) + break; + if (MemOffset < LowestIdxOffset) { + LowestIdxOffset = MemOffset; + LowestIdxStore = NewStore; + } + + // Map the offset in the store and the offset in the combined value, and + // early return if it has been set before. 
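Concretely, getTruncStoreByteOffset accepts stores of the form trunc(lshr(Val, K)) with K a multiple of the narrow width, and K divided by that width gives the element index into the wide value. Here is the source-level pattern from the doc comment above next to its merged form, as an executable check (little-endian host assumed):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint32_t Val = 0x11223344u;
      unsigned char P[4];

      // The pattern mergeTruncStore folds: each narrow store keeps
      // trunc(Val >> (8*i)), i.e. ascending little-endian byte offsets.
      for (unsigned I = 0; I != 4; ++I)
        P[I] = static_cast<unsigned char>(Val >> (8 * I));

      // The merged form: one wide store (little-endian host).
      unsigned char Q[4];
      std::memcpy(Q, &Val, 4);

      std::printf("%d\n", std::memcmp(P, Q, 4) == 0); // 1
    }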
+ if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired || + OffsetMap[*ShiftByteOffset] != INT64_MAX) + break; + OffsetMap[*ShiftByteOffset] = MemOffset; + + FoundStores.emplace_back(NewStore); + // Reset counter since we've found a matching inst. + NumInstsChecked = 0; + if (FoundStores.size() == NumStoresRequired) + break; + } + + if (FoundStores.size() != NumStoresRequired) { + if (FoundStores.size() == 1) + return false; + // We didn't find enough stores to merge into the size of the original + // source value, but we may be able to generate a smaller store if we + // truncate the source value. + WideStoreTy = LLT::scalar(FoundStores.size() * MemTy.getScalarSizeInBits()); + } + + unsigned NumStoresFound = FoundStores.size(); + + const auto &DL = LastStore.getMF()->getDataLayout(); + auto &C = LastStore.getMF()->getFunction().getContext(); + // Check that a store of the wide type is both allowed and fast on the target + unsigned Fast = 0; + bool Allowed = TLI->allowsMemoryAccess( + C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast); + if (!Allowed || !Fast) + return false; + + // Check if the pieces of the value are going to the expected places in memory + // to merge the stores. + unsigned NarrowBits = MemTy.getScalarSizeInBits(); + auto checkOffsets = [&](bool MatchLittleEndian) { + if (MatchLittleEndian) { + for (unsigned i = 0; i != NumStoresFound; ++i) + if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset) + return false; + } else { // MatchBigEndian by reversing loop counter. + for (unsigned i = 0, j = NumStoresFound - 1; i != NumStoresFound; + ++i, --j) + if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset) + return false; + } + return true; + }; + + // Check if the offsets line up for the native data layout of this target. + bool NeedBswap = false; + bool NeedRotate = false; + if (!checkOffsets(DL.isLittleEndian())) { + // Special-case: check if byte offsets line up for the opposite endian. + if (NarrowBits == 8 && checkOffsets(DL.isBigEndian())) + NeedBswap = true; + else if (NumStoresFound == 2 && checkOffsets(DL.isBigEndian())) + NeedRotate = true; + else + return false; + } + + if (NeedBswap && + !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}, *MF)) + return false; + if (NeedRotate && + !isLegalOrBeforeLegalizer( + {TargetOpcode::G_ROTR, {WideStoreTy, WideStoreTy}}, *MF)) + return false; + + Builder.setInstrAndDebugLoc(StoreMI); + + if (WideStoreTy != MRI->getType(WideSrcVal)) + WideSrcVal = Builder.buildTrunc(WideStoreTy, WideSrcVal).getReg(0); + + if (NeedBswap) { + WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0); + } else if (NeedRotate) { + assert(WideStoreTy.getSizeInBits() % 2 == 0 && + "Unexpected type for rotate"); + auto RotAmt = + Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2); + WideSrcVal = + Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0); + } + + Builder.buildStore(WideSrcVal, LowestIdxStore->getPointerReg(), + LowestIdxStore->getMMO().getPointerInfo(), + LowestIdxStore->getMMO().getAlign()); + + // Erase the old stores. + for (auto *ST : FoundStores) { + ST->eraseFromParent(); + DeletedStores.insert(ST); + } + return true; +} + +bool LoadStoreOpt::mergeTruncStoresBlock(MachineBasicBlock &BB) { + bool Changed = false; + SmallVector<GStore *, 16> Stores; + SmallPtrSet<GStore *, 8> DeletedStores; + // Walk up the block so we can see the most eligible stores. 
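When checkOffsets only matches with the opposite endianness and the pieces are bytes, the code above inserts a G_BSWAP: storing the bytes of bswap(Val) in ascending order produces the same memory image the reversed byte stores did. A host-level check (little-endian host assumed):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Plain byte-swap of a u32; std::byteswap is C++23, so it is spelled
    // out here for portability.
    static uint32_t bswap32(uint32_t V) {
      return (V >> 24) | ((V >> 8) & 0xFF00u) | ((V << 8) & 0xFF0000u) |
             (V << 24);
    }

    int main() {
      uint32_t Val = 0x11223344u;
      unsigned char P[4], Q[4];
      // Reversed byte stores, the pattern the opposite-endian check matches.
      for (unsigned I = 0; I != 4; ++I)
        P[I] = static_cast<unsigned char>(Val >> (8 * (3 - I)));

      uint32_t Swapped = bswap32(Val);
      std::memcpy(Q, &Swapped, 4); // one wide store of bswap(Val)
      std::printf("%d\n", std::memcmp(P, Q, 4) == 0); // 1 on little-endian
    }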
+ for (MachineInstr &MI : llvm::reverse(BB)) + if (auto *StoreMI = dyn_cast<GStore>(&MI)) + Stores.emplace_back(StoreMI); + + for (auto *StoreMI : Stores) { + if (DeletedStores.count(StoreMI)) + continue; + if (mergeTruncStore(*StoreMI, DeletedStores)) + Changed = true; + } + return Changed; +} + bool LoadStoreOpt::mergeFunctionStores(MachineFunction &MF) { bool Changed = false; - for (auto &BB : MF) { + for (auto &BB : MF){ Changed |= mergeBlockStores(BB); + Changed |= mergeTruncStoresBlock(BB); + } + + // Erase all dead instructions left over by the merging. + if (Changed) { + for (auto &BB : MF) { + for (auto &I : make_early_inc_range(make_range(BB.rbegin(), BB.rend()))) { + if (isTriviallyDead(I, *MRI)) + I.eraseFromParent(); + } + } } + return Changed; } diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp index bf4dcc2c2459..55984423e5bc 100644 --- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -54,7 +54,7 @@ bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, MachineInstr &MIUse = *MOUse.getParent(); InsertMBB = MIUse.getParent(); if (MIUse.isPHI()) - InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB(); + InsertMBB = MIUse.getOperand(MOUse.getOperandNo() + 1).getMBB(); return InsertMBB == Def.getParent(); } @@ -99,7 +99,7 @@ bool Localizer::localizeInterBlock(MachineFunction &MF, MachineBasicBlock *InsertMBB; LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent(); dbgs() << "Checking use: " << MIUse - << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + << " #Opd: " << MOUse.getOperandNo() << '\n'); if (isLocalUse(MOUse, MI, InsertMBB)) { // Even if we're in the same block, if the block is very large we could // still have many long live ranges. 
Try to do intra-block localization diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 9100e064f30f..962b54ec5d6b 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -80,11 +80,11 @@ MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI, assert( cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) && "Expected inlined-at fields to agree"); - return buildInstr(TargetOpcode::DBG_VALUE) - .addFrameIndex(FI) - .addImm(0) - .addMetadata(Variable) - .addMetadata(Expr); + return insertInstr(buildInstrNoInsert(TargetOpcode::DBG_VALUE) + .addFrameIndex(FI) + .addImm(0) + .addMetadata(Variable) + .addMetadata(Expr)); } MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, @@ -164,6 +164,15 @@ MachineInstrBuilder MachineIRBuilder::buildGlobalValue(const DstOp &Res, return MIB; } +MachineInstrBuilder MachineIRBuilder::buildConstantPool(const DstOp &Res, + unsigned Idx) { + assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type"); + auto MIB = buildInstr(TargetOpcode::G_CONSTANT_POOL); + Res.addDefToMIB(*getMRI(), MIB); + MIB.addConstantPoolIndex(Idx); + return MIB; +} + MachineInstrBuilder MachineIRBuilder::buildJumpTable(const LLT PtrTy, unsigned JTI) { return buildInstr(TargetOpcode::G_JUMP_TABLE, {PtrTy}, {}) @@ -229,17 +238,25 @@ MachineIRBuilder::buildPadVectorWithUndefElements(const DstOp &Res, LLT ResTy = Res.getLLTTy(*getMRI()); LLT Op0Ty = Op0.getLLTTy(*getMRI()); - assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); - assert((ResTy.getElementType() == Op0Ty.getElementType()) && - "Different vector element types"); - assert((ResTy.getNumElements() > Op0Ty.getNumElements()) && - "Op0 has more elements"); + assert(ResTy.isVector() && "Res non vector type"); - auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); SmallVector<Register, 8> Regs; - for (auto Op : Unmerge.getInstr()->defs()) - Regs.push_back(Op.getReg()); - Register Undef = buildUndef(Op0Ty.getElementType()).getReg(0); + if (Op0Ty.isVector()) { + assert((ResTy.getElementType() == Op0Ty.getElementType()) && + "Different vector element types"); + assert((ResTy.getNumElements() > Op0Ty.getNumElements()) && + "Op0 has more elements"); + auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + + for (auto Op : Unmerge.getInstr()->defs()) + Regs.push_back(Op.getReg()); + } else { + assert((ResTy.getSizeInBits() > Op0Ty.getSizeInBits()) && + "Op0 has more size"); + Regs.push_back(Op0.getReg()); + } + Register Undef = + buildUndef(Op0Ty.isVector() ? 
Op0Ty.getElementType() : Op0Ty).getReg(0); unsigned NumberOfPadElts = ResTy.getNumElements() - Regs.size(); for (unsigned i = 0; i < NumberOfPadElts; ++i) Regs.push_back(Undef); diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 080f3ca540f2..885a1056b2ea 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -69,8 +69,8 @@ INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) -RegBankSelect::RegBankSelect(Mode RunningMode) - : MachineFunctionPass(ID), OptMode(RunningMode) { +RegBankSelect::RegBankSelect(char &PassID, Mode RunningMode) + : MachineFunctionPass(PassID), OptMode(RunningMode) { if (RegBankSelectMode.getNumOccurrences() != 0) { OptMode = RegBankSelectMode; if (RegBankSelectMode != RunningMode) @@ -162,8 +162,10 @@ bool RegBankSelect::repairReg( MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY) .addDef(Dst) .addUse(Src); - LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << " to: " << printReg(Dst) - << '\n'); + LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << ':' + << printRegClassOrBank(Src, *MRI, TRI) + << " to: " << printReg(Dst) << ':' + << printRegClassOrBank(Dst, *MRI, TRI) << '\n'); } else { // TODO: Support with G_IMPLICIT_DEF + G_INSERT sequence or G_EXTRACT // sequence. diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 07448548c295..080600d3cc98 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -230,10 +230,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI, return false; // Instructions without side-effects are dead iff they only define dead vregs. - for (const auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - + for (const auto &MO : MI.all_defs()) { Register Reg = MO.getReg(); if (Reg.isPhysical() || !MRI.use_nodbg_empty(Reg)) return false; @@ -711,14 +708,14 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, Align llvm::inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO) { - auto PSV = MPO.V.dyn_cast<const PseudoSourceValue *>(); + auto PSV = dyn_cast_if_present<const PseudoSourceValue *>(MPO.V); if (auto FSPV = dyn_cast_or_null<FixedStackPseudoSourceValue>(PSV)) { MachineFrameInfo &MFI = MF.getFrameInfo(); return commonAlignment(MFI.getObjectAlign(FSPV->getFrameIndex()), MPO.Offset); } - if (const Value *V = MPO.V.dyn_cast<const Value *>()) { + if (const Value *V = dyn_cast_if_present<const Value *>(MPO.V)) { const Module *M = MF.getFunction().getParent(); return V->getPointerAlignment(M->getDataLayout()); } @@ -797,7 +794,7 @@ llvm::ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI) { auto MaybeCst = getIConstantVRegVal(R, MRI); if (!MaybeCst) return std::nullopt; - return MaybeCst->countLeadingZeros(); + return MaybeCst->countl_zero(); }; if (Ty.isVector()) { // Try to constant fold each element. 
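The ConstantFoldCTLZ change closing the hunk above is one instance of a tree-wide rename in this import: APInt's countLeadingZeros family became countl_zero and friends, matching the C++20 <bit> spelling. On plain integers the standard function does the same job, which makes for a compact illustration of the fold (self-contained C++20, no LLVM types):

    #include <bit>
    #include <cstdint>
    #include <iostream>

    // Constant-folding a CTLZ of a known value, as ConstantFoldCTLZ does
    // above, but on a uint32_t instead of an APInt.
    static unsigned foldCtlz32(uint32_t V) { return std::countl_zero(V); }

    int main() {
      std::cout << foldCtlz32(1) << '\n';           // 31
      std::cout << foldCtlz32(0x80000000u) << '\n'; // 0
      std::cout << foldCtlz32(0) << '\n';           // 32
    }

Like APInt::countl_zero, std::countl_zero is well defined at zero (it returns the full bit width), so the fold needs no special case there.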
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 2ccf2def48f8..f259cbc1d788 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -67,7 +67,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/BasicBlock.h" @@ -92,6 +91,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -156,7 +156,7 @@ namespace { /// Whether we should merge global variables that have external linkage. bool MergeExternalGlobals = false; - bool IsMachO; + bool IsMachO = false; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; @@ -652,6 +652,14 @@ bool GlobalMerge::doInitialization(Module &M) { if (isMustKeepGlobalVariable(&GV)) continue; + // Don't merge tagged globals, as each global should have its own unique + // memory tag at runtime. TODO(hctim): This can be relaxed: constant globals + // with compatible alignment and the same contents may be merged as long as + // the globals occupy the same number of tag granules (i.e. `size_a / 16 == + // size_b / 16`). + if (GV.isTagged()) + continue; + Type *Ty = GV.getValueType(); if (DL.getTypeAllocSize(Ty) < MaxOffset) { if (TM && diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index 258ad1931b12..e7b14d700a44 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -15,8 +15,10 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/HardwareLoops.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -115,12 +117,12 @@ namespace { using TTI = TargetTransformInfo; - class HardwareLoops : public FunctionPass { + class HardwareLoopsLegacy : public FunctionPass { public: static char ID; - HardwareLoops() : FunctionPass(ID) { - initializeHardwareLoopsPass(*PassRegistry::getPassRegistry()); + HardwareLoopsLegacy() : FunctionPass(ID) { + initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -131,29 +133,44 @@ namespace { AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.addPreserved<BranchProbabilityInfoWrapperPass>(); } + }; + + class HardwareLoopsImpl { + public: + HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA, + DominatorTree &DT, const DataLayout &DL, + const TargetTransformInfo &TTI, TargetLibraryInfo *TLI, + AssumptionCache &AC, OptimizationRemarkEmitter *ORE, + HardwareLoopOptions &Opts) + : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI), + TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { } + bool run(Function &F); + + private: // Try to convert the given Loop into a hardware loop. 
- bool TryConvertLoop(Loop *L); + bool TryConvertLoop(Loop *L, LLVMContext &Ctx); // Given that the target believes the loop to be profitable, try to // convert it. bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo); - private: - ScalarEvolution *SE = nullptr; - LoopInfo *LI = nullptr; - const DataLayout *DL = nullptr; - OptimizationRemarkEmitter *ORE = nullptr; - const TargetTransformInfo *TTI = nullptr; - DominatorTree *DT = nullptr; - bool PreserveLCSSA = false; - AssumptionCache *AC = nullptr; - TargetLibraryInfo *LibInfo = nullptr; - Module *M = nullptr; + ScalarEvolution &SE; + LoopInfo &LI; + bool PreserveLCSSA; + DominatorTree &DT; + const DataLayout &DL; + const TargetTransformInfo &TTI; + TargetLibraryInfo *TLI = nullptr; + AssumptionCache &AC; + OptimizationRemarkEmitter *ORE; + HardwareLoopOptions &Opts; bool MadeChange = false; }; @@ -182,8 +199,9 @@ namespace { public: HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE, const DataLayout &DL, - OptimizationRemarkEmitter *ORE) : - SE(SE), DL(DL), ORE(ORE), L(Info.L), M(L->getHeader()->getModule()), + OptimizationRemarkEmitter *ORE, + HardwareLoopOptions &Opts) : + SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()), ExitCount(Info.ExitCount), CountType(Info.CountType), ExitBranch(Info.ExitBranch), @@ -197,6 +215,7 @@ namespace { ScalarEvolution &SE; const DataLayout &DL; OptimizationRemarkEmitter *ORE = nullptr; + HardwareLoopOptions &Opts; Loop *L = nullptr; Module *M = nullptr; const SCEV *ExitCount = nullptr; @@ -209,40 +228,83 @@ namespace { }; } -char HardwareLoops::ID = 0; +char HardwareLoopsLegacy::ID = 0; -bool HardwareLoops::runOnFunction(Function &F) { +bool HardwareLoopsLegacy::runOnFunction(Function &F) { if (skipFunction(F)) return false; LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n"); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - DL = &F.getParent()->getDataLayout(); - ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto &DL = F.getParent()->getDataLayout(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - LibInfo = TLIP ? &TLIP->getTLI(F) : nullptr; - PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - M = F.getParent(); + auto *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + HardwareLoopOptions Opts; + if (ForceHardwareLoops.getNumOccurrences()) + Opts.setForce(ForceHardwareLoops); + if (ForceHardwareLoopPHI.getNumOccurrences()) + Opts.setForcePhi(ForceHardwareLoopPHI); + if (ForceNestedLoop.getNumOccurrences()) + Opts.setForceNested(ForceNestedLoop); + if (ForceGuardLoopEntry.getNumOccurrences()) + Opts.setForceGuard(ForceGuardLoopEntry); + if (LoopDecrement.getNumOccurrences()) + Opts.setDecrement(LoopDecrement); + if (CounterBitWidth.getNumOccurrences()) + Opts.setCounterBitwidth(CounterBitWidth); - for (Loop *L : *LI) - if (L->isOutermost()) - TryConvertLoop(L); + HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE, + Opts); + return Impl.run(F); +} + +PreservedAnalyses HardwareLoopsPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + auto &DL = F.getParent()->getDataLayout(); + + HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts); + bool Changed = Impl.run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<LoopAnalysis>(); + PA.preserve<ScalarEvolutionAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<BranchProbabilityAnalysis>(); + return PA; +} +bool HardwareLoopsImpl::run(Function &F) { + LLVMContext &Ctx = F.getParent()->getContext(); + for (Loop *L : LI) + if (L->isOutermost()) + TryConvertLoop(L, Ctx); return MadeChange; } // Return true if the search should stop, which will be when an inner loop is // converted and the parent loop doesn't support containing a hardware loop. -bool HardwareLoops::TryConvertLoop(Loop *L) { +bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) { // Process nested loops first. bool AnyChanged = false; for (Loop *SL : *L) - AnyChanged |= TryConvertLoop(SL); + AnyChanged |= TryConvertLoop(SL, Ctx); if (AnyChanged) { reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested", ORE, L); @@ -252,39 +314,39 @@ bool HardwareLoops::TryConvertLoop(Loop *L) { LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n"); HardwareLoopInfo HWLoopInfo(L); - if (!HWLoopInfo.canAnalyze(*LI)) { + if (!HWLoopInfo.canAnalyze(LI)) { reportHWLoopFailure("cannot analyze loop, irreducible control flow", "HWLoopCannotAnalyze", ORE, L); return false; } - if (!ForceHardwareLoops && - !TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo)) { + if (!Opts.Force && + !TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) { reportHWLoopFailure("it's not profitable to create a hardware-loop", "HWLoopNotProfitable", ORE, L); return false; } // Allow overriding of the counter width and loop decrement value. 
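The hunk just below routes the counter-width and loop-decrement overrides through the new HardwareLoopOptions bag instead of reading the cl::opt flags directly, and the wrapper above shows the idiom that feeds it: a field is populated only when getNumOccurrences() reports that the flag was actually passed, so an untouched flag leaves the optional empty and the pass default wins. A distilled sketch of that pattern with stand-in types (none of this is the real LLVM API):

    #include <iostream>
    #include <optional>

    // Stand-in for a command-line flag that remembers whether it was set.
    struct Flag {
      int Value = 0;
      int Occurrences = 0;
      int getNumOccurrences() const { return Occurrences; }
      operator int() const { return Value; }
    };

    // Stand-in for HardwareLoopOptions: an empty field means "pass default".
    struct LoopOptions {
      std::optional<int> Decrement;
      LoopOptions &setDecrement(int D) {
        Decrement = D;
        return *this;
      }
    };

    int main() {
      Flag LoopDecrement; // never passed on the command line
      LoopOptions Opts;
      if (LoopDecrement.getNumOccurrences())
        Opts.setDecrement(LoopDecrement); // skipped, so the default survives
      std::cout << Opts.Decrement.value_or(1) << '\n'; // prints the default, 1
    }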
- if (CounterBitWidth.getNumOccurrences()) - HWLoopInfo.CountType = - IntegerType::get(M->getContext(), CounterBitWidth); + if (Opts.Bitwidth.has_value()) { + HWLoopInfo.CountType = IntegerType::get(Ctx, Opts.Bitwidth.value()); + } - if (LoopDecrement.getNumOccurrences()) + if (Opts.Decrement.has_value()) HWLoopInfo.LoopDecrement = - ConstantInt::get(HWLoopInfo.CountType, LoopDecrement); + ConstantInt::get(HWLoopInfo.CountType, Opts.Decrement.value()); MadeChange |= TryConvertLoop(HWLoopInfo); - return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop); + return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested); } -bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { +bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { Loop *L = HWLoopInfo.L; LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L); - if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT, ForceNestedLoop, - ForceHardwareLoopPHI)) { + if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, Opts.getForceNested(), + Opts.getForcePhi())) { // TODO: there can be many reasons a loop is not considered a // candidate, so we should let isHardwareLoopCandidate fill in the // reason and then report a better message here. @@ -300,11 +362,11 @@ bool HardwareLoops::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { // If we don't have a preheader, then insert one. if (!Preheader) - Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA); + Preheader = InsertPreheaderForLoop(L, &DT, &LI, nullptr, PreserveLCSSA); if (!Preheader) return false; - HardwareLoop HWLoop(HWLoopInfo, *SE, *DL, ORE); + HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts); HWLoop.Create(); ++NumHWLoops; return true; @@ -322,7 +384,7 @@ void HardwareLoop::Create() { Value *Setup = InsertIterationSetup(LoopCountInit); - if (UsePHICounter || ForceHardwareLoopPHI) { + if (UsePHICounter || Opts.ForcePhi) { Instruction *LoopDec = InsertLoopRegDec(LoopCountInit); Value *EltsRem = InsertPHICounter(Setup, LoopDec); LoopDec->setOperand(0, EltsRem); @@ -397,7 +459,8 @@ Value *HardwareLoop::InitLoopCount() { if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount, SE.getZero(ExitCount->getType()))) { LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n"); - UseLoopGuard |= ForceGuardLoopEntry; + if (Opts.ForceGuard) + UseLoopGuard = true; } else UseLoopGuard = false; @@ -441,7 +504,7 @@ Value *HardwareLoop::InitLoopCount() { Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { IRBuilder<> Builder(BeginBB->getTerminator()); Type *Ty = LoopCountInit->getType(); - bool UsePhi = UsePHICounter || ForceHardwareLoopPHI; + bool UsePhi = UsePHICounter || Opts.ForcePhi; Intrinsic::ID ID = UseLoopGuard ? (UsePhi ? 
Intrinsic::test_start_loop_iterations : Intrinsic::test_set_loop_iterations) @@ -533,11 +596,11 @@ void HardwareLoop::UpdateBranch(Value *EltsRem) { RecursivelyDeleteTriviallyDeadInstructions(OldCond); } -INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) +INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) -INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false) +INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) -FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); } +FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); } diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp index 105ab908d3fa..2ad5820bd9fb 100644 --- a/llvm/lib/CodeGen/IfConversion.cpp +++ b/llvm/lib/CodeGen/IfConversion.cpp @@ -71,8 +71,6 @@ static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev", cl::init(false), cl::Hidden); static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false", cl::init(false), cl::Hidden); -static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev", - cl::init(false), cl::Hidden); static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond", cl::init(false), cl::Hidden); static cl::opt<bool> DisableForkedDiamond("disable-ifcvt-forked-diamond", @@ -189,16 +187,16 @@ namespace { std::vector<BBInfo> BBAnalysis; TargetSchedModel SchedModel; - const TargetLoweringBase *TLI; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - const MachineBranchProbabilityInfo *MBPI; - MachineRegisterInfo *MRI; + const TargetLoweringBase *TLI = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineBranchProbabilityInfo *MBPI = nullptr; + MachineRegisterInfo *MRI = nullptr; LivePhysRegs Redefs; - bool PreRegAlloc; - bool MadeChange; + bool PreRegAlloc = true; + bool MadeChange = false; int FnNum = -1; std::function<bool(const MachineFunction &)> PredicateFtor; @@ -532,7 +530,6 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (DisableTriangle && !isFalse && !isRev) break; if (DisableTriangleR && !isFalse && isRev) break; if (DisableTriangleF && isFalse && !isRev) break; - if (DisableTriangleFR && isFalse && isRev) break; LLVM_DEBUG(dbgs() << "Ifcvt (Triangle"); if (isFalse) LLVM_DEBUG(dbgs() << " false"); @@ -1512,19 +1509,9 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) { MIB.addReg(Reg, RegState::Implicit | RegState::Define); continue; } - if (LiveBeforeMI.count(Reg)) + if (any_of(TRI->subregs_inclusive(Reg), + [&](MCPhysReg S) { return LiveBeforeMI.count(S); })) MIB.addReg(Reg, RegState::Implicit); - else { - bool HasLiveSubReg = false; - for (MCSubRegIterator S(Reg, TRI); S.isValid(); ++S) { - if (!LiveBeforeMI.count(*S)) - continue; - HasLiveSubReg = true; - break; - } - if (HasLiveSubReg) - MIB.addReg(Reg, RegState::Implicit); - } } } @@ -1958,17 +1945,15 @@ bool IfConverter::IfConvertDiamondCommon( } else if (!RedefsByFalse.count(Reg)) { // These are defined before ctrl flow reach the 'false' instructions. // They cannot be modified by the 'true' instructions. 
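The IfConversion hunks that follow show a cleanup repeated throughout this import: the index-style MCSubRegIterator loops, with their explicit IncludeSelf flag, become range-for loops over TRI->subregs(Reg) or TRI->subregs_inclusive(Reg). Reduced to a toy with an invented sub-register table, the shape of the new iteration is:

    #include <cstdint>
    #include <iostream>
    #include <set>
    #include <vector>

    using MCPhysReg = uint16_t;

    // Invented table: SubRegs[R] lists the proper sub-registers of R.
    static const std::vector<std::vector<MCPhysReg>> SubRegs = {
        {},     // reg 0, the invalid register
        {},     // reg 1
        {},     // reg 2
        {1, 2}, // reg 3, a pair register covering regs 1 and 2
    };

    // Range equivalent of MCSubRegIterator with IncludeSelf=true: the
    // register itself followed by its sub-registers.
    static std::vector<MCPhysReg> subregs_inclusive(MCPhysReg R) {
      std::vector<MCPhysReg> Out{R};
      Out.insert(Out.end(), SubRegs[R].begin(), SubRegs[R].end());
      return Out;
    }

    int main() {
      std::set<MCPhysReg> ExtUses;
      for (MCPhysReg S : subregs_inclusive(3)) // visits 3, 1, 2
        ExtUses.insert(S);
      std::cout << ExtUses.size() << '\n'; // 3
    }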
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - ExtUses.insert(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + ExtUses.insert(SubReg); } } for (MCPhysReg Reg : Defs) { if (!ExtUses.count(Reg)) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - RedefsByFalse.insert(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + RedefsByFalse.insert(SubReg); } } } @@ -2244,6 +2229,15 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { assert(!FromMBB.hasAddressTaken() && "Removing a BB whose address is taken!"); + // If we're about to splice an INLINEASM_BR from FromBBI, we need to update + // ToBBI's successor list accordingly. + if (FromMBB.mayHaveInlineAsmBr()) + for (MachineInstr &MI : FromMBB) + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) + for (MachineOperand &MO : MI.operands()) + if (MO.isMBB() && !ToBBI.BB->isSuccessor(MO.getMBB())) + ToBBI.BB->addSuccessor(MO.getMBB(), BranchProbability::getZero()); + // In case FromMBB contains terminators (e.g. return instruction), // first move the non-terminator instructions, then the terminators. MachineBasicBlock::iterator FromTI = FromMBB.getFirstTerminator(); diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp index fa493af0eea7..b2a7aad73411 100644 --- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -94,7 +94,7 @@ class ImplicitNullChecks : public MachineFunctionPass { /// computeDependence). bool CanReorder; - /// If non-None, then an instruction in \p Insts that also must be + /// If non-std::nullopt, then an instruction in \p Insts that also must be /// hoisted. std::optional<ArrayRef<MachineInstr *>::iterator> PotentialDependence; @@ -778,9 +778,7 @@ void ImplicitNullChecks::rewriteNullChecks( // The original operation may define implicit-defs alongside // the value. MachineBasicBlock *MBB = NC.getMemOperation()->getParent(); - for (const MachineOperand &MO : FaultingInstr->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; + for (const MachineOperand &MO : FaultingInstr->all_defs()) { Register Reg = MO.getReg(); if (!Reg || MBB->isLiveIn(Reg)) continue; @@ -788,8 +786,8 @@ void ImplicitNullChecks::rewriteNullChecks( } if (auto *DepMI = NC.getOnlyDependency()) { - for (auto &MO : DepMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.isDef() || MO.isDead()) + for (auto &MO : DepMI->all_defs()) { + if (!MO.getReg() || MO.isDead()) continue; if (!NC.getNotNullSucc()->isLiveIn(MO.getReg())) NC.getNotNullSucc()->addLiveIn(MO.getReg()); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index cf4fff878ad1..c62f3db9d321 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -165,8 +165,8 @@ class InlineSpiller : public Spiller { const MachineBlockFrequencyInfo &MBFI; // Variables that are valid during spill(), but used by multiple methods. - LiveRangeEdit *Edit; - LiveInterval *StackInt; + LiveRangeEdit *Edit = nullptr; + LiveInterval *StackInt = nullptr; int StackSlot; Register Original; @@ -175,6 +175,7 @@ class InlineSpiller : public Spiller { // All COPY instructions to/from snippets. // They are ignored since both operands refer to the same stack slot. + // For bundled copies, this will only include the first header copy. 
SmallPtrSet<MachineInstr*, 8> SnippetCopies; // Values that failed to remat at some point. @@ -257,19 +258,64 @@ Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass, /// isFullCopyOf - If MI is a COPY to or from Reg, return the other register, /// otherwise return 0. -static Register isFullCopyOf(const MachineInstr &MI, Register Reg) { - if (!MI.isFullCopy()) +static Register isCopyOf(const MachineInstr &MI, Register Reg, + const TargetInstrInfo &TII) { + if (!TII.isCopyInstr(MI)) return Register(); - if (MI.getOperand(0).getReg() == Reg) - return MI.getOperand(1).getReg(); - if (MI.getOperand(1).getReg() == Reg) - return MI.getOperand(0).getReg(); + + const MachineOperand &DstOp = MI.getOperand(0); + const MachineOperand &SrcOp = MI.getOperand(1); + + // TODO: Probably only worth allowing subreg copies with undef dests. + if (DstOp.getSubReg() != SrcOp.getSubReg()) + return Register(); + if (DstOp.getReg() == Reg) + return SrcOp.getReg(); + if (SrcOp.getReg() == Reg) + return DstOp.getReg(); + return Register(); +} + +/// Check for a copy bundle as formed by SplitKit. +static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg, + const TargetInstrInfo &TII) { + if (!FirstMI.isBundled()) + return isCopyOf(FirstMI, Reg, TII); + + assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() && + "expected to see first instruction in bundle"); + + Register SnipReg; + MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator(); + while (I->isBundledWithSucc()) { + const MachineInstr &MI = *I; + auto CopyInst = TII.isCopyInstr(MI); + if (!CopyInst) + return Register(); + + const MachineOperand &DstOp = *CopyInst->Destination; + const MachineOperand &SrcOp = *CopyInst->Source; + if (DstOp.getReg() == Reg) { + if (!SnipReg) + SnipReg = SrcOp.getReg(); + else if (SnipReg != SrcOp.getReg()) + return Register(); + } else if (SrcOp.getReg() == Reg) { + if (!SnipReg) + SnipReg = DstOp.getReg(); + else if (SnipReg != DstOp.getReg()) + return Register(); + } + + ++I; + } + return Register(); } static void getVDefInterval(const MachineInstr &MI, LiveIntervals &LIS) { - for (const MachineOperand &MO : MI.operands()) - if (MO.isReg() && MO.isDef() && MO.getReg().isVirtual()) + for (const MachineOperand &MO : MI.all_defs()) + if (MO.getReg().isVirtual()) LIS.getInterval(MO.getReg()); } @@ -307,14 +353,14 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) { MachineInstr *UseMI = nullptr; // Check that all uses satisfy our criteria. - for (MachineRegisterInfo::reg_instr_nodbg_iterator - RI = MRI.reg_instr_nodbg_begin(SnipLI.reg()), - E = MRI.reg_instr_nodbg_end(); + for (MachineRegisterInfo::reg_bundle_nodbg_iterator + RI = MRI.reg_bundle_nodbg_begin(SnipLI.reg()), + E = MRI.reg_bundle_nodbg_end(); RI != E;) { MachineInstr &MI = *RI++; // Allow copies to/from Reg. - if (isFullCopyOf(MI, Reg)) + if (isCopyOfBundle(MI, Reg, TII)) continue; // Allow stack slot loads. @@ -351,9 +397,8 @@ void InlineSpiller::collectRegsToSpill() { if (Original == Reg) return; - for (MachineInstr &MI : - llvm::make_early_inc_range(MRI.reg_instructions(Reg))) { - Register SnipReg = isFullCopyOf(MI, Reg); + for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) { + Register SnipReg = isCopyOfBundle(MI, Reg, TII); if (!isSibling(SnipReg)) continue; LiveInterval &SnipLI = LIS.getInterval(SnipReg); @@ -475,21 +520,22 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) { // Find all spills and copies of VNI. 
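isCopyOfBundle above generalizes the old isFullCopyOf in two directions: it accepts anything TII.isCopyInstr recognizes rather than only bare COPYs, and it walks a SplitKit-formed bundle insisting that every copy moves data between Reg and one consistent partner register. A reduced model of that consistency walk (the Copy record is a stand-in, not an LLVM type):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    using Register = uint32_t; // 0 plays the invalid register

    struct Copy {
      Register Dst, Src; // one copy-like instruction in the bundle
    };

    // Returns the single register the bundle copies Reg to or from, or 0 if
    // the copies disagree, mirroring the SnipReg bookkeeping above.
    static Register copyBundlePartner(const std::vector<Copy> &Bundle,
                                      Register Reg) {
      Register SnipReg = 0;
      for (const Copy &C : Bundle) {
        Register Other;
        if (C.Dst == Reg)
          Other = C.Src;
        else if (C.Src == Reg)
          Other = C.Dst;
        else
          continue; // copies not touching Reg are ignored, as above
        if (!SnipReg)
          SnipReg = Other;
        else if (SnipReg != Other)
          return 0; // two different partners: not a clean sibling copy
      }
      return SnipReg;
    }

    int main() {
      // Two pieces of vreg 7 arriving from vreg 9: one consistent partner.
      std::vector<Copy> Ok = {{7, 9}, {7, 9}};
      // Pieces arriving from two different registers: rejected.
      std::vector<Copy> Bad = {{7, 9}, {7, 8}};
      std::cout << copyBundlePartner(Ok, 7) << ' '
                << copyBundlePartner(Bad, 7) << '\n'; // prints "9 0"
    }

(The spill-and-copy scan that the comment above begins continues below.)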
for (MachineInstr &MI : - llvm::make_early_inc_range(MRI.use_nodbg_instructions(Reg))) { - if (!MI.isCopy() && !MI.mayStore()) + llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) { + if (!MI.mayStore() && !TII.isCopyInstr(MI)) continue; SlotIndex Idx = LIS.getInstructionIndex(MI); if (LI->getVNInfoAt(Idx) != VNI) continue; // Follow sibling copies down the dominator tree. - if (Register DstReg = isFullCopyOf(MI, Reg)) { + if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) { if (isSibling(DstReg)) { - LiveInterval &DstLI = LIS.getInterval(DstReg); - VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); - assert(DstVNI && "Missing defined value"); - assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot"); - WorkList.push_back(std::make_pair(&DstLI, DstVNI)); + LiveInterval &DstLI = LIS.getInterval(DstReg); + VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); + assert(DstVNI && "Missing defined value"); + assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot"); + + WorkList.push_back(std::make_pair(&DstLI, DstVNI)); } continue; } @@ -593,8 +639,8 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { if (!ParentVNI) { LLVM_DEBUG(dbgs() << "\tadding <undef> flags: "); - for (MachineOperand &MO : MI.operands()) - if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg()) + for (MachineOperand &MO : MI.all_uses()) + if (MO.getReg() == VirtReg.reg()) MO.setIsUndef(); LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI); return true; @@ -826,7 +872,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops, if (Ops.back().first != MI || MI->isBundled()) return false; - bool WasCopy = MI->isCopy(); + bool WasCopy = TII.isCopyInstr(*MI).has_value(); Register ImpReg; // TII::foldMemoryOperand will do what we need here for statepoint @@ -1111,7 +1157,7 @@ void InlineSpiller::spillAroundUses(Register Reg) { Idx = VNI->def; // Check for a sibling copy. - Register SibReg = isFullCopyOf(MI, Reg); + Register SibReg = isCopyOfBundle(MI, Reg, TII); if (SibReg && isSibling(SibReg)) { // This may actually be a copy between snippets. if (isRegToSpill(SibReg)) { @@ -1202,8 +1248,8 @@ void InlineSpiller::spillAll() { llvm::make_early_inc_range(MRI.reg_instructions(Reg))) { assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy"); // FIXME: Do this with a LiveRangeEdit callback. - LIS.RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); + LIS.getSlotIndexes()->removeSingleMachineInstrFromMaps(MI); + MI.eraseFromBundle(); } } @@ -1250,7 +1296,7 @@ void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot, LiveInterval &OrigLI = LIS.getInterval(Original); // save a copy of LiveInterval in StackSlotToOrigLI because the original // LiveInterval may be cleared after all its references are spilled. - if (StackSlotToOrigLI.find(StackSlot) == StackSlotToOrigLI.end()) { + if (!StackSlotToOrigLI.contains(StackSlot)) { auto LI = std::make_unique<LiveInterval>(OrigLI.reg(), OrigLI.weight()); LI->assign(OrigLI, Allocator); StackSlotToOrigLI[StackSlot] = std::move(LI); @@ -1459,7 +1505,7 @@ void HoistSpillHelper::runHoistSpills( MachineBasicBlock *Block = (*RIt)->getBlock(); // If Block contains an original spill, simply continue. - if (SpillsToKeep.find(*RIt) != SpillsToKeep.end() && !SpillsToKeep[*RIt]) { + if (SpillsToKeep.contains(*RIt) && !SpillsToKeep[*RIt]) { SpillsInSubTreeMap[*RIt].first.insert(*RIt); // SpillsInSubTreeMap[*RIt].second contains the cost of spill. 
SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block); @@ -1469,7 +1515,7 @@ void HoistSpillHelper::runHoistSpills( // Collect spills in subtree of current node (*RIt) to // SpillsInSubTreeMap[*RIt].first. for (MachineDomTreeNode *Child : (*RIt)->children()) { - if (SpillsInSubTreeMap.find(Child) == SpillsInSubTreeMap.end()) + if (!SpillsInSubTreeMap.contains(Child)) continue; // The stmt "SpillsInSubTree = SpillsInSubTreeMap[*RIt].first" below // should be placed before getting the begin and end iterators of @@ -1508,8 +1554,7 @@ void HoistSpillHelper::runHoistSpills( for (auto *const SpillBB : SpillsInSubTree) { // When SpillBB is a BB contains original spill, insert the spill // to SpillsToRm. - if (SpillsToKeep.find(SpillBB) != SpillsToKeep.end() && - !SpillsToKeep[SpillBB]) { + if (SpillsToKeep.contains(SpillBB) && !SpillsToKeep[SpillBB]) { MachineInstr *SpillToRm = SpillBBToSpill[SpillBB]; SpillsToRm.push_back(SpillToRm); } diff --git a/llvm/lib/CodeGen/InterferenceCache.cpp b/llvm/lib/CodeGen/InterferenceCache.cpp index 3cab9e5734ee..ae197ee5553a 100644 --- a/llvm/lib/CodeGen/InterferenceCache.cpp +++ b/llvm/lib/CodeGen/InterferenceCache.cpp @@ -93,8 +93,8 @@ void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray, // Invalidate all iterators. PrevPos = SlotIndex(); unsigned i = 0; - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) - RegUnits[i].VirtTag = LIUArray[*Units].getTag(); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) + RegUnits[i++].VirtTag = LIUArray[Unit].getTag(); } void InterferenceCache::Entry::reset(MCRegister physReg, @@ -110,20 +110,21 @@ void InterferenceCache::Entry::reset(MCRegister physReg, // Reset iterators. PrevPos = SlotIndex(); RegUnits.clear(); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - RegUnits.push_back(LIUArray[*Units]); - RegUnits.back().Fixed = &LIS->getRegUnit(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + RegUnits.push_back(LIUArray[Unit]); + RegUnits.back().Fixed = &LIS->getRegUnit(Unit); } } bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI) { unsigned i = 0, e = RegUnits.size(); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) { + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { if (i == e) return false; - if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag)) + if (LIUArray[Unit].changedSince(RegUnits[i].VirtTag)) return false; + ++i; } return i == e; } diff --git a/llvm/lib/CodeGen/InterferenceCache.h b/llvm/lib/CodeGen/InterferenceCache.h index 97464da9f17b..2a176b4f2cf7 100644 --- a/llvm/lib/CodeGen/InterferenceCache.h +++ b/llvm/lib/CodeGen/InterferenceCache.h @@ -54,7 +54,7 @@ class LLVM_LIBRARY_VISIBILITY InterferenceCache { unsigned RefCount = 0; /// MF - The current function. - MachineFunction *MF; + MachineFunction *MF = nullptr; /// Indexes - Mapping block numbers to SlotIndex ranges. 
SlotIndexes *Indexes = nullptr; @@ -156,7 +156,8 @@ class LLVM_LIBRARY_VISIBILITY InterferenceCache { public: InterferenceCache() = default; - + InterferenceCache &operator=(const InterferenceCache &other) = delete; + InterferenceCache(const InterferenceCache &other) = delete; ~InterferenceCache() { free(PhysRegEntries); } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 0582378be4cd..6b3848531569 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -58,6 +58,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -103,7 +104,7 @@ private: const TargetLowering *TLI = nullptr; /// The maximum supported interleave factor. - unsigned MaxFactor; + unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. bool lowerInterleavedLoad(LoadInst *LI, @@ -113,6 +114,16 @@ private: bool lowerInterleavedStore(StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts); + /// Transform a load and a deinterleave intrinsic into target specific + /// instructions. + bool lowerDeinterleaveIntrinsic(IntrinsicInst *II, + SmallVector<Instruction *, 32> &DeadInsts); + + /// Transform an interleave intrinsic and a store into target specific + /// instructions. + bool lowerInterleaveIntrinsic(IntrinsicInst *II, + SmallVector<Instruction *, 32> &DeadInsts); + /// Returns true if the uses of an interleaved load by the /// extractelement instructions in \p Extracts can be replaced by uses of the /// shufflevector instructions in \p Shuffles instead. If so, the necessary @@ -202,86 +213,15 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, /// The particular case of an RE-interleave mask is: /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> -static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, - unsigned MaxFactor, unsigned OpNumElts) { - unsigned NumElts = Mask.size(); +static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, + unsigned MaxFactor) { + unsigned NumElts = SVI->getShuffleMask().size(); if (NumElts < 4) return false; // Check potential Factors. for (Factor = 2; Factor <= MaxFactor; Factor++) { - if (NumElts % Factor) - continue; - - unsigned LaneLen = NumElts / Factor; - if (!isPowerOf2_32(LaneLen)) - continue; - - // Check whether each element matches the general interleaved rule. - // Ignore undef elements, as long as the defined elements match the rule. - // Outer loop processes all factors (x, y, z in the above example) - unsigned I = 0, J; - for (; I < Factor; I++) { - unsigned SavedLaneValue; - unsigned SavedNoUndefs = 0; - - // Inner loop processes consecutive accesses (x, x+1... 
in the example) - for (J = 0; J < LaneLen - 1; J++) { - // Lane computes x's position in the Mask - unsigned Lane = J * Factor + I; - unsigned NextLane = Lane + Factor; - int LaneValue = Mask[Lane]; - int NextLaneValue = Mask[NextLane]; - - // If both are defined, values must be sequential - if (LaneValue >= 0 && NextLaneValue >= 0 && - LaneValue + 1 != NextLaneValue) - break; - - // If the next value is undef, save the current one as reference - if (LaneValue >= 0 && NextLaneValue < 0) { - SavedLaneValue = LaneValue; - SavedNoUndefs = 1; - } - - // Undefs are allowed, but defined elements must still be consecutive: - // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, .... - // Verify this by storing the last non-undef followed by an undef - // Check that following non-undef masks are incremented with the - // corresponding distance. - if (SavedNoUndefs > 0 && LaneValue < 0) { - SavedNoUndefs++; - if (NextLaneValue >= 0 && - SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue) - break; - } - } - - if (J < LaneLen - 1) - break; - - int StartMask = 0; - if (Mask[I] >= 0) { - // Check that the start of the I range (J=0) is greater than 0 - StartMask = Mask[I]; - } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) { - // StartMask defined by the last value in lane - StartMask = Mask[(LaneLen - 1) * Factor + I] - J; - } else if (SavedNoUndefs > 0) { - // StartMask defined by some non-zero value in the j loop - StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs); - } - // else StartMask remains set to 0, i.e. all elements are undefs - - if (StartMask < 0) - break; - // We must stay within the vectors; This case can happen with undefs. - if (StartMask + LaneLen > OpNumElts*2) - break; - } - - // Found an interleaved mask of current factor. - if (I == Factor) + if (SVI->isInterleave(Factor)) return true; } @@ -311,8 +251,10 @@ bool InterleavedAccess::lowerInterleavedLoad( continue; } if (auto *BI = dyn_cast<BinaryOperator>(User)) { - if (all_of(BI->users(), - [](auto *U) { return isa<ShuffleVectorInst>(U); })) { + if (all_of(BI->users(), [](auto *U) { + auto *SVI = dyn_cast<ShuffleVectorInst>(U); + return SVI && isa<UndefValue>(SVI->getOperand(1)); + })) { for (auto *SVI : BI->users()) BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI)); continue; @@ -500,9 +442,7 @@ bool InterleavedAccess::lowerInterleavedStore( // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - unsigned OpNumElts = - cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements(); - if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts)) + if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); @@ -517,6 +457,47 @@ bool InterleavedAccess::lowerInterleavedStore( return true; } +bool InterleavedAccess::lowerDeinterleaveIntrinsic( + IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) { + LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0)); + + if (!LI || !LI->hasOneUse() || !LI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI)) + return false; + + // We now have a target-specific load, so delete the old one. 
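All of the hand-rolled mask scanning deleted above collapses into one query, ShuffleVectorInst::isInterleave(Factor). For the fully defined case the recognized pattern is simple: with factor F over lanes of length L, mask element j*F + i must equal i*L + j, which for F=2 and L=4 yields exactly the <0, 4, 1, 5, 2, 6, 3, 7> shape quoted in the comment above. A standalone checker for that simple case (illustrative only; the real API also tolerates undef elements):

    #include <iostream>
    #include <vector>

    // Fully defined interleave pattern: for factor F over lanes of length
    // L = N / F, mask element j*F + i must be i*L + j.
    static bool isSimpleInterleaveMask(const std::vector<int> &Mask,
                                       unsigned F) {
      if (F < 2 || Mask.empty() || Mask.size() % F != 0)
        return false;
      const unsigned L = static_cast<unsigned>(Mask.size()) / F;
      for (unsigned J = 0; J != L; ++J)
        for (unsigned I = 0; I != F; ++I)
          if (Mask[J * F + I] != int(I * L + J))
            return false;
      return true;
    }

    int main() {
      std::cout << isSimpleInterleaveMask({0, 4, 1, 5, 2, 6, 3, 7}, 2)  // 1
                << isSimpleInterleaveMask({0, 4, 1, 5, 2, 6, 3, 7}, 4)  // 0
                << '\n';
    }

(The lowerDeinterleaveIntrinsic body that the last comment above belongs to resumes below.)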
+ DeadInsts.push_back(DI); + DeadInsts.push_back(LI); + return true; +} + +bool InterleavedAccess::lowerInterleaveIntrinsic( + IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) { + if (!II->hasOneUse()) + return false; + + StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin())); + + if (!SI || !SI->isSimple()) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n"); + + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(II, SI)) + return false; + + // We now have a target-specific store, so delete the old one. + DeadInsts.push_back(SI); + DeadInsts.push_back(II); + return true; +} + bool InterleavedAccess::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC || !LowerInterleavedAccesses) @@ -539,6 +520,15 @@ bool InterleavedAccess::runOnFunction(Function &F) { if (auto *SI = dyn_cast<StoreInst>(&I)) Changed |= lowerInterleavedStore(SI, DeadInsts); + + if (auto *II = dyn_cast<IntrinsicInst>(&I)) { + // At present, we only have intrinsics to represent (de)interleaving + // with a factor of 2. + if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2) + Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts); + if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2) + Changed |= lowerInterleaveIntrinsic(II, DeadInsts); + } } for (auto *I : DeadInsts) diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp index 0d36badfa10f..d0ad6e45b4d3 100644 --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -318,7 +318,7 @@ public: // See Proof(2): Trailing zero bits indicate a left shift. This removes // leading bits from the result even if they are undefined. - decErrorMSBs(C.countTrailingZeros()); + decErrorMSBs(C.countr_zero()); A *= C; pushBOperation(Mul, C); @@ -475,7 +475,7 @@ public: // // If this can be proven add shiftAmt to the error counter // `ErrorMSBs`. Otherwise set all bits as undefined. - if (A.countTrailingZeros() < shiftAmt) + if (A.countr_zero() < shiftAmt) ErrorMSBs = A.getBitWidth(); else incErrorMSBs(shiftAmt); @@ -678,6 +678,8 @@ public: EI = new ElementInfo[VTy->getNumElements()]; } + VectorInfo &operator=(const VectorInfo &other) = delete; + virtual ~VectorInfo() { delete[] EI; } unsigned getDimension() const { return VTy->getNumElements(); } diff --git a/llvm/lib/CodeGen/KCFI.cpp b/llvm/lib/CodeGen/KCFI.cpp new file mode 100644 index 000000000000..bffa02ca8afd --- /dev/null +++ b/llvm/lib/CodeGen/KCFI.cpp @@ -0,0 +1,111 @@ +//===---- KCFI.cpp - Implements Kernel Control-Flow Integrity (KCFI) ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements Kernel Control-Flow Integrity (KCFI) indirect call +// check lowering. For each call instruction with a cfi-type attribute, it +// emits an arch-specific check before the call, and bundles the check and +// the call to prevent unintentional modifications. 
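For readers meeting KCFI for the first time: in the scheme this new pass supports (clang's -fsanitize=kcfi), every address-taken function gets a 32-bit hash of its type emitted just ahead of its entry point, and the check inserted before an indirect call loads that hash through the call target and traps on a mismatch. A very rough portable sketch of the idea follows; the real lowering is arch-specific machine IR, and the layout below, with the hash carried next to the pointer, is invented purely for illustration:

    #include <cstdint>
    #include <cstdlib>
    #include <iostream>

    // Portable stand-in for "the 32-bit type hash stored before the
    // function": here the hash simply travels alongside the pointer.
    struct KcfiTarget {
      uint32_t TypeHash; // in the real scheme, a hash of the function type
      void (*Fn)(int);
    };

    static void good(int X) { std::cout << "called " << X << '\n'; }

    // The check emitted before an indirect call that carries a cfi-type
    // attribute: compare hashes, trap on mismatch, then make the call.
    static void kcfiCall(const KcfiTarget &T, uint32_t ExpectedHash, int Arg) {
      if (T.TypeHash != ExpectedHash)
        std::abort(); // the real check branches to a trap instruction
      T.Fn(Arg);
    }

    int main() {
      const uint32_t HashOfVoidInt = 0x5eedf00d; // illustrative constant
      KcfiTarget T{HashOfVoidInt, good};
      kcfiCall(T, HashOfVoidInt, 42); // hashes agree, the call proceeds
    }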
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "kcfi" +#define KCFI_PASS_NAME "Insert KCFI indirect call checks" + +STATISTIC(NumKCFIChecksAdded, "Number of indirect call checks added"); + +namespace { +class KCFI : public MachineFunctionPass { +public: + static char ID; + + KCFI() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return KCFI_PASS_NAME; } + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// Machine instruction info used throughout the class. + const TargetInstrInfo *TII = nullptr; + + /// Target lowering for arch-specific parts. + const TargetLowering *TLI = nullptr; + + /// Emits a KCFI check before an indirect call. + /// \returns true if the check was added and false otherwise. + bool emitCheck(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator I) const; +}; + +char KCFI::ID = 0; +} // end anonymous namespace + +INITIALIZE_PASS(KCFI, DEBUG_TYPE, KCFI_PASS_NAME, false, false) + +FunctionPass *llvm::createKCFIPass() { return new KCFI(); } + +bool KCFI::emitCheck(MachineBasicBlock &MBB, + MachineBasicBlock::instr_iterator MBBI) const { + assert(TII && "Target instruction info was not initialized"); + assert(TLI && "Target lowering was not initialized"); + + // If the call instruction is bundled, we can only emit a check safely if + // it's the first instruction in the bundle. + if (MBBI->isBundled() && !std::prev(MBBI)->isBundle()) + report_fatal_error("Cannot emit a KCFI check for a bundled call"); + + // Emit a KCFI check for the call instruction at MBBI. The implementation + // must unfold memory operands if applicable. + MachineInstr *Check = TLI->EmitKCFICheck(MBB, MBBI, TII); + + // Clear the original call's CFI type. + assert(MBBI->isCall() && "Unexpected instruction type"); + MBBI->setCFIType(*MBB.getParent(), 0); + + // If not already bundled, bundle the check and the call to prevent + // further changes. + if (!MBBI->isBundled()) + finalizeBundle(MBB, Check->getIterator(), std::next(MBBI->getIterator())); + + ++NumKCFIChecksAdded; + return true; +} + +bool KCFI::runOnMachineFunction(MachineFunction &MF) { + const Module *M = MF.getMMI().getModule(); + if (!M->getModuleFlag("kcfi")) + return false; + + const auto &SubTarget = MF.getSubtarget(); + TII = SubTarget.getInstrInfo(); + TLI = SubTarget.getTargetLowering(); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + // Use instr_iterator because we don't want to skip bundles. + for (MachineBasicBlock::instr_iterator MII = MBB.instr_begin(), + MIE = MBB.instr_end(); + MII != MIE; ++MII) { + if (MII->isCall() && MII->getCFIType()) + Changed |= emitCheck(MBB, MII); + } + } + + return Changed; +} diff --git a/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/llvm/lib/CodeGen/LLVMTargetMachine.cpp index 3192dcadb5f5..d02ec1db1165 100644 --- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp +++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp @@ -274,16 +274,17 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, // emission fails. 
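The first LLVMTargetMachine hunk below is a small but classic ownership fix: createMCCodeEmitter and createMCAsmBackend return raw owning pointers, and the old code leaked both whenever the early error return fired. Wrapping each factory result in std::unique_ptr immediately ties its lifetime to every exit path, with ownership std::move()d onward only once the checks pass. The same shape in a few lines of standard C++ (stand-in types, not the MC API):

    #include <iostream>
    #include <memory>

    struct Emitter {
      ~Emitter() { std::cout << "emitter freed\n"; }
    };

    // Stand-in for a create*() factory that may fail.
    static Emitter *createEmitter(bool Fail) {
      return Fail ? nullptr : new Emitter;
    }

    // Before the fix, raw pointers leaked on the early return; wrapping the
    // results immediately makes the error path clean up automatically.
    static bool addPasses(bool FailBackend) {
      std::unique_ptr<Emitter> MCE(createEmitter(false));
      std::unique_ptr<Emitter> MAB(createEmitter(FailBackend));
      if (!MCE || !MAB)
        return true; // error path: MCE is still destroyed here, no leak
      // on success, ownership would be std::move()d into the consumer
      return false;
    }

    int main() {
      std::cout << addPasses(/*FailBackend=*/true) << '\n';
      // prints "emitter freed" and then 1
    }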
const MCSubtargetInfo &STI = *getMCSubtargetInfo(); const MCRegisterInfo &MRI = *getMCRegisterInfo(); - MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getMCInstrInfo(), *Ctx); - MCAsmBackend *MAB = - getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions); + std::unique_ptr<MCCodeEmitter> MCE( + getTarget().createMCCodeEmitter(*getMCInstrInfo(), *Ctx)); + std::unique_ptr<MCAsmBackend> MAB( + getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions)); if (!MCE || !MAB) return true; const Triple &T = getTargetTriple(); std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer( - T, *Ctx, std::unique_ptr<MCAsmBackend>(MAB), MAB->createObjectWriter(Out), - std::unique_ptr<MCCodeEmitter>(MCE), STI, Options.MCOptions.MCRelaxAll, + T, *Ctx, std::move(MAB), MAB->createObjectWriter(Out), std::move(MCE), + STI, Options.MCOptions.MCRelaxAll, Options.MCOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ true)); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index ba417322d4f6..57df9b67fd02 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -601,7 +601,7 @@ public: if (Var.getInlinedAt()) return false; - if (Expr->getNumElements() > 0) + if (Expr->getNumElements() > 0 && !Expr->isDeref()) return false; return true; @@ -1544,12 +1544,12 @@ std::optional<ValueIDNum> InstrRefBasedLDV::getValueForInstrRef( if (Size != MainRegSize || Offset) { // Enumerate all subregisters, searching. Register NewReg = 0; - for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) { - unsigned Subreg = TRI->getSubRegIndex(Reg, *SRI); + for (MCPhysReg SR : TRI->subregs(Reg)) { + unsigned Subreg = TRI->getSubRegIndex(Reg, SR); unsigned SubregSize = TRI->getSubRegIdxSize(Subreg); unsigned SubregOffset = TRI->getSubRegIdxOffset(Subreg); if (SubregSize == Size && SubregOffset == Offset) { - NewReg = *SRI; + NewReg = SR; break; } } @@ -2066,12 +2066,12 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { }; // Then, transfer subreg bits. - for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) { + for (MCPhysReg SR : TRI->subregs(Reg)) { // Ensure this reg is tracked, - (void)MTracker->lookupOrTrackRegister(*SRI); - unsigned SubregIdx = TRI->getSubRegIndex(Reg, *SRI); + (void)MTracker->lookupOrTrackRegister(SR); + unsigned SubregIdx = TRI->getSubRegIndex(Reg, SR); unsigned SpillID = MTracker->getLocID(Loc, SubregIdx); - DoTransfer(*SRI, SpillID); + DoTransfer(SR, SpillID); } // Directly lookup size of main source reg, and transfer. @@ -2101,10 +2101,10 @@ bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) { MTracker->setReg(DestReg, ReadValue); }; - for (MCSubRegIterator SRI(Reg, TRI, false); SRI.isValid(); ++SRI) { - unsigned Subreg = TRI->getSubRegIndex(Reg, *SRI); + for (MCPhysReg SR : TRI->subregs(Reg)) { + unsigned Subreg = TRI->getSubRegIndex(Reg, SR); unsigned SpillID = MTracker->getLocID(*Loc, Subreg); - DoTransfer(*SRI, SpillID); + DoTransfer(SR, SpillID); } // Directly look up this registers slot idx by size, and transfer. 
@@ -2513,8 +2513,8 @@ void InstrRefBasedLDV::placeMLocPHIs( Register R = MTracker->LocIdxToLocID[L]; SmallSet<Register, 8> FoundRegUnits; bool AnyIllegal = false; - for (MCRegUnitIterator RUI(R.asMCReg(), TRI); RUI.isValid(); ++RUI) { - for (MCRegUnitRootIterator URoot(*RUI, TRI); URoot.isValid(); ++URoot){ + for (MCRegUnit Unit : TRI->regunits(R.asMCReg())) { + for (MCRegUnitRootIterator URoot(Unit, TRI); URoot.isValid(); ++URoot) { if (!MTracker->isRegisterTracked(*URoot)) { // Not all roots were loaded into the tracking map: this register // isn't actually def'd anywhere, we only read from it. Generate PHIs @@ -3179,7 +3179,7 @@ void InstrRefBasedLDV::buildVLocValueMap( SmallPtrSet<MachineBasicBlock *, 32> DefBlocks; for (const MachineBasicBlock *ExpMBB : BlocksToExplore) { auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars; - if (TransferFunc.find(Var) != TransferFunc.end()) + if (TransferFunc.contains(Var)) DefBlocks.insert(const_cast<MachineBasicBlock *>(ExpMBB)); } @@ -3295,7 +3295,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // to be visited next time around. for (auto *s : MBB->successors()) { // Ignore out of scope / not-to-be-explored successors. - if (LiveInIdx.find(s) == LiveInIdx.end()) + if (!LiveInIdx.contains(s)) continue; if (BBToOrder[s] > BBToOrder[MBB]) { @@ -3411,7 +3411,7 @@ void InstrRefBasedLDV::initialSetup(MachineFunction &MF) { for (MachineBasicBlock *MBB : RPOT) processMBB(MBB); for (MachineBasicBlock &MBB : MF) - if (BBToOrder.find(&MBB) == BBToOrder.end()) + if (!BBToOrder.contains(&MBB)) processMBB(&MBB); // Order value substitutions by their "source" operand pair, for quick lookup. @@ -3716,7 +3716,12 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, unsigned BlockNo = Num.getBlock(); LocIdx LocNo = Num.getLoc(); - Num = MInLocs[BlockNo][LocNo.asU64()]; + ValueIDNum ResolvedValue = MInLocs[BlockNo][LocNo.asU64()]; + // If there is no resolved value for this live-in then it is not directly + // reachable from the entry block -- model it as a PHI on entry to this + // block, which means we leave the ValueIDNum unchanged. + if (ResolvedValue != ValueIDNum::EmptyValue) + Num = ResolvedValue; } // Later, we'll be looking up ranges of instruction numbers. llvm::sort(DebugPHINumToValue); @@ -4050,10 +4055,7 @@ public: /// ValueIsPHI - Check if the instruction that defines the specified value /// is a PHI instruction. static LDVSSAPhi *ValueIsPHI(BlockValueNum Val, LDVSSAUpdater *Updater) { - auto PHIIt = Updater->PHIs.find(Val); - if (PHIIt == Updater->PHIs.end()) - return nullptr; - return PHIIt->second; + return Updater->PHIs.lookup(Val); } /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source @@ -4195,7 +4197,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl( // Are all these things actually defined? for (auto &PHIIt : PHI->IncomingValues) { // Any undef input means DBG_PHIs didn't dominate the use point. - if (Updater.UndefMap.find(&PHIIt.first->BB) != Updater.UndefMap.end()) + if (Updater.UndefMap.contains(&PHIIt.first->BB)) return std::nullopt; ValueIDNum ValueToCheck; diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 2fdc37c6dda2..30de18e53c4f 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -656,7 +656,7 @@ public: // If we discover a new machine location, assign it an mphi with this // block number. 
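One more cleanup running through the hunks above and below: map membership tests spelled find() != end() become contains(), a member LLVM's DenseMap and set types gained to line up with the C++20 containers, and a find-then-dereference becomes lookup(). The standard library shows the same before and after (self-contained C++20):

    #include <iostream>
    #include <map>

    int main() {
      std::map<int, int> StackSlotToOrigLI{{3, 42}};
      // Old spelling: compare find() against end().
      bool HadSlotOld =
          StackSlotToOrigLI.find(3) != StackSlotToOrigLI.end();
      // New spelling, as now used throughout these files.
      bool HadSlotNew = StackSlotToOrigLI.contains(3);
      std::cout << HadSlotOld << HadSlotNew << '\n'; // prints 11
    }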
- unsigned CurBB; + unsigned CurBB = -1; /// Cached local copy of the number of registers the target has. unsigned NumRegs; @@ -740,7 +740,7 @@ public: unsigned getLocID(SpillLocationNo Spill, StackSlotPos Idx) { unsigned SlotNo = Spill.id() - 1; SlotNo *= NumSlotIdxes; - assert(StackSlotIdxes.find(Idx) != StackSlotIdxes.end()); + assert(StackSlotIdxes.contains(Idx)); SlotNo += StackSlotIdxes[Idx]; SlotNo += NumRegs; return SlotNo; @@ -1094,7 +1094,7 @@ private: MLocTracker *MTracker = nullptr; /// Number of the current block LiveDebugValues is stepping through. - unsigned CurBB; + unsigned CurBB = -1; /// Number of the current instruction LiveDebugValues is evaluating. unsigned CurInst; @@ -1197,7 +1197,7 @@ private: /// For an instruction reference given by \p InstNo and \p OpNo in instruction /// \p MI returns the Value pointed to by that instruction reference if any - /// exists, otherwise returns None. + /// exists, otherwise returns std::nullopt. std::optional<ValueIDNum> getValueForInstrRef(unsigned InstNo, unsigned OpNo, MachineInstr &MI, const ValueTable *MLiveOuts, diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp index 9dba9a88f703..0c0a4e13c7c9 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp @@ -8,7 +8,6 @@ #include "LiveDebugValues.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -19,6 +18,7 @@ #include "llvm/PassRegistry.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" /// \file LiveDebugValues.cpp /// @@ -81,7 +81,7 @@ public: private: std::unique_ptr<LDVImpl> InstrRefImpl; std::unique_ptr<LDVImpl> VarLocImpl; - TargetPassConfig *TPC; + TargetPassConfig *TPC = nullptr; MachineDominatorTree MDT; }; } // namespace diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index b78757b855f4..116c6b7e2d19 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -1116,7 +1116,7 @@ VarLocBasedLDV::~VarLocBasedLDV() = default; /// location, erase the variable from the Vars set. void VarLocBasedLDV::OpenRangesSet::erase(const VarLoc &VL) { // Erasure helper. - auto DoErase = [VL, this](DebugVariable VarToErase) { + auto DoErase = [&VL, this](DebugVariable VarToErase) { auto *EraseFrom = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars; auto It = EraseFrom->find(VarToErase); if (It != EraseFrom->end()) { @@ -1312,7 +1312,7 @@ void VarLocBasedLDV::cleanupEntryValueTransfers( return; auto TransRange = EntryValTransfers.equal_range(TRInst); - for (auto TDPair : llvm::make_range(TransRange.first, TransRange.second)) { + for (auto &TDPair : llvm::make_range(TransRange.first, TransRange.second)) { const VarLoc &EmittedEV = VarLocIDs[TDPair.second]; if (std::tie(EntryVL.Var, EntryVL.Locs[0].Value.RegNo, EntryVL.Expr) == std::tie(EmittedEV.Var, EmittedEV.Locs[0].Value.RegNo, @@ -1347,7 +1347,7 @@ void VarLocBasedLDV::removeEntryValue(const MachineInstr &MI, // Try to get non-debug instruction responsible for the DBG_VALUE. 
const MachineInstr *TransferInst = nullptr; Register Reg = MI.getDebugOperand(0).getReg(); - if (Reg.isValid() && RegSetInstrs.find(Reg) != RegSetInstrs.end()) + if (Reg.isValid() && RegSetInstrs.contains(Reg)) TransferInst = RegSetInstrs.find(Reg)->second; // Case of the parameter's DBG_VALUE at the start of entry MBB. @@ -2151,7 +2151,9 @@ bool VarLocBasedLDV::isEntryValueCandidate( // TODO: Add support for parameters that have a pre-existing debug expressions // (e.g. fragments). - if (MI.getDebugExpression()->getNumElements() > 0) + // A simple deref expression is equivalent to an indirect debug value. + const DIExpression *Expr = MI.getDebugExpression(); + if (Expr->getNumElements() > 0 && !Expr->isDeref()) return false; return true; @@ -2160,8 +2162,8 @@ bool VarLocBasedLDV::isEntryValueCandidate( /// Collect all register defines (including aliases) for the given instruction. static void collectRegDefs(const MachineInstr &MI, DefinedRegsSet &Regs, const TargetRegisterInfo *TRI) { - for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.isDef() && MO.getReg() && MO.getReg().isPhysical()) { + for (const MachineOperand &MO : MI.all_defs()) { + if (MO.getReg() && MO.getReg().isPhysical()) { Regs.insert(MO.getReg()); for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) Regs.insert(*AI); diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp index 7cd3d26cf5b3..1cf354349c56 100644 --- a/llvm/lib/CodeGen/LiveInterval.cpp +++ b/llvm/lib/CodeGen/LiveInterval.cpp @@ -445,7 +445,7 @@ bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP, while (true) { // J has just been advanced to satisfy: - assert(J->end >= I->start); + assert(J->end > I->start); // Check for an overlap. if (J->start < I->end) { // I and J are overlapping. Find the later start. @@ -460,11 +460,11 @@ bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP, std::swap(I, J); std::swap(IE, JE); } - // Advance J until J->end >= I->start. + // Advance J until J->end > I->start. do if (++J == JE) return false; - while (J->end < I->start); + while (J->end <= I->start); } } diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index a49f6b0604c5..da55e7f7284b 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -280,9 +280,7 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { bool IsReserved = false; for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { bool IsRootReserved = true; - for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); - Super.isValid(); ++Super) { - MCRegister Reg = *Super; + for (MCPhysReg Reg : TRI->superregs_inclusive(*Root)) { if (!MRI->reg_empty(Reg)) LICalc->createDeadDefs(LR, Reg); // A register unit is considered reserved if all its roots and all their @@ -299,9 +297,7 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) { // Ignore uses of reserved registers. We only track defs of those. 
if (!IsReserved) { for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { - for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); - Super.isValid(); ++Super) { - MCRegister Reg = *Super; + for (MCPhysReg Reg : TRI->superregs_inclusive(*Root)) { if (!MRI->reg_empty(Reg)) LICalc->extendToUses(LR, Reg); } @@ -333,8 +329,7 @@ void LiveIntervals::computeLiveInRegUnits() { SlotIndex Begin = Indexes->getMBBStartIdx(&MBB); LLVM_DEBUG(dbgs() << Begin << "\t" << printMBBReference(MBB)); for (const auto &LI : MBB.liveins()) { - for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { - unsigned Unit = *Units; + for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) { LiveRange *LR = RegUnitRanges[Unit]; if (!LR) { // Use segment set to speed-up initial computation of the live range. @@ -708,9 +703,8 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) { // Find the regunit intervals for the assigned register. They may overlap // the virtual register live range, cancelling any kills. RU.clear(); - for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); - ++Unit) { - const LiveRange &RURange = getRegUnit(*Unit); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + const LiveRange &RURange = getRegUnit(Unit); if (RURange.empty()) continue; RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end))); @@ -1056,10 +1050,9 @@ public: // For physregs, only update the regunits that actually have a // precomputed live range. - for (MCRegUnitIterator Units(Reg.asMCReg(), &TRI); Units.isValid(); - ++Units) - if (LiveRange *LR = getRegUnitLI(*Units)) - updateRange(*LR, *Units, LaneBitmask::getNone()); + for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) + if (LiveRange *LR = getRegUnitLI(Unit)) + updateRange(*LR, Unit, LaneBitmask::getNone()); } if (hasRegMask) updateRegMaskSlots(); @@ -1707,8 +1700,8 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, } void LiveIntervals::removePhysRegDefAt(MCRegister Reg, SlotIndex Pos) { - for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { - if (LiveRange *LR = getCachedRegUnit(*Unit)) + for (MCRegUnit Unit : TRI->regunits(Reg)) { + if (LiveRange *LR = getCachedRegUnit(Unit)) if (VNInfo *VNI = LR->getVNInfoAt(Pos)) LR->removeValNo(VNI); } diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp index d4848f16dcf2..96380d408482 100644 --- a/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -265,14 +265,9 @@ void llvm::addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs) { if (MRI.isReserved(Reg)) continue; // Skip the register if we are about to add one of its super registers. 
- bool ContainsSuperReg = false; - for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) { - if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) { - ContainsSuperReg = true; - break; - } - } - if (ContainsSuperReg) + if (any_of(TRI.superregs(Reg), [&](MCPhysReg SReg) { + return LiveRegs.contains(SReg) && !MRI.isReserved(SReg); + })) continue; MBB.addLiveIn(Reg); } diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index d8b024fbdfea..ff49e080090c 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -82,7 +82,7 @@ void LiveRangeEdit::scanRemattable() { for (VNInfo *VNI : getParent().valnos) { if (VNI->isUnused()) continue; - unsigned Original = VRM->getOriginal(getReg()); + Register Original = VRM->getOriginal(getReg()); LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def); if (!OrigVNI) @@ -181,11 +181,9 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, - const Remat &RM, + Register DestReg, const Remat &RM, const TargetRegisterInfo &tri, - bool Late, - unsigned SubIdx, + bool Late, unsigned SubIdx, MachineInstr *ReplaceIndexMI) { assert(RM.OrigMI && "Invalid remat"); TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri); @@ -288,8 +286,12 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // Never delete a bundled instruction. if (MI->isBundled()) { + // TODO: Handle deleting copy bundles + LLVM_DEBUG(dbgs() << "Won't delete dead bundled inst: " << Idx << '\t' + << *MI); return; } + // Never delete inline asm. if (MI->isInlineAsm()) { LLVM_DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI); @@ -306,7 +308,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { LLVM_DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI); // Collect virtual registers to be erased after MI is gone. - SmallVector<unsigned, 8> RegsToErase; + SmallVector<Register, 8> RegsToErase; bool ReadsPhysRegs = false; bool isOrigDef = false; Register Dest; @@ -350,7 +352,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // unlikely to change anything. We typically don't want to shrink the // PIC base register that has lots of uses everywhere. // Always shrink COPY uses that probably come from live range splitting. 
- if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) || + if ((MI->readsVirtualRegister(Reg) && + (MO.isDef() || TII.isCopyInstr(*MI))) || (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO)))) ToShrink.insert(&LI); else if (MO.readsReg()) diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp index 93f5314539cd..af7d6c4403b8 100644 --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -109,6 +110,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { return false; MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); @@ -197,7 +199,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { // is because it needs more accurate model to handle register // pressure correctly. MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); - if (!DefInstr.isCopy()) + if (!TII.isCopyInstr(DefInstr)) NumEligibleUse++; Insert = FindDominatedInstruction(DefInstr, Insert, IOM); } else { diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index 6ca7f00a7885..6df7e5c10862 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -93,8 +93,8 @@ static bool foreachUnit(const TargetRegisterInfo *TRI, } } } else { - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - if (Func(*Units, VRegInterval)) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if (Func(Unit, VRegInterval)) return true; } } @@ -136,8 +136,8 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) { } bool LiveRegMatrix::isPhysRegUsed(MCRegister PhysReg) const { - for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { - if (!Matrix[*Unit].empty()) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if (!Matrix[Unit].empty()) return true; } return false; @@ -216,7 +216,7 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, LR.addSegment(Seg); // Check for interference with that segment - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { // LR is stack-allocated. LiveRegMatrix caches queries by a key that // includes the address of the live range. If (for the same reg unit) this // checkInterference overload is called twice, without any other query() @@ -230,7 +230,7 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, // subtle bugs due to query identity. Avoiding caching, for example, would // greatly simplify things. 
LiveIntervalUnion::Query Q; - Q.reset(UserTag, LR, Matrix[*Units]); + Q.reset(UserTag, LR, Matrix[Unit]); if (Q.checkInterference()) return true; } @@ -239,8 +239,8 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End, Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const { const LiveInterval *VRegInterval = nullptr; - for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { - if ((VRegInterval = Matrix[*Unit].getOneVReg())) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if ((VRegInterval = Matrix[Unit].getOneVReg())) return VRegInterval->reg(); } diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index 34c81c92707e..9cd74689ba10 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -191,8 +191,7 @@ LiveVariables::FindLastPartialDef(Register Reg, unsigned LastDefReg = 0; unsigned LastDefDist = 0; MachineInstr *LastDef = nullptr; - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { MachineInstr *Def = PhysRegDef[SubReg]; if (!Def) continue; @@ -208,15 +207,13 @@ LiveVariables::FindLastPartialDef(Register Reg, return nullptr; PartDefRegs.insert(LastDefReg); - for (unsigned i = 0, e = LastDef->getNumOperands(); i != e; ++i) { - MachineOperand &MO = LastDef->getOperand(i); - if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0) + for (MachineOperand &MO : LastDef->all_defs()) { + if (MO.getReg() == 0) continue; Register DefReg = MO.getReg(); if (TRI->isSubRegister(Reg, DefReg)) { - for (MCSubRegIterator SubRegs(DefReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - PartDefRegs.insert(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(DefReg)) + PartDefRegs.insert(SubReg); } } return LastDef; @@ -245,8 +242,7 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { true/*IsImp*/)); PhysRegDef[Reg] = LastPartialDef; SmallSet<unsigned, 8> Processed; - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { if (Processed.count(SubReg)) continue; if (PartDefRegs.count(SubReg)) @@ -257,8 +253,8 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { false/*IsDef*/, true/*IsImp*/)); PhysRegDef[SubReg] = LastPartialDef; - for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) - Processed.insert(*SS); + for (MCPhysReg SS : TRI->subregs(SubReg)) + Processed.insert(SS); } } } else if (LastDef && !PhysRegUse[Reg] && @@ -268,9 +264,8 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { true/*IsImp*/)); // Remember this use. - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - PhysRegUse[*SubRegs] = &MI; + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + PhysRegUse[SubReg] = &MI; } /// FindLastRefOrPartRef - Return the last reference or partial reference of @@ -284,8 +279,7 @@ MachineInstr *LiveVariables::FindLastRefOrPartRef(Register Reg) { MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef; unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef]; unsigned LastPartDefDist = 0; - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. 
This is a partial @@ -333,8 +327,7 @@ bool LiveVariables::HandlePhysRegKill(Register Reg, MachineInstr *MI) { MachineInstr *LastPartDef = nullptr; unsigned LastPartDefDist = 0; SmallSet<unsigned, 8> PartUses; - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { MachineInstr *Def = PhysRegDef[SubReg]; if (Def && Def != LastDef) { // There was a def of this sub-register in between. This is a partial @@ -347,9 +340,8 @@ bool LiveVariables::HandlePhysRegKill(Register Reg, MachineInstr *MI) { continue; } if (MachineInstr *Use = PhysRegUse[SubReg]) { - for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true); SS.isValid(); - ++SS) - PartUses.insert(*SS); + for (MCPhysReg SS : TRI->subregs_inclusive(SubReg)) + PartUses.insert(SS); unsigned Dist = DistanceMap[Use]; if (Dist > LastRefOrPartRefDist) { LastRefOrPartRefDist = Dist; @@ -364,8 +356,7 @@ bool LiveVariables::HandlePhysRegKill(Register Reg, MachineInstr *MI) { // dead EAX = op implicit-def AL // That is, EAX def is dead but AL def extends pass it. PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true); - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { if (!PartUses.count(SubReg)) continue; bool NeedDef = true; @@ -384,12 +375,11 @@ bool LiveVariables::HandlePhysRegKill(Register Reg, MachineInstr *MI) { LastSubRef->addRegisterKilled(SubReg, TRI, true); else { LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true); - for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true); - SS.isValid(); ++SS) - PhysRegUse[*SS] = LastRefOrPartRef; + for (MCPhysReg SS : TRI->subregs_inclusive(SubReg)) + PhysRegUse[SS] = LastRefOrPartRef; } - for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS) - PartUses.erase(*SS); + for (MCPhysReg SS : TRI->subregs(SubReg)) + PartUses.erase(SS); } } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) { if (LastPartDef) @@ -430,9 +420,9 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) { // Kill the largest clobbered super-register. // This avoids needless implicit operands. unsigned Super = Reg; - for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) - if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR)) - Super = *SR; + for (MCPhysReg SR : TRI->superregs(Reg)) + if ((PhysRegDef[SR] || PhysRegUse[SR]) && MO.clobbersPhysReg(SR)) + Super = SR; HandlePhysRegKill(Super, nullptr); } } @@ -442,12 +432,10 @@ void LiveVariables::HandlePhysRegDef(Register Reg, MachineInstr *MI, // What parts of the register are previously defined? SmallSet<unsigned, 32> Live; if (PhysRegDef[Reg] || PhysRegUse[Reg]) { - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - Live.insert(*SubRegs); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) + Live.insert(SubReg); } else { - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { // If a register isn't itself defined, but all parts that make up of it // are defined, then consider it also defined. // e.g. 
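Nearly every LiveVariables hunk above and below applies one mechanical modernization: open-coded MCSubRegIterator / MCSuperRegIterator loops (and, in the earlier files, MCRegUnitIterator loops) become range-based for loops over the new TargetRegisterInfo helpers. A minimal before/after sketch of the pattern; Reg, TRI and use() are placeholders here, not lines from this patch:

// Before: manual iterator with an explicit validity check and IncludeSelf flag.
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
     SubRegs.isValid(); ++SubRegs)
  use(*SubRegs);

// After: an iterator_range of MCPhysReg; subregs_inclusive() yields Reg
// itself first and then its sub-registers, matching the IncludeSelf form.
for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
  use(SubReg);

The same shape covers TRI->subregs(), TRI->superregs(), TRI->superregs_inclusive() and TRI->regunits() (whose element type is MCRegUnit), which is why these hunks change loop headers without touching loop bodies.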
@@ -457,9 +445,8 @@ void LiveVariables::HandlePhysRegDef(Register Reg, MachineInstr *MI, if (Live.count(SubReg)) continue; if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) { - for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true); - SS.isValid(); ++SS) - Live.insert(*SS); + for (MCPhysReg SS : TRI->subregs_inclusive(SubReg)) + Live.insert(SS); } } } @@ -468,8 +455,7 @@ void LiveVariables::HandlePhysRegDef(Register Reg, MachineInstr *MI, // is referenced. HandlePhysRegKill(Reg, MI); // Only some of the sub-registers are used. - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs(Reg)) { if (!Live.count(SubReg)) // Skip if this sub-register isn't defined. continue; @@ -484,9 +470,7 @@ void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI, SmallVectorImpl<unsigned> &Defs) { while (!Defs.empty()) { Register Reg = Defs.pop_back_val(); - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) { - unsigned SubReg = *SubRegs; + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) { PhysRegDef[SubReg] = &MI; PhysRegUse[SubReg] = nullptr; } @@ -699,7 +683,7 @@ void LiveVariables::recomputeForSingleDefVirtReg(Register Reg) { if (UseMI.isPHI()) { // If Reg is used in a phi then it is live-to-end of the corresponding // predecessor. - unsigned Idx = UseMI.getOperandNo(&UseMO); + unsigned Idx = UseMO.getOperandNo(); LiveToEndBlocks.push_back(UseMI.getOperand(Idx + 1).getMBB()); } else if (&UseBB == &DefBB) { // A non-phi use in the same BB as the single def must come after the def. diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp index b47c96e50831..24c30b756737 100644 --- a/llvm/lib/CodeGen/LowLevelType.cpp +++ b/llvm/lib/CodeGen/LowLevelType.cpp @@ -12,74 +12,55 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/LowLevelType.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; -LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { - if (auto VTy = dyn_cast<VectorType>(&Ty)) { - auto EC = VTy->getElementCount(); - LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL); - if (EC.isScalar()) - return ScalarTy; - return LLT::vector(EC, ScalarTy); - } - - if (auto PTy = dyn_cast<PointerType>(&Ty)) { - unsigned AddrSpace = PTy->getAddressSpace(); - return LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); - } - - if (Ty.isSized()) { +LLT::LLT(MVT VT) { + if (VT.isVector()) { + bool asVector = VT.getVectorMinNumElements() > 1; + init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector, + VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(), + /*AddressSpace=*/0); + } else if (VT.isValid() && !VT.isScalableTargetExtVT()) { // Aggregates are no different from real scalars as far as GlobalISel is // concerned. 
- auto SizeInBits = DL.getTypeSizeInBits(&Ty); - assert(SizeInBits != 0 && "invalid zero-sized type"); - return LLT::scalar(SizeInBits); + init(/*IsPointer=*/false, /*IsVector=*/false, /*IsScalar=*/true, + ElementCount::getFixed(0), VT.getSizeInBits(), /*AddressSpace=*/0); + } else { + IsScalar = false; + IsPointer = false; + IsVector = false; + RawData = 0; } - - return LLT(); } -MVT llvm::getMVTForLLT(LLT Ty) { - if (!Ty.isVector()) - return MVT::getIntegerVT(Ty.getSizeInBits()); - - return MVT::getVectorVT( - MVT::getIntegerVT(Ty.getElementType().getSizeInBits()), - Ty.getNumElements()); +void LLT::print(raw_ostream &OS) const { + if (isVector()) { + OS << "<"; + OS << getElementCount() << " x " << getElementType() << ">"; + } else if (isPointer()) + OS << "p" << getAddressSpace(); + else if (isValid()) { + assert(isScalar() && "unexpected type"); + OS << "s" << getScalarSizeInBits(); + } else + OS << "LLT_invalid"; } -EVT llvm::getApproximateEVTForLLT(LLT Ty, const DataLayout &DL, - LLVMContext &Ctx) { - if (Ty.isVector()) { - EVT EltVT = getApproximateEVTForLLT(Ty.getElementType(), DL, Ctx); - return EVT::getVectorVT(Ctx, EltVT, Ty.getElementCount()); - } - - return EVT::getIntegerVT(Ctx, Ty.getSizeInBits()); -} - -LLT llvm::getLLTForMVT(MVT Ty) { - if (!Ty.isVector()) - return LLT::scalar(Ty.getSizeInBits()); - - return LLT::scalarOrVector(Ty.getVectorElementCount(), - Ty.getVectorElementType().getSizeInBits()); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LLT::dump() const { + print(dbgs()); + dbgs() << '\n'; } +#endif -const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) { - assert(Ty.isScalar() && "Expected a scalar type."); - switch (Ty.getSizeInBits()) { - case 16: - return APFloat::IEEEhalf(); - case 32: - return APFloat::IEEEsingle(); - case 64: - return APFloat::IEEEdouble(); - case 128: - return APFloat::IEEEquad(); - } - llvm_unreachable("Invalid FP type size."); -} +const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo; +const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo; +const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo; +const constexpr LLT::BitFieldInfo LLT::VectorSizeFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerVectorElementsFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerVectorScalableFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerVectorSizeFieldInfo; +const constexpr LLT::BitFieldInfo LLT::PointerVectorAddressSpaceFieldInfo; diff --git a/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp new file mode 100644 index 000000000000..bc2ea3f05b6d --- /dev/null +++ b/llvm/lib/CodeGen/LowLevelTypeUtils.cpp @@ -0,0 +1,85 @@ +//===-- llvm/CodeGen/LowLevelTypeUtils.cpp --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file implements the more header-heavy bits of the LLT class to +/// avoid polluting users' namespaces. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LowLevelTypeUtils.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +using namespace llvm; + +LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) { + if (auto VTy = dyn_cast<VectorType>(&Ty)) { + auto EC = VTy->getElementCount(); + LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL); + if (EC.isScalar()) + return ScalarTy; + return LLT::vector(EC, ScalarTy); + } + + if (auto PTy = dyn_cast<PointerType>(&Ty)) { + unsigned AddrSpace = PTy->getAddressSpace(); + return LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)); + } + + if (Ty.isSized() && !Ty.isScalableTargetExtTy()) { + // Aggregates are no different from real scalars as far as GlobalISel is + // concerned. + auto SizeInBits = DL.getTypeSizeInBits(&Ty); + assert(SizeInBits != 0 && "invalid zero-sized type"); + return LLT::scalar(SizeInBits); + } + + return LLT(); +} + +MVT llvm::getMVTForLLT(LLT Ty) { + if (!Ty.isVector()) + return MVT::getIntegerVT(Ty.getSizeInBits()); + + return MVT::getVectorVT( + MVT::getIntegerVT(Ty.getElementType().getSizeInBits()), + Ty.getNumElements()); +} + +EVT llvm::getApproximateEVTForLLT(LLT Ty, const DataLayout &DL, + LLVMContext &Ctx) { + if (Ty.isVector()) { + EVT EltVT = getApproximateEVTForLLT(Ty.getElementType(), DL, Ctx); + return EVT::getVectorVT(Ctx, EltVT, Ty.getElementCount()); + } + + return EVT::getIntegerVT(Ctx, Ty.getSizeInBits()); +} + +LLT llvm::getLLTForMVT(MVT Ty) { + if (!Ty.isVector()) + return LLT::scalar(Ty.getSizeInBits()); + + return LLT::scalarOrVector(Ty.getVectorElementCount(), + Ty.getVectorElementType().getSizeInBits()); +} + +const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) { + assert(Ty.isScalar() && "Expected a scalar type."); + switch (Ty.getSizeInBits()) { + case 16: + return APFloat::IEEEhalf(); + case 32: + return APFloat::IEEEsingle(); + case 64: + return APFloat::IEEEdouble(); + case 128: + return APFloat::IEEEquad(); + } + llvm_unreachable("Invalid FP type size."); +} diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp index ad8a17f25ec5..8d17cceeb3cd 100644 --- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp +++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp @@ -18,10 +18,13 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/xxhash.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" using namespace llvm; @@ -30,6 +33,13 @@ using namespace sampleprofutil; #define DEBUG_TYPE "mirfs-discriminators" +// TODO(xur): Remove this option and related code once we make true as the +// default. +cl::opt<bool> ImprovedFSDiscriminator( + "improved-fs-discriminator", cl::Hidden, cl::init(false), + cl::desc("New FS discriminators encoding (incompatible with the original " + "encoding)")); + char MIRAddFSDiscriminators::ID = 0; INITIALIZE_PASS(MIRAddFSDiscriminators, DEBUG_TYPE, @@ -42,11 +52,12 @@ FunctionPass *llvm::createMIRAddFSDiscriminatorsPass(FSDiscriminatorPass P) { return new MIRAddFSDiscriminators(P); } +// TODO(xur): Remove this once we switch to ImprovedFSDiscriminator. 
// Compute a hash value using debug line number, and the line numbers from the // inline stack. -static uint64_t getCallStackHash(const MachineBasicBlock &BB, - const MachineInstr &MI, - const DILocation *DIL) { +static uint64_t getCallStackHashV0(const MachineBasicBlock &BB, + const MachineInstr &MI, + const DILocation *DIL) { auto updateHash = [](const StringRef &Str) -> uint64_t { if (Str.empty()) return 0; @@ -62,6 +73,19 @@ static uint64_t getCallStackHash(const MachineBasicBlock &BB, return Ret; } +static uint64_t getCallStackHash(const DILocation *DIL) { + auto hashCombine = [](const uint64_t Seed, const uint64_t Val) { + std::hash<uint64_t> Hasher; + return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2)); + }; + uint64_t Ret = 0; + for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) { + Ret = hashCombine(Ret, xxh3_64bits(ArrayRef<uint8_t>(DIL->getLine()))); + Ret = hashCombine(Ret, xxh3_64bits(DIL->getSubprogramLinkageName())); + } + return Ret; +} + // Traverse the CFG and assign FD discriminators. If two instructions // have the same lineno and discriminator, but residing in different BBs, // the latter instruction will get a new discriminator value. The new @@ -70,11 +94,16 @@ static uint64_t getCallStackHash(const MachineBasicBlock &BB, bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { if (!EnableFSDiscriminator) return false; - if (!MF.getFunction().shouldEmitDebugInfoForProfiling()) + + bool HasPseudoProbe = MF.getFunction().getParent()->getNamedMetadata( + PseudoProbeDescMetadataName); + + if (!HasPseudoProbe && !MF.getFunction().shouldEmitDebugInfoForProfiling()) return false; bool Changed = false; - using LocationDiscriminator = std::tuple<StringRef, unsigned, unsigned>; + using LocationDiscriminator = + std::tuple<StringRef, unsigned, unsigned, uint64_t>; using BBSet = DenseSet<const MachineBasicBlock *>; using LocationDiscriminatorBBMap = DenseMap<LocationDiscriminator, BBSet>; using LocationDiscriminatorCurrPassMap = @@ -84,7 +113,12 @@ bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { LocationDiscriminatorCurrPassMap LDCM; // Mask of discriminators before this pass. - unsigned BitMaskBefore = getN1Bits(LowBit); + // TODO(xur): simplify this once we switch to ImprovedFSDiscriminator. + unsigned LowBitTemp = LowBit; + assert(LowBit > 0 && "LowBit in FSDiscriminator cannot be 0"); + if (ImprovedFSDiscriminator) + LowBitTemp -= 1; + unsigned BitMaskBefore = getN1Bits(LowBitTemp); // Mask of discriminators including this pass. unsigned BitMaskNow = getN1Bits(HighBit); // Mask of discriminators for bits specific to this pass. @@ -92,17 +126,42 @@ bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { unsigned NumNewD = 0; LLVM_DEBUG(dbgs() << "MIRAddFSDiscriminators working on Func: " - << MF.getFunction().getName() << "\n"); + << MF.getFunction().getName() << " Highbit=" << HighBit + << "\n"); + for (MachineBasicBlock &BB : MF) { for (MachineInstr &I : BB) { + if (HasPseudoProbe) { + // Only assign discriminators to pseudo probe instructions. Call + // instructions are excluded since their dwarf discriminators are used + // for other purposes, i.e, storing probe ids. + if (!I.isPseudoProbe()) + continue; + } else if (ImprovedFSDiscriminator && I.isMetaInstruction()) { + continue; + } const DILocation *DIL = I.getDebugLoc().get(); if (!DIL) continue; - unsigned LineNo = DIL->getLine(); + + // Use the id of pseudo probe to compute the discriminator. 
+ unsigned LineNo = + I.isPseudoProbe() ? I.getOperand(1).getImm() : DIL->getLine(); if (LineNo == 0) continue; unsigned Discriminator = DIL->getDiscriminator(); - LocationDiscriminator LD{DIL->getFilename(), LineNo, Discriminator}; + // Clean up discriminators for pseudo probes at the first FS discriminator + // pass as their discriminators should not ever be used. + if ((Pass == FSDiscriminatorPass::Pass1) && I.isPseudoProbe()) { + Discriminator = 0; + I.setDebugLoc(DIL->cloneWithDiscriminator(0)); + } + uint64_t CallStackHashVal = 0; + if (ImprovedFSDiscriminator) + CallStackHashVal = getCallStackHash(DIL); + + LocationDiscriminator LD{DIL->getFilename(), LineNo, Discriminator, + CallStackHashVal}; auto &BBMap = LDBM[LD]; auto R = BBMap.insert(&BB); if (BBMap.size() == 1) @@ -111,7 +170,8 @@ bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) { unsigned DiscriminatorCurrPass; DiscriminatorCurrPass = R.second ? ++LDCM[LD] : LDCM[LD]; DiscriminatorCurrPass = DiscriminatorCurrPass << LowBit; - DiscriminatorCurrPass += getCallStackHash(BB, I, DIL); + if (!ImprovedFSDiscriminator) + DiscriminatorCurrPass += getCallStackHashV0(BB, I, DIL); DiscriminatorCurrPass &= BitMaskThisPass; unsigned NewD = Discriminator | DiscriminatorCurrPass; const auto *const NewDIL = DIL->cloneWithDiscriminator(NewD); diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index c136b08223b8..a4c1ba340e46 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -214,6 +214,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("nsw", MIToken::kw_nsw) .Case("exact", MIToken::kw_exact) .Case("nofpexcept", MIToken::kw_nofpexcept) + .Case("unpredictable", MIToken::kw_unpredictable) .Case("debug-location", MIToken::kw_debug_location) .Case("debug-instr-number", MIToken::kw_debug_instr_number) .Case("dbg-instr-ref", MIToken::kw_dbg_instr_ref) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index ac484cdfd6c8..7149c29d6ba7 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -73,6 +73,7 @@ struct MIToken { kw_nsw, kw_exact, kw_nofpexcept, + kw_unpredictable, kw_debug_location, kw_debug_instr_number, kw_dbg_instr_ref, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 525f49347fc4..bfd9286ff59c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/AsmParser/Parser.h" #include "llvm/AsmParser/SlotMapping.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -62,7 +63,6 @@ #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" @@ -470,7 +470,7 @@ public: bool parseJumpTableIndexOperand(MachineOperand &Dest); bool parseExternalSymbolOperand(MachineOperand &Dest); bool parseMCSymbolOperand(MachineOperand &Dest); - bool parseMDNode(MDNode *&Node); + [[nodiscard]] bool parseMDNode(MDNode *&Node); bool parseDIExpression(MDNode *&Expr); bool parseDILocation(MDNode *&Expr); bool parseMetadataOperand(MachineOperand &Dest); @@ -1451,7 +1451,8 
@@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Token.is(MIToken::kw_nuw) || Token.is(MIToken::kw_nsw) || Token.is(MIToken::kw_exact) || - Token.is(MIToken::kw_nofpexcept)) { + Token.is(MIToken::kw_nofpexcept) || + Token.is(MIToken::kw_unpredictable)) { // Mine frame and fast math flags if (Token.is(MIToken::kw_frame_setup)) Flags |= MachineInstr::FrameSetup; @@ -1479,6 +1480,8 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) { Flags |= MachineInstr::IsExact; if (Token.is(MIToken::kw_nofpexcept)) Flags |= MachineInstr::NoFPExcept; + if (Token.is(MIToken::kw_unpredictable)) + Flags |= MachineInstr::Unpredictable; lex(); } @@ -2414,7 +2417,7 @@ bool MIParser::parseMetadataOperand(MachineOperand &Dest) { bool MIParser::parseCFIOffset(int &Offset) { if (Token.isNot(MIToken::IntegerLiteral)) return error("expected a cfi offset"); - if (Token.integerValue().getMinSignedBits() > 32) + if (Token.integerValue().getSignificantBits() > 32) return error("expected a 32 bit integer (the cfi offset is too large)"); Offset = (int)Token.integerValue().getExtValue(); lex(); @@ -2520,7 +2523,7 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) { parseCFIAddressSpace(AddressSpace)) return true; CFIIndex = MF.addFrameInst(MCCFIInstruction::createLLVMDefAspaceCfa( - nullptr, Reg, Offset, AddressSpace)); + nullptr, Reg, Offset, AddressSpace, SMLoc())); break; case MIToken::kw_cfi_remember_state: CFIIndex = MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr)); @@ -3001,7 +3004,7 @@ bool MIParser::parseOffset(int64_t &Offset) { lex(); if (Token.isNot(MIToken::IntegerLiteral)) return error("expected an integer literal after '" + Sign + "'"); - if (Token.integerValue().getMinSignedBits() > 64) + if (Token.integerValue().getSignificantBits() > 64) return error("expected 64-bit integer (too large)"); Offset = Token.integerValue().getExtValue(); if (IsNegative) @@ -3471,7 +3474,8 @@ bool MIParser::parseHeapAllocMarker(MDNode *&Node) { assert(Token.is(MIToken::kw_heap_alloc_marker) && "Invalid token for a heap alloc marker!"); lex(); - parseMDNode(Node); + if (parseMDNode(Node)) + return true; if (!Node) return error("expected a MDNode after 'heap-alloc-marker'"); if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) || @@ -3487,7 +3491,8 @@ bool MIParser::parsePCSections(MDNode *&Node) { assert(Token.is(MIToken::kw_pcsections) && "Invalid token for a PC sections!"); lex(); - parseMDNode(Node); + if (parseMDNode(Node)) + return true; if (!Node) return error("expected a MDNode after 'pcsections'"); if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) || diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index a20c2bfe6c0f..b2e570c5e67e 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -130,6 +130,16 @@ public: const yaml::StringValue &RegisterSource, bool IsRestored, int FrameIdx); + struct VarExprLoc { + DILocalVariable *DIVar = nullptr; + DIExpression *DIExpr = nullptr; + DILocation *DILoc = nullptr; + }; + + std::optional<VarExprLoc> parseVarExprLoc(PerFunctionMIParsingState &PFS, + const yaml::StringValue &VarStr, + const yaml::StringValue &ExprStr, + const yaml::StringValue &LocStr); template <typename T> bool parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS, const T &Object, @@ -392,7 +402,7 @@ bool MIRParserImpl::initializeCallSiteInfo( MachineFunction &MF = PFS.MF; SMDiagnostic Error; const LLVMTargetMachine &TM = MF.getTarget(); - 
for (auto YamlCSInfo : YamlMF.CallSitesInfo) { + for (auto &YamlCSInfo : YamlMF.CallSitesInfo) { yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation; if (MILoc.BlockNum >= MF.size()) return error(Twine(MF.getName()) + @@ -468,6 +478,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MF.setHasEHCatchret(YamlMF.HasEHCatchret); MF.setHasEHScopes(YamlMF.HasEHScopes); MF.setHasEHFunclets(YamlMF.HasEHFunclets); + MF.setIsOutlined(YamlMF.IsOutlined); if (YamlMF.Legalized) MF.getProperties().set(MachineFunctionProperties::Property::Legalized); @@ -792,6 +803,24 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, return true; } + for (const auto &Object : YamlMF.EntryValueObjects) { + SMDiagnostic Error; + Register Reg; + if (parseNamedRegisterReference(PFS, Reg, Object.EntryValueRegister.Value, + Error)) + return error(Error, Object.EntryValueRegister.SourceRange); + if (!Reg.isPhysical()) + return error(Object.EntryValueRegister.SourceRange.Start, + "Expected physical register for entry value field"); + std::optional<VarExprLoc> MaybeInfo = parseVarExprLoc( + PFS, Object.DebugVar, Object.DebugExpr, Object.DebugLoc); + if (!MaybeInfo) + return true; + if (MaybeInfo->DIVar || MaybeInfo->DIExpr || MaybeInfo->DILoc) + PFS.MF.setVariableDbgInfo(MaybeInfo->DIVar, MaybeInfo->DIExpr, + Reg.asMCReg(), MaybeInfo->DILoc); + } + // Initialize the ordinary frame objects. for (const auto &Object : YamlMF.StackObjects) { int ObjectIdx; @@ -887,26 +916,37 @@ static bool typecheckMDNode(T *&Result, MDNode *Node, return false; } -template <typename T> -bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS, - const T &Object, int FrameIdx) { - // Debug information can only be attached to stack objects; Fixed stack - // objects aren't supported. 
- MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr; - if (parseMDNode(PFS, Var, Object.DebugVar) || - parseMDNode(PFS, Expr, Object.DebugExpr) || - parseMDNode(PFS, Loc, Object.DebugLoc)) - return true; - if (!Var && !Expr && !Loc) - return false; +std::optional<MIRParserImpl::VarExprLoc> MIRParserImpl::parseVarExprLoc( + PerFunctionMIParsingState &PFS, const yaml::StringValue &VarStr, + const yaml::StringValue &ExprStr, const yaml::StringValue &LocStr) { + MDNode *Var = nullptr; + MDNode *Expr = nullptr; + MDNode *Loc = nullptr; + if (parseMDNode(PFS, Var, VarStr) || parseMDNode(PFS, Expr, ExprStr) || + parseMDNode(PFS, Loc, LocStr)) + return std::nullopt; DILocalVariable *DIVar = nullptr; DIExpression *DIExpr = nullptr; DILocation *DILoc = nullptr; - if (typecheckMDNode(DIVar, Var, Object.DebugVar, "DILocalVariable", *this) || - typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) || - typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this)) + if (typecheckMDNode(DIVar, Var, VarStr, "DILocalVariable", *this) || + typecheckMDNode(DIExpr, Expr, ExprStr, "DIExpression", *this) || + typecheckMDNode(DILoc, Loc, LocStr, "DILocation", *this)) + return std::nullopt; + return VarExprLoc{DIVar, DIExpr, DILoc}; +} + +template <typename T> +bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS, + const T &Object, int FrameIdx) { + std::optional<VarExprLoc> MaybeInfo = + parseVarExprLoc(PFS, Object.DebugVar, Object.DebugExpr, Object.DebugLoc); + if (!MaybeInfo) return true; - PFS.MF.setVariableDbgInfo(DIVar, DIExpr, FrameIdx, DILoc); + // Debug information can only be attached to stack objects; Fixed stack + // objects aren't supported. + if (MaybeInfo->DIVar || MaybeInfo->DIExpr || MaybeInfo->DILoc) + PFS.MF.setVariableDbgInfo(MaybeInfo->DIVar, MaybeInfo->DIExpr, FrameIdx, + MaybeInfo->DILoc); return false; } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 0a4b28ac79a7..b91d9c4727fc 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -47,7 +48,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -119,6 +119,9 @@ public: const MachineJumpTableInfo &JTI); void convertStackObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST); + void convertEntryValueObjects(yaml::MachineFunction &YMF, + const MachineFunction &MF, + ModuleSlotTracker &MST); void convertCallSiteObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST); @@ -200,6 +203,7 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.HasEHCatchret = MF.hasEHCatchret(); YamlMF.HasEHScopes = MF.hasEHScopes(); YamlMF.HasEHFunclets = MF.hasEHFunclets(); + YamlMF.IsOutlined = MF.isOutlined(); YamlMF.UseDebugInstrRef = MF.useDebugInstrRef(); YamlMF.Legalized = MF.getProperties().hasProperty( @@ -220,6 +224,7 @@ void MIRPrinter::print(const MachineFunction &MF) { MST.incorporateFunction(MF.getFunction()); convert(MST, YamlMF.FrameInfo, MF.getFrameInfo()); 
convertStackObjects(YamlMF, MF, MST); + convertEntryValueObjects(YamlMF, MF, MST); convertCallSiteObjects(YamlMF, MF, MST); for (const auto &Sub : MF.DebugValueSubstitutions) { const auto &SubSrc = Sub.Src; @@ -372,6 +377,19 @@ void MIRPrinter::convert(ModuleSlotTracker &MST, } } +void MIRPrinter::convertEntryValueObjects(yaml::MachineFunction &YMF, + const MachineFunction &MF, + ModuleSlotTracker &MST) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + for (const MachineFunction::VariableDbgInfo &DebugVar : + MF.getEntryValueVariableDbgInfo()) { + yaml::EntryValueObject &Obj = YMF.EntryValueObjects.emplace_back(); + printStackObjectDbgInfo(DebugVar, Obj, MST); + MCRegister EntryValReg = DebugVar.getEntryValueRegister(); + printRegMIR(EntryValReg, Obj.EntryValueRegister, TRI); + } +} + void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST) { @@ -490,17 +508,17 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF, // Print the debug variable information. for (const MachineFunction::VariableDbgInfo &DebugVar : - MF.getVariableDbgInfo()) { - assert(DebugVar.Slot >= MFI.getObjectIndexBegin() && - DebugVar.Slot < MFI.getObjectIndexEnd() && + MF.getInStackSlotVariableDbgInfo()) { + int Idx = DebugVar.getStackSlot(); + assert(Idx >= MFI.getObjectIndexBegin() && Idx < MFI.getObjectIndexEnd() && "Invalid stack object index"); - if (DebugVar.Slot < 0) { // Negative index means fixed objects. + if (Idx < 0) { // Negative index means fixed objects. auto &Object = - YMF.FixedStackObjects[FixedStackObjectsIdx[DebugVar.Slot + + YMF.FixedStackObjects[FixedStackObjectsIdx[Idx + MFI.getNumFixedObjects()]]; printStackObjectDbgInfo(DebugVar, Object, MST); } else { - auto &Object = YMF.StackObjects[StackObjectsIdx[DebugVar.Slot]]; + auto &Object = YMF.StackObjects[StackObjectsIdx[Idx]]; printStackObjectDbgInfo(DebugVar, Object, MST); } } @@ -783,6 +801,8 @@ void MIPrinter::print(const MachineInstr &MI) { OS << "nofpexcept "; if (MI.getFlag(MachineInstr::NoMerge)) OS << "nomerge "; + if (MI.getFlag(MachineInstr::Unpredictable)) + OS << "unpredictable "; OS << TII->getName(MI.getOpcode()); if (I < E) diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp index a8996a586909..96f8589e682d 100644 --- a/llvm/lib/CodeGen/MIRSampleProfile.cpp +++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -18,17 +18,21 @@ #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PseudoProbe.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" +#include <optional> using namespace llvm; using namespace sampleprof; @@ -57,6 +61,7 @@ static cl::opt<bool> ViewBFIAfter("fs-viewbfi-after", cl::Hidden, cl::init(false), cl::desc("View BFI after MIR loader")); +extern cl::opt<bool> ImprovedFSDiscriminator; char MIRProfileLoaderPass::ID = 0; INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE, 
@@ -72,10 +77,11 @@ INITIALIZE_PASS_END(MIRProfileLoaderPass, DEBUG_TYPE, "Load MIR Sample Profile", char &llvm::MIRProfileLoaderPassID = MIRProfileLoaderPass::ID; -FunctionPass *llvm::createMIRProfileLoaderPass(std::string File, - std::string RemappingFile, - FSDiscriminatorPass P) { - return new MIRProfileLoaderPass(File, RemappingFile, P); +FunctionPass * +llvm::createMIRProfileLoaderPass(std::string File, std::string RemappingFile, + FSDiscriminatorPass P, + IntrusiveRefCntPtr<vfs::FileSystem> FS) { + return new MIRProfileLoaderPass(File, RemappingFile, P, std::move(FS)); } namespace llvm { @@ -89,6 +95,22 @@ extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI; // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; +std::optional<PseudoProbe> extractProbe(const MachineInstr &MI) { + if (MI.isPseudoProbe()) { + PseudoProbe Probe; + Probe.Id = MI.getOperand(1).getImm(); + Probe.Type = MI.getOperand(2).getImm(); + Probe.Attr = MI.getOperand(3).getImm(); + Probe.Factor = 1; + DILocation *DebugLoc = MI.getDebugLoc(); + Probe.Discriminator = DebugLoc ? DebugLoc->getDiscriminator() : 0; + return Probe; + } + + // Ignore callsite probes since they do not have FS discriminators. + return std::nullopt; +} + namespace afdo_detail { template <> struct IRTraits<MachineBasicBlock> { using InstructionT = MachineInstr; @@ -118,7 +140,7 @@ template <> struct IRTraits<MachineBasicBlock> { } // namespace afdo_detail class MIRProfileLoader final - : public SampleProfileLoaderBaseImpl<MachineBasicBlock> { + : public SampleProfileLoaderBaseImpl<MachineFunction> { public: void setInitVals(MachineDominatorTree *MDT, MachinePostDominatorTree *MPDT, MachineLoopInfo *MLI, MachineBlockFrequencyInfo *MBFI, @@ -136,9 +158,10 @@ public: assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit"); } - MIRProfileLoader(StringRef Name, StringRef RemapName) - : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)) { - } + MIRProfileLoader(StringRef Name, StringRef RemapName, + IntrusiveRefCntPtr<vfs::FileSystem> FS) + : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), + std::move(FS)) {} void setBranchProbs(MachineFunction &F); bool runOnFunction(MachineFunction &F); @@ -162,11 +185,18 @@ protected: unsigned HighBit; bool ProfileIsValid = true; + ErrorOr<uint64_t> getInstWeight(const MachineInstr &MI) override { + if (FunctionSamples::ProfileIsProbeBased) + return getProbeWeight(MI); + if (ImprovedFSDiscriminator && MI.isMetaInstruction()) + return std::error_code(); + return getInstWeightImpl(MI); + } }; template <> -void SampleProfileLoaderBaseImpl< - MachineBasicBlock>::computeDominanceAndLoopInfo(MachineFunction &F) {} +void SampleProfileLoaderBaseImpl<MachineFunction>::computeDominanceAndLoopInfo( + MachineFunction &F) {} void MIRProfileLoader::setBranchProbs(MachineFunction &F) { LLVM_DEBUG(dbgs() << "\nPropagation complete. 
Setting branch probs\n"); @@ -254,8 +284,8 @@ void MIRProfileLoader::setBranchProbs(MachineFunction &F) { bool MIRProfileLoader::doInitialization(Module &M) { auto &Ctx = M.getContext(); - auto ReaderOrErr = sampleprof::SampleProfileReader::create(Filename, Ctx, P, - RemappingFilename); + auto ReaderOrErr = sampleprof::SampleProfileReader::create( + Filename, Ctx, *FS, P, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); @@ -265,20 +295,41 @@ bool MIRProfileLoader::doInitialization(Module &M) { Reader = std::move(ReaderOrErr.get()); Reader->setModule(&M); ProfileIsValid = (Reader->read() == sampleprof_error::success); - Reader->getSummary(); + + // Load pseudo probe descriptors for probe-based function samples. + if (Reader->profileIsProbeBased()) { + ProbeManager = std::make_unique<PseudoProbeManager>(M); + if (!ProbeManager->moduleIsProbed(M)) { + return false; + } + } return true; } bool MIRProfileLoader::runOnFunction(MachineFunction &MF) { + // Do not load non-FS profiles. A line or probe can get a zero-valued + // discriminator at certain pass which could result in accidentally loading + // the corresponding base counter in the non-FS profile, while a non-zero + // discriminator would end up getting zero samples. This could in turn undo + // the sample distribution effort done by previous BFI maintenance and the + // probe distribution factor work for pseudo probes. + if (!Reader->profileIsFS()) + return false; + Function &Func = MF.getFunction(); clearFunctionData(false); Samples = Reader->getSamplesFor(Func); if (!Samples || Samples->empty()) return false; - if (getFunctionLoc(MF) == 0) - return false; + if (FunctionSamples::ProfileIsProbeBased) { + if (!ProbeManager->profileIsValid(MF.getFunction(), *Samples)) + return false; + } else { + if (getFunctionLoc(MF) == 0) + return false; + } DenseSet<GlobalValue::GUID> InlinedGUIDs; bool Changed = computeAndPropagateWeights(MF, InlinedGUIDs); @@ -291,14 +342,16 @@ bool MIRProfileLoader::runOnFunction(MachineFunction &MF) { } // namespace llvm -MIRProfileLoaderPass::MIRProfileLoaderPass(std::string FileName, - std::string RemappingFileName, - FSDiscriminatorPass P) - : MachineFunctionPass(ID), ProfileFileName(FileName), P(P), - MIRSampleLoader( - std::make_unique<MIRProfileLoader>(FileName, RemappingFileName)) { +MIRProfileLoaderPass::MIRProfileLoaderPass( + std::string FileName, std::string RemappingFileName, FSDiscriminatorPass P, + IntrusiveRefCntPtr<vfs::FileSystem> FS) + : MachineFunctionPass(ID), ProfileFileName(FileName), P(P) { LowBit = getFSPassBitBegin(P); HighBit = getFSPassBitEnd(P); + + auto VFS = FS ? 
std::move(FS) : vfs::getRealFileSystem(); + MIRSampleLoader = std::make_unique<MIRProfileLoader>( + FileName, RemappingFileName, std::move(VFS)); assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit"); } diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp index e634a2b284c3..812d57984e6c 100644 --- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp +++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp @@ -39,7 +39,7 @@ VRegRenamer::getVRegRenameMap(const std::vector<NamedVReg> &VRegs) { StringMap<unsigned> VRegNameCollisionMap; auto GetUniqueVRegName = [&VRegNameCollisionMap](const NamedVReg &Reg) { - if (VRegNameCollisionMap.find(Reg.getName()) == VRegNameCollisionMap.end()) + if (!VRegNameCollisionMap.contains(Reg.getName())) VRegNameCollisionMap[Reg.getName()] = 0; const unsigned Counter = ++VRegNameCollisionMap[Reg.getName()]; return Reg.getName() + "__" + std::to_string(Counter); diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp index 5cc8ad3d609e..7b3746fde503 100644 --- a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp @@ -13,6 +13,7 @@ #include "AllocationOrder.h" #include "RegAllocEvictionAdvisor.h" #include "RegAllocGreedy.h" +#include "llvm/Analysis/InteractiveModelRunner.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/TensorSpec.h" #if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TFLITE) @@ -38,6 +39,7 @@ #include "llvm/Support/ErrorHandling.h" #include <array> +#include <bitset> #include <memory> using namespace llvm; @@ -52,6 +54,14 @@ using CompiledModelType = RegallocEvictModel; using CompiledModelType = NoopSavedModelImpl; #endif +static cl::opt<std::string> InteractiveChannelBaseName( + "regalloc-evict-interactive-channel-base", cl::Hidden, + cl::desc( + "Base file path for the interactive mode. The incoming filename should " + "have the name <regalloc-evict-interactive-channel-base>.in, while the " + "outgoing name should be " + "<regalloc-evict-interactive-channel-base>.out")); + // Options that only make sense in development mode #ifdef LLVM_HAVE_TFLITE #include "RegAllocScore.h" @@ -74,12 +84,12 @@ static cl::opt<bool> EnableDevelopmentFeatures( static const bool EnableDevelopmentFeatures = false; #endif // #ifdef LLVM_HAVE_TFLITE -extern cl::opt<unsigned> EvictInterferenceCutoff; - /// The score injection pass. /// This pass calculates the score for a function and inserts it in the log, but /// this happens only in development mode. It's a no-op otherwise. namespace llvm { +extern cl::opt<unsigned> EvictInterferenceCutoff; + class RegAllocScoring : public MachineFunctionPass { public: static char ID; @@ -213,6 +223,8 @@ static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences}; // will be guaranteed to be to a mask == 1 position. Using a macro here to // avoid 'not used' warnings (and keep cond compilation to a minimum) #define DecisionName "index_to_evict" +static const TensorSpec DecisionSpec = + TensorSpec::createSpec<int64_t>(DecisionName, {1}); // Named features index. 
enum FeatureIDs { @@ -382,14 +394,21 @@ private: std::unique_ptr<RegAllocEvictionAdvisor> getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { - if (!Runner) - Runner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>( - MF.getFunction().getContext(), InputFeatures, DecisionName); + if (!Runner) { + if (InteractiveChannelBaseName.empty()) + Runner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>( + MF.getFunction().getContext(), InputFeatures, DecisionName); + else + Runner = std::make_unique<InteractiveModelRunner>( + MF.getFunction().getContext(), InputFeatures, DecisionSpec, + InteractiveChannelBaseName + ".out", + InteractiveChannelBaseName + ".in"); + } return std::make_unique<MLEvictAdvisor>( MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(), getAnalysis<MachineLoopInfo>()); } - std::unique_ptr<ReleaseModeModelRunner<CompiledModelType>> Runner; + std::unique_ptr<MLModelRunner> Runner; }; // =================================== @@ -398,8 +417,6 @@ private: // // Features we log #ifdef LLVM_HAVE_TFLITE -static const TensorSpec Output = - TensorSpec::createSpec<int64_t>(DecisionName, {1}); static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1}); // Features we bind on the model. The tensor names have a prefix, and we also @@ -458,7 +475,7 @@ public: void logRewardIfNeeded(const MachineFunction &MF, llvm::function_ref<float()> GetReward) override { - if (!Log) + if (!Log || !Log->hasAnyObservationForContext(MF.getName())) return; // The function pass manager would run all the function passes for a // function, so we assume the last context belongs to this function. If @@ -512,7 +529,7 @@ private: // We always log the output; in particular, if we're not evaluating, we // don't have an output spec json file. That's why we handle the // 'normal' output separately. - LFS.push_back(Output); + LFS.push_back(DecisionSpec); Log = std::make_unique<Logger>(std::move(OS), LFS, Reward, /*IncludeReward*/ true); @@ -557,6 +574,7 @@ MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA, Runner(std::move(Runner)), MBFI(MBFI), Loops(Loops), InitialQSize(MLEvictAdvisor::getInitialQueueSize(MF)) { assert(this->Runner); + Runner->switchContext(MF.getName()); DoNotNormalize.set(FeatureIDs::mask); DoNotNormalize.set(FeatureIDs::is_free); DoNotNormalize.set(FeatureIDs::is_hint); @@ -594,8 +612,8 @@ bool MLEvictAdvisor::loadInterferenceFeatures( unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); SmallVector<const LiveInterval *, MaxInterferences> InterferingIntervals; - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit); // Different from the default heuristic, we don't make any assumptions // about what having more than 10 results in the query may mean. const auto &IFIntervals = Q.interferingVRegs(EvictInterferenceCutoff); @@ -1134,7 +1152,10 @@ bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) { #endif // #ifdef LLVM_HAVE_TFLITE RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() { - return new ReleaseModeEvictionAdvisorAnalysis(); + return llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() || + !InteractiveChannelBaseName.empty() + ? new ReleaseModeEvictionAdvisorAnalysis() + : nullptr; } // In all cases except development mode, we don't need scoring. 
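The next file gives the priority advisor the same interactive-channel plumbing the eviction advisor gained above. Condensed into one hypothetical helper (makeRunner is not upstream code, but the types and constructor calls are the ones used in these hunks), the shared selection logic is:

#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/InteractiveModelRunner.h"
#include "llvm/Analysis/MLModelRunner.h"
#include "llvm/Analysis/ReleaseModeModelRunner.h"
#include "llvm/Analysis/TensorSpec.h"
#include "llvm/IR/LLVMContext.h"
#include <memory>
#include <string>
#include <vector>

using namespace llvm;

static std::unique_ptr<MLModelRunner>
makeRunner(LLVMContext &Ctx, const std::vector<TensorSpec> &Inputs,
           const TensorSpec &DecisionSpec, StringRef DecisionName,
           const std::string &ChannelBase) {
  // No channel configured: use the model compiled into the binary (the
  // NoopSavedModelImpl stub when no TF AOT model was linked in).
  if (ChannelBase.empty())
    return std::make_unique<ReleaseModeModelRunner<NoopSavedModelImpl>>(
        Ctx, Inputs, DecisionName);
  // Channel configured: exchange tensors with an external agent, writing
  // features to <ChannelBase>.out and reading decisions back from
  // <ChannelBase>.in, per the option descriptions in the hunks above.
  return std::make_unique<InteractiveModelRunner>(
      Ctx, Inputs, DecisionSpec, ChannelBase + ".out", ChannelBase + ".in");
}

Note also that both createReleaseMode*Advisor() factories now return nullptr when neither an embedded model nor a channel is configured, which presumably lets callers fall back to the default heuristic advisor.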
diff --git a/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp index 320a184bdcc5..422781593a9c 100644 --- a/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp @@ -14,6 +14,7 @@ #include "RegAllocGreedy.h" #include "RegAllocPriorityAdvisor.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/InteractiveModelRunner.h" #include "llvm/Analysis/MLModelRunner.h" #include "llvm/Analysis/ReleaseModeModelRunner.h" #include "llvm/Analysis/TensorSpec.h" @@ -40,6 +41,16 @@ using namespace llvm; +static cl::opt<std::string> InteractiveChannelBaseName( + "regalloc-priority-interactive-channel-base", cl::Hidden, + cl::desc( + "Base file path for the interactive mode. The incoming filename should " + "have the name <regalloc-priority-interactive-channel-base>.in, while " + "the outgoing name should be " + "<regalloc-priority-interactive-channel-base>.out")); + +using CompiledModelType = NoopSavedModelImpl; + // Options that only make sense in development mode #ifdef LLVM_HAVE_TFLITE #include "RegAllocScore.h" @@ -65,6 +76,9 @@ static const std::vector<int64_t> PerLiveRangeShape{1}; M(float, weight, PerLiveRangeShape, "weight") #define DecisionName "priority" +static const TensorSpec DecisionSpec = + TensorSpec::createSpec<float>(DecisionName, {1}); + // Named features index. enum FeatureIDs { @@ -125,13 +139,20 @@ private: std::unique_ptr<RegAllocPriorityAdvisor> getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override { - if (!Runner) - Runner = std::make_unique<ReleaseModeModelRunner<NoopSavedModelImpl>>( - MF.getFunction().getContext(), InputFeatures, DecisionName); + if (!Runner) { + if (InteractiveChannelBaseName.empty()) + Runner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>( + MF.getFunction().getContext(), InputFeatures, DecisionName); + else + Runner = std::make_unique<InteractiveModelRunner>( + MF.getFunction().getContext(), InputFeatures, DecisionSpec, + InteractiveChannelBaseName + ".out", + InteractiveChannelBaseName + ".in"); + } return std::make_unique<MLPriorityAdvisor>( MF, RA, &getAnalysis<SlotIndexes>(), Runner.get()); } - std::unique_ptr<ReleaseModeModelRunner<NoopSavedModelImpl>> Runner; + std::unique_ptr<MLModelRunner> Runner; }; // =================================== @@ -140,9 +161,6 @@ private: // // Features we log #ifdef LLVM_HAVE_TFLITE - -static const TensorSpec Output = - TensorSpec::createSpec<float>(DecisionName, {1}); static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1}); #define _DECL_TRAIN_FEATURES(type, name, shape, _) \ @@ -179,7 +197,7 @@ public: void logRewardIfNeeded(const MachineFunction &MF, llvm::function_ref<float()> GetReward) override { - if (!Log) + if (!Log || !Log->hasAnyObservationForContext(MF.getName())) return; // The function pass manager would run all the function passes for a // function, so we assume the last context belongs to this function. If @@ -231,7 +249,7 @@ private: // We always log the output; in particular, if we're not evaluating, we // don't have an output spec json file. That's why we handle the // 'normal' output separately. 
- LFS.push_back(Output); + LFS.push_back(DecisionSpec); Log = std::make_unique<Logger>(std::move(OS), LFS, Reward, /*IncludeReward*/ true); @@ -258,7 +276,10 @@ private: } // namespace llvm RegAllocPriorityAdvisorAnalysis *llvm::createReleaseModePriorityAdvisor() { - return new ReleaseModePriorityAdvisorAnalysis(); + return llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() || + !InteractiveChannelBaseName.empty() + ? new ReleaseModePriorityAdvisorAnalysis() + : nullptr; } MLPriorityAdvisor::MLPriorityAdvisor(const MachineFunction &MF, @@ -268,6 +289,7 @@ MLPriorityAdvisor::MLPriorityAdvisor(const MachineFunction &MF, : RegAllocPriorityAdvisor(MF, RA, Indexes), DefaultAdvisor(MF, RA, Indexes), Runner(std::move(Runner)) { assert(this->Runner); + Runner->switchContext(MF.getName()); } float MLPriorityAdvisor::getPriorityImpl(const LiveInterval &LI) const { diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 5ef377f2a1c0..231544494c32 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -12,12 +12,14 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" @@ -664,6 +666,15 @@ void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) { getParent()->splice(++NewBefore->getIterator(), getIterator()); } +static int findJumpTableIndex(const MachineBasicBlock &MBB) { + MachineBasicBlock::const_iterator TerminatorI = MBB.getFirstTerminator(); + if (TerminatorI == MBB.end()) + return -1; + const MachineInstr &Terminator = *TerminatorI; + const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); + return TII->getJumpTableIndex(Terminator); +} + void MachineBasicBlock::updateTerminator( MachineBasicBlock *PreviousLayoutSuccessor) { LLVM_DEBUG(dbgs() << "Updating terminators on " << printMBBReference(*this) @@ -975,8 +986,8 @@ MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) { // If there is some explicit branch to the fallthrough block, it can obviously // reach, even though the branch should get folded to fall through implicitly. - if (!JumpToFallThrough && (MachineFunction::iterator(TBB) == Fallthrough || - MachineFunction::iterator(FBB) == Fallthrough)) + if (JumpToFallThrough && (MachineFunction::iterator(TBB) == Fallthrough || + MachineFunction::iterator(FBB) == Fallthrough)) return &*Fallthrough; // If it's an unconditional branch to some block not the fall through, it @@ -1033,6 +1044,50 @@ MachineBasicBlock *MachineBasicBlock::splitAt(MachineInstr &MI, return SplitBB; } +// Returns `true` if there are possibly other users of the jump table at +// `JumpTableIndex` except for the ones in `IgnoreMBB`. 
+static bool jumpTableHasOtherUses(const MachineFunction &MF, + const MachineBasicBlock &IgnoreMBB, + int JumpTableIndex) { + assert(JumpTableIndex >= 0 && "need valid index"); + const MachineJumpTableInfo &MJTI = *MF.getJumpTableInfo(); + const MachineJumpTableEntry &MJTE = MJTI.getJumpTables()[JumpTableIndex]; + // Take any basic block from the table; every user of the jump table must + // show up in the predecessor list. + const MachineBasicBlock *MBB = nullptr; + for (MachineBasicBlock *B : MJTE.MBBs) { + if (B != nullptr) { + MBB = B; + break; + } + } + if (MBB == nullptr) + return true; // can't rule out other users if there isn't any block. + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + SmallVector<MachineOperand, 4> Cond; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (Pred == &IgnoreMBB) + continue; + MachineBasicBlock *DummyT = nullptr; + MachineBasicBlock *DummyF = nullptr; + Cond.clear(); + if (!TII.analyzeBranch(*Pred, DummyT, DummyF, Cond, + /*AllowModify=*/false)) { + // analyzable direct jump + continue; + } + int PredJTI = findJumpTableIndex(*Pred); + if (PredJTI >= 0) { + if (PredJTI == JumpTableIndex) + return true; + continue; + } + // Be conservative for unanalyzable jumps. + return true; + } + return false; +} + MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( MachineBasicBlock *Succ, Pass &P, std::vector<SparseBitVector<>> *LiveInSets) { @@ -1044,6 +1099,16 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( DebugLoc DL; // FIXME: this is nowhere MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); + + // Is there an indirect jump with jump table? + bool ChangedIndirectJump = false; + int JTI = findJumpTableIndex(*this); + if (JTI >= 0) { + MachineJumpTableInfo &MJTI = *MF->getJumpTableInfo(); + MJTI.ReplaceMBBInJumpTable(JTI, Succ, NMBB); + ChangedIndirectJump = true; + } + MF->insert(std::next(MachineFunction::iterator(this)), NMBB); LLVM_DEBUG(dbgs() << "Splitting critical edge: " << printMBBReference(*this) << " -- " << printMBBReference(*NMBB) << " -- " @@ -1066,9 +1131,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( if (LV) for (MachineInstr &MI : llvm::make_range(getFirstInstrTerminator(), instr_end())) { - for (MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse() || !MO.isKill() || - MO.isUndef()) + for (MachineOperand &MO : MI.all_uses()) { + if (MO.getReg() == 0 || !MO.isKill() || MO.isUndef()) continue; Register Reg = MO.getReg(); if (Reg.isPhysical() || LV->getVarInfo(Reg).removeKill(MI)) { @@ -1109,7 +1173,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // as the fallthrough successor if (Succ == PrevFallthrough) PrevFallthrough = NMBB; - updateTerminator(PrevFallthrough); + + if (!ChangedIndirectJump) + updateTerminator(PrevFallthrough); if (Indexes) { SmallVector<MachineInstr*, 4> NewTerminators; @@ -1284,8 +1350,13 @@ bool MachineBasicBlock::canSplitCriticalEdge( if (MF->getTarget().requiresStructuredCFG()) return false; + // Do we have an Indirect jump with a jumptable that we can rewrite? + int JTI = findJumpTableIndex(*this); + if (JTI >= 0 && !jumpTableHasOtherUses(*MF, *this, JTI)) + return true; + // We may need to update this's terminator, but we can't do that if - // analyzeBranch fails. If this uses a jump table, we won't touch it. + // analyzeBranch fails. 
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; @@ -1391,7 +1462,7 @@ void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old, } } -/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE +/// Find the next valid DebugLoc starting at MBBI, skipping any debug /// instructions. Return UnknownLoc if there is none. DebugLoc MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { @@ -1403,6 +1474,8 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { } DebugLoc MachineBasicBlock::rfindDebugLoc(reverse_instr_iterator MBBI) { + if (MBBI == instr_rend()) + return findDebugLoc(instr_begin()); // Skip debug declarations, we don't want a DebugLoc from them. MBBI = skipDebugInstructionsBackward(MBBI, instr_rbegin()); if (!MBBI->isDebugInstr()) @@ -1410,13 +1483,15 @@ DebugLoc MachineBasicBlock::rfindDebugLoc(reverse_instr_iterator MBBI) { return {}; } -/// Find the previous valid DebugLoc preceding MBBI, skipping and DBG_VALUE +/// Find the previous valid DebugLoc preceding MBBI, skipping any debug /// instructions. Return UnknownLoc if there is none. DebugLoc MachineBasicBlock::findPrevDebugLoc(instr_iterator MBBI) { - if (MBBI == instr_begin()) return {}; + if (MBBI == instr_begin()) + return {}; // Skip debug instructions, we don't want a DebugLoc from them. MBBI = prev_nodbg(MBBI, instr_begin()); - if (!MBBI->isDebugInstr()) return MBBI->getDebugLoc(); + if (!MBBI->isDebugInstr()) + return MBBI->getDebugLoc(); return {}; } diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 7bbc347a8cf8..912e9ec993e3 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -213,10 +213,9 @@ static cl::opt<bool> RenumberBlocksBeforeView( "into a dot graph. Only used when a function is being printed."), cl::init(false), cl::Hidden); +namespace llvm { extern cl::opt<bool> EnableExtTspBlockPlacement; extern cl::opt<bool> ApplyExtTspWithoutProfile; - -namespace llvm { extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; @@ -354,15 +353,15 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Pair struct containing basic block and taildup profitability struct BlockAndTailDupResult { - MachineBasicBlock *BB; + MachineBasicBlock *BB = nullptr; bool ShouldTailDup; }; /// Triple struct containing edge weight and the edge. struct WeightedEdge { BlockFrequency Weight; - MachineBasicBlock *Src; - MachineBasicBlock *Dest; + MachineBasicBlock *Src = nullptr; + MachineBasicBlock *Dest = nullptr; }; /// work lists of blocks that are ready to be laid out @@ -373,32 +372,32 @@ class MachineBlockPlacement : public MachineFunctionPass { DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges; /// Machine Function - MachineFunction *F; + MachineFunction *F = nullptr; /// A handle to the branch probability pass. - const MachineBranchProbabilityInfo *MBPI; + const MachineBranchProbabilityInfo *MBPI = nullptr; /// A handle to the function-wide block frequency pass. std::unique_ptr<MBFIWrapper> MBFI; /// A handle to the loop info. - MachineLoopInfo *MLI; + MachineLoopInfo *MLI = nullptr; /// Preferred loop exit. /// Member variable for convenience. It may be removed by duplication deep /// in the call stack. 
- MachineBasicBlock *PreferredLoopExit; + MachineBasicBlock *PreferredLoopExit = nullptr; /// A handle to the target's instruction info. - const TargetInstrInfo *TII; + const TargetInstrInfo *TII = nullptr; /// A handle to the target's lowering info. - const TargetLoweringBase *TLI; + const TargetLoweringBase *TLI = nullptr; /// A handle to the post dominator tree. - MachinePostDominatorTree *MPDT; + MachinePostDominatorTree *MPDT = nullptr; - ProfileSummaryInfo *PSI; + ProfileSummaryInfo *PSI = nullptr; /// Duplicator used to duplicate tails during placement. /// @@ -412,7 +411,7 @@ class MachineBlockPlacement : public MachineFunctionPass { /// True: use block profile count to compute tail duplication cost. /// False: use block frequency to compute tail duplication cost. - bool UseProfileCount; + bool UseProfileCount = false; /// Allocator and owner of BlockChain structures. /// @@ -1160,7 +1159,7 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( // tail-duplicated into. // Skip any blocks that are already placed or not in this loop. if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) - || BlockToChain[Pred] == &Chain) + || (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) continue; if (!TailDup.canTailDuplicate(Succ, Pred)) { if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) @@ -2018,7 +2017,7 @@ MachineBlockPlacement::FallThroughGains( for (MachineBasicBlock *Succ : BestPred->successors()) { if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ)) continue; - if (ComputedEdges.find(Succ) != ComputedEdges.end()) + if (ComputedEdges.contains(Succ)) continue; BlockChain *SuccChain = BlockToChain[Succ]; if ((SuccChain && (Succ != *SuccChain->begin())) || diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index cd8644029530..f879c5fcf20c 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -68,12 +68,12 @@ static cl::opt<int> namespace { class MachineCSE : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - AliasAnalysis *AA; - MachineDominatorTree *DT; - MachineRegisterInfo *MRI; - MachineBlockFrequencyInfo *MBFI; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + AliasAnalysis *AA = nullptr; + MachineDominatorTree *DT = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; public: static char ID; // Pass identification @@ -175,9 +175,7 @@ INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI, MachineBasicBlock *MBB) { bool Changed = false; - for (MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isUse()) - continue; + for (MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -291,9 +289,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, PhysDefVector &PhysDefs, bool &PhysUseDef) const { // First, add all uses to PhysRefs. - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || MO.isDef()) - continue; + for (const MachineOperand &MO : MI->all_uses()) { Register Reg = MO.getReg(); if (!Reg) continue; @@ -483,8 +479,8 @@ bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg, // Heuristics #2: If the expression doesn't use a vr and the only use // of the redundant computation are copies, do not cse.
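// The all_uses() rewrites above are one instance of an idiom applied all
// through this import: MachineInstr::all_uses() and all_defs() yield only
// register operands that are uses or defs, so the isReg()/isUse()/isDef()
// guards become implicit. A minimal sketch of the equivalence, with use(MO)
// as a placeholder:
//
//   for (MachineOperand &MO : MI.operands()) {   // old form
//     if (!MO.isReg() || !MO.isUse())
//       continue;
//     use(MO);
//   }
//
//   for (MachineOperand &MO : MI.all_uses())     // new form
//     use(MO);
//
// Any further predicates (getReg() != 0, isKill(), isUndef(), ...) still
// need explicit checks, as in the SplitCriticalEdge hunk earlier.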
bool HasVRegUse = false; - for (const MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) { + for (const MachineOperand &MO : MI->all_uses()) { + if (MO.getReg().isVirtual()) { HasVRegUse = true; break; } diff --git a/llvm/lib/CodeGen/MachineCheckDebugify.cpp b/llvm/lib/CodeGen/MachineCheckDebugify.cpp index 1e5b8dd0bbb0..874f726d2947 100644 --- a/llvm/lib/CodeGen/MachineCheckDebugify.cpp +++ b/llvm/lib/CodeGen/MachineCheckDebugify.cpp @@ -11,6 +11,7 @@ /// DILocalVariable which mir-debugifiy generated before. //===----------------------------------------------------------------------===// +#include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 974d570ece51..c65937935ed8 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -63,22 +63,22 @@ static cl::opt<bool> VerifyPatternOrder( namespace { class MachineCombiner : public MachineFunctionPass { - const TargetSubtargetInfo *STI; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; + const TargetSubtargetInfo *STI = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; MCSchedModel SchedModel; - MachineRegisterInfo *MRI; - MachineLoopInfo *MLI; // Current MachineLoopInfo - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - MachineBlockFrequencyInfo *MBFI; - ProfileSummaryInfo *PSI; + MachineRegisterInfo *MRI = nullptr; + MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo + MachineTraceMetrics *Traces = nullptr; + MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + ProfileSummaryInfo *PSI = nullptr; RegisterClassInfo RegClassInfo; TargetSchedModel TSchedModel; /// True if optimizing for code size. - bool OptSize; + bool OptSize = false; public: static char ID; @@ -95,7 +95,8 @@ private: bool isTransientMI(const MachineInstr *MI); unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - MachineTraceMetrics::Trace BlockTrace); + MachineTraceMetrics::Trace BlockTrace, + const MachineBasicBlock &MBB); unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot, MachineTraceMetrics::Trace BlockTrace); bool @@ -148,7 +149,8 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { +MachineInstr * +MachineCombiner::getOperandDef(const MachineOperand &MO) { MachineInstr *DefInstr = nullptr; // We need a virtual register definition. if (MO.isReg() && MO.getReg().isVirtual()) @@ -207,18 +209,17 @@ bool MachineCombiner::isTransientMI(const MachineInstr *MI) { unsigned MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, - MachineTraceMetrics::Trace BlockTrace) { + MachineTraceMetrics::Trace BlockTrace, + const MachineBasicBlock &MBB) { SmallVector<unsigned, 16> InstrDepth; // For each instruction in the new sequence compute the depth based on the // operands. Use the trace information when possible. 
For new operands which // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth for (auto *InstrPtr : InsInstrs) { // for each Use unsigned IDepth = 0; - for (const MachineOperand &MO : InstrPtr->operands()) { + for (const MachineOperand &MO : InstrPtr->all_uses()) { // Check for virtual register operand. - if (!(MO.isReg() && MO.getReg().isVirtual())) - continue; - if (!MO.isUse()) + if (!MO.getReg().isVirtual()) continue; unsigned DepthOp = 0; unsigned LatencyOp = 0; @@ -237,7 +238,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs, InstrPtr, UseIdx); } else { MachineInstr *DefInstr = getOperandDef(MO); - if (DefInstr) { + if (DefInstr && (TII->getMachineCombinerTraceStrategy() != + MachineTraceStrategy::TS_Local || + DefInstr->getParent() == &MBB)) { DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth; if (!isTransientMI(DefInstr)) LatencyOp = TSchedModel.computeOperandLatency( @@ -267,11 +270,9 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot, // Check each definition in NewRoot and compute the latency unsigned NewRootLatency = 0; - for (const MachineOperand &MO : NewRoot->operands()) { + for (const MachineOperand &MO : NewRoot->all_defs()) { // Check for virtual register operand. - if (!(MO.isReg() && MO.getReg().isVirtual())) - continue; - if (!MO.isDef()) + if (!MO.getReg().isVirtual()) continue; // Get the first instruction that uses MO MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); @@ -374,7 +375,8 @@ bool MachineCombiner::improvesCriticalPathLen( MachineCombinerPattern Pattern, bool SlackIsAccurate) { // Get depth and latency of NewRoot and Root. - unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); + unsigned NewRootDepth = + getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB); unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth; LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: " @@ -399,8 +401,13 @@ bool MachineCombiner::improvesCriticalPathLen( // Account for the latency of the inserted and deleted instructions by unsigned NewRootLatency, RootLatency; - std::tie(NewRootLatency, RootLatency) = - getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace); + if (TII->accumulateInstrSeqToRootLatency(*Root)) { + std::tie(NewRootLatency, RootLatency) = + getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace); + } else { + NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back()); + RootLatency = TSchedModel.computeInstrLatency(Root); + } unsigned RootSlack = BlockTrace.getInstrSlack(*Root); unsigned NewCycleCount = NewRootDepth + NewRootLatency; @@ -483,20 +490,19 @@ bool MachineCombiner::preservesResourceLen( /// \param MI current machine instruction /// \param InsInstrs new instructions to insert in \p MBB /// \param DelInstrs instruction to delete from \p MBB -/// \param MinInstr is a pointer to the machine trace information +/// \param TraceEnsemble is a pointer to the machine trace information /// \param RegUnits set of live registers, needed to compute instruction depths /// \param TII is target instruction info, used to call target hook /// \param Pattern is used to call target hook finalizeInsInstrs /// \param IncrementalUpdate if true, compute instruction depths incrementally, /// otherwise invalidate the trace -static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, - SmallVector<MachineInstr *, 16> InsInstrs, - SmallVector<MachineInstr *, 16> DelInstrs, 
- MachineTraceMetrics::Ensemble *MinInstr, - SparseSet<LiveRegUnit> &RegUnits, - const TargetInstrInfo *TII, - MachineCombinerPattern Pattern, - bool IncrementalUpdate) { +static void insertDeleteInstructions( + MachineBasicBlock *MBB, MachineInstr &MI, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + MachineTraceMetrics::Ensemble *TraceEnsemble, + SparseSet<LiveRegUnit> &RegUnits, const TargetInstrInfo *TII, + MachineCombinerPattern Pattern, bool IncrementalUpdate) { // If we want to fix up some placeholder for some target, do it now. // We need this because in genAlternativeCodeSequence, we have not decided the // better pattern InsInstrs or DelInstrs, so we don't want generate some @@ -522,9 +528,9 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, if (IncrementalUpdate) for (auto *InstrPtr : InsInstrs) - MinInstr->updateDepth(MBB, *InstrPtr, RegUnits); + TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits); else - MinInstr->invalidate(MBB); + TraceEnsemble->invalidate(MBB); NumInstCombined++; } @@ -550,7 +556,7 @@ void MachineCombiner::verifyPatternOrder( unsigned NewRootLatency, RootLatency; std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences( - Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB)); + Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB)); long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency); assert(CurrentLatencyDiff <= PrevLatencyDiff && "Current pattern is better than previous pattern."); @@ -574,8 +580,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { decltype(BlockIter) LastUpdate; // Check if the block is in a loop. const MachineLoop *ML = MLI->getLoopFor(MBB); - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + if (!TraceEnsemble) + TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy()); SparseSet<LiveRegUnit> RegUnits; RegUnits.setUniverse(TRI->getNumRegUnits()); @@ -647,7 +653,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (IncrementalUpdate && LastUpdate != BlockIter) { // Update depths since the last incremental update. - MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits); + TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits); LastUpdate = BlockIter; } @@ -661,7 +667,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { } if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) { // Replace DelInstrs with InsInstrs. - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, RegUnits, TII, P, IncrementalUpdate); Changed |= true; @@ -674,7 +680,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (ML && TII->isThroughputPattern(P)) { LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n"); - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, RegUnits, TII, P, IncrementalUpdate); // Eagerly stop after the first pattern fires. 
Changed = true; @@ -683,7 +689,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize (" << InsInstrs.size() << " < " << DelInstrs.size() << ")\n"); - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, RegUnits, TII, P, IncrementalUpdate); // Eagerly stop after the first pattern fires. Changed = true; @@ -694,7 +700,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // instruction depths incrementally. // NOTE: Only the instruction depths up to MI are accurate. All other // trace information is not updated. - MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB); Traces->verifyAnalysis(); if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, InstrIdxForVirtReg, P, @@ -706,7 +712,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { LastUpdate = BlockIter; } - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble, RegUnits, TII, P, IncrementalUpdate); // Eagerly stop after the first pattern fires. @@ -741,7 +747,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { MBFI = (PSI && PSI->hasProfileSummary()) ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() : nullptr; - MinInstr = nullptr; + TraceEnsemble = nullptr; OptSize = MF.getFunction().hasOptSize(); RegClassInfo.runOnMachineFunction(MF); diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 871824553aa4..3453e6c0b8be 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -80,11 +80,15 @@ using namespace llvm; STATISTIC(NumDeletes, "Number of dead copies deleted"); STATISTIC(NumCopyForwards, "Number of copy uses forwarded"); STATISTIC(NumCopyBackwardPropagated, "Number of copy defs backward propagated"); +STATISTIC(SpillageChainsLength, "Length of spillage chains"); +STATISTIC(NumSpillageChains, "Number of spillage chains"); DEBUG_COUNTER(FwdCounter, "machine-cp-fwd", "Controls which register COPYs are forwarded"); static cl::opt<bool> MCPUseCopyInstr("mcp-use-is-copy-instr", cl::init(false), cl::Hidden); +static cl::opt<cl::boolOrDefault> + EnableSpillageCopyElimination("enable-spill-copy-elim", cl::Hidden); namespace { @@ -103,7 +107,7 @@ static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI, class CopyTracker { struct CopyInfo { - MachineInstr *MI; + MachineInstr *MI, *LastSeenUseInCopy; SmallVector<MCRegister, 4> DefRegs; bool Avail; }; @@ -117,8 +121,8 @@ public: const TargetRegisterInfo &TRI) { for (MCRegister Reg : Regs) { // Source of copy is no longer available for propagation. - for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { - auto CI = Copies.find(*RUI); + for (MCRegUnit Unit : TRI.regunits(Reg)) { + auto CI = Copies.find(Unit); if (CI != Copies.end()) CI->second.Avail = false; } @@ -133,8 +137,8 @@ public: // and invalidate all of them. 
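// A second mechanical idiom recurring in this import: explicit register-unit
// and sub-register iterators become range-based loops over the corresponding
// accessors. A minimal sketch, with visit() as a placeholder:
//
//   for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)   // old
//     visit(*RUI);
//   for (MCRegUnit Unit : TRI.regunits(Reg))                       // new
//     visit(Unit);
//
//   for (MCSubRegIterator S(Reg, TRI, /*IncludeSelf=*/true);       // old
//        S.isValid(); ++S)
//     visit(*S);
//   for (MCPhysReg Sub : TRI->subregs_inclusive(Reg))              // new
//     visit(Sub);
//
// Behavior is unchanged; only the iteration style differs.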
SmallSet<MCRegister, 8> RegsToInvalidate; RegsToInvalidate.insert(Reg); - for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { - auto I = Copies.find(*RUI); + for (MCRegUnit Unit : TRI.regunits(Reg)) { + auto I = Copies.find(Unit); if (I != Copies.end()) { if (MachineInstr *MI = I->second.MI) { std::optional<DestSourcePair> CopyOperands = @@ -150,15 +154,15 @@ public: } } for (MCRegister InvalidReg : RegsToInvalidate) - for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) - Copies.erase(*RUI); + for (MCRegUnit Unit : TRI.regunits(InvalidReg)) + Copies.erase(Unit); } /// Clobber a single register, removing it from the tracker's copy maps. void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI, const TargetInstrInfo &TII, bool UseCopyInstr) { - for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { - auto I = Copies.find(*RUI); + for (MCRegUnit Unit : TRI.regunits(Reg)) { + auto I = Copies.find(Unit); if (I != Copies.end()) { // When we clobber the source of a copy, we need to clobber everything // it defined. @@ -188,16 +192,17 @@ public: MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); // Remember Def is defined by the copy. - for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI) - Copies[*RUI] = {MI, {}, true}; + for (MCRegUnit Unit : TRI.regunits(Def)) + Copies[Unit] = {MI, nullptr, {}, true}; // Remember source that's copied to Def. Once it's clobbered, then // it's no longer available for copy propagation. - for (MCRegUnitIterator RUI(Src, &TRI); RUI.isValid(); ++RUI) { - auto I = Copies.insert({*RUI, {nullptr, {}, false}}); + for (MCRegUnit Unit : TRI.regunits(Src)) { + auto I = Copies.insert({Unit, {nullptr, nullptr, {}, false}}); auto &Copy = I.first->second; if (!is_contained(Copy.DefRegs, Def)) Copy.DefRegs.push_back(Def); + Copy.LastSeenUseInCopy = MI; } } @@ -223,16 +228,16 @@ public: return nullptr; if (CI->second.DefRegs.size() != 1) return nullptr; - MCRegUnitIterator RUI(CI->second.DefRegs[0], &TRI); - return findCopyForUnit(*RUI, TRI, true); + MCRegUnit RU = *TRI.regunits(CI->second.DefRegs[0]).begin(); + return findCopyForUnit(RU, TRI, true); } MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg, const TargetRegisterInfo &TRI, const TargetInstrInfo &TII, bool UseCopyInstr) { - MCRegUnitIterator RUI(Reg, &TRI); - MachineInstr *AvailCopy = findCopyDefViaUnit(*RUI, TRI); + MCRegUnit RU = *TRI.regunits(Reg).begin(); + MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI); if (!AvailCopy) return nullptr; @@ -260,9 +265,9 @@ public: const TargetInstrInfo &TII, bool UseCopyInstr) { // We check the first RegUnit here, since we'll only be interested in the // copy if it copies the entire register anyway. - MCRegUnitIterator RUI(Reg, &TRI); + MCRegUnit RU = *TRI.regunits(Reg).begin(); MachineInstr *AvailCopy = - findCopyForUnit(*RUI, TRI, /*MustBeAvailable=*/true); + findCopyForUnit(RU, TRI, /*MustBeAvailable=*/true); if (!AvailCopy) return nullptr; @@ -286,15 +291,57 @@ public: return AvailCopy; } + // Find last COPY that defines Reg before Current MachineInstr. 
+ MachineInstr *findLastSeenDefInCopy(const MachineInstr &Current, + MCRegister Reg, + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII, + bool UseCopyInstr) { + MCRegUnit RU = *TRI.regunits(Reg).begin(); + auto CI = Copies.find(RU); + if (CI == Copies.end() || !CI->second.Avail) + return nullptr; + + MachineInstr *DefCopy = CI->second.MI; + std::optional<DestSourcePair> CopyOperands = + isCopyInstr(*DefCopy, TII, UseCopyInstr); + Register Def = CopyOperands->Destination->getReg(); + if (!TRI.isSubRegisterEq(Def, Reg)) + return nullptr; + + for (const MachineInstr &MI : + make_range(static_cast<const MachineInstr *>(DefCopy)->getIterator(), + Current.getIterator())) + for (const MachineOperand &MO : MI.operands()) + if (MO.isRegMask()) + if (MO.clobbersPhysReg(Def)) { + LLVM_DEBUG(dbgs() << "MCP: Removed tracking of " + << printReg(Def, &TRI) << "\n"); + return nullptr; + } + + return DefCopy; + } + + // Find last COPY that uses Reg. + MachineInstr *findLastSeenUseInCopy(MCRegister Reg, + const TargetRegisterInfo &TRI) { + MCRegUnit RU = *TRI.regunits(Reg).begin(); + auto CI = Copies.find(RU); + if (CI == Copies.end()) + return nullptr; + return CI->second.LastSeenUseInCopy; + } + void clear() { Copies.clear(); } }; class MachineCopyPropagation : public MachineFunctionPass { - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - const MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + const MachineRegisterInfo *MRI = nullptr; // Return true if this is a copy instruction and false otherwise. bool UseCopyInstr; @@ -325,6 +372,7 @@ private: void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT); void ForwardCopyPropagateBlock(MachineBasicBlock &MBB); void BackwardCopyPropagateBlock(MachineBasicBlock &MBB); + void EliminateSpillageCopies(MachineBasicBlock &MBB); bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def); void forwardUses(MachineInstr &MI); void propagateDefs(MachineInstr &MI); @@ -345,7 +393,7 @@ private: CopyTracker Tracker; - bool Changed; + bool Changed = false; }; } // end anonymous namespace @@ -362,8 +410,8 @@ void MachineCopyPropagation::ReadRegister(MCRegister Reg, MachineInstr &Reader, // If 'Reg' is defined by a copy, the copy is no longer a candidate // for elimination. If a copy is "read" by a debug user, record the user // for propagation. - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) { - if (MachineInstr *Copy = Tracker.findCopyForUnit(*RUI, *TRI)) { + for (MCRegUnit Unit : TRI->regunits(Reg)) { + if (MachineInstr *Copy = Tracker.findCopyForUnit(Unit, *TRI)) { if (DT == RegularUse) { LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump()); MaybeDeadCopies.remove(Copy); @@ -433,6 +481,12 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, make_range(PrevCopy->getIterator(), Copy.getIterator())) MI.clearRegisterKills(CopyDef, TRI); + // Clear undef flag from remaining copy if needed. + if (!CopyOperands->Source->isUndef()) { + PrevCopy->getOperand(PrevCopyOperands->Source->getOperandNo()) + .setIsUndef(false); + } + Copy.eraseFromParent(); Changed = true; ++NumDeletes; @@ -595,12 +649,19 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { const MachineOperand &CopySrc = *CopyOperands->Source; Register CopySrcReg = CopySrc.getReg(); - // FIXME: Don't handle partial uses of wider COPYs yet. 
+ Register ForwardedReg = CopySrcReg; + // MI might use a sub-register of the Copy destination, in which case the + // forwarded register is the matching sub-register of the Copy source. if (MOUse.getReg() != CopyDstReg) { - LLVM_DEBUG( - dbgs() << "MCP: FIXME! Not forwarding COPY to sub-register use:\n " - << MI); - continue; + unsigned SubRegIdx = TRI->getSubRegIndex(CopyDstReg, MOUse.getReg()); + assert(SubRegIdx && + "MI source is not a sub-register of Copy destination"); + ForwardedReg = TRI->getSubReg(CopySrcReg, SubRegIdx); + if (!ForwardedReg) { + LLVM_DEBUG(dbgs() << "MCP: Copy source does not have sub-register " + << TRI->getSubRegIndexName(SubRegIdx) << '\n'); + continue; + } } // Don't forward COPYs of reserved regs unless they are constant. @@ -630,10 +691,11 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { } LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MOUse.getReg(), TRI) - << "\n with " << printReg(CopySrcReg, TRI) + << "\n with " << printReg(ForwardedReg, TRI) << "\n in " << MI << " from " << *Copy); - MOUse.setReg(CopySrcReg); + MOUse.setReg(ForwardedReg); + if (!CopySrc.isRenamable()) MOUse.setIsRenamable(false); MOUse.setIsUndef(CopySrc.isUndef()); @@ -844,16 +906,11 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { Tracker.clear(); } -static bool isBackwardPropagatableCopy(MachineInstr &MI, +static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands, const MachineRegisterInfo &MRI, - const TargetInstrInfo &TII, - bool UseCopyInstr) { - std::optional<DestSourcePair> CopyOperands = - isCopyInstr(MI, TII, UseCopyInstr); - assert(CopyOperands && "MI is expected to be a COPY"); - - Register Def = CopyOperands->Destination->getReg(); - Register Src = CopyOperands->Source->getReg(); + const TargetInstrInfo &TII) { + Register Def = CopyOperands.Destination->getReg(); + Register Src = CopyOperands.Source->getReg(); if (!Def || !Src) return false; @@ -861,7 +918,7 @@ static bool isBackwardPropagatableCopy(MachineInstr &MI, if (MRI.isReserved(Def) || MRI.isReserved(Src)) return false; - return CopyOperands->Source->isRenamable() && CopyOperands->Source->isKill(); + return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill(); } void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { @@ -936,14 +993,13 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Register SrcReg = CopyOperands->Source->getReg(); if (!TRI->regsOverlap(DefReg, SrcReg)) { - MCRegister Def = DefReg.asMCReg(); - MCRegister Src = SrcReg.asMCReg(); - // Unlike forward cp, we don't invoke propagateDefs here, // just let forward cp do COPY-to-COPY propagation. - if (isBackwardPropagatableCopy(MI, *MRI, *TII, UseCopyInstr)) { - Tracker.invalidateRegister(Src, *TRI, *TII, UseCopyInstr); - Tracker.invalidateRegister(Def, *TRI, *TII, UseCopyInstr); + if (isBackwardPropagatableCopy(*CopyOperands, *MRI, *TII)) { + Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII, + UseCopyInstr); + Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII, + UseCopyInstr); Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr); continue; } @@ -976,9 +1032,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( // Check if the register in the debug instruction is utilized // in a copy instruction, so we can update the debug info if the // register is changed. 
- for (MCRegUnitIterator RUI(MO.getReg().asMCReg(), TRI); RUI.isValid(); - ++RUI) { - if (auto *Copy = Tracker.findCopyDefViaUnit(*RUI, *TRI)) { + for (MCRegUnit Unit : TRI->regunits(MO.getReg().asMCReg())) { + if (auto *Copy = Tracker.findCopyDefViaUnit(Unit, *TRI)) { CopyDbgUsers[Copy].insert(&MI); } } @@ -1008,10 +1063,345 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Tracker.clear(); } +static void LLVM_ATTRIBUTE_UNUSED printSpillReloadChain( + DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &SpillChain, + DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &ReloadChain, + MachineInstr *Leader) { + auto &SC = SpillChain[Leader]; + auto &RC = ReloadChain[Leader]; + for (auto I = SC.rbegin(), E = SC.rend(); I != E; ++I) + (*I)->dump(); + for (MachineInstr *MI : RC) + MI->dump(); +} + +// Remove spill-reload-like copy chains. For example: +// r0 = COPY r1 +// r1 = COPY r2 +// r2 = COPY r3 +// r3 = COPY r4 +// <def-use r4> +// r4 = COPY r3 +// r3 = COPY r2 +// r2 = COPY r1 +// r1 = COPY r0 +// will be folded into +// r0 = COPY r1 +// r1 = COPY r4 +// <def-use r4> +// r4 = COPY r1 +// r1 = COPY r0 +// TODO: Currently we don't track usage of r0 outside the chain, so we +// conservatively keep its value as it was before the rewrite. +// +// The algorithm is trying to keep +// property#1: No Def of a spill COPY in the chain is used or defined until the +// paired reload COPY in the chain uses the Def. +// +// property#2: No Source of a COPY in the chain is used or defined until the +// next COPY in the chain defines the Source, except the innermost spill-reload +// pair. +// +// The algorithm is conducted by checking every COPY inside the MBB, assuming +// the COPY is a reload COPY, then searching backward for the COPY that defines +// the Src of the reload COPY. If such a pair is found, it either belongs to an +// existing chain or starts a new chain, depending on whether the last +// available COPY uses the Def of the reload COPY. +// Implementation note: we use CopyTracker::findLastSeenDefInCopy(Reg, ...) to +// find the last COPY that defines Reg; we use +// CopyTracker::findLastSeenUseInCopy(Reg, ...) to find the last COPY that uses +// Reg. When we encounter a non-COPY instruction, we check the registers in its +// operands. If such a Reg is defined by a COPY, we untrack that Reg via +// CopyTracker::clobberRegister(Reg, ...). +void MachineCopyPropagation::EliminateSpillageCopies(MachineBasicBlock &MBB) { + // ChainLeader maps an MI inside a spill-reload chain to its innermost reload COPY. + // Thus we can track if an MI belongs to an existing spill-reload chain. + DenseMap<MachineInstr *, MachineInstr *> ChainLeader; + // SpillChain maps the innermost reload COPY of a spill-reload chain to the + // sequence of COPYs that forms the spills of the chain. + // ReloadChain maps the innermost reload COPY of a spill-reload chain to the + // sequence of COPYs that forms the reloads of the chain. + DenseMap<MachineInstr *, SmallVector<MachineInstr *>> SpillChain, ReloadChain; + // If a COPY's Source has a use or def before the next COPY defines the + // Source, we put the COPY in this set to keep property#2.
+ DenseSet<const MachineInstr *> CopySourceInvalid; + + auto TryFoldSpillageCopies = + [&, this](const SmallVectorImpl<MachineInstr *> &SC, + const SmallVectorImpl<MachineInstr *> &RC) { + assert(SC.size() == RC.size() && "Spill-reload should be paired"); + + // We need at least 3 pairs of copies for the transformation to apply, + // because the first outermost pair cannot be removed since we don't + // recolor outside of the chain, and we need at least one temporary + // spill slot to shorten the chain. If we only have a chain of two + // pairs, we already have the shortest sequence this code can handle: + // the outermost pair for the temporary spill slot, and the pair that + // uses that temporary spill slot for the other end of the chain. + // TODO: We might be able to simplify to one spill-reload pair if we + // collect more information about the outermost COPY. + if (SC.size() <= 2) + return; + + // If property#2 is violated, we don't fold the chain. + for (const MachineInstr *Spill : make_range(SC.begin() + 1, SC.end())) + if (CopySourceInvalid.count(Spill)) + return; + + for (const MachineInstr *Reload : make_range(RC.begin(), RC.end() - 1)) + if (CopySourceInvalid.count(Reload)) + return; + + auto CheckCopyConstraint = [this](Register Def, Register Src) { + for (const TargetRegisterClass *RC : TRI->regclasses()) { + if (RC->contains(Def) && RC->contains(Src)) + return true; + } + return false; + }; + + auto UpdateReg = [](MachineInstr *MI, const MachineOperand *Old, + const MachineOperand *New) { + for (MachineOperand &MO : MI->operands()) { + if (&MO == Old) + MO.setReg(New->getReg()); + } + }; + + std::optional<DestSourcePair> InnerMostSpillCopy = + isCopyInstr(*SC[0], *TII, UseCopyInstr); + std::optional<DestSourcePair> OuterMostSpillCopy = + isCopyInstr(*SC.back(), *TII, UseCopyInstr); + std::optional<DestSourcePair> InnerMostReloadCopy = + isCopyInstr(*RC[0], *TII, UseCopyInstr); + std::optional<DestSourcePair> OuterMostReloadCopy = + isCopyInstr(*RC.back(), *TII, UseCopyInstr); + if (!CheckCopyConstraint(OuterMostSpillCopy->Source->getReg(), + InnerMostSpillCopy->Source->getReg()) || + !CheckCopyConstraint(InnerMostReloadCopy->Destination->getReg(), + OuterMostReloadCopy->Destination->getReg())) + return; + + SpillageChainsLength += SC.size() + RC.size(); + NumSpillageChains += 1; + UpdateReg(SC[0], InnerMostSpillCopy->Destination, + OuterMostSpillCopy->Source); + UpdateReg(RC[0], InnerMostReloadCopy->Source, + OuterMostReloadCopy->Destination); + + for (size_t I = 1; I < SC.size() - 1; ++I) { + SC[I]->eraseFromParent(); + RC[I]->eraseFromParent(); + NumDeletes += 2; + } + }; + + auto IsFoldableCopy = [this](const MachineInstr &MaybeCopy) { + if (MaybeCopy.getNumImplicitOperands() > 0) + return false; + std::optional<DestSourcePair> CopyOperands = + isCopyInstr(MaybeCopy, *TII, UseCopyInstr); + if (!CopyOperands) + return false; + Register Src = CopyOperands->Source->getReg(); + Register Def = CopyOperands->Destination->getReg(); + return Src && Def && !TRI->regsOverlap(Src, Def) && + CopyOperands->Source->isRenamable() && + CopyOperands->Destination->isRenamable(); + }; + + auto IsSpillReloadPair = [&, this](const MachineInstr &Spill, + const MachineInstr &Reload) { + if (!IsFoldableCopy(Spill) || !IsFoldableCopy(Reload)) + return false; + std::optional<DestSourcePair> SpillCopy = + isCopyInstr(Spill, *TII, UseCopyInstr); + std::optional<DestSourcePair> ReloadCopy = + isCopyInstr(Reload, *TII, UseCopyInstr); + if (!SpillCopy || !ReloadCopy) + return false; + return
SpillCopy->Source->getReg() == ReloadCopy->Destination->getReg() && + SpillCopy->Destination->getReg() == ReloadCopy->Source->getReg(); + }; + + auto IsChainedCopy = [&, this](const MachineInstr &Prev, + const MachineInstr &Current) { + if (!IsFoldableCopy(Prev) || !IsFoldableCopy(Current)) + return false; + std::optional<DestSourcePair> PrevCopy = + isCopyInstr(Prev, *TII, UseCopyInstr); + std::optional<DestSourcePair> CurrentCopy = + isCopyInstr(Current, *TII, UseCopyInstr); + if (!PrevCopy || !CurrentCopy) + return false; + return PrevCopy->Source->getReg() == CurrentCopy->Destination->getReg(); + }; + + for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + std::optional<DestSourcePair> CopyOperands = + isCopyInstr(MI, *TII, UseCopyInstr); + + // Update tracking information via non-copy instructions. + SmallSet<Register, 8> RegsToClobber; + if (!CopyOperands) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + MachineInstr *LastUseCopy = + Tracker.findLastSeenUseInCopy(Reg.asMCReg(), *TRI); + if (LastUseCopy) { + LLVM_DEBUG(dbgs() << "MCP: Copy source of\n"); + LLVM_DEBUG(LastUseCopy->dump()); + LLVM_DEBUG(dbgs() << "might be invalidated by\n"); + LLVM_DEBUG(MI.dump()); + CopySourceInvalid.insert(LastUseCopy); + } + // Note that Tracker.clobberRegister(Reg, ...) removes tracking of + // Reg; i.e., the COPY that defines Reg is removed from the mapping, + // and COPYs that use Reg are marked unavailable. + // We don't invoke CopyTracker::clobberRegister(Reg, ...) if Reg is not + // defined by a previous COPY, since we don't want to make COPYs that + // use Reg unavailable. + if (Tracker.findLastSeenDefInCopy(MI, Reg.asMCReg(), *TRI, *TII, + UseCopyInstr)) + // Thus we can keep property#1. + RegsToClobber.insert(Reg); + } + for (Register Reg : RegsToClobber) { + Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr); + LLVM_DEBUG(dbgs() << "MCP: Removed tracking of " << printReg(Reg, TRI) + << "\n"); + } + continue; + } + + Register Src = CopyOperands->Source->getReg(); + Register Def = CopyOperands->Destination->getReg(); + // Check if we can find a paired spill-reload copy. + LLVM_DEBUG(dbgs() << "MCP: Searching paired spill for reload: "); + LLVM_DEBUG(MI.dump()); + MachineInstr *MaybeSpill = + Tracker.findLastSeenDefInCopy(MI, Src.asMCReg(), *TRI, *TII, UseCopyInstr); + bool MaybeSpillIsChained = ChainLeader.count(MaybeSpill); + if (!MaybeSpillIsChained && MaybeSpill && + IsSpillReloadPair(*MaybeSpill, MI)) { + // Check if we already have an existing chain. Now we have a + // spill-reload pair. + // L2: r2 = COPY r3 + // L5: r3 = COPY r2 + // We look for a valid COPY before L5 which uses r3. + // There are several cases. + // Case #1: + // No COPY is found, which can mean r3 has a def or use between L2 and + // L5; we create a new chain for L2 and L5. + // Case #2: + // L2: r2 = COPY r3 + // L5: r3 = COPY r2 + // Such a COPY is found, and it is L2; we create a new chain for L2 + // and L5. + // Case #3: + // L2: r2 = COPY r3 + // L3: r1 = COPY r3 + // L5: r3 = COPY r2 + // We create a new chain for L2 and L5. + // Case #4: + // L2: r2 = COPY r3 + // L3: r1 = COPY r3 + // L4: r3 = COPY r1 + // L5: r3 = COPY r2 + // Such a COPY won't be found since L4 defines r3. We create a new chain + // for L2 and L5. + // Case #5: + // L2: r2 = COPY r3 + // L3: r3 = COPY r1 + // L4: r1 = COPY r3 + // L5: r3 = COPY r2 + // The COPY found is L4, which belongs to an existing chain; we add + // L2 and L5 to this chain.
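// To make the eventual fold concrete: for the r0..r4 example in the header
// comment of this function, the chains are discovered innermost-first, so
// (a sketch, not actual pass output):
//   SC = { r3 = COPY r4, r2 = COPY r3, r1 = COPY r2, r0 = COPY r1 }
//   RC = { r4 = COPY r3, r3 = COPY r2, r2 = COPY r1, r1 = COPY r0 }
// TryFoldSpillageCopies then rewrites the innermost pair against the
// outermost endpoints:
//   SC[0]: r3 = COPY r4  becomes  r1 = COPY r4
//   RC[0]: r4 = COPY r3  becomes  r4 = COPY r1
// and erases the middle copies SC[1..n-2] and RC[1..n-2], leaving the
// outermost pair to serve as the temporary spill slot of the shortened chain.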
+ LLVM_DEBUG(dbgs() << "MCP: Found spill: "); + LLVM_DEBUG(MaybeSpill->dump()); + MachineInstr *MaybePrevReload = + Tracker.findLastSeenUseInCopy(Def.asMCReg(), *TRI); + auto Leader = ChainLeader.find(MaybePrevReload); + MachineInstr *L = nullptr; + if (Leader == ChainLeader.end() || + (MaybePrevReload && !IsChainedCopy(*MaybePrevReload, MI))) { + L = &MI; + assert(!SpillChain.count(L) && + "SpillChain should not have contained newly found chain"); + } else { + assert(MaybePrevReload && + "Found a valid leader through nullptr should not happend"); + L = Leader->second; + assert(SpillChain[L].size() > 0 && + "Existing chain's length should be larger than zero"); + } + assert(!ChainLeader.count(&MI) && !ChainLeader.count(MaybeSpill) && + "Newly found paired spill-reload should not belong to any chain " + "at this point"); + ChainLeader.insert({MaybeSpill, L}); + ChainLeader.insert({&MI, L}); + SpillChain[L].push_back(MaybeSpill); + ReloadChain[L].push_back(&MI); + LLVM_DEBUG(dbgs() << "MCP: Chain " << L << " now is:\n"); + LLVM_DEBUG(printSpillReloadChain(SpillChain, ReloadChain, L)); + } else if (MaybeSpill && !MaybeSpillIsChained) { + // MaybeSpill is unable to pair with MI. That's to say adding MI makes + // the chain invalid. + // The COPY defines Src is no longer considered as a candidate of a + // valid chain. Since we expect the Def of a spill copy isn't used by + // any COPY instruction until a reload copy. For example: + // L1: r1 = COPY r2 + // L2: r3 = COPY r1 + // If we later have + // L1: r1 = COPY r2 + // L2: r3 = COPY r1 + // L3: r2 = COPY r1 + // L1 and L3 can't be a valid spill-reload pair. + // Thus we keep the property#1. + LLVM_DEBUG(dbgs() << "MCP: Not paired spill-reload:\n"); + LLVM_DEBUG(MaybeSpill->dump()); + LLVM_DEBUG(MI.dump()); + Tracker.clobberRegister(Src.asMCReg(), *TRI, *TII, UseCopyInstr); + LLVM_DEBUG(dbgs() << "MCP: Removed tracking of " << printReg(Src, TRI) + << "\n"); + } + Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr); + } + + for (auto I = SpillChain.begin(), E = SpillChain.end(); I != E; ++I) { + auto &SC = I->second; + assert(ReloadChain.count(I->first) && + "Reload chain of the same leader should exist"); + auto &RC = ReloadChain[I->first]; + TryFoldSpillageCopies(SC, RC); + } + + MaybeDeadCopies.clear(); + CopyDbgUsers.clear(); + Tracker.clear(); +} + bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; + bool isSpillageCopyElimEnabled = false; + switch (EnableSpillageCopyElimination) { + case cl::BOU_UNSET: + isSpillageCopyElimEnabled = + MF.getSubtarget().enableSpillageCopyElimination(); + break; + case cl::BOU_TRUE: + isSpillageCopyElimEnabled = true; + break; + case cl::BOU_FALSE: + isSpillageCopyElimEnabled = false; + break; + } + Changed = false; TRI = MF.getSubtarget().getRegisterInfo(); @@ -1019,6 +1409,8 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); for (MachineBasicBlock &MBB : MF) { + if (isSpillageCopyElimEnabled) + EliminateSpillageCopies(MBB); BackwardCopyPropagateBlock(MBB); ForwardCopyPropagateBlock(MBB); } diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index adf1b51a950d..c264e199cf47 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -116,8 +116,8 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Emit DBG_VALUEs for register definitions. 
SmallVector<MachineOperand *, 4> RegDefs; - for (MachineOperand &MO : MI.operands()) - if (MO.isReg() && MO.isDef() && MO.getReg()) + for (MachineOperand &MO : MI.all_defs()) + if (MO.getReg()) RegDefs.push_back(&MO); for (MachineOperand *MO : RegDefs) BuildMI(MBB, InsertBeforeIt, MI.getDebugLoc(), DbgValDesc, diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp index daf6a218165d..280d3a6a41ed 100644 --- a/llvm/lib/CodeGen/MachineFrameInfo.cpp +++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -128,8 +128,8 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const { // Saved CSRs are not pristine. for (const auto &I : getCalleeSavedInfo()) - for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S) - BV.reset(*S); + for (MCPhysReg S : TRI->subregs_inclusive(I.getReg())) + BV.reset(S); return BV; } diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 59e6647fa643..88939e96e07f 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -22,7 +22,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -45,6 +45,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" @@ -119,7 +120,7 @@ void setUnsafeStackSize(const Function &F, MachineFrameInfo &FrameInfo) { auto *MetadataName = "unsafe-stack-size"; if (auto &N = Existing->getOperand(0)) { - if (cast<MDString>(N.get())->getString() == MetadataName) { + if (N.equalsStr(MetadataName)) { if (auto &Op = Existing->getOperand(1)) { auto Val = mdconst::extract<ConstantInt>(Op)->getZExtValue(); FrameInfo.setUnsafeStackSize(Val); @@ -211,6 +212,14 @@ void MachineFunction::init() { Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); + // -fsanitize=function and -fsanitize=kcfi instrument indirect function calls + // to load a type hash before the function label. Ensure functions are aligned + // by a least 4 to avoid unaligned access, which is especially important for + // -mno-unaligned-access. + if (F.hasMetadata(LLVMContext::MD_func_sanitize) || + F.getMetadata(LLVMContext::MD_kcfi_type)) + Alignment = std::max(Alignment, Align(4)); + if (AlignAllFunctions) Alignment = Align(1ULL << AlignAllFunctions); @@ -427,8 +436,7 @@ void MachineFunction::deleteMachineInstr(MachineInstr *MI) { // be triggered during the implementation of support for the // call site info of a new architecture. If the assertion is triggered, // back trace will tell where to insert a call to updateCallSiteInfo(). - assert((!MI->isCandidateForCallSiteEntry() || - CallSitesInfo.find(MI) == CallSitesInfo.end()) && + assert((!MI->isCandidateForCallSiteEntry() || !CallSitesInfo.contains(MI)) && "Call site info was not updated!"); // Strip it for parts. The operand array and the MI object itself are // independently recyclable. @@ -1083,11 +1091,10 @@ auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI) if (State.first.isVirtual()) { // Virtual register def -- we can just look up where this happens. 
    MachineInstr *Inst = MRI.def_begin(State.first)->getParent();
-    for (auto &MO : Inst->operands()) {
-      if (!MO.isReg() || !MO.isDef() || MO.getReg() != State.first)
+    for (auto &MO : Inst->all_defs()) {
+      if (MO.getReg() != State.first)
         continue;
-      return ApplySubregisters(
-          {Inst->getDebugInstrNum(), Inst->getOperandNo(&MO)});
+      return ApplySubregisters({Inst->getDebugInstrNum(), MO.getOperandNo()});
     }
 
     llvm_unreachable("Vreg def with no corresponding operand?");
@@ -1102,14 +1109,13 @@ auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI)
   auto RMII = CurInst->getReverseIterator();
   auto PrevInstrs = make_range(RMII, CurInst->getParent()->instr_rend());
   for (auto &ToExamine : PrevInstrs) {
-    for (auto &MO : ToExamine.operands()) {
+    for (auto &MO : ToExamine.all_defs()) {
       // Test for operand that defines something aliasing RegToSeek.
-      if (!MO.isReg() || !MO.isDef() ||
-          !TRI.regsOverlap(RegToSeek, MO.getReg()))
+      if (!TRI.regsOverlap(RegToSeek, MO.getReg()))
         continue;
 
       return ApplySubregisters(
-          {ToExamine.getDebugInstrNum(), ToExamine.getOperandNo(&MO)});
+          {ToExamine.getDebugInstrNum(), MO.getOperandNo()});
     }
   }
 
@@ -1395,7 +1401,7 @@ MachineConstantPool::~MachineConstantPool() {
 }
 
 /// Test whether the given two constants can be allocated the same constant pool
-/// entry.
+/// entry referenced by \param A.
 static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
                                       const DataLayout &DL) {
   // Handle the trivial case quickly.
@@ -1415,6 +1421,8 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
   if (StoreSize != DL.getTypeStoreSize(B->getType()) || StoreSize > 128)
     return false;
 
+  bool ContainsUndefOrPoisonA = A->containsUndefOrPoisonElement();
+
   Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
 
   // Try constant folding a bitcast of both instructions to an integer. If we
@@ -1434,7 +1442,14 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
     B = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(B),
                                 IntTy, DL);
 
-  return A == B;
+  if (A != B)
+    return false;
+
+  // Constants only safely match if A doesn't contain undef/poison.
+  // As we'll be reusing A, it doesn't matter if B contains undef/poison.
+  // TODO: Handle cases where A and B have the same undef/poison elements.
+  // TODO: Merge A and B with mismatching undef/poison elements.
+  return !ContainsUndefOrPoisonA;
 }
 
 /// Create a new entry in the constant pool or return an existing one.
@@ -1490,6 +1505,17 @@ void MachineConstantPool::print(raw_ostream &OS) const {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Template specialization for MachineFunction implementation of
+// ProfileSummaryInfo::getEntryCount().
+//===----------------------------------------------------------------------===// +template <> +std::optional<Function::ProfileCount> +ProfileSummaryInfo::getEntryCount<llvm::MachineFunction>( + const llvm::MachineFunction *F) const { + return F->getFunction().getEntryCount(); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void MachineConstantPool::dump() const { print(dbgs()); } #endif diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 613c52900331..fbc071536d22 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -24,6 +24,9 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/EHUtils.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/CodeGen/BasicBlockSectionUtils.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -83,88 +86,44 @@ public: } // end anonymous namespace /// setDescendantEHBlocksCold - This splits all EH pads and blocks reachable -/// only by EH pad as cold. This will help mark EH pads statically cold instead -/// of relying on profile data. -static void -setDescendantEHBlocksCold(SmallVectorImpl<MachineBasicBlock *> &EHBlocks, - MachineFunction &MF) { - MachineBasicBlock *StartBlock = &MF.front(); - // A block can be unknown if its not reachable from anywhere - // EH if its only reachable from start blocks via some path through EH pads - // NonEH if it's reachable from Non EH blocks as well. - enum Status { Unknown = 0, EH = 1, NonEH = 2 }; - DenseSet<MachineBasicBlock *> WorkList; - DenseMap<MachineBasicBlock *, Status> Statuses; - - auto getStatus = [&](MachineBasicBlock *MBB) { - if (Statuses.find(MBB) != Statuses.end()) - return Statuses[MBB]; - else - return Unknown; - }; - - auto checkPredecessors = [&](MachineBasicBlock *MBB, Status Stat) { - for (auto *PredMBB : MBB->predecessors()) { - Status PredStatus = getStatus(PredMBB); - // If status of predecessor block has gone above current block - // we update current blocks status. - if (PredStatus > Stat) - Stat = PredStatus; - } - return Stat; - }; - - auto addSuccesors = [&](MachineBasicBlock *MBB) { - for (auto *SuccMBB : MBB->successors()) { - if (!SuccMBB->isEHPad()) - WorkList.insert(SuccMBB); - } - }; - - // Insert the successors of start block - // and landing pads successor. - Statuses[StartBlock] = NonEH; - addSuccesors(StartBlock); - for (auto *LP : EHBlocks) { - addSuccesors(LP); - Statuses[LP] = EH; - } - - // Worklist iterative algorithm. - while (!WorkList.empty()) { - auto *MBB = *WorkList.begin(); - WorkList.erase(MBB); - - Status OldStatus = getStatus(MBB); - - // Check on predecessors and check for - // Status update. - Status NewStatus = checkPredecessors(MBB, OldStatus); - - // Did the block status change? - bool changed = OldStatus != NewStatus; - if (changed) { - addSuccesors(MBB); - Statuses[MBB] = NewStatus; - } +/// only by EH pad as cold. This will help mark EH pads statically cold +/// instead of relying on profile data. 
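+/// For example (an illustrative, hypothetical CFG -- not taken from this
+/// patch): with entry %bb.0 branching to %bb.3, a landing pad %bb.1, and
+/// %bb.2 reachable only from %bb.1, the pads %bb.1 and %bb.2 end up in the
+/// cold section while %bb.0 and %bb.3 stay hot:
+///
+///   %bb.0 (entry) --> %bb.3             [hot]
+///     | (EH edge)
+///   %bb.1 (landing pad) --> %bb.2       [cold]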
+static void setDescendantEHBlocksCold(MachineFunction &MF) {
+  DenseSet<MachineBasicBlock *> EHBlocks;
+  computeEHOnlyBlocks(MF, EHBlocks);
+  for (auto Block : EHBlocks) {
+    Block->setSectionID(MBBSectionID::ColdSectionID);
+  }
 }
 
-  for (auto Entry : Statuses) {
-    if (Entry.second == EH)
-      Entry.first->setSectionID(MBBSectionID::ColdSectionID);
-  }
+static void finishAdjustingBasicBlocksAndLandingPads(MachineFunction &MF) {
+  auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
+    return X.getSectionID().Type < Y.getSectionID().Type;
+  };
+  llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
+  llvm::avoidZeroOffsetLandingPad(MF);
 }
 
 static bool isColdBlock(const MachineBasicBlock &MBB,
                         const MachineBlockFrequencyInfo *MBFI,
                         ProfileSummaryInfo *PSI) {
   std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
-  if (!Count)
-    return true;
-
-  if (PercentileCutoff > 0) {
-    return PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
+  // For instrumentation profiles and sample profiles, we use different ways
+  // to judge whether a block is cold and should be split.
+  if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) {
+    // If using an instrumentation profile, which is deemed "accurate", no
+    // count means cold.
+    if (!Count)
+      return true;
+    if (PercentileCutoff > 0)
+      return PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
+    // Fallthrough to end of function.
+  } else if (PSI->hasSampleProfile()) {
+    // For a sample profile, no count means "do not judge coldness".
+    if (!Count)
+      return false;
   }
+
   return (*Count < ColdCountThreshold);
 }
 
@@ -204,6 +163,17 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
   if (UseProfileData) {
     MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
     PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+    // If we don't have a good profile (a sample profile is not deemed a
+    // "good profile") and the function is not hot, return early, because
+    // when profile quality is not good we can only trust hot functions.
+    if (PSI->hasSampleProfile() && !PSI->isFunctionHotInCallGraph(&MF, *MBFI)) {
+      // Split all EH code and its descendants statically by default.
+      if (SplitAllEHCode)
+        setDescendantEHBlocksCold(MF);
+      finishAdjustingBasicBlocksAndLandingPads(MF);
+      return true;
+    }
   }
 
   SmallVector<MachineBasicBlock *, 2> LandingPads;
@@ -219,9 +189,10 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
 
   // Split all EH code and it's descendant statically by default.
   if (SplitAllEHCode)
-    setDescendantEHBlocksCold(LandingPads, MF);
+    setDescendantEHBlocksCold(MF);
   // We only split out eh pads if all of them are cold.
   else {
+    // Here we have UseProfileData == true.
bool HasHotLandingPads = false; for (const MachineBasicBlock *LP : LandingPads) { if (!isColdBlock(*LP, MBFI, PSI)) @@ -232,11 +203,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { LP->setSectionID(MBBSectionID::ColdSectionID); } } - auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) { - return X.getSectionID().Type < Y.getSectionID().Type; - }; - llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator); - llvm::avoidZeroOffsetLandingPad(MF); + + finishAdjustingBasicBlocksAndLandingPads(MF); return true; } diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 8e0777f8438a..a9309487a7a7 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -49,7 +51,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include <algorithm> @@ -95,7 +96,8 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// the MCInstrDesc. MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &TID, DebugLoc DL, bool NoImp) - : MCID(&TID), DbgLoc(std::move(DL)), DebugInstrNum(0) { + : MCID(&TID), NumOperands(0), Flags(0), AsmPrinterFlags(0), + DbgLoc(std::move(DL)), DebugInstrNum(0) { assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. @@ -113,8 +115,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &TID, /// Does not copy the number from debug instruction numbering, to preserve /// uniqueness. MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Info(MI.Info), DbgLoc(MI.getDebugLoc()), - DebugInstrNum(0) { + : MCID(&MI.getDesc()), NumOperands(0), Flags(0), AsmPrinterFlags(0), + Info(MI.Info), DbgLoc(MI.getDebugLoc()), DebugInstrNum(0) { assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); @@ -149,6 +151,12 @@ MachineRegisterInfo *MachineInstr::getRegInfo() { return nullptr; } +const MachineRegisterInfo *MachineInstr::getRegInfo() const { + if (const MachineBasicBlock *MBB = getParent()) + return &MBB->getParent()->getRegInfo(); + return nullptr; +} + void MachineInstr::removeRegOperandsFromUseLists(MachineRegisterInfo &MRI) { for (MachineOperand &MO : operands()) if (MO.isReg()) @@ -185,6 +193,8 @@ static void moveOperands(MachineOperand *Dst, MachineOperand *Src, /// an explicit operand it is added at the end of the explicit operand list /// (before the first implicit operand). 
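 /// For example (an illustrative sketch, not part of this change): starting
 /// from the operand list [def %0, use %1, implicit-def $eflags], adding an
 /// explicit use of %2 yields [def %0, use %1, use %2, implicit-def $eflags],
 /// whereas adding another implicit operand simply appends it at the end.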
void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) { + assert(isUInt<LLVM_MI_NUMOPERANDS_BITS>(NumOperands + 1) && + "Cannot add more operands."); assert(MCID && "Cannot add operands before providing an instr descriptor"); // Check if we're adding one of our existing operands. @@ -526,14 +536,14 @@ void MachineInstr::cloneInstrSymbols(MachineFunction &MF, setPCSections(MF, MI.getPCSections()); } -uint16_t MachineInstr::mergeFlagsWith(const MachineInstr &Other) const { +uint32_t MachineInstr::mergeFlagsWith(const MachineInstr &Other) const { // For now, the just return the union of the flags. If the flags get more // complicated over time, we might need more logic here. return getFlags() | Other.getFlags(); } -uint16_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { - uint16_t MIFlags = 0; +uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { + uint32_t MIFlags = 0; // Copy the wrapping flags. if (const OverflowingBinaryOperator *OB = dyn_cast<OverflowingBinaryOperator>(&I)) { @@ -567,6 +577,9 @@ uint16_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::FmReassoc; } + if (I.getMetadata(LLVMContext::MD_unpredictable)) + MIFlags |= MachineInstr::MIFlag::Unpredictable; + return MIFlags; } @@ -1715,7 +1728,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (FirstOp) FirstOp = false; else OS << ","; OS << " "; - if (isDebugValue() && MO.isMetadata()) { + if (isDebugValueLike() && MO.isMetadata()) { // Pretty print DBG_VALUE* instructions. auto *DIV = dyn_cast<DILocalVariable>(MO.getMetadata()); if (DIV && !DIV->getName().empty()) @@ -1871,7 +1884,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, } // Print extra comments for DEBUG_VALUE. 
- if (isDebugValue() && getDebugVariableOp().isMetadata()) { + if (isDebugValueLike() && getDebugVariableOp().isMetadata()) { if (!HaveSemi) { OS << ";"; HaveSemi = true; @@ -2378,3 +2391,72 @@ unsigned MachineInstr::getDebugInstrNum(MachineFunction &MF) { DebugInstrNum = MF.getNewDebugInstrNum(); return DebugInstrNum; } + +std::tuple<LLT, LLT> MachineInstr::getFirst2LLTs() const { + return std::tuple(getRegInfo()->getType(getOperand(0).getReg()), + getRegInfo()->getType(getOperand(1).getReg())); +} + +std::tuple<LLT, LLT, LLT> MachineInstr::getFirst3LLTs() const { + return std::tuple(getRegInfo()->getType(getOperand(0).getReg()), + getRegInfo()->getType(getOperand(1).getReg()), + getRegInfo()->getType(getOperand(2).getReg())); +} + +std::tuple<LLT, LLT, LLT, LLT> MachineInstr::getFirst4LLTs() const { + return std::tuple(getRegInfo()->getType(getOperand(0).getReg()), + getRegInfo()->getType(getOperand(1).getReg()), + getRegInfo()->getType(getOperand(2).getReg()), + getRegInfo()->getType(getOperand(3).getReg())); +} + +std::tuple<LLT, LLT, LLT, LLT, LLT> MachineInstr::getFirst5LLTs() const { + return std::tuple(getRegInfo()->getType(getOperand(0).getReg()), + getRegInfo()->getType(getOperand(1).getReg()), + getRegInfo()->getType(getOperand(2).getReg()), + getRegInfo()->getType(getOperand(3).getReg()), + getRegInfo()->getType(getOperand(4).getReg())); +} + +std::tuple<Register, LLT, Register, LLT> +MachineInstr::getFirst2RegLLTs() const { + Register Reg0 = getOperand(0).getReg(); + Register Reg1 = getOperand(1).getReg(); + return std::tuple(Reg0, getRegInfo()->getType(Reg0), Reg1, + getRegInfo()->getType(Reg1)); +} + +std::tuple<Register, LLT, Register, LLT, Register, LLT> +MachineInstr::getFirst3RegLLTs() const { + Register Reg0 = getOperand(0).getReg(); + Register Reg1 = getOperand(1).getReg(); + Register Reg2 = getOperand(2).getReg(); + return std::tuple(Reg0, getRegInfo()->getType(Reg0), Reg1, + getRegInfo()->getType(Reg1), Reg2, + getRegInfo()->getType(Reg2)); +} + +std::tuple<Register, LLT, Register, LLT, Register, LLT, Register, LLT> +MachineInstr::getFirst4RegLLTs() const { + Register Reg0 = getOperand(0).getReg(); + Register Reg1 = getOperand(1).getReg(); + Register Reg2 = getOperand(2).getReg(); + Register Reg3 = getOperand(3).getReg(); + return std::tuple( + Reg0, getRegInfo()->getType(Reg0), Reg1, getRegInfo()->getType(Reg1), + Reg2, getRegInfo()->getType(Reg2), Reg3, getRegInfo()->getType(Reg3)); +} + +std::tuple<Register, LLT, Register, LLT, Register, LLT, Register, LLT, Register, + LLT> +MachineInstr::getFirst5RegLLTs() const { + Register Reg0 = getOperand(0).getReg(); + Register Reg1 = getOperand(1).getReg(); + Register Reg2 = getOperand(2).getReg(); + Register Reg3 = getOperand(3).getReg(); + Register Reg4 = getOperand(4).getReg(); + return std::tuple( + Reg0, getRegInfo()->getType(Reg0), Reg1, getRegInfo()->getType(Reg1), + Reg2, getRegInfo()->getType(Reg2), Reg3, getRegInfo()->getType(Reg3), + Reg4, getRegInfo()->getType(Reg4)); +} diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 0c059a145ca4..b9db34f7be95 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -58,8 +58,7 @@ bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) { if (MI->isBundle()) { while (++MII != MIE && MII->isBundledWithPred()) { MII->unbundleFromPred(); - for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MII->getOperand(i); + for (MachineOperand &MO : 
                                   MII->operands()) {
           if (MO.isReg() && MO.isInternalRead())
             MO.setIsInternalRead(false);
         }
@@ -149,8 +148,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
     if (MII->isDebugInstr())
       continue;
 
-    for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = MII->getOperand(i);
+    for (MachineOperand &MO : MII->operands()) {
       if (!MO.isReg())
         continue;
       if (MO.isDef()) {
@@ -199,8 +197,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
       }
 
       if (!MO.isDead() && Reg.isPhysical()) {
-        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
-          unsigned SubReg = *SubRegs;
+        for (MCPhysReg SubReg : TRI->subregs(Reg)) {
           if (LocalDefSet.insert(SubReg).second)
             LocalDefs.push_back(SubReg);
         }
@@ -310,6 +307,34 @@ VirtRegInfo llvm::AnalyzeVirtRegInBundle(
   return RI;
 }
 
+std::pair<LaneBitmask, LaneBitmask>
+llvm::AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg,
+                                  const MachineRegisterInfo &MRI,
+                                  const TargetRegisterInfo &TRI) {
+
+  LaneBitmask UseMask, DefMask;
+
+  for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+    const MachineOperand &MO = *O;
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg == 0 && MO.isUse() && !MO.isUndef())
+      UseMask |= MRI.getMaxLaneMaskForVReg(Reg);
+
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+    if (MO.isDef()) {
+      if (!MO.isUndef())
+        UseMask |= ~SubRegMask;
+      DefMask |= SubRegMask;
+    } else if (!MO.isUndef())
+      UseMask |= SubRegMask;
+  }
+
+  return {UseMask, DefMask};
+}
+
 PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg,
                                          const TargetRegisterInfo *TRI) {
   bool AllDefsDead = true;
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 1c09c01df3aa..4e80e9b58c06 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -112,26 +112,26 @@ STATISTIC(NumNotHoistedDueToHotness,
 namespace {
 
   class MachineLICMBase : public MachineFunctionPass {
-    const TargetInstrInfo *TII;
-    const TargetLoweringBase *TLI;
-    const TargetRegisterInfo *TRI;
-    const MachineFrameInfo *MFI;
-    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII = nullptr;
+    const TargetLoweringBase *TLI = nullptr;
+    const TargetRegisterInfo *TRI = nullptr;
+    const MachineFrameInfo *MFI = nullptr;
+    MachineRegisterInfo *MRI = nullptr;
     TargetSchedModel SchedModel;
-    bool PreRegAlloc;
-    bool HasProfileData;
+    bool PreRegAlloc = false;
+    bool HasProfileData = false;
 
     // Various analyses that we use...
-    AliasAnalysis *AA;               // Alias analysis info.
-    MachineBlockFrequencyInfo *MBFI; // Machine block frequncy info
-    MachineLoopInfo *MLI;            // Current MachineLoopInfo
-    MachineDominatorTree *DT;        // Machine dominator tree for the cur loop
+    AliasAnalysis *AA = nullptr;               // Alias analysis info.
+    MachineBlockFrequencyInfo *MBFI = nullptr; // Machine block frequency info
+    MachineLoopInfo *MLI = nullptr;            // Current MachineLoopInfo
+    MachineDominatorTree *DT = nullptr; // Machine dominator tree for the current loop
 
     // State that is updated as we process loops
-    bool Changed;                    // True if a loop is changed.
-    bool FirstInLoop;                // True if it's the first LICM in the loop.
-    MachineLoop *CurLoop;            // The current loop we are working on.
+    bool Changed = false;            // True if a loop is changed.
+    bool FirstInLoop = false;        // True if it's the first LICM in the loop.
+    MachineLoop *CurLoop = nullptr;  // The current loop we are working on.
+ MachineBasicBlock *CurPreheader = nullptr; // The preheader for CurLoop. // Exit blocks for CurLoop. SmallVector<MachineBasicBlock *, 8> ExitBlocks; @@ -163,7 +163,7 @@ namespace { // If a MBB does not dominate loop exiting blocks then it may not safe // to hoist loads from this block. // Tri-state: 0 - false, 1 - true, 2 - unknown - unsigned SpeculationState; + unsigned SpeculationState = SpeculateUnknown; public: MachineLICMBase(char &PassID, bool PreRegAlloc) @@ -575,8 +575,8 @@ void MachineLICMBase::HoistRegionPostRA() { if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { bool Safe = true; MachineInstr *MI = Candidate.MI; - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || MO.isDef() || !MO.getReg()) + for (const MachineOperand &MO : MI->all_uses()) { + if (!MO.getReg()) continue; Register Reg = MO.getReg(); if (PhysRegDefs.test(Reg) || @@ -600,8 +600,9 @@ void MachineLICMBase::AddToLiveIns(MCRegister Reg) { if (!BB->isLiveIn(Reg)) BB->addLiveIn(Reg); for (MachineInstr &MI : *BB) { - for (MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue; + for (MachineOperand &MO : MI.all_uses()) { + if (!MO.getReg()) + continue; if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg())) MO.setIsKill(false); } @@ -669,8 +670,8 @@ bool MachineLICMBase::isTriviallyReMaterializable( if (!TII->isTriviallyReMaterializable(MI)) return false; - for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) + for (const MachineOperand &MO : MI.all_uses()) { + if (MO.getReg().isVirtual()) return false; } @@ -866,7 +867,7 @@ MachineLICMBase::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, continue; const int *PS = TRI->getRegClassPressureSets(RC); for (; *PS != -1; ++PS) { - if (Cost.find(*PS) == Cost.end()) + if (!Cost.contains(*PS)) Cost[*PS] = RCCost; else Cost[*PS] += RCCost; @@ -1014,9 +1015,7 @@ bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const { SmallVector<const MachineInstr*, 8> Work(1, MI); do { MI = Work.pop_back_val(); - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; + for (const MachineOperand &MO : MI->all_defs()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -1455,8 +1454,8 @@ bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop // rather than just live for part of it. - for (MachineOperand &MO : MI->operands()) - if (MO.isReg() && MO.isDef() && !MO.isDead()) + for (MachineOperand &MO : MI->all_defs()) + if (!MO.isDead()) MRI->clearKillFlags(MO.getReg()); // Add to the CSE map. diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp index c400ce190b46..c44b968b317d 100644 --- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp +++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp @@ -39,17 +39,29 @@ STATISTIC(NumRemoved, "Number of redundant instructions removed."); namespace { class MachineLateInstrsCleanup : public MachineFunctionPass { - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + + // Data structures to map regs to their definitions and kills per MBB. 
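+  // For example (an illustrative sketch with a made-up opcode): after seeing
+  // "%r5 = LHI 0" in a block, RegDefs for that block maps %r5 to that
+  // instruction, so a later identical "%r5 = LHI 0" can be removed as
+  // redundant; RegKills remembers the instruction holding the kill flag of
+  // %r5 so stale kill flags can be cleared cheaply.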
+ struct Reg2MIMap : public SmallDenseMap<Register, MachineInstr *> { + bool hasIdentical(Register Reg, MachineInstr *ArgMI) { + MachineInstr *MI = lookup(Reg); + return MI && MI->isIdenticalTo(*ArgMI); + } + }; - // Data structures to map regs to their definitions per MBB. - using Reg2DefMap = std::map<Register, MachineInstr*>; - std::vector<Reg2DefMap> RegDefs; + std::vector<Reg2MIMap> RegDefs; + std::vector<Reg2MIMap> RegKills; // Walk through the instructions in MBB and remove any redundant // instructions. bool processBlock(MachineBasicBlock *MBB); + void removeRedundantDef(MachineInstr *MI); + void clearKillsForDef(Register Reg, MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + BitVector &VisitedPreds); + public: static char ID; // Pass identification, replacement for typeid @@ -88,6 +100,8 @@ bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { RegDefs.clear(); RegDefs.resize(MF.getNumBlockIDs()); + RegKills.clear(); + RegKills.resize(MF.getNumBlockIDs()); // Visit all MBBs in an order that maximises the reuse from predecessors. bool Changed = false; @@ -102,41 +116,36 @@ bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) { // in MBB and if needed continue in predecessors until a use/def of Reg is // encountered. This seems to be faster in practice than tracking kill flags // in a map. -static void clearKillsForDef(Register Reg, MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - BitVector &VisitedPreds, - const TargetRegisterInfo *TRI) { +void MachineLateInstrsCleanup:: +clearKillsForDef(Register Reg, MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + BitVector &VisitedPreds) { VisitedPreds.set(MBB->getNumber()); - while (I != MBB->begin()) { - --I; - bool Found = false; - for (auto &MO : I->operands()) - if (MO.isReg() && TRI->regsOverlap(MO.getReg(), Reg)) { - if (MO.isDef()) - return; - if (MO.readsReg()) { - MO.setIsKill(false); - Found = true; // Keep going for an implicit kill of the super-reg. - } - } - if (Found) - return; + + // Kill flag in MBB + if (MachineInstr *KillMI = RegKills[MBB->getNumber()].lookup(Reg)) { + KillMI->clearRegisterKills(Reg, TRI); + return; } + // Def in MBB (missing kill flag) + if (MachineInstr *DefMI = RegDefs[MBB->getNumber()].lookup(Reg)) + if (DefMI->getParent() == MBB) + return; + // If an earlier def is not in MBB, continue in predecessors. 
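+  // For example (an illustrative CFG): if the reused def of Reg lives in
+  // %bb.0 and MBB is %bb.2, the walk visits the predecessor %bb.1; if %bb.1
+  // holds the kill flag of Reg, it is cleared there and the recursion stops
+  // without scanning instructions one by one.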
if (!MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); assert(!MBB->pred_empty() && "Predecessor def not found!"); for (MachineBasicBlock *Pred : MBB->predecessors()) if (!VisitedPreds.test(Pred->getNumber())) - clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds, TRI); + clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds); } -static void removeRedundantDef(MachineInstr *MI, - const TargetRegisterInfo *TRI) { +void MachineLateInstrsCleanup::removeRedundantDef(MachineInstr *MI) { Register Reg = MI->getOperand(0).getReg(); BitVector VisitedPreds(MI->getMF()->getNumBlockIDs()); - clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds, TRI); + clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds); MI->eraseFromParent(); ++NumRemoved; } @@ -172,18 +181,18 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg, bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { bool Changed = false; - Reg2DefMap &MBBDefs = RegDefs[MBB->getNumber()]; + Reg2MIMap &MBBDefs = RegDefs[MBB->getNumber()]; + Reg2MIMap &MBBKills = RegKills[MBB->getNumber()]; // Find reusable definitions in the predecessor(s). - if (!MBB->pred_empty() && !MBB->isEHPad()) { + if (!MBB->pred_empty() && !MBB->isEHPad() && + !MBB->isInlineAsmBrIndirectTarget()) { MachineBasicBlock *FirstPred = *MBB->pred_begin(); for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()]) if (llvm::all_of( drop_begin(MBB->predecessors()), [&, &Reg = Reg, &DefMI = DefMI](const MachineBasicBlock *Pred) { - auto PredDefI = RegDefs[Pred->getNumber()].find(Reg); - return PredDefI != RegDefs[Pred->getNumber()].end() && - DefMI->isIdenticalTo(*PredDefI->second); + return RegDefs[Pred->getNumber()].hasIdentical(Reg, DefMI); })) { MBBDefs[Reg] = DefMI; LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in " @@ -200,6 +209,7 @@ bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { // it) are valid. if (MI.modifiesRegister(FrameReg, TRI)) { MBBDefs.clear(); + MBBKills.clear(); continue; } @@ -207,24 +217,23 @@ bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg); // Check for an earlier identical and reusable instruction. - if (IsCandidate) { - auto DefI = MBBDefs.find(DefedReg); - if (DefI != MBBDefs.end() && MI.isIdenticalTo(*DefI->second)) { - LLVM_DEBUG(dbgs() << "Removing redundant instruction in " - << printMBBReference(*MBB) << ": " << MI;); - removeRedundantDef(&MI, TRI); - Changed = true; - continue; - } + if (IsCandidate && MBBDefs.hasIdentical(DefedReg, &MI)) { + LLVM_DEBUG(dbgs() << "Removing redundant instruction in " + << printMBBReference(*MBB) << ": " << MI;); + removeRedundantDef(&MI); + Changed = true; + continue; } // Clear any entries in map that MI clobbers. - for (auto DefI = MBBDefs.begin(); DefI != MBBDefs.end();) { - Register Reg = DefI->first; - if (MI.modifiesRegister(Reg, TRI)) - DefI = MBBDefs.erase(DefI); - else - ++DefI; + for (auto DefI : llvm::make_early_inc_range(MBBDefs)) { + Register Reg = DefI.first; + if (MI.modifiesRegister(Reg, TRI)) { + MBBDefs.erase(Reg); + MBBKills.erase(Reg); + } else if (MI.findRegisterUseOperandIdx(Reg, true /*isKill*/, TRI) != -1) + // Keep track of register kills. + MBBKills[Reg] = &MI; } // Record this MI for potential later reuse. 
@@ -232,6 +241,7 @@ bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Found interesting instruction in " << printMBBReference(*MBB) << ": " << MI;); MBBDefs[DefedReg] = &MI; + assert(!MBBKills.count(DefedReg) && "Should already have been removed."); } } diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp index fb3af385a0c1..37a0ff3d71c8 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/Analysis/LoopInfoImpl.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -23,6 +22,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" +#include "llvm/Support/GenericLoopInfoImpl.h" using namespace llvm; diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp index a0c0166d06f0..921feb253d64 100644 --- a/llvm/lib/CodeGen/MachineModuleInfo.cpp +++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp @@ -56,11 +56,10 @@ void MachineModuleInfo::finalize() { MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI) : TM(std::move(MMI.TM)), - Context(MMI.TM.getTargetTriple(), MMI.TM.getMCAsmInfo(), - MMI.TM.getMCRegisterInfo(), MMI.TM.getMCSubtargetInfo(), nullptr, - &MMI.TM.Options.MCOptions, false), + Context(TM.getTargetTriple(), TM.getMCAsmInfo(), TM.getMCRegisterInfo(), + TM.getMCSubtargetInfo(), nullptr, &TM.Options.MCOptions, false), MachineFunctions(std::move(MMI.MachineFunctions)) { - Context.setObjectFileInfo(MMI.TM.getObjFileLowering()); + Context.setObjectFileInfo(TM.getObjFileLowering()); ObjFileMMI = MMI.ObjFileMMI; CurCallSite = MMI.CurCallSite; ExternalContext = MMI.ExternalContext; @@ -107,6 +106,10 @@ MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) { const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F); MF = new MachineFunction(F, TM, STI, NextFnNum++, *this); MF->initTargetMachineFunctionInfo(STI); + + // MRI callback for target specific initializations. + TM.registerMachineRegisterInfoCallback(*MF); + // Update the set entry. I.first->second.reset(MF); } else { diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 0a7b12e9ccb9..788c134b6ee8 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/MIRFormatter.h" @@ -53,6 +52,11 @@ static MachineFunction *getMFIfAvailable(MachineOperand &MO) { getMFIfAvailable(const_cast<const MachineOperand &>(MO))); } +unsigned MachineOperand::getOperandNo() const { + assert(getParent() && "Operand does not belong to any instruction!"); + return getParent()->getOperandNo(this); +} + void MachineOperand::setReg(Register Reg) { if (getReg() == Reg) return; // No change. @@ -986,7 +990,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, case MachineOperand::MO_Predicate: { auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); OS << (CmpInst::isIntPredicate(Pred) ? 
"int" : "float") << "pred(" - << CmpInst::getPredicateName(Pred) << ')'; + << Pred << ')'; break; } case MachineOperand::MO_ShuffleMask: @@ -1022,10 +1026,10 @@ unsigned MachinePointerInfo::getAddrSpace() const { return AddrSpace; } /// Offset + Size byte. bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const { - if (!V.is<const Value *>()) + if (!isa<const Value *>(V)) return false; - const Value *BasePtr = V.get<const Value *>(); + const Value *BasePtr = cast<const Value *>(V); if (BasePtr == nullptr) return false; @@ -1070,8 +1074,8 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f, AtomicOrdering FailureOrdering) : PtrInfo(ptrinfo), MemoryType(type), FlagVals(f), BaseAlign(a), AAInfo(AAInfo), Ranges(Ranges) { - assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue *>() || - isa<PointerType>(PtrInfo.V.get<const Value *>()->getType())) && + assert((PtrInfo.V.isNull() || isa<const PseudoSourceValue *>(PtrInfo.V) || + isa<PointerType>(cast<const Value *>(PtrInfo.V)->getType())) && "invalid pointer value"); assert((isLoad() || isStore()) && "Not a load/store!"); @@ -1093,16 +1097,6 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f, s == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * s), a, AAInfo, Ranges, SSID, Ordering, FailureOrdering) {} -/// Profile - Gather unique data for the object. -/// -void MachineMemOperand::Profile(FoldingSetNodeID &ID) const { - ID.AddInteger(getOffset()); - ID.AddInteger(getMemoryType().getUniqueRAWLLTData()); - ID.AddPointer(getOpaqueValue()); - ID.AddInteger(getFlags()); - ID.AddInteger(getBaseAlign().value()); -} - void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) { // The Value and Offset may differ due to CSE. But the flags and size // should be the same. diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index c7ba66bd3678..a0769105c929 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -89,11 +89,14 @@ STATISTIC(NumOutlined, "Number of candidates outlined"); STATISTIC(FunctionsCreated, "Number of functions created"); // Statistics for instruction mapping. -STATISTIC(NumLegalInUnsignedVec, "Number of legal instrs in unsigned vector"); +STATISTIC(NumLegalInUnsignedVec, "Outlinable instructions mapped"); STATISTIC(NumIllegalInUnsignedVec, - "Number of illegal instrs in unsigned vector"); -STATISTIC(NumInvisible, "Number of invisible instrs in unsigned vector"); -STATISTIC(UnsignedVecSize, "Size of unsigned vector"); + "Unoutlinable instructions mapped + number of sentinel values"); +STATISTIC(NumSentinels, "Sentinel values inserted during mapping"); +STATISTIC(NumInvisible, + "Invisible instructions skipped during mapping"); +STATISTIC(UnsignedVecSize, + "Total number of instructions mapped and saved to mapping vector"); // Set to true if the user wants the outliner to run on linkonceodr linkage // functions. This is false by default because the linker can dedupe linkonceodr @@ -113,6 +116,11 @@ static cl::opt<unsigned> OutlinerReruns( cl::desc( "Number of times to rerun the outliner after the initial outline")); +static cl::opt<unsigned> OutlinerBenefitThreshold( + "outliner-benefit-threshold", cl::init(1), cl::Hidden, + cl::desc( + "The minimum size in bytes before an outlining candidate is accepted")); + namespace { /// Maps \p MachineInstrs to unsigned integers and stores the mappings. 
@@ -136,11 +144,11 @@ struct InstructionMapper { DenseMap<MachineBasicBlock *, unsigned> MBBFlagsMap; /// The vector of unsigned integers that the module is mapped to. - std::vector<unsigned> UnsignedVec; + SmallVector<unsigned> UnsignedVec; /// Stores the location of the instruction associated with the integer /// at index i in \p UnsignedVec for each index i. - std::vector<MachineBasicBlock::iterator> InstrList; + SmallVector<MachineBasicBlock::iterator> InstrList; // Set if we added an illegal number in the previous step. // Since each illegal number is unique, we only need one of them between @@ -157,8 +165,8 @@ struct InstructionMapper { unsigned mapToLegalUnsigned( MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr, bool &HaveLegalRange, unsigned &NumLegalInBlock, - std::vector<unsigned> &UnsignedVecForMBB, - std::vector<MachineBasicBlock::iterator> &InstrListForMBB) { + SmallVector<unsigned> &UnsignedVecForMBB, + SmallVector<MachineBasicBlock::iterator> &InstrListForMBB) { // We added something legal, so we should unset the AddedLegalLastTime // flag. AddedIllegalLastTime = false; @@ -211,8 +219,8 @@ struct InstructionMapper { /// \returns The integer that \p *It was mapped to. unsigned mapToIllegalUnsigned( MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr, - std::vector<unsigned> &UnsignedVecForMBB, - std::vector<MachineBasicBlock::iterator> &InstrListForMBB) { + SmallVector<unsigned> &UnsignedVecForMBB, + SmallVector<MachineBasicBlock::iterator> &InstrListForMBB) { // Can't outline an illegal instruction. Set the flag. CanOutlineWithPrevInstr = false; @@ -254,12 +262,20 @@ struct InstructionMapper { /// \param TII \p TargetInstrInfo for the function. void convertToUnsignedVec(MachineBasicBlock &MBB, const TargetInstrInfo &TII) { + LLVM_DEBUG(dbgs() << "*** Converting MBB '" << MBB.getName() + << "' to unsigned vector ***\n"); unsigned Flags = 0; // Don't even map in this case. if (!TII.isMBBSafeToOutlineFrom(MBB, Flags)) return; + auto OutlinableRanges = TII.getOutlinableRanges(MBB, Flags); + LLVM_DEBUG(dbgs() << MBB.getName() << ": " << OutlinableRanges.size() + << " outlinable range(s)\n"); + if (OutlinableRanges.empty()) + return; + // Store info for the MBB for later outlining. MBBFlagsMap[&MBB] = Flags; @@ -279,40 +295,71 @@ struct InstructionMapper { // FIXME: Should this all just be handled in the target, rather than using // repeated calls to getOutliningType? - std::vector<unsigned> UnsignedVecForMBB; - std::vector<MachineBasicBlock::iterator> InstrListForMBB; - - for (MachineBasicBlock::iterator Et = MBB.end(); It != Et; ++It) { - // Keep track of where this instruction is in the module. - switch (TII.getOutliningType(It, Flags)) { - case InstrType::Illegal: + SmallVector<unsigned> UnsignedVecForMBB; + SmallVector<MachineBasicBlock::iterator> InstrListForMBB; + + LLVM_DEBUG(dbgs() << "*** Mapping outlinable ranges ***\n"); + for (auto &OutlinableRange : OutlinableRanges) { + auto OutlinableRangeBegin = OutlinableRange.first; + auto OutlinableRangeEnd = OutlinableRange.second; +#ifndef NDEBUG + LLVM_DEBUG( + dbgs() << "Mapping " + << std::distance(OutlinableRangeBegin, OutlinableRangeEnd) + << " instruction range\n"); + // Everything outside of an outlinable range is illegal. 
+ unsigned NumSkippedInRange = 0; +#endif + for (; It != OutlinableRangeBegin; ++It) { +#ifndef NDEBUG + ++NumSkippedInRange; +#endif mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::Legal: - mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, - NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::LegalTerminator: - mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, - NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - // The instruction also acts as a terminator, so we have to record that - // in the string. - mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + } +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "Skipped " << NumSkippedInRange + << " instructions outside outlinable range\n"); +#endif + assert(It != MBB.end() && "Should still have instructions?"); + // `It` is now positioned at the beginning of a range of instructions + // which may be outlinable. Check if each instruction is known to be safe. + for (; It != OutlinableRangeEnd; ++It) { + // Keep track of where this instruction is in the module. + switch (TII.getOutliningType(It, Flags)) { + case InstrType::Illegal: + mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + InstrListForMBB); + break; + + case InstrType::Legal: + mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, + NumLegalInBlock, UnsignedVecForMBB, + InstrListForMBB); + break; + + case InstrType::LegalTerminator: + mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange, + NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB); - break; - - case InstrType::Invisible: - // Normally this is set by mapTo(Blah)Unsigned, but we just want to - // skip this instruction. So, unset the flag here. - ++NumInvisible; - AddedIllegalLastTime = false; - break; + // The instruction also acts as a terminator, so we have to record + // that in the string. + mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, + InstrListForMBB); + break; + + case InstrType::Invisible: + // Normally this is set by mapTo(Blah)Unsigned, but we just want to + // skip this instruction. So, unset the flag here. + ++NumInvisible; + AddedIllegalLastTime = false; + break; + } } } + LLVM_DEBUG(dbgs() << "HaveLegalRange = " << HaveLegalRange << "\n"); + // Are there enough legal instructions in the block for outlining to be // possible? if (HaveLegalRange) { @@ -322,8 +369,9 @@ struct InstructionMapper { // repeated substring. mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB, InstrListForMBB); - llvm::append_range(InstrList, InstrListForMBB); - llvm::append_range(UnsignedVec, UnsignedVecForMBB); + ++NumSentinels; + append_range(InstrList, InstrListForMBB); + append_range(UnsignedVec, UnsignedVecForMBB); } } @@ -533,11 +581,19 @@ void MachineOutliner::findCandidates( // First, find all of the repeated substrings in the tree of minimum length // 2. std::vector<Candidate> CandidatesForRepeatedSeq; + LLVM_DEBUG(dbgs() << "*** Discarding overlapping candidates *** \n"); + LLVM_DEBUG( + dbgs() << "Searching for overlaps in all repeated sequences...\n"); for (const SuffixTree::RepeatedSubstring &RS : ST) { CandidatesForRepeatedSeq.clear(); unsigned StringLen = RS.Length; + LLVM_DEBUG(dbgs() << " Sequence length: " << StringLen << "\n"); + // Debug code to keep track of how many candidates we removed. 
+#ifndef NDEBUG + unsigned NumDiscarded = 0; + unsigned NumKept = 0; +#endif for (const unsigned &StartIdx : RS.StartIndices) { - unsigned EndIdx = StartIdx + StringLen - 1; // Trick: Discard some candidates that would be incompatible with the // ones we've already found for this sequence. This will save us some // work in candidate selection. @@ -559,23 +615,39 @@ void MachineOutliner::findCandidates( // That is, one must either // * End before the other starts // * Start after the other ends - if (llvm::all_of(CandidatesForRepeatedSeq, [&StartIdx, - &EndIdx](const Candidate &C) { - return (EndIdx < C.getStartIdx() || StartIdx > C.getEndIdx()); - })) { - // It doesn't overlap with anything, so we can outline it. - // Each sequence is over [StartIt, EndIt]. - // Save the candidate and its location. - - MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx]; - MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; - MachineBasicBlock *MBB = StartIt->getParent(); - - CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt, - EndIt, MBB, FunctionList.size(), - Mapper.MBBFlagsMap[MBB]); + unsigned EndIdx = StartIdx + StringLen - 1; + auto FirstOverlap = find_if( + CandidatesForRepeatedSeq, [StartIdx, EndIdx](const Candidate &C) { + return EndIdx >= C.getStartIdx() && StartIdx <= C.getEndIdx(); + }); + if (FirstOverlap != CandidatesForRepeatedSeq.end()) { +#ifndef NDEBUG + ++NumDiscarded; + LLVM_DEBUG(dbgs() << " .. DISCARD candidate @ [" << StartIdx + << ", " << EndIdx << "]; overlaps with candidate @ [" + << FirstOverlap->getStartIdx() << ", " + << FirstOverlap->getEndIdx() << "]\n"); +#endif + continue; } + // It doesn't overlap with anything, so we can outline it. + // Each sequence is over [StartIt, EndIt]. + // Save the candidate and its location. +#ifndef NDEBUG + ++NumKept; +#endif + MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx]; + MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx]; + MachineBasicBlock *MBB = StartIt->getParent(); + CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt, EndIt, + MBB, FunctionList.size(), + Mapper.MBBFlagsMap[MBB]); } +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << " Candidates discarded: " << NumDiscarded + << "\n"); + LLVM_DEBUG(dbgs() << " Candidates kept: " << NumKept << "\n\n"); +#endif // We've found something we might want to outline. // Create an OutlinedFunction to store it and check if it'd be beneficial @@ -588,21 +660,21 @@ void MachineOutliner::findCandidates( const TargetInstrInfo *TII = CandidatesForRepeatedSeq[0].getMF()->getSubtarget().getInstrInfo(); - OutlinedFunction OF = + std::optional<OutlinedFunction> OF = TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq); // If we deleted too many candidates, then there's nothing worth outlining. // FIXME: This should take target-specified instruction sizes into account. - if (OF.Candidates.size() < 2) + if (!OF || OF->Candidates.size() < 2) continue; // Is it better to outline this candidate than not? 
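     // For example (hypothetical sizes, for illustration only): a 12-byte
     // sequence occurring three times, with a 4-byte call per occurrence and
     // a 16-byte outlined function body, saves roughly
     // 3*12 - (3*4 + 16) = 8 bytes, so its benefit would be 8.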
- if (OF.getBenefit() < 1) { - emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, OF); + if (OF->getBenefit() < OutlinerBenefitThreshold) { + emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, *OF); continue; } - FunctionList.push_back(OF); + FunctionList.push_back(*OF); } } @@ -616,6 +688,7 @@ MachineFunction *MachineOutliner::createOutlinedFunction( if (OutlineRepeatedNum > 0) FunctionName += std::to_string(OutlineRepeatedNum + 1) + "_"; FunctionName += std::to_string(Name); + LLVM_DEBUG(dbgs() << "NEW FUNCTION: " << FunctionName << "\n"); // Create the function using an IR-level function. LLVMContext &C = M.getContext(); @@ -653,6 +726,7 @@ MachineFunction *MachineOutliner::createOutlinedFunction( MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); + MF.setIsOutlined(true); MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock(); // Insert the new function into the module. @@ -720,7 +794,7 @@ MachineFunction *MachineOutliner::createOutlinedFunction( Mangler Mg; // Get the mangled name of the function for the linkage name. std::string Dummy; - llvm::raw_string_ostream MangledNameStream(Dummy); + raw_string_ostream MangledNameStream(Dummy); Mg.getNameWithPrefix(MangledNameStream, F, false); DISubprogram *OutlinedSP = DB.createFunction( @@ -750,30 +824,51 @@ bool MachineOutliner::outline(Module &M, std::vector<OutlinedFunction> &FunctionList, InstructionMapper &Mapper, unsigned &OutlinedFunctionNum) { - + LLVM_DEBUG(dbgs() << "*** Outlining ***\n"); + LLVM_DEBUG(dbgs() << "NUMBER OF POTENTIAL FUNCTIONS: " << FunctionList.size() + << "\n"); bool OutlinedSomething = false; // Sort by benefit. The most beneficial functions should be outlined first. - llvm::stable_sort(FunctionList, [](const OutlinedFunction &LHS, - const OutlinedFunction &RHS) { - return LHS.getBenefit() > RHS.getBenefit(); - }); + stable_sort(FunctionList, + [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) { + return LHS.getBenefit() > RHS.getBenefit(); + }); // Walk over each function, outlining them as we go along. Functions are // outlined greedily, based off the sort above. + auto *UnsignedVecBegin = Mapper.UnsignedVec.begin(); + LLVM_DEBUG(dbgs() << "WALKING FUNCTION LIST\n"); for (OutlinedFunction &OF : FunctionList) { +#ifndef NDEBUG + auto NumCandidatesBefore = OF.Candidates.size(); +#endif // If we outlined something that overlapped with a candidate in a previous // step, then we can't outline from it. - erase_if(OF.Candidates, [&Mapper](Candidate &C) { - return std::any_of( - Mapper.UnsignedVec.begin() + C.getStartIdx(), - Mapper.UnsignedVec.begin() + C.getEndIdx() + 1, - [](unsigned I) { return (I == static_cast<unsigned>(-1)); }); + erase_if(OF.Candidates, [&UnsignedVecBegin](Candidate &C) { + return std::any_of(UnsignedVecBegin + C.getStartIdx(), + UnsignedVecBegin + C.getEndIdx() + 1, [](unsigned I) { + return I == static_cast<unsigned>(-1); + }); }); +#ifndef NDEBUG + auto NumCandidatesAfter = OF.Candidates.size(); + LLVM_DEBUG(dbgs() << "PRUNED: " << NumCandidatesBefore - NumCandidatesAfter + << "/" << NumCandidatesBefore << " candidates\n"); +#endif + // If we made it unbeneficial to outline this function, skip it. 
-    if (OF.getBenefit() < 1)
+    if (OF.getBenefit() < OutlinerBenefitThreshold) {
+      LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF.getBenefit()
+                        << " B) < threshold (" << OutlinerBenefitThreshold
+                        << " B)\n");
       continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF.getBenefit()
+                      << " B) > threshold (" << OutlinerBenefitThreshold
+                      << " B)\n");
 
     // It's beneficial. Create the function and outline its sequence's
     // occurrences.
@@ -786,6 +881,7 @@ bool MachineOutliner::outline(Module &M,
     const TargetInstrInfo &TII = *STI.getInstrInfo();
 
     // Replace occurrences of the sequence with calls to the new function.
+    LLVM_DEBUG(dbgs() << "CREATE OUTLINED CALLS\n");
     for (Candidate &C : OF.Candidates) {
       MachineBasicBlock &MBB = *C.getMBB();
       MachineBasicBlock::iterator StartIt = C.front();
@@ -793,6 +889,18 @@ bool MachineOutliner::outline(Module &M,
 
       // Insert the call.
       auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *MF, C);
+#ifndef NDEBUG
+      auto MBBBeingOutlinedFromName =
+          MBB.getName().empty() ? "<unknown>" : MBB.getName().str();
+      auto MFBeingOutlinedFromName = MBB.getParent()->getName().empty()
+                                         ? "<unknown>"
+                                         : MBB.getParent()->getName().str();
+      LLVM_DEBUG(dbgs() << "  CALL: " << MF->getName() << " in "
+                        << MFBeingOutlinedFromName << ":"
+                        << MBBBeingOutlinedFromName << "\n");
+      LLVM_DEBUG(dbgs() << "   .. " << *CallInst);
+#endif
 
       // If the caller tracks liveness, then we need to make sure that
       // anything we outline doesn't break liveness assumptions. The outlined
@@ -859,9 +967,8 @@ bool MachineOutliner::outline(Module &M,
       MBB.erase(std::next(StartIt), std::next(EndIt));
 
       // Keep track of what we removed by marking them all as -1.
-      for (unsigned &I :
-           llvm::make_range(Mapper.UnsignedVec.begin() + C.getStartIdx(),
-                            Mapper.UnsignedVec.begin() + C.getEndIdx() + 1))
+      for (unsigned &I : make_range(UnsignedVecBegin + C.getStartIdx(),
+                                    UnsignedVecBegin + C.getEndIdx() + 1))
         I = static_cast<unsigned>(-1);
 
       OutlinedSomething = true;
@@ -878,13 +985,12 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
                                      MachineModuleInfo &MMI) {
   // Build instruction mappings for each function in the module. Start by
   // iterating over each Function in M.
+  LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n");
   for (Function &F : M) {
+    LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n");
 
     if (F.hasFnAttribute("nooutline")) {
-      LLVM_DEBUG({
-        dbgs() << "... Skipping function with nooutline attribute: "
-               << F.getName() << "\n";
-      });
+      LLVM_DEBUG(dbgs() << "SKIP: Function has nooutline attribute\n");
       continue;
     }
 
@@ -894,44 +1000,58 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
 
     // If it doesn't, then there's nothing to outline from. Move to the next
    // Function.
-    if (!MF)
+    if (!MF) {
+      LLVM_DEBUG(dbgs() << "SKIP: Function does not have a MachineFunction\n");
       continue;
+    }
 
     const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
-    if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF))
+    if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) {
+      LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from "
+                           "function by default\n");
       continue;
+    }
 
     // We have a MachineFunction. Ask the target if it's suitable for outlining.
    // If it isn't, then move on to the next Function in the module.
- if (!TII->isFunctionSafeToOutlineFrom(*MF, OutlineFromLinkOnceODRs)) + if (!TII->isFunctionSafeToOutlineFrom(*MF, OutlineFromLinkOnceODRs)) { + LLVM_DEBUG(dbgs() << "SKIP: " << MF->getName() + << ": unsafe to outline from\n"); continue; + } // We have a function suitable for outlining. Iterate over every // MachineBasicBlock in MF and try to map its instructions to a list of // unsigned integers. + const unsigned MinMBBSize = 2; + for (MachineBasicBlock &MBB : *MF) { + LLVM_DEBUG(dbgs() << " MAPPING MBB: '" << MBB.getName() << "'\n"); // If there isn't anything in MBB, then there's no point in outlining from // it. // If there are fewer than 2 instructions in the MBB, then it can't ever // contain something worth outlining. // FIXME: This should be based off of the maximum size in B of an outlined // call versus the size in B of the MBB. - if (MBB.empty() || MBB.size() < 2) + if (MBB.size() < MinMBBSize) { + LLVM_DEBUG(dbgs() << " SKIP: MBB size less than minimum size of " + << MinMBBSize << "\n"); continue; + } // Check if MBB could be the target of an indirect branch. If it is, then // we don't want to outline from it. - if (MBB.hasAddressTaken()) + if (MBB.hasAddressTaken()) { + LLVM_DEBUG(dbgs() << " SKIP: MBB's address is taken\n"); continue; + } // MBB is suitable for outlining. Map it to a list of unsigneds. Mapper.convertToUnsignedVec(MBB, *TII); } - - // Statistics. - UnsignedVecSize = Mapper.UnsignedVec.size(); } + // Statistics. + UnsignedVecSize = Mapper.UnsignedVec.size(); } void MachineOutliner::initSizeRemarkInfo( diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp index 039634f3d047..439ff8babcc6 100644 --- a/llvm/lib/CodeGen/MachinePassManager.cpp +++ b/llvm/lib/CodeGen/MachinePassManager.cpp @@ -91,8 +91,8 @@ Error MachineFunctionPassManager::run(Module &M, // TODO: EmitSizeRemarks PreservedAnalyses PassPA = P->run(MF, MFAM); - PI.runAfterPass(*P, MF, PassPA); MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*P, MF, PassPA); } } } while (true); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index adb630469003..c7e7497dab36 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -496,7 +496,7 @@ void SwingSchedulerDAG::schedule() { updatePhiDependences(); Topo.InitDAGTopologicalSorting(); changeDependences(); - postprocessDAG(); + postProcessDAG(); LLVM_DEBUG(dump()); NodeSetType NodeSets; @@ -865,13 +865,11 @@ void SwingSchedulerDAG::updatePhiDependences() { unsigned HasPhiDef = 0; MachineInstr *MI = I.getInstr(); // Iterate over each operand, and we process the definitions. - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - if (!MOI->isReg()) + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) continue; - Register Reg = MOI->getReg(); - if (MOI->isDef()) { + Register Reg = MO.getReg(); + if (MO.isDef()) { // If the register is used by a Phi, then create an anti dependence. for (MachineRegisterInfo::use_instr_iterator UI = MRI.use_instr_begin(Reg), @@ -893,7 +891,7 @@ void SwingSchedulerDAG::updatePhiDependences() { } } } - } else if (MOI->isUse()) { + } else if (MO.isUse()) { // If the register is defined by a Phi, then create a true dependence. 
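        // For example (an illustrative loop, not from this patch):
        //   %v1 = PHI [%init, %preheader], [%v2, %loop]
        // a non-PHI use of %v1 in the loop body gets a zero-latency data
        // dependence on the PHI, modeling the value carried in from the
        // previous iteration.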
MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg); if (DefMI == nullptr) @@ -903,7 +901,7 @@ void SwingSchedulerDAG::updatePhiDependences() { if (!MI->isPHI()) { SDep Dep(SU, SDep::Data, Reg); Dep.setLatency(0); - ST.adjustSchedDependency(SU, 0, &I, MI->getOperandNo(MOI), Dep); + ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep); I.addPred(Dep); } else { HasPhiUse = Reg; @@ -1559,31 +1557,28 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, const MachineInstr *MI = SU->getInstr(); if (MI->isPHI()) continue; - for (const MachineOperand &MO : MI->operands()) - if (MO.isReg() && MO.isUse()) { - Register Reg = MO.getReg(); - if (Reg.isVirtual()) - Uses.insert(Reg); - else if (MRI.isAllocatable(Reg)) - for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid(); - ++Units) - Uses.insert(*Units); - } + for (const MachineOperand &MO : MI->all_uses()) { + Register Reg = MO.getReg(); + if (Reg.isVirtual()) + Uses.insert(Reg); + else if (MRI.isAllocatable(Reg)) + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + Uses.insert(Unit); + } } for (SUnit *SU : NS) - for (const MachineOperand &MO : SU->getInstr()->operands()) - if (MO.isReg() && MO.isDef() && !MO.isDead()) { + for (const MachineOperand &MO : SU->getInstr()->all_defs()) + if (!MO.isDead()) { Register Reg = MO.getReg(); if (Reg.isVirtual()) { if (!Uses.count(Reg)) LiveOutRegs.push_back(RegisterMaskPair(Reg, LaneBitmask::getNone())); } else if (MRI.isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid(); - ++Units) - if (!Uses.count(*Units)) - LiveOutRegs.push_back(RegisterMaskPair(*Units, - LaneBitmask::getNone())); + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) + if (!Uses.count(Unit)) + LiveOutRegs.push_back( + RegisterMaskPair(Unit, LaneBitmask::getNone())); } } RPTracker.addLiveRegs(LiveOutRegs); @@ -2316,7 +2311,7 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep, return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD); } -void SwingSchedulerDAG::postprocessDAG() { +void SwingSchedulerDAG::postProcessDAG() { for (auto &M : Mutations) M->apply(this); } @@ -2654,10 +2649,7 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, if (!isLoopCarried(SSD, *Phi)) return false; unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent()); - for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) { - MachineOperand &DMO = Def->getOperand(i); - if (!DMO.isReg() || !DMO.isDef()) - continue; + for (MachineOperand &DMO : Def->all_defs()) { if (DMO.getReg() == LoopReg) return true; } diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 1ad08e19feae..0048918fc53b 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -101,13 +101,13 @@ MachineRegisterInfo::constrainRegAttrs(Register Reg, const auto RegCB = getRegClassOrRegBank(Reg); if (RegCB.isNull()) setRegClassOrRegBank(Reg, ConstrainingRegCB); - else if (RegCB.is<const TargetRegisterClass *>() != - ConstrainingRegCB.is<const TargetRegisterClass *>()) + else if (isa<const TargetRegisterClass *>(RegCB) != + isa<const TargetRegisterClass *>(ConstrainingRegCB)) return false; - else if (RegCB.is<const TargetRegisterClass *>()) { + else if (isa<const TargetRegisterClass *>(RegCB)) { if (!::constrainRegClass( - *this, Reg, RegCB.get<const TargetRegisterClass *>(), - ConstrainingRegCB.get<const TargetRegisterClass *>(), MinNumRegs)) + *this, Reg, cast<const 
TargetRegisterClass *>(RegCB), + cast<const TargetRegisterClass *>(ConstrainingRegCB), MinNumRegs)) return false; } else if (RegCB != ConstrainingRegCB) return false; @@ -644,16 +644,8 @@ void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) { bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const { const TargetRegisterInfo *TRI = getTargetRegisterInfo(); for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) { - bool IsRootReserved = true; - for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true); - Super.isValid(); ++Super) { - MCRegister Reg = *Super; - if (!isReserved(Reg)) { - IsRootReserved = false; - break; - } - } - if (IsRootReserved) + if (all_of(TRI->superregs_inclusive(*Root), + [&](MCPhysReg Super) { return isReserved(Super); })) return true; } return false; diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index 6de8f8da9254..324084fb9c32 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -21,8 +21,6 @@ using namespace llvm; -const Register MachineSSAContext::ValueRefNull{}; - void MachineSSAContext::setFunction(MachineFunction &Fn) { MF = &Fn; RegInfo = &MF->getRegInfo(); @@ -42,10 +40,8 @@ void MachineSSAContext::appendBlockTerms( void MachineSSAContext::appendBlockDefs(SmallVectorImpl<Register> &defs, const MachineBasicBlock &block) { for (const MachineInstr &instr : block.instrs()) { - for (const MachineOperand &op : instr.operands()) { - if (op.isReg() && op.isDef()) - defs.push_back(op.getReg()); - } + for (const MachineOperand &op : instr.all_defs()) + defs.push_back(op.getReg()); } } @@ -56,7 +52,7 @@ MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return RegInfo->getVRegDef(value)->getParent(); } -bool MachineSSAContext::isConstantValuePhi(const MachineInstr &Phi) { +bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { return Phi.isConstantValuePHI(); } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 5ab5a40e7574..ba5432459d12 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -56,7 +57,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> @@ -98,9 +98,13 @@ cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, cl::opt<bool> MISchedDumpReservedCycles( "misched-dump-reserved-cycles", cl::Hidden, cl::init(false), cl::desc("Dump resource usage at schedule boundary.")); +cl::opt<bool> MischedDetailResourceBooking( + "misched-detail-resource-booking", cl::Hidden, cl::init(false), + cl::desc("Show details of invoking getNextResourceCycle.")); #else const bool ViewMISchedDAGs = false; const bool PrintDAGs = false; +const bool MischedDetailResourceBooking = false; #ifdef LLVM_ENABLE_DUMP const bool MISchedDumpReservedCycles = false; #endif // LLVM_ENABLE_DUMP @@ -147,6 +151,28 @@ static cl::opt<unsigned> cl::desc("The threshold for fast cluster"), cl::init(1000)); +#if !defined(NDEBUG) || 
defined(LLVM_ENABLE_DUMP) +static cl::opt<bool> MISchedDumpScheduleTrace( + "misched-dump-schedule-trace", cl::Hidden, cl::init(false), + cl::desc("Dump resource usage at schedule boundary.")); +static cl::opt<unsigned> + HeaderColWidth("misched-dump-schedule-trace-col-header-width", cl::Hidden, + cl::desc("Set width of the columns with " + "the resources and schedule units"), + cl::init(19)); +static cl::opt<unsigned> + ColWidth("misched-dump-schedule-trace-col-width", cl::Hidden, + cl::desc("Set width of the columns showing resource booking."), + cl::init(5)); +static cl::opt<bool> MISchedSortResourcesInTrace( + "misched-sort-resources-in-trace", cl::Hidden, cl::init(true), + cl::desc("Sort the resources printed in the dump trace")); +#endif + +static cl::opt<unsigned> + MIResourceCutOff("misched-resource-cutoff", cl::Hidden, + cl::desc("Number of intervals to track"), cl::init(10)); + // DAG subtrees must have at least this many nodes. static const unsigned MinSubtreeSize = 8; @@ -777,7 +803,7 @@ void ScheduleDAGMI::schedule() { // Build the DAG. buildSchedGraph(AA); - postprocessDAG(); + postProcessDAG(); SmallVector<SUnit*, 8> TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -844,7 +870,7 @@ void ScheduleDAGMI::schedule() { } /// Apply each ScheduleDAGMutation step in order. -void ScheduleDAGMI::postprocessDAG() { +void ScheduleDAGMI::postProcessDAG() { for (auto &m : Mutations) m->apply(this); } @@ -931,7 +957,181 @@ void ScheduleDAGMI::placeDebugValues() { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +static const char *scheduleTableLegend = " i: issue\n x: resource booked"; + +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceTopDown() const { + // Bail out when there is no schedule model to query. + if (!SchedModel.hasInstrSchedModel()) + return; + + // Nothing to show if there are no instructions or only one. 
+ if (BB->size() < 2) + return; + + dbgs() << " * Schedule table (TopDown):\n"; + dbgs() << scheduleTableLegend << "\n"; + const unsigned FirstCycle = getSUnit(&*(std::begin(*this)))->TopReadyCycle; + unsigned LastCycle = getSUnit(&*(std::prev(std::end(*this))))->TopReadyCycle; + for (MachineInstr &MI : *this) { + SUnit *SU = getSUnit(&MI); + if (!SU) + continue; + const MCSchedClassDesc *SC = getSchedClass(SU); + for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); + PI != PE; ++PI) { + if (SU->TopReadyCycle + PI->Cycles - 1 > LastCycle) + LastCycle = SU->TopReadyCycle + PI->Cycles - 1; + } + } + // Print the header with the cycles + dbgs() << llvm::left_justify("Cycle", HeaderColWidth); + for (unsigned C = FirstCycle; C <= LastCycle; ++C) + dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); + dbgs() << "|\n"; + + for (MachineInstr &MI : *this) { + SUnit *SU = getSUnit(&MI); + if (!SU) { + dbgs() << "Missing SUnit\n"; + continue; + } + std::string NodeName("SU("); + NodeName += std::to_string(SU->NodeNum) + ")"; + dbgs() << llvm::left_justify(NodeName, HeaderColWidth); + unsigned C = FirstCycle; + for (; C <= LastCycle; ++C) { + if (C == SU->TopReadyCycle) + dbgs() << llvm::left_justify("| i", ColWidth); + else + dbgs() << llvm::left_justify("|", ColWidth); + } + dbgs() << "|\n"; + const MCSchedClassDesc *SC = getSchedClass(SU); + + SmallVector<MCWriteProcResEntry, 4> ResourcesIt( + make_range(SchedModel.getWriteProcResBegin(SC), + SchedModel.getWriteProcResEnd(SC))); + + if (MISchedSortResourcesInTrace) + llvm::stable_sort(ResourcesIt, + [](const MCWriteProcResEntry &LHS, + const MCWriteProcResEntry &RHS) -> bool { + return LHS.StartAtCycle < RHS.StartAtCycle || + (LHS.StartAtCycle == RHS.StartAtCycle && + LHS.Cycles < RHS.Cycles); + }); + for (const MCWriteProcResEntry &PI : ResourcesIt) { + C = FirstCycle; + const std::string ResName = + SchedModel.getResourceName(PI.ProcResourceIdx); + dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth); + for (; C < SU->TopReadyCycle + PI.StartAtCycle; ++C) { + dbgs() << llvm::left_justify("|", ColWidth); + } + for (unsigned I = 0, E = PI.Cycles - PI.StartAtCycle; I != E; ++I, ++C) + dbgs() << llvm::left_justify("| x", ColWidth); + while (C++ <= LastCycle) + dbgs() << llvm::left_justify("|", ColWidth); + // Place end char + dbgs() << "| \n"; + } + } +} + +LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const { + // Bail out when there is no schedule model to query. + if (!SchedModel.hasInstrSchedModel()) + return; + + // Nothing to show if there are no instructions or only one. 
+ if (BB->size() < 2) + return; + + dbgs() << " * Schedule table (BottomUp):\n"; + dbgs() << scheduleTableLegend << "\n"; + + const int FirstCycle = getSUnit(&*(std::begin(*this)))->BotReadyCycle; + int LastCycle = getSUnit(&*(std::prev(std::end(*this))))->BotReadyCycle; + for (MachineInstr &MI : *this) { + SUnit *SU = getSUnit(&MI); + if (!SU) + continue; + const MCSchedClassDesc *SC = getSchedClass(SU); + for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC), + PE = SchedModel.getWriteProcResEnd(SC); + PI != PE; ++PI) { + if ((int)SU->BotReadyCycle - PI->Cycles + 1 < LastCycle) + LastCycle = (int)SU->BotReadyCycle - PI->Cycles + 1; + } + } + // Print the header with the cycles + dbgs() << llvm::left_justify("Cycle", HeaderColWidth); + for (int C = FirstCycle; C >= LastCycle; --C) + dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth); + dbgs() << "|\n"; + + for (MachineInstr &MI : *this) { + SUnit *SU = getSUnit(&MI); + if (!SU) { + dbgs() << "Missing SUnit\n"; + continue; + } + std::string NodeName("SU("); + NodeName += std::to_string(SU->NodeNum) + ")"; + dbgs() << llvm::left_justify(NodeName, HeaderColWidth); + int C = FirstCycle; + for (; C >= LastCycle; --C) { + if (C == (int)SU->BotReadyCycle) + dbgs() << llvm::left_justify("| i", ColWidth); + else + dbgs() << llvm::left_justify("|", ColWidth); + } + dbgs() << "|\n"; + const MCSchedClassDesc *SC = getSchedClass(SU); + SmallVector<MCWriteProcResEntry, 4> ResourcesIt( + make_range(SchedModel.getWriteProcResBegin(SC), + SchedModel.getWriteProcResEnd(SC))); + + if (MISchedSortResourcesInTrace) + llvm::stable_sort(ResourcesIt, + [](const MCWriteProcResEntry &LHS, + const MCWriteProcResEntry &RHS) -> bool { + return LHS.StartAtCycle < RHS.StartAtCycle || + (LHS.StartAtCycle == RHS.StartAtCycle && + LHS.Cycles < RHS.Cycles); + }); + for (const MCWriteProcResEntry &PI : ResourcesIt) { + C = FirstCycle; + const std::string ResName = + SchedModel.getResourceName(PI.ProcResourceIdx); + dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth); + for (; C > ((int)SU->BotReadyCycle - (int)PI.StartAtCycle); --C) { + dbgs() << llvm::left_justify("|", ColWidth); + } + for (unsigned I = 0, E = PI.Cycles - PI.StartAtCycle; I != E; ++I, --C) + dbgs() << llvm::left_justify("| x", ColWidth); + while (C-- >= LastCycle) + dbgs() << llvm::left_justify("|", ColWidth); + // Place end char + dbgs() << "| \n"; + } + } +} +#endif + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { + if (MISchedDumpScheduleTrace) { + if (ForceTopDown) + dumpScheduleTraceTopDown(); + else if (ForceBottomUp) + dumpScheduleTraceBottomUp(); + else { + dbgs() << "* Schedule table (Bidirectional): not implemented\n"; + } + } + for (MachineInstr &MI : *this) { if (SUnit *SU = getSUnit(&MI)) dumpNode(*SU); @@ -967,8 +1167,8 @@ void ScheduleDAGMILive::collectVRegUses(SUnit &SU) { // Ignore re-defs. 
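For orientation, the TopDown trace printed by the new -misched-dump-schedule-trace option renders roughly like this (a hand-made illustration, not real output; SU numbering, cycles, and resource names such as HWPort0 depend on the function and the target's schedule model):

 * Schedule table (TopDown):
 i: issue
 x: resource booked
Cycle              | 0  | 1  | 2  |
SU(0)              | i  |    |    |
          HWPort0  | x  |    |    |
SU(1)              |    | i  |    |
          HWPort1  |    | x  | x  |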
if (TrackLaneMasks) { bool FoundDef = false; - for (const MachineOperand &MO2 : MI.operands()) { - if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) { + for (const MachineOperand &MO2 : MI.all_defs()) { + if (MO2.getReg() == Reg && !MO2.isDead()) { FoundDef = true; break; } @@ -1223,7 +1423,7 @@ void ScheduleDAGMILive::schedule() { LLVM_DEBUG(SchedImpl->dumpPolicy()); buildDAGWithRegPressure(); - postprocessDAG(); + postProcessDAG(); SmallVector<SUnit*, 8> TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); @@ -2008,6 +2208,7 @@ void SchedBoundary::reset() { ZoneCritResIdx = 0; IsResourceLimited = false; ReservedCycles.clear(); + ReservedResourceSegments.clear(); ReservedCyclesIndex.clear(); ResourceGroupSubUnitMasks.clear(); #if LLVM_ENABLE_ABI_BREAKING_CHECKS @@ -2036,7 +2237,8 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) { PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { unsigned PIdx = PI->ProcResourceIdx; unsigned Factor = SchedModel->getResourceFactor(PIdx); - RemainingCounts[PIdx] += (Factor * PI->Cycles); + assert(PI->Cycles >= PI->StartAtCycle); + RemainingCounts[PIdx] += (Factor * (PI->Cycles - PI->StartAtCycle)); } } } @@ -2089,14 +2291,24 @@ unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) { /// Compute the next cycle at which the given processor resource unit /// can be scheduled. unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx, - unsigned Cycles) { + unsigned Cycles, + unsigned StartAtCycle) { + if (SchedModel && SchedModel->enableIntervals()) { + if (isTop()) + return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromTop( + CurrCycle, StartAtCycle, Cycles); + + return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromBottom( + CurrCycle, StartAtCycle, Cycles); + } + unsigned NextUnreserved = ReservedCycles[InstanceIdx]; // If this resource has never been used, always return cycle zero. if (NextUnreserved == InvalidCycle) - return 0; + return CurrCycle; // For bottom-up scheduling add the cycles needed for the current operation. if (!isTop()) - NextUnreserved += Cycles; + NextUnreserved = std::max(CurrCycle, NextUnreserved + Cycles); return NextUnreserved; } @@ -2105,8 +2317,12 @@ unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx, /// instance in the reserved cycles vector. 
std::pair<unsigned, unsigned> SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, - unsigned Cycles) { - + unsigned Cycles, unsigned StartAtCycle) { + if (MischedDetailResourceBooking) { + LLVM_DEBUG(dbgs() << " Resource booking (@" << CurrCycle << "c): \n"); + LLVM_DEBUG(dumpReservedCycles()); + LLVM_DEBUG(dbgs() << " getNextResourceCycle (@" << CurrCycle << "c): \n"); + } unsigned MinNextUnreserved = InvalidCycle; unsigned InstanceIdx = 0; unsigned StartIndex = ReservedCyclesIndex[PIdx]; @@ -2134,7 +2350,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) { unsigned NextUnreserved, NextInstanceIdx; std::tie(NextUnreserved, NextInstanceIdx) = - getNextResourceCycle(SC, SubUnits[I], Cycles); + getNextResourceCycle(SC, SubUnits[I], Cycles, StartAtCycle); if (MinNextUnreserved > NextUnreserved) { InstanceIdx = NextInstanceIdx; MinNextUnreserved = NextUnreserved; @@ -2145,12 +2361,21 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End; ++I) { - unsigned NextUnreserved = getNextResourceCycleByInstance(I, Cycles); + unsigned NextUnreserved = + getNextResourceCycleByInstance(I, Cycles, StartAtCycle); + if (MischedDetailResourceBooking) + LLVM_DEBUG(dbgs() << " Instance " << I - StartIndex << " available @" + << NextUnreserved << "c\n"); if (MinNextUnreserved > NextUnreserved) { InstanceIdx = I; MinNextUnreserved = NextUnreserved; } } + if (MischedDetailResourceBooking) + LLVM_DEBUG(dbgs() << " selecting " << SchedModel->getResourceName(PIdx) + << "[" << InstanceIdx - StartIndex << "]" + << " available @" << MinNextUnreserved << "c" + << "\n"); return std::make_pair(MinNextUnreserved, InstanceIdx); } @@ -2195,8 +2420,10 @@ bool SchedBoundary::checkHazard(SUnit *SU) { SchedModel->getWriteProcResEnd(SC))) { unsigned ResIdx = PE.ProcResourceIdx; unsigned Cycles = PE.Cycles; + unsigned StartAtCycle = PE.StartAtCycle; unsigned NRCycle, InstanceIdx; - std::tie(NRCycle, InstanceIdx) = getNextResourceCycle(SC, ResIdx, Cycles); + std::tie(NRCycle, InstanceIdx) = + getNextResourceCycle(SC, ResIdx, Cycles, StartAtCycle); if (NRCycle > CurrCycle) { #if LLVM_ENABLE_ABI_BREAKING_CHECKS MaxObservedStall = std::max(Cycles, MaxObservedStall); @@ -2347,9 +2574,10 @@ void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) { /// \return the next cycle at which the instruction may execute without /// oversubscribing resources. unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx, - unsigned Cycles, unsigned NextCycle) { + unsigned Cycles, unsigned NextCycle, + unsigned StartAtCycle) { unsigned Factor = SchedModel->getResourceFactor(PIdx); - unsigned Count = Factor * Cycles; + unsigned Count = Factor * (Cycles - StartAtCycle); LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +" << Cycles << "x" << Factor << "u\n"); @@ -2369,7 +2597,8 @@ unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx, } // For reserved resources, record the highest cycle using the resource. 
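A small arithmetic sketch, illustrative only, of what the new StartAtCycle field changes in countResource() above: a write that starts occupying its resource only at StartAtCycle books Cycles - StartAtCycle slots instead of Cycles.

#include <cassert>

// Mirrors the Count computation in countResource(); Factor stands for
// the schedule model's resource factor for the unit.
static unsigned bookedSlots(unsigned Factor, unsigned Cycles,
                            unsigned StartAtCycle) {
  assert(Cycles >= StartAtCycle && "ill-formed resource usage");
  return Factor * (Cycles - StartAtCycle); // e.g. 1 * (3 - 1) == 2
}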
unsigned NextAvailable, InstanceIdx; - std::tie(NextAvailable, InstanceIdx) = getNextResourceCycle(SC, PIdx, Cycles); + std::tie(NextAvailable, InstanceIdx) = + getNextResourceCycle(SC, PIdx, Cycles, StartAtCycle); if (NextAvailable > CurrCycle) { LLVM_DEBUG(dbgs() << " Resource conflict: " << SchedModel->getResourceName(PIdx) @@ -2448,8 +2677,8 @@ void SchedBoundary::bumpNode(SUnit *SU) { for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC), PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { - unsigned RCycle = - countResource(SC, PI->ProcResourceIdx, PI->Cycles, NextCycle); + unsigned RCycle = countResource(SC, PI->ProcResourceIdx, PI->Cycles, + NextCycle, PI->StartAtCycle); if (RCycle > NextCycle) NextCycle = RCycle; } @@ -2463,14 +2692,33 @@ void SchedBoundary::bumpNode(SUnit *SU) { PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) { unsigned PIdx = PI->ProcResourceIdx; if (SchedModel->getProcResource(PIdx)->BufferSize == 0) { - unsigned ReservedUntil, InstanceIdx; - std::tie(ReservedUntil, InstanceIdx) = - getNextResourceCycle(SC, PIdx, 0); - if (isTop()) { - ReservedCycles[InstanceIdx] = - std::max(ReservedUntil, NextCycle + PI->Cycles); - } else - ReservedCycles[InstanceIdx] = NextCycle; + + if (SchedModel && SchedModel->enableIntervals()) { + unsigned ReservedUntil, InstanceIdx; + std::tie(ReservedUntil, InstanceIdx) = + getNextResourceCycle(SC, PIdx, PI->Cycles, PI->StartAtCycle); + if (isTop()) { + ReservedResourceSegments[InstanceIdx].add( + ResourceSegments::getResourceIntervalTop( + NextCycle, PI->StartAtCycle, PI->Cycles), + MIResourceCutOff); + } else { + ReservedResourceSegments[InstanceIdx].add( + ResourceSegments::getResourceIntervalBottom( + NextCycle, PI->StartAtCycle, PI->Cycles), + MIResourceCutOff); + } + } else { + + unsigned ReservedUntil, InstanceIdx; + std::tie(ReservedUntil, InstanceIdx) = + getNextResourceCycle(SC, PIdx, PI->Cycles, PI->StartAtCycle); + if (isTop()) { + ReservedCycles[InstanceIdx] = + std::max(ReservedUntil, NextCycle + PI->Cycles); + } else + ReservedCycles[InstanceIdx] = NextCycle; + } } } } @@ -2610,8 +2858,14 @@ LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const { const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits; std::string ResName = SchedModel->getResourceName(ResIdx); for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) { - dbgs() << ResName << "(" << UnitIdx - << ") = " << ReservedCycles[StartIdx + UnitIdx] << "\n"; + dbgs() << ResName << "(" << UnitIdx << ") = "; + if (SchedModel && SchedModel->enableIntervals()) { + if (ReservedResourceSegments.count(StartIdx + UnitIdx)) + dbgs() << ReservedResourceSegments.at(StartIdx + UnitIdx); + else + dbgs() << "{ }\n"; + } else + dbgs() << ReservedCycles[StartIdx + UnitIdx] << "\n"; } StartIdx += NumUnits; } @@ -3978,3 +4232,101 @@ void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) { void ScheduleDAGMI::viewGraph() { viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName()); } + +/// Sort predicate for the intervals stored in an instance of +/// ResourceSegments. Intervals are always disjoint (no intersection +/// for any pairs of intervals), therefore we can sort the totality of +/// the intervals by looking only at the left boundary. 
+static bool sortIntervals(const ResourceSegments::IntervalTy &A, + const ResourceSegments::IntervalTy &B) { + return A.first < B.first; +} + +unsigned ResourceSegments::getFirstAvailableAt( + unsigned CurrCycle, unsigned StartAtCycle, unsigned Cycle, + std::function<ResourceSegments::IntervalTy(unsigned, unsigned, unsigned)> + IntervalBuilder) const { + assert(std::is_sorted(std::begin(_Intervals), std::end(_Intervals), + sortIntervals) && + "Cannot execute on an un-sorted set of intervals."); + unsigned RetCycle = CurrCycle; + ResourceSegments::IntervalTy NewInterval = + IntervalBuilder(RetCycle, StartAtCycle, Cycle); + for (auto &Interval : _Intervals) { + if (!intersects(NewInterval, Interval)) + continue; + + // Move the interval right next to the top of the one it + // intersects. + assert(Interval.second > NewInterval.first && + "Invalid intervals configuration."); + RetCycle += (unsigned)Interval.second - (unsigned)NewInterval.first; + NewInterval = IntervalBuilder(RetCycle, StartAtCycle, Cycle); + } + return RetCycle; +} + +void ResourceSegments::add(ResourceSegments::IntervalTy A, + const unsigned CutOff) { + assert(A.first < A.second && "Cannot add empty resource usage"); + assert(CutOff > 0 && "0-size interval history has no use."); + assert(all_of(_Intervals, + [&A](const ResourceSegments::IntervalTy &Interval) -> bool { + return !intersects(A, Interval); + }) && + "A resource is being overwritten"); + _Intervals.push_back(A); + + sortAndMerge(); + + // Do not keep the full history of the intervals, just the + // latest #CutOff. + while (_Intervals.size() > CutOff) + _Intervals.pop_front(); +} + +bool ResourceSegments::intersects(ResourceSegments::IntervalTy A, + ResourceSegments::IntervalTy B) { + assert(A.first <= A.second && "Invalid interval"); + assert(B.first <= B.second && "Invalid interval"); + + // Share one boundary. + if ((A.first == B.first) || (A.second == B.second)) + return true; + + // full intersect: [ *** ) B + // [***) A + if ((A.first > B.first) && (A.second < B.second)) + return true; + + // right intersect: [ ***) B + // [*** ) A + if ((A.first > B.first) && (A.first < B.second) && (A.second > B.second)) + return true; + + // left intersect: [*** ) B + // [ ***) A + if ((A.first < B.first) && (B.first < A.second) && (B.second > B.first)) + return true; + + return false; +} + +void ResourceSegments::sortAndMerge() { + if (_Intervals.size() <= 1) + return; + + // First sort the collection. 
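// Editor's illustration, not part of the patch: with half-open
// intervals [first, second), the intersects() predicate above gives
//   intersects({1,4}, {1,7}) -> true   (same left boundary)
//   intersects({2,5}, {0,9}) -> true   (A nested inside B)
//   intersects({0,3}, {3,6}) -> false  (merely touching)
// Touching intervals are still coalesced afterwards, because
// sortAndMerge() below merges whenever prev->second >= next->first.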
+ _Intervals.sort(sortIntervals); + + // We can use std::next because the list has at least two elements. + auto next = std::next(std::begin(_Intervals)); + auto E = std::end(_Intervals); + for (; next != E; ++next) { + if (std::prev(next)->second >= next->first) { + next->first = std::prev(next)->first; + _Intervals.erase(std::prev(next)); + continue; + } + } +} diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 8429d468254a..8da97dc7e742 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -115,15 +115,15 @@ STATISTIC(NumPostRACopySink, "Number of copies sunk after RA"); namespace { class MachineSinking : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; // Machine register information - MachineDominatorTree *DT; // Machine dominator tree - MachinePostDominatorTree *PDT; // Machine post dominator tree - MachineCycleInfo *CI; - MachineBlockFrequencyInfo *MBFI; - const MachineBranchProbabilityInfo *MBPI; - AliasAnalysis *AA; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; // Machine register information + MachineDominatorTree *DT = nullptr; // Machine dominator tree + MachinePostDominatorTree *PDT = nullptr; // Machine post dominator tree + MachineCycleInfo *CI = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + const MachineBranchProbabilityInfo *MBPI = nullptr; + AliasAnalysis *AA = nullptr; RegisterClassInfo RegClassInfo; // Remember which edges have been considered for breaking. @@ -268,6 +268,44 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, "Machine code sinking", false, false) +/// Return true if a target defined block prologue instruction interferes +/// with a sink candidate. 
+static bool blockPrologueInterferes(const MachineBasicBlock *BB, + MachineBasicBlock::const_iterator End, + const MachineInstr &MI, + const TargetRegisterInfo *TRI, + const TargetInstrInfo *TII, + const MachineRegisterInfo *MRI) { + for (MachineBasicBlock::const_iterator PI = BB->getFirstNonPHI(); PI != End; + ++PI) { + // Only check target defined prologue instructions + if (!TII->isBasicBlockPrologue(*PI)) + continue; + for (auto &MO : MI.operands()) { + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (!Reg) + continue; + if (MO.isUse()) { + if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg)) + continue; + if (PI->modifiesRegister(Reg, TRI)) + return true; + } else { + if (PI->readsRegister(Reg, TRI)) + return true; + // Check for interference with non-dead defs + auto *DefOp = PI->findRegisterDefOperand(Reg, false, true, TRI); + if (DefOp && !DefOp->isDead()) + return true; + } + } + } + + return false; +} + bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB) { if (!MI.isCopy()) @@ -331,7 +369,7 @@ bool MachineSinking::AllUsesDominatedByBlock(Register Reg, // %p = PHI %y, %bb.0, %def, %bb.1 if (all_of(MRI->use_nodbg_operands(Reg), [&](MachineOperand &MO) { MachineInstr *UseInst = MO.getParent(); - unsigned OpNo = UseInst->getOperandNo(&MO); + unsigned OpNo = MO.getOperandNo(); MachineBasicBlock *UseBlock = UseInst->getParent(); return UseBlock == MBB && UseInst->isPHI() && UseInst->getOperand(OpNo + 1).getMBB() == DefMBB; @@ -602,9 +640,7 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI, // MI is cheap, we probably don't want to break the critical edge for it. // However, if this would allow some definitions of its source operands // to be sunk then it's probably worth it. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse()) - continue; + for (const MachineOperand &MO : MI.all_uses()) { Register Reg = MO.getReg(); if (Reg == 0) continue; @@ -806,12 +842,10 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, continue; if (Reg.isPhysical()) { - if (MO.isUse() && - (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO))) - continue; - - // Don't handle non-constant and non-ignorable physical register. - return false; + // Don't handle non-constant and non-ignorable physical register uses. + if (MO.isUse() && !MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO)) + return false; + continue; } // Users for the defs are all dominated by SuccToSinkTo. @@ -972,16 +1006,24 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB, if (MBB == SuccToSinkTo) return nullptr; + if (!SuccToSinkTo) + return nullptr; + // It's not safe to sink instructions to EH landing pad. Control flow into // landing pad is implicitly defined. - if (SuccToSinkTo && SuccToSinkTo->isEHPad()) + if (SuccToSinkTo->isEHPad()) return nullptr; // It ought to be okay to sink instructions into an INLINEASM_BR target, but // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in // the source block (which this code does not yet do). So for now, forbid // doing so. 
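// Illustrative scenario for blockPrologueInterferes() above, on a
// hypothetical target: if SuccToSinkTo opens with a prologue
// instruction that writes $sp, a candidate instruction reading $sp
// must not be sunk below it; FindSuccToSinkTo (next hunk) now rejects
// such a block up front rather than failing later.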
- if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget()) + if (SuccToSinkTo->isInlineAsmBrIndirectTarget()) + return nullptr; + + MachineBasicBlock::const_iterator InsertPos = + SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin()); + if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI)) return nullptr; return SuccToSinkTo; @@ -1302,45 +1344,6 @@ bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) { return true; } -/// Return true if a target defined block prologue instruction interferes -/// with a sink candidate. -static bool blockPrologueInterferes(MachineBasicBlock *BB, - MachineBasicBlock::iterator End, - MachineInstr &MI, - const TargetRegisterInfo *TRI, - const TargetInstrInfo *TII, - const MachineRegisterInfo *MRI) { - if (BB->begin() == End) - return false; // no prologue - for (MachineBasicBlock::iterator PI = BB->getFirstNonPHI(); PI != End; ++PI) { - // Only check target defined prologue instructions - if (!TII->isBasicBlockPrologue(*PI)) - continue; - for (auto &MO : MI.operands()) { - if (!MO.isReg()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - if (MO.isUse()) { - if (Reg.isPhysical() && - (TII->isIgnorableUse(MO) || (MRI && MRI->isConstantPhysReg(Reg)))) - continue; - if (PI->modifiesRegister(Reg, TRI)) - return true; - } else { - if (PI->readsRegister(Reg, TRI)) - return true; - // Check for interference with non-dead defs - auto *DefOp = PI->findRegisterDefOperand(Reg, false, true, TRI); - if (DefOp && !DefOp->isDead()) - return true; - } - } - } - return false; -} - /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, @@ -1383,9 +1386,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, // If the instruction to move defines a dead physical register which is live // when leaving the basic block, don't move it because it could turn into a // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>) - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || MO.isUse()) - continue; + for (const MachineOperand &MO : MI.all_defs()) { Register Reg = MO.getReg(); if (Reg == 0 || !Reg.isPhysical()) continue; @@ -1463,8 +1464,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, // Collect debug users of any vreg that this inst defines. SmallVector<MIRegs, 4> DbgUsersToSink; - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual()) + for (auto &MO : MI.all_defs()) { + if (!MO.getReg().isVirtual()) continue; if (!SeenDbgUsers.count(MO.getReg())) continue; @@ -1498,10 +1499,8 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore, // Note that we have to clear the kill flags for any register this instruction // uses as we may sink over another instruction which currently kills the // used registers. - for (MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.isUse()) - RegsToClearKillFlags.insert(MO.getReg()); // Remember to clear kill flags. - } + for (MachineOperand &MO : MI.all_uses()) + RegsToClearKillFlags.insert(MO.getReg()); // Remember to clear kill flags. 
return true; } @@ -1517,8 +1516,8 @@ void MachineSinking::SalvageUnsunkDebugUsersOfCopy( SmallVector<MachineInstr *, 4> DbgDefUsers; SmallVector<Register, 4> DbgUseRegs; const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual()) + for (auto &MO : MI.all_defs()) { + if (!MO.getReg().isVirtual()) continue; DbgUseRegs.push_back(MO.getReg()); for (auto &User : MRI.use_instructions(MO.getReg())) { @@ -1700,8 +1699,8 @@ static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB, MachineFunction &MF = *SuccBB->getParent(); const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (unsigned DefReg : DefedRegsInCopy) - for (MCSubRegIterator S(DefReg, TRI, true); S.isValid(); ++S) - SuccBB->removeLiveIn(*S); + for (MCPhysReg S : TRI->subregs_inclusive(DefReg)) + SuccBB->removeLiveIn(S); for (auto U : UsedOpsInCopy) { Register SrcReg = MI->getOperand(U).getReg(); LaneBitmask Mask; @@ -1793,9 +1792,8 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, } // Record debug use of each reg unit. - for (auto RI = MCRegUnitIterator(MO.getReg(), TRI); RI.isValid(); - ++RI) - MIUnits[*RI].push_back(MO.getReg()); + for (MCRegUnit Unit : TRI->regunits(MO.getReg())) + MIUnits[Unit].push_back(MO.getReg()); } } if (IsValid) { @@ -1844,12 +1842,9 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, // recorded which reg units that DBG_VALUEs read, if this instruction // writes any of those units then the corresponding DBG_VALUEs must sink. MapVector<MachineInstr *, MIRegs::second_type> DbgValsToSinkMap; - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - - for (auto RI = MCRegUnitIterator(MO.getReg(), TRI); RI.isValid(); ++RI) { - for (const auto &MIRegs : SeenDbgInstrs.lookup(*RI)) { + for (auto &MO : MI.all_defs()) { + for (MCRegUnit Unit : TRI->regunits(MO.getReg())) { + for (const auto &MIRegs : SeenDbgInstrs.lookup(Unit)) { auto &Regs = DbgValsToSinkMap[MIRegs.first]; for (unsigned Reg : MIRegs.second) Regs.push_back(Reg); diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp index 28712d1a816b..53bed7397d09 100644 --- a/llvm/lib/CodeGen/MachineSizeOpts.cpp +++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp @@ -24,168 +24,11 @@ extern cl::opt<bool> ForcePGSO; extern cl::opt<int> PgsoCutoffInstrProf; extern cl::opt<int> PgsoCutoffSampleProf; -namespace { -namespace machine_size_opts_detail { - -/// Like ProfileSummaryInfo::isColdBlock but for MachineBasicBlock. -bool isColdBlock(const MachineBasicBlock *MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getBlockProfileCount(MBB); - return Count && PSI->isColdCount(*Count); -} - -bool isColdBlock(BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency()); - return Count && PSI->isColdCount(*Count); -} - -/// Like ProfileSummaryInfo::isHotBlockNthPercentile but for MachineBasicBlock. 
-static bool isHotBlockNthPercentile(int PercentileCutoff, - const MachineBasicBlock *MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getBlockProfileCount(MBB); - return Count && PSI->isHotCountNthPercentile(PercentileCutoff, *Count); -} - -static bool isHotBlockNthPercentile(int PercentileCutoff, - BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency()); - return Count && PSI->isHotCountNthPercentile(PercentileCutoff, *Count); -} - -static bool isColdBlockNthPercentile(int PercentileCutoff, - const MachineBasicBlock *MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getBlockProfileCount(MBB); - return Count && PSI->isColdCountNthPercentile(PercentileCutoff, *Count); -} - -static bool isColdBlockNthPercentile(int PercentileCutoff, - BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency()); - return Count && PSI->isColdCountNthPercentile(PercentileCutoff, *Count); -} - -/// Like ProfileSummaryInfo::isFunctionColdInCallGraph but for -/// MachineFunction. -bool isFunctionColdInCallGraph( - const MachineFunction *MF, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - if (auto FunctionCount = MF->getFunction().getEntryCount()) - if (!PSI->isColdCount(FunctionCount->getCount())) - return false; - for (const auto &MBB : *MF) - if (!isColdBlock(&MBB, PSI, &MBFI)) - return false; - return true; -} - -/// Like ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile but for -/// MachineFunction. -bool isFunctionHotInCallGraphNthPercentile( - int PercentileCutoff, - const MachineFunction *MF, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - if (auto FunctionCount = MF->getFunction().getEntryCount()) - if (PSI->isHotCountNthPercentile(PercentileCutoff, - FunctionCount->getCount())) - return true; - for (const auto &MBB : *MF) - if (isHotBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI)) - return true; - return false; -} - -bool isFunctionColdInCallGraphNthPercentile( - int PercentileCutoff, const MachineFunction *MF, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - if (auto FunctionCount = MF->getFunction().getEntryCount()) - if (!PSI->isColdCountNthPercentile(PercentileCutoff, - FunctionCount->getCount())) - return false; - for (const auto &MBB : *MF) - if (!isColdBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI)) - return false; - return true; -} -} // namespace machine_size_opts_detail - -struct MachineBasicBlockBFIAdapter { - static bool isFunctionColdInCallGraph(const MachineFunction *MF, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - return machine_size_opts_detail::isFunctionColdInCallGraph(MF, PSI, MBFI); - } - static bool isFunctionHotInCallGraphNthPercentile( - int CutOff, - const MachineFunction *MF, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - return machine_size_opts_detail::isFunctionHotInCallGraphNthPercentile( - CutOff, MF, PSI, MBFI); - } - static bool isFunctionColdInCallGraphNthPercentile( - int CutOff, const MachineFunction *MF, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo &MBFI) { - return machine_size_opts_detail::isFunctionColdInCallGraphNthPercentile( - CutOff, MF, PSI, MBFI); - } - static bool isColdBlock(const MachineBasicBlock 
*MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isColdBlock(MBB, PSI, MBFI); - } - static bool isColdBlock(BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isColdBlock(BlockFreq, PSI, MBFI); - } - static bool isHotBlockNthPercentile(int CutOff, - const MachineBasicBlock *MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isHotBlockNthPercentile( - CutOff, MBB, PSI, MBFI); - } - static bool isHotBlockNthPercentile(int CutOff, - BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isHotBlockNthPercentile( - CutOff, BlockFreq, PSI, MBFI); - } - static bool isColdBlockNthPercentile(int CutOff, const MachineBasicBlock *MBB, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isColdBlockNthPercentile(CutOff, MBB, PSI, - MBFI); - } - static bool isColdBlockNthPercentile(int CutOff, BlockFrequency BlockFreq, - ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { - return machine_size_opts_detail::isColdBlockNthPercentile(CutOff, BlockFreq, - PSI, MBFI); - } -}; -} // end anonymous namespace - bool llvm::shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *MBFI, PGSOQueryType QueryType) { - return shouldFuncOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>( - MF, PSI, MBFI, QueryType); + return shouldFuncOptimizeForSizeImpl(MF, PSI, MBFI, QueryType); } bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, @@ -193,8 +36,7 @@ bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, const MachineBlockFrequencyInfo *MBFI, PGSOQueryType QueryType) { assert(MBB); - return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>( - MBB, PSI, MBFI, QueryType); + return shouldOptimizeForSizeImpl(MBB, PSI, MBFI, QueryType); } bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, @@ -205,6 +47,6 @@ bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, if (!PSI || !MBFIW) return false; BlockFrequency BlockFreq = MBFIW->getBlockFreq(MBB); - return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>( - BlockFreq, PSI, &MBFIW->getMBFI(), QueryType); + return shouldOptimizeForSizeImpl(BlockFreq, PSI, &MBFIW->getMBFI(), + QueryType); } diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 5c6efd4af074..4f66f2e672d1 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -318,6 +318,21 @@ public: : MachineTraceMetrics::Ensemble(mtm) {} }; +/// Pick only the current basic block for the trace and do not choose any +/// predecessors/successors. +class LocalEnsemble : public MachineTraceMetrics::Ensemble { + const char *getName() const override { return "Local"; } + const MachineBasicBlock *pickTracePred(const MachineBasicBlock *) override { + return nullptr; + }; + const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock *) override { + return nullptr; + }; + +public: + LocalEnsemble(MachineTraceMetrics *MTM) + : MachineTraceMetrics::Ensemble(MTM) {} +}; } // end anonymous namespace // Select the preferred predecessor for MBB. @@ -380,15 +395,19 @@ MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) { // Get an Ensemble sub-class for the requested trace strategy. 
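// Note on the new TS_Local strategy defined above: a LocalEnsemble
// trace never extends into predecessors or successors, so trace
// metrics are computed over the current basic block alone. Callers
// select it via getEnsemble(MachineTraceStrategy::TS_Local).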
MachineTraceMetrics::Ensemble * -MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) { - assert(strategy < TS_NumStrategies && "Invalid trace strategy enum"); - Ensemble *&E = Ensembles[strategy]; +MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) { + assert(strategy < MachineTraceStrategy::TS_NumStrategies && + "Invalid trace strategy enum"); + Ensemble *&E = Ensembles[static_cast<size_t>(strategy)]; if (E) return E; // Allocate new Ensemble on demand. switch (strategy) { - case TS_MinInstrCount: return (E = new MinInstrCountEnsemble(this)); + case MachineTraceStrategy::TS_MinInstrCount: + return (E = new MinInstrCountEnsemble(this)); + case MachineTraceStrategy::TS_Local: + return (E = new LocalEnsemble(this)); default: llvm_unreachable("Invalid trace strategy enum"); } } @@ -655,9 +674,7 @@ static bool getDataDeps(const MachineInstr &UseMI, return false; bool HasPhysRegs = false; - for (MachineInstr::const_mop_iterator I = UseMI.operands_begin(), - E = UseMI.operands_end(); I != E; ++I) { - const MachineOperand &MO = *I; + for (const MachineOperand &MO : UseMI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); @@ -669,7 +686,7 @@ static bool getDataDeps(const MachineInstr &UseMI, } // Collect virtual register reads. if (MO.readsReg()) - Deps.push_back(DataDep(MRI, Reg, UseMI.getOperandNo(I))); + Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo())); } return HasPhysRegs; } @@ -703,9 +720,7 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, SmallVector<MCRegister, 8> Kills; SmallVector<unsigned, 8> LiveDefOps; - for (MachineInstr::const_mop_iterator MI = UseMI->operands_begin(), - ME = UseMI->operands_end(); MI != ME; ++MI) { - const MachineOperand &MO = *MI; + for (const MachineOperand &MO : UseMI->operands()) { if (!MO.isReg() || !MO.getReg().isPhysical()) continue; MCRegister Reg = MO.getReg().asMCReg(); @@ -714,17 +729,17 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, if (MO.isDead()) Kills.push_back(Reg); else - LiveDefOps.push_back(UseMI->getOperandNo(MI)); + LiveDefOps.push_back(MO.getOperandNo()); } else if (MO.isKill()) Kills.push_back(Reg); // Identify dependencies. if (!MO.readsReg()) continue; - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { - SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units); + for (MCRegUnit Unit : TRI->regunits(Reg)) { + SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit); if (I == RegUnits.end()) continue; - Deps.push_back(DataDep(I->MI, I->Op, UseMI->getOperandNo(MI))); + Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo())); break; } } @@ -732,15 +747,14 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, // Update RegUnits to reflect live registers after UseMI. // First kills. for (MCRegister Kill : Kills) - for (MCRegUnitIterator Units(Kill, TRI); Units.isValid(); ++Units) - RegUnits.erase(*Units); + for (MCRegUnit Unit : TRI->regunits(Kill)) + RegUnits.erase(Unit); // Second, live defs. 
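// Another cleanup repeated across this import, shown schematically
// (illustrative, not a quote from the patch): MCRegUnitIterator loops
// become range-based iteration over the register's units.
//
//   for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
//     RegUnits.erase(*Units);          // before
//
//   for (MCRegUnit Unit : TRI->regunits(Reg))
//     RegUnits.erase(Unit);            // after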
for (unsigned DefOp : LiveDefOps) { - for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg().asMCReg(), - TRI); - Units.isValid(); ++Units) { - LiveRegUnit &LRU = RegUnits[*Units]; + for (MCRegUnit Unit : + TRI->regunits(UseMI->getOperand(DefOp).getReg().asMCReg())) { + LiveRegUnit &LRU = RegUnits[Unit]; LRU.MI = UseMI; LRU.Op = DefOp; } @@ -895,31 +909,27 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, const TargetRegisterInfo *TRI) { SmallVector<unsigned, 8> ReadOps; - for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), - MOE = MI.operands_end(); - MOI != MOE; ++MOI) { - const MachineOperand &MO = *MOI; + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; Register Reg = MO.getReg(); if (!Reg.isPhysical()) continue; if (MO.readsReg()) - ReadOps.push_back(MI.getOperandNo(MOI)); + ReadOps.push_back(MO.getOperandNo()); if (!MO.isDef()) continue; // This is a def of Reg. Remove corresponding entries from RegUnits, and // update MI Height to consider the physreg dependencies. - for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid(); - ++Units) { - SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units); + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { + SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit); if (I == RegUnits.end()) continue; unsigned DepHeight = I->Cycle; if (!MI.isTransient()) { // We may not know the UseMI of this dependency, if it came from the // live-in list. SchedModel can handle a NULL UseMI. - DepHeight += SchedModel.computeOperandLatency(&MI, MI.getOperandNo(MOI), + DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(), I->MI, I->Op); } Height = std::max(Height, DepHeight); @@ -931,8 +941,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, // Now we know the height of MI. Update any regunits read. for (size_t I = 0, E = ReadOps.size(); I != E; ++I) { MCRegister Reg = MI.getOperand(ReadOps[I]).getReg().asMCReg(); - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { - LiveRegUnit &LRU = RegUnits[*Units]; + for (MCRegUnit Unit : TRI->regunits(Reg)) { + LiveRegUnit &LRU = RegUnits[Unit]; // Set the height to the highest reader of the unit. if (LRU.Cycle <= Height && LRU.MI != &MI) { LRU.Cycle = Height; @@ -1087,10 +1097,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) { } // Go through the block backwards. - for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin(); - BI != BB;) { - const MachineInstr &MI = *--BI; - + for (const MachineInstr &MI : reverse(*MBB)) { // Find the MI height as determined by virtual register uses in the // trace below. unsigned Cycle = 0; @@ -1137,11 +1144,10 @@ computeInstrHeights(const MachineBasicBlock *MBB) { } // Transfer the live regunits to the live-in list. 
- for (SparseSet<LiveRegUnit>::const_iterator - RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) { - TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle)); - LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RI->RegUnit, MTM.TRI) << '@' - << RI->Cycle); + for (const LiveRegUnit &RU : RegUnits) { + TBI.LiveIns.push_back(LiveInReg(RU.RegUnit, RU.Cycle)); + LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RU.RegUnit, MTM.TRI) << '@' + << RU.Cycle); } LLVM_DEBUG(dbgs() << '\n'); diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp index 2fe5e40a58c2..0e02c50284c6 100644 --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -20,9 +20,7 @@ using namespace llvm; template <> bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::hasDivergentDefs( const MachineInstr &I) const { - for (auto &op : I.operands()) { - if (!op.isReg() || !op.isDef()) - continue; + for (auto &op : I.all_defs()) { if (isDivergent(op.getReg())) return true; } @@ -31,21 +29,17 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::hasDivergentDefs( template <> bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent( - const MachineInstr &Instr, bool AllDefsDivergent) { + const MachineInstr &Instr) { bool insertedDivergent = false; const auto &MRI = F.getRegInfo(); + const auto &RBI = *F.getSubtarget().getRegBankInfo(); const auto &TRI = *MRI.getTargetRegisterInfo(); - for (auto &op : Instr.operands()) { - if (!op.isReg() || !op.isDef()) - continue; + for (auto &op : Instr.all_defs()) { if (!op.getReg().isVirtual()) continue; assert(!op.getSubReg()); - if (!AllDefsDivergent) { - auto *RC = MRI.getRegClassOrNull(op.getReg()); - if (RC && !TRI.isDivergentRegClass(RC)) - continue; - } + if (TRI.isUniformReg(MRI, RBI, op.getReg())) + continue; insertedDivergent |= markDivergent(op.getReg()); } return insertedDivergent; @@ -64,7 +58,7 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() { } if (uniformity == InstructionUniformity::NeverUniform) { - markDefsDivergent(instr, /* AllDefsDivergent = */ false); + markDivergent(instr); } } } @@ -73,12 +67,10 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() { template <> void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers( Register Reg) { + assert(isDivergent(Reg)); const auto &RegInfo = F.getRegInfo(); for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) { - if (isAlwaysUniform(UserInstr)) - continue; - if (markDivergent(UserInstr)) - Worklist.push_back(&UserInstr); + markDivergent(UserInstr); } } @@ -88,9 +80,10 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers( assert(!isAlwaysUniform(Instr)); if (Instr.isTerminator()) return; - for (const MachineOperand &op : Instr.operands()) { - if (op.isReg() && op.isDef() && op.getReg().isVirtual()) - pushUsers(op.getReg()); + for (const MachineOperand &op : Instr.all_defs()) { + auto Reg = op.getReg(); + if (isDivergent(Reg)) + pushUsers(Reg); } } @@ -102,7 +95,12 @@ bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::usesValueFromCycle( if (!Op.isReg() || !Op.readsReg()) continue; auto Reg = Op.getReg(); - assert(Reg.isVirtual()); + + // FIXME: Physical registers need to be properly checked instead of always + // returning true + if (Reg.isPhysical()) + return true; + auto *Def = F.getRegInfo().getVRegDef(Reg); if (DefCycle.contains(Def->getParent())) return true; @@ -110,18 +108,59 @@ bool 
llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::usesValueFromCycle( return false; } +template <> +void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>:: + propagateTemporalDivergence(const MachineInstr &I, + const MachineCycle &DefCycle) { + const auto &RegInfo = F.getRegInfo(); + for (auto &Op : I.all_defs()) { + if (!Op.getReg().isVirtual()) + continue; + auto Reg = Op.getReg(); + if (isDivergent(Reg)) + continue; + for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) { + if (DefCycle.contains(UserInstr.getParent())) + continue; + markDivergent(UserInstr); + } + } +} + +template <> +bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse( + const MachineOperand &U) const { + if (!U.isReg()) + return false; + + auto Reg = U.getReg(); + if (isDivergent(Reg)) + return true; + + const auto &RegInfo = F.getRegInfo(); + auto *Def = RegInfo.getOneDef(Reg); + if (!Def) + return true; + + auto *DefInstr = Def->getParent(); + auto *UseInstr = U.getParent(); + return isTemporalDivergent(*UseInstr->getParent(), *DefInstr); +} + // This ensures explicit instantiation of // GenericUniformityAnalysisImpl::ImplDeleter::operator() template class llvm::GenericUniformityInfo<MachineSSAContext>; template struct llvm::GenericUniformityAnalysisImplDeleter< llvm::GenericUniformityAnalysisImpl<MachineSSAContext>>; -MachineUniformityInfo -llvm::computeMachineUniformityInfo(MachineFunction &F, - const MachineCycleInfo &cycleInfo, - const MachineDomTree &domTree) { +MachineUniformityInfo llvm::computeMachineUniformityInfo( + MachineFunction &F, const MachineCycleInfo &cycleInfo, + const MachineDomTree &domTree, bool HasBranchDivergence) { assert(F.getRegInfo().isSSA() && "Expected to be run on SSA form!"); - return MachineUniformityInfo(F, domTree, cycleInfo); + MachineUniformityInfo UI(F, domTree, cycleInfo); + if (HasBranchDivergence) + UI.compute(); + return UI; } namespace { @@ -181,7 +220,9 @@ void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const { bool MachineUniformityAnalysisPass::runOnMachineFunction(MachineFunction &MF) { auto &DomTree = getAnalysis<MachineDominatorTree>().getBase(); auto &CI = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo(); - UI = computeMachineUniformityInfo(MF, CI, DomTree); + // FIXME: Query TTI::hasBranchDivergence. 
-run-pass seems to end up with a + // default NoTTI + UI = computeMachineUniformityInfo(MF, CI, DomTree, true); return false; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index ddd5a027c2cd..7acd3c4039e8 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -31,13 +31,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeCalc.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -58,6 +58,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" @@ -71,7 +72,6 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/LowLevelTypeImpl.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ModRef.h" #include "llvm/Support/raw_ostream.h" @@ -95,19 +95,19 @@ namespace { Pass *const PASS; const char *Banner; - const MachineFunction *MF; - const TargetMachine *TM; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - const RegisterBankInfo *RBI; + const MachineFunction *MF = nullptr; + const TargetMachine *TM = nullptr; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const RegisterBankInfo *RBI = nullptr; - unsigned foundErrors; + unsigned foundErrors = 0; // Avoid querying the MachineFunctionProperties for each operand. 
- bool isFunctionRegBankSelected; - bool isFunctionSelected; - bool isFunctionTracksDebugUserValues; + bool isFunctionRegBankSelected = false; + bool isFunctionSelected = false; + bool isFunctionTracksDebugUserValues = false; using RegVector = SmallVector<Register, 16>; using RegMaskVector = SmallVector<const uint32_t *, 4>; @@ -115,8 +115,8 @@ namespace { using RegMap = DenseMap<Register, const MachineInstr *>; using BlockSet = SmallPtrSet<const MachineBasicBlock *, 8>; - const MachineInstr *FirstNonPHI; - const MachineInstr *FirstTerminator; + const MachineInstr *FirstNonPHI = nullptr; + const MachineInstr *FirstTerminator = nullptr; BlockSet FunctionBlocks; BitVector regsReserved; @@ -208,10 +208,10 @@ namespace { } // Analysis information if available - LiveVariables *LiveVars; - LiveIntervals *LiveInts; - LiveStacks *LiveStks; - SlotIndexes *Indexes; + LiveVariables *LiveVars = nullptr; + LiveIntervals *LiveInts = nullptr; + LiveStacks *LiveStks = nullptr; + SlotIndexes *Indexes = nullptr; void visitMachineFunctionBefore(); void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB); @@ -296,6 +296,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addUsedIfAvailable<LiveStacks>(); AU.addUsedIfAvailable<LiveVariables>(); + AU.addUsedIfAvailable<SlotIndexes>(); + AU.addUsedIfAvailable<LiveIntervals>(); AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -627,8 +629,11 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { // it is an entry block or landing pad. for (const auto &LI : MBB->liveins()) { if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() && - MBB->getIterator() != MBB->getParent()->begin()) { - report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB); + MBB->getIterator() != MBB->getParent()->begin() && + !MBB->isInlineAsmBrIndirectTarget()) { + report("MBB has allocatable live-in, but isn't entry, landing-pad, or " + "inlineasm-br-indirect-target.", + MBB); report_context(LI.PhysReg); } } @@ -1746,6 +1751,13 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { report("alignment immediate must be >= 1", MI); break; } + case TargetOpcode::G_CONSTANT_POOL: { + if (!MI->getOperand(1).isCPI()) + report("Src operand 1 must be a constant pool index", MI); + if (!MRI->getType(MI->getOperand(0).getReg()).isPointer()) + report("Dst operand 0 must be a pointer", MI); + break; + } default: break; } @@ -2162,6 +2174,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg); + const RegisterBankInfo *RBI = MF->getSubtarget().getRegBankInfo(); // If we're post-RegBankSelect, the gvreg must have a bank. if (!RegBank && isFunctionRegBankSelected) { @@ -2173,12 +2186,12 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // Make sure the register fits into its register bank if any. 
if (RegBank && Ty.isValid() && - RegBank->getSize() < Ty.getSizeInBits()) { + RBI->getMaximumSize(RegBank->getID()) < Ty.getSizeInBits()) { report("Register bank is too small for virtual register", MO, MONum); errs() << "Register bank " << RegBank->getName() << " too small(" - << RegBank->getSize() << ") to fit " << Ty.getSizeInBits() - << "-bits\n"; + << RBI->getMaximumSize(RegBank->getID()) << ") to fit " + << Ty.getSizeInBits() << "-bits\n"; return; } } @@ -2427,12 +2440,11 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI); // Check the cached regunit intervals. if (Reg.isPhysical() && !isReserved(Reg)) { - for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid(); - ++Units) { - if (MRI->isReservedRegUnit(*Units)) + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { + if (MRI->isReservedRegUnit(Unit)) continue; - if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units)) - checkLivenessAtUse(MO, MONum, UseIdx, *LR, *Units); + if (const LiveRange *LR = LiveInts->getCachedRegUnit(Unit)) + checkLivenessAtUse(MO, MONum, UseIdx, *LR, Unit); } } @@ -3096,108 +3108,109 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, return; } - // No more checks for live-out segments. - if (S.end == LiveInts->getMBBEndIdx(EndMBB)) - return; - - // RegUnit intervals are allowed dead phis. - if (!Reg.isVirtual() && VNI->isPHIDef() && S.start == VNI->def && - S.end == VNI->def.getDeadSlot()) - return; - - // The live segment is ending inside EndMBB - const MachineInstr *MI = - LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); - if (!MI) { - report("Live segment doesn't end at a valid instruction", EndMBB); - report_context(LR, Reg, LaneMask); - report_context(S); - return; - } - - // The block slot must refer to a basic block boundary. - if (S.end.isBlock()) { - report("Live segment ends at B slot of an instruction", EndMBB); - report_context(LR, Reg, LaneMask); - report_context(S); - } + // Checks for non-live-out segments. + if (S.end != LiveInts->getMBBEndIdx(EndMBB)) { + // RegUnit intervals are allowed dead phis. + if (!Reg.isVirtual() && VNI->isPHIDef() && S.start == VNI->def && + S.end == VNI->def.getDeadSlot()) + return; - if (S.end.isDead()) { - // Segment ends on the dead slot. - // That means there must be a dead def. - if (!SlotIndex::isSameInstr(S.start, S.end)) { - report("Live segment ending at dead slot spans instructions", EndMBB); + // The live segment is ending inside EndMBB + const MachineInstr *MI = + LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); + if (!MI) { + report("Live segment doesn't end at a valid instruction", EndMBB); report_context(LR, Reg, LaneMask); report_context(S); + return; } - } - // After tied operands are rewritten, a live segment can only end at an - // early-clobber slot if it is being redefined by an early-clobber def. - // TODO: Before tied operands are rewritten, a live segment can only end at an - // early-clobber slot if the last use is tied to an early-clobber def. - if (MF->getProperties().hasProperty( - MachineFunctionProperties::Property::TiedOpsRewritten) && - S.end.isEarlyClobber()) { - if (I+1 == LR.end() || (I+1)->start != S.end) { - report("Live segment ending at early clobber slot must be " - "redefined by an EC def in the same instruction", EndMBB); + // The block slot must refer to a basic block boundary. 
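The register-bank check above now sizes a bank through RegisterBankInfo::getMaximumSize(BankID) instead of a per-bank getSize(), reflecting that one bank can cover register classes of several widths. A minimal standalone model of the invariant being verified, with hypothetical bank names and sizes (not the in-tree API):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

int main() {
  // Hypothetical banks mapped to the widest register class they cover.
  std::map<std::string, uint64_t> MaxBankBits{{"gpr", 64}, {"fpr", 128}};

  // A virtual register of type size TyBits fits bank B only if
  // TyBits <= the maximum size over B's register classes.
  auto Fits = [&](const std::string &B, uint64_t TyBits) {
    return MaxBankBits.at(B) >= TyBits;
  };

  assert(Fits("fpr", 128));  // s128 fits the 128-bit FPR bank
  assert(!Fits("gpr", 128)); // but is too wide for a 64-bit GPR bank
  return 0;
}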
+ if (S.end.isBlock()) { + report("Live segment ends at B slot of an instruction", EndMBB); report_context(LR, Reg, LaneMask); report_context(S); } - } - // The following checks only apply to virtual registers. Physreg liveness - // is too weird to check. - if (Reg.isVirtual()) { - // A live segment can end with either a redefinition, a kill flag on a - // use, or a dead flag on a def. - bool hasRead = false; - bool hasSubRegDef = false; - bool hasDeadDef = false; - for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || MOI->getReg() != Reg) - continue; - unsigned Sub = MOI->getSubReg(); - LaneBitmask SLM = Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub) - : LaneBitmask::getAll(); - if (MOI->isDef()) { - if (Sub != 0) { - hasSubRegDef = true; - // An operand %0:sub0 reads %0:sub1..n. Invert the lane - // mask for subregister defs. Read-undef defs will be handled by - // readsReg below. - SLM = ~SLM; - } - if (MOI->isDead()) - hasDeadDef = true; + if (S.end.isDead()) { + // Segment ends on the dead slot. + // That means there must be a dead def. + if (!SlotIndex::isSameInstr(S.start, S.end)) { + report("Live segment ending at dead slot spans instructions", EndMBB); + report_context(LR, Reg, LaneMask); + report_context(S); } - if (LaneMask.any() && (LaneMask & SLM).none()) - continue; - if (MOI->readsReg()) - hasRead = true; } - if (S.end.isDead()) { - // Make sure that the corresponding machine operand for a "dead" live - // range has the dead flag. We cannot perform this check for subregister - // liveranges as partially dead values are allowed. - if (LaneMask.none() && !hasDeadDef) { - report("Instruction ending live segment on dead slot has no dead flag", - MI); + + // After tied operands are rewritten, a live segment can only end at an + // early-clobber slot if it is being redefined by an early-clobber def. + // TODO: Before tied operands are rewritten, a live segment can only end at + // an early-clobber slot if the last use is tied to an early-clobber def. + if (MF->getProperties().hasProperty( + MachineFunctionProperties::Property::TiedOpsRewritten) && + S.end.isEarlyClobber()) { + if (I + 1 == LR.end() || (I + 1)->start != S.end) { + report("Live segment ending at early clobber slot must be " + "redefined by an EC def in the same instruction", + EndMBB); report_context(LR, Reg, LaneMask); report_context(S); } - } else { - if (!hasRead) { - // When tracking subregister liveness, the main range must start new - // values on partial register writes, even if there is no read. - if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask.any() || - !hasSubRegDef) { - report("Instruction ending live segment doesn't read the register", - MI); + } + + // The following checks only apply to virtual registers. Physreg liveness + // is too weird to check. + if (Reg.isVirtual()) { + // A live segment can end with either a redefinition, a kill flag on a + // use, or a dead flag on a def. + bool hasRead = false; + bool hasSubRegDef = false; + bool hasDeadDef = false; + for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { + if (!MOI->isReg() || MOI->getReg() != Reg) + continue; + unsigned Sub = MOI->getSubReg(); + LaneBitmask SLM = + Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub) : LaneBitmask::getAll(); + if (MOI->isDef()) { + if (Sub != 0) { + hasSubRegDef = true; + // An operand %0:sub0 reads %0:sub1..n. Invert the lane + // mask for subregister defs. Read-undef defs will be handled by + // readsReg below. 
+ SLM = ~SLM; + } + if (MOI->isDead()) + hasDeadDef = true; + } + if (LaneMask.any() && (LaneMask & SLM).none()) + continue; + if (MOI->readsReg()) + hasRead = true; + } + if (S.end.isDead()) { + // Make sure that the corresponding machine operand for a "dead" live + // range has the dead flag. We cannot perform this check for subregister + // liveranges as partially dead values are allowed. + if (LaneMask.none() && !hasDeadDef) { + report( + "Instruction ending live segment on dead slot has no dead flag", + MI); report_context(LR, Reg, LaneMask); report_context(S); } + } else { + if (!hasRead) { + // When tracking subregister liveness, the main range must start new + // values on partial register writes, even if there is no read. + if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask.any() || + !hasSubRegDef) { + report("Instruction ending live segment doesn't read the register", + MI); + report_context(LR, Reg, LaneMask); + report_context(S); + } + } } } } diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index af9fef0720f9..0bef513342ff 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -74,10 +74,7 @@ void ModuloScheduleExpander::expand() { // stage difference for each use. Keep the maximum value. for (MachineInstr *MI : Schedule.getInstructions()) { int DefStage = Schedule.getStage(MI); - for (const MachineOperand &Op : MI->operands()) { - if (!Op.isReg() || !Op.isDef()) - continue; - + for (const MachineOperand &Op : MI->all_defs()) { Register Reg = Op.getReg(); unsigned MaxDiff = 0; bool PhiIsSwapped = false; @@ -743,9 +740,7 @@ void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB, continue; } bool used = true; - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; + for (const MachineOperand &MO : MI->all_defs()) { Register reg = MO.getReg(); // Assume physical registers are used, unless they are marked dead. 
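The restructured verifyLiveRangeSegment keeps the rule stated in its comments: a main-range segment may end only at a dead def carrying a dead flag, or at an instruction that reads the register, with a carve-out for tracked partial (subregister) writes. A reduced restatement of that decision for the main range, under hypothetical summary flags (this is a sketch, not the verifier itself):

#include <cassert>

// Hypothetical summary flags for the instruction at which a segment ends.
struct SegmentEnd {
  bool EndsAtDeadSlot; // segment ends on the dead slot of its instruction
  bool HasDeadDef;     // the ending instruction defines the reg as dead
  bool HasRead;        // the ending instruction reads the reg
  bool HasSubRegDef;   // the ending instruction writes only a subregister
  bool TracksSubRegs;  // subreg liveness tracking is enabled for the reg
};

// Mirrors the main-range check above: dead-slot ends need a dead flag;
// other ends need a read, unless a tracked partial write starts a new value.
static bool isValidSegmentEnd(const SegmentEnd &E) {
  if (E.EndsAtDeadSlot)
    return E.HasDeadDef;
  return E.HasRead || (E.TracksSubRegs && E.HasSubRegDef);
}

int main() {
  assert(isValidSegmentEnd({true, true, false, false, false}));
  assert(!isValidSegmentEnd({false, false, false, false, false}));
  return 0;
}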
if (reg.isPhysical()) { diff --git a/llvm/lib/CodeGen/OptimizePHIs.cpp b/llvm/lib/CodeGen/OptimizePHIs.cpp index e68a6398cf51..d997fbbed5a6 100644 --- a/llvm/lib/CodeGen/OptimizePHIs.cpp +++ b/llvm/lib/CodeGen/OptimizePHIs.cpp @@ -34,8 +34,8 @@ STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); namespace { class OptimizePHIs : public MachineFunctionPass { - MachineRegisterInfo *MRI; - const TargetInstrInfo *TII; + MachineRegisterInfo *MRI = nullptr; + const TargetInstrInfo *TII = nullptr; public: static char ID; // Pass identification diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 51035d2e442f..dbb9a9ffdf60 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -63,9 +63,9 @@ static cl::opt<bool> NoPhiElimLiveOutEarlyExit( namespace { class PHIElimination : public MachineFunctionPass { - MachineRegisterInfo *MRI; // Machine register information - LiveVariables *LV; - LiveIntervals *LIS; + MachineRegisterInfo *MRI = nullptr; // Machine register information + LiveVariables *LV = nullptr; + LiveIntervals *LIS = nullptr; public: static char ID; // Pass identification, replacement for typeid diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index c3458be0f883..a08cc78f11b1 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -150,11 +150,11 @@ namespace { class RecurrenceInstr; class PeepholeOptimizer : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; - MachineDominatorTree *DT; // Machine dominator tree - MachineLoopInfo *MLI; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + MachineDominatorTree *DT = nullptr; // Machine dominator tree + MachineLoopInfo *MLI = nullptr; public: static char ID; // Pass identification diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index 98fc7e07a1b4..170008ab67cb 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -182,7 +182,7 @@ namespace { private: /// Apply each ScheduleDAGMutation step in order. - void postprocessDAG(); + void postProcessDAG(); void ReleaseSucc(SUnit *SU, SDep *SuccEdge); void ReleaseSuccessors(SUnit *SU); @@ -407,7 +407,7 @@ void SchedulePostRATDList::schedule() { } } - postprocessDAG(); + postProcessDAG(); LLVM_DEBUG(dbgs() << "********** List Scheduling **********\n"); LLVM_DEBUG(dump()); @@ -436,7 +436,7 @@ void SchedulePostRATDList::finishBlock() { } /// Apply each ScheduleDAGMutation step in order. -void SchedulePostRATDList::postprocessDAG() { +void SchedulePostRATDList::postProcessDAG() { for (auto &M : Mutations) M->apply(this); } diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 87e2f9f20021..3448c56e4994 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -6,14 +6,16 @@ // //===----------------------------------------------------------------------===// // -// This pass implements IR lowering for the llvm.load.relative and llvm.objc.* -// intrinsics. +// This pass implements IR lowering for the llvm.memcpy, llvm.memmove, +// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics. 
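Several hunks above swap hand-rolled "isReg() && isDef()" loops for MachineInstr::all_defs() (and all_uses() later, in ProcessImplicitDefs). A sketch of the idea behind such an accessor, filtering an operand list down to register defs; the Operand type is a hypothetical stand-in, and a copy-based filter stands in for LLVM's iterator adaptors:

#include <cassert>
#include <vector>

// Hypothetical stand-in for a machine operand.
struct Operand {
  bool IsReg;
  bool IsDef;
  unsigned Reg;
};

// all_defs(): yields only register operands that are definitions.
static std::vector<Operand> all_defs(const std::vector<Operand> &Ops) {
  std::vector<Operand> Defs;
  for (const Operand &Op : Ops)
    if (Op.IsReg && Op.IsDef)
      Defs.push_back(Op);
  return Defs;
}

int main() {
  std::vector<Operand> Ops = {{true, true, 1}, {true, false, 2}, {false, false, 0}};
  assert(all_defs(Ops).size() == 1);
  return 0;
}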
// //===----------------------------------------------------------------------===// #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/Analysis/ObjCARCInstKind.h" #include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -24,9 +26,44 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" using namespace llvm; +/// Threshold to leave statically sized memory intrinsic calls. Calls of known +/// size larger than this will be expanded by the pass. Calls of unknown or +/// lower size will be left for expansion in codegen. +static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt( + "mem-intrinsic-expand-size", + cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1), + cl::Hidden); + +namespace { + +struct PreISelIntrinsicLowering { + const function_ref<TargetTransformInfo &(Function &)> LookupTTI; + const function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo; + + /// If this is true, assume it's preferable to leave memory intrinsic calls + /// for replacement with a library call later. Otherwise this depends on + /// TargetLibraryInfo availability of the corresponding function. + const bool UseMemIntrinsicLibFunc; + + explicit PreISelIntrinsicLowering( + function_ref<TargetTransformInfo &(Function &)> LookupTTI_, + function_ref<TargetLibraryInfo &(Function &)> LookupLibInfo_, + bool UseMemIntrinsicLibFunc_ = true) + : LookupTTI(LookupTTI_), LookupLibInfo(LookupLibInfo_), + UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {} + + static bool shouldExpandMemIntrinsicWithSize(Value *Size, + const TargetTransformInfo &TTI); + bool expandMemIntrinsicUses(Function &F) const; + bool lowerIntrinsics(Module &M) const; +}; + +} // namespace + static bool lowerLoadRelative(Function &F) { if (F.use_empty()) return false; @@ -133,16 +170,104 @@ static bool lowerObjCCall(Function &F, const char *NewFn, return true; } -static bool lowerIntrinsics(Module &M) { +// TODO: Should refine based on estimated number of accesses (e.g. does it +// require splitting based on alignment) +bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize( + Value *Size, const TargetTransformInfo &TTI) { + ConstantInt *CI = dyn_cast<ConstantInt>(Size); + if (!CI) + return true; + uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences() + ? MemIntrinsicExpandSizeThresholdOpt + : TTI.getMaxMemIntrinsicInlineSizeThreshold(); + uint64_t SizeVal = CI->getZExtValue(); + + // Treat a threshold of 0 as a special case to force expansion of all + // intrinsics, including size 0.
+ return SizeVal > Threshold || Threshold == 0; +} + +// TODO: Handle atomic memcpy and memcpy.inline +// TODO: Pass ScalarEvolution +bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const { + Intrinsic::ID ID = F.getIntrinsicID(); bool Changed = false; - for (Function &F : M) { - if (F.getName().startswith("llvm.load.relative.")) { - Changed |= lowerLoadRelative(F); - continue; + + for (User *U : llvm::make_early_inc_range(F.users())) { + Instruction *Inst = cast<Instruction>(U); + + switch (ID) { + case Intrinsic::memcpy: { + auto *Memcpy = cast<MemCpyInst>(Inst); + Function *ParentFunc = Memcpy->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*ParentFunc).has(LibFunc_memcpy)) + break; + + expandMemCpyAsLoop(Memcpy, TTI); + Changed = true; + Memcpy->eraseFromParent(); + } + + break; + } + case Intrinsic::memmove: { + auto *Memmove = cast<MemMoveInst>(Inst); + Function *ParentFunc = Memmove->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*ParentFunc).has(LibFunc_memmove)) + break; + + if (expandMemMoveAsLoop(Memmove, TTI)) { + Changed = true; + Memmove->eraseFromParent(); + } + } + + break; } + case Intrinsic::memset: { + auto *Memset = cast<MemSetInst>(Inst); + Function *ParentFunc = Memset->getFunction(); + const TargetTransformInfo &TTI = LookupTTI(*ParentFunc); + if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) { + if (UseMemIntrinsicLibFunc && + LookupLibInfo(*Memset->getFunction()).has(LibFunc_memset)) + break; + + expandMemSetAsLoop(Memset); + Changed = true; + Memset->eraseFromParent(); + } + + break; + } + default: + llvm_unreachable("unhandled intrinsic"); + } + } + + return Changed; +} + +bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { + bool Changed = false; + for (Function &F : M) { switch (F.getIntrinsicID()) { default: break; + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + Changed |= expandMemIntrinsicUses(F); + break; + case Intrinsic::load_relative: + Changed |= lowerLoadRelative(F); + break; case Intrinsic::objc_autorelease: Changed |= lowerObjCCall(F, "objc_autorelease"); break; @@ -231,7 +356,23 @@ public: PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {} - bool runOnModule(Module &M) override { return lowerIntrinsics(M); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnModule(Module &M) override { + auto LookupTTI = [this](Function &F) -> TargetTransformInfo & { + return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + }; + + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + + PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + return Lowering.lowerIntrinsics(M); + } }; } // end anonymous namespace @@ -248,7 +389,18 @@ ModulePass *llvm::createPreISelIntrinsicLoweringPass() { PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M, ModuleAnalysisManager &AM) { - if (!lowerIntrinsics(M)) + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return 
FAM.getResult<TargetLibraryAnalysis>(F); + }; + + auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; + + PreISelIntrinsicLowering Lowering(LookupTTI, LookupTLI); + if (!Lowering.lowerIntrinsics(M)) return PreservedAnalyses::all(); else return PreservedAnalyses::none(); diff --git a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp index 7e46dd35ce47..be81ecab9c89 100644 --- a/llvm/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/llvm/lib/CodeGen/ProcessImplicitDefs.cpp @@ -27,9 +27,9 @@ namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def /// for each use. Add isUndef marker to implicit_def defs and their uses. class ProcessImplicitDefs : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; SmallSetVector<MachineInstr*, 16> WorkList; @@ -72,8 +72,8 @@ bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) { !MI->isRegSequence() && !MI->isPHI()) return false; - for (const MachineOperand &MO : MI->operands()) - if (MO.isReg() && MO.isUse() && MO.readsReg()) + for (const MachineOperand &MO : MI->all_uses()) + if (MO.readsReg()) return false; return true; } diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index cc70ec477650..e323aaaeefaf 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -96,7 +96,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; private: - RegScavenger *RS; + RegScavenger *RS = nullptr; // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved // stack frame indexes. @@ -111,11 +111,11 @@ private: // Flag to control whether to use the register scavenger to resolve // frame index materialization registers. Set according to // TRI->requiresFrameIndexScavenging() for the current function. - bool FrameIndexVirtualScavenging; + bool FrameIndexVirtualScavenging = false; // Flag to control whether the scavenger should be passed even though // FrameIndexVirtualScavenging is used. - bool FrameIndexEliminationScavenging; + bool FrameIndexEliminationScavenging = false; // Emit remarks. 
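The PreISelIntrinsicLowering hunk above decides expansion as follows: a non-constant length always expands to a loop; a constant length expands once it exceeds the threshold (the cl::opt when given, otherwise TTI.getMaxMemIntrinsicInlineSizeThreshold()); and a threshold of 0 forces expansion of everything, including size-0 calls. When a matching libcall exists and UseMemIntrinsicLibFunc is set, the call is left for the library instead, and the TTI/TLI lookups come in as function_ref callbacks so the same lowering runs under both pass managers. A standalone restatement of just the size policy, with std::optional standing in for "the length is a ConstantInt":

#include <cassert>
#include <cstdint>
#include <optional>

// Mirrors shouldExpandMemIntrinsicWithSize above.
static bool shouldExpand(std::optional<uint64_t> KnownSize, uint64_t Threshold) {
  if (!KnownSize) // non-constant length: always expand to a loop
    return true;
  // Threshold 0 is special-cased to force expansion of every call.
  return *KnownSize > Threshold || Threshold == 0;
}

int main() {
  assert(shouldExpand(std::nullopt, 32)); // unknown size
  assert(!shouldExpand(16, 32));          // small constant copy is left alone
  assert(shouldExpand(64, 32));           // large constant copy is expanded
  assert(shouldExpand(0, 0));             // threshold 0 expands everything
  return 0;
}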
MachineOptimizationRemarkEmitter *ORE = nullptr; @@ -309,19 +309,20 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { SpillSize += MFI.getObjectSize(Idx); } - float SpillPct = + [[maybe_unused]] float SpillPct = static_cast<float>(SpillSize) / static_cast<float>(StackSize); - float VarPct = 1.0f - SpillPct; - int64_t VariableSize = StackSize - SpillSize; - dbgs() << formatv("{0}/{1} ({3:P}) spills, {2}/{1} ({4:P}) variables", - SpillSize, StackSize, VariableSize, SpillPct, VarPct); + LLVM_DEBUG( + dbgs() << formatv("{0}/{1} ({3:P}) spills, {2}/{1} ({4:P}) variables", + SpillSize, StackSize, StackSize - SpillSize, SpillPct, + 1.0f - SpillPct)); if (UnsafeStackSize != 0) { - float UnsafePct = - static_cast<float>(UnsafeStackSize) / static_cast<float>(StackSize); - dbgs() << formatv(", {0}/{2} ({1:P}) unsafe stack", UnsafeStackSize, - UnsafePct, StackSize); + LLVM_DEBUG(dbgs() << formatv(", {0}/{2} ({1:P}) unsafe stack", + UnsafeStackSize, + static_cast<float>(UnsafeStackSize) / + static_cast<float>(StackSize), + StackSize)); } - dbgs() << "\n"; + LLVM_DEBUG(dbgs() << "\n"); } ORE->emit([&]() { @@ -375,8 +376,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &MF) { } assert(!MFI.isMaxCallFrameSizeComputed() || - (MFI.getMaxCallFrameSize() == MaxCallFrameSize && - MFI.adjustsStack() == AdjustsStack)); + (MFI.getMaxCallFrameSize() >= MaxCallFrameSize && + !(AdjustsStack && !MFI.adjustsStack()))); MFI.setAdjustsStack(AdjustsStack); MFI.setMaxCallFrameSize(MaxCallFrameSize); @@ -692,7 +693,7 @@ void PEI::spillCalleeSavedRegs(MachineFunction &MF) { /// AdjustStackOffset - Helper function used to adjust the stack frame offset. static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, bool StackGrowsDown, int64_t &Offset, - Align &MaxAlign, unsigned Skew) { + Align &MaxAlign) { // If the stack grows down, add the object size to find the lowest address. if (StackGrowsDown) Offset += MFI.getObjectSize(FrameIdx); @@ -704,7 +705,7 @@ static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, MaxAlign = std::max(MaxAlign, Alignment); // Adjust to alignment boundary. - Offset = alignTo(Offset, Alignment, Skew); + Offset = alignTo(Offset, Alignment); if (StackGrowsDown) { LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset @@ -828,11 +829,10 @@ static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx, static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs, MachineFrameInfo &MFI, bool StackGrowsDown, - int64_t &Offset, Align &MaxAlign, - unsigned Skew) { + int64_t &Offset, Align &MaxAlign) { for (int i : UnassignedObjs) { - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); ProtectedObjs.insert(i); } } @@ -858,9 +858,6 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { && "Local area offset should be in direction of stack growth"); int64_t Offset = LocalAreaOffset; - // Skew to be applied to alignment. 
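With the skew parameter dropped, AdjustStackOffset above reduces to "step past the object, then round to its alignment". A minimal model for a downward-growing stack, where the returned offset is the distance below the stack pointer (hypothetical helper, not the PEI code):

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// For a downward-growing stack: add the object size to reach its lowest
// address, then round up so the object ends up Align-aligned. The object is
// then placed at SP[-Offset].
static int64_t allocateObject(int64_t Offset, uint64_t Size, uint64_t Align) {
  Offset += Size;
  return alignTo(Offset, Align);
}

int main() {
  int64_t Off = 0;
  Off = allocateObject(Off, 4, 4);  // -> 4
  Off = allocateObject(Off, 10, 8); // 14 rounds up to 16
  assert(Off == 16);
  return 0;
}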
- unsigned Skew = TFI.getStackAlignmentSkew(MF); - #ifdef EXPENSIVE_CHECKS for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) if (!MFI.isDeadObjectIndex(i) && @@ -908,8 +905,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { if (!StackGrowsDown && MFI.isDeadObjectIndex(FrameIndex)) continue; - AdjustStackOffset(MFI, FrameIndex, StackGrowsDown, Offset, MaxAlign, - Skew); + AdjustStackOffset(MFI, FrameIndex, StackGrowsDown, Offset, MaxAlign); } } @@ -930,7 +926,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { SmallVector<int, 2> SFIs; RS->getScavengingFrameIndices(SFIs); for (int SFI : SFIs) - AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign, Skew); + AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign); } // FIXME: Once this is working, then enable flag will change to a target @@ -941,7 +937,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { Align Alignment = MFI.getLocalFrameMaxAlign(); // Adjust to alignment boundary. - Offset = alignTo(Offset, Alignment, Skew); + Offset = alignTo(Offset, Alignment); LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); @@ -987,8 +983,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { "Stack protector on non-default stack expected to not be " "pre-allocated by LocalStackSlotPass."); } else if (!MFI.getUseLocalStackAllocationBlock()) { - AdjustStackOffset(MFI, StackProtectorFI, StackGrowsDown, Offset, MaxAlign, - Skew); + AdjustStackOffset(MFI, StackProtectorFI, StackGrowsDown, Offset, + MaxAlign); } else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) { llvm_unreachable( "Stack protector not pre-allocated by LocalStackSlotPass."); @@ -1036,11 +1032,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { "LocalStackSlotPass."); AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign, Skew); + Offset, MaxAlign); AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign, Skew); + Offset, MaxAlign); AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, - Offset, MaxAlign, Skew); + Offset, MaxAlign); } SmallVector<int, 8> ObjectsToAllocate; @@ -1071,7 +1067,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // Allocate the EH registration node first if one is present. if (EHRegNodeFrameIndex != std::numeric_limits<int>::max()) AdjustStackOffset(MFI, EHRegNodeFrameIndex, StackGrowsDown, Offset, - MaxAlign, Skew); + MaxAlign); // Give the targets a chance to order the objects the way they like it. if (MF.getTarget().getOptLevel() != CodeGenOpt::None && @@ -1093,7 +1089,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { for (auto &Object : ObjectsToAllocate) if (!scavengeStackSlot(MFI, Object, StackGrowsDown, MaxAlign, StackBytesFree)) - AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew); + AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign); // Make sure the special register scavenging spill slot is closest to the // stack pointer. @@ -1101,7 +1097,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { SmallVector<int, 2> SFIs; RS->getScavengingFrameIndices(SFIs); for (int SFI : SFIs) - AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign, Skew); + AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign); } if (!TFI.targetHandlesStackFrameRounding()) { @@ -1127,7 +1123,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // SP not FP. 
Align to MaxAlign so this works. StackAlign = std::max(StackAlign, MaxAlign); int64_t OffsetBeforeAlignment = Offset; - Offset = alignTo(Offset, StackAlign, Skew); + Offset = alignTo(Offset, StackAlign); // If we have increased the offset to fulfill the alignment constraints, // then the scavenging spill slots may become harder to reach from the @@ -1291,8 +1287,8 @@ void PEI::insertZeroCallUsedRegs(MachineFunction &MF) { MCRegister Reg = MO.getReg(); // This picks up sibling registers (e.g. %al -> %ah). - for (MCRegUnitIterator Unit(Reg, &TRI); Unit.isValid(); ++Unit) - RegsToZero.reset(*Unit); + for (MCRegUnit Unit : TRI.regunits(Reg)) + RegsToZero.reset(Unit); for (MCPhysReg SReg : TRI.sub_and_superregs_inclusive(Reg)) RegsToZero.reset(SReg); @@ -1463,14 +1459,24 @@ void PEI::replaceFrameIndicesBackward(MachineBasicBlock *BB, assert(MF.getSubtarget().getRegisterInfo() && "getRegisterInfo() must be implemented!"); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); - RS->enterBasicBlockEnd(*BB); + RegScavenger *LocalRS = FrameIndexEliminationScavenging ? RS : nullptr; + if (LocalRS) + LocalRS->enterBasicBlockEnd(*BB); for (MachineInstr &MI : make_early_inc_range(reverse(*BB))) { + if (TII.isFrameInstr(MI)) { + TFI.eliminateCallFramePseudoInstr(MF, *BB, &MI); + continue; + } + + // Step backwards to get the liveness state at (immediately after) MI. + if (LocalRS) + LocalRS->backward(MI); - // Register scavenger backward step - MachineBasicBlock::iterator Step(MI); for (unsigned i = 0; i != MI.getNumOperands(); ++i) { if (!MI.getOperand(i).isFI()) continue; @@ -1478,49 +1484,20 @@ void PEI::replaceFrameIndicesBackward(MachineBasicBlock *BB, if (replaceFrameIndexDebugInstr(MF, MI, i, SPAdj)) continue; - // If this instruction has a FrameIndex operand, we need to - // use that target machine register info object to eliminate - // it. - - // TRI.eliminateFrameIndex may lower the frame index to a sequence of - // instructions. It also can remove/change instructions passed by the - // iterator and invalidate the iterator. We have to take care of this. For - // that we support two iterators: *Step* - points to the position up to - // which the scavenger should scan by the next iteration to have liveness - // information up to date. *Curr* - keeps track of the correct RS->MBBI - - // the scan start point. It points to the currently processed instruction - // right before the frame lowering. + // Eliminate this FrameIndex operand. // - // ITERATORS WORK AS FOLLOWS: - // *Step* is shifted one step back right before the frame lowering and - // one step forward right after it. No matter how many instructions were - // inserted, *Step* will be right after the position which is going to be - // processed in the next iteration, thus, in the correct position for the - // scavenger to go up to. - // *Curr* is shifted one step forward right before calling - // TRI.eliminateFrameIndex and one step backward after. Thus, we make sure - // it points right to the position that is the correct starting point for - // the scavenger to scan. + // Save and restore the scavenger's position around the call to + // eliminateFrameIndex in case it erases MI and invalidates the iterator.
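The regunit loop above leans on the invariant that aliasing physical registers share register units, so marking the units of one live register disqualifies every overlapping register from being zeroed. A toy model with hypothetical x86-like unit assignments (real unit numbering is target-generated):

#include <cassert>
#include <set>

using UnitSet = std::set<int>;

// Hypothetical unit assignment: AL and AH each own one unit; the wider
// registers are the union of the units they cover.
static const UnitSet AL{0}, AH{1}, AX{0, 1}, EAX{0, 1, 2};

// A register may be zeroed only if none of its units was marked live.
static bool mayZero(const UnitSet &Reg, const UnitSet &LiveUnits) {
  for (int U : Reg)
    if (LiveUnits.count(U))
      return false;
  return true;
}

int main() {
  UnitSet Live(AL);            // AL holds a live value
  assert(!mayZero(AX, Live));  // AX overlaps AL via unit 0
  assert(!mayZero(EAX, Live)); // so does EAX
  assert(mayZero(AH, Live));   // AH does not alias AL
  return 0;
}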
+ MachineBasicBlock::iterator Save; + if (LocalRS) + Save = std::next(LocalRS->getCurrentPosition()); bool Removed = TRI.eliminateFrameIndex(MI, SPAdj, i, RS); - // Restore to unify logic with a shift back that happens in the end of - // the outer loop. - ++Step; - RS->skipTo(--Curr); + if (LocalRS) + LocalRS->skipTo(std::prev(Save)); + if (Removed) break; } - - // Shift it to make RS collect reg info up to the current instruction. - if (Step != BB->begin()) - Step--; - - // Update register states. - RS->backward(Step); } } @@ -1532,7 +1509,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF, const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - if (RS && TRI.supportsBackwardScavenger()) + if (TRI.supportsBackwardScavenger()) return replaceFrameIndicesBackward(BB, MF, SPAdj); if (RS && FrameIndexEliminationScavenging) diff --git a/llvm/lib/CodeGen/PseudoProbeInserter.cpp b/llvm/lib/CodeGen/PseudoProbeInserter.cpp index 86ea3ec67178..913e0035b046 100644 --- a/llvm/lib/CodeGen/PseudoProbeInserter.cpp +++ b/llvm/lib/CodeGen/PseudoProbeInserter.cpp @@ -128,10 +128,7 @@ public: private: uint64_t getFuncGUID(Module *M, DILocation *DL) { - auto *SP = DL->getScope()->getSubprogram(); - auto Name = SP->getLinkageName(); - if (Name.empty()) - Name = SP->getName(); + auto Name = DL->getSubprogramLinkageName(); return Function::getGUID(Name); } diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index dcb1a44c75e4..abf3b1e6fbb9 100644 --- a/llvm/lib/CodeGen/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -8,7 +8,6 @@ // // Target-independent, SSA-based data flow graph for register data flow (RDF). // -#include "llvm/CodeGen/RDFGraph.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" @@ -19,6 +18,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RDFGraph.h" #include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -38,64 +38,69 @@ #include <utility> #include <vector> -using namespace llvm; -using namespace rdf; - // Printing functions. Have them here first, so that the rest of the code // can use them. 
-namespace llvm { -namespace rdf { - -raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P) { - if (!P.Mask.all()) - OS << ':' << PrintLaneMask(P.Mask); - return OS; -} +namespace llvm::rdf { -raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterRef> &P) { - auto &TRI = P.G.getTRI(); - if (P.Obj.Reg > 0 && P.Obj.Reg < TRI.getNumRegs()) - OS << TRI.getName(P.Obj.Reg); - else - OS << '#' << P.Obj.Reg; - OS << PrintLaneMaskOpt(P.Obj.Mask); +raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P) { + P.G.getPRI().print(OS, P.Obj); return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { - auto NA = P.G.addr<NodeBase*>(P.Obj); +raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P) { + if (P.Obj == 0) + return OS << "null"; + auto NA = P.G.addr<NodeBase *>(P.Obj); uint16_t Attrs = NA.Addr->getAttrs(); uint16_t Kind = NodeAttrs::kind(Attrs); uint16_t Flags = NodeAttrs::flags(Attrs); switch (NodeAttrs::type(Attrs)) { - case NodeAttrs::Code: - switch (Kind) { - case NodeAttrs::Func: OS << 'f'; break; - case NodeAttrs::Block: OS << 'b'; break; - case NodeAttrs::Stmt: OS << 's'; break; - case NodeAttrs::Phi: OS << 'p'; break; - default: OS << "c?"; break; - } + case NodeAttrs::Code: + switch (Kind) { + case NodeAttrs::Func: + OS << 'f'; break; - case NodeAttrs::Ref: - if (Flags & NodeAttrs::Undef) - OS << '/'; - if (Flags & NodeAttrs::Dead) - OS << '\\'; - if (Flags & NodeAttrs::Preserving) - OS << '+'; - if (Flags & NodeAttrs::Clobbering) - OS << '~'; - switch (Kind) { - case NodeAttrs::Use: OS << 'u'; break; - case NodeAttrs::Def: OS << 'd'; break; - case NodeAttrs::Block: OS << 'b'; break; - default: OS << "r?"; break; - } + case NodeAttrs::Block: + OS << 'b'; + break; + case NodeAttrs::Stmt: + OS << 's'; + break; + case NodeAttrs::Phi: + OS << 'p'; break; default: - OS << '?'; + OS << "c?"; + break; + } + break; + case NodeAttrs::Ref: + if (Flags & NodeAttrs::Undef) + OS << '/'; + if (Flags & NodeAttrs::Dead) + OS << '\\'; + if (Flags & NodeAttrs::Preserving) + OS << '+'; + if (Flags & NodeAttrs::Clobbering) + OS << '~'; + switch (Kind) { + case NodeAttrs::Use: + OS << 'u'; break; + case NodeAttrs::Def: + OS << 'd'; + break; + case NodeAttrs::Block: + OS << 'b'; + break; + default: + OS << "r?"; + break; + } + break; + default: + OS << '?'; + break; } OS << P.Obj; if (Flags & NodeAttrs::Shadow) @@ -103,15 +108,14 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) { return OS; } -static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA, - const DataFlowGraph &G) { - OS << Print(RA.Id, G) << '<' - << Print(RA.Addr->getRegRef(G), G) << '>'; +static void printRefHeader(raw_ostream &OS, const Ref RA, + const DataFlowGraph &G) { + OS << Print(RA.Id, G) << '<' << Print(RA.Addr->getRegRef(G), G) << '>'; if (RA.Addr->getFlags() & NodeAttrs::Fixed) OS << '!'; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Def> &P) { printRefHeader(OS, P.Obj, P.G); OS << '('; if (NodeId N = P.Obj.Addr->getReachingDef()) @@ -128,7 +132,7 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<DefNode*>> &P) { return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<UseNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Use> &P) { printRefHeader(OS, P.Obj, P.G); OS << '('; if (NodeId N = P.Obj.Addr->getReachingDef()) @@ -139,8 +143,7 @@ raw_ostream &operator<< (raw_ostream &OS, const 
Print<NodeAddr<UseNode*>> &P) { return OS; } -raw_ostream &operator<< (raw_ostream &OS, - const Print<NodeAddr<PhiUseNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<PhiUse> &P) { printRefHeader(OS, P.Obj, P.G); OS << '('; if (NodeId N = P.Obj.Addr->getReachingDef()) @@ -154,22 +157,22 @@ raw_ostream &operator<< (raw_ostream &OS, return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<RefNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Ref> &P) { switch (P.Obj.Addr->getKind()) { - case NodeAttrs::Def: - OS << PrintNode<DefNode*>(P.Obj, P.G); - break; - case NodeAttrs::Use: - if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) - OS << PrintNode<PhiUseNode*>(P.Obj, P.G); - else - OS << PrintNode<UseNode*>(P.Obj, P.G); - break; + case NodeAttrs::Def: + OS << PrintNode<DefNode *>(P.Obj, P.G); + break; + case NodeAttrs::Use: + if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef) + OS << PrintNode<PhiUseNode *>(P.Obj, P.G); + else + OS << PrintNode<UseNode *>(P.Obj, P.G); + break; } return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P) { unsigned N = P.Obj.size(); for (auto I : P.Obj) { OS << Print(I.Id, P.G); @@ -179,7 +182,7 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeList> &P) { return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P) { unsigned N = P.Obj.size(); for (auto I : P.Obj) { OS << Print(I, P.G); @@ -191,45 +194,43 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) { namespace { - template <typename T> - struct PrintListV { - PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} +template <typename T> struct PrintListV { + PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {} - using Type = T; - const NodeList &List; - const DataFlowGraph &G; - }; + using Type = T; + const NodeList &List; + const DataFlowGraph &G; +}; - template <typename T> - raw_ostream &operator<< (raw_ostream &OS, const PrintListV<T> &P) { - unsigned N = P.List.size(); - for (NodeAddr<T> A : P.List) { - OS << PrintNode<T>(A, P.G); - if (--N) - OS << ", "; - } - return OS; +template <typename T> +raw_ostream &operator<<(raw_ostream &OS, const PrintListV<T> &P) { + unsigned N = P.List.size(); + for (NodeAddr<T> A : P.List) { + OS << PrintNode<T>(A, P.G); + if (--N) + OS << ", "; } + return OS; +} } // end anonymous namespace -raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Phi> &P) { OS << Print(P.Obj.Id, P.G) << ": phi [" - << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + << PrintListV<RefNode *>(P.Obj.Addr->members(P.G), P.G) << ']'; return OS; } -raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<StmtNode *>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Stmt> &P) { const MachineInstr &MI = *P.Obj.Addr->getCode(); unsigned Opc = MI.getOpcode(); OS << Print(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc); // Print the target for calls and branches (for readability). 
if (MI.isCall() || MI.isBranch()) { MachineInstr::const_mop_iterator T = - llvm::find_if(MI.operands(), - [] (const MachineOperand &Op) -> bool { - return Op.isMBB() || Op.isGlobal() || Op.isSymbol(); - }); + llvm::find_if(MI.operands(), [](const MachineOperand &Op) -> bool { + return Op.isMBB() || Op.isGlobal() || Op.isSymbol(); + }); if (T != MI.operands_end()) { OS << ' '; if (T->isMBB()) @@ -240,32 +241,30 @@ raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<StmtNode *>> &P) { OS << T->getSymbolName(); } } - OS << " [" << PrintListV<RefNode*>(P.Obj.Addr->members(P.G), P.G) << ']'; + OS << " [" << PrintListV<RefNode *>(P.Obj.Addr->members(P.G), P.G) << ']'; return OS; } -raw_ostream &operator<< (raw_ostream &OS, - const Print<NodeAddr<InstrNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Instr> &P) { switch (P.Obj.Addr->getKind()) { - case NodeAttrs::Phi: - OS << PrintNode<PhiNode*>(P.Obj, P.G); - break; - case NodeAttrs::Stmt: - OS << PrintNode<StmtNode*>(P.Obj, P.G); - break; - default: - OS << "instr? " << Print(P.Obj.Id, P.G); - break; + case NodeAttrs::Phi: + OS << PrintNode<PhiNode *>(P.Obj, P.G); + break; + case NodeAttrs::Stmt: + OS << PrintNode<StmtNode *>(P.Obj, P.G); + break; + default: + OS << "instr? " << Print(P.Obj.Id, P.G); + break; } return OS; } -raw_ostream &operator<< (raw_ostream &OS, - const Print<NodeAddr<BlockNode*>> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<Block> &P) { MachineBasicBlock *BB = P.Obj.Addr->getCode(); unsigned NP = BB->pred_size(); std::vector<int> Ns; - auto PrintBBs = [&OS] (std::vector<int> Ns) -> void { + auto PrintBBs = [&OS](std::vector<int> Ns) -> void { unsigned N = Ns.size(); for (int I : Ns) { OS << "%bb." << I; @@ -289,20 +288,21 @@ raw_ostream &operator<< (raw_ostream &OS, OS << '\n'; for (auto I : P.Obj.Addr->members(P.G)) - OS << PrintNode<InstrNode*>(I, P.G) << '\n'; + OS << PrintNode<InstrNode *>(I, P.G) << '\n'; return OS; } -raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<FuncNode *>> &P) { - OS << "DFG dump:[\n" << Print(P.Obj.Id, P.G) << ": Function: " - << P.Obj.Addr->getCode()->getName() << '\n'; +raw_ostream &operator<<(raw_ostream &OS, const Print<Func> &P) { + OS << "DFG dump:[\n" + << Print(P.Obj.Id, P.G) + << ": Function: " << P.Obj.Addr->getCode()->getName() << '\n'; for (auto I : P.Obj.Addr->members(P.G)) - OS << PrintNode<BlockNode*>(I, P.G) << '\n'; + OS << PrintNode<BlockNode *>(I, P.G) << '\n'; OS << "]\n"; return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { +raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P) { OS << '{'; for (auto I : P.Obj) OS << ' ' << Print(I, P.G); @@ -310,16 +310,16 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterSet> &P) { return OS; } -raw_ostream &operator<< (raw_ostream &OS, const Print<RegisterAggr> &P) { - P.Obj.print(OS); +raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P) { + OS << P.Obj; return OS; } -raw_ostream &operator<< (raw_ostream &OS, - const Print<DataFlowGraph::DefStack> &P) { - for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E; ) { - OS << Print(I->Id, P.G) - << '<' << Print(I->Addr->getRegRef(P.G), P.G) << '>'; +raw_ostream &operator<<(raw_ostream &OS, + const Print<DataFlowGraph::DefStack> &P) { + for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E;) { + OS << Print(I->Id, P.G) << '<' << Print(I->Addr->getRegRef(P.G), P.G) + << '>'; I.down(); if (I != E) OS << ' '; @@ -327,9 +327,6 @@ raw_ostream &operator<< 
(raw_ostream &OS, return OS; } -} // end namespace rdf -} // end namespace llvm - // Node allocation functions. // // Node allocator is like a slab memory allocator: it allocates blocks of @@ -340,13 +337,13 @@ raw_ostream &operator<< (raw_ostream &OS, // and within that block is described in the header file. // void NodeAllocator::startNewBlock() { - void *T = MemPool.Allocate(NodesPerBlock*NodeMemSize, NodeMemSize); - char *P = static_cast<char*>(T); + void *T = MemPool.Allocate(NodesPerBlock * NodeMemSize, NodeMemSize); + char *P = static_cast<char *>(T); Blocks.push_back(P); // Check if the block index is still within the allowed range, i.e. less // than 2^N, where N is the number of bits in NodeId for the block index. // BitsPerIndex is the number of bits per node index. - assert((Blocks.size() < ((size_t)1 << (8*sizeof(NodeId)-BitsPerIndex))) && + assert((Blocks.size() < ((size_t)1 << (8 * sizeof(NodeId) - BitsPerIndex))) && "Out of bits for block index"); ActiveEnd = P; } @@ -356,18 +353,17 @@ bool NodeAllocator::needNewBlock() { return true; char *ActiveBegin = Blocks.back(); - uint32_t Index = (ActiveEnd-ActiveBegin)/NodeMemSize; + uint32_t Index = (ActiveEnd - ActiveBegin) / NodeMemSize; return Index >= NodesPerBlock; } -NodeAddr<NodeBase*> NodeAllocator::New() { +Node NodeAllocator::New() { if (needNewBlock()) startNewBlock(); - uint32_t ActiveB = Blocks.size()-1; - uint32_t Index = (ActiveEnd - Blocks[ActiveB])/NodeMemSize; - NodeAddr<NodeBase*> NA = { reinterpret_cast<NodeBase*>(ActiveEnd), - makeId(ActiveB, Index) }; + uint32_t ActiveB = Blocks.size() - 1; + uint32_t Index = (ActiveEnd - Blocks[ActiveB]) / NodeMemSize; + Node NA = {reinterpret_cast<NodeBase *>(ActiveEnd), makeId(ActiveB, Index)}; ActiveEnd += NodeMemSize; return NA; } @@ -376,9 +372,9 @@ NodeId NodeAllocator::id(const NodeBase *P) const { uintptr_t A = reinterpret_cast<uintptr_t>(P); for (unsigned i = 0, n = Blocks.size(); i != n; ++i) { uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]); - if (A < B || A >= B + NodesPerBlock*NodeMemSize) + if (A < B || A >= B + NodesPerBlock * NodeMemSize) continue; - uint32_t Idx = (A-B)/NodeMemSize; + uint32_t Idx = (A - B) / NodeMemSize; return makeId(i, Idx); } llvm_unreachable("Invalid node address"); @@ -391,7 +387,7 @@ void NodeAllocator::clear() { } // Insert node NA after "this" in the circular chain. -void NodeBase::append(NodeAddr<NodeBase*> NA) { +void NodeBase::append(Node NA) { NodeId Nx = Next; // If NA is already "next", do nothing. 
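NodeAllocator above hands out fixed-size nodes from equal-sized blocks and encodes a node's identity as (block index, index within block), which is why it asserts the block count stays below 2^(bits(NodeId) - BitsPerIndex). A compact standalone version of the ID packing and bump allocation; the parameter values here are assumptions, not the in-tree constants:

#include <cassert>
#include <cstdint>

// Assumed parameters: node ids pack a block number above BitsPerIndex bits
// of in-block index.
constexpr unsigned BitsPerIndex = 24;
constexpr uint32_t NodesPerBlock = 4096;

static uint32_t makeId(uint32_t Block, uint32_t Index) {
  assert(Index < (1u << BitsPerIndex) && "index overflows its bit field");
  return (Block << BitsPerIndex) | Index;
}

struct BumpIds {
  uint32_t Block = 0, Index = 0;
  // Start a fresh block whenever the current one fills up.
  uint32_t allocate() {
    if (Index == NodesPerBlock) {
      ++Block;
      Index = 0;
    }
    return makeId(Block, Index++);
  }
};

int main() {
  BumpIds A;
  assert(A.allocate() == makeId(0, 0));
  assert(A.allocate() == makeId(0, 1));
  return 0;
}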
if (Next != NA.Id) { @@ -406,9 +402,9 @@ void NodeBase::append(NodeAddr<NodeBase*> NA) { RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const { assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef) - return G.unpack(Ref.PR); - assert(Ref.Op != nullptr); - return G.makeRegRef(*Ref.Op); + return G.unpack(RefData.PR); + assert(RefData.Op != nullptr); + return G.makeRegRef(*RefData.Op); } // Set the register reference in the reference node directly (for references @@ -416,7 +412,7 @@ RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const { void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) { assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef); - Ref.PR = G.pack(RR); + RefData.PR = G.pack(RR); } // Set the register reference in the reference node based on a machine @@ -425,83 +421,82 @@ void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) { assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref); assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)); (void)G; - Ref.Op = Op; + RefData.Op = Op; } // Get the owner of a given reference node. -NodeAddr<NodeBase*> RefNode::getOwner(const DataFlowGraph &G) { - NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); +Node RefNode::getOwner(const DataFlowGraph &G) { + Node NA = G.addr<NodeBase *>(getNext()); while (NA.Addr != this) { if (NA.Addr->getType() == NodeAttrs::Code) return NA; - NA = G.addr<NodeBase*>(NA.Addr->getNext()); + NA = G.addr<NodeBase *>(NA.Addr->getNext()); } llvm_unreachable("No owner in circular list"); } // Connect the def node to the reaching def node. -void DefNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { - Ref.RD = DA.Id; - Ref.Sib = DA.Addr->getReachedDef(); +void DefNode::linkToDef(NodeId Self, Def DA) { + RefData.RD = DA.Id; + RefData.Sib = DA.Addr->getReachedDef(); DA.Addr->setReachedDef(Self); } // Connect the use node to the reaching def node. -void UseNode::linkToDef(NodeId Self, NodeAddr<DefNode*> DA) { - Ref.RD = DA.Id; - Ref.Sib = DA.Addr->getReachedUse(); +void UseNode::linkToDef(NodeId Self, Def DA) { + RefData.RD = DA.Id; + RefData.Sib = DA.Addr->getReachedUse(); DA.Addr->setReachedUse(Self); } // Get the first member of the code node. -NodeAddr<NodeBase*> CodeNode::getFirstMember(const DataFlowGraph &G) const { - if (Code.FirstM == 0) - return NodeAddr<NodeBase*>(); - return G.addr<NodeBase*>(Code.FirstM); +Node CodeNode::getFirstMember(const DataFlowGraph &G) const { + if (CodeData.FirstM == 0) + return Node(); + return G.addr<NodeBase *>(CodeData.FirstM); } // Get the last member of the code node. -NodeAddr<NodeBase*> CodeNode::getLastMember(const DataFlowGraph &G) const { - if (Code.LastM == 0) - return NodeAddr<NodeBase*>(); - return G.addr<NodeBase*>(Code.LastM); +Node CodeNode::getLastMember(const DataFlowGraph &G) const { + if (CodeData.LastM == 0) + return Node(); + return G.addr<NodeBase *>(CodeData.LastM); } // Add node NA at the end of the member list of the given code node. -void CodeNode::addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { - NodeAddr<NodeBase*> ML = getLastMember(G); +void CodeNode::addMember(Node NA, const DataFlowGraph &G) { + Node ML = getLastMember(G); if (ML.Id != 0) { ML.Addr->append(NA); } else { - Code.FirstM = NA.Id; + CodeData.FirstM = NA.Id; NodeId Self = G.id(this); NA.Addr->setNext(Self); } - Code.LastM = NA.Id; + CodeData.LastM = NA.Id; } // Add node NA after member node MA in the given code node. 
-void CodeNode::addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA, - const DataFlowGraph &G) { +void CodeNode::addMemberAfter(Node MA, Node NA, const DataFlowGraph &G) { MA.Addr->append(NA); - if (Code.LastM == MA.Id) - Code.LastM = NA.Id; + if (CodeData.LastM == MA.Id) + CodeData.LastM = NA.Id; } // Remove member node NA from the given code node. -void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { - NodeAddr<NodeBase*> MA = getFirstMember(G); +void CodeNode::removeMember(Node NA, const DataFlowGraph &G) { + Node MA = getFirstMember(G); assert(MA.Id != 0); // Special handling if the member to remove is the first member. if (MA.Id == NA.Id) { - if (Code.LastM == MA.Id) { + if (CodeData.LastM == MA.Id) { // If it is the only member, set both first and last to 0. - Code.FirstM = Code.LastM = 0; + CodeData.FirstM = CodeData.LastM = 0; } else { // Otherwise, advance the first member. - Code.FirstM = MA.Addr->getNext(); + CodeData.FirstM = MA.Addr->getNext(); } return; } @@ -512,37 +507,37 @@ void CodeNode::removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G) { MA.Addr->setNext(NA.Addr->getNext()); // If the member to remove happens to be the last one, update the // LastM indicator. - if (Code.LastM == NA.Id) - Code.LastM = MA.Id; + if (CodeData.LastM == NA.Id) + CodeData.LastM = MA.Id; return; } - MA = G.addr<NodeBase*>(MX); + MA = G.addr<NodeBase *>(MX); } llvm_unreachable("No such member"); } // Return the list of all members of the code node. NodeList CodeNode::members(const DataFlowGraph &G) const { - static auto True = [] (NodeAddr<NodeBase*>) -> bool { return true; }; + static auto True = [](Node) -> bool { return true; }; return members_if(True, G); } // Return the owner of the given instr node. -NodeAddr<NodeBase*> InstrNode::getOwner(const DataFlowGraph &G) { - NodeAddr<NodeBase*> NA = G.addr<NodeBase*>(getNext()); +Node InstrNode::getOwner(const DataFlowGraph &G) { + Node NA = G.addr<NodeBase *>(getNext()); while (NA.Addr != this) { assert(NA.Addr->getType() == NodeAttrs::Code); if (NA.Addr->getKind() == NodeAttrs::Block) return NA; - NA = G.addr<NodeBase*>(NA.Addr->getNext()); + NA = G.addr<NodeBase *>(NA.Addr->getNext()); } llvm_unreachable("No owner in circular list"); } // Add the phi node PA to the given block node. -void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { - NodeAddr<NodeBase*> M = getFirstMember(G); +void BlockNode::addPhi(Phi PA, const DataFlowGraph &G) { + Node M = getFirstMember(G); if (M.Id == 0) { addMember(PA, G); return; @@ -552,15 +547,15 @@ void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { if (M.Addr->getKind() == NodeAttrs::Stmt) { // If the first member of the block is a statement, insert the phi as // the first member. - Code.FirstM = PA.Id; + CodeData.FirstM = PA.Id; PA.Addr->setNext(M.Id); } else { // If the first member is a phi, find the last phi, and append PA to it. assert(M.Addr->getKind() == NodeAttrs::Phi); - NodeAddr<NodeBase*> MN = M; + Node MN = M; do { M = MN; - MN = G.addr<NodeBase*>(M.Addr->getNext()); + MN = G.addr<NodeBase *>(M.Addr->getNext()); assert(MN.Addr->getType() == NodeAttrs::Code); } while (MN.Addr->getKind() == NodeAttrs::Phi); @@ -571,19 +566,17 @@ void BlockNode::addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G) { // Find the block node corresponding to the machine basic block BB in the // given func node. 
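Members of a code node above sit on a circular singly-linked chain threaded through getNext(), with the last member pointing back at the owner; getOwner just walks forward until it reaches the owning node, and removal only has to patch one Next plus the FirstM/LastM bookkeeping. A toy version of the owner walk (hypothetical node layout; the owner's own link back to the first member is a simplification here):

#include <cassert>
#include <vector>

// Toy circular chain: members are linked through Next, and the last member
// points back at the owner, so walking Next from any member reaches it.
struct ToyNode {
  bool IsOwner = false;
  int Next = -1;
};

static int findOwner(const std::vector<ToyNode> &Nodes, int From) {
  int N = Nodes[From].Next;
  while (!Nodes[N].IsOwner)
    N = Nodes[N].Next;
  return N;
}

int main() {
  // Owner 0 -> member 1 -> member 2 -> back to owner 0.
  std::vector<ToyNode> Nodes = {{true, 1}, {false, 2}, {false, 0}};
  assert(findOwner(Nodes, 1) == 0);
  assert(findOwner(Nodes, 2) == 0);
  return 0;
}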
-NodeAddr<BlockNode*> FuncNode::findBlock(const MachineBasicBlock *BB, - const DataFlowGraph &G) const { - auto EqBB = [BB] (NodeAddr<NodeBase*> NA) -> bool { - return NodeAddr<BlockNode*>(NA).Addr->getCode() == BB; - }; +Block FuncNode::findBlock(const MachineBasicBlock *BB, + const DataFlowGraph &G) const { + auto EqBB = [BB](Node NA) -> bool { return Block(NA).Addr->getCode() == BB; }; NodeList Ms = members_if(EqBB, G); if (!Ms.empty()) return Ms[0]; - return NodeAddr<BlockNode*>(); + return Block(); } // Get the block node for the entry block in the given function. -NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { +Block FuncNode::getEntryBlock(const DataFlowGraph &G) { MachineBasicBlock *EntryB = &getCode()->front(); return findBlock(EntryB, G); } @@ -593,14 +586,14 @@ NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) { // For a given instruction, check if there are any bits of RR that can remain // unchanged across this def. -bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum) - const { +bool TargetOperandInfo::isPreserving(const MachineInstr &In, + unsigned OpNum) const { return TII.isPredicated(In); } // Check if the definition of RR produces an unspecified value. -bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) - const { +bool TargetOperandInfo::isClobbering(const MachineInstr &In, + unsigned OpNum) const { const MachineOperand &Op = In.getOperand(OpNum); if (Op.isRegMask()) return true; @@ -612,8 +605,8 @@ bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum) } // Check if the given instruction specifically requires -bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) - const { +bool TargetOperandInfo::isFixedReg(const MachineInstr &In, + unsigned OpNum) const { if (In.isCall() || In.isReturn() || In.isInlineAsm()) return true; // Check for a tail call. @@ -642,19 +635,20 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum) // DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, - const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, - const MachineDominanceFrontier &mdf) + const TargetRegisterInfo &tri, + const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf) : DefaultTOI(std::make_unique<TargetOperandInfo>(tii)), MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(*DefaultTOI), - LiveIns(PRI) { -} + LiveIns(PRI) {} DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, - const TargetRegisterInfo &tri, const MachineDominatorTree &mdt, - const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi) + const TargetRegisterInfo &tri, + const MachineDominatorTree &mdt, + const MachineDominanceFrontier &mdf, + const TargetOperandInfo &toi) : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi), - LiveIns(PRI) { -} + LiveIns(PRI) {} // The implementation of the definition stack. // Each register reference has its own definition stack. In particular, @@ -663,7 +657,8 @@ DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii, // Construct a stack iterator. DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, - bool Top) : DS(S) { + bool Top) + : DS(S) { if (!Top) { // Initialize to bottom. Pos = 0; @@ -671,7 +666,7 @@ DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S, } // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty). 
Pos = DS.Stack.size(); - while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos-1])) + while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos - 1])) Pos--; } @@ -695,7 +690,7 @@ void DataFlowGraph::DefStack::pop() { // Push a delimiter for block node N on the stack. void DataFlowGraph::DefStack::start_block(NodeId N) { assert(N != 0); - Stack.push_back(NodeAddr<DefNode*>(nullptr, N)); + Stack.push_back(Def(nullptr, N)); } // Remove all nodes from the top of the stack, until the delimiter for @@ -705,7 +700,7 @@ void DataFlowGraph::DefStack::clear_block(NodeId N) { assert(N != 0); unsigned P = Stack.size(); while (P > 0) { - bool Found = isDelimiter(Stack[P-1], N); + bool Found = isDelimiter(Stack[P - 1], N); P--; if (Found) break; @@ -723,7 +718,7 @@ unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const { assert(P < SS); do { P++; - IsDelim = isDelimiter(Stack[P-1]); + IsDelim = isDelimiter(Stack[P - 1]); } while (P < SS && IsDelim); assert(!IsDelim); return P; @@ -734,11 +729,11 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const { // Get the preceding valid position before P (skipping all delimiters). // The input position P does not have to point to a non-delimiter. assert(P > 0 && P <= Stack.size()); - bool IsDelim = isDelimiter(Stack[P-1]); + bool IsDelim = isDelimiter(Stack[P - 1]); do { if (--P == 0) break; - IsDelim = isDelimiter(Stack[P-1]); + IsDelim = isDelimiter(Stack[P - 1]); } while (P > 0 && IsDelim); assert(!IsDelim); return P; @@ -746,11 +741,10 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const { // Register information. -RegisterSet DataFlowGraph::getLandingPadLiveIns() const { - RegisterSet LR; +RegisterAggr DataFlowGraph::getLandingPadLiveIns() const { + RegisterAggr LR(getPRI()); const Function &F = MF.getFunction(); - const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn() - : nullptr; + const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr; const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); if (RegisterId R = TLI.getExceptionPointerRegister(PF)) LR.insert(RegisterRef(R)); @@ -778,8 +772,8 @@ NodeId DataFlowGraph::id(const NodeBase *P) const { } // Allocate a new node and set the attributes to Attrs. -NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) { - NodeAddr<NodeBase*> P = Memory.New(); +Node DataFlowGraph::newNode(uint16_t Attrs) { + Node P = Memory.New(); P.Addr->init(); P.Addr->setAttrs(Attrs); return P; @@ -787,16 +781,16 @@ NodeAddr<NodeBase*> DataFlowGraph::newNode(uint16_t Attrs) { // Make a copy of the given node B, except for the data-flow links, which // are set to 0. -NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) { - NodeAddr<NodeBase*> NA = newNode(0); +Node DataFlowGraph::cloneNode(const Node B) { + Node NA = newNode(0); memcpy(NA.Addr, B.Addr, sizeof(NodeBase)); // Ref nodes need to have the data-flow links reset. if (NA.Addr->getType() == NodeAttrs::Ref) { - NodeAddr<RefNode*> RA = NA; + Ref RA = NA; RA.Addr->setReachingDef(0); RA.Addr->setSibling(0); if (NA.Addr->getKind() == NodeAttrs::Def) { - NodeAddr<DefNode*> DA = NA; + Def DA = NA; DA.Addr->setReachedDef(0); DA.Addr->setReachedUse(0); } @@ -806,75 +800,105 @@ NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) { // Allocation routines for specific node types/kinds.
-NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner, - MachineOperand &Op, uint16_t Flags) { - NodeAddr<UseNode*> UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); +Use DataFlowGraph::newUse(Instr Owner, MachineOperand &Op, uint16_t Flags) { + Use UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); UA.Addr->setRegRef(&Op, *this); return UA; } -NodeAddr<PhiUseNode*> DataFlowGraph::newPhiUse(NodeAddr<PhiNode*> Owner, - RegisterRef RR, NodeAddr<BlockNode*> PredB, uint16_t Flags) { - NodeAddr<PhiUseNode*> PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); +PhiUse DataFlowGraph::newPhiUse(Phi Owner, RegisterRef RR, Block PredB, + uint16_t Flags) { + PhiUse PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags); assert(Flags & NodeAttrs::PhiRef); PUA.Addr->setRegRef(RR, *this); PUA.Addr->setPredecessor(PredB.Id); return PUA; } -NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, - MachineOperand &Op, uint16_t Flags) { - NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); +Def DataFlowGraph::newDef(Instr Owner, MachineOperand &Op, uint16_t Flags) { + Def DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); DA.Addr->setRegRef(&Op, *this); return DA; } -NodeAddr<DefNode*> DataFlowGraph::newDef(NodeAddr<InstrNode*> Owner, - RegisterRef RR, uint16_t Flags) { - NodeAddr<DefNode*> DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); +Def DataFlowGraph::newDef(Instr Owner, RegisterRef RR, uint16_t Flags) { + Def DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags); assert(Flags & NodeAttrs::PhiRef); DA.Addr->setRegRef(RR, *this); return DA; } -NodeAddr<PhiNode*> DataFlowGraph::newPhi(NodeAddr<BlockNode*> Owner) { - NodeAddr<PhiNode*> PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); +Phi DataFlowGraph::newPhi(Block Owner) { + Phi PA = newNode(NodeAttrs::Code | NodeAttrs::Phi); Owner.Addr->addPhi(PA, *this); return PA; } -NodeAddr<StmtNode*> DataFlowGraph::newStmt(NodeAddr<BlockNode*> Owner, - MachineInstr *MI) { - NodeAddr<StmtNode*> SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); +Stmt DataFlowGraph::newStmt(Block Owner, MachineInstr *MI) { + Stmt SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt); SA.Addr->setCode(MI); Owner.Addr->addMember(SA, *this); return SA; } -NodeAddr<BlockNode*> DataFlowGraph::newBlock(NodeAddr<FuncNode*> Owner, - MachineBasicBlock *BB) { - NodeAddr<BlockNode*> BA = newNode(NodeAttrs::Code | NodeAttrs::Block); +Block DataFlowGraph::newBlock(Func Owner, MachineBasicBlock *BB) { + Block BA = newNode(NodeAttrs::Code | NodeAttrs::Block); BA.Addr->setCode(BB); Owner.Addr->addMember(BA, *this); return BA; } -NodeAddr<FuncNode*> DataFlowGraph::newFunc(MachineFunction *MF) { - NodeAddr<FuncNode*> FA = newNode(NodeAttrs::Code | NodeAttrs::Func); +Func DataFlowGraph::newFunc(MachineFunction *MF) { + Func FA = newNode(NodeAttrs::Code | NodeAttrs::Func); FA.Addr->setCode(MF); return FA; } // Build the data flow graph. -void DataFlowGraph::build(unsigned Options) { +void DataFlowGraph::build(const Config &config) { reset(); - Func = newFunc(&MF); + BuildCfg = config; + MachineRegisterInfo &MRI = MF.getRegInfo(); + ReservedRegs = MRI.getReservedRegs(); + bool SkipReserved = BuildCfg.Options & BuildOptions::OmitReserved; + + auto Insert = [](auto &Set, auto &&Range) { + Set.insert(Range.begin(), Range.end()); + }; + + if (BuildCfg.TrackRegs.empty()) { + std::set<RegisterId> BaseSet; + if (BuildCfg.Classes.empty()) { + // Insert every register. 
+ for (unsigned R = 0, E = getPRI().getTRI().getNumRegs(); R != E; ++R) + BaseSet.insert(R); + } else { + for (const TargetRegisterClass *RC : BuildCfg.Classes) { + for (MCPhysReg R : *RC) + BaseSet.insert(R); + } + } + for (RegisterId R : BaseSet) { + if (SkipReserved && ReservedRegs[R]) + continue; + Insert(TrackedUnits, getPRI().getUnits(RegisterRef(R))); + } + } else { + // Track set in Config overrides everything. + for (unsigned R : BuildCfg.TrackRegs) { + if (SkipReserved && ReservedRegs[R]) + continue; + Insert(TrackedUnits, getPRI().getUnits(RegisterRef(R))); + } + } + + TheFunc = newFunc(&MF); if (MF.empty()) return; for (MachineBasicBlock &B : MF) { - NodeAddr<BlockNode*> BA = newBlock(Func, &B); + Block BA = newBlock(TheFunc, &B); BlockNodes.insert(std::make_pair(&B, BA)); for (MachineInstr &I : B) { if (I.isDebugInstr()) @@ -883,21 +907,13 @@ void DataFlowGraph::build(unsigned Options) { } } - NodeAddr<BlockNode*> EA = Func.Addr->getEntryBlock(*this); - NodeList Blocks = Func.Addr->members(*this); - - // Collect information about block references. - RegisterSet AllRefs; - for (NodeAddr<BlockNode*> BA : Blocks) - for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) - for (NodeAddr<RefNode*> RA : IA.Addr->members(*this)) - AllRefs.insert(RA.Addr->getRegRef(*this)); + Block EA = TheFunc.Addr->getEntryBlock(*this); + NodeList Blocks = TheFunc.Addr->members(*this); // Collect function live-ins and entry block live-ins. - MachineRegisterInfo &MRI = MF.getRegInfo(); MachineBasicBlock &EntryB = *EA.Addr->getCode(); assert(EntryB.pred_empty() && "Function entry block has predecessors"); - for (std::pair<unsigned,unsigned> P : MRI.liveins()) + for (std::pair<unsigned, unsigned> P : MRI.liveins()) LiveIns.insert(RegisterRef(P.first)); if (MRI.tracksLiveness()) { for (auto I : EntryB.liveins()) @@ -905,12 +921,12 @@ void DataFlowGraph::build(unsigned Options) { } // Add function-entry phi nodes for the live-in registers. - //for (std::pair<RegisterId,LaneBitmask> P : LiveIns) { - for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) { - RegisterRef RR = *I; - NodeAddr<PhiNode*> PA = newPhi(EA); + for (RegisterRef RR : LiveIns.refs()) { + if (RR.isReg() && !isTracked(RR)) // isReg is likely guaranteed + continue; + Phi PA = newPhi(EA); uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; - NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + Def DA = newDef(PA, RR, PhiFlags); PA.Addr->addMember(DA, *this); } @@ -919,9 +935,9 @@ void DataFlowGraph::build(unsigned Options) { // branches in the program, or fall-throughs from other blocks. They // are entered from the exception handling runtime and target's ABI // may define certain registers as defined on entry to such a block. - RegisterSet EHRegs = getLandingPadLiveIns(); + RegisterAggr EHRegs = getLandingPadLiveIns(); if (!EHRegs.empty()) { - for (NodeAddr<BlockNode*> BA : Blocks) { + for (Block BA : Blocks) { const MachineBasicBlock &B = *BA.Addr->getCode(); if (!B.isEHPad()) continue; @@ -932,15 +948,17 @@ void DataFlowGraph::build(unsigned Options) { Preds.push_back(findBlock(PB)); // Build phi nodes for each live-in. 
- for (RegisterRef RR : EHRegs) { - NodeAddr<PhiNode*> PA = newPhi(BA); + for (RegisterRef RR : EHRegs.refs()) { + if (RR.isReg() && !isTracked(RR)) + continue; + Phi PA = newPhi(BA); uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; // Add def: - NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); + Def DA = newDef(PA, RR, PhiFlags); PA.Addr->addMember(DA, *this); // Add uses (no reaching defs for phi uses): - for (NodeAddr<BlockNode*> PBA : Preds) { - NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA); + for (Block PBA : Preds) { + PhiUse PUA = newPhiUse(PA, RR, PBA); PA.Addr->addMember(PUA, *this); } } @@ -949,24 +967,23 @@ void DataFlowGraph::build(unsigned Options) { // Build a map "PhiM" which will contain, for each block, the set // of references that will require phi definitions in that block. - BlockRefsMap PhiM; - for (NodeAddr<BlockNode*> BA : Blocks) + BlockRefsMap PhiM(getPRI()); + for (Block BA : Blocks) recordDefsForDF(PhiM, BA); - for (NodeAddr<BlockNode*> BA : Blocks) - buildPhis(PhiM, AllRefs, BA); + for (Block BA : Blocks) + buildPhis(PhiM, BA); // Link all the refs. This will recursively traverse the dominator tree. DefStackMap DM; linkBlockRefs(DM, EA); // Finally, remove all unused phi nodes. - if (!(Options & BuildOptions::KeepDeadPhis)) + if (!(BuildCfg.Options & BuildOptions::KeepDeadPhis)) removeUnusedPhis(); } RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const { - assert(PhysicalRegisterInfo::isRegMaskId(Reg) || - Register::isPhysicalRegister(Reg)); + assert(RegisterRef::isRegId(Reg) || RegisterRef::isMaskId(Reg)); assert(Reg != 0); if (Sub != 0) Reg = TRI.getSubReg(Reg, Sub); @@ -977,7 +994,8 @@ RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const { assert(Op.isReg() || Op.isRegMask()); if (Op.isReg()) return makeRegRef(Op.getReg(), Op.getSubReg()); - return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll()); + return RegisterRef(getPRI().getRegMaskId(Op.getRegMask()), + LaneBitmask::getAll()); } // For each stack in the map DefM, push the delimiter for block B on it. @@ -1006,14 +1024,14 @@ void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) { // Push all definitions from the instruction node IA to an appropriate // stack in DefM. -void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { +void DataFlowGraph::pushAllDefs(Instr IA, DefStackMap &DefM) { pushClobbers(IA, DefM); pushDefs(IA, DefM); } // Push all definitions from the instruction node IA to an appropriate // stack in DefM. -void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { +void DataFlowGraph::pushClobbers(Instr IA, DefStackMap &DefM) { NodeSet Visited; std::set<RegisterId> Defined; @@ -1029,35 +1047,37 @@ void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { // unspecified order), but the order does not matter from the data- // -flow perspective. - for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) { + for (Def DA : IA.Addr->members_if(IsDef, *this)) { if (Visited.count(DA.Id)) continue; if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering)) continue; NodeList Rel = getRelatedRefs(IA, DA); - NodeAddr<DefNode*> PDA = Rel.front(); + Def PDA = Rel.front(); RegisterRef RR = PDA.Addr->getRegRef(*this); // Push the definition on the stack for the register and all aliases. // The def stack traversal in linkNodeUp will check the exact aliasing. 
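// For example (sketch; x86 names used purely for illustration): a
// clobbering def of EAX is pushed onto DefM[EAX] and also onto the
// stacks of its aliases (AX, AH, AL, RAX), so that a later use of any
// of those registers finds this def when walking its own stack upward.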
DefM[RR.Reg].push(DA); Defined.insert(RR.Reg); - for (RegisterId A : PRI.getAliasSet(RR.Reg)) { + for (RegisterId A : getPRI().getAliasSet(RR.Reg)) { + if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A))) + continue; // Check that we don't push the same def twice. assert(A != RR.Reg); if (!Defined.count(A)) DefM[A].push(DA); } // Mark all the related defs as visited. - for (NodeAddr<NodeBase*> T : Rel) + for (Node T : Rel) Visited.insert(T.Id); } } // Push all definitions from the instruction node IA to an appropriate // stack in DefM. -void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { +void DataFlowGraph::pushDefs(Instr IA, DefStackMap &DefM) { NodeSet Visited; #ifndef NDEBUG std::set<RegisterId> Defined; @@ -1075,44 +1095,45 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) { // unspecified order), but the order does not matter from the data- // -flow perspective. - for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) { + for (Def DA : IA.Addr->members_if(IsDef, *this)) { if (Visited.count(DA.Id)) continue; if (DA.Addr->getFlags() & NodeAttrs::Clobbering) continue; NodeList Rel = getRelatedRefs(IA, DA); - NodeAddr<DefNode*> PDA = Rel.front(); + Def PDA = Rel.front(); RegisterRef RR = PDA.Addr->getRegRef(*this); #ifndef NDEBUG // Assert if the register is defined in two or more unrelated defs. // This could happen if there are two or more def operands defining it. if (!Defined.insert(RR.Reg).second) { - MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode(); - dbgs() << "Multiple definitions of register: " - << Print(RR, *this) << " in\n " << *MI << "in " - << printMBBReference(*MI->getParent()) << '\n'; + MachineInstr *MI = Stmt(IA).Addr->getCode(); + dbgs() << "Multiple definitions of register: " << Print(RR, *this) + << " in\n " << *MI << "in " << printMBBReference(*MI->getParent()) + << '\n'; llvm_unreachable(nullptr); } #endif // Push the definition on the stack for the register and all aliases. // The def stack traversal in linkNodeUp will check the exact aliasing. DefM[RR.Reg].push(DA); - for (RegisterId A : PRI.getAliasSet(RR.Reg)) { + for (RegisterId A : getPRI().getAliasSet(RR.Reg)) { + if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A))) + continue; // Check that we don't push the same def twice. assert(A != RR.Reg); DefM[A].push(DA); } // Mark all the related defs as visited. - for (NodeAddr<NodeBase*> T : Rel) + for (Node T : Rel) Visited.insert(T.Id); } } // Return the list of all reference nodes related to RA, including RA itself. // See "getNextRelated" for the meaning of a "related reference". -NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, - NodeAddr<RefNode*> RA) const { +NodeList DataFlowGraph::getRelatedRefs(Instr IA, Ref RA) const { assert(IA.Id != 0 && RA.Id != 0); NodeList Refs; @@ -1128,7 +1149,9 @@ NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA, void DataFlowGraph::reset() { Memory.clear(); BlockNodes.clear(); - Func = NodeAddr<FuncNode*>(); + TrackedUnits.clear(); + ReservedRegs.clear(); + TheFunc = Func(); } // Return the next reference node in the instruction node IA that is related @@ -1137,36 +1160,38 @@ void DataFlowGraph::reset() { // characteristics. Specific examples of related nodes are shadow reference // nodes. // Return the equivalent of nullptr if there are no more related references. 
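// For a statement, related refs must additionally share the same
// machine operand; for a phi, two uses are related only if they come
// from the same predecessor block. As a sketch: a def of R0 and the
// shadow defs created for it (when independent reaching defs cover
// different parts of R0) form a single group of related references.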
-NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA, - NodeAddr<RefNode*> RA) const { +Ref DataFlowGraph::getNextRelated(Instr IA, Ref RA) const { assert(IA.Id != 0 && RA.Id != 0); - auto Related = [this,RA](NodeAddr<RefNode*> TA) -> bool { + auto IsRelated = [this, RA](Ref TA) -> bool { if (TA.Addr->getKind() != RA.Addr->getKind()) return false; - if (TA.Addr->getRegRef(*this) != RA.Addr->getRegRef(*this)) + if (!getPRI().equal_to(TA.Addr->getRegRef(*this), + RA.Addr->getRegRef(*this))) { return false; + } return true; }; - auto RelatedStmt = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { - return Related(TA) && - &RA.Addr->getOp() == &TA.Addr->getOp(); - }; - auto RelatedPhi = [&Related,RA](NodeAddr<RefNode*> TA) -> bool { - if (!Related(TA)) + + RegisterRef RR = RA.Addr->getRegRef(*this); + if (IA.Addr->getKind() == NodeAttrs::Stmt) { + auto Cond = [&IsRelated, RA](Ref TA) -> bool { + return IsRelated(TA) && &RA.Addr->getOp() == &TA.Addr->getOp(); + }; + return RA.Addr->getNextRef(RR, Cond, true, *this); + } + + assert(IA.Addr->getKind() == NodeAttrs::Phi); + auto Cond = [&IsRelated, RA](Ref TA) -> bool { + if (!IsRelated(TA)) return false; if (TA.Addr->getKind() != NodeAttrs::Use) return true; // For phi uses, compare predecessor blocks. - const NodeAddr<const PhiUseNode*> TUA = TA; - const NodeAddr<const PhiUseNode*> RUA = RA; - return TUA.Addr->getPredecessor() == RUA.Addr->getPredecessor(); + return PhiUse(TA).Addr->getPredecessor() == + PhiUse(RA).Addr->getPredecessor(); }; - - RegisterRef RR = RA.Addr->getRegRef(*this); - if (IA.Addr->getKind() == NodeAttrs::Stmt) - return RA.Addr->getNextRef(RR, RelatedStmt, true, *this); - return RA.Addr->getNextRef(RR, RelatedPhi, true, *this); + return RA.Addr->getNextRef(RR, Cond, true, *this); } // Find the next node related to RA in IA that satisfies condition P. @@ -1175,12 +1200,11 @@ NodeAddr<RefNode*> DataFlowGraph::getNextRelated(NodeAddr<InstrNode*> IA, // first element is the element after which such a node should be inserted, // and the second element is a null-address. template <typename Predicate> -std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>> -DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, - Predicate P) const { +std::pair<Ref, Ref> DataFlowGraph::locateNextRef(Instr IA, Ref RA, + Predicate P) const { assert(IA.Id != 0 && RA.Id != 0); - NodeAddr<RefNode*> NA; + Ref NA; NodeId Start = RA.Id; while (true) { NA = getNextRelated(IA, RA); @@ -1193,17 +1217,16 @@ DataFlowGraph::locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA, if (NA.Id != 0 && NA.Id != Start) return std::make_pair(RA, NA); - return std::make_pair(RA, NodeAddr<RefNode*>()); + return std::make_pair(RA, Ref()); } // Get the next shadow node in IA corresponding to RA, and optionally create // such a node if it does not exist. -NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, - NodeAddr<RefNode*> RA, bool Create) { +Ref DataFlowGraph::getNextShadow(Instr IA, Ref RA, bool Create) { assert(IA.Id != 0 && RA.Id != 0); uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; - auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { + auto IsShadow = [Flags](Ref TA) -> bool { return TA.Addr->getFlags() == Flags; }; auto Loc = locateNextRef(IA, RA, IsShadow); @@ -1211,30 +1234,18 @@ NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, return Loc.second; // Create a copy of RA and mark it as shadow.
- NodeAddr<RefNode*> NA = cloneNode(RA); + Ref NA = cloneNode(RA); NA.Addr->setFlags(Flags | NodeAttrs::Shadow); IA.Addr->addMemberAfter(Loc.first, NA, *this); return NA; } -// Get the next shadow node in IA corresponding to RA. Return null-address -// if such a node does not exist. -NodeAddr<RefNode*> DataFlowGraph::getNextShadow(NodeAddr<InstrNode*> IA, - NodeAddr<RefNode*> RA) const { - assert(IA.Id != 0 && RA.Id != 0); - uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow; - auto IsShadow = [Flags] (NodeAddr<RefNode*> TA) -> bool { - return TA.Addr->getFlags() == Flags; - }; - return locateNextRef(IA, RA, IsShadow).second; -} - // Create a new statement node in the block node BA that corresponds to // the machine instruction MI. -void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { - NodeAddr<StmtNode*> SA = newStmt(BA, &In); +void DataFlowGraph::buildStmt(Block BA, MachineInstr &In) { + Stmt SA = newStmt(BA, &In); - auto isCall = [] (const MachineInstr &In) -> bool { + auto isCall = [](const MachineInstr &In) -> bool { if (In.isCall()) return true; // Is tail call? @@ -1251,14 +1262,14 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { return false; }; - auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool { + auto isDefUndef = [this](const MachineInstr &In, RegisterRef DR) -> bool { // This instruction defines DR. Check if there is a use operand that // would make DR live on entry to the instruction. - for (const MachineOperand &Op : In.operands()) { - if (!Op.isReg() || Op.getReg() == 0 || !Op.isUse() || Op.isUndef()) + for (const MachineOperand &Op : In.all_uses()) { + if (Op.getReg() == 0 || Op.isUndef()) continue; RegisterRef UR = makeRegRef(Op); - if (PRI.alias(DR, UR)) + if (getPRI().alias(DR, UR)) return false; } return true; @@ -1278,7 +1289,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) continue; Register R = Op.getReg(); - if (!R || !R.isPhysical()) + if (!R || !R.isPhysical() || !isTracked(RegisterRef(R))) continue; uint16_t Flags = NodeAttrs::None; if (TOI.isPreserving(In, OpN)) { @@ -1293,7 +1304,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { Flags |= NodeAttrs::Fixed; if (IsCall && Op.isDead()) Flags |= NodeAttrs::Dead; - NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + Def DA = newDef(SA, Op, Flags); SA.Addr->addMember(DA, *this); assert(!DoneDefs.test(R)); DoneDefs.set(R); @@ -1305,15 +1316,17 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { MachineOperand &Op = In.getOperand(OpN); if (!Op.isRegMask()) continue; - uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed | - NodeAttrs::Dead; - NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed | NodeAttrs::Dead; + Def DA = newDef(SA, Op, Flags); SA.Addr->addMember(DA, *this); // Record all clobbered registers in DoneDefs. 
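// A register mask is a bit vector packed into 32-bit words, one bit
// per physical register: bit i, tested as RM[i / 32] & (1u << (i % 32)),
// is set when register i is preserved across the instruction, so a
// clear bit identifies a clobbered register.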
const uint32_t *RM = Op.getRegMask(); - for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) - if (!(RM[i/32] & (1u << (i%32)))) + for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) { + if (!isTracked(RegisterRef(i))) + continue; + if (!(RM[i / 32] & (1u << (i % 32)))) DoneClobbers.set(i); + } } // Process implicit defs, skipping those that have already been added @@ -1323,7 +1336,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { if (!Op.isReg() || !Op.isDef() || !Op.isImplicit()) continue; Register R = Op.getReg(); - if (!R || !R.isPhysical() || DoneDefs.test(R)) + if (!R || !R.isPhysical() || !isTracked(RegisterRef(R)) || DoneDefs.test(R)) continue; RegisterRef RR = makeRegRef(Op); uint16_t Flags = NodeAttrs::None; @@ -1342,7 +1355,7 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { continue; Flags |= NodeAttrs::Dead; } - NodeAddr<DefNode*> DA = newDef(SA, Op, Flags); + Def DA = newDef(SA, Op, Flags); SA.Addr->addMember(DA, *this); DoneDefs.set(R); } @@ -1352,22 +1365,21 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) { if (!Op.isReg() || !Op.isUse()) continue; Register R = Op.getReg(); - if (!R || !R.isPhysical()) + if (!R || !R.isPhysical() || !isTracked(RegisterRef(R))) continue; uint16_t Flags = NodeAttrs::None; if (Op.isUndef()) Flags |= NodeAttrs::Undef; if (TOI.isFixedReg(In, OpN)) Flags |= NodeAttrs::Fixed; - NodeAddr<UseNode*> UA = newUse(SA, Op, Flags); + Use UA = newUse(SA, Op, Flags); SA.Addr->addMember(UA, *this); } } // Scan all defs in the block node BA and record in PhiM the locations of // phi nodes corresponding to these defs. -void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, - NodeAddr<BlockNode*> BA) { +void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, Block BA) { // Check all defs from block BA and record them in each block in BA's // iterated dominance frontier. This information will later be used to // create phi nodes. @@ -1382,14 +1394,18 @@ void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, // in the block's iterated dominance frontier. // This is done to make sure that each defined reference gets only one // phi node, even if it is defined multiple times. - RegisterSet Defs; - for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) - for (NodeAddr<RefNode*> RA : IA.Addr->members_if(IsDef, *this)) - Defs.insert(RA.Addr->getRegRef(*this)); + RegisterAggr Defs(getPRI()); + for (Instr IA : BA.Addr->members(*this)) { + for (Ref RA : IA.Addr->members_if(IsDef, *this)) { + RegisterRef RR = RA.Addr->getRegRef(*this); + if (RR.isReg() && isTracked(RR)) + Defs.insert(RR); + } + } // Calculate the iterated dominance frontier of BB. const MachineDominanceFrontier::DomSetType &DF = DFLoc->second; - SetVector<MachineBasicBlock*> IDF(DF.begin(), DF.end()); + SetVector<MachineBasicBlock *> IDF(DF.begin(), DF.end()); for (unsigned i = 0; i < IDF.size(); ++i) { auto F = MDF.find(IDF[i]); if (F != MDF.end()) @@ -1399,98 +1415,37 @@ void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, // Finally, add the set of defs to each block in the iterated dominance // frontier. for (auto *DB : IDF) { - NodeAddr<BlockNode*> DBA = findBlock(DB); - PhiM[DBA.Id].insert(Defs.begin(), Defs.end()); + Block DBA = findBlock(DB); + PhiM[DBA.Id].insert(Defs); } } // Given the locations of phi nodes in the map PhiM, create the phi nodes // that are located in the block node BA. 
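// Since PhiM now holds a RegisterAggr of normalized refs per block,
// each ref gets exactly one phi: a single def plus one phi use per
// predecessor. Sketch, for a block with predecessors P1 and P2 and a
// dominance-frontier def of R0:
//   phi: def(R0), use(R0 from P1), use(R0 from P2)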
-void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs, - NodeAddr<BlockNode*> BA) { +void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, Block BA) { // Check if this block has any DF defs, i.e. if there are any defs // that this block is in the iterated dominance frontier of. auto HasDF = PhiM.find(BA.Id); if (HasDF == PhiM.end() || HasDF->second.empty()) return; - // First, remove all R in Refs in such that there exists T in Refs - // such that T covers R. In other words, only leave those refs that - // are not covered by another ref (i.e. maximal with respect to covering). - - auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef { - for (RegisterRef I : RRs) - if (I != RR && RegisterAggr::isCoverOf(I, RR, PRI)) - RR = I; - return RR; - }; - - RegisterSet MaxDF; - for (RegisterRef I : HasDF->second) - MaxDF.insert(MaxCoverIn(I, HasDF->second)); - - std::vector<RegisterRef> MaxRefs; - for (RegisterRef I : MaxDF) - MaxRefs.push_back(MaxCoverIn(I, AllRefs)); - - // Now, for each R in MaxRefs, get the alias closure of R. If the closure - // only has R in it, create a phi a def for R. Otherwise, create a phi, - // and add a def for each S in the closure. - - // Sort the refs so that the phis will be created in a deterministic order. - llvm::sort(MaxRefs); - // Remove duplicates. - auto NewEnd = std::unique(MaxRefs.begin(), MaxRefs.end()); - MaxRefs.erase(NewEnd, MaxRefs.end()); - - auto Aliased = [this,&MaxRefs](RegisterRef RR, - std::vector<unsigned> &Closure) -> bool { - for (unsigned I : Closure) - if (PRI.alias(RR, MaxRefs[I])) - return true; - return false; - }; - // Prepare a list of NodeIds of the block's predecessors. NodeList Preds; const MachineBasicBlock *MBB = BA.Addr->getCode(); for (MachineBasicBlock *PB : MBB->predecessors()) Preds.push_back(findBlock(PB)); - while (!MaxRefs.empty()) { - // Put the first element in the closure, and then add all subsequent - // elements from MaxRefs to it, if they alias at least one element - // already in the closure. - // ClosureIdx: vector of indices in MaxRefs of members of the closure. - std::vector<unsigned> ClosureIdx = { 0 }; - for (unsigned i = 1; i != MaxRefs.size(); ++i) - if (Aliased(MaxRefs[i], ClosureIdx)) - ClosureIdx.push_back(i); - - // Build a phi for the closure. - unsigned CS = ClosureIdx.size(); - NodeAddr<PhiNode*> PA = newPhi(BA); - - // Add defs. - for (unsigned X = 0; X != CS; ++X) { - RegisterRef RR = MaxRefs[ClosureIdx[X]]; - uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; - NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags); - PA.Addr->addMember(DA, *this); - } + const RegisterAggr &Defs = PhiM[BA.Id]; + uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving; + + for (RegisterRef RR : Defs.refs()) { + Phi PA = newPhi(BA); + PA.Addr->addMember(newDef(PA, RR, PhiFlags), *this); + // Add phi uses. - for (NodeAddr<BlockNode*> PBA : Preds) { - for (unsigned X = 0; X != CS; ++X) { - RegisterRef RR = MaxRefs[ClosureIdx[X]]; - NodeAddr<PhiUseNode*> PUA = newPhiUse(PA, RR, PBA); - PA.Addr->addMember(PUA, *this); - } + for (Block PBA : Preds) { + PA.Addr->addMember(newPhiUse(PA, RR, PBA), *this); } - - // Erase from MaxRefs all elements in the closure. - auto Begin = MaxRefs.begin(); - for (unsigned Idx : llvm::reverse(ClosureIdx)) - MaxRefs.erase(Begin + Idx); } } @@ -1503,16 +1458,16 @@ void DataFlowGraph::removeUnusedPhis() { // that are easily determinable to be unnecessary.
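// A phi is considered unnecessary when none of its defs reaches any
// other def or use (see HasUsedDef below). Removing such a phi can in
// turn expose the phis that fed it, so those are re-queued and the
// removal runs to a fixed point.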
SetVector<NodeId> PhiQ; - for (NodeAddr<BlockNode*> BA : Func.Addr->members(*this)) { + for (Block BA : TheFunc.Addr->members(*this)) { for (auto P : BA.Addr->members_if(IsPhi, *this)) PhiQ.insert(P.Id); } static auto HasUsedDef = [](NodeList &Ms) -> bool { - for (NodeAddr<NodeBase*> M : Ms) { + for (Node M : Ms) { if (M.Addr->getKind() != NodeAttrs::Def) continue; - NodeAddr<DefNode*> DA = M; + Def DA = M; if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0) return true; } @@ -1523,15 +1478,15 @@ void DataFlowGraph::removeUnusedPhis() { // For each removed phi, collect the potentially affected phis and add // them back to the queue. while (!PhiQ.empty()) { - auto PA = addr<PhiNode*>(PhiQ[0]); + auto PA = addr<PhiNode *>(PhiQ[0]); PhiQ.remove(PA.Id); NodeList Refs = PA.Addr->members(*this); if (HasUsedDef(Refs)) continue; - for (NodeAddr<RefNode*> RA : Refs) { + for (Ref RA : Refs) { if (NodeId RD = RA.Addr->getReachingDef()) { - auto RDA = addr<DefNode*>(RD); - NodeAddr<InstrNode*> OA = RDA.Addr->getOwner(*this); + auto RDA = addr<DefNode *>(RD); + Instr OA = RDA.Addr->getOwner(*this); if (IsPhi(OA)) PhiQ.insert(OA.Id); } @@ -1540,7 +1495,7 @@ void DataFlowGraph::removeUnusedPhis() { else unlinkUse(RA, true); } - NodeAddr<BlockNode*> BA = PA.Addr->getOwner(*this); + Block BA = PA.Addr->getOwner(*this); BA.Addr->removeMember(PA, *this); } } @@ -1549,15 +1504,14 @@ void DataFlowGraph::removeUnusedPhis() { // reaching def of TA to the appropriate def node. Create any shadow nodes // as appropriate. template <typename T> -void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, - DefStack &DS) { +void DataFlowGraph::linkRefUp(Instr IA, NodeAddr<T> TA, DefStack &DS) { if (DS.empty()) return; RegisterRef RR = TA.Addr->getRegRef(*this); NodeAddr<T> TAP; // References from the def stack that have been examined so far. - RegisterAggr Defs(PRI); + RegisterAggr Defs(getPRI()); for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) { RegisterRef QR = I->Addr->getRegRef(*this); @@ -1573,7 +1527,7 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, } // The reaching def. - NodeAddr<DefNode*> RDA = *I; + Def RDA = *I; // Pick the reached node. if (TAP.Id == 0) { @@ -1594,14 +1548,13 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA, // Create data-flow links for all reference nodes in the statement node SA. template <typename Predicate> -void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA, - Predicate P) { +void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, Stmt SA, Predicate P) { #ifndef NDEBUG - RegisterSet Defs; + RegisterSet Defs(getPRI()); #endif // Link all nodes (upwards in the data-flow) with their reaching defs. 
- for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) { + for (Ref RA : SA.Addr->members_if(P, *this)) { uint16_t Kind = RA.Addr->getKind(); assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use); RegisterRef RR = RA.Addr->getRegRef(*this); @@ -1616,9 +1569,9 @@ void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA, continue; DefStack &DS = F->second; if (Kind == NodeAttrs::Use) - linkRefUp<UseNode*>(SA, RA, DS); + linkRefUp<UseNode *>(SA, RA, DS); else if (Kind == NodeAttrs::Def) - linkRefUp<DefNode*>(SA, RA, DS); + linkRefUp<DefNode *>(SA, RA, DS); else llvm_unreachable("Unexpected node in instruction"); } @@ -1626,14 +1579,14 @@ void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA, // Create data-flow links for all instructions in the block node BA. This // will include updating any phi nodes in BA. -void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { +void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, Block BA) { // Push block delimiters. markBlock(BA.Id, DefM); - auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool { + auto IsClobber = [](Ref RA) -> bool { return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering); }; - auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool { + auto IsNoClobber = [](Ref RA) -> bool { return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering); }; @@ -1641,7 +1594,7 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { // For each non-phi instruction in the block, link all the defs and uses // to their reaching defs. For any member of the block (including phis), // push the defs on the corresponding stacks. - for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) { + for (Instr IA : BA.Addr->members(*this)) { // Ignore phi nodes here. They will be linked part by part from the // predecessors. if (IA.Addr->getKind() == NodeAttrs::Stmt) { @@ -1662,39 +1615,38 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode()); for (auto *I : *N) { MachineBasicBlock *SB = I->getBlock(); - NodeAddr<BlockNode*> SBA = findBlock(SB); + Block SBA = findBlock(SB); linkBlockRefs(DefM, SBA); } // Link the phi uses from the successor blocks. - auto IsUseForBA = [BA](NodeAddr<NodeBase*> NA) -> bool { + auto IsUseForBA = [BA](Node NA) -> bool { if (NA.Addr->getKind() != NodeAttrs::Use) return false; assert(NA.Addr->getFlags() & NodeAttrs::PhiRef); - NodeAddr<PhiUseNode*> PUA = NA; - return PUA.Addr->getPredecessor() == BA.Id; + return PhiUse(NA).Addr->getPredecessor() == BA.Id; }; - RegisterSet EHLiveIns = getLandingPadLiveIns(); + RegisterAggr EHLiveIns = getLandingPadLiveIns(); MachineBasicBlock *MBB = BA.Addr->getCode(); for (MachineBasicBlock *SB : MBB->successors()) { bool IsEHPad = SB->isEHPad(); - NodeAddr<BlockNode*> SBA = findBlock(SB); - for (NodeAddr<InstrNode*> IA : SBA.Addr->members_if(IsPhi, *this)) { + Block SBA = findBlock(SB); + for (Instr IA : SBA.Addr->members_if(IsPhi, *this)) { // Do not link phi uses for landing pad live-ins. if (IsEHPad) { // Find what register this phi is for. - NodeAddr<RefNode*> RA = IA.Addr->getFirstMember(*this); + Ref RA = IA.Addr->getFirstMember(*this); assert(RA.Id != 0); - if (EHLiveIns.count(RA.Addr->getRegRef(*this))) + if (EHLiveIns.hasCoverOf(RA.Addr->getRegRef(*this))) continue; } // Go over each phi use associated with MBB, and link it. 
for (auto U : IA.Addr->members_if(IsUseForBA, *this)) { - NodeAddr<PhiUseNode*> PUA = U; + PhiUse PUA = U; RegisterRef RR = PUA.Addr->getRegRef(*this); - linkRefUp<UseNode*>(IA, PUA, DefM[RR.Reg]); + linkRefUp<UseNode *>(IA, PUA, DefM[RR.Reg]); } } } @@ -1704,7 +1656,7 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) { } // Remove the use node UA from any data-flow and structural links. -void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) { +void DataFlowGraph::unlinkUseDF(Use UA) { NodeId RD = UA.Addr->getReachingDef(); NodeId Sib = UA.Addr->getSibling(); @@ -1713,8 +1665,8 @@ void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) { return; } - auto RDA = addr<DefNode*>(RD); - auto TA = addr<UseNode*>(RDA.Addr->getReachedUse()); + auto RDA = addr<DefNode *>(RD); + auto TA = addr<UseNode *>(RDA.Addr->getReachedUse()); if (TA.Id == UA.Id) { RDA.Addr->setReachedUse(Sib); return; @@ -1726,12 +1678,12 @@ void DataFlowGraph::unlinkUseDF(NodeAddr<UseNode*> UA) { TA.Addr->setSibling(UA.Addr->getSibling()); return; } - TA = addr<UseNode*>(S); + TA = addr<UseNode *>(S); } } // Remove the def node DA from any data-flow and structural links. -void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) { +void DataFlowGraph::unlinkDefDF(Def DA) { // // RD // | reached @@ -1756,10 +1708,10 @@ void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) { // Also, defs reached by DA are now "promoted" to being reached by RD, // so all of them will need to be spliced into the sibling chain where // DA belongs. - auto getAllNodes = [this] (NodeId N) -> NodeList { + auto getAllNodes = [this](NodeId N) -> NodeList { NodeList Res; while (N) { - auto RA = addr<RefNode*>(N); + auto RA = addr<RefNode *>(N); // Keep the nodes in the exact sibling order. Res.push_back(RA); N = RA.Addr->getSibling(); @@ -1770,14 +1722,14 @@ void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) { NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse()); if (RD == 0) { - for (NodeAddr<RefNode*> I : ReachedDefs) + for (Ref I : ReachedDefs) I.Addr->setSibling(0); - for (NodeAddr<RefNode*> I : ReachedUses) + for (Ref I : ReachedUses) I.Addr->setSibling(0); } - for (NodeAddr<DefNode*> I : ReachedDefs) + for (Def I : ReachedDefs) I.Addr->setReachingDef(RD); - for (NodeAddr<UseNode*> I : ReachedUses) + for (Use I : ReachedUses) I.Addr->setReachingDef(RD); NodeId Sib = DA.Addr->getSibling(); @@ -1787,8 +1739,8 @@ void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) { } // Update the reaching def node and remove DA from the sibling list. - auto RDA = addr<DefNode*>(RD); - auto TA = addr<DefNode*>(RDA.Addr->getReachedDef()); + auto RDA = addr<DefNode *>(RD); + auto TA = addr<DefNode *>(RDA.Addr->getReachedDef()); if (TA.Id == DA.Id) { // If DA is the first reached def, just update the RD's reached def // to the DA's sibling. @@ -1802,20 +1754,46 @@ void DataFlowGraph::unlinkDefDF(NodeAddr<DefNode*> DA) { TA.Addr->setSibling(Sib); break; } - TA = addr<DefNode*>(S); + TA = addr<DefNode *>(S); } } // Splice the DA's reached defs into the RDA's reached def chain. if (!ReachedDefs.empty()) { - auto Last = NodeAddr<DefNode*>(ReachedDefs.back()); + auto Last = Def(ReachedDefs.back()); Last.Addr->setSibling(RDA.Addr->getReachedDef()); RDA.Addr->setReachedDef(ReachedDefs.front().Id); } // Splice the DA's reached uses into the RDA's reached use chain. 
if (!ReachedUses.empty()) { - auto Last = NodeAddr<UseNode*>(ReachedUses.back()); + auto Last = Use(ReachedUses.back()); Last.Addr->setSibling(RDA.Addr->getReachedUse()); RDA.Addr->setReachedUse(ReachedUses.front().Id); } } + +bool DataFlowGraph::isTracked(RegisterRef RR) const { + return !disjoint(getPRI().getUnits(RR), TrackedUnits); +} + +bool DataFlowGraph::hasUntrackedRef(Stmt S, bool IgnoreReserved) const { + SmallVector<MachineOperand *> Ops; + + for (Ref R : S.Addr->members(*this)) { + Ops.push_back(&R.Addr->getOp()); + RegisterRef RR = R.Addr->getRegRef(*this); + if (IgnoreReserved && RR.isReg() && ReservedRegs[RR.idx()]) + continue; + if (!isTracked(RR)) + return true; + } + for (const MachineOperand &Op : S.Addr->getCode()->operands()) { + if (!Op.isReg() && !Op.isRegMask()) + continue; + if (llvm::find(Ops, &Op) == Ops.end()) + return true; + } + return false; +} + +} // end namespace llvm::rdf diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index 902b29d41ce1..11f3fedaa5f9 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -22,7 +22,6 @@ // and Embedded Architectures and Compilers", 8 (4), // <10.1145/2086696.2086706>. <hal-00647369> // -#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -34,6 +33,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/RDFGraph.h" +#include "llvm/CodeGen/RDFLiveness.h" #include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/LaneBitmask.h" @@ -51,31 +51,27 @@ #include <vector> using namespace llvm; -using namespace rdf; static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25), - cl::Hidden, cl::desc("Maximum recursion level")); - -namespace llvm { -namespace rdf { - - raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) { - OS << '{'; - for (const auto &I : P.Obj) { - OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{'; - for (auto J = I.second.begin(), E = I.second.end(); J != E; ) { - OS << Print(J->first, P.G) << PrintLaneMaskOpt(J->second); - if (++J != E) - OS << ','; - } - OS << '}'; + cl::Hidden, + cl::desc("Maximum recursion level")); + +namespace llvm::rdf { + +raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P) { + OS << '{'; + for (const auto &I : P.Obj) { + OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{'; + for (auto J = I.second.begin(), E = I.second.end(); J != E;) { + OS << Print(J->first, P.G) << PrintLaneMaskShort(J->second); + if (++J != E) + OS << ','; } - OS << " }"; - return OS; + OS << '}'; } - -} // end namespace rdf -} // end namespace llvm + OS << " }"; + return OS; +} // The order in the returned sequence is the order of reaching defs in the // upward traversal: the first def is the closest to the given reference RefA, @@ -106,11 +102,12 @@ namespace rdf { // the data-flow. NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, - NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain, - const RegisterAggr &DefRRs) { + NodeAddr<RefNode *> RefA, bool TopShadows, + bool FullChain, + const RegisterAggr &DefRRs) { NodeList RDefs; // Return value. SetVector<NodeId> DefQ; - DenseMap<MachineInstr*, uint32_t> OrdMap; + DenseMap<MachineInstr *, uint32_t> OrdMap; // Dead defs will be treated as if they were live, since they are actually // on the data-flow path. 
They cannot be ignored because even though they @@ -124,12 +121,12 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, // whole point of a shadow is that it will have a reaching def that // is not aliased to the reaching defs of the related shadows. NodeId Start = RefA.Id; - auto SNA = DFG.addr<RefNode*>(Start); + auto SNA = DFG.addr<RefNode *>(Start); if (NodeId RD = SNA.Addr->getReachingDef()) DefQ.insert(RD); if (TopShadows) { for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA)) - if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + if (NodeId RD = NodeAddr<RefNode *>(S).Addr->getReachingDef()) DefQ.insert(RD); } @@ -140,7 +137,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, // It is possible that a collection of non-covering (individually) defs // will be sufficient, but keep going until a covering one is found. for (unsigned i = 0; i < DefQ.size(); ++i) { - auto TA = DFG.addr<DefNode*>(DefQ[i]); + auto TA = DFG.addr<DefNode *>(DefQ[i]); if (TA.Addr->getFlags() & NodeAttrs::PhiRef) continue; // Stop at the covering/overwriting def of the initial register reference. @@ -151,7 +148,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, // Get the next level of reaching defs. This will include multiple // reaching defs for shadows. for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA)) - if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef()) + if (NodeId RD = NodeAddr<RefNode *>(S).Addr->getReachingDef()) DefQ.insert(RD); // Don't visit sibling defs. They share the same reaching def (which // will be visited anyway), but they define something not aliased to @@ -159,42 +156,42 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, } // Return the MachineBasicBlock containing a given instruction. - auto Block = [this] (NodeAddr<InstrNode*> IA) -> MachineBasicBlock* { + auto Block = [this](NodeAddr<InstrNode *> IA) -> MachineBasicBlock * { if (IA.Addr->getKind() == NodeAttrs::Stmt) - return NodeAddr<StmtNode*>(IA).Addr->getCode()->getParent(); + return NodeAddr<StmtNode *>(IA).Addr->getCode()->getParent(); assert(IA.Addr->getKind() == NodeAttrs::Phi); - NodeAddr<PhiNode*> PA = IA; - NodeAddr<BlockNode*> BA = PA.Addr->getOwner(DFG); + NodeAddr<PhiNode *> PA = IA; + NodeAddr<BlockNode *> BA = PA.Addr->getOwner(DFG); return BA.Addr->getCode(); }; - SmallSet<NodeId,32> Defs; + SmallSet<NodeId, 32> Defs; // Remove all non-phi defs that are not aliased to RefRR, and separate // the remaining defs into buckets for containing blocks.
- std::map<NodeId, NodeAddr<InstrNode*>> Owners; - std::map<MachineBasicBlock*, SmallVector<NodeId,32>> Blocks; + std::map<NodeId, NodeAddr<InstrNode *>> Owners; + std::map<MachineBasicBlock *, SmallVector<NodeId, 32>> Blocks; for (NodeId N : DefQ) { - auto TA = DFG.addr<DefNode*>(N); + auto TA = DFG.addr<DefNode *>(N); bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef; if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG))) continue; Defs.insert(TA.Id); - NodeAddr<InstrNode*> IA = TA.Addr->getOwner(DFG); + NodeAddr<InstrNode *> IA = TA.Addr->getOwner(DFG); Owners[TA.Id] = IA; Blocks[Block(IA)].push_back(IA.Id); } - auto Precedes = [this,&OrdMap] (NodeId A, NodeId B) { + auto Precedes = [this, &OrdMap](NodeId A, NodeId B) { if (A == B) return false; - NodeAddr<InstrNode*> OA = DFG.addr<InstrNode*>(A); - NodeAddr<InstrNode*> OB = DFG.addr<InstrNode*>(B); + NodeAddr<InstrNode *> OA = DFG.addr<InstrNode *>(A); + NodeAddr<InstrNode *> OB = DFG.addr<InstrNode *>(B); bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt; bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt; if (StmtA && StmtB) { - const MachineInstr *InA = NodeAddr<StmtNode*>(OA).Addr->getCode(); - const MachineInstr *InB = NodeAddr<StmtNode*>(OB).Addr->getCode(); + const MachineInstr *InA = NodeAddr<StmtNode *>(OA).Addr->getCode(); + const MachineInstr *InB = NodeAddr<StmtNode *>(OB).Addr->getCode(); assert(InA->getParent() == InB->getParent()); auto FA = OrdMap.find(InA); if (FA != OrdMap.end()) @@ -217,14 +214,14 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, return !StmtA; }; - auto GetOrder = [&OrdMap] (MachineBasicBlock &B) { + auto GetOrder = [&OrdMap](MachineBasicBlock &B) { uint32_t Pos = 0; for (MachineInstr &In : B) OrdMap.insert({&In, ++Pos}); }; // For each block, sort the nodes in it. - std::vector<MachineBasicBlock*> TmpBB; + std::vector<MachineBasicBlock *> TmpBB; for (auto &Bucket : Blocks) { TmpBB.push_back(Bucket.first); if (Bucket.second.size() > 2) @@ -261,18 +258,17 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, RegisterAggr RRs(DefRRs); - auto DefInSet = [&Defs] (NodeAddr<RefNode*> TA) -> bool { - return TA.Addr->getKind() == NodeAttrs::Def && - Defs.count(TA.Id); + auto DefInSet = [&Defs](NodeAddr<RefNode *> TA) -> bool { + return TA.Addr->getKind() == NodeAttrs::Def && Defs.count(TA.Id); }; for (NodeId T : TmpInst) { if (!FullChain && RRs.hasCoverOf(RefRR)) break; - auto TA = DFG.addr<InstrNode*>(T); + auto TA = DFG.addr<InstrNode *>(T); bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA); NodeList Ds; - for (NodeAddr<DefNode*> DA : TA.Addr->members_if(DefInSet, DFG)) { + for (NodeAddr<DefNode *> DA : TA.Addr->members_if(DefInSet, DFG)) { RegisterRef QR = DA.Addr->getRegRef(DFG); // Add phi defs even if they are covered by subsequent defs. This is // for cases where the reached use is not covered by any of the defs @@ -286,7 +282,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, Ds.push_back(DA); } llvm::append_range(RDefs, Ds); - for (NodeAddr<DefNode*> DA : Ds) { + for (NodeAddr<DefNode *> DA : Ds) { // When collecting a full chain of definitions, do not consider phi // defs to actually define a register. 
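// Phi defs only merge values arriving from predecessors; letting them
// mark a register as covered would allow a chain to terminate at a phi
// and hide the real defining statements further up the graph.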
uint16_t Flags = DA.Addr->getFlags(); @@ -296,7 +292,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, } } - auto DeadP = [](const NodeAddr<DefNode*> DA) -> bool { + auto DeadP = [](const NodeAddr<DefNode *> DA) -> bool { return DA.Addr->getFlags() & NodeAttrs::Dead; }; llvm::erase_if(RDefs, DeadP); @@ -304,81 +300,82 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR, return RDefs; } -std::pair<NodeSet,bool> -Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA, - NodeSet &Visited, const NodeSet &Defs) { +std::pair<NodeSet, bool> +Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode *> RefA, + NodeSet &Visited, const NodeSet &Defs) { return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest); } -std::pair<NodeSet,bool> -Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA, - NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) { +std::pair<NodeSet, bool> +Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode *> RefA, + NodeSet &Visited, const NodeSet &Defs, + unsigned Nest, unsigned MaxNest) { if (Nest > MaxNest) - return { NodeSet(), false }; + return {NodeSet(), false}; // Collect all defined registers. Do not consider phis to be defining // anything, only collect "real" definitions. RegisterAggr DefRRs(PRI); for (NodeId D : Defs) { - const auto DA = DFG.addr<const DefNode*>(D); + const auto DA = DFG.addr<const DefNode *>(D); if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef)) DefRRs.insert(DA.Addr->getRegRef(DFG)); } NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs); if (RDs.empty()) - return { Defs, true }; + return {Defs, true}; // Make a copy of the preexisting definitions and add the newly found ones. NodeSet TmpDefs = Defs; - for (NodeAddr<NodeBase*> R : RDs) + for (NodeAddr<NodeBase *> R : RDs) TmpDefs.insert(R.Id); NodeSet Result = Defs; - for (NodeAddr<DefNode*> DA : RDs) { + for (NodeAddr<DefNode *> DA : RDs) { Result.insert(DA.Id); if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef)) continue; - NodeAddr<PhiNode*> PA = DA.Addr->getOwner(DFG); + NodeAddr<PhiNode *> PA = DA.Addr->getOwner(DFG); if (!Visited.insert(PA.Id).second) continue; // Go over all phi uses and get the reaching defs for each use. for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs, - Nest+1, MaxNest); + Nest + 1, MaxNest); if (!T.second) - return { T.first, false }; + return {T.first, false}; Result.insert(T.first.begin(), T.first.end()); } } - return { Result, true }; + return {Result, true}; } /// Find the nearest ref node aliased to RefRR, going upwards in the data /// flow, starting from the instruction immediately preceding Inst. -NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR, - NodeAddr<InstrNode*> IA) { - NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); +NodeAddr<RefNode *> Liveness::getNearestAliasedRef(RegisterRef RefRR, + NodeAddr<InstrNode *> IA) { + NodeAddr<BlockNode *> BA = IA.Addr->getOwner(DFG); NodeList Ins = BA.Addr->members(DFG); NodeId FindId = IA.Id; auto E = Ins.rend(); - auto B = std::find_if(Ins.rbegin(), E, - [FindId] (const NodeAddr<InstrNode*> T) { - return T.Id == FindId; - }); + auto B = + std::find_if(Ins.rbegin(), E, [FindId](const NodeAddr<InstrNode *> T) { + return T.Id == FindId; + }); // Do not scan IA (which is what B would point to). if (B != E) ++B; do { // Process the range of instructions from B to E. 
- for (NodeAddr<InstrNode*> I : make_range(B, E)) { + for (NodeAddr<InstrNode *> I : make_range(B, E)) { NodeList Refs = I.Addr->members(DFG); - NodeAddr<RefNode*> Clob, Use; + NodeAddr<RefNode *> Clob, Use; // Scan all the refs in I aliased to RefRR, and return the one that // is the closest to the output of I, i.e. def > clobber > use. - for (NodeAddr<RefNode*> R : Refs) { + for (NodeAddr<RefNode *> R : Refs) { if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR)) continue; if (DFG.IsDef(R)) { @@ -398,7 +395,7 @@ NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR, // Go up to the immediate dominator, if any. MachineBasicBlock *BB = BA.Addr->getCode(); - BA = NodeAddr<BlockNode*>(); + BA = NodeAddr<BlockNode *>(); if (MachineDomTreeNode *N = MDT.getNode(BB)) { if ((N = N->getIDom())) BA = DFG.findBlock(N->getBlock()); @@ -411,11 +408,11 @@ NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR, E = Ins.rend(); } while (true); - return NodeAddr<RefNode*>(); + return NodeAddr<RefNode *>(); } -NodeSet Liveness::getAllReachedUses(RegisterRef RefRR, - NodeAddr<DefNode*> DefA, const RegisterAggr &DefRRs) { +NodeSet Liveness::getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode *> DefA, + const RegisterAggr &DefRRs) { NodeSet Uses; // If the original register is already covered by all the intervening @@ -428,7 +425,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR, bool IsDead = DefA.Addr->getFlags() & NodeAttrs::Dead; NodeId U = !IsDead ? DefA.Addr->getReachedUse() : 0; while (U != 0) { - auto UA = DFG.addr<UseNode*>(U); + auto UA = DFG.addr<UseNode *>(U); if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) { RegisterRef UR = UA.Addr->getRegRef(DFG); if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR)) @@ -439,7 +436,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR, // Traverse all reached defs. This time dead defs cannot be ignored. for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) { - auto DA = DFG.addr<DefNode*>(D); + auto DA = DFG.addr<DefNode *>(D); NextD = DA.Addr->getSibling(); RegisterRef DR = DA.Addr->getRegRef(DFG); // If this def is already covered, it cannot reach anything new. @@ -464,20 +461,21 @@ void Liveness::computePhiInfo() { RealUseMap.clear(); NodeList Phis; - NodeAddr<FuncNode*> FA = DFG.getFunc(); + NodeAddr<FuncNode *> FA = DFG.getFunc(); NodeList Blocks = FA.Addr->members(DFG); - for (NodeAddr<BlockNode*> BA : Blocks) { + for (NodeAddr<BlockNode *> BA : Blocks) { auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); llvm::append_range(Phis, Ps); } // phi use -> (map: reaching phi -> set of registers defined in between) - std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp; - std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. - std::unordered_map<NodeId,RegisterAggr> PhiDRs; // Phi -> registers defined by it. + std::map<NodeId, std::map<NodeId, RegisterAggr>> PhiUp; + std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation. + std::unordered_map<NodeId, RegisterAggr> + PhiDRs; // Phi -> registers defined by it. // Go over all phis. - for (NodeAddr<PhiNode*> PhiA : Phis) { + for (NodeAddr<PhiNode *> PhiA : Phis) { // Go over all defs and collect the reached uses that are non-phi uses // (i.e. the "real uses"). 
RefMap &RealUses = RealUseMap[PhiA.Id]; @@ -488,7 +486,7 @@ void Liveness::computePhiInfo() { SetVector<NodeId> DefQ; NodeSet PhiDefs; RegisterAggr DRs(PRI); - for (NodeAddr<RefNode*> R : PhiRefs) { + for (NodeAddr<RefNode *> R : PhiRefs) { if (!DFG.IsRef<NodeAttrs::Def>(R)) continue; DRs.insert(R.Addr->getRegRef(DFG)); @@ -503,17 +501,17 @@ void Liveness::computePhiInfo() { // This set of uses will later be trimmed to only contain these uses that // are actually reached by the phi defs. for (unsigned i = 0; i < DefQ.size(); ++i) { - NodeAddr<DefNode*> DA = DFG.addr<DefNode*>(DefQ[i]); + NodeAddr<DefNode *> DA = DFG.addr<DefNode *>(DefQ[i]); // Visit all reached uses. Phi defs should not really have the "dead" // flag set, but check it anyway for consistency. bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead; NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0; while (UN != 0) { - NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN); + NodeAddr<UseNode *> A = DFG.addr<UseNode *>(UN); uint16_t F = A.Addr->getFlags(); if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) { RegisterRef R = A.Addr->getRegRef(DFG); - RealUses[R.Reg].insert({A.Id,R.Mask}); + RealUses[R.Reg].insert({A.Id, R.Mask}); } UN = A.Addr->getSibling(); } @@ -522,9 +520,9 @@ void Liveness::computePhiInfo() { // later. NodeId DN = DA.Addr->getReachedDef(); while (DN != 0) { - NodeAddr<DefNode*> A = DFG.addr<DefNode*>(DN); + NodeAddr<DefNode *> A = DFG.addr<DefNode *>(DN); for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) { - uint16_t Flags = NodeAddr<DefNode*>(T).Addr->getFlags(); + uint16_t Flags = NodeAddr<DefNode *>(T).Addr->getFlags(); // Must traverse the reached-def chain. Consider: // def(D0) -> def(R0) -> def(R0) -> use(D0) // The reachable use of D0 passes through a def of R0. @@ -546,21 +544,25 @@ void Liveness::computePhiInfo() { // = R1:0 u6 Not reached by d1 (covered collectively // by d3 and d5), but following reached // defs and uses from d1 will lead here. - for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) { + for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE;) { // For each reached register UI->first, there is a set UI->second, of // uses of it. For each such use, check if it is reached by this phi, // i.e. check if the set of its reaching uses intersects the set of // this phi's defs. NodeRefSet Uses = UI->second; UI->second.clear(); - for (std::pair<NodeId,LaneBitmask> I : Uses) { - auto UA = DFG.addr<UseNode*>(I.first); + for (std::pair<NodeId, LaneBitmask> I : Uses) { + auto UA = DFG.addr<UseNode *>(I.first); // Undef flag is checked above. assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0); - RegisterRef R(UI->first, I.second); + RegisterRef UseR(UI->first, I.second); // Ref from Uses + // R = intersection of the ref from the phi and the ref from Uses + RegisterRef R = PhiDRs.at(PhiA.Id).intersectWith(UseR); + if (!R) + continue; // Calculate the exposed part of the reached use. 
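// The exposed part is the portion of R that reaches UA directly from
// this phi: everything written by the reaching defs encountered before
// one of this phi's own defs is subtracted from R.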
RegisterAggr Covered(PRI); - for (NodeAddr<DefNode*> DA : getAllReachingDefs(R, UA)) { + for (NodeAddr<DefNode *> DA : getAllReachingDefs(R, UA)) { if (PhiDefs.count(DA.Id)) break; Covered.insert(DA.Addr->getRegRef(DFG)); @@ -590,7 +592,7 @@ void Liveness::computePhiInfo() { for (auto I : PhiRefs) { if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id)) continue; - NodeAddr<PhiUseNode*> PUA = I; + NodeAddr<PhiUseNode *> PUA = I; if (PUA.Addr->getReachingDef() == 0) continue; @@ -598,10 +600,10 @@ void Liveness::computePhiInfo() { NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs); RegisterAggr DefRRs(PRI); - for (NodeAddr<DefNode*> D : Ds) { + for (NodeAddr<DefNode *> D : Ds) { if (D.Addr->getFlags() & NodeAttrs::PhiRef) { NodeId RP = D.Addr->getOwner(DFG).Id; - std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id]; + std::map<NodeId, RegisterAggr> &M = PhiUp[PUA.Id]; auto F = M.find(RP); if (F == M.end()) M.insert(std::make_pair(RP, DefRRs)); @@ -611,7 +613,7 @@ void Liveness::computePhiInfo() { DefRRs.insert(D.Addr->getRegRef(DFG)); } - for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA)) + for (NodeAddr<PhiUseNode *> T : DFG.getRelatedRefs(PhiA, PUA)) SeenUses.insert(T.Id); } } @@ -652,9 +654,11 @@ void Liveness::computePhiInfo() { // The operation "clearIn" can be expensive. For a given set of intervening // defs, cache the result of subtracting these defs from a given register // ref. + using RefHash = std::hash<RegisterRef>; + using RefEqual = std::equal_to<RegisterRef>; using SubMap = std::unordered_map<RegisterRef, RegisterRef>; std::unordered_map<RegisterAggr, SubMap> Subs; - auto ClearIn = [] (RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) { + auto ClearIn = [](RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) { if (Mid.empty()) return RR; auto F = SM.find(RR); @@ -667,12 +671,12 @@ void Liveness::computePhiInfo() { // Go over all phis. 
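The Subs cache and ClearIn lambda introduced just above memoize an expensive subtraction per (intervening-def-set, register-ref) pair; because the new RefEqual comparator carries PhysicalRegisterInfo state, each inner map is built through the (bucket_count, hash, equal) constructor instead of being default-constructed. A self-contained sketch of the same caching shape, where Key, KeyHash, KeyEqual, and expensive() are placeholders:

#include <cstddef>
#include <unordered_map>

struct Key { unsigned Reg; unsigned Mask; };

struct KeyHash {
  std::size_t operator()(const Key &K) const { return K.Reg * 31u + K.Mask; }
};

struct KeyEqual {
  int Context; // stateful comparator, like RefEqual carrying the PRI
  bool operator()(const Key &A, const Key &B) const {
    return A.Reg == B.Reg && A.Mask == B.Mask;
  }
};

using Memo = std::unordered_map<Key, unsigned, KeyHash, KeyEqual>;

unsigned expensive(const Key &K) { return K.Reg ^ K.Mask; } // placeholder

unsigned cachedQuery(Memo &M, const Key &K) {
  auto F = M.find(K);
  if (F != M.end())
    return F->second; // hit: reuse the previous result
  unsigned R = expensive(K);
  M.insert({K, R});
  return R;
}

When the functor needs runtime state, the map must receive it explicitly, for example Memo M(1, KeyHash(), KeyEqual{42}); which mirrors SubMap(1, RefHash(), RefEqual(PRI)) in the patch.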
for (unsigned i = 0; i < PhiUQ.size(); ++i) { - auto PA = DFG.addr<PhiNode*>(PhiUQ[i]); + auto PA = DFG.addr<PhiNode *>(PhiUQ[i]); NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG); RefMap &RUM = RealUseMap[PA.Id]; - for (NodeAddr<UseNode*> UA : PUs) { - std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id]; + for (NodeAddr<UseNode *> UA : PUs) { + std::map<NodeId, RegisterAggr> &PUM = PhiUp[UA.Id]; RegisterRef UR = UA.Addr->getRegRef(DFG); for (const std::pair<const NodeId, RegisterAggr> &P : PUM) { bool Changed = false; @@ -683,7 +687,10 @@ void Liveness::computePhiInfo() { if (MidDefs.hasCoverOf(UR)) continue; - SubMap &SM = Subs[MidDefs]; + if (Subs.find(MidDefs) == Subs.end()) { + Subs.insert({MidDefs, SubMap(1, RefHash(), RefEqual(PRI))}); + } + SubMap &SM = Subs.at(MidDefs); // General algorithm: // for each (R,U) : U is use node of R, U is reached by PA @@ -699,13 +706,13 @@ void Liveness::computePhiInfo() { if (!DRs.hasAliasOf(R)) continue; R = PRI.mapTo(DRs.intersectWith(R), T.first); - for (std::pair<NodeId,LaneBitmask> V : T.second) { + for (std::pair<NodeId, LaneBitmask> V : T.second) { LaneBitmask M = R.Mask & V.second; if (M.none()) continue; if (RegisterRef SS = ClearIn(RegisterRef(R.Reg, M), MidDefs, SM)) { NodeRefSet &RS = RealUseMap[P.first][SS.Reg]; - Changed |= RS.insert({V.first,SS.Mask}).second; + Changed |= RS.insert({V.first, SS.Mask}).second; } } } @@ -720,10 +727,10 @@ void Liveness::computePhiInfo() { dbgs() << "Real use map:\n"; for (auto I : RealUseMap) { dbgs() << "phi " << Print(I.first, DFG); - NodeAddr<PhiNode*> PA = DFG.addr<PhiNode*>(I.first); + NodeAddr<PhiNode *> PA = DFG.addr<PhiNode *>(I.first); NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG); if (!Ds.empty()) { - RegisterRef RR = NodeAddr<DefNode*>(Ds[0]).Addr->getRegRef(DFG); + RegisterRef RR = NodeAddr<DefNode *>(Ds[0]).Addr->getRegRef(DFG); dbgs() << '<' << Print(RR, DFG) << '>'; } else { dbgs() << "<noreg>"; @@ -737,10 +744,10 @@ void Liveness::computeLiveIns() { // Populate the node-to-block map. This speeds up the calculations // significantly. NBMap.clear(); - for (NodeAddr<BlockNode*> BA : DFG.getFunc().Addr->members(DFG)) { + for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) { MachineBasicBlock *BB = BA.Addr->getCode(); - for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) { - for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG)) + for (NodeAddr<InstrNode *> IA : BA.Addr->members(DFG)) { + for (NodeAddr<RefNode *> RA : IA.Addr->members(DFG)) NBMap.insert(std::make_pair(RA.Id, BB)); NBMap.insert(std::make_pair(IA.Id, BB)); } @@ -754,7 +761,7 @@ void Liveness::computeLiveIns() { auto F1 = MDF.find(&B); if (F1 == MDF.end()) continue; - SetVector<MachineBasicBlock*> IDFB(F1->second.begin(), F1->second.end()); + SetVector<MachineBasicBlock *> IDFB(F1->second.begin(), F1->second.end()); for (unsigned i = 0; i < IDFB.size(); ++i) { auto F2 = MDF.find(IDFB[i]); if (F2 != MDF.end()) @@ -771,16 +778,17 @@ void Liveness::computeLiveIns() { computePhiInfo(); - NodeAddr<FuncNode*> FA = DFG.getFunc(); + NodeAddr<FuncNode *> FA = DFG.getFunc(); NodeList Blocks = FA.Addr->members(DFG); // Build the phi live-on-entry map. 
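The loop over PhiUQ above is a growing worklist driven to a fixpoint: phis are visited by index so that entries appended mid-pass are still processed, and a Changed flag decides whether a phi is queued again. The same shape reduced to standard C++; Id, Succs, and Facts are illustrative stand-ins, not the RDF types:

#include <cstddef>
#include <cstdint>
#include <set>
#include <vector>

using Id = uint32_t;

// Push facts along edges until no set changes any more (a fixpoint).
void propagate(std::vector<Id> Work,
               const std::vector<std::vector<Id>> &Succs,
               std::vector<std::set<Id>> &Facts) {
  for (std::size_t I = 0; I < Work.size(); ++I) { // Work may grow as we go
    Id N = Work[I];
    for (Id S : Succs[N]) {
      bool Changed = false;
      for (Id F : Facts[N])
        Changed |= Facts[S].insert(F).second;
      if (Changed)
        Work.push_back(S); // revisit S with its enlarged fact set
    }
  }
}

Termination follows because the fact sets only grow and are bounded, which is the same argument that bounds the PhiUQ queue.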
- for (NodeAddr<BlockNode*> BA : Blocks) { + for (NodeAddr<BlockNode *> BA : Blocks) { MachineBasicBlock *MB = BA.Addr->getCode(); RefMap &LON = PhiLON[MB]; - for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) + for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) { for (const RefMap::value_type &S : RealUseMap[P.Id]) LON[S.first].insert(S.second.begin(), S.second.end()); + } } if (Trace) { @@ -793,9 +801,9 @@ void Liveness::computeLiveIns() { // Build the phi live-on-exit map. Each phi node has some set of reached // "real" uses. Propagate this set backwards into the block predecessors // through the reaching defs of the corresponding phi uses. - for (NodeAddr<BlockNode*> BA : Blocks) { + for (NodeAddr<BlockNode *> BA : Blocks) { NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG); - for (NodeAddr<PhiNode*> PA : Phis) { + for (NodeAddr<PhiNode *> PA : Phis) { RefMap &RUs = RealUseMap[PA.Id]; if (RUs.empty()) continue; @@ -804,7 +812,7 @@ void Liveness::computeLiveIns() { for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) { if (!SeenUses.insert(U.Id).second) continue; - NodeAddr<PhiUseNode*> PUA = U; + NodeAddr<PhiUseNode *> PUA = U; if (PUA.Addr->getReachingDef() == 0) continue; @@ -819,18 +827,18 @@ void Liveness::computeLiveIns() { // For each reached "real" use, identify the set of reaching defs // coming from each predecessor P, and add them to PhiLOX[P]. // - auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor()); + auto PrA = DFG.addr<BlockNode *>(PUA.Addr->getPredecessor()); RefMap &LOX = PhiLOX[PrA.Addr->getCode()]; for (const std::pair<const RegisterId, NodeRefSet> &RS : RUs) { // We need to visit each individual use. - for (std::pair<NodeId,LaneBitmask> P : RS.second) { + for (std::pair<NodeId, LaneBitmask> P : RS.second) { // Create a register ref corresponding to the use, and find // all reaching defs starting from the phi use, and treating // all related shadows as a single use cluster. RegisterRef S(RS.first, P.second); NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs); - for (NodeAddr<DefNode*> D : Ds) { + for (NodeAddr<DefNode *> D : Ds) { // Calculate the mask corresponding to the visited def. 
RegisterAggr TA(PRI); TA.insert(D.Addr->getRegRef(DFG)).intersect(S); @@ -840,11 +848,11 @@ void Liveness::computeLiveIns() { } } - for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA)) + for (NodeAddr<PhiUseNode *> T : DFG.getRelatedRefs(PA, PUA)) SeenUses.insert(T.Id); - } // for U : phi uses - } // for P : Phis - } // for B : Blocks + } // for U : phi uses + } // for P : Phis + } // for B : Blocks if (Trace) { dbgs() << "Phi live-on-exit map:\n"; @@ -865,23 +873,21 @@ void Liveness::computeLiveIns() { std::vector<RegisterRef> LV; for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins()) LV.push_back(RegisterRef(LI.PhysReg, LI.LaneMask)); - llvm::sort(LV); + llvm::sort(LV, std::less<RegisterRef>(PRI)); dbgs() << printMBBReference(B) << "\t rec = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); dbgs() << " }\n"; - //dbgs() << "\tcomp = " << Print(LiveMap[&B], DFG) << '\n'; + // dbgs() << "\tcomp = " << Print(LiveMap[&B], DFG) << '\n'; LV.clear(); - const RegisterAggr &LG = LiveMap[&B]; - for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I) - LV.push_back(*I); - llvm::sort(LV); + for (RegisterRef RR : LiveMap[&B].refs()) + LV.push_back(RR); + llvm::sort(LV, std::less<RegisterRef>(PRI)); dbgs() << "\tcomp = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); dbgs() << " }\n"; - } } } @@ -896,7 +902,7 @@ void Liveness::resetLiveIns() { B.removeLiveIn(I); // Add the newly computed live-ins. const RegisterAggr &LiveIns = LiveMap[&B]; - for (const RegisterRef R : make_range(LiveIns.rr_begin(), LiveIns.rr_end())) + for (RegisterRef R : LiveIns.refs()) B.addLiveIn({MCPhysReg(R.Reg), R.Mask}); } } @@ -907,7 +913,7 @@ void Liveness::resetKills() { } void Liveness::resetKills(MachineBasicBlock *B) { - auto CopyLiveIns = [this] (MachineBasicBlock *B, BitVector &LV) -> void { + auto CopyLiveIns = [this](MachineBasicBlock *B, BitVector &LV) -> void { for (auto I : B->liveins()) { MCSubRegIndexIterator S(I.PhysReg, &TRI); if (!S.isValid()) { @@ -933,21 +939,21 @@ void Liveness::resetKills(MachineBasicBlock *B) { continue; MI.clearKillInfo(); - for (auto &Op : MI.operands()) { + for (auto &Op : MI.all_defs()) { // An implicit def of a super-register may not necessarily start a // live range of it, since an implicit use could be used to keep parts // of it live. Instead of analyzing the implicit operands, ignore // implicit defs. 
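resetKills, continued just below, recomputes kill flags with a single backward scan per block: defs clear liveness for the register and all of its sub-register units, and a use that finds its register dead below it becomes the new kill point. In miniature, over integer registers and a bitset, with MiniInstr as an invented one-def/one-use instruction model:

#include <bitset>
#include <vector>

struct MiniInstr {
  int Def = -1; // register defined here, -1 if none
  int Use = -1; // register read here, -1 if none
  bool Kill = false;
};

// Walk the block bottom-up: a use is a kill iff the register is not live
// below it; a def ends the live range when scanning upward.
void resetKills(std::vector<MiniInstr> &Block, std::bitset<64> LiveOut) {
  std::bitset<64> Live = LiveOut;
  for (auto I = Block.rbegin(); I != Block.rend(); ++I) {
    if (I->Def >= 0)
      Live.reset(I->Def);
    if (I->Use >= 0) {
      I->Kill = !Live.test(I->Use); // last use seen from below
      Live.set(I->Use);             // live from here up to its def
    }
  }
}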
- if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + if (Op.isImplicit()) continue; Register R = Op.getReg(); if (!R.isPhysical()) continue; - for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) - Live.reset(*SR); + for (MCPhysReg SR : TRI.subregs_inclusive(R)) + Live.reset(SR); } - for (auto &Op : MI.operands()) { - if (!Op.isReg() || !Op.isUse() || Op.isUndef()) + for (auto &Op : MI.all_uses()) { + if (Op.isUndef()) continue; Register R = Op.getReg(); if (!R.isPhysical()) @@ -961,8 +967,8 @@ void Liveness::resetKills(MachineBasicBlock *B) { } if (!IsLive) Op.setIsKill(true); - for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR) - Live.set(*SR); + for (MCPhysReg SR : TRI.subregs_inclusive(R)) + Live.set(SR); } } } @@ -1048,9 +1054,9 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { const NodeRefSet &OldDefs = LE.second; for (NodeRef OR : OldDefs) { // R is a def node that was live-on-exit - auto DA = DFG.addr<DefNode*>(OR.first); - NodeAddr<InstrNode*> IA = DA.Addr->getOwner(DFG); - NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG); + auto DA = DFG.addr<DefNode *>(OR.first); + NodeAddr<InstrNode *> IA = DA.Addr->getOwner(DFG); + NodeAddr<BlockNode *> BA = IA.Addr->getOwner(DFG); if (B != BA.Addr->getCode()) { // Defs from a different block need to be preserved. Defs from this // block will need to be processed further, except for phi defs, the @@ -1081,10 +1087,10 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { // There could be other defs in this block that are a part of that // chain. Check that now: accumulate the registers from these defs, // and if they all together cover LRef, it is not live-on-entry. - for (NodeAddr<DefNode*> TA : getAllReachingDefs(DA)) { + for (NodeAddr<DefNode *> TA : getAllReachingDefs(DA)) { // DefNode -> InstrNode -> BlockNode. - NodeAddr<InstrNode*> ITA = TA.Addr->getOwner(DFG); - NodeAddr<BlockNode*> BTA = ITA.Addr->getOwner(DFG); + NodeAddr<InstrNode *> ITA = TA.Addr->getOwner(DFG); + NodeAddr<BlockNode *> BTA = ITA.Addr->getOwner(DFG); // Reaching defs are ordered in the upward direction. if (BTA.Addr->getCode() != B) { // We have reached past the beginning of B, and the accumulated @@ -1093,7 +1099,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { // Subtract all accumulated defs (RRs) from LRef. RegisterRef T = RRs.clearIn(LRef); assert(T); - NewDefs.insert({TA.Id,T.Mask}); + NewDefs.insert({TA.Id, T.Mask}); break; } @@ -1118,16 +1124,16 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { // Scan the block for upward-exposed uses and add them to the tracking set. 
for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) { - NodeAddr<InstrNode*> IA = I; + NodeAddr<InstrNode *> IA = I; if (IA.Addr->getKind() != NodeAttrs::Stmt) continue; - for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { + for (NodeAddr<UseNode *> UA : IA.Addr->members_if(DFG.IsUse, DFG)) { if (UA.Addr->getFlags() & NodeAttrs::Undef) continue; RegisterRef RR = UA.Addr->getRegRef(DFG); - for (NodeAddr<DefNode*> D : getAllReachingDefs(UA)) + for (NodeAddr<DefNode *> D : getAllReachingDefs(UA)) if (getBlockWithRef(D.Id) != B) - LiveIn[RR.Reg].insert({D.Id,RR.Mask}); + LiveIn[RR.Reg].insert({D.Id, RR.Mask}); } } @@ -1145,7 +1151,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { LaneBitmask M; for (auto P : R.second) M |= P.second; - Local.insert(RegisterRef(R.first,M)); + Local.insert(RegisterRef(R.first, M)); } if (Trace) { @@ -1164,6 +1170,8 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) { } void Liveness::emptify(RefMap &M) { - for (auto I = M.begin(), E = M.end(); I != E; ) + for (auto I = M.begin(), E = M.end(); I != E;) I = I->second.empty() ? M.erase(I) : std::next(I); } + +} // namespace llvm::rdf diff --git a/llvm/lib/CodeGen/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp index 8760ba118934..90520c4c3c71 100644 --- a/llvm/lib/CodeGen/RDFRegisters.cpp +++ b/llvm/lib/CodeGen/RDFRegisters.cpp @@ -15,17 +15,18 @@ #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> #include <set> #include <utility> -using namespace llvm; -using namespace rdf; +namespace llvm::rdf { PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, - const MachineFunction &mf) + const MachineFunction &mf) : TRI(tri) { RegInfos.resize(TRI.getNumRegs()); @@ -57,7 +58,7 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, UnitInfos[U].Reg = F; } else { for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) { - std::pair<uint32_t,LaneBitmask> P = *I; + std::pair<uint32_t, LaneBitmask> P = *I; UnitInfo &UI = UnitInfos[P.first]; UI.Reg = F; if (P.second.any()) { @@ -80,15 +81,15 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, if (Op.isRegMask()) RegMasks.insert(Op.getRegMask()); - MaskInfos.resize(RegMasks.size()+1); + MaskInfos.resize(RegMasks.size() + 1); for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) { BitVector PU(TRI.getNumRegUnits()); const uint32_t *MB = RegMasks.get(M); for (unsigned I = 1, E = TRI.getNumRegs(); I != E; ++I) { if (!(MB[I / 32] & (1u << (I % 32)))) continue; - for (MCRegUnitIterator U(MCRegister::from(I), &TRI); U.isValid(); ++U) - PU.set(*U); + for (MCRegUnit Unit : TRI.regunits(MCRegister::from(I))) + PU.set(Unit); } MaskInfos[M].Units = PU.flip(); } @@ -97,134 +98,75 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri, for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) { BitVector AS(TRI.getNumRegs()); for (MCRegUnitRootIterator R(U, &TRI); R.isValid(); ++R) - for (MCSuperRegIterator S(*R, &TRI, true); S.isValid(); ++S) - AS.set(*S); + for (MCPhysReg S : TRI.superregs_inclusive(*R)) + AS.set(S); AliasInfos[U].Regs = AS; } } +bool PhysicalRegisterInfo::alias(RegisterRef RA, RegisterRef RB) const { + return !disjoint(getUnits(RA), getUnits(RB)); +} + std::set<RegisterId> 
PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const { - // Do not include RR in the alias set. + // Do not include Reg in the alias set. std::set<RegisterId> AS; - assert(isRegMaskId(Reg) || Register::isPhysicalRegister(Reg)); - if (isRegMaskId(Reg)) { + assert(!RegisterRef::isUnitId(Reg) && "No units allowed"); + if (RegisterRef::isMaskId(Reg)) { // XXX SLOW const uint32_t *MB = getRegMaskBits(Reg); for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) { - if (MB[i/32] & (1u << (i%32))) + if (MB[i / 32] & (1u << (i % 32))) continue; AS.insert(i); } - for (const uint32_t *RM : RegMasks) { - RegisterId MI = getRegMaskId(RM); - if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI))) - AS.insert(MI); - } return AS; } + assert(RegisterRef::isRegId(Reg)); for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI) AS.insert(*AI); - for (const uint32_t *RM : RegMasks) { - RegisterId MI = getRegMaskId(RM); - if (aliasRM(RegisterRef(Reg), RegisterRef(MI))) - AS.insert(MI); - } + return AS; } -bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const { - assert(Register::isPhysicalRegister(RA.Reg)); - assert(Register::isPhysicalRegister(RB.Reg)); - - MCRegUnitMaskIterator UMA(RA.Reg, &TRI); - MCRegUnitMaskIterator UMB(RB.Reg, &TRI); - // Reg units are returned in the numerical order. - while (UMA.isValid() && UMB.isValid()) { - // Skip units that are masked off in RA. - std::pair<RegisterId,LaneBitmask> PA = *UMA; - if (PA.second.any() && (PA.second & RA.Mask).none()) { - ++UMA; - continue; - } - // Skip units that are masked off in RB. - std::pair<RegisterId,LaneBitmask> PB = *UMB; - if (PB.second.any() && (PB.second & RB.Mask).none()) { - ++UMB; - continue; - } +std::set<RegisterId> PhysicalRegisterInfo::getUnits(RegisterRef RR) const { + std::set<RegisterId> Units; - if (PA.first == PB.first) - return true; - if (PA.first < PB.first) - ++UMA; - else if (PB.first < PA.first) - ++UMB; - } - return false; -} + if (RR.Reg == 0) + return Units; // Empty -bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const { - assert(Register::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg)); - const uint32_t *MB = getRegMaskBits(RM.Reg); - bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32)); - // If the lane mask information is "full", e.g. when the given lane mask - // is a superset of the lane mask from the register class, check the regmask - // bit directly. - if (RR.Mask == LaneBitmask::getAll()) - return !Preserved; - const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass; - if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask) - return !Preserved; - - // Otherwise, check all subregisters whose lane mask overlaps the given - // mask. For each such register, if it is preserved by the regmask, then - // clear the corresponding bits in the given mask. If at the end, all - // bits have been cleared, the register does not alias the regmask (i.e. - // is it preserved by it). - LaneBitmask M = RR.Mask; - for (MCSubRegIndexIterator SI(RR.Reg, &TRI); SI.isValid(); ++SI) { - LaneBitmask SM = TRI.getSubRegIndexLaneMask(SI.getSubRegIndex()); - if ((SM & RR.Mask).none()) - continue; - unsigned SR = SI.getSubReg(); - if (!(MB[SR/32] & (1u << (SR%32)))) - continue; - // The subregister SR is preserved. 
- M &= ~SM; - if (M.none()) - return false; + if (RR.isReg()) { + if (RR.Mask.none()) + return Units; // Empty + for (MCRegUnitMaskIterator UM(RR.idx(), &TRI); UM.isValid(); ++UM) { + auto [U, M] = *UM; + if (M.none() || (M & RR.Mask).any()) + Units.insert(U); + } + return Units; } - return true; -} - -bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const { - assert(isRegMaskId(RM.Reg) && isRegMaskId(RN.Reg)); + assert(RR.isMask()); unsigned NumRegs = TRI.getNumRegs(); - const uint32_t *BM = getRegMaskBits(RM.Reg); - const uint32_t *BN = getRegMaskBits(RN.Reg); - - for (unsigned w = 0, nw = NumRegs/32; w != nw; ++w) { - // Intersect the negations of both words. Disregard reg=0, - // i.e. 0th bit in the 0th word. - uint32_t C = ~BM[w] & ~BN[w]; - if (w == 0) - C &= ~1; - if (C) - return true; + const uint32_t *MB = getRegMaskBits(RR.idx()); + for (unsigned I = 0, E = (NumRegs + 31) / 32; I != E; ++I) { + uint32_t C = ~MB[I]; // Clobbered regs + if (I == 0) // Reg 0 should be ignored + C &= maskLeadingOnes<unsigned>(31); + if (I + 1 == E && NumRegs % 32 != 0) // Last word may be partial + C &= maskTrailingOnes<unsigned>(NumRegs % 32); + if (C == 0) + continue; + while (C != 0) { + unsigned T = llvm::countr_zero(C); + unsigned CR = 32 * I + T; // Clobbered reg + for (MCRegUnit U : TRI.regunits(CR)) + Units.insert(U); + C &= ~(1u << T); + } } - - // Check the remaining registers in the last word. - unsigned TailRegs = NumRegs % 32; - if (TailRegs == 0) - return false; - unsigned TW = NumRegs / 32; - uint32_t TailMask = (1u << TailRegs) - 1; - if (~BM[TW] & ~BN[TW] & TailMask) - return true; - - return false; + return Units; } RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const { @@ -234,20 +176,133 @@ RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const { return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask)); if (unsigned Idx = TRI.getSubRegIndex(RR.Reg, R)) { const RegInfo &RI = RegInfos[R]; - LaneBitmask RCM = RI.RegClass ? RI.RegClass->LaneMask - : LaneBitmask::getAll(); + LaneBitmask RCM = + RI.RegClass ? RI.RegClass->LaneMask : LaneBitmask::getAll(); LaneBitmask M = TRI.reverseComposeSubRegIndexLaneMask(Idx, RR.Mask); return RegisterRef(R, M & RCM); } llvm_unreachable("Invalid arguments: unrelated registers?"); } +bool PhysicalRegisterInfo::equal_to(RegisterRef A, RegisterRef B) const { + if (!A.isReg() || !B.isReg()) { + // For non-regs, or comparing reg and non-reg, use only the Reg member. + return A.Reg == B.Reg; + } + + if (A.Reg == B.Reg) + return A.Mask == B.Mask; + + // Compare reg units lexicographically. + MCRegUnitMaskIterator AI(A.Reg, &getTRI()); + MCRegUnitMaskIterator BI(B.Reg, &getTRI()); + while (AI.isValid() && BI.isValid()) { + auto [AReg, AMask] = *AI; + auto [BReg, BMask] = *BI; + + // Lane masks are "none" for units that don't correspond to subregs + // e.g. a single unit in a leaf register, or aliased unit. + if (AMask.none()) + AMask = LaneBitmask::getAll(); + if (BMask.none()) + BMask = LaneBitmask::getAll(); + + // If both iterators point to a unit contained in both A and B, then + // compare the units. + if ((AMask & A.Mask).any() && (BMask & B.Mask).any()) { + if (AReg != BReg) + return false; + // Units are equal, move on to the next ones. + ++AI; + ++BI; + continue; + } + + if ((AMask & A.Mask).none()) + ++AI; + if ((BMask & B.Mask).none()) + ++BI; + } + // One or both have reached the end. 
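Both equal_to and less here are merge-style comparisons: two cursors advance through the unit/mask sequences of the two refs, entries masked out on either side are skipped, and the first position where the live units differ decides the result; the validity comparison that follows settles the case where one cursor runs out first. The same two-cursor walk over plain sorted vectors, as a sketch (lexLess and the on/off vectors are illustrative only):

#include <cstddef>
#include <vector>

// Lexicographic "less" over two sorted id sequences whose elements can be
// individually disabled, mirroring units masked out of a RegisterRef.
bool lexLess(const std::vector<int> &A, const std::vector<bool> &AOn,
             const std::vector<int> &B, const std::vector<bool> &BOn) {
  std::size_t I = 0, J = 0;
  while (I < A.size() && J < B.size()) {
    bool ALive = AOn[I], BLive = BOn[J];
    if (ALive && BLive) {
      if (A[I] != B[J])
        return A[I] < B[J]; // first differing live element decides
      ++I;
      ++J;
      continue;
    }
    if (!ALive)
      ++I; // skip masked-out entries
    if (!BLive)
      ++J;
  }
  // One or both exhausted: treat "ran out" as smaller (invalid < valid).
  return (I < A.size()) < (J < B.size());
}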
+ return static_cast<int>(AI.isValid()) == static_cast<int>(BI.isValid()); +} + +bool PhysicalRegisterInfo::less(RegisterRef A, RegisterRef B) const { + if (!A.isReg() || !B.isReg()) { + // For non-regs, or comparing reg and non-reg, use only the Reg member. + return A.Reg < B.Reg; + } + + if (A.Reg == B.Reg) + return A.Mask < B.Mask; + if (A.Mask == B.Mask) + return A.Reg < B.Reg; + + // Compare reg units lexicographically. + llvm::MCRegUnitMaskIterator AI(A.Reg, &getTRI()); + llvm::MCRegUnitMaskIterator BI(B.Reg, &getTRI()); + while (AI.isValid() && BI.isValid()) { + auto [AReg, AMask] = *AI; + auto [BReg, BMask] = *BI; + + // Lane masks are "none" for units that don't correspond to subregs + // e.g. a single unit in a leaf register, or aliased unit. + if (AMask.none()) + AMask = LaneBitmask::getAll(); + if (BMask.none()) + BMask = LaneBitmask::getAll(); + + // If both iterators point to a unit contained in both A and B, then + // compare the units. + if ((AMask & A.Mask).any() && (BMask & B.Mask).any()) { + if (AReg != BReg) + return AReg < BReg; + // Units are equal, move on to the next ones. + ++AI; + ++BI; + continue; + } + + if ((AMask & A.Mask).none()) + ++AI; + if ((BMask & B.Mask).none()) + ++BI; + } + // One or both have reached the end: assume invalid < valid. + return static_cast<int>(AI.isValid()) < static_cast<int>(BI.isValid()); +} + +void PhysicalRegisterInfo::print(raw_ostream &OS, RegisterRef A) const { + if (A.Reg == 0 || A.isReg()) { + if (0 < A.idx() && A.idx() < TRI.getNumRegs()) + OS << TRI.getName(A.idx()); + else + OS << printReg(A.idx(), &TRI); + OS << PrintLaneMaskShort(A.Mask); + } else if (A.isUnit()) { + OS << printRegUnit(A.idx(), &TRI); + } else { + assert(A.isMask()); + // RegMask SS flag is preserved by idx(). + unsigned Idx = Register::stackSlot2Index(A.idx()); + const char *Fmt = Idx < 0x10000 ? 
"%04x" : "%08x"; + OS << "M#" << format(Fmt, Idx); + } +} + +void PhysicalRegisterInfo::print(raw_ostream &OS, const RegisterAggr &A) const { + OS << '{'; + for (unsigned U : A.units()) + OS << ' ' << printRegUnit(U, &TRI); + OS << " }"; +} + bool RegisterAggr::hasAliasOf(RegisterRef RR) const { - if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) + if (RR.isMask()) return Units.anyCommon(PRI.getMaskUnits(RR.Reg)); for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t,LaneBitmask> P = *U; + std::pair<uint32_t, LaneBitmask> P = *U; if (P.second.none() || (P.second & RR.Mask).any()) if (Units.test(P.first)) return true; @@ -256,13 +311,13 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const { } bool RegisterAggr::hasCoverOf(RegisterRef RR) const { - if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) { + if (RR.isMask()) { BitVector T(PRI.getMaskUnits(RR.Reg)); return T.reset(Units).none(); } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t,LaneBitmask> P = *U; + std::pair<uint32_t, LaneBitmask> P = *U; if (P.second.none() || (P.second & RR.Mask).any()) if (!Units.test(P.first)) return false; @@ -271,13 +326,13 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const { } RegisterAggr &RegisterAggr::insert(RegisterRef RR) { - if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) { + if (RR.isMask()) { Units |= PRI.getMaskUnits(RR.Reg); return *this; } for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) { - std::pair<uint32_t,LaneBitmask> P = *U; + std::pair<uint32_t, LaneBitmask> P = *U; if (P.second.none() || (P.second & RR.Mask).any()) Units.set(P.first); } @@ -350,22 +405,14 @@ RegisterRef RegisterAggr::makeRegRef() const { LaneBitmask M; for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) { - std::pair<uint32_t,LaneBitmask> P = *I; + std::pair<uint32_t, LaneBitmask> P = *I; if (Units.test(P.first)) M |= P.second.none() ? LaneBitmask::getAll() : P.second; } return RegisterRef(F, M); } -void RegisterAggr::print(raw_ostream &OS) const { - OS << '{'; - for (int U = Units.find_first(); U >= 0; U = Units.find_next(U)) - OS << ' ' << printRegUnit(U, &PRI.getTRI()); - OS << " }"; -} - -RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG, - bool End) +RegisterAggr::ref_iterator::ref_iterator(const RegisterAggr &RG, bool End) : Owner(&RG) { for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) { RegisterRef R = RG.PRI.getRefForUnit(U); @@ -375,7 +422,23 @@ RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG, Index = End ? 
Masks.size() : 0; } -raw_ostream &rdf::operator<<(raw_ostream &OS, const RegisterAggr &A) { - A.print(OS); +raw_ostream &operator<<(raw_ostream &OS, const RegisterAggr &A) { + A.getPRI().print(OS, A); return OS; } + +raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMaskShort &P) { + if (P.Mask.all()) + return OS; + if (P.Mask.none()) + return OS << ":*none*"; + + LaneBitmask::Type Val = P.Mask.getAsInteger(); + if ((Val & 0xffff) == Val) + return OS << ':' << format("%04llX", Val); + if ((Val & 0xffffffff) == Val) + return OS << ':' << format("%08llX", Val); + return OS << ':' << PrintLaneMask(P.Mask); +} + +} // namespace llvm::rdf diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index d9ced9191fae..75fbc8ba35b1 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -65,13 +65,13 @@ void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) { // This is the entry block. if (MBB->pred_empty()) { for (const auto &LI : MBB->liveins()) { - for (MCRegUnitIterator Unit(LI.PhysReg, TRI); Unit.isValid(); ++Unit) { + for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) { // Treat function live-ins as if they were defined just before the first // instruction. Usually, function arguments are set up immediately // before the call. - if (LiveRegs[*Unit] != -1) { - LiveRegs[*Unit] = -1; - MBBReachingDefs[MBBNumber][*Unit].push_back(-1); + if (LiveRegs[Unit] != -1) { + LiveRegs[Unit] = -1; + MBBReachingDefs[MBBNumber][Unit].push_back(-1); } } } @@ -128,16 +128,15 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) { for (auto &MO : MI->operands()) { if (!isValidRegDef(MO)) continue; - for (MCRegUnitIterator Unit(MO.getReg().asMCReg(), TRI); Unit.isValid(); - ++Unit) { + for (MCRegUnit Unit : TRI->regunits(MO.getReg().asMCReg())) { // This instruction explicitly defines the current reg unit. - LLVM_DEBUG(dbgs() << printRegUnit(*Unit, TRI) << ":\t" << CurInstr - << '\t' << *MI); + LLVM_DEBUG(dbgs() << printRegUnit(Unit, TRI) << ":\t" << CurInstr << '\t' + << *MI); // How many instructions since this reg unit was last written? - if (LiveRegs[*Unit] != CurInstr) { - LiveRegs[*Unit] = CurInstr; - MBBReachingDefs[MBBNumber][*Unit].push_back(CurInstr); + if (LiveRegs[Unit] != CurInstr) { + LiveRegs[Unit] = CurInstr; + MBBReachingDefs[MBBNumber][Unit].push_back(CurInstr); } } } @@ -269,8 +268,8 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, assert(MBBNumber < MBBReachingDefs.size() && "Unexpected basic block number."); int LatestDef = ReachingDefDefaultVal; - for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { - for (int Def : MBBReachingDefs[MBBNumber][*Unit]) { + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + for (int Def : MBBReachingDefs[MBBNumber][Unit]) { if (Def >= InstId) break; DefRes = Def; diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index 91795f3d27fe..666199139630 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -58,7 +58,7 @@ class RABasic : public MachineFunctionPass, public RegAllocBase, private LiveRangeEdit::Delegate { // context - MachineFunction *MF; + MachineFunction *MF = nullptr; // state std::unique_ptr<Spiller> SpillerInstance; @@ -213,8 +213,8 @@ bool RABasic::spillInterferences(const LiveInterval &VirtReg, SmallVector<const LiveInterval *, 8> Intfs; // Collect interferences assigned to any alias of the physical register. 
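From here on the patch applies one mechanical migration over and over: explicit MCRegUnitIterator loops become range-based iteration with TRI->regunits(Reg), which visits the same register units; the hunk that follows is a typical instance. A compilable stand-in showing both idioms side by side (UnitIter, TRIStub, and the sum functions are invented for illustration):

#include <vector>

struct UnitIter { // old-style validity-driven iterator
  const int *P, *E;
  bool isValid() const { return P != E; }
  int operator*() const { return *P; }
  UnitIter &operator++() { ++P; return *this; }
};

struct TRIStub {
  std::vector<int> Units{0, 1, 2};
  UnitIter unitIter() const {
    return {Units.data(), Units.data() + Units.size()};
  }
  const std::vector<int> &regunits(int /*Reg*/) const { return Units; }
};

int sumUnitsOld(const TRIStub &TRI) {
  int S = 0;
  for (UnitIter U = TRI.unitIter(); U.isValid(); ++U) // pre-patch idiom
    S += *U;
  return S;
}

int sumUnitsNew(const TRIStub &TRI) {
  int S = 0;
  for (int Unit : TRI.regunits(0)) // post-patch range idiom
    S += Unit;
  return S;
}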
- for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit); for (const auto *Intf : reverse(Q.interferingVRegs())) { if (!Intf->isSpillable() || Intf->weight() > VirtReg.weight()) return false; diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index b1743d3f987d..81f3d2c8099f 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -43,6 +43,7 @@ static cl::opt<bool> EnableLocalReassignment( "may be compile time intensive"), cl::init(false)); +namespace llvm { cl::opt<unsigned> EvictInterferenceCutoff( "regalloc-eviction-max-interference-cutoff", cl::Hidden, cl::desc("Number of interferences after which we declare " @@ -50,6 +51,7 @@ cl::opt<unsigned> EvictInterferenceCutoff( "is a compilation cost-saving consideration. To " "disable, pass a very large number."), cl::init(10)); +} #define DEBUG_TYPE "regalloc" #ifdef LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL @@ -100,9 +102,7 @@ template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() { #endif break; case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: -#if defined(LLVM_HAVE_TF_AOT) Ret = createReleaseModeAdvisor(); -#endif break; } if (Ret) @@ -201,8 +201,8 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg()); EvictionCost Cost; - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit); // If there is 10 or more interferences, chances are one is heavier. const auto &Interferences = Q.interferingVRegs(EvictInterferenceCutoff); if (Interferences.size() >= EvictInterferenceCutoff) diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index 46838570a2fc..52dd946a6854 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -121,7 +121,7 @@ public: protected: RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA); - Register canReassign(const LiveInterval &VirtReg, Register PrevReg) const; + bool canReassign(const LiveInterval &VirtReg, MCRegister FromReg) const; // Get the upper limit of elements in the given Order we need to analize. // TODO: is this heuristic, we could consider learning it. diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 775e66e48406..864beb8720f4 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -75,15 +75,15 @@ namespace { } private: - MachineFrameInfo *MFI; - MachineRegisterInfo *MRI; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; + MachineFrameInfo *MFI = nullptr; + MachineRegisterInfo *MRI = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; RegisterClassInfo RegClassInfo; const RegClassFilterFunc ShouldAllocateClass; /// Basic block currently being allocated. - MachineBasicBlock *MBB; + MachineBasicBlock *MBB = nullptr; /// Maps virtual regs to the frame index where these values are spilled. 
IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg; @@ -106,7 +106,7 @@ namespace { } }; - using LiveRegMap = SparseSet<LiveReg>; + using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; @@ -161,8 +161,8 @@ namespace { /// Mark a physreg as used in this instruction. void markRegUsedInInstr(MCPhysReg PhysReg) { - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) - UsedInInstr.insert(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) + UsedInInstr.insert(Unit); } // Check if physreg is clobbered by instruction's regmask(s). @@ -176,10 +176,10 @@ namespace { bool isRegUsedInInstr(MCPhysReg PhysReg, bool LookAtPhysRegUses) const { if (LookAtPhysRegUses && isClobberedByRegMasks(PhysReg)) return true; - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - if (UsedInInstr.count(*Units)) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if (UsedInInstr.count(Unit)) return true; - if (LookAtPhysRegUses && PhysRegUses.count(*Units)) + if (LookAtPhysRegUses && PhysRegUses.count(Unit)) return true; } return false; @@ -188,14 +188,14 @@ namespace { /// Mark physical register as being used in a register use operand. /// This is only used by the special livethrough handling code. void markPhysRegUsedInInstr(MCPhysReg PhysReg) { - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) - PhysRegUses.insert(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) + PhysRegUses.insert(Unit); } /// Remove mark of physical register being used in the instruction. void unmarkRegUsedInInstr(MCPhysReg PhysReg) { - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) - UsedInInstr.erase(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) + UsedInInstr.erase(Unit); } enum : unsigned { @@ -240,6 +240,8 @@ namespace { void addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts, Register Reg) const; + void findAndSortDefOperandIndexes(const MachineInstr &MI); + void allocateInstruction(MachineInstr &MI); void handleDebugValue(MachineInstr &MI); void handleBundle(MachineInstr &MI); @@ -265,18 +267,18 @@ namespace { void allocVirtRegUndef(MachineOperand &MO); void assignDanglingDebugValues(MachineInstr &Def, Register VirtReg, MCPhysReg Reg); - void defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, + bool defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg); - void defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, + bool defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses = false); - void useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg); + bool useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg); MachineBasicBlock::iterator getMBBBeginInsertionPoint(MachineBasicBlock &MBB, SmallSet<Register, 2> &PrologLiveIns) const; void reloadAtBegin(MachineBasicBlock &MBB); - void setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg); + bool setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg); Register traceCopies(Register VirtReg) const; Register traceCopyChain(Register Reg) const; @@ -308,13 +310,13 @@ bool RegAllocFast::shouldAllocateRegister(const Register Reg) const { } void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) - RegUnitStates[*UI] = NewState; + for 
(MCRegUnit Unit : TRI->regunits(PhysReg)) + RegUnitStates[Unit] = NewState; } bool RegAllocFast::isPhysRegFree(MCPhysReg PhysReg) const { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - if (RegUnitStates[*UI] != regFree) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if (RegUnitStates[Unit] != regFree) return false; } return true; @@ -552,7 +554,7 @@ void RegAllocFast::reloadAtBegin(MachineBasicBlock &MBB) { if (PhysReg == 0) continue; - MCRegister FirstUnit = *MCRegUnitIterator(PhysReg, TRI); + MCRegister FirstUnit = *TRI->regunits(PhysReg).begin(); if (RegUnitStates[FirstUnit] == regLiveIn) continue; @@ -593,8 +595,7 @@ bool RegAllocFast::definePhysReg(MachineInstr &MI, MCPhysReg Reg) { bool RegAllocFast::displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg) { bool displacedAny = false; - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - unsigned Unit = *UI; + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { switch (unsigned VirtReg = RegUnitStates[Unit]) { default: { LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); @@ -623,7 +624,7 @@ bool RegAllocFast::displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg) { void RegAllocFast::freePhysReg(MCPhysReg PhysReg) { LLVM_DEBUG(dbgs() << "Freeing " << printReg(PhysReg, TRI) << ':'); - MCRegister FirstUnit = *MCRegUnitIterator(PhysReg, TRI); + MCRegister FirstUnit = *TRI->regunits(PhysReg).begin(); switch (unsigned VirtReg = RegUnitStates[FirstUnit]) { case regFree: LLVM_DEBUG(dbgs() << '\n'); @@ -648,8 +649,8 @@ void RegAllocFast::freePhysReg(MCPhysReg PhysReg) { /// disabled - it can be allocated directly. /// \returns spillImpossible when PhysReg or an alias can't be spilled. unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const { - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - switch (unsigned VirtReg = RegUnitStates[*UI]) { + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + switch (unsigned VirtReg = RegUnitStates[Unit]) { case regFree: break; case regPreAssigned: @@ -875,10 +876,11 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) { /// Variation of defineVirtReg() with special handling for livethrough regs /// (tied or earlyclobber) that may interfere with preassigned uses. -void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, +/// \return true if MI's MachineOperands were re-arranged/invalidated. +bool RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { if (!shouldAllocateRegister(VirtReg)) - return; + return false; LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg); if (LRI != LiveVirtRegs.end()) { MCPhysReg PrevReg = LRI->PhysReg; @@ -909,11 +911,13 @@ void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum, /// perform an allocation if: /// - It is a dead definition without any uses. /// - The value is live out and all uses are in different basic blocks. -void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, +/// +/// \return true if MI's MachineOperands were re-arranged/invalidated. 
+bool RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg, bool LookAtPhysRegUses) { assert(VirtReg.isVirtual() && "Not a virtual register"); if (!shouldAllocateRegister(VirtReg)) - return; + return false; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -948,6 +952,23 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, << LRI->Reloaded << '\n'); bool Kill = LRI->LastUse == nullptr; spill(SpillBefore, VirtReg, PhysReg, Kill, LRI->LiveOut); + + // We need to place additional spills for each indirect destination of an + // INLINEASM_BR. + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) { + int FI = StackSlotForVirtReg[VirtReg]; + const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); + for (MachineOperand &MO : MI.operands()) { + if (MO.isMBB()) { + MachineBasicBlock *Succ = MO.getMBB(); + TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill, + FI, &RC, TRI, VirtReg); + ++NumStores; + Succ->addLiveIn(PhysReg); + } + } + } + LRI->LastUse = nullptr; } LRI->LiveOut = false; @@ -957,15 +978,16 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, BundleVirtRegsMap[VirtReg] = PhysReg; } markRegUsedInInstr(PhysReg); - setPhysReg(MI, MO, PhysReg); + return setPhysReg(MI, MO, PhysReg); } /// Allocates a register for a VirtReg use. -void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, +/// \return true if MI's MachineOperands were re-arranged/invalidated. +bool RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg) { assert(VirtReg.isVirtual() && "Not a virtual register"); if (!shouldAllocateRegister(VirtReg)) - return; + return false; MachineOperand &MO = MI.getOperand(OpNum); LiveRegMap::iterator LRI; bool New; @@ -1002,8 +1024,7 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, if (LRI->Error) { const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC); - setPhysReg(MI, MO, *AllocationOrder.begin()); - return; + return setPhysReg(MI, MO, *AllocationOrder.begin()); } } @@ -1013,18 +1034,17 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, BundleVirtRegsMap[VirtReg] = LRI->PhysReg; } markRegUsedInInstr(LRI->PhysReg); - setPhysReg(MI, MO, LRI->PhysReg); + return setPhysReg(MI, MO, LRI->PhysReg); } -/// Changes operand OpNum in MI the refer the PhysReg, considering subregs. This -/// may invalidate any operand pointers. Return true if the operand kills its -/// register. -void RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, +/// Changes operand OpNum in MI the refer the PhysReg, considering subregs. +/// \return true if MI's MachineOperands were re-arranged/invalidated. +bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg) { if (!MO.getSubReg()) { MO.setReg(PhysReg); MO.setIsRenamable(true); - return; + return false; } // Handle subregister index. @@ -1040,7 +1060,8 @@ void RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, // register kill. 
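The INLINEASM_BR case added above has to replicate the spill into every indirect destination block, since control can reach those blocks without executing the spill placed on the fallthrough path, and each such successor also gains the physical register as a live-in. The shape of the fix on a toy CFG; Block and spillIntoIndirectSuccessors are illustrative stand-ins:

#include <vector>

struct Block {
  std::vector<int> LiveIns;
  std::vector<int> EntrySpills; // registers saved at block entry (toy model)
};

// A value produced by a multi-exit instruction must be saved on every
// path that can leave it, not only on the fallthrough edge.
void spillIntoIndirectSuccessors(std::vector<Block *> &IndirectSuccs,
                                 int PhysReg) {
  for (Block *Succ : IndirectSuccs) {
    Succ->EntrySpills.push_back(PhysReg); // re-emit the spill at entry
    Succ->LiveIns.push_back(PhysReg);     // the register now enters live
  }
}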
if (MO.isKill()) { MI.addRegisterKilled(PhysReg, TRI, true); - return; + // Conservatively assume implicit MOs were re-arranged + return true; } // A <def,read-undef> of a sub-register requires an implicit def of the full @@ -1050,7 +1071,10 @@ void RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO, MI.addRegisterDead(PhysReg, TRI, true); else MI.addRegisterDefined(PhysReg, TRI); + // Conservatively assume implicit MOs were re-arranged + return true; } + return false; } #ifndef NDEBUG @@ -1090,8 +1114,8 @@ void RegAllocFast::dumpState() const { if (PhysReg != 0) { assert(Register::isPhysicalRegister(PhysReg) && "mapped to physreg"); - for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) { - assert(RegUnitStates[*UI] == VirtReg && "inverse map valid"); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + assert(RegUnitStates[Unit] == VirtReg && "inverse map valid"); } } } @@ -1130,6 +1154,72 @@ void RegAllocFast::addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts } } +/// Compute \ref DefOperandIndexes so it contains the indices of "def" operands +/// that are to be allocated. Those are ordered in a way that small classes, +/// early clobbers and livethroughs are allocated first. +void RegAllocFast::findAndSortDefOperandIndexes(const MachineInstr &MI) { + DefOperandIndexes.clear(); + + // Track number of defs which may consume a register from the class. + std::vector<unsigned> RegClassDefCounts(TRI->getNumRegClasses(), 0); + assert(RegClassDefCounts[0] == 0); + + LLVM_DEBUG(dbgs() << "Need to assign livethroughs\n"); + for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg()) + continue; + Register Reg = MO.getReg(); + if (MO.readsReg()) { + if (Reg.isPhysical()) { + LLVM_DEBUG(dbgs() << "mark extra used: " << printReg(Reg, TRI) << '\n'); + markPhysRegUsedInInstr(Reg); + } + } + + if (MO.isDef()) { + if (Reg.isVirtual() && shouldAllocateRegister(Reg)) + DefOperandIndexes.push_back(I); + + addRegClassDefCounts(RegClassDefCounts, Reg); + } + } + + llvm::sort(DefOperandIndexes, [&](uint16_t I0, uint16_t I1) { + const MachineOperand &MO0 = MI.getOperand(I0); + const MachineOperand &MO1 = MI.getOperand(I1); + Register Reg0 = MO0.getReg(); + Register Reg1 = MO1.getReg(); + const TargetRegisterClass &RC0 = *MRI->getRegClass(Reg0); + const TargetRegisterClass &RC1 = *MRI->getRegClass(Reg1); + + // Identify regclass that are easy to use up completely just in this + // instruction. + unsigned ClassSize0 = RegClassInfo.getOrder(&RC0).size(); + unsigned ClassSize1 = RegClassInfo.getOrder(&RC1).size(); + + bool SmallClass0 = ClassSize0 < RegClassDefCounts[RC0.getID()]; + bool SmallClass1 = ClassSize1 < RegClassDefCounts[RC1.getID()]; + if (SmallClass0 > SmallClass1) + return true; + if (SmallClass0 < SmallClass1) + return false; + + // Allocate early clobbers and livethrough operands first. + bool Livethrough0 = MO0.isEarlyClobber() || MO0.isTied() || + (MO0.getSubReg() == 0 && !MO0.isUndef()); + bool Livethrough1 = MO1.isEarlyClobber() || MO1.isTied() || + (MO1.getSubReg() == 0 && !MO1.isUndef()); + if (Livethrough0 > Livethrough1) + return true; + if (Livethrough0 < Livethrough1) + return false; + + // Tie-break rule: operand index. + return I0 < I1; + }); +} + void RegAllocFast::allocateInstruction(MachineInstr &MI) { // The basic algorithm here is: // 1. Mark registers of def operands as free @@ -1201,6 +1291,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Allocate virtreg defs. 
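findAndSortDefOperandIndexes above sorts operand indices rather than operands, with a three-key comparator: operands of scarce (small) register classes first, then early clobbers and livethroughs, then the index itself as a deterministic tie-break. The same pattern in standard C++; Item, Scarce, and Livethrough are illustrative names:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Item {
  bool Scarce;      // its resource class may be used up by this step
  bool Livethrough; // must be placed before ordinary items
};

// Sort positions into Order without moving the items themselves.
std::vector<uint16_t> sortedIndices(const std::vector<Item> &Items) {
  std::vector<uint16_t> Order(Items.size());
  for (std::size_t I = 0; I < Order.size(); ++I)
    Order[I] = static_cast<uint16_t>(I);
  std::sort(Order.begin(), Order.end(), [&](uint16_t A, uint16_t B) {
    if (Items[A].Scarce != Items[B].Scarce)
      return Items[A].Scarce; // scarce items first
    if (Items[A].Livethrough != Items[B].Livethrough)
      return Items[A].Livethrough; // then livethroughs
    return A < B; // deterministic tie-break, as in the patch
  });
  return Order;
}

Sorting indices keeps the comparator cheap and leaves the operand list untouched, which matters here because the loop consuming DefOperandIndexes may restart after implicit operands are re-arranged.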
if (HasDef) { if (HasVRegDef) { + // Note that Implicit MOs can get re-arranged by defineVirtReg(), so loop + // multiple times to ensure no operand is missed. + bool ReArrangedImplicitOps = true; + // Special handling for early clobbers, tied operands or subregister defs: // Compared to "normal" defs these: // - Must not use a register that is pre-assigned for a use operand. @@ -1208,90 +1302,45 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // heuristic to figure out a good operand order before doing // assignments. if (NeedToAssignLiveThroughs) { - DefOperandIndexes.clear(); PhysRegUses.clear(); - // Track number of defs which may consume a register from the class. - std::vector<unsigned> RegClassDefCounts(TRI->getNumRegClasses(), 0); - assert(RegClassDefCounts[0] == 0); - - LLVM_DEBUG(dbgs() << "Need to assign livethroughs\n"); - for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) { - const MachineOperand &MO = MI.getOperand(I); - if (!MO.isReg()) - continue; - Register Reg = MO.getReg(); - if (MO.readsReg()) { - if (Reg.isPhysical()) { - LLVM_DEBUG(dbgs() << "mark extra used: " << printReg(Reg, TRI) - << '\n'); - markPhysRegUsedInInstr(Reg); + while (ReArrangedImplicitOps) { + ReArrangedImplicitOps = false; + findAndSortDefOperandIndexes(MI); + for (uint16_t OpIdx : DefOperandIndexes) { + MachineOperand &MO = MI.getOperand(OpIdx); + LLVM_DEBUG(dbgs() << "Allocating " << MO << '\n'); + unsigned Reg = MO.getReg(); + if (MO.isEarlyClobber() || + (MO.isTied() && !TiedOpIsUndef(MO, OpIdx)) || + (MO.getSubReg() && !MO.isUndef())) { + ReArrangedImplicitOps = defineLiveThroughVirtReg(MI, OpIdx, Reg); + } else { + ReArrangedImplicitOps = defineVirtReg(MI, OpIdx, Reg); + } + if (ReArrangedImplicitOps) { + // Implicit operands of MI were re-arranged, + // re-compute DefOperandIndexes. + break; } - } - - if (MO.isDef()) { - if (Reg.isVirtual() && shouldAllocateRegister(Reg)) - DefOperandIndexes.push_back(I); - - addRegClassDefCounts(RegClassDefCounts, Reg); - } - } - - llvm::sort(DefOperandIndexes, [&](uint16_t I0, uint16_t I1) { - const MachineOperand &MO0 = MI.getOperand(I0); - const MachineOperand &MO1 = MI.getOperand(I1); - Register Reg0 = MO0.getReg(); - Register Reg1 = MO1.getReg(); - const TargetRegisterClass &RC0 = *MRI->getRegClass(Reg0); - const TargetRegisterClass &RC1 = *MRI->getRegClass(Reg1); - - // Identify regclass that are easy to use up completely just in this - // instruction. - unsigned ClassSize0 = RegClassInfo.getOrder(&RC0).size(); - unsigned ClassSize1 = RegClassInfo.getOrder(&RC1).size(); - - bool SmallClass0 = ClassSize0 < RegClassDefCounts[RC0.getID()]; - bool SmallClass1 = ClassSize1 < RegClassDefCounts[RC1.getID()]; - if (SmallClass0 > SmallClass1) - return true; - if (SmallClass0 < SmallClass1) - return false; - - // Allocate early clobbers and livethrough operands first. - bool Livethrough0 = MO0.isEarlyClobber() || MO0.isTied() || - (MO0.getSubReg() == 0 && !MO0.isUndef()); - bool Livethrough1 = MO1.isEarlyClobber() || MO1.isTied() || - (MO1.getSubReg() == 0 && !MO1.isUndef()); - if (Livethrough0 > Livethrough1) - return true; - if (Livethrough0 < Livethrough1) - return false; - - // Tie-break rule: operand index. 
- return I0 < I1; - }); - - for (uint16_t OpIdx : DefOperandIndexes) { - MachineOperand &MO = MI.getOperand(OpIdx); - LLVM_DEBUG(dbgs() << "Allocating " << MO << '\n'); - unsigned Reg = MO.getReg(); - if (MO.isEarlyClobber() || - (MO.isTied() && !TiedOpIsUndef(MO, OpIdx)) || - (MO.getSubReg() && !MO.isUndef())) { - defineLiveThroughVirtReg(MI, OpIdx, Reg); - } else { - defineVirtReg(MI, OpIdx, Reg); } } } else { // Assign virtual register defs. - for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) { - MachineOperand &MO = MI.getOperand(I); - if (!MO.isReg() || !MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (Reg.isVirtual()) - defineVirtReg(MI, I, Reg); + while (ReArrangedImplicitOps) { + ReArrangedImplicitOps = false; + for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) { + MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.isDef()) + continue; + Register Reg = MO.getReg(); + if (Reg.isVirtual()) { + ReArrangedImplicitOps = defineVirtReg(MI, I, Reg); + if (ReArrangedImplicitOps) { + break; + } + } + } } } } @@ -1304,9 +1353,11 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { if (!MO.isReg() || !MO.isDef()) continue; + Register Reg = MO.getReg(); + // subreg defs don't free the full register. We left the subreg number // around as a marker in setPhysReg() to recognize this case here. - if (MO.getSubReg() != 0) { + if (Reg.isPhysical() && MO.getSubReg() != 0) { MO.setSubReg(0); continue; } @@ -1317,7 +1368,6 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Do not free tied operands and early clobbers. if ((MO.isTied() && !TiedOpIsUndef(MO, I)) || MO.isEarlyClobber()) continue; - Register Reg = MO.getReg(); if (!Reg) continue; if (Reg.isVirtual()) { @@ -1364,38 +1414,42 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { } // Allocate virtreg uses and insert reloads as necessary. + // Implicit MOs can get moved/removed by useVirtReg(), so loop multiple + // times to ensure no operand is missed. bool HasUndefUse = false; - for (unsigned I = 0; I < MI.getNumOperands(); ++I) { - MachineOperand &MO = MI.getOperand(I); - if (!MO.isReg() || !MO.isUse()) - continue; - Register Reg = MO.getReg(); - if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) - continue; - - if (MO.isUndef()) { - HasUndefUse = true; - continue; - } - + bool ReArrangedImplicitMOs = true; + while (ReArrangedImplicitMOs) { + ReArrangedImplicitMOs = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.isUse()) + continue; + Register Reg = MO.getReg(); + if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) + continue; - // Populate MayLiveAcrossBlocks in case the use block is allocated before - // the def block (removing the vreg uses). - mayLiveIn(Reg); + if (MO.isUndef()) { + HasUndefUse = true; + continue; + } + // Populate MayLiveAcrossBlocks in case the use block is allocated before + // the def block (removing the vreg uses). + mayLiveIn(Reg); - assert(!MO.isInternalRead() && "Bundles not supported"); - assert(MO.readsReg() && "reading use"); - useVirtReg(MI, I, Reg); + assert(!MO.isInternalRead() && "Bundles not supported"); + assert(MO.readsReg() && "reading use"); + ReArrangedImplicitMOs = useVirtReg(MI, I, Reg); + if (ReArrangedImplicitMOs) + break; + } } // Allocate undef operands. This is a separate step because in a situation // like ` = OP undef %X, %X` both operands need the same register assign // so we should perform the normal assignment first. 
if (HasUndefUse) { - for (MachineOperand &MO : MI.uses()) { - if (!MO.isReg() || !MO.isUse()) - continue; + for (MachineOperand &MO : MI.all_uses()) { Register Reg = MO.getReg(); if (!Reg.isVirtual() || !shouldAllocateRegister(Reg)) continue; @@ -1407,8 +1461,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // Free early clobbers. if (HasEarlyClobber) { - for (MachineOperand &MO : llvm::reverse(MI.operands())) { - if (!MO.isReg() || !MO.isDef() || !MO.isEarlyClobber()) + for (MachineOperand &MO : llvm::reverse(MI.all_defs())) { + if (!MO.isEarlyClobber()) continue; assert(!MO.getSubReg() && "should be already handled in def processing"); diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index b43a4d2a4b85..68f6ea3268a9 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -444,31 +444,27 @@ MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg, // Interference eviction //===----------------------------------------------------------------------===// -Register RegAllocEvictionAdvisor::canReassign(const LiveInterval &VirtReg, - Register PrevReg) const { - auto Order = - AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix); - MCRegister PhysReg; - for (auto I = Order.begin(), E = Order.end(); I != E && !PhysReg; ++I) { - if ((*I).id() == PrevReg.id()) - continue; +bool RegAllocEvictionAdvisor::canReassign(const LiveInterval &VirtReg, + MCRegister FromReg) const { + auto HasRegUnitInterference = [&](MCRegUnit Unit) { + // Instantiate a "subquery", not to be confused with the Queries array. + LiveIntervalUnion::Query SubQ(VirtReg, Matrix->getLiveUnions()[Unit]); + return SubQ.checkInterference(); + }; - MCRegUnitIterator Units(*I, TRI); - for (; Units.isValid(); ++Units) { - // Instantiate a "subquery", not to be confused with the Queries array. - LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]); - if (subQ.checkInterference()) - break; + for (MCRegister Reg : + AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix)) { + if (Reg == FromReg) + continue; + // If no units have interference, reassignment is possible. + if (none_of(TRI->regunits(Reg), HasRegUnitInterference)) { + LLVM_DEBUG(dbgs() << "can reassign: " << VirtReg << " from " + << printReg(FromReg, TRI) << " to " + << printReg(Reg, TRI) << '\n'); + return true; } - // If no units have interference, break out with the current PhysReg. - if (!Units.isValid()) - PhysReg = *I; } - if (PhysReg) - LLVM_DEBUG(dbgs() << "can reassign: " << VirtReg << " from " - << printReg(PrevReg, TRI) << " to " - << printReg(PhysReg, TRI) << '\n'); - return PhysReg; + return false; } /// evictInterference - Evict any interferring registers that prevent VirtReg @@ -487,8 +483,8 @@ void RAGreedy::evictInterference(const LiveInterval &VirtReg, // Collect all interfering virtregs first. SmallVector<const LiveInterval *, 8> Intfs; - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit); // We usually have the interfering VRegs cached so collectInterferingVRegs() // should be fast, we may need to recalculate if when different physregs // overlap the same register unit so we had different SubRanges queried @@ -1286,10 +1282,12 @@ static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI, /// VirtReg. 
static bool readsLaneSubset(const MachineRegisterInfo &MRI, const MachineInstr *MI, const LiveInterval &VirtReg, - const TargetRegisterInfo *TRI, SlotIndex Use) { + const TargetRegisterInfo *TRI, SlotIndex Use, + const TargetInstrInfo *TII) { // Early check the common case. - if (MI->isCopy() && - MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg()) + auto DestSrc = TII->isCopyInstr(*MI); + if (DestSrc && + DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg()) return false; // FIXME: We're only considering uses, but should be consider defs too? @@ -1348,14 +1346,14 @@ unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg, // the allocation. for (const SlotIndex Use : Uses) { if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) { - if (MI->isFullCopy() || + if (TII->isFullCopyInstr(*MI) || (SplitSubClass && SuperRCNumAllocatableRegs == getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC, TII, TRI, RegClassInfo)) || // TODO: Handle split for subranges with subclass constraints? (!SplitSubClass && VirtReg.hasSubRanges() && - !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) { + !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) { LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI); continue; } @@ -1404,9 +1402,9 @@ void RAGreedy::calcGapWeights(MCRegister PhysReg, GapWeight.assign(NumGaps, 0.0f); // Add interference from each overlapping register. - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units) - .checkInterference()) + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + if (!Matrix->query(const_cast<LiveInterval &>(SA->getParent()), Unit) + .checkInterference()) continue; // We know that VirtReg is a continuous interval from FirstInstr to @@ -1417,7 +1415,7 @@ void RAGreedy::calcGapWeights(MCRegister PhysReg, // StartIdx and after StopIdx. // LiveIntervalUnion::SegmentIter IntI = - Matrix->getLiveUnions()[*Units] .find(StartIdx); + Matrix->getLiveUnions()[Unit].find(StartIdx); for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) { // Skip the gaps before IntI. while (Uses[Gap+1].getBoundaryIndex() < IntI.start()) @@ -1439,8 +1437,8 @@ void RAGreedy::calcGapWeights(MCRegister PhysReg, } // Add fixed interference. - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - const LiveRange &LR = LIS->getRegUnit(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + const LiveRange &LR = LIS->getRegUnit(Unit); LiveRange::const_iterator I = LR.find(StartIdx); LiveRange::const_iterator E = LR.end(); @@ -1771,8 +1769,8 @@ bool RAGreedy::mayRecolorAllInterferences( SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters) { const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg()); - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { - LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit); // If there is LastChanceRecoloringMaxInterference or more interferences, // chances are one would not be recolorable. if (Q.interferingVRegs(LastChanceRecoloringMaxInterference).size() >= @@ -1960,7 +1958,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg, // don't add it to NewVRegs because its physical register will be restored // below. 
Other vregs in CurrentNewVRegs are created by calling // selectOrSplit and should be added into NewVRegs. - for (Register &R : CurrentNewVRegs) { + for (Register R : CurrentNewVRegs) { if (RecoloringCandidates.count(&LIS->getInterval(R))) continue; NewVRegs.push_back(R); @@ -2142,7 +2140,7 @@ void RAGreedy::initializeCSRCost() { /// \p Out is not cleared before being populated. void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) { for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { - if (!Instr.isFullCopy()) + if (!TII->isFullCopyInstr(Instr)) continue; // Look for the other end of the copy. Register OtherReg = Instr.getOperand(0).getReg(); @@ -2457,21 +2455,22 @@ RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) { MI.getOpcode() == TargetOpcode::STATEPOINT; }; for (MachineInstr &MI : MBB) { - if (MI.isCopy()) { - const MachineOperand &Dest = MI.getOperand(0); - const MachineOperand &Src = MI.getOperand(1); + auto DestSrc = TII->isCopyInstr(MI); + if (DestSrc) { + const MachineOperand &Dest = *DestSrc->Destination; + const MachineOperand &Src = *DestSrc->Source; Register SrcReg = Src.getReg(); Register DestReg = Dest.getReg(); // Only count `COPY`s with a virtual register as source or destination. if (SrcReg.isVirtual() || DestReg.isVirtual()) { if (SrcReg.isVirtual()) { SrcReg = VRM->getPhys(SrcReg); - if (Src.getSubReg()) + if (SrcReg && Src.getSubReg()) SrcReg = TRI->getSubReg(SrcReg, Src.getSubReg()); } if (DestReg.isVirtual()) { DestReg = VRM->getPhys(DestReg); - if (Dest.getSubReg()) + if (DestReg && Dest.getSubReg()) DestReg = TRI->getSubReg(DestReg, Dest.getSubReg()); } if (SrcReg != DestReg) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h index e0ac88c0aeb9..0f8f9a7d5811 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.h +++ b/llvm/lib/CodeGen/RegAllocGreedy.h @@ -166,20 +166,20 @@ private: SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>; // context - MachineFunction *MF; + MachineFunction *MF = nullptr; // Shortcuts to some useful interface. - const TargetInstrInfo *TII; + const TargetInstrInfo *TII = nullptr; // analyses - SlotIndexes *Indexes; - MachineBlockFrequencyInfo *MBFI; - MachineDominatorTree *DomTree; - MachineLoopInfo *Loops; - MachineOptimizationRemarkEmitter *ORE; - EdgeBundles *Bundles; - SpillPlacement *SpillPlacer; - LiveDebugVariables *DebugVars; + SlotIndexes *Indexes = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + MachineDominatorTree *DomTree = nullptr; + MachineLoopInfo *Loops = nullptr; + MachineOptimizationRemarkEmitter *ORE = nullptr; + EdgeBundles *Bundles = nullptr; + SpillPlacement *SpillPlacer = nullptr; + LiveDebugVariables *DebugVars = nullptr; // state std::unique_ptr<Spiller> SpillerInstance; @@ -204,7 +204,7 @@ private: CO_Interf = 2 }; - uint8_t CutOffInfo; + uint8_t CutOffInfo = CutOffStage::CO_None; #ifndef NDEBUG static const char *const StageName[]; @@ -278,9 +278,9 @@ private: /// Flags for the live range priority calculation, determined once per /// machine function. 
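The RegAllocGreedy.h hunk around this point gives every per-function pointer, and the CutOffInfo byte, an in-class default initializer, so a freshly constructed pass holds well-defined null/zero state instead of indeterminate values before runOnMachineFunction fills it in. A reduced sketch of the idiom; PassState and the forward declarations are illustrative only:

struct MachineFunction; // illustrative forward declarations only
struct TargetInstrInfo;

class PassState {
  MachineFunction *MF = nullptr;        // known-null until the pass runs
  const TargetInstrInfo *TII = nullptr;
  unsigned char CutOffInfo = 0;         // zero instead of indeterminate
};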
- bool RegClassPriorityTrumpsGlobalness; + bool RegClassPriorityTrumpsGlobalness = false; - bool ReverseLocalAssignment; + bool ReverseLocalAssignment = false; public: RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index b3d926eeb552..925a0f085c4b 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -634,8 +634,8 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, // vregLI overlaps fixed regunit interference. bool Interference = false; - for (MCRegUnitIterator Units(PReg, &TRI); Units.isValid(); ++Units) { - if (VRegLI.overlaps(LIS.getRegUnit(*Units))) { + for (MCRegUnit Unit : TRI.regunits(PReg)) { + if (VRegLI.overlaps(LIS.getRegUnit(Unit))) { Interference = true; break; } diff --git a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp index b3a13cc92316..e031019a4c91 100644 --- a/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp @@ -81,9 +81,7 @@ template <> Pass *llvm::callDefaultCtor<RegAllocPriorityAdvisorAnalysis>() { #endif break; case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release: -#if defined(LLVM_HAVE_TF_AOT_REGALLOCPRIORITYMODEL) Ret = createReleaseModePriorityAdvisor(); -#endif break; } if (Ret) diff --git a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp index 16afd15e29e4..6657cf3c1ef4 100644 --- a/llvm/lib/CodeGen/RegUsageInfoCollector.cpp +++ b/llvm/lib/CodeGen/RegUsageInfoCollector.cpp @@ -208,8 +208,8 @@ computeCalleeSavedRegs(BitVector &SavedRegs, MachineFunction &MF) { MCPhysReg Reg = CSRegs[i]; if (SavedRegs.test(Reg)) { // Save subregisters - for (MCSubRegIterator SR(Reg, &TRI); SR.isValid(); ++SR) - SavedRegs.set(*SR); + for (MCPhysReg SR : TRI.subregs(Reg)) + SavedRegs.set(SR); } } } diff --git a/llvm/lib/CodeGen/RegisterBank.cpp b/llvm/lib/CodeGen/RegisterBank.cpp index 512b21aeacaf..8e0a0b0dc282 100644 --- a/llvm/lib/CodeGen/RegisterBank.cpp +++ b/llvm/lib/CodeGen/RegisterBank.cpp @@ -11,6 +11,7 @@ #include "llvm/CodeGen/RegisterBank.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Config/llvm-config.h" #include "llvm/Support/Debug.h" @@ -21,15 +22,16 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -RegisterBank::RegisterBank( - unsigned ID, const char *Name, unsigned Size, - const uint32_t *CoveredClasses, unsigned NumRegClasses) - : ID(ID), Name(Name), Size(Size) { +RegisterBank::RegisterBank(unsigned ID, const char *Name, + const uint32_t *CoveredClasses, + unsigned NumRegClasses) + : ID(ID), Name(Name) { ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } -bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { +bool RegisterBank::verify(const RegisterBankInfo &RBI, + const TargetRegisterInfo &TRI) const { assert(isValid() && "Invalid register bank"); for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) { const TargetRegisterClass &RC = *TRI.getRegClass(RCId); @@ -50,7 +52,7 @@ bool RegisterBank::verify(const TargetRegisterInfo &TRI) const { // Verify that the Size of the register bank is big enough to cover // all the register classes it covers. 
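In the RegisterBank changes here and in RegisterBankInfo.cpp below, a bank no longer stores its own Size; the maximum width now lives in a table owned by RegisterBankInfo (selected by HwMode at construction) and is queried by bank ID, as the reworked assert just below shows. A one-line sketch of the new query; bankWidth is an illustrative wrapper:

#include "llvm/CodeGen/RegisterBank.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
using namespace llvm;

unsigned bankWidth(const RegisterBankInfo &RBI, const RegisterBank &RB) {
  // was: RB.getSize(), a field stored on the bank itself
  return RBI.getMaximumSize(RB.getID());
}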
- assert(getSize() >= TRI.getRegSizeInBits(SubRC) && + assert(RBI.getMaximumSize(getID()) >= TRI.getRegSizeInBits(SubRC) && "Size is not big enough for all the subclasses!"); assert(covers(SubRC) && "Not all subclasses are covered"); } @@ -64,7 +66,7 @@ bool RegisterBank::covers(const TargetRegisterClass &RC) const { } bool RegisterBank::isValid() const { - return ID != InvalidID && Name != nullptr && Size != 0 && + return ID != InvalidID && Name != nullptr && // A register bank that does not cover anything is useless. !ContainedRegClasses.empty(); } @@ -89,7 +91,7 @@ void RegisterBank::print(raw_ostream &OS, bool IsForDebug, OS << getName(); if (!IsForDebug) return; - OS << "(ID:" << getID() << ", Size:" << getSize() << ")\n" + OS << "(ID:" << getID() << ")\n" << "isValid:" << isValid() << '\n' << "Number of Covered register classes: " << ContainedRegClasses.count() << '\n'; diff --git a/llvm/lib/CodeGen/RegisterBankInfo.cpp b/llvm/lib/CodeGen/RegisterBankInfo.cpp index 27ed17b9f4f6..658a09fd8700 100644 --- a/llvm/lib/CodeGen/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/RegisterBankInfo.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/RegisterBankInfo.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -52,9 +53,11 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; //------------------------------------------------------------------------------ // RegisterBankInfo implementation. //------------------------------------------------------------------------------ -RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, - unsigned NumRegBanks) - : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { +RegisterBankInfo::RegisterBankInfo(const RegisterBank **RegBanks, + unsigned NumRegBanks, const unsigned *Sizes, + unsigned HwMode) + : RegBanks(RegBanks), NumRegBanks(NumRegBanks), Sizes(Sizes), + HwMode(HwMode) { #ifndef NDEBUG for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); @@ -70,7 +73,7 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { assert(Idx == RegBank.getID() && "ID does not match the index in the array"); LLVM_DEBUG(dbgs() << "Verify " << RegBank << '\n'); - assert(RegBank.verify(TRI) && "RegBank is invalid"); + assert(RegBank.verify(*this, TRI) && "RegBank is invalid"); } #endif // NDEBUG return true; @@ -79,31 +82,32 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const { const RegisterBank * RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { - if (Reg.isPhysical()) { + if (!Reg.isVirtual()) { // FIXME: This was probably a copy to a virtual register that does have a // type we could use. - return &getRegBankFromRegClass(getMinimalPhysRegClass(Reg, TRI), LLT()); + const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg, TRI); + return RC ? 
&getRegBankFromRegClass(*RC, LLT()) : nullptr; } - assert(Reg && "NoRegister does not have a register bank"); const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>()) + if (auto *RB = dyn_cast_if_present<const RegisterBank *>(RegClassOrBank)) return RB; - if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>()) + if (auto *RC = + dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank)) return &getRegBankFromRegClass(*RC, MRI.getType(Reg)); return nullptr; } -const TargetRegisterClass & +const TargetRegisterClass * RegisterBankInfo::getMinimalPhysRegClass(Register Reg, const TargetRegisterInfo &TRI) const { assert(Reg.isPhysical() && "Reg must be a physreg"); const auto &RegRCIt = PhysRegMinimalRCs.find(Reg); if (RegRCIt != PhysRegMinimalRCs.end()) - return *RegRCIt->second; - const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClass(Reg); + return RegRCIt->second; + const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClassLLT(Reg, LLT()); PhysRegMinimalRCs[Reg] = PhysRC; - return *PhysRC; + return PhysRC; } const RegisterBank *RegisterBankInfo::getRegBankFromConstraints( @@ -131,10 +135,10 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister( // If the register already has a class, fallback to MRI::constrainRegClass. auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (RegClassOrBank.is<const TargetRegisterClass *>()) + if (isa<const TargetRegisterClass *>(RegClassOrBank)) return MRI.constrainRegClass(Reg, &RC); - const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>(); + const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank); // Otherwise, all we can do is ensure the bank covers the class, and set it. if (RB && !RB->covers(RC)) return nullptr; @@ -498,7 +502,7 @@ unsigned RegisterBankInfo::getSizeInBits(Register Reg, // Instead, we need to access a register class that contains Reg and // get the size of that register class. // Because this is expensive, we'll cache the register class by calling - auto *RC = &getMinimalPhysRegClass(Reg, TRI); + auto *RC = getMinimalPhysRegClass(Reg, TRI); assert(RC && "Expecting Register class"); return TRI.getRegSizeInBits(*RC); } @@ -515,12 +519,14 @@ LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const { } #endif -bool RegisterBankInfo::PartialMapping::verify() const { +bool RegisterBankInfo::PartialMapping::verify( + const RegisterBankInfo &RBI) const { assert(RegBank && "Register bank not set"); assert(Length && "Empty mapping"); assert((StartIdx <= getHighBitIdx()) && "Overflow, switch to APInt?"); // Check if the minimum width fits into RegBank. 
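The getRegBank and constrainGenericRegister hunks above migrate PointerUnion queries from the member API (dyn_cast<T>(), is<T>(), get<T>()) to the free-function cast API (dyn_cast_if_present, isa, cast). A self-contained sketch of the same migration on a toy union; Bank, Class, and bankOrClassId are illustrative:

#include "llvm/ADT/PointerUnion.h"
using namespace llvm;

struct Bank { int Id; };
struct Class { int Id; };

int bankOrClassId(PointerUnion<Bank *, Class *> U) {
  // was: U.dyn_cast<Bank *>() / U.is<Class *>() / U.get<Class *>()
  if (auto *B = dyn_cast_if_present<Bank *>(U)) // tolerates a null union
    return B->Id;
  if (auto *C = dyn_cast_if_present<Class *>(U))
    return C->Id;
  return -1;
}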
- assert(RegBank->getSize() >= Length && "Register bank too small for Mask"); + assert(RBI.getMaximumSize(RegBank->getID()) >= Length && + "Register bank too small for Mask"); return true; } @@ -545,13 +551,14 @@ bool RegisterBankInfo::ValueMapping::partsAllUniform() const { return true; } -bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const { +bool RegisterBankInfo::ValueMapping::verify(const RegisterBankInfo &RBI, + unsigned MeaningfulBitWidth) const { assert(NumBreakDowns && "Value mapped nowhere?!"); unsigned OrigValueBitWidth = 0; for (const RegisterBankInfo::PartialMapping &PartMap : *this) { // Check that each register bank is big enough to hold the partial value: // this check is done by PartialMapping::verify - assert(PartMap.verify() && "Partial mapping is invalid"); + assert(PartMap.verify(RBI) && "Partial mapping is invalid"); // The original value should completely be mapped. // Thus the maximum accessed index + 1 is the size of the original value. OrigValueBitWidth = @@ -625,8 +632,9 @@ bool RegisterBankInfo::InstructionMapping::verify( (void)MOMapping; // Register size in bits. // This size must match what the mapping expects. - assert(MOMapping.verify(RBI->getSizeInBits( - Reg, MF.getRegInfo(), *MF.getSubtarget().getRegisterInfo())) && + assert(MOMapping.verify(*RBI, RBI->getSizeInBits( + Reg, MF.getRegInfo(), + *MF.getSubtarget().getRegisterInfo())) && "Value mapping is invalid"); } return true; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index ab1215974fc5..e49885b6ad96 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -116,7 +116,7 @@ static cl::opt<unsigned> LargeIntervalFreqThreshold( cl::desc("For a large interval, if it is coalesed with other live " "intervals many times more than the threshold, stop its " "coalescing to control the compile time. "), - cl::init(100)); + cl::init(256)); namespace { @@ -153,12 +153,6 @@ namespace { using DbgValueLoc = std::pair<SlotIndex, MachineInstr*>; DenseMap<Register, std::vector<DbgValueLoc>> DbgVRegToValues; - /// VRegs may be repeatedly coalesced, and have many DBG_VALUEs attached. - /// To avoid repeatedly merging sets of DbgValueLocs, instead record - /// which vregs have been coalesced, and where to. This map is from - /// vreg => {set of vregs merged in}. - DenseMap<Register, SmallVector<Register, 4>> DbgMergedVRegNums; - /// A LaneMask to remember on which subregister live ranges we need to call /// shrinkToUses() later. 
LaneBitmask ShrinkMask; @@ -404,14 +398,14 @@ char RegisterCoalescer::ID = 0; char &llvm::RegisterCoalescerID = RegisterCoalescer::ID; -INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing", - "Simple Register Coalescing", false, false) +INITIALIZE_PASS_BEGIN(RegisterCoalescer, "register-coalescer", + "Register Coalescer", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing", - "Simple Register Coalescing", false, false) +INITIALIZE_PASS_END(RegisterCoalescer, "register-coalescer", + "Register Coalescer", false, false) [[nodiscard]] static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI, Register &Src, @@ -1257,8 +1251,8 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP, static bool definesFullReg(const MachineInstr &MI, Register Reg) { assert(!Reg.isPhysical() && "This code cannot handle physreg aliasing"); - for (const MachineOperand &Op : MI.operands()) { - if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg) + for (const MachineOperand &Op : MI.all_defs()) { + if (Op.getReg() != Reg) continue; // Return true if we define the full register or don't care about the value // inside other subregisters. @@ -1502,11 +1496,18 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, LLVM_DEBUG(dbgs() << "Removing undefined SubRange " << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n"); - // VNI is in ValNo - remove any segments in this SubRange that have this ValNo + if (VNInfo *RmValNo = SR.getVNInfoAt(CurrIdx.getRegSlot())) { + // VNI is in ValNo - remove any segments in this SubRange that have + // this ValNo SR.removeValNo(RmValNo); - UpdatedSubRanges = true; } + + // We may not have a defined value at this point, but still need to + // clear out any empty subranges tentatively created by + // updateRegDefUses. The original subrange def may have only undefed + // some lanes. + UpdatedSubRanges = true; } else { // We know that this lane is defined by this instruction, // but at this point it may be empty because it is not used by @@ -1545,9 +1546,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // no live-ranges would have been created for ECX. // Fix that! 
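Several hunks in RegisterCoalescer.cpp (definesFullReg above, eliminateUndefCopy, computeWriteLanes, and usesLanes below) drop manual isReg()/isDef() filtering in favor of the new MachineInstr::all_defs() and all_uses() operand ranges. A small sketch of the equivalence; countDefsOf is an illustrative helper, not part of the patch:

#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

unsigned countDefsOf(const MachineInstr &MI, Register Reg) {
  unsigned N = 0;
  // Before: for (const MachineOperand &MO : MI.operands())
  //           if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) ++N;
  for (const MachineOperand &MO : MI.all_defs())
    if (MO.getReg() == Reg)
      ++N;
  return N;
}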
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); - for (MCRegUnitIterator Units(NewMI.getOperand(0).getReg(), TRI); - Units.isValid(); ++Units) - if (LiveRange *LR = LIS->getCachedRegUnit(*Units)) + for (MCRegUnit Unit : TRI->regunits(NewMI.getOperand(0).getReg())) + if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); } @@ -1561,8 +1561,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) { MCRegister Reg = NewMIImplDefs[i]; - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) - if (LiveRange *LR = LIS->getCachedRegUnit(*Units)) + for (MCRegUnit Unit : TRI->regunits(Reg)) + if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); } @@ -1713,8 +1713,8 @@ MachineInstr *RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) { // is still part of the function (but about to be erased), mark all // defs of DstReg in it as <undef>, so that shrinkToUses would // ignore them. - for (MachineOperand &MO : CopyMI->operands()) - if (MO.isReg() && MO.isDef() && MO.getReg() == DstReg) + for (MachineOperand &MO : CopyMI->all_defs()) + if (MO.getReg() == DstReg) MO.setIsUndef(true); LIS->shrinkToUses(&DstLI); @@ -2164,14 +2164,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { // Deny any overlapping intervals. This depends on all the reserved // register live ranges to look like dead defs. if (!MRI->isConstantPhysReg(DstReg)) { - for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) { + for (MCRegUnit Unit : TRI->regunits(DstReg)) { // Abort if not all the regunits are reserved. - for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) { + for (MCRegUnitRootIterator RI(Unit, TRI); RI.isValid(); ++RI) { if (!MRI->isReserved(*RI)) return false; } - if (RHS.overlaps(LIS->getRegUnit(*UI))) { - LLVM_DEBUG(dbgs() << "\t\tInterference: " << printRegUnit(*UI, TRI) + if (RHS.overlaps(LIS->getRegUnit(Unit))) { + LLVM_DEBUG(dbgs() << "\t\tInterference: " << printRegUnit(Unit, TRI) << '\n'); return false; } @@ -2202,6 +2202,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { // ... // use %physreg_x CopyMI = MRI->getVRegDef(SrcReg); + deleteInstr(CopyMI); } else { // VReg is copied into physreg: // %y = def @@ -2246,15 +2247,15 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { << printReg(DstReg, TRI) << " at " << CopyRegIdx << "\n"); LIS->removePhysRegDefAt(DstReg.asMCReg(), CopyRegIdx); + deleteInstr(CopyMI); + // Create a new dead def at the new def location. - for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) { - LiveRange &LR = LIS->getRegUnit(*UI); + for (MCRegUnit Unit : TRI->regunits(DstReg)) { + LiveRange &LR = LIS->getRegUnit(Unit); LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator()); } } - deleteInstr(CopyMI); - // We don't track kills for reserved registers. 
MRI->clearKillFlags(CP.getSrcReg()); @@ -2569,8 +2570,8 @@ public: LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const { LaneBitmask L; - for (const MachineOperand &MO : DefMI->operands()) { - if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef()) + for (const MachineOperand &MO : DefMI->all_defs()) { + if (MO.getReg() != Reg) continue; L |= TRI->getSubRegIndexLaneMask( TRI->composeSubRegIndices(SubIdx, MO.getSubReg())); @@ -2786,13 +2787,22 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { // // When it happens, treat that IMPLICIT_DEF as a normal value, and don't try // to erase the IMPLICIT_DEF instruction. - if (DefMI && - DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) { + MachineBasicBlock *OtherMBB = Indexes->getMBBFromIndex(V.OtherVNI->def); + if (DefMI && DefMI->getParent() != OtherMBB) { LLVM_DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def << " extends into " << printMBBReference(*DefMI->getParent()) << ", keeping it.\n"); OtherV.ErasableImplicitDef = false; + } else if (OtherMBB->hasEHPadSuccessor()) { + // If OtherV is defined in a basic block that has EH pad successors then + // we get the same problem not just if OtherV is live beyond its basic + // block, but beyond the last call instruction in its basic block. Handle + // this case conservatively. + LLVM_DEBUG( + dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def + << " may be live into EH pad successors, keeping it.\n"); + OtherV.ErasableImplicitDef = false; } else { // We deferred clearing these lanes in case we needed to save them OtherV.ValidLanes &= ~OtherV.WriteLanes; @@ -2952,7 +2962,7 @@ void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) { // its lanes. if (OtherV.ErasableImplicitDef && TrackSubRegLiveness && - (OtherV.WriteLanes & ~V.ValidLanes).any()) { + (OtherV.ValidLanes & ~V.ValidLanes).any()) { LLVM_DEBUG(dbgs() << "Cannot erase implicit_def with missing values\n"); OtherV.ErasableImplicitDef = false; @@ -3029,8 +3039,8 @@ bool JoinVals::usesLanes(const MachineInstr &MI, Register Reg, unsigned SubIdx, LaneBitmask Lanes) const { if (MI.isDebugOrPseudoInstr()) return false; - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg) + for (const MachineOperand &MO : MI.all_uses()) { + if (MO.getReg() != Reg) continue; if (!MO.readsReg()) continue; @@ -3759,18 +3769,9 @@ void RegisterCoalescer::checkMergingChangesDbgValues(CoalescerPair &CP, checkMergingChangesDbgValuesImpl(Reg, LHS, RHS, RHSVals); }; - // Scan for potentially unsound DBG_VALUEs: examine first the register number - // Reg, and then any other vregs that may have been merged into it. - auto PerformScan = [this](Register Reg, std::function<void(Register)> Func) { - Func(Reg); - if (DbgMergedVRegNums.count(Reg)) - for (Register X : DbgMergedVRegNums[Reg]) - Func(X); - }; - // Scan for unsound updates of both the source and destination register. 
- PerformScan(CP.getSrcReg(), ScanForSrcReg); - PerformScan(CP.getDstReg(), ScanForDstReg); + ScanForSrcReg(CP.getSrcReg()); + ScanForDstReg(CP.getDstReg()); } void RegisterCoalescer::checkMergingChangesDbgValuesImpl(Register Reg, @@ -4099,7 +4100,7 @@ void RegisterCoalescer::releaseMemory() { } bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { - LLVM_DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n" + LLVM_DEBUG(dbgs() << "********** REGISTER COALESCER **********\n" << "********** Function: " << fn.getName() << '\n'); // Variables changed between a setjmp and a longjump can have undefined value @@ -4151,7 +4152,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { MF->verify(this, "Before register coalescing"); DbgVRegToValues.clear(); - DbgMergedVRegNums.clear(); buildVRegToDbgValueMap(fn); RegClassInfo.runOnMachineFunction(fn); diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index d4c29f96a4f9..f86aa3a16720 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -521,9 +521,8 @@ class RegisterOperandsCollector { if (Reg.isVirtual()) { addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneBitmask::getAll())); } else if (MRI.isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg.asMCReg(), &TRI); Units.isValid(); - ++Units) - addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll())); + for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) + addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll())); } } @@ -557,9 +556,8 @@ class RegisterOperandsCollector { : MRI.getMaxLaneMaskForVReg(Reg); addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneMask)); } else if (MRI.isAllocatable(Reg)) { - for (MCRegUnitIterator Units(Reg.asMCReg(), &TRI); Units.isValid(); - ++Units) - addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll())); + for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) + addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll())); } } }; diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 8d10a5558315..c00d3fde6426 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -96,13 +96,13 @@ void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) { } void RegScavenger::addRegUnits(BitVector &BV, MCRegister Reg) { - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - BV.set(*RUI); + for (MCRegUnit Unit : TRI->regunits(Reg)) + BV.set(Unit); } void RegScavenger::removeRegUnits(BitVector &BV, MCRegister Reg) { - for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) - BV.reset(*RUI); + for (MCRegUnit Unit : TRI->regunits(Reg)) + BV.reset(Unit); } void RegScavenger::determineKillsAndDefs() { @@ -198,25 +198,13 @@ void RegScavenger::forward() { // S1 is can be freely clobbered. // Ideally we would like a way to model this, but leaving the // insert_subreg around causes both correctness and performance issues. 
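The RegisterScavenging hunk just below collapses two hand-written "is any subregister or superregister in use" searches into none_of over the subregs()/superregs() ranges. The shape of the predicate in isolation; noAliasInUse is an illustrative wrapper, and isRegUsed is passed in because it belongs to the scavenger:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

bool noAliasInUse(const TargetRegisterInfo *TRI, MCRegister Reg,
                  function_ref<bool(MCPhysReg)> isRegUsed) {
  return none_of(TRI->subregs(Reg),
                 [&](MCPhysReg SR) { return isRegUsed(SR); }) &&
         none_of(TRI->superregs(Reg),
                 [&](MCPhysReg SR) { return isRegUsed(SR); });
}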
- bool SubUsed = false; - for (const MCPhysReg &SubReg : TRI->subregs(Reg)) - if (isRegUsed(SubReg)) { - SubUsed = true; - break; - } - bool SuperUsed = false; - for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) { - if (isRegUsed(*SR)) { - SuperUsed = true; - break; - } - } - if (!SubUsed && !SuperUsed) { + if (none_of(TRI->subregs(Reg), + [&](MCPhysReg SR) { return isRegUsed(SR); }) && + none_of(TRI->superregs(Reg), + [&](MCPhysReg SR) { return isRegUsed(SR); })) { MBB->getParent()->verify(nullptr, "In Register Scavenger"); llvm_unreachable("Using an undefined register!"); } - (void)SubUsed; - (void)SuperUsed; } } else { assert(MO.isDef()); @@ -282,70 +270,6 @@ BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) { return Mask; } -Register RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, - BitVector &Candidates, - unsigned InstrLimit, - MachineBasicBlock::iterator &UseMI) { - int Survivor = Candidates.find_first(); - assert(Survivor > 0 && "No candidates for scavenging"); - - MachineBasicBlock::iterator ME = MBB->getFirstTerminator(); - assert(StartMI != ME && "MI already at terminator"); - MachineBasicBlock::iterator RestorePointMI = StartMI; - MachineBasicBlock::iterator MI = StartMI; - - bool inVirtLiveRange = false; - for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) { - if (MI->isDebugOrPseudoInstr()) { - ++InstrLimit; // Don't count debug instructions - continue; - } - bool isVirtKillInsn = false; - bool isVirtDefInsn = false; - // Remove any candidates touched by instruction. - for (const MachineOperand &MO : MI->operands()) { - if (MO.isRegMask()) - Candidates.clearBitsNotInMask(MO.getRegMask()); - if (!MO.isReg() || MO.isUndef() || !MO.getReg()) - continue; - if (MO.getReg().isVirtual()) { - if (MO.isDef()) - isVirtDefInsn = true; - else if (MO.isKill()) - isVirtKillInsn = true; - continue; - } - for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) - Candidates.reset(*AI); - } - // If we're not in a virtual reg's live range, this is a valid - // restore point. - if (!inVirtLiveRange) RestorePointMI = MI; - - // Update whether we're in the live range of a virtual register - if (isVirtKillInsn) inVirtLiveRange = false; - if (isVirtDefInsn) inVirtLiveRange = true; - - // Was our survivor untouched by this instruction? - if (Candidates.test(Survivor)) - continue; - - // All candidates gone? - if (Candidates.none()) - break; - - Survivor = Candidates.find_first(); - } - // If we ran off the end, that's where we want to restore. - if (MI == ME) RestorePointMI = ME; - assert(RestorePointMI != StartMI && - "No available scavenger restore location!"); - - // We ran out of candidates, so stop the search. - UseMI = RestorePointMI; - return Survivor; -} - /// Given the bitvector \p Available of free register units at position /// \p From. Search backwards to find a register that is part of \p /// Candidates and not used/clobbered until the point \p To. If there is @@ -522,73 +446,6 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, return Scavenged[SI]; } -Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC, - MachineBasicBlock::iterator I, - int SPAdj, bool AllowSpill) { - MachineInstr &MI = *I; - const MachineFunction &MF = *MI.getMF(); - // Consider all allocatable registers in the register class initially - BitVector Candidates = TRI->getAllocatableSet(MF, RC); - - // Exclude all the registers being used by the instruction. 
- for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) && - !MO.getReg().isVirtual()) - for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI) - Candidates.reset(*AI); - } - - // If we have already scavenged some registers, remove them from the - // candidates. If we end up recursively calling eliminateFrameIndex, we don't - // want to be clobbering previously scavenged registers or their associated - // stack slots. - for (ScavengedInfo &SI : Scavenged) { - if (SI.Reg) { - if (isRegUsed(SI.Reg)) { - LLVM_DEBUG( - dbgs() << "Removing " << printReg(SI.Reg, TRI) << - " from scavenging candidates since it was already scavenged\n"); - for (MCRegAliasIterator AI(SI.Reg, TRI, true); AI.isValid(); ++AI) - Candidates.reset(*AI); - } - } - } - - // Try to find a register that's unused if there is one, as then we won't - // have to spill. - BitVector Available = getRegsAvailable(RC); - Available &= Candidates; - if (Available.any()) - Candidates = Available; - - // Find the register whose use is furthest away. - MachineBasicBlock::iterator UseMI; - Register SReg = findSurvivorReg(I, Candidates, 25, UseMI); - - // If we found an unused register there is no reason to spill it. - if (!isRegUsed(SReg)) { - LLVM_DEBUG(dbgs() << "Scavenged register: " << printReg(SReg, TRI) << "\n"); - return SReg; - } - - if (!AllowSpill) - return 0; - -#ifndef NDEBUG - for (ScavengedInfo &SI : Scavenged) { - assert(SI.Reg != SReg && "scavenged a previously scavenged register"); - } -#endif - - ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI); - Scavenged.Restore = &*std::prev(UseMI); - - LLVM_DEBUG(dbgs() << "Scavenged register (with spill): " - << printReg(SReg, TRI) << "\n"); - - return SReg; -} - Register RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, diff --git a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp index 05bbd1a2d03b..bc3ef1c0329a 100644 --- a/llvm/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/llvm/lib/CodeGen/RenameIndependentSubregs.cpp @@ -101,9 +101,9 @@ private: const SmallVectorImpl<LiveInterval*> &Intervals) const; - LiveIntervals *LIS; - MachineRegisterInfo *MRI; - const TargetInstrInfo *TII; + LiveIntervals *LIS = nullptr; + MachineRegisterInfo *MRI = nullptr; + const TargetInstrInfo *TII = nullptr; }; } // end anonymous namespace @@ -249,7 +249,7 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes, /// Undef use operands are not tracked in the equivalence class, /// but need to be updated if they are tied; take care to only /// update the tied operand. 
- unsigned OperandNo = MI->getOperandNo(&MO); + unsigned OperandNo = MO.getOperandNo(); unsigned TiedIdx = MI->findTiedOperandIdx(OperandNo); MI->getOperand(TiedIdx).setReg(VReg); diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 87b8ac59bdba..57cd1fcffb61 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -229,8 +229,6 @@ void ReplaceWithVeclibLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<LoopAccessLegacyAnalysis>(); - AU.addPreserved<DemandedBitsWrapperPass>(); AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } diff --git a/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp index 0ad6ef84220a..11bdf3bb2ba8 100644 --- a/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp +++ b/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; #define DEBUG_TYPE "reset-machine-function" @@ -68,6 +69,10 @@ namespace { MF.reset(); MF.initTargetMachineFunctionInfo(MF.getSubtarget()); + const LLVMTargetMachine &TM = MF.getTarget(); + // MRI callback for target specific initializations. + TM.registerMachineRegisterInfoCallback(MF); + if (EmitFallbackDiag) { const Function &F = MF.getFunction(); DiagnosticInfoISelFallback DiagFallback(F); diff --git a/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp b/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp index dd70a2f23e45..cc29bdce1210 100644 --- a/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp +++ b/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp @@ -52,12 +52,13 @@ bool MachineSanitizerBinaryMetadata::runOnMachineFunction(MachineFunction &MF) { if (!MD) return false; const auto &Section = *cast<MDString>(MD->getOperand(0)); - if (!Section.getString().equals(kSanitizerBinaryMetadataCoveredSection)) + if (!Section.getString().startswith(kSanitizerBinaryMetadataCoveredSection)) return false; auto &AuxMDs = *cast<MDTuple>(MD->getOperand(1)); // Assume it currently only has features. assert(AuxMDs.getNumOperands() == 1); - auto *Features = cast<ConstantAsMetadata>(AuxMDs.getOperand(0))->getValue(); + Constant *Features = + cast<ConstantAsMetadata>(AuxMDs.getOperand(0))->getValue(); if (!Features->getUniqueInteger()[kSanitizerBinaryMetadataUARBit]) return false; // Calculate size of stack args for the function. @@ -69,12 +70,18 @@ bool MachineSanitizerBinaryMetadata::runOnMachineFunction(MachineFunction &MF) { Align = std::max(Align, MFI.getObjectAlign(i).value()); } Size = (Size + Align - 1) & ~(Align - 1); + if (!Size) + return false; + // Non-zero size, update metadata. auto &F = MF.getFunction(); IRBuilder<> IRB(F.getContext()); MDBuilder MDB(F.getContext()); // Keep the features and append size of stack args to the metadata. 
- F.setMetadata(LLVMContext::MD_pcsections, - MDB.createPCSections( - {{Section.getString(), {Features, IRB.getInt32(Size)}}})); + APInt NewFeatures = Features->getUniqueInteger(); + NewFeatures.setBit(kSanitizerBinaryMetadataUARHasSizeBit); + F.setMetadata( + LLVMContext::MD_pcsections, + MDB.createPCSections({{Section.getString(), + {IRB.getInt(NewFeatures), IRB.getInt32(Size)}}})); return false; } diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp index 696b29018ae6..14ec41920e3e 100644 --- a/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -183,8 +183,6 @@ void SUnit::removePred(const SDep &D) { SUnit *N = D.getSUnit(); SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P); assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); - N->Succs.erase(Succ); - Preds.erase(I); // Update the bookkeeping. if (P.getKind() == SDep::Data) { assert(NumPreds > 0 && "NumPreds will underflow!"); @@ -193,21 +191,25 @@ void SUnit::removePred(const SDep &D) { --N->NumSuccs; } if (!N->isScheduled) { - if (D.isWeak()) + if (D.isWeak()) { + assert(WeakPredsLeft > 0 && "WeakPredsLeft will underflow!"); --WeakPredsLeft; - else { + } else { assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!"); --NumPredsLeft; } } if (!isScheduled) { - if (D.isWeak()) + if (D.isWeak()) { + assert(WeakSuccsLeft > 0 && "WeakSuccsLeft will underflow!"); --N->WeakSuccsLeft; - else { + } else { assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!"); --N->NumSuccsLeft; } } + N->Succs.erase(Succ); + Preds.erase(I); if (P.getLatency() != 0) { this->setDepthDirty(); N->setHeightDirty(); @@ -722,6 +724,8 @@ void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) { bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU, const SUnit *TargetSU) { + assert(TargetSU != nullptr && "Invalid target SUnit"); + assert(SU != nullptr && "Invalid SUnit"); FixOrder(); // If insertion of the edge SU->TargetSU would create a cycle // then there is a path from TargetSU to SU. diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index 1b213e87e75c..239b44857c28 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -208,13 +208,12 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { ExitSU.setInstr(ExitMI); // Add dependencies on the defs and uses of the instruction. if (ExitMI) { - for (const MachineOperand &MO : ExitMI->operands()) { - if (!MO.isReg() || MO.isDef()) continue; + for (const MachineOperand &MO : ExitMI->all_uses()) { Register Reg = MO.getReg(); if (Reg.isPhysical()) { Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); } else if (Reg.isVirtual() && MO.readsReg()) { - addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO)); + addVRegUseDeps(&ExitSU, MO.getOperandNo()); } } } @@ -334,11 +333,11 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { addPhysRegDataDeps(SU, OperIdx); // Clear previous uses and defs of this register and its subergisters. 
- for (MCSubRegIterator SubReg(Reg, TRI, true); SubReg.isValid(); ++SubReg) { - if (Uses.contains(*SubReg)) - Uses.eraseAll(*SubReg); + for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) { + if (Uses.contains(SubReg)) + Uses.eraseAll(SubReg); if (!MO.isDead()) - Defs.eraseAll(*SubReg); + Defs.eraseAll(SubReg); } if (MO.isDead() && SU->isCall) { // Calls will not be reordered because of chain dependencies (see @@ -1026,15 +1025,14 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) { void ScheduleDAGInstrs::Value2SUsMap::dump() { for (const auto &[ValType, SUs] : *this) { - if (ValType.is<const Value*>()) { - const Value *V = ValType.get<const Value*>(); + if (isa<const Value *>(ValType)) { + const Value *V = cast<const Value *>(ValType); if (isa<UndefValue>(V)) dbgs() << "Unknown"; else V->printAsOperand(dbgs()); - } - else if (ValType.is<const PseudoSourceValue*>()) - dbgs() << ValType.get<const PseudoSourceValue*>(); + } else if (isa<const PseudoSourceValue *>(ValType)) + dbgs() << cast<const PseudoSourceValue *>(ValType); else llvm_unreachable("Unknown Value type."); @@ -1522,7 +1520,7 @@ LLVM_DUMP_METHOD void ILPValue::dump() const { namespace llvm { -LLVM_DUMP_METHOD +LLVM_ATTRIBUTE_UNUSED raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) { Val.print(OS); return OS; diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 5fd78eccf732..30d959704745 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -98,15 +99,15 @@ namespace { class SelectOptimize : public FunctionPass { const TargetMachine *TM = nullptr; - const TargetSubtargetInfo *TSI; + const TargetSubtargetInfo *TSI = nullptr; const TargetLowering *TLI = nullptr; const TargetTransformInfo *TTI = nullptr; - const LoopInfo *LI; - DominatorTree *DT; + const LoopInfo *LI = nullptr; + DominatorTree *DT = nullptr; std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - ProfileSummaryInfo *PSI; - OptimizationRemarkEmitter *ORE; + ProfileSummaryInfo *PSI = nullptr; + OptimizationRemarkEmitter *ORE = nullptr; TargetSchedModel TSchedModel; public: diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0a3ebd73d272..de909cc10795 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -30,11 +30,14 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" @@ -57,7 +60,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include 
"llvm/Target/TargetMachine.h" @@ -169,7 +171,8 @@ namespace { /// them) when they are deleted from the underlying DAG. It relies on /// stable indices of nodes within the worklist. DenseMap<SDNode *, unsigned> WorklistMap; - /// This records all nodes attempted to add to the worklist since we + + /// This records all nodes attempted to be added to the worklist since we /// considered a new worklist entry. As we keep do not add duplicate nodes /// in the worklist, this is different from the tail of the worklist. SmallSetVector<SDNode *, 32> PruningList; @@ -262,7 +265,7 @@ namespace { /// Add to the worklist making sure its instance is at the back (next to be /// processed.) - void AddToWorklist(SDNode *N) { + void AddToWorklist(SDNode *N, bool IsCandidateForPruning = true) { assert(N->getOpcode() != ISD::DELETED_NODE && "Deleted Node added to Worklist"); @@ -271,7 +274,8 @@ namespace { if (N->getOpcode() == ISD::HANDLENODE) return; - ConsiderForPruning(N); + if (IsCandidateForPruning) + ConsiderForPruning(N); if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) Worklist.push_back(N); @@ -362,6 +366,11 @@ namespace { SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); + // Looks up the chain to find a unique (unaliased) store feeding the passed + // load. If no such store is found, returns a nullptr. + // Note: This will look past a CALLSEQ_START if the load is chained to it so + // so that it can find stack stores for byval params. + StoreSDNode *getUniqueStoreFeeding(LoadSDNode *LD, int64_t &Offset); // Scalars have size 0 to distinguish from singleton vectors. SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); @@ -417,11 +426,12 @@ namespace { SDValue visitSUBC(SDNode *N); SDValue visitSUBO(SDNode *N); SDValue visitADDE(SDNode *N); - SDValue visitADDCARRY(SDNode *N); + SDValue visitUADDO_CARRY(SDNode *N); SDValue visitSADDO_CARRY(SDNode *N); - SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); + SDValue visitUADDO_CARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, + SDNode *N); SDValue visitSUBE(SDNode *N); - SDValue visitSUBCARRY(SDNode *N); + SDValue visitUSUBO_CARRY(SDNode *N); SDValue visitSSUBO_CARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue visitMULFIX(SDNode *N); @@ -434,6 +444,7 @@ namespace { SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitAVG(SDNode *N); + SDValue visitABD(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitMULO(SDNode *N); @@ -476,10 +487,12 @@ namespace { SDValue visitFREEZE(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); + SDValue visitVP_FADD(SDNode *N); + SDValue visitVP_FSUB(SDNode *N); SDValue visitSTRICT_FADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); - SDValue visitFMA(SDNode *N); + template <class MatchContextClass> SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFSQRT(SDNode *N); @@ -495,6 +508,7 @@ namespace { SDValue visitFABS(SDNode *N); SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); + SDValue visitFFREXP(SDNode *N); SDValue visitFFLOOR(SDNode *N); SDValue visitFMinMax(SDNode *N); SDValue visitBRCOND(SDNode *N); @@ -503,6 +517,7 @@ namespace { SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); SDValue replaceStoreOfFPConstant(StoreSDNode *ST); + SDValue replaceStoreOfInsertLoad(StoreSDNode *ST); bool 
refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N); @@ -527,8 +542,12 @@ namespace { SDValue visitFP_TO_BF16(SDNode *N); SDValue visitVECREDUCE(SDNode *N); SDValue visitVPOp(SDNode *N); + SDValue visitGET_FPENV_MEM(SDNode *N); + SDValue visitSET_FPENV_MEM(SDNode *N); + template <class MatchContextClass> SDValue visitFADDForFMACombine(SDNode *N); + template <class MatchContextClass> SDValue visitFSUBForFMACombine(SDNode *N); SDValue visitFMULForFMADistributiveCombine(SDNode *N); @@ -539,9 +558,12 @@ namespace { SDValue N0, SDValue N1); SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, - SDValue N1); + SDValue N1, SDNodeFlags Flags); SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SDNodeFlags Flags); + SDValue reassociateReduction(unsigned ResOpc, unsigned Opc, const SDLoc &DL, + EVT VT, SDValue N0, SDValue N1, + SDNodeFlags Flags = SDNodeFlags()); SDValue visitShiftByConstant(SDNode *N); @@ -579,11 +601,15 @@ namespace { SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); + SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI); + SDValue CombineExtLoad(SDNode *N); SDValue CombineZExtLogicopShiftLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); + SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); @@ -713,6 +739,11 @@ namespace { SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores); + /// Helper function for mergeConsecutiveStores which checks if all the store + /// nodes have the same underlying object. We can still reuse the first + /// store's pointer info if all the stores are from the same object. + bool hasSameUnderlyingObj(ArrayRef<MemOpLink> StoreNodes); + /// This is a helper function for mergeConsecutiveStores. When the source /// elements of the consecutive stores are all constants or all extracted /// vector elements, try to merge them into one larger store introducing @@ -841,6 +872,138 @@ public: void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } }; +class EmptyMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + +public: + EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) + : DAG(DAG), TLI(TLI) {} + + bool match(SDValue OpN, unsigned Opcode) const { + return Opcode == OpN->getOpcode(); + } + + // Same as SelectionDAG::getNode(). + template <typename... 
ArgT> SDValue getNode(ArgT &&...Args) { + return DAG.getNode(std::forward<ArgT>(Args)...); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); + } +}; + +class VPMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDValue RootMaskOp; + SDValue RootVectorLenOp; + +public: + VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) + : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { + assert(Root->isVPOpcode()); + if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) + RootMaskOp = Root->getOperand(*RootMaskPos); + + if (auto RootVLenPos = + ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) + RootVectorLenOp = Root->getOperand(*RootVLenPos); + } + + /// whether \p OpVal is a node that is functionally compatible with the + /// NodeType \p Opc + bool match(SDValue OpVal, unsigned Opc) const { + if (!OpVal->isVPOpcode()) + return OpVal->getOpcode() == Opc; + + auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), + !OpVal->getFlags().hasNoFPExcept()); + if (BaseOpc != Opc) + return false; + + // Make sure the mask of OpVal is true mask or is same as Root's. + unsigned VPOpcode = OpVal->getOpcode(); + if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { + SDValue MaskOp = OpVal.getOperand(*MaskPos); + if (RootMaskOp != MaskOp && + !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) + return false; + } + + // Make sure the EVL of OpVal is same as Root's. + if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) + if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) + return false; + return true; + } + + // Specialize based on number of operands. + // TODO emit VP intrinsics where MaskOp/VectorLenOp != null + // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return + // DAG.getNode(Opcode, DL, VT); } + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, + {Operand, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, + SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, 
RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); + } +}; + } // end anonymous namespace //===----------------------------------------------------------------------===// @@ -1099,7 +1262,8 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc. SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, - SDValue N0, SDValue N1) { + SDValue N0, SDValue N1, + SDNodeFlags Flags) { EVT VT = N0.getValueType(); if (N0.getOpcode() != Opc) @@ -1118,8 +1282,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, if (TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use - SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1); - return DAG.getNode(Opc, DL, VT, OpNode, N01); + SDNodeFlags NewFlags; + if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && + Flags.hasNoUnsignedWrap()) + NewFlags.setNoUnsignedWrap(true); + SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags); + return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags); } } @@ -1177,13 +1345,32 @@ SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros()) return SDValue(); - if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1)) + if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1, Flags)) return Combined; - if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0)) + if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0, Flags)) return Combined; return SDValue(); } +// Try to fold Opc(vecreduce(x), vecreduce(y)) -> vecreduce(Opc(x, y)) +// Note that we only expect Flags to be passed from FP operations. For integer +// operations they need to be dropped. +SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc, + const SDLoc &DL, EVT VT, SDValue N0, + SDValue N1, SDNodeFlags Flags) { + if (N0.getOpcode() == RedOpc && N1.getOpcode() == RedOpc && + N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && + N0->hasOneUse() && N1->hasOneUse() && + TLI.isOperationLegalOrCustom(Opc, N0.getOperand(0).getValueType()) && + TLI.shouldReassociateReduction(RedOpc, N0.getOperand(0).getValueType())) { + SelectionDAG::FlagInserter FlagsInserter(DAG, Flags); + return DAG.getNode(RedOpc, DL, VT, + DAG.getNode(Opc, DL, N0.getOperand(0).getValueType(), + N0.getOperand(0), N1.getOperand(0))); + } + return SDValue(); +} + SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); @@ -1591,8 +1778,13 @@ void DAGCombiner::Run(CombineLevel AtLevel) { WorklistInserter AddNodes(*this); // Add all the dag nodes to the worklist. 
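The new reassociateReduction helper above folds Opc(vecreduce(x), vecreduce(y)) into vecreduce(Opc(x, y)) when the target finds that profitable. For an associative, commutative Opc such as integer add, the identity behind the fold is easy to check with a scalar stand-in for the reductions:

#include <array>
#include <cassert>
#include <cstddef>
#include <numeric>

int main() {
  std::array<int, 4> X{1, 2, 3, 4}, Y{5, 6, 7, 8}, XY{};
  for (std::size_t I = 0; I != X.size(); ++I)
    XY[I] = X[I] + Y[I]; // Opc(x, y), elementwise
  // add(vecreduce_add(x), vecreduce_add(y)) ...
  int Unfolded = std::accumulate(X.begin(), X.end(), 0) +
                 std::accumulate(Y.begin(), Y.end(), 0);
  // ... equals vecreduce_add(add(x, y)).
  int Folded = std::accumulate(XY.begin(), XY.end(), 0);
  assert(Folded == Unfolded);
  return 0;
}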
@@ -1591,8 +1778,13 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
   WorklistInserter AddNodes(*this);

   // Add all the dag nodes to the worklist.
+  //
+  // Note: Not all nodes are added to PruningList here, because the only
+  // nodes which can be deleted are those which have no uses, and all other
+  // nodes which would otherwise be added to the worklist by the first call
+  // to getNextWorklistEntry are already present in it.
   for (SDNode &Node : DAG.allnodes())
-    AddToWorklist(&Node);
+    AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty());

   // Create a dummy node (which is not added to allnodes), that adds a reference
   // to the root node, preventing it from being deleted, and tracking any
@@ -1627,11 +1819,11 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
     // Add any operands of the new node which have not yet been combined to the
     // worklist as well. Because the worklist uniques things already, this
     // won't repeatedly process the same operand.
-    CombinedNodes.insert(N);
     for (const SDValue &ChildN : N->op_values())
       if (!CombinedNodes.count(ChildN.getNode()))
         AddToWorklist(ChildN.getNode());
+    CombinedNodes.insert(N);

     SDValue RV = combine(N);

     if (!RV.getNode())
@@ -1665,10 +1857,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
     // out), because re-visiting the EntryToken and its users will not uncover
     // any additional opportunities, but there may be a large number of such
     // users, potentially causing compile time explosion.
-    if (RV.getOpcode() != ISD::EntryToken) {
-      AddToWorklist(RV.getNode());
-      AddUsersToWorklist(RV.getNode());
-    }
+    if (RV.getOpcode() != ISD::EntryToken)
+      AddToWorklistWithUsers(RV.getNode());

     // Finally, if the node is now dead, remove it from the graph. The node
     // may not be dead if the replacement process recursively simplified to
@@ -1700,10 +1890,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SSUBO:
   case ISD::USUBO:             return visitSUBO(N);
   case ISD::ADDE:              return visitADDE(N);
-  case ISD::ADDCARRY:          return visitADDCARRY(N);
+  case ISD::UADDO_CARRY:       return visitUADDO_CARRY(N);
   case ISD::SADDO_CARRY:       return visitSADDO_CARRY(N);
   case ISD::SUBE:              return visitSUBE(N);
-  case ISD::SUBCARRY:          return visitSUBCARRY(N);
+  case ISD::USUBO_CARRY:       return visitUSUBO_CARRY(N);
   case ISD::SSUBO_CARRY:       return visitSSUBO_CARRY(N);
   case ISD::SMULFIX:
   case ISD::SMULFIXSAT:
@@ -1720,6 +1910,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::AVGFLOORU:
   case ISD::AVGCEILS:
   case ISD::AVGCEILU:          return visitAVG(N);
+  case ISD::ABDS:
+  case ISD::ABDU:              return visitABD(N);
   case ISD::SMUL_LOHI:         return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:         return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -1770,7 +1962,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::STRICT_FADD:       return visitSTRICT_FADD(N);
   case ISD::FSUB:              return visitFSUB(N);
   case ISD::FMUL:              return visitFMUL(N);
-  case ISD::FMA:               return visitFMA(N);
+  case ISD::FMA:               return visitFMA<EmptyMatchContext>(N);
   case ISD::FDIV:              return visitFDIV(N);
   case ISD::FREM:              return visitFREM(N);
   case ISD::FSQRT:             return visitFSQRT(N);
@@ -1791,6 +1983,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FMAXIMUM:          return visitFMinMax(N);
   case ISD::FCEIL:             return visitFCEIL(N);
   case ISD::FTRUNC:            return visitFTRUNC(N);
+  case ISD::FFREXP:            return visitFFREXP(N);
   case ISD::BRCOND:            return visitBRCOND(N);
   case ISD::BR_CC:             return visitBR_CC(N);
   case ISD::LOAD:              return visitLOAD(N);
@@ -1812,6 +2005,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FP16_TO_FP:        return visitFP16_TO_FP(N);
   case ISD::FP_TO_BF16:        return visitFP_TO_BF16(N);
   case ISD::FREEZE:            return visitFREEZE(N);
+  case ISD::GET_FPENV_MEM:     return visitGET_FPENV_MEM(N);
+  case ISD::SET_FPENV_MEM:     return visitSET_FPENV_MEM(N);
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_FMUL:
   case ISD::VECREDUCE_ADD:
@@ -1824,7 +2019,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:
   case ISD::VECREDUCE_FMAX:
-  case ISD::VECREDUCE_FMIN:    return visitVECREDUCE(N);
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM: return visitVECREDUCE(N);
 #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
 #include "llvm/IR/VPIntrinsics.def"
     return visitVPOp(N);
@@ -2131,6 +2328,39 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
   return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
 }

+// isTruncateOf - If N is a truncate of some other value, return true, record
+// the value being truncated in Op and which of Op's bits are zero/one in
+// Known. This function computes KnownBits to avoid a duplicated call to
+// computeKnownBits in the caller.
+static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
+                         KnownBits &Known) {
+  if (N->getOpcode() == ISD::TRUNCATE) {
+    Op = N->getOperand(0);
+    Known = DAG.computeKnownBits(Op);
+    return true;
+  }
+
+  if (N.getOpcode() != ISD::SETCC ||
+      N.getValueType().getScalarType() != MVT::i1 ||
+      cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
+    return false;
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  assert(Op0.getValueType() == Op1.getValueType());
+
+  if (isNullOrNullSplat(Op0))
+    Op = Op1;
+  else if (isNullOrNullSplat(Op1))
+    Op = Op0;
+  else
+    return false;
+
+  Known = DAG.computeKnownBits(Op);
+
+  return (Known.Zero | 1).isAllOnes();
+}
+
 /// Return true if 'Use' is a load or a store that uses N as its base pointer
 /// and that N may be folded in the load / store addressing mode.
 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
@@ -2206,11 +2436,12 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
   if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
     return SDValue();

-  // We can't hoist div/rem because of immediate UB (not speculatable).
-  unsigned Opcode = N->getOpcode();
-  if (!DAG.isSafeToSpeculativelyExecute(Opcode))
+  // We can't hoist all instructions because of immediate UB (not speculatable).
+  // For example div/rem by zero.
+  if (!DAG.isSafeToSpeculativelyExecuteNode(N))
     return SDValue();

+  unsigned Opcode = N->getOpcode();
   EVT VT = N->getValueType(0);
   SDValue Cond = N1.getOperand(0);
   SDValue TVal = N1.getOperand(1);
@@ -2258,6 +2489,17 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
   if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
     SelOpNo = 1;
     Sel = BO->getOperand(1);
+
+    // Peek through trunc to shift amount type.
+    if ((BinOpcode == ISD::SHL || BinOpcode == ISD::SRA ||
+         BinOpcode == ISD::SRL) &&
+        Sel.hasOneUse()) {
+      // This is valid when the truncated bits of x are already zero.
+      SDValue Op;
+      KnownBits Known;
+      if (isTruncateOf(DAG, Sel, Op, Known) &&
+          Known.countMaxActiveBits() < Sel.getScalarValueSizeInBits())
+        Sel = Op;
+    }
   }

   if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
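isTruncateOf also accepts (setcc ne X, 0) as a truncate to i1 when known bits prove X can only be 0 or 1; the final test (Known.Zero | 1).isAllOnes() states exactly that every bit above bit 0 is known zero. The same test in plain C++, with KnownZero standing in for Known.Zero:

  #include <cassert>
  #include <cstdint>

  // KnownZero has a 1 for every bit proven to be 0. A value behaves like a
  // truncate to i1 iff every bit above bit 0 is known zero.
  bool fitsInI1(uint64_t KnownZero) {
    return (KnownZero | 1) == ~0ull; // mirrors (Known.Zero | 1).isAllOnes()
  }

  int main() {
    assert(fitsInI1(~1ull));  // only bit 0 unknown: OK
    assert(!fitsInI1(~3ull)); // bit 1 also unknown: not a trunc-to-i1
  }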
@@ -2310,18 +2552,14 @@ SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
     // constant. Eliminate the binop by pulling the constant math into the
     // select. Example: add (select Cond, CT, CF), CBO --> select Cond, CT +
     // CBO, CF + CBO
-    NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
-                    : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
-    if (!CanFoldNonConst && !NewCT.isUndef() &&
-        !isConstantOrConstantVector(NewCT, true) &&
-        !DAG.isConstantFPBuildVectorOrConstantFP(NewCT))
+    NewCT = SelOpNo ? DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CBO, CT})
+                    : DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CT, CBO});
+    if (!NewCT)
       return SDValue();

-    NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
-                    : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
-    if (!CanFoldNonConst && !NewCF.isUndef() &&
-        !isConstantOrConstantVector(NewCF, true) &&
-        !DAG.isConstantFPBuildVectorOrConstantFP(NewCF))
+    NewCF = SelOpNo ? DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CBO, CF})
+                    : DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CF, CBO});
+    if (!NewCF)
       return SDValue();
   }
@@ -2420,6 +2658,12 @@ static bool isADDLike(SDValue V, const SelectionDAG &DAG) {
   return false;
 }

+static bool
+areBitwiseNotOfEachother(SDValue Op0, SDValue Op1) {
+  return (isBitwiseNot(Op0) && Op0.getOperand(0) == Op1) ||
+         (isBitwiseNot(Op1) && Op1.getOperand(0) == Op0);
+}
+
 /// Try to fold a node that behaves like an ADD (note that N isn't necessarily
 /// an ISD::ADD here, it could for example be an ISD::OR if we know that there
 /// are no common bits set in the operands).
@@ -2444,6 +2688,10 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::ADD, DL, VT, N1, N0);

+  if (areBitwiseNotOfEachother(N0, N1))
+    return DAG.getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()),
+                           SDLoc(N), VT);
+
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
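The new areBitwiseNotOfEachother fold in visitADDLike returns all-ones because x + ~x sets every bit: ~x == -x - 1 in two's complement. A standalone check (plain C++, not part of the patch), which also covers the matching visitAND fold that appears later in this diff:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X : {0u, 1u, 0xDEADBEEFu, ~0u}) {
      assert(X + ~X == ~0u); // add(x, not(x)) -> all-ones
      assert((X & ~X) == 0u); // and(x, not(x)) -> 0
    }
  }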
@@ -2509,12 +2757,22 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   // equivalent to (add x, c).
   // Reassociate (add (xor x, c), y) -> (add add(x, y), c)) if (xor x, c) is
   // equivalent to (add x, c).
+  // Do this optimization only when adding c does not introduce instructions
+  // for adding carries.
   auto ReassociateAddOr = [&](SDValue N0, SDValue N1) {
     if (isADDLike(N0, DAG) && N0.hasOneUse() &&
         isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
-      return DAG.getNode(ISD::ADD, DL, VT,
-                         DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
-                         N0.getOperand(1));
+      // If N0's type does not split, or the constant is a sign mask, this
+      // does not introduce an add carry.
+      auto TyActn = TLI.getTypeAction(*DAG.getContext(), N0.getValueType());
+      bool NoAddCarry = TyActn == TargetLoweringBase::TypeLegal ||
+                        TyActn == TargetLoweringBase::TypePromoteInteger ||
+                        isMinSignedConstant(N0.getOperand(1));
+      if (NoAddCarry)
+        return DAG.getNode(
+            ISD::ADD, DL, VT,
+            DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
+            N0.getOperand(1));
     }
     return SDValue();
   };
@@ -2522,6 +2780,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
     return Add;
   if (SDValue Add = ReassociateAddOr(N1, N0))
     return Add;
+
+    // Fold add(vecreduce(x), vecreduce(y)) -> vecreduce(add(x, y))
+    if (SDValue SD =
+            reassociateReduction(ISD::VECREDUCE_ADD, ISD::ADD, DL, VT, N0, N1))
+      return SD;
   }

   // fold ((0-A) + B) -> B-A
   if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
@@ -2626,7 +2889,10 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
   // And if the target does not like this form then turn into:
   //   sub y, (xor x, -1)
   if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
-      N0.hasOneUse()) {
+      N0.hasOneUse() &&
+      // Limit this to after legalization if the add has wrap flags
+      (Level >= AfterLegalizeDAG || (!N->getFlags().hasNoUnsignedWrap() &&
+                                     !N->getFlags().hasNoSignedWrap()))) {
     SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
                               DAG.getAllOnesConstant(DL, VT));
     return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
@@ -2714,6 +2980,7 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
+  bool IsSigned = Opcode == ISD::SADDSAT;
   SDLoc DL(N);

   // fold (add_sat x, undef) -> -1
@@ -2744,14 +3011,14 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) {
     return N0;

   // If it cannot overflow, transform into an add.
-  if (Opcode == ISD::UADDSAT)
-    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
-      return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
+  if (DAG.computeOverflowForAdd(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);

   return SDValue();
 }

-static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
+static SDValue getAsCarry(const TargetLowering &TLI, SDValue V,
+                          bool ForceCarryReconstruction = false) {
   bool Masked = false;

   // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
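Setting the wrap-flag guard aside, the inc-of-add to sub-of-not rewrite itself uses ~x == -x - 1, hence y - ~x == x + y + 1, so (add (add x, 1), y) becomes (sub y, (xor x, -1)). A quick standalone check:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0x12345678, Y = 42;
    // (add (add x, 1), y) and (sub y, (xor x, -1)) compute the same value.
    assert(X + 1 + Y == Y - (X ^ ~0u));
  }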
@@ -2762,11 +3029,17 @@ static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
   }

   if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
+    if (ForceCarryReconstruction)
+      return V;
+
     Masked = true;
     V = V.getOperand(0);
     continue;
   }

+  if (ForceCarryReconstruction && V.getValueType() == MVT::i1)
+    return V;
+
   break;
 }

@@ -2774,7 +3047,7 @@ static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
   if (V.getResNo() != 1)
     return SDValue();

-  if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
+  if (V.getOpcode() != ISD::UADDO_CARRY && V.getOpcode() != ISD::USUBO_CARRY &&
       V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
     return SDValue();

@@ -2842,7 +3115,10 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
   // And if the target does not like this form then turn into:
   //   sub y, (xor x, -1)
   if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
-      N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1))) {
+      N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1)) &&
+      // Limit this to after legalization if the add has wrap flags
+      (Level >= AfterLegalizeDAG || (!N0->getFlags().hasNoUnsignedWrap() &&
+                                     !N0->getFlags().hasNoSignedWrap()))) {
     SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
                               DAG.getAllOnesConstant(DL, VT));
     return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
@@ -2864,6 +3140,15 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
     }
   }

+  // add (mul x, C), x -> mul x, C+1
+  if (N0.getOpcode() == ISD::MUL && N0.getOperand(0) == N1 &&
+      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true) &&
+      N0.hasOneUse()) {
+    SDValue NewC = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
+                               DAG.getConstant(1, DL, VT));
+    return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), NewC);
+  }
+
   // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
   // rather than 'add 0/-1' (the zext should get folded).
   // add (sext i1 Y), X --> sub X, (zext i1 Y)
@@ -2884,16 +3169,16 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
     }
   }

-  // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
-  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
+  // (add X, (uaddo_carry Y, 0, Carry)) -> (uaddo_carry X, Y, Carry)
+  if (N1.getOpcode() == ISD::UADDO_CARRY && isNullConstant(N1.getOperand(1)) &&
       N1.getResNo() == 0)
-    return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
+    return DAG.getNode(ISD::UADDO_CARRY, DL, N1->getVTList(),
                        N0, N1.getOperand(0), N1.getOperand(2));

-  // (add X, Carry) -> (addcarry X, 0, Carry)
-  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
+  // (add X, Carry) -> (uaddo_carry X, 0, Carry)
+  if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT))
     if (SDValue Carry = getAsCarry(TLI, N1))
-      return DAG.getNode(ISD::ADDCARRY, DL,
+      return DAG.getNode(ISD::UADDO_CARRY, DL,
                          DAG.getVTList(VT, Carry.getValueType()), N0,
                          DAG.getConstant(0, DL, VT), Carry);
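The add (mul x, C), x -> mul x, C+1 fold is plain distribution, and it stays valid under unsigned wraparound. A quick standalone check:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X : {0u, 7u, 0xFFFFFFF0u})
      for (uint32_t C : {1u, 5u, ~0u})
        assert(X * C + X == X * (C + 1)); // holds modulo 2^32, wrap included
  }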
@@ -2923,7 +3208,7 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
                                         DL, MVT::Glue));

   // If it cannot overflow, transform into an add.
-  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+  if (DAG.computeOverflowForUnsignedAdd(N0, N1) == SelectionDAG::OFK_Never)
     return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                      DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

@@ -2995,12 +3280,12 @@ SDValue DAGCombiner::visitADDO(SDNode *N) {
   if (isNullOrNullSplat(N1))
     return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

-  if (!IsSigned) {
-    // If it cannot overflow, transform into an add.
-    if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
-      return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
-                       DAG.getConstant(0, DL, CarryVT));
+  // If it cannot overflow, transform into an add.
+  if (DAG.computeOverflowForAdd(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                     DAG.getConstant(0, DL, CarryVT));

+  if (!IsSigned) {
     // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
     if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
       SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
@@ -3024,20 +3309,20 @@ SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
   if (VT.isVector())
     return SDValue();

-  // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
+  // (uaddo X, (uaddo_carry Y, 0, Carry)) -> (uaddo_carry X, Y, Carry)
   // If Y + 1 cannot overflow.
-  if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
+  if (N1.getOpcode() == ISD::UADDO_CARRY && isNullConstant(N1.getOperand(1))) {
     SDValue Y = N1.getOperand(0);
     SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
-    if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
-      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
+    if (DAG.computeOverflowForUnsignedAdd(Y, One) == SelectionDAG::OFK_Never)
+      return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(), N0, Y,
                          N1.getOperand(2));
   }

-  // (uaddo X, Carry) -> (addcarry X, 0, Carry)
-  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
+  // (uaddo X, Carry) -> (uaddo_carry X, 0, Carry)
+  if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT))
     if (SDValue Carry = getAsCarry(TLI, N1))
-      return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
+      return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(), N0,
                          DAG.getConstant(0, SDLoc(N), VT), Carry);

   return SDValue();
@@ -3062,7 +3347,7 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
   return SDValue();
 }

-SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
+SDValue DAGCombiner::visitUADDO_CARRY(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);
@@ -3072,16 +3357,16 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
-    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
+    return DAG.getNode(ISD::UADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);

-  // fold (addcarry x, y, false) -> (uaddo x, y)
+  // fold (uaddo_carry x, y, false) -> (uaddo x, y)
   if (isNullConstant(CarryIn)) {
     if (!LegalOperations ||
         TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
       return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
   }

-  // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
+  // fold (uaddo_carry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
   if (isNullConstant(N0) && isNullConstant(N1)) {
     EVT VT = N0.getValueType();
     EVT CarryVT = CarryIn.getValueType();
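UADDO_CARRY (the renamed ADDCARRY) produces a sum plus a carry-out from two operands and a carry-in. A portable sketch of its scalar semantics, illustrative only and not the DAG node itself:

  #include <cassert>
  #include <cstdint>

  // {Sum, CarryOut} = uaddo_carry(A, B, CarryIn)
  struct AddCarry { uint32_t Sum; bool CarryOut; };

  AddCarry uaddoCarry(uint32_t A, uint32_t B, bool CarryIn) {
    uint32_t S = A + B;
    bool C1 = S < A;      // carry from A + B
    uint32_t R = S + CarryIn;
    bool C2 = R < S;      // carry from adding the carry-in
    return {R, C1 || C2}; // at most one of C1/C2 can be set
  }

  int main() {
    AddCarry R = uaddoCarry(~0u, 0u, true);
    assert(R.Sum == 0u && R.CarryOut);
  }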
@@ -3092,73 +3377,52 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
                        DAG.getConstant(0, DL, CarryVT));
   }

-  if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
+  if (SDValue Combined = visitUADDO_CARRYLike(N0, N1, CarryIn, N))
     return Combined;

-  if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
+  if (SDValue Combined = visitUADDO_CARRYLike(N1, N0, CarryIn, N))
     return Combined;

   // We want to avoid useless duplication.
-  // TODO: This is done automatically for binary operations. As ADDCARRY is
-  // not a binary operation, this is not really possible to leverage this
-  // existing mechanism for it. However, if more operations require the same
-  // deduplication logic, then it may be worth generalize.
+  // TODO: This is done automatically for binary operations. As UADDO_CARRY is
+  // not a binary operation, it is not possible to leverage this existing
+  // mechanism for it. However, if more operations require the same
+  // deduplication logic, then it may be worth generalizing.
   SDValue Ops[] = {N1, N0, CarryIn};
   SDNode *CSENode =
-      DAG.getNodeIfExists(ISD::ADDCARRY, N->getVTList(), Ops, N->getFlags());
+      DAG.getNodeIfExists(ISD::UADDO_CARRY, N->getVTList(), Ops, N->getFlags());
   if (CSENode)
     return SDValue(CSENode, 0);

   return SDValue();
 }

-SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDValue CarryIn = N->getOperand(2);
-  SDLoc DL(N);
-
-  // canonicalize constant to RHS
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  if (N0C && !N1C)
-    return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
-
-  // fold (saddo_carry x, y, false) -> (saddo x, y)
-  if (isNullConstant(CarryIn)) {
-    if (!LegalOperations ||
-        TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
-      return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
-  }
-
-  return SDValue();
-}
-
 /**
  * If we are facing some sort of diamond carry propagation pattern try to
  * break it up to generate something like:
- *   (addcarry X, 0, (addcarry A, B, Z):Carry)
+ *   (uaddo_carry X, 0, (uaddo_carry A, B, Z):Carry)
  *
  * The end result is usually an increase in operations required, but because
  * the carry is now linearized, other transforms can kick in and optimize
  * the DAG.
 *
 * Patterns typically look something like
- *            (uaddo A, B)
- *             /       \
- *          Carry      Sum
- *            |          \
- *            | (addcarry *, 0, Z)
- *            |       /
- *             \   Carry
- *              |   /
- *   (addcarry X, *, *)
+ *                (uaddo A, B)
+ *                /          \
+ *             Carry         Sum
+ *               |             \
+ *               | (uaddo_carry *, 0, Z)
+ *               |       /
+ *                \   Carry
+ *                 |   /
+ *      (uaddo_carry X, *, *)
 *
 * But numerous variations exist. Our goal is to identify A, B, X and Z and
 * produce a combine with a single path for carry propagation.
 */
-static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
-                                      SDValue X, SDValue Carry0, SDValue Carry1,
-                                      SDNode *N) {
+static SDValue combineUADDO_CARRYDiamond(DAGCombiner &Combiner,
+                                         SelectionDAG &DAG, SDValue X,
+                                         SDValue Carry0, SDValue Carry1,
+                                         SDNode *N) {
   if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
     return SDValue();
   if (Carry1.getOpcode() != ISD::UADDO)
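The diamond-breaking transform matters most for multi-word arithmetic, where each limb addition should consume exactly one carry from the previous limb, a single linear chain, as in this standalone sketch:

  #include <cassert>
  #include <cstdint>

  // 96-bit add as three 32-bit limbs: one linear carry chain, each limb's
  // carry-out feeding the next limb (the shape the combine tries to expose).
  void add96(const uint32_t A[3], const uint32_t B[3], uint32_t Out[3]) {
    bool Carry = false;
    for (int I = 0; I < 3; ++I) {
      uint64_t S = (uint64_t)A[I] + B[I] + Carry;
      Out[I] = (uint32_t)S;
      Carry = (S >> 32) != 0;
    }
  }

  int main() {
    uint32_t A[3] = {~0u, ~0u, 0}, B[3] = {1, 0, 0}, R[3];
    add96(A, B, R);
    assert(R[0] == 0 && R[1] == 0 && R[2] == 1); // carry rippled through
  }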
@@ -3168,9 +3432,9 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,

   /**
    * First look for a suitable Z. It will present itself in the form of
-   * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
+   * (uaddo_carry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
    */
-  if (Carry0.getOpcode() == ISD::ADDCARRY &&
+  if (Carry0.getOpcode() == ISD::UADDO_CARRY &&
       isNullConstant(Carry0.getOperand(1))) {
     Z = Carry0.getOperand(2);
   } else if (Carry0.getOpcode() == ISD::UADDO &&
@@ -3185,26 +3449,27 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,

   auto cancelDiamond = [&](SDValue A, SDValue B) {
     SDLoc DL(N);
-    SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
+    SDValue NewY =
+        DAG.getNode(ISD::UADDO_CARRY, DL, Carry0->getVTList(), A, B, Z);
     Combiner.AddToWorklist(NewY.getNode());
-    return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
+    return DAG.getNode(ISD::UADDO_CARRY, DL, N->getVTList(), X,
                        DAG.getConstant(0, DL, X.getValueType()),
                        NewY.getValue(1));
   };

   /**
-   *       (uaddo A, B)
-   *            |
-   *           Sum
-   *            |
-   * (addcarry *, 0, Z)
+   *       (uaddo A, B)
+   *            |
+   *           Sum
+   *            |
+   * (uaddo_carry *, 0, Z)
    */
   if (Carry0.getOperand(0) == Carry1.getValue(0)) {
     return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
   }

   /**
-   * (addcarry A, 0, Z)
+   * (uaddo_carry A, 0, Z)
    *         |
    *        Sum
    *         |
@@ -3241,12 +3506,12 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
 //      |    /
 //   CarryOut = (or *, *)
 //
-// And generate ADDCARRY (or SUBCARRY) with two result values:
+// And generate UADDO_CARRY (or USUBO_CARRY) with two result values:
 //
-//    {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
+//    {AddCarrySum, CarryOut} = (uaddo_carry A, B, CarryIn)
 //
-// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
-// a single path for carry/borrow out propagation:
+// Our goal is to identify A, B, and CarryIn and produce UADDO_CARRY/USUBO_CARRY
+// with a single path for carry/borrow out propagation.
 static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
                                    SDValue N0, SDValue N1, SDNode *N) {
   SDValue Carry0 = getAsCarry(TLI, N0);
@@ -3279,16 +3544,13 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
     return SDValue();
   SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
-  unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
+  unsigned NewOp = Opcode == ISD::UADDO ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
   if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
     return SDValue();

   // Verify that the carry/borrow in is plausibly a carry/borrow bit.
-  // TODO: make getAsCarry() aware of how partial carries are merged.
-  if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
-    return SDValue();
-  CarryIn = CarryIn.getOperand(0);
-  if (CarryIn.getValueType() != MVT::i1)
+  CarryIn = getAsCarry(TLI, CarryIn, true);
+  if (!CarryIn)
     return SDValue();

   SDLoc DL(N);
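The xor-to-usubo_carry fold a few hunks below rests on ~a == -a - 1, so b + ~a + c == b - a - (1 - c); the borrow out is then the logical complement of the carry out. A quick scalar check of the value identity:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t A = 0xCAFEBABE, B = 1234;
    for (uint32_t C : {0u, 1u})
      // (uaddo_carry (xor a, -1), b, c) computes b - a - !c modulo 2^32.
      assert((A ^ ~0u) + B + C == B - A - (1u - C));
  }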
@@ -3315,45 +3577,68 @@ static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
   return Merged.getValue(1);
 }

-SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
-                                       SDNode *N) {
-  // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
+SDValue DAGCombiner::visitUADDO_CARRYLike(SDValue N0, SDValue N1,
+                                          SDValue CarryIn, SDNode *N) {
+  // fold (uaddo_carry (xor a, -1), b, c) -> (usubo_carry b, a, !c) and flip
+  // carry.
   if (isBitwiseNot(N0))
     if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
       SDLoc DL(N);
-      SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
+      SDValue Sub = DAG.getNode(ISD::USUBO_CARRY, DL, N->getVTList(), N1,
                                 N0.getOperand(0), NotC);
       return CombineTo(
           N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
     }

   // Iff the flag result is dead:
-  // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
+  // (uaddo_carry (add|uaddo X, Y), 0, Carry) -> (uaddo_carry X, Y, Carry)
   // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
   // or the dependency between the instructions.
   if ((N0.getOpcode() == ISD::ADD ||
        (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
         N0.getValue(1) != CarryIn)) &&
       isNullConstant(N1) && !N->hasAnyUseOfValue(1))
-    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
+    return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(),
                        N0.getOperand(0), N0.getOperand(1), CarryIn);

   /**
-   * When one of the addcarry argument is itself a carry, we may be facing
+   * When one of the uaddo_carry arguments is itself a carry, we may be facing
    * a diamond carry propagation. In which case we try to transform the DAG
    * to ensure linear carry propagation if that is possible.
    */
   if (auto Y = getAsCarry(TLI, N1)) {
     // Because both are carries, Y and Z can be swapped.
-    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
+    if (auto R = combineUADDO_CARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
       return R;
-    if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
+    if (auto R = combineUADDO_CARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
       return R;
   }

   return SDValue();
 }

+SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue CarryIn = N->getOperand(2);
+  SDLoc DL(N);
+
+  // canonicalize constant to RHS
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
+
+  // fold (saddo_carry x, y, false) -> (saddo x, y)
+  if (isNullConstant(CarryIn)) {
+    if (!LegalOperations ||
+        TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
+      return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
+  }
+
+  return SDValue();
+}
+
 // Attempt to create a USUBSAT(LHS, RHS) node with DstVT, performing a
 // clamp/truncation if necessary.
 static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
@@ -3720,11 +4005,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {

   // If the relocation model supports it, consider symbol offsets.
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
     if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
-      // fold (sub Sym, c) -> Sym-c
-      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
-        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
-                                    GA->getOffset() -
-                                        (uint64_t)N1C->getSExtValue());
       // fold (sub Sym+c1, Sym+c2) -> c1-c2
       if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
         if (GA->getGlobal() == GB->getGlobal())
@@ -3776,19 +4056,19 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
   }

-  // (sub (subcarry X, 0, Carry), Y) -> (subcarry X, Y, Carry)
-  if (N0.getOpcode() == ISD::SUBCARRY && isNullConstant(N0.getOperand(1)) &&
+  // (sub (usubo_carry X, 0, Carry), Y) -> (usubo_carry X, Y, Carry)
+  if (N0.getOpcode() == ISD::USUBO_CARRY && isNullConstant(N0.getOperand(1)) &&
       N0.getResNo() == 0 && N0.hasOneUse())
-    return DAG.getNode(ISD::SUBCARRY, DL, N0->getVTList(),
+    return DAG.getNode(ISD::USUBO_CARRY, DL, N0->getVTList(),
                        N0.getOperand(0), N1, N0.getOperand(2));

-  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
-    // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
+  if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT)) {
+    // (sub Carry, X) -> (uaddo_carry (sub 0, X), 0, Carry)
     if (SDValue Carry = getAsCarry(TLI, N0)) {
       SDValue X = N1;
       SDValue Zero = DAG.getConstant(0, DL, VT);
       SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
-      return DAG.getNode(ISD::ADDCARRY, DL,
+      return DAG.getNode(ISD::UADDO_CARRY, DL,
                          DAG.getVTList(VT, Carry.getValueType()),
                          NegX, Zero, Carry);
     }
@@ -3814,7 +4094,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
         (N0.getOperand(0) != N1.getOperand(1) ||
          N0.getOperand(1) != N1.getOperand(0)))
       return SDValue();
-    if (!TLI.isOperationLegalOrCustom(Abd, VT))
+    if (!hasOperation(Abd, VT))
       return SDValue();
     return DAG.getNode(Abd, DL, VT, N0.getOperand(0), N0.getOperand(1));
   };
@@ -3827,9 +4107,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
 }

 SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
+  bool IsSigned = Opcode == ISD::SSUBSAT;
   SDLoc DL(N);

   // fold (sub_sat x, undef) -> 0
@@ -3841,7 +4123,7 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
     return DAG.getConstant(0, DL, VT);

   // fold (sub_sat c1, c2) -> c3
-  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
     return C;

   // fold vector ops
@@ -3858,6 +4140,10 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
   if (isNullConstant(N1))
     return N0;

+  // If it cannot overflow, transform into a sub.
+  if (DAG.computeOverflowForSub(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+    return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
+
   return SDValue();
 }

@@ -3911,7 +4197,7 @@ SDValue DAGCombiner::visitSUBO(SDNode *N) {
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

   // fold (subo x, c) -> (addo x, -c)
-  if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
+  if (IsSigned && N1C && !N1C->isMinSignedValue()) {
     return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
                        DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
   }
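The guard on isMinSignedValue above exists because negating INT_MIN wraps back to INT_MIN itself, so (subo x, INT_MIN) cannot be rewritten as (addo x, -INT_MIN). A standalone check:

  #include <cassert>
  #include <cstdint>
  #include <limits>

  int main() {
    int32_t Min = std::numeric_limits<int32_t>::min();
    // Two's-complement negation of INT_MIN yields INT_MIN again, so
    // sub x, c -> add x, -c would be wrong for c == INT_MIN.
    assert(static_cast<int32_t>(0u - static_cast<uint32_t>(Min)) == Min);
  }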
@@ -3920,6 +4206,11 @@ SDValue DAGCombiner::visitSUBO(SDNode *N) {
   if (isNullOrNullSplat(N1))
     return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

+  // If it cannot overflow, transform into a sub.
+  if (DAG.computeOverflowForSub(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+                     DAG.getConstant(0, DL, CarryVT));
+
   // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
   if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
     return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
@@ -3940,12 +4231,12 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) {
   return SDValue();
 }

-SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
+SDValue DAGCombiner::visitUSUBO_CARRY(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);

-  // fold (subcarry x, y, false) -> (usubo x, y)
+  // fold (usubo_carry x, y, false) -> (usubo x, y)
   if (isNullConstant(CarryIn)) {
     if (!LegalOperations ||
         TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
@@ -4062,13 +4353,14 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
   if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
     unsigned Log2Val = (-ConstValue1).logBase2();
+    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+
     // FIXME: If the input is something that is easily negated (e.g. a
     // single-use add), we should put the negate there.
     return DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, VT),
                        DAG.getNode(ISD::SHL, DL, VT, N0,
-                                   DAG.getConstant(Log2Val, DL,
-                                                   getShiftAmountTy(N0.getValueType()))));
+                                   DAG.getConstant(Log2Val, DL, ShiftVT)));
   }

   // Attempt to reuse an existing umul_lohi/smul_lohi node, but only if the
@@ -4108,7 +4400,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     unsigned MathOp = ISD::DELETED_NODE;
     APInt MulC = ConstValue1.abs();
     // The constant `2` should be treated as (2^0 + 1).
-    unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros();
+    unsigned TZeros = MulC == 2 ? 0 : MulC.countr_zero();
     MulC.lshrInPlace(TZeros);
     if ((MulC - 1).isPowerOf2())
       MathOp = ISD::ADD;
@@ -4163,8 +4455,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   }

   // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
-  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
-      N0.getOpcode() == ISD::ADD &&
+  if (N0.getOpcode() == ISD::ADD &&
+      DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
       DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
       isMulAddWithConstProfitable(N, N0, N1))
     return DAG.getNode(
@@ -4223,6 +4515,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
     return RMUL;

+  // Fold mul(vecreduce(x), vecreduce(y)) -> vecreduce(mul(x, y))
+  if (SDValue SD =
+          reassociateReduction(ISD::VECREDUCE_MUL, ISD::MUL, DL, VT, N0, N1))
+    return SD;
+
   // Simplify the operands using demanded-bits information.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
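The MathOp selection above decomposes a multiply by (2^N + 1) * 2^T or (2^N - 1) * 2^T into shift plus add/sub. A standalone check of the identities involved:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0x1234;
    // MulC = 5 = 2^2 + 1  ->  (X << 2) + X
    assert(X * 5u == (X << 2) + X);
    // MulC = 7 = 2^3 - 1  ->  (X << 3) - X
    assert(X * 7u == (X << 3) - X);
    // MulC = 20 = 5 << 2: strip trailing zeros first, shift back at the end.
    assert(X * 20u == (((X << 2) + X) << 2));
  }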
@@ -4386,7 +4683,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
     return DAG.getNegative(N0, DL, VT);

   // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
-  if (N1C && N1C->getAPIntValue().isMinSignedValue())
+  if (N1C && N1C->isMinSignedValue())
     return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                          DAG.getConstant(1, DL, VT),
                          DAG.getConstant(0, DL, VT));
@@ -4886,11 +5183,57 @@ SDValue DAGCombiner::visitAVG(SDNode *N) {
   if (N1.isUndef())
     return N0;

+  // Fold (avg x, x) --> x
+  if (N0 == N1 && Level >= AfterLegalizeTypes)
+    return N0;
+
   // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1

   return SDValue();
 }

+SDValue DAGCombiner::visitABD(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (abd c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (abds x, 0) -> abs x
+    // fold (abdu x, 0) -> x
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::ABDS)
+        return DAG.getNode(ISD::ABS, DL, VT, N0);
+      if (Opcode == ISD::ABDU)
+        return N0;
+    }
+  }
+
+  // fold (abd x, undef) -> 0
+  if (N0.isUndef() || N1.isUndef())
+    return DAG.getConstant(0, DL, VT);
+
+  // fold (abds x, y) -> (abdu x, y) iff both args are known positive
+  if (Opcode == ISD::ABDS && hasOperation(ISD::ABDU, VT) &&
+      DAG.SignBitIsZero(N0) && DAG.SignBitIsZero(N1))
+    return DAG.getNode(ISD::ABDU, DL, VT, N1, N0);
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
@@ -5108,7 +5451,7 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
 // same as SimplifySelectCC. N0<N1 ? N2 : N3.
 static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
                                   SDValue N3, ISD::CondCode CC, unsigned &BW,
-                                  bool &Unsigned) {
+                                  bool &Unsigned, SelectionDAG &DAG) {
   auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3,
                             ISD::CondCode CC) {
     // The compare and select operand should be the same or the select operands
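ABDS/ABDU compute the absolute difference |x - y| under signed and unsigned interpretation respectively; when both sign bits are known zero the two agree, which is what the abds-to-abdu fold uses. A scalar sketch:

  #include <cassert>
  #include <cstdint>

  uint32_t abdu(uint32_t X, uint32_t Y) { return X > Y ? X - Y : Y - X; }
  int32_t abds(int32_t X, int32_t Y) { return X > Y ? X - Y : Y - X; }

  int main() {
    assert(abdu(3, 10) == 7);
    assert(abds(-3, 4) == 7);
    // With both sign bits clear, the signed and unsigned forms agree.
    int32_t A = 100, B = 250;
    assert(abds(A, B) == (int32_t)abdu((uint32_t)A, (uint32_t)B));
  }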
@@ -5132,6 +5475,26 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
   if (!Opcode0)
     return SDValue();

+  // We may need only one range check if the fptosi can never produce the
+  // upper value.
+  if (N0.getOpcode() == ISD::FP_TO_SINT && Opcode0 == ISD::SMAX) {
+    if (isNullOrNullSplat(N3)) {
+      EVT IntVT = N0.getValueType().getScalarType();
+      EVT FPVT = N0.getOperand(0).getValueType().getScalarType();
+      if (FPVT.isSimple()) {
+        Type *InputTy = FPVT.getTypeForEVT(*DAG.getContext());
+        const fltSemantics &Semantics = InputTy->getFltSemantics();
+        uint32_t MinBitWidth =
+            APFloatBase::semanticsIntSizeInBits(Semantics, /*isSigned*/ true);
+        if (IntVT.getSizeInBits() >= MinBitWidth) {
+          Unsigned = true;
+          BW = PowerOf2Ceil(MinBitWidth);
+          return N0;
+        }
+      }
+    }
+  }
+
   SDValue N00, N01, N02, N03;
   ISD::CondCode N0CC;
   switch (N0.getOpcode()) {
@@ -5194,7 +5557,7 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
                                            SelectionDAG &DAG) {
   unsigned BW;
   bool Unsigned;
-  SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned);
+  SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned, DAG);
   if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT)
     return SDValue();
   EVT FPVT = Fp.getOperand(0).getValueType();
@@ -5208,8 +5571,7 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
   SDLoc DL(Fp);
   SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0),
                             DAG.getValueType(NewVT.getScalarType()));
-  return Unsigned ? DAG.getZExtOrTrunc(Sat, DL, N2->getValueType(0))
-                  : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0));
+  return DAG.getExtOrTrunc(!Unsigned, Sat, DL, N2->getValueType(0));
 }

 static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
@@ -5298,6 +5660,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
   if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
     return S;

+  // Fold min/max(vecreduce(x), vecreduce(y)) -> vecreduce(min/max(x, y))
+  auto ReductionOpcode = [](unsigned Opcode) {
+    switch (Opcode) {
+    case ISD::SMIN:
+      return ISD::VECREDUCE_SMIN;
+    case ISD::SMAX:
+      return ISD::VECREDUCE_SMAX;
+    case ISD::UMIN:
+      return ISD::VECREDUCE_UMIN;
+    case ISD::UMAX:
+      return ISD::VECREDUCE_UMAX;
+    default:
+      llvm_unreachable("Unexpected opcode");
+    }
+  };
+  if (SDValue SD = reassociateReduction(ReductionOpcode(Opcode), Opcode,
+                                        SDLoc(N), VT, N0, N1))
+    return SD;
+
   // Simplify the operands using demanded-bits information.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
@@ -5312,8 +5693,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned LogicOpcode = N->getOpcode();
   unsigned HandOpcode = N0.getOpcode();
-  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
-          LogicOpcode == ISD::XOR) && "Expected logic opcode");
+  assert(ISD::isBitwiseLogicOp(LogicOpcode) && "Expected logic opcode");
   assert(HandOpcode == N1.getOpcode() && "Bad input!");

   // Bail early if none of these transforms apply.
@@ -5323,13 +5703,14 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   // FIXME: We should check number of uses of the operands to not increase
   //        the instruction count for all transforms.

-  // Handle size-changing casts.
+  // Handle size-changing casts (or sign_extend_inreg).
   SDValue X = N0.getOperand(0);
   SDValue Y = N1.getOperand(0);
   EVT XVT = X.getValueType();
   SDLoc DL(N);
-  if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
-      HandOpcode == ISD::SIGN_EXTEND) {
+  if (ISD::isExtOpcode(HandOpcode) || ISD::isExtVecInRegOpcode(HandOpcode) ||
+      (HandOpcode == ISD::SIGN_EXTEND_INREG &&
+       N0.getOperand(1) == N1.getOperand(1))) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5344,11 +5725,14 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
       return SDValue();
     // Avoid infinite looping with PromoteIntBinOp.
     // TODO: Should we apply desirable/legal constraints to all opcodes?
-    if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
-        !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
+    if ((HandOpcode == ISD::ANY_EXTEND ||
+         HandOpcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+        LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
       return SDValue();
     // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+    if (HandOpcode == ISD::SIGN_EXTEND_INREG)
+      return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
     return DAG.getNode(HandOpcode, DL, VT, Logic);
   }
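The hoist relies on bitwise logic commuting with extension: for zero- and sign-extends, and(ext(x), ext(y)) == ext(and(x, y)), and any-extend holds trivially since the high bits are unspecified. A quick scalar check for zext:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint8_t X : {uint8_t(0x0F), uint8_t(0xF0)})
      for (uint8_t Y : {uint8_t(0x3C), uint8_t(0xFF)}) {
        // and(zext(x), zext(y)) == zext(and(x, y))
        assert(((uint32_t)X & (uint32_t)Y) == (uint32_t)(uint8_t)(X & Y));
        // the same holds for or (and for xor)
        assert(((uint32_t)X | (uint32_t)Y) == (uint32_t)(uint8_t)(X | Y));
      }
  }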
@@ -5629,6 +6013,172 @@ SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
   return SDValue();
 }

+static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
+  using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
+  assert(
+      (LogicOp->getOpcode() == ISD::AND || LogicOp->getOpcode() == ISD::OR) &&
+      "Invalid Op to combine SETCC with");
+
+  // TODO: Search past casts/truncates.
+  SDValue LHS = LogicOp->getOperand(0);
+  SDValue RHS = LogicOp->getOperand(1);
+  if (LHS->getOpcode() != ISD::SETCC || RHS->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  AndOrSETCCFoldKind TargetPreference = TLI.isDesirableToCombineLogicOpOfSETCC(
+      LogicOp, LHS.getNode(), RHS.getNode());
+
+  SDValue LHS0 = LHS->getOperand(0);
+  SDValue RHS0 = RHS->getOperand(0);
+  SDValue LHS1 = LHS->getOperand(1);
+  SDValue RHS1 = RHS->getOperand(1);
+  // TODO: We don't actually need a splat here, for vectors we just need the
+  // invariants to hold for each element.
+  auto *LHS1C = isConstOrConstSplat(LHS1);
+  auto *RHS1C = isConstOrConstSplat(RHS1);
+  ISD::CondCode CCL = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+  ISD::CondCode CCR = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+  EVT VT = LogicOp->getValueType(0);
+  EVT OpVT = LHS0.getValueType();
+  SDLoc DL(LogicOp);
+
+  // Check if the operands of an and/or operation are comparisons and if they
+  // compare against the same value. Replace the and/or-cmp-cmp sequence with a
+  // min/max-cmp sequence. If LHS1 is equal to RHS1, the or-cmp-cmp sequence
+  // will be replaced with a min-cmp sequence:
+  //   (LHS0 < LHS1) | (RHS0 < RHS1) -> min(LHS0, RHS0) < LHS1
+  // and the and-cmp-cmp sequence will be replaced with a max-cmp sequence:
+  //   (LHS0 < LHS1) & (RHS0 < RHS1) -> max(LHS0, RHS0) < LHS1
+  if (OpVT.isInteger() && TLI.isOperationLegal(ISD::UMAX, OpVT) &&
+      TLI.isOperationLegal(ISD::SMAX, OpVT) &&
+      TLI.isOperationLegal(ISD::UMIN, OpVT) &&
+      TLI.isOperationLegal(ISD::SMIN, OpVT)) {
+    if (LHS->getOpcode() == ISD::SETCC && RHS->getOpcode() == ISD::SETCC &&
+        LHS->hasOneUse() && RHS->hasOneUse() &&
+        // The two comparisons should have either the same predicate or the
+        // predicate of one of the comparisons is the opposite of the other one.
+        (CCL == CCR || CCL == ISD::getSetCCSwappedOperands(CCR)) &&
+        // The optimization does not work for `==` or `!=` .
+        !ISD::isIntEqualitySetCC(CCL) && !ISD::isIntEqualitySetCC(CCR)) {
+      SDValue CommonValue, Operand1, Operand2;
+      ISD::CondCode CC = ISD::SETCC_INVALID;
+      if (CCL == CCR) {
+        if (LHS0 == RHS0) {
+          CommonValue = LHS0;
+          Operand1 = LHS1;
+          Operand2 = RHS1;
+          CC = ISD::getSetCCSwappedOperands(CCL);
+        } else if (LHS1 == RHS1) {
+          CommonValue = LHS1;
+          Operand1 = LHS0;
+          Operand2 = RHS0;
+          CC = CCL;
+        }
+      } else {
+        assert(CCL == ISD::getSetCCSwappedOperands(CCR) && "Unexpected CC");
+        if (LHS0 == RHS1) {
+          CommonValue = LHS0;
+          Operand1 = LHS1;
+          Operand2 = RHS0;
+          CC = ISD::getSetCCSwappedOperands(CCL);
+        } else if (RHS0 == LHS1) {
+          CommonValue = LHS1;
+          Operand1 = LHS0;
+          Operand2 = RHS1;
+          CC = CCL;
+        }
+      }
+
+      if (CC != ISD::SETCC_INVALID) {
+        unsigned NewOpcode;
+        bool IsSigned = isSignedIntSetCC(CC);
+        if (((CC == ISD::SETLE || CC == ISD::SETULE || CC == ISD::SETLT ||
+              CC == ISD::SETULT) &&
+             (LogicOp->getOpcode() == ISD::OR)) ||
+            ((CC == ISD::SETGE || CC == ISD::SETUGE || CC == ISD::SETGT ||
+              CC == ISD::SETUGT) &&
+             (LogicOp->getOpcode() == ISD::AND)))
+          NewOpcode = IsSigned ? ISD::SMIN : ISD::UMIN;
+        else
+          NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX;
+
+        SDValue MinMaxValue =
+            DAG.getNode(NewOpcode, DL, OpVT, Operand1, Operand2);
+        return DAG.getSetCC(DL, VT, MinMaxValue, CommonValue, CC);
+      }
+    }
+  }
+
+  if (TargetPreference == AndOrSETCCFoldKind::None)
+    return SDValue();
+
+  if (CCL == CCR &&
+      CCL == (LogicOp->getOpcode() == ISD::AND ? ISD::SETNE : ISD::SETEQ) &&
+      LHS0 == RHS0 && LHS1C && RHS1C && OpVT.isInteger() && LHS.hasOneUse() &&
+      RHS.hasOneUse()) {
+    const APInt &APLhs = LHS1C->getAPIntValue();
+    const APInt &APRhs = RHS1C->getAPIntValue();
+
+    // Preference is to use ISD::ABS or we already have an ISD::ABS (in which
+    // case this is just a compare).
+    if (APLhs == (-APRhs) &&
+        ((TargetPreference & AndOrSETCCFoldKind::ABS) ||
+         DAG.doesNodeExist(ISD::ABS, DAG.getVTList(OpVT), {LHS0}))) {
+      const APInt &C = APLhs.isNegative() ? APRhs : APLhs;
+      // (icmp eq A, C) | (icmp eq A, -C)
+      //    -> (icmp eq Abs(A), C)
+      // (icmp ne A, C) & (icmp ne A, -C)
+      //    -> (icmp ne Abs(A), C)
+      SDValue AbsOp = DAG.getNode(ISD::ABS, DL, OpVT, LHS0);
+      return DAG.getNode(ISD::SETCC, DL, VT, AbsOp,
+                         DAG.getConstant(C, DL, OpVT), LHS.getOperand(2));
+    } else if (TargetPreference &
+               (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd)) {
+
+      // AndOrSETCCFoldKind::AddAnd:
+      // A == C0 | A == C1
+      //  IF IsPow2(smax(C0, C1)-smin(C0, C1))
+      //    -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) == 0
+      // A != C0 & A != C1
+      //  IF IsPow2(smax(C0, C1)-smin(C0, C1))
+      //    -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) != 0
+
+      // AndOrSETCCFoldKind::NotAnd:
+      // A == C0 | A == C1
+      //  IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+      //    -> ~A & smin(C0, C1) == 0
+      // A != C0 & A != C1
+      //  IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+      //    -> ~A & smin(C0, C1) != 0
+
+      const APInt &MaxC = APIntOps::smax(APRhs, APLhs);
+      const APInt &MinC = APIntOps::smin(APRhs, APLhs);
+      APInt Dif = MaxC - MinC;
+      if (!Dif.isZero() && Dif.isPowerOf2()) {
+        if (MaxC.isAllOnes() &&
+            (TargetPreference & AndOrSETCCFoldKind::NotAnd)) {
+          SDValue NotOp = DAG.getNOT(DL, LHS0, OpVT);
+          SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, NotOp,
+                                      DAG.getConstant(MinC, DL, OpVT));
+          return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+                             DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+        } else if (TargetPreference & AndOrSETCCFoldKind::AddAnd) {
+
+          SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
+                                      DAG.getConstant(-MinC, DL, OpVT));
+          SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
+                                      DAG.getConstant(~Dif, DL, OpVT));
+          return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+                             DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// This contains all DAGCombine rules which reduce two values combined by
 /// an And operation to a single value. This makes them reusable in the context
 /// of visitSELECT(). Rules involving constants are not included as
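Both halves of foldAndOrOfSETCC encode small identities over integer compares: or/and of two compares against a common bound folds to one compare of a min/max, and an equality test against two constants C0, C1 whose difference is a power of two folds to a single sub-and-and test. A standalone check of both, with constants chosen only for illustration:

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    // (a < c) | (b < c) == min(a, b) < c; the and-form uses max.
    for (int A = -2; A <= 2; ++A)
      for (int B = -2; B <= 2; ++B)
        for (int C = -2; C <= 2; ++C) {
          assert(((A < C) || (B < C)) == (std::min(A, B) < C));
          assert(((A < C) && (B < C)) == (std::max(A, B) < C));
        }

    // AddAnd: A == C0 | A == C1  ->  ((A - C0) & ~(C1 - C0)) == 0,
    // valid because C1 - C0 is a power of two.
    const uint32_t C0 = 8, C1 = 24, Dif = C1 - C0;
    for (uint32_t A = 0; A < 64; ++A)
      assert(((A == C0) || (A == C1)) == (((A - C0) & ~Dif) == 0));
  }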
@@ -5644,6 +6194,11 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
   if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
     return V;

+  // Canonicalize:
+  //   and(x, add) -> and(add, x)
+  if (N1.getOpcode() == ISD::ADD)
+    std::swap(N0, N1);
+
   // TODO: Rewrite this to return a new 'AND' instead of using CombineTo.
   if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
       VT.getSizeInBits() <= 64 && N0->hasOneUse()) {
@@ -5655,8 +6210,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
       // in a register.
       APInt ADDC = ADDI->getAPIntValue();
       APInt SRLC = SRLI->getAPIntValue();
-      if (ADDC.getMinSignedBits() <= 64 &&
-          SRLC.ult(VT.getSizeInBits()) &&
+      if (ADDC.getSignificantBits() <= 64 && SRLC.ult(VT.getSizeInBits()) &&
          !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
         APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                            SRLC.getZExtValue());
     }
   }

-  // Reduce bit extract of low half of an integer to the narrower type.
-  // (and (srl i64:x, K), KMask) ->
-  //   (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
-  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
-    if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
-      if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
-        unsigned Size = VT.getSizeInBits();
-        const APInt &AndMask = CAnd->getAPIntValue();
-        unsigned ShiftBits = CShift->getZExtValue();
-
-        // Bail out, this node will probably disappear anyway.
-        if (ShiftBits == 0)
-          return SDValue();
-
-        unsigned MaskBits = AndMask.countTrailingOnes();
-        EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
-
-        if (AndMask.isMask() &&
-            // Required bits must not span the two halves of the integer and
-            // must fit in the half size type.
-            (ShiftBits + MaskBits <= Size / 2) &&
-            TLI.isNarrowingProfitable(VT, HalfVT) &&
-            TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
-            TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
-            TLI.isTruncateFree(VT, HalfVT) &&
-            TLI.isZExtFree(HalfVT, VT)) {
-          // The isNarrowingProfitable is to avoid regressions on PPC and
-          // AArch64 which match a few 64-bit bit insert / bit extract patterns
-          // on downstream users of this. Those patterns could probably be
-          // extended to handle extensions mixed in.
-
-          SDValue SL(N0);
-          assert(MaskBits <= Size);
-
-          // Extracting the highest bit of the low half.
-          EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
-          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
-                                      N0.getOperand(0));
-
-          SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
-          SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
-          SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
-          SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
-          return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
-        }
-      }
-    }
-  }
-
   return SDValue();
 }

@@ -5734,7 +6239,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
   if (!AndC->getAPIntValue().isMask())
     return false;

-  unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
+  unsigned ActiveBits = AndC->getAPIntValue().countr_one();

   ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
   EVT LoadedVT = LoadN->getMemoryVT();
@@ -5898,7 +6403,7 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
     }
     case ISD::ZERO_EXTEND:
     case ISD::AssertZext: {
-      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+      unsigned ActiveBits = Mask->getAPIntValue().countr_one();
       EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
       EVT VT = Op.getOpcode() == ISD::AssertZext ?
         cast<VTSDNode>(Op.getOperand(1))->getVT() :
@@ -6071,12 +6576,6 @@ SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
 static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
   assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");

-  // This is probably not worthwhile without a supported type.
-  EVT VT = And->getValueType(0);
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!TLI.isTypeLegal(VT))
-    return SDValue();
-
   // Look through an optional extension.
   SDValue And0 = And->getOperand(0), And1 = And->getOperand(1);
   if (And0.getOpcode() == ISD::ANY_EXTEND && And0.hasOneUse())
@@ -6104,13 +6603,17 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
   if (Src.getOpcode() != ISD::SRL || !Src.hasOneUse())
     return SDValue();

+  // This is probably not worthwhile without a supported type.
+  EVT SrcVT = Src.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(SrcVT))
+    return SDValue();
+
   // We might have looked through casts that make this transform invalid.
-  // TODO: If the source type is wider than the result type, do the mask and
-  // compare in the source type.
-  unsigned VTBitWidth = VT.getScalarSizeInBits();
+  unsigned BitWidth = SrcVT.getScalarSizeInBits();
   SDValue ShiftAmt = Src.getOperand(1);
   auto *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
-  if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(VTBitWidth))
+  if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(BitWidth))
     return SDValue();

   // Set source to shift source.
@@ -6131,14 +6634,15 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
   // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
   // and (srl (not X), C)), 1 --> (and X, 1<<C) == 0
   SDLoc DL(And);
-  SDValue X = DAG.getZExtOrTrunc(Src, DL, VT);
-  EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue X = DAG.getZExtOrTrunc(Src, DL, SrcVT);
+  EVT CCVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
   SDValue Mask = DAG.getConstant(
-      APInt::getOneBitSet(VTBitWidth, ShiftAmtC->getZExtValue()), DL, VT);
-  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
-  SDValue Zero = DAG.getConstant(0, DL, VT);
+      APInt::getOneBitSet(BitWidth, ShiftAmtC->getZExtValue()), DL, SrcVT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, SrcVT, X, Mask);
+  SDValue Zero = DAG.getConstant(0, DL, SrcVT);
   SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
-  return DAG.getZExtOrTrunc(Setcc, DL, VT);
+  return DAG.getZExtOrTrunc(Setcc, DL, And->getValueType(0));
 }

 /// For targets that support usubsat, match a bit-hack form of that operation
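The bit-test rewrite uses the fact that bit C of ~X is 1 exactly when bit C of X is 0, so (~X >> C) & 1 equals ((X & (1 << C)) == 0). A standalone check:

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t X = 0xA6; // arbitrary bit pattern
    for (unsigned C = 0; C < 8; ++C) {
      uint32_t Lhs = (~X >> C) & 1u;                 // and (srl (not X), C), 1
      uint32_t Rhs = (X & (1u << C)) == 0 ? 1u : 0u; // (and X, 1<<C) == 0
      assert(Lhs == Rhs);
    }
  }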
@@ -6181,9 +6685,8 @@ static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
 static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp,
                                  SelectionDAG &DAG) {
   unsigned LogicOpcode = N->getOpcode();
-  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
-          LogicOpcode == ISD::XOR)
-         && "Expected bitwise logic operation");
+  assert(ISD::isBitwiseLogicOp(LogicOpcode) &&
+         "Expected bitwise logic operation");

   if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse())
     return SDValue();
@@ -6230,8 +6733,8 @@ static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp,
 static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand,
                                      SDValue RightHand, SelectionDAG &DAG) {
   unsigned LogicOpcode = N->getOpcode();
-  assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
-          LogicOpcode == ISD::XOR));
+  assert(ISD::isBitwiseLogicOp(LogicOpcode) &&
+         "Expected bitwise logic operation");
   if (LeftHand.getOpcode() != LogicOpcode ||
       RightHand.getOpcode() != LogicOpcode)
     return SDValue();
@@ -6276,6 +6779,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);

+  if (areBitwiseNotOfEachother(N0, N1))
+    return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), SDLoc(N),
+                           VT);
+
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
@@ -6330,6 +6837,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
     return DAG.getConstant(0, SDLoc(N), VT);

+  if (SDValue R = foldAndOrOfSETCC(N, DAG))
+    return R;
+
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;

@@ -6337,6 +6847,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
     return RAND;

+  // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y))
+  if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N),
+                                        VT, N0, N1))
+    return SD;
+
   // fold (and (or x, C), D) -> D if (C & D) == D
   auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
     return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
   };
   if (N0.getOpcode() == ISD::OR &&
       ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
     return N1;

-  // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
   if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N0Op0 = N0.getOperand(0);
+    EVT SrcVT = N0Op0.getValueType();
+    unsigned SrcBitWidth = SrcVT.getScalarSizeInBits();
     APInt Mask = ~N1C->getAPIntValue();
-    Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
+    Mask = Mask.trunc(SrcBitWidth);
+
+    // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
     if (DAG.MaskedValueIsZero(N0Op0, Mask))
-      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N0.getValueType(), N0Op0);
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0Op0);
+
+    // fold (and (any_ext V), c) -> (zero_ext (and (trunc V), c)) if profitable.
+    if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
+        TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
+        TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
+        TLI.isNarrowingProfitable(VT, SrcVT)) {
+      SDLoc DL(N);
+      return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
+                         DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
+                                     DAG.getZExtOrTrunc(N1, DL, SrcVT)));
+    }
   }

   // fold (and (ext (and V, c1)), c2) -> (and (ext V), (and c1, (ext c2)))
@@ -7046,24 +7575,39 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
 static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
                                   SDNode *N) {
   EVT VT = N0.getValueType();
-  if (N0.getOpcode() == ISD::AND) {
-    SDValue N00 = N0.getOperand(0);
-    SDValue N01 = N0.getOperand(1);
+
+  auto peekThroughResize = [](SDValue V) {
+    if (V->getOpcode() == ISD::ZERO_EXTEND || V->getOpcode() == ISD::TRUNCATE)
+      return V->getOperand(0);
+    return V;
+  };
+
+  SDValue N0Resized = peekThroughResize(N0);
+  if (N0Resized.getOpcode() == ISD::AND) {
+    SDValue N1Resized = peekThroughResize(N1);
+    SDValue N00 = N0Resized.getOperand(0);
+    SDValue N01 = N0Resized.getOperand(1);

     // fold or (and x, y), x --> x
-    if (N00 == N1 || N01 == N1)
+    if (N00 == N1Resized || N01 == N1Resized)
      return N1;

     // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
     // TODO: Set AllowUndefs = true.
- if (getBitwiseNotOperand(N01, N00, - /* AllowUndefs */ false) == N1) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1); + if (SDValue NotOperand = getBitwiseNotOperand(N01, N00, + /* AllowUndefs */ false)) { + if (peekThroughResize(NotOperand) == N1Resized) + return DAG.getNode(ISD::OR, SDLoc(N), VT, + DAG.getZExtOrTrunc(N00, SDLoc(N), VT), N1); + } // fold (or (and (xor Y, -1), X), Y) -> (or X, Y) - if (getBitwiseNotOperand(N00, N01, - /* AllowUndefs */ false) == N1) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1); + if (SDValue NotOperand = getBitwiseNotOperand(N00, N01, + /* AllowUndefs */ false)) { + if (peekThroughResize(NotOperand) == N1Resized) + return DAG.getNode(ISD::OR, SDLoc(N), VT, + DAG.getZExtOrTrunc(N01, SDLoc(N), VT), N1); + } } if (N0.getOpcode() == ISD::XOR) { @@ -7215,6 +7759,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; + if (SDValue R = foldAndOrOfSETCC(N, DAG)) + return R; + if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; @@ -7231,6 +7778,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags())) return ROR; + // Fold or(vecreduce(x), vecreduce(y)) -> vecreduce(or(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_OR, ISD::OR, SDLoc(N), + VT, N0, N1)) + return SD; + // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0 or c1/c2 are undef. auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) { @@ -7898,42 +8450,6 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { return SDValue(); } -namespace { - -/// Represents known origin of an individual byte in load combine pattern. The -/// value of the byte is either constant zero or comes from memory. -struct ByteProvider { - // For constant zero providers Load is set to nullptr. For memory providers - // Load represents the node which loads the byte from memory. - // ByteOffset is the offset of the byte in the value produced by the load. - LoadSDNode *Load = nullptr; - unsigned ByteOffset = 0; - unsigned VectorOffset = 0; - - ByteProvider() = default; - - static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset, - unsigned VectorOffset) { - return ByteProvider(Load, ByteOffset, VectorOffset); - } - - static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); } - - bool isConstantZero() const { return !Load; } - bool isMemory() const { return Load; } - - bool operator==(const ByteProvider &Other) const { - return Other.Load == Load && Other.ByteOffset == ByteOffset && - Other.VectorOffset == VectorOffset; - } - -private: - ByteProvider(LoadSDNode *Load, unsigned ByteOffset, unsigned VectorOffset) - : Load(Load), ByteOffset(ByteOffset), VectorOffset(VectorOffset) {} -}; - -} // end anonymous namespace - /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns std::nullopt if the provider can't be /// calculated. @@ -7975,7 +8491,9 @@ private: /// LOAD /// /// *ExtractVectorElement -static const std::optional<ByteProvider> +using SDByteProvider = ByteProvider<SDNode *>; + +static const std::optional<SDByteProvider> calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional<uint64_t> VectorIndex, unsigned StartingIndex = 0) { @@ -8034,7 +8552,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, // provide, then do not provide anything. 
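The visitORCommutative folds above rest on absorption and the and-not identity; the change only adds the ability to look through zext/trunc resizes. A standalone check of the underlying identities (plain C++, not LLVM code; 8-bit values are an arbitrary choice for an exhaustive loop):

    #include <cassert>

    int main() {
      for (unsigned X = 0; X <= 0xFF; ++X)
        for (unsigned Y = 0; Y <= 0xFF; ++Y) {
          assert(((X & Y) | X) == X);        // or (and x, y), x --> x
          assert(((X & ~Y) | Y) == (X | Y)); // or (and x, (xor y, -1)), y --> or x, y
        }
      return 0;
    }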
Otherwise, subtract the index by // the amount we shifted by. return Index < ByteShift - ? ByteProvider::getConstantZero() + ? SDByteProvider::getConstantZero() : calculateByteProvider(Op->getOperand(0), Index - ByteShift, Depth + 1, VectorIndex, Index); } @@ -8049,7 +8567,8 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, if (Index >= NarrowByteWidth) return Op.getOpcode() == ISD::ZERO_EXTEND - ? std::optional<ByteProvider>(ByteProvider::getConstantZero()) + ? std::optional<SDByteProvider>( + SDByteProvider::getConstantZero()) : std::nullopt; return calculateByteProvider(NarrowOp, Index, Depth + 1, VectorIndex, StartingIndex); @@ -8099,11 +8618,12 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, // question if (Index >= NarrowByteWidth) return L->getExtensionType() == ISD::ZEXTLOAD - ? std::optional<ByteProvider>(ByteProvider::getConstantZero()) + ? std::optional<SDByteProvider>( + SDByteProvider::getConstantZero()) : std::nullopt; unsigned BPVectorIndex = VectorIndex.value_or(0U); - return ByteProvider::getMemory(L, Index, BPVectorIndex); + return SDByteProvider::getSrc(L, Index, BPVectorIndex); } } @@ -8191,9 +8711,12 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { !N->isSimple() || N->isIndexed()) return SDValue(); - // Collect all of the stores in the chain. + // Collect all of the stores in the chain, up to the maximum store width (i64). SDValue Chain = N->getChain(); SmallVector<StoreSDNode *, 8> Stores = {N}; + unsigned NarrowNumBits = MemVT.getScalarSizeInBits(); + unsigned MaxWideNumBits = 64; + unsigned MaxStores = MaxWideNumBits / NarrowNumBits; while (auto *Store = dyn_cast<StoreSDNode>(Chain)) { // All stores must be the same size to ensure that we are writing all of the // bytes in the wide value. @@ -8207,6 +8730,8 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { return SDValue(); Stores.push_back(Store); Chain = Store->getChain(); + if (MaxStores < Stores.size()) + return SDValue(); } // There is no reason to continue if we do not have at least a pair of stores. if (Stores.size() < 2) @@ -8215,7 +8740,6 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) { // Handle simple types only. LLVMContext &Context = *DAG.getContext(); unsigned NumStores = Stores.size(); - unsigned NarrowNumBits = N->getMemoryVT().getScalarSizeInBits(); unsigned WideNumBits = NumStores * NarrowNumBits; EVT WideVT = EVT::getIntegerVT(Context, WideNumBits); if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64) @@ -8397,23 +8921,24 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { unsigned ByteWidth = VT.getSizeInBits() / 8; bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); - auto MemoryByteOffset = [&] (ByteProvider P) { - assert(P.isMemory() && "Must be a memory byte provider"); - unsigned LoadBitWidth = P.Load->getMemoryVT().getScalarSizeInBits(); + auto MemoryByteOffset = [&](SDByteProvider P) { - assert(P.hasSrc() && "Must be a memory byte provider"); + auto *Load = cast<LoadSDNode>(P.Src.value()); + + unsigned LoadBitWidth = Load->getMemoryVT().getScalarSizeInBits(); assert(LoadBitWidth % 8 == 0 && "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; - return IsBigEndianTarget - ? bigEndianByteAt(LoadByteWidth, P.ByteOffset) - : littleEndianByteAt(LoadByteWidth, P.ByteOffset); + return IsBigEndianTarget ?
bigEndianByteAt(LoadByteWidth, P.DestOffset) : littleEndianByteAt(LoadByteWidth, P.DestOffset); }; std::optional<BaseIndexOffset> Base; SDValue Chain; SmallPtrSet<LoadSDNode *, 8> Loads; - std::optional<ByteProvider> FirstByteProvider; + std::optional<SDByteProvider> FirstByteProvider; int64_t FirstOffset = INT64_MAX; // Check if all the bytes of the OR we are looking at are loaded from the same @@ -8434,9 +8959,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { return SDValue(); continue; } - assert(P->isMemory() && "provenance should either be memory or zero"); - - LoadSDNode *L = P->Load; + assert(P->hasSrc() && "provenance should either be memory or zero"); + auto *L = cast<LoadSDNode>(P->Src.value()); // All loads must share the same chain SDValue LChain = L->getChain(); @@ -8460,7 +8984,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits(); if (LoadWidthInBit % 8 != 0) return SDValue(); - unsigned ByteOffsetFromVector = P->VectorOffset * LoadWidthInBit / 8; + unsigned ByteOffsetFromVector = P->SrcOffset * LoadWidthInBit / 8; Ptr.addToOffset(ByteOffsetFromVector); } @@ -8517,7 +9041,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // So the combined value can be loaded from the first load address. if (MemoryByteOffset(*FirstByteProvider) != 0) return SDValue(); - LoadSDNode *FirstLoad = FirstByteProvider->Load; + auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value()); // The node we are looking at matches with the pattern, check if we can // replace it with a single (possibly zero-extended) load and bswap + shift if @@ -8715,6 +9239,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags())) return RXOR; + // Fold xor(vecreduce(x), vecreduce(y)) -> vecreduce(xor(x, y)) + if (SDValue SD = + reassociateReduction(ISD::VECREDUCE_XOR, ISD::XOR, DL, VT, N0, N1)) + return SD; + // fold (a^b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && DAG.haveNoCommonBitsSet(N0, N1)) @@ -9462,7 +9991,7 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, SDValue MulhRightOp; if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) { unsigned ActiveBits = IsSignExt - ? Constant->getAPIntValue().getMinSignedBits() + ? Constant->getAPIntValue().getSignificantBits() : Constant->getAPIntValue().getActiveBits(); if (ActiveBits > NarrowVTSize) return SDValue(); @@ -9499,14 +10028,59 @@ static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG, // we use mulhs. Otherwise, zero extends (zext) use mulhu. unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU; - // Combine to mulh if mulh is legal/custom for the narrow type on the target. - if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT)) - return SDValue(); + // Combine to mulh if mulh is legal/custom for the narrow type on the target, + // or, if it is a vector type, if we can transform it to an acceptable type and + // rely on legalization to split/combine the result.
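The identity behind combineShiftToMULH, checked exhaustively for an assumed i8 narrow type (plain C++, not LLVM code): the high half of a widening unsigned multiply equals the wide product shifted right by the narrow bit width, so the wide mul+srl can become a narrow mulhu plus a zero-extension (mulhs with sign-extension is the signed analogue):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X <= 0xFF; ++X)
        for (uint32_t Y = 0; Y <= 0xFF; ++Y) {
          uint32_t WideShift = (X * Y) >> 8;                   // srl (mul (zext x), (zext y)), 8
          uint32_t NarrowHi = uint32_t(uint8_t((X * Y) >> 8)); // zext (mulhu x, y)
          assert(WideShift == NarrowHi);
        }
      return 0;
    }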
+ if (NarrowVT.isVector()) { + EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), NarrowVT); + if (TransformVT.getVectorElementType() != NarrowVT.getVectorElementType() || + !TLI.isOperationLegalOrCustom(MulhOpcode, TransformVT)) + return SDValue(); + } else { + if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT)) + return SDValue(); + } SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp); - return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT) - : DAG.getZExtOrTrunc(Result, DL, WideVT)); + bool IsSigned = N->getOpcode() == ISD::SRA; + return DAG.getExtOrTrunc(IsSigned, Result, DL, WideVT); +} + +// fold (bswap (logic_op(bswap(x),y))) -> logic_op(x,bswap(y)) +// This helper function accepts SDNodes with opcode ISD::BSWAP or ISD::BITREVERSE +static SDValue foldBitOrderCrossLogicOp(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + if (Opcode != ISD::BSWAP && Opcode != ISD::BITREVERSE) + return SDValue(); + + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc DL(N); + if (ISD::isBitwiseLogicOp(N0.getOpcode()) && N0.hasOneUse()) { + SDValue OldLHS = N0.getOperand(0); + SDValue OldRHS = N0.getOperand(1); + + // If both operands are bswap/bitreverse, ignore the multiuse restriction. + // Otherwise we need to ensure logic_op and bswap/bitreverse(x) have one use. + if (OldLHS.getOpcode() == Opcode && OldRHS.getOpcode() == Opcode) { + return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0), + OldRHS.getOperand(0)); + } + + if (OldLHS.getOpcode() == Opcode && OldLHS.hasOneUse()) { + SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldRHS); + return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0), + NewBitReorder); + } + + if (OldRHS.getOpcode() == Opcode && OldRHS.hasOneUse()) { + SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldLHS); + return DAG.getNode(N0.getOpcode(), DL, VT, NewBitReorder, + OldRHS.getOperand(0)); + } + } + return SDValue(); } SDValue DAGCombiner::visitSRA(SDNode *N) { @@ -9892,8 +10466,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); } - // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). + // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit), and x has a + // power-of-two bitwidth. The "5" represents (log2 (bitwidth x)). if (N1C && N0.getOpcode() == ISD::CTLZ && + isPowerOf2_32(OpSizeInBits) && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { KnownBits Known = DAG.computeKnownBits(N0.getOperand(0)); @@ -9912,7 +10488,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { // could be set on input to the CTLZ node. If this bit is set, the SRL // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair // to an SRL/XOR pair, which is likely to simplify more. - unsigned ShAmt = UnknownBits.countTrailingZeros(); + unsigned ShAmt = UnknownBits.countr_zero(); SDValue Op = N0.getOperand(0); if (ShAmt) { @@ -10138,13 +10714,23 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) { return SDValue(); } -// Given a ABS node, detect the following pattern: +// Given an ABS node, detect the following patterns: // (ABS (SUB (EXTEND a), (EXTEND b))). +// (TRUNC (ABS (SUB (EXTEND a), (EXTEND b)))). // Generates UABD/SABD instruction.
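foldBitOrderCrossLogicOp works because bswap (and bitreverse) is a bit permutation, and a self-inverse bit permutation commutes with bitwise logic: perm(a) op b == perm(a op perm(b)). A standalone spot-check (plain C++, not LLVM code; a 16-bit bswap and stride-sampled operands keep the loops short):

    #include <cassert>
    #include <cstdint>

    static uint16_t bswap16(uint16_t V) { return uint16_t((V << 8) | (V >> 8)); }

    int main() {
      for (uint32_t XI = 0; XI <= 0xFFFF; XI += 0x101)
        for (uint32_t YI = 0; YI <= 0xFFFF; YI += 0x137) {
          uint16_t X = uint16_t(XI), Y = uint16_t(YI);
          assert(bswap16(bswap16(X) & Y) == (X & bswap16(Y)));
          assert(bswap16(bswap16(X) | Y) == (X | bswap16(Y)));
          assert(bswap16(bswap16(X) ^ Y) == (X ^ bswap16(Y)));
        }
      return 0;
    }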
SDValue DAGCombiner::foldABSToABD(SDNode *N) { + EVT SrcVT = N->getValueType(0); + + if (N->getOpcode() == ISD::TRUNCATE) + N = N->getOperand(0).getNode(); + + if (N->getOpcode() != ISD::ABS) + return SDValue(); + EVT VT = N->getValueType(0); SDValue AbsOp1 = N->getOperand(0); SDValue Op0, Op1; + SDLoc DL(N); if (AbsOp1.getOpcode() != ISD::SUB) return SDValue(); @@ -10157,9 +10743,11 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) { if (Opc0 != Op1.getOpcode() || (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) { // fold (abs (sub nsw x, y)) -> abds(x, y) - if (AbsOp1->getFlags().hasNoSignedWrap() && - TLI.isOperationLegalOrCustom(ISD::ABDS, VT)) - return DAG.getNode(ISD::ABDS, SDLoc(N), VT, Op0, Op1); + if (AbsOp1->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) && + TLI.preferABDSToABSWithNSW(VT)) { + SDValue ABD = DAG.getNode(ISD::ABDS, DL, VT, Op0, Op1); + return DAG.getZExtOrTrunc(ABD, DL, SrcVT); + } return SDValue(); } @@ -10170,17 +10758,20 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) { // fold abs(sext(x) - sext(y)) -> zext(abds(x, y)) // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y)) // NOTE: Extensions must be equivalent. - if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) { + if (VT1 == VT2 && hasOperation(ABDOpcode, VT1)) { Op0 = Op0.getOperand(0); Op1 = Op1.getOperand(0); - SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1); - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD); + SDValue ABD = DAG.getNode(ABDOpcode, DL, VT1, Op0, Op1); + ABD = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ABD); + return DAG.getZExtOrTrunc(ABD, DL, SrcVT); } // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y)) // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y)) - if (TLI.isOperationLegalOrCustom(ABDOpcode, VT)) - return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1); + if (hasOperation(ABDOpcode, VT)) { + SDValue ABD = DAG.getNode(ABDOpcode, DL, VT, Op0, Op1); + return DAG.getZExtOrTrunc(ABD, DL, SrcVT); + } return SDValue(); } @@ -10190,8 +10781,8 @@ SDValue DAGCombiner::visitABS(SDNode *N) { EVT VT = N->getValueType(0); // fold (abs c1) -> c2 - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, SDLoc(N), VT, {N0})) + return C; // fold (abs (abs x)) -> (abs x) if (N0.getOpcode() == ISD::ABS) return N0; @@ -10277,6 +10868,9 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) { } } + if (SDValue V = foldBitOrderCrossLogicOp(N, DAG)) + return V; + return SDValue(); } @@ -10447,7 +11041,8 @@ SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, if (NegRHS == False) { SDValue Combined = combineMinNumMaxNumImpl(DL, VT, LHS, RHS, NegTrue, False, CC, TLI, DAG); - return DAG.getNode(ISD::FNEG, DL, VT, Combined); + if (Combined) + return DAG.getNode(ISD::FNEG, DL, VT, Combined); } } } @@ -11091,6 +11686,23 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; + // Remove a masked store if base pointers and masks are equal. 
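The core identity behind foldABSToABD, checked for an assumed i8 element type extended to i32 (plain C++, not LLVM code; abdu8 is an invented stand-in for the ISD::ABDU node): the absolute difference computed after zero-extension always fits the narrow type, so the wide abs(sub) collapses to a narrow abdu plus a zext, and the new TRUNC handling merely folds away a final resize:

    #include <cassert>
    #include <cstdint>

    static uint8_t abdu8(uint8_t A, uint8_t B) { return A > B ? A - B : B - A; }

    int main() {
      for (int A = 0; A <= 0xFF; ++A)
        for (int B = 0; B <= 0xFF; ++B) {
          int32_t Diff = A - B;                                 // sub (zext a), (zext b)
          uint32_t WideAbs = uint32_t(Diff < 0 ? -Diff : Diff); // abs in the wide type
          assert(WideAbs == uint32_t(abdu8(uint8_t(A), uint8_t(B)))); // zext (abdu a, b)
        }
      return 0;
    }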
+ if (MaskedStoreSDNode *MST1 = dyn_cast<MaskedStoreSDNode>(Chain)) { + if (MST->isUnindexed() && MST->isSimple() && MST1->isUnindexed() && + MST1->isSimple() && MST1->getBasePtr() == Ptr && + !MST->getBasePtr().isUndef() && + ((Mask == MST1->getMask() && MST->getMemoryVT().getStoreSize() == + MST1->getMemoryVT().getStoreSize()) || + ISD::isConstantSplatVectorAllOnes(Mask.getNode())) && + TypeSize::isKnownLE(MST1->getMemoryVT().getStoreSize(), + MST->getMemoryVT().getStoreSize())) { + CombineTo(MST1, MST1->getChain()); + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); + return SDValue(N, 0); + } + } + // If this is a masked load with an all ones mask, we can use a unmasked load. // FIXME: Can we do this for indexed, compressing, or truncating stores? if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() && @@ -11391,6 +12003,38 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { } } + // Match VSELECTs with absolute difference patterns. + // (vselect (setcc a, b, set?gt), (sub a, b), (sub b, a)) --> (abd? a, b) + // (vselect (setcc a, b, set?ge), (sub a, b), (sub b, a)) --> (abd? a, b) + // (vselect (setcc a, b, set?lt), (sub b, a), (sub a, b)) --> (abd? a, b) + // (vselect (setcc a, b, set?le), (sub b, a), (sub a, b)) --> (abd? a, b) + if (N1.getOpcode() == ISD::SUB && N2.getOpcode() == ISD::SUB && + N1.getOperand(0) == N2.getOperand(1) && + N1.getOperand(1) == N2.getOperand(0)) { + bool IsSigned = isSignedIntSetCC(CC); + unsigned ABDOpc = IsSigned ? ISD::ABDS : ISD::ABDU; + if (hasOperation(ABDOpc, VT)) { + switch (CC) { + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + if (LHS == N1.getOperand(0) && RHS == N1.getOperand(1)) + return DAG.getNode(ABDOpc, DL, VT, LHS, RHS); + break; + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + if (RHS == N1.getOperand(0) && LHS == N1.getOperand(1) ) + return DAG.getNode(ABDOpc, DL, VT, LHS, RHS); + break; + default: + break; + } + } + } + // Match VSELECTs into add with unsigned saturation. if (hasOperation(ISD::UADDSAT, VT)) { // Check if one of the arms of the VSELECT is vector with all bits set. @@ -11612,57 +12256,6 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) { ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get(); EVT VT = N->getValueType(0); - // SETCC(FREEZE(X), CONST, Cond) - // => - // FREEZE(SETCC(X, CONST, Cond)) - // This is correct if FREEZE(X) has one use and SETCC(FREEZE(X), CONST, Cond) - // isn't equivalent to true or false. - // For example, SETCC(FREEZE(X), -128, SETULT) cannot be folded to - // FREEZE(SETCC(X, -128, SETULT)) because X can be poison. - // - // This transformation is beneficial because visitBRCOND can fold - // BRCOND(FREEZE(X)) to BRCOND(X). - - // Conservatively optimize integer comparisons only. - if (PreferSetCC) { - // Do this only when SETCC is going to be used by BRCOND. - - SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); - ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0); - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); - bool Updated = false; - - // Is 'X Cond C' always true or false? 
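For the VSELECT patterns above, the reason four condition codes map onto a single ABD node per signedness is that the two SUB arms produce the same value when a == b (both are zero), so the strict and non-strict comparisons select identically. A standalone check of the unsigned case (plain C++, not LLVM code; i8 lanes assumed):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned A = 0; A <= 0xFF; ++A)
        for (unsigned B = 0; B <= 0xFF; ++B) {
          uint8_t SubAB = uint8_t(A - B), SubBA = uint8_t(B - A); // either may wrap
          uint8_t GT = A > B ? SubAB : SubBA;  // (vselect (setcc a, b, setugt), ...)
          uint8_t GE = A >= B ? SubAB : SubBA; // (vselect (setcc a, b, setuge), ...)
          assert(GT == GE); // arms agree at A == B, so both CCs yield abdu(a, b)
        }
      return 0;
    }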
- auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) { - bool False = (Cond == ISD::SETULT && C->isZero()) || - (Cond == ISD::SETLT && C->isMinSignedValue()) || - (Cond == ISD::SETUGT && C->isAllOnes()) || - (Cond == ISD::SETGT && C->isMaxSignedValue()); - bool True = (Cond == ISD::SETULE && C->isAllOnes()) || - (Cond == ISD::SETLE && C->isMaxSignedValue()) || - (Cond == ISD::SETUGE && C->isZero()) || - (Cond == ISD::SETGE && C->isMinSignedValue()); - return True || False; - }; - - if (N0->getOpcode() == ISD::FREEZE && N0.hasOneUse() && N1C) { - if (!IsAlwaysTrueOrFalse(Cond, N1C)) { - N0 = N0->getOperand(0); - Updated = true; - } - } - if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse() && N0C) { - if (!IsAlwaysTrueOrFalse(ISD::getSetCCSwappedOperands(Cond), - N0C)) { - N1 = N1->getOperand(0); - Updated = true; - } - } - - if (Updated) - return DAG.getFreeze(DAG.getSetCC(SDLoc(N), VT, N0, N1, Cond)); - } - SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond, SDLoc(N), !PreferSetCC); @@ -11733,7 +12326,8 @@ static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) { /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG) { + SelectionDAG &DAG, + CombineLevel Level) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -11758,10 +12352,14 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, else if (Opcode == ISD::ZERO_EXTEND) ExtLoadOpcode = ISD::ZEXTLOAD; + // An illegal VSELECT may fail ISel if it occurs after legalization (DAG + // Combine2), so we should conservatively check the OperationAction.
LoadSDNode *Load1 = cast<LoadSDNode>(Op1); LoadSDNode *Load2 = cast<LoadSDNode>(Op2); if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) || - !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT())) + !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()) || + (N0->getOpcode() == ISD::VSELECT && Level >= AfterLegalizeTypes && + TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal)) return SDValue(); SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1); @@ -11782,11 +12380,7 @@ static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, EVT VT = N->getValueType(0); SDLoc DL(N); - assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::ANY_EXTEND || - Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || - Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || - Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && + assert((ISD::isExtOpcode(Opcode) || ISD::isExtVecInRegOpcode(Opcode)) && "Expected EXTEND dag node in input!"); // fold (sext c1) -> c1 @@ -12052,8 +12646,7 @@ SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { // and/or/xor SDValue N0 = N->getOperand(0); - if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || - N0.getOpcode() == ISD::XOR) || + if (!ISD::isBitwiseLogicOp(N0.getOpcode()) || N0.getOperand(1).getOpcode() != ISD::Constant || (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))) return SDValue(); @@ -12449,11 +13042,19 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); + // fold (sext (aext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x) + // fold (sext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x) + if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) + return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, + N0.getOperand(0)); + // fold (sext (sext_inreg x)) -> (sext (trunc x)) if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) { SDValue N00 = N0.getOperand(0); EVT ExtVT = cast<VTSDNode>(N0->getOperand(1))->getVT(); - if (N00.getOpcode() == ISD::TRUNCATE && (!LegalOperations || TLI.isTypeLegal(ExtVT))) { + if (N00.getOpcode() == ISD::TRUNCATE && + (!LegalTypes || TLI.isTypeLegal(ExtVT))) { SDValue T = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, N00.getOperand(0)); return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, T); } @@ -12532,8 +13133,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { // fold (sext (and/or/xor (load x), cst)) -> // (and/or/xor (sextload x), (sext cst)) - if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || - N0.getOpcode() == ISD::XOR) && + if (ISD::isBitwiseLogicOp(N0.getOpcode()) && isa<LoadSDNode>(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { @@ -12630,45 +13230,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); } - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); } -// isTruncateOf - If N is a truncate of some other value, return true, record -// the value being truncated in Op and which of Op's bits are zero/one in Known. -// This function computes KnownBits to avoid a duplicated call to -// computeKnownBits in the caller. 
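tryToFoldExtendSelectLoad relies on extension distributing over select: ext(select(c, a, b)) == select(c, ext(a), ext(b)); the new CombineLevel parameter only guards against creating an illegal VSELECT after legalization. A minimal scalar check of the distribution itself (plain C++, not LLVM code; the two constants stand in for the loaded values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int8_t A = -5, B = 7; // stand-ins for the two loaded values
      const bool Conds[] = {false, true};
      for (bool C : Conds) {
        int32_t ExtOfSel = int32_t(C ? A : B);          // sext (select c, a, b)
        int32_t SelOfExt = C ? int32_t(A) : int32_t(B); // select c, (sextload a), (sextload b)
        assert(ExtOfSel == SelOfExt);
      }
      return 0;
    }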
-static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, - KnownBits &Known) { - if (N->getOpcode() == ISD::TRUNCATE) { - Op = N->getOperand(0); - Known = DAG.computeKnownBits(Op); - return true; - } - - if (N.getOpcode() != ISD::SETCC || - N.getValueType().getScalarType() != MVT::i1 || - cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE) - return false; - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - assert(Op0.getValueType() == Op1.getValueType()); - - if (isNullOrNullSplat(Op0)) - Op = Op1; - else if (isNullOrNullSplat(Op1)) - Op = Op0; - else - return false; - - Known = DAG.computeKnownBits(Op); - - return (Known.Zero | 1).isAllOnes(); -} - /// Given an extending node with a pop-count operand, if the target does not /// support a pop-count in the narrow source type but does support it in the /// destination type, widen the pop-count to the destination type. @@ -12722,14 +13289,15 @@ static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) { SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVCastOp(N, SDLoc(N))) + if (SDValue FoldedVOp = SimplifyVCastOp(N, DL)) return FoldedVOp; // zext(undef) = 0 if (N0.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes)) return Res; @@ -12737,7 +13305,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (zext x)) -> (zext x) // fold (zext (aext x)) -> (zext x) if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); + + // fold (zext (aext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) + // fold (zext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) + if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) + return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, N0.getOperand(0)); // fold (zext (truncate x)) -> (zext x) or @@ -12754,7 +13328,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { std::min(Op.getScalarValueSizeInBits(), VT.getScalarSizeInBits())); if (TruncatedBits.isSubsetOf(Known.Zero)) - return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); + return DAG.getZExtOrTrunc(Op, DL, VT); } // fold (zext (truncate x)) -> (and x, mask) @@ -12780,9 +13354,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { SDValue Op = N0.getOperand(0); - Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT); + Op = DAG.getZeroExtendInReg(Op, DL, MinVT); AddToWorklist(Op.getNode()); - SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT); + SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, DL, VT); // Transfer the debug info; the new node is equivalent to N0. 
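One of the visitZERO_EXTEND folds above, (zext (truncate x)) -> (and x, mask), is the usual low-bit-mask identity; a quick standalone check (plain C++, not LLVM code; i8-inside-i32 assumed, strided to keep the loop short):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X < 0x30000; X += 0x55) {
        uint32_t ViaTrunc = uint32_t(uint8_t(X)); // zext (truncate x)
        uint32_t ViaMask = X & 0xFFu;             // and x, 0xff
        assert(ViaTrunc == ViaMask);
      }
      return 0;
    }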
DAG.transferDbgValues(N0, ZExtOrTrunc); return ZExtOrTrunc; @@ -12790,9 +13364,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { - SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), DL, VT); AddToWorklist(Op.getNode()); - SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT); + SDValue And = DAG.getZeroExtendInReg(Op, DL, MinVT); // We may safely transfer the debug info describing the truncate node over // to the equivalent and operation. DAG.transferDbgValues(N0, And); @@ -12811,7 +13385,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); - SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT)); } @@ -12836,8 +13409,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // (and/or/xor (zextload x), (zext cst)) // Unless (and (load x) cst) will match as a zextload already and has // additional users. - if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || - N0.getOpcode() == ISD::XOR) && + if (ISD::isBitwiseLogicOp(N0.getOpcode()) && isa<LoadSDNode>(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { @@ -12865,7 +13437,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { LN00->getMemoryVT(), LN00->getMemOperand()); APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits()); - SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); @@ -12919,7 +13490,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // that matter). Check to see that they are the same size. If so, we know // that the element size of the sext'd result matches the element size of // the compare operands. - SDLoc DL(N); if (VT.getSizeInBits() == N00VT.getSizeInBits()) { // zext(setcc) -> zext_in_reg(vsetcc) for vectors. SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0), @@ -12939,7 +13509,6 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc) - SDLoc DL(N); EVT N0VT = N0.getValueType(); EVT N00VT = N0.getOperand(0).getValueType(); if (SDValue SCC = SimplifySelectCC( @@ -12952,29 +13521,29 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // (zext (shl (zext x), cst)) -> (shl (zext x), cst) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && - isa<ConstantSDNode>(N0.getOperand(1)) && - N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && - N0.hasOneUse()) { + !TLI.isZExtFree(N0, VT)) { + SDValue ShVal = N0.getOperand(0); SDValue ShAmt = N0.getOperand(1); - if (N0.getOpcode() == ISD::SHL) { - SDValue InnerZExt = N0.getOperand(0); - // If the original shl may be shifting out bits, do not perform this - // transformation. - unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() - - InnerZExt.getOperand(0).getValueSizeInBits(); - if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits)) - return SDValue(); - } - - SDLoc DL(N); + if (auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt)) { + if (ShVal.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) { + if (N0.getOpcode() == ISD::SHL) { + // If the original shl may be shifting out bits, do not perform this + // transformation. 
+ // TODO: Add MaskedValueIsZero check. + unsigned KnownZeroBits = ShVal.getValueSizeInBits() - + ShVal.getOperand(0).getValueSizeInBits(); + if (ShAmtC->getAPIntValue().ugt(KnownZeroBits)) + return SDValue(); + } - // Ensure that the shift amount is wide enough for the shifted value. - if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits()) - ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); + // Ensure that the shift amount is wide enough for the shifted value. + if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits()) + ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); - return DAG.getNode(N0.getOpcode(), DL, VT, - DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)), - ShAmt); + return DAG.getNode(N0.getOpcode(), DL, VT, + DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ShVal), ShAmt); + } + } } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) @@ -12986,7 +13555,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue V = widenAbs(N, DAG)) return V; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); @@ -13011,6 +13580,14 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + // fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x) + // fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) + // fold (aext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x) + if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || + N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) + return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); + // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { @@ -13147,7 +13724,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (SDValue NewCtPop = widenCtPop(N, DAG)) return NewCtPop; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; return SDValue(); @@ -13305,7 +13882,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { const APInt &Mask = AndC->getAPIntValue(); unsigned ActiveBits = 0; if (Mask.isMask()) { - ActiveBits = Mask.countTrailingOnes(); + ActiveBits = Mask.countr_one(); } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) { HasShiftedOffset = true; } else { @@ -13373,8 +13950,8 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { isa<ConstantSDNode>(Mask->getOperand(1))) { const APInt& ShiftMask = Mask->getConstantOperandAPInt(1); if (ShiftMask.isMask()) { - EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), - ShiftMask.countTrailingOnes()); + EVT MaskedVT = + EVT::getIntegerVT(*DAG.getContext(), ShiftMask.countr_one()); // If the mask is smaller, recompute the type. if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) @@ -13520,9 +14097,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) // if x is small enough or if we know that x has more than 1 sign bit and the // sign_extend_inreg is extending from one of them. 
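The zext-of-shl rewrite above is only safe while the shift cannot move set bits past the inner zero-extension's known-zero region, which is exactly the KnownZeroBits guard in the code. A standalone check at that boundary (plain C++, not LLVM code; i8 zero-extended to i16 and then to i32 is assumed, so shift amounts up to 8 are safe):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X <= 0xFF; ++X)
        for (unsigned C = 0; C <= 8; ++C) { // C <= known-zero bits of zext(i8 -> i16)
          uint32_t A = uint32_t(uint16_t(uint16_t(X) << C)); // zext (shl (zext x), C)
          uint32_t B = uint32_t(X) << C;                     // shl (zext x), C
          assert(A == B);
        }
      return 0;
    }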
- if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || - N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || - N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) { + if (ISD::isExtVecInRegOpcode(N0.getOpcode())) { SDValue N00 = N0.getOperand(0); unsigned N00Bits = N00.getScalarValueSizeInBits(); unsigned DstElts = N0.getValueType().getVectorMinNumElements(); @@ -13543,7 +14118,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() == ExtVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. @@ -13690,9 +14265,7 @@ foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI, Src.getValueType().getVectorElementType(), VT.getVectorElementCount()); - assert((InregOpcode == ISD::SIGN_EXTEND_VECTOR_INREG || - InregOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || - InregOpcode == ISD::ANY_EXTEND_VECTOR_INREG) && + assert(ISD::isExtVecInRegOpcode(InregOpcode) && "Expected EXTEND_VECTOR_INREG dag node in input!"); // Profitability check: our operand must be an one-use CONCAT_VECTORS. @@ -13752,11 +14325,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // fold (truncate c1) -> c1 - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { - SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); - if (C.getNode() != N) - return C; - } + if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, SDLoc(N), VT, {N0})) + return C; // fold (truncate (ext x)) -> (ext x) or (truncate x) or x if (N0.getOpcode() == ISD::ZERO_EXTEND || @@ -13860,6 +14430,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { if (SDValue V = foldSubToUSubSat(VT, N0.getNode())) return V; + if (SDValue ABD = foldABSToABD(N)) + return ABD; + // Attempt to pre-truncate BUILD_VECTOR sources. if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations && TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) && @@ -14036,12 +14609,13 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } break; case ISD::ADDE: - case ISD::ADDCARRY: + case ISD::UADDO_CARRY: // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) - // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) + // (trunc uaddo_carry(X, Y, Carry)) -> + // (uaddo_carry trunc(X), trunc(Y), Carry) // When the adde's carry is not used. - // We only do for addcarry before legalize operation - if (((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) || + // We only do for uaddo_carry before legalize operation + if (((!LegalOperations && N0.getOpcode() == ISD::UADDO_CARRY) || TLI.isOperationLegal(N0.getOpcode(), VT)) && N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) { SDLoc DL(N); @@ -14114,18 +14688,19 @@ static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { return DAG.getDataLayout().isBigEndian() ? 1 : 0; } -static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, - const TargetLowering &TLI) { +SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI) { // If this is not a bitcast to an FP type or if the target doesn't have // IEEE754-compliant FP logic, we're done. 
EVT VT = N->getValueType(0); - if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT)) + SDValue N0 = N->getOperand(0); + EVT SourceVT = N0.getValueType(); + + if (!VT.isFloatingPoint()) return SDValue(); // TODO: Handle cases where the integer constant is a different scalar // bitwidth to the FP. - SDValue N0 = N->getOperand(0); - EVT SourceVT = N0.getValueType(); if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits()) return SDValue(); @@ -14148,6 +14723,19 @@ static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, return SDValue(); } + if (LegalOperations && !TLI.isOperationLegal(FPOpcode, VT)) + return SDValue(); + + // This needs to be the inverse of logic in foldSignChangeInBitcast. + // FIXME: I don't think looking for bitcast intrinsically makes sense, but + // removing this would require more changes. + auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { + if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + return true; + + return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); + }; + // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) -> @@ -14155,9 +14743,9 @@ static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, SDValue LogicOp0 = N0.getOperand(0); ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true); if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask && - LogicOp0.getOpcode() == ISD::BITCAST && - LogicOp0.getOperand(0).getValueType() == VT) { - SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0)); + IsBitCastOrFree(LogicOp0, VT)) { + SDValue CastOp0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, LogicOp0); + SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, CastOp0); NumFPLogicOpsConv++; if (N0.getOpcode() == ISD::OR) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp); @@ -14209,6 +14797,22 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) { if (N0.getOpcode() == ISD::BITCAST) return DAG.getBitcast(VT, N0.getOperand(0)); + // fold (conv (logicop (conv x), (c))) -> (logicop x, (conv c)) + // iff the current bitwise logicop type isn't legal + if (ISD::isBitwiseLogicOp(N0.getOpcode()) && VT.isInteger() && + !TLI.isTypeLegal(N0.getOperand(0).getValueType())) { + auto IsFreeBitcast = [VT](SDValue V) { + return (V.getOpcode() == ISD::BITCAST && + V.getOperand(0).getValueType() == VT) || + (ISD::isBuildVectorOfConstantSDNodes(V.getNode()) && + V->hasOneUse()); + }; + if (IsFreeBitcast(N0.getOperand(0)) && IsFreeBitcast(N0.getOperand(1))) + return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, + DAG.getBitcast(VT, N0.getOperand(0)), + DAG.getBitcast(VT, N0.getOperand(1))); + } + // fold (conv (load x)) -> (load (conv*)x) // If the resultant load doesn't need a higher alignment than the original! 
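The foldBitcastedFPLogic folds above all reduce to IEEE-754's sign bit being an independent bit: masking it is fabs, flipping it is fneg. A standalone check over a few sample floats (plain C++, not LLVM code; memcpy is the portable bit-cast, the values are arbitrary, and NaNs are excluded):

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static uint32_t toBits(float F) { uint32_t U; std::memcpy(&U, &F, 4); return U; }
    static float toFloat(uint32_t U) { float F; std::memcpy(&F, &U, 4); return F; }

    int main() {
      const float Vals[] = {1.5f, -2.25f, 0.0f, -0.0f, 1e30f};
      for (float F : Vals) {
        // and (bitcast f), 0x7fff... --> fabs f
        assert(toBits(toFloat(toBits(F) & 0x7FFFFFFFu)) == toBits(std::fabs(F)));
        // xor (bitcast f), 0x8000... --> fneg f
        assert(toBits(toFloat(toBits(F) ^ 0x80000000u)) == toBits(-F));
      }
      return 0;
    }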
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && @@ -14437,7 +15041,9 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { N0->getNumValues() != 1 || !N0->hasOneUse()) return SDValue(); - bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR; + bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR || + N0.getOpcode() == ISD::BUILD_PAIR || + N0.getOpcode() == ISD::CONCAT_VECTORS; SmallSetVector<SDValue, 8> MaybePoisonOperands; for (SDValue Op : N0->ops()) { @@ -14474,6 +15080,10 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { } } + // This node has been merged with another. + if (N->getOpcode() == ISD::DELETED_NODE) + return SDValue(N, 0); + // The whole node may have been updated, so the value we were holding // may no longer be valid. Re-fetch the operand we're `freeze`ing. N0 = N->getOperand(0); @@ -14585,21 +15195,26 @@ static bool hasNoInfs(const TargetOptions &Options, SDValue N) { } /// Try to perform FMA combining on a given FADD node. +template <class MatchContextClass> SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); - + MatchContextClass matcher(DAG, TLI, N); const TargetOptions &Options = DAG.getTarget().Options; + bool UseVP = std::is_same_v<MatchContextClass, VPMatchContext>; + // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); + // FIXME: Make isFMADLegal have specific behavior when using VPMatchContext. + // FIXME: Add VP_FMAD opcode. + bool HasFMAD = !UseVP && (LegalOperations && TLI.isFMADLegal(DAG, N)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); + (!LegalOperations || matcher.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) @@ -14613,6 +15228,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { if (!AllowFusionGlobally && !N->getFlags().hasAllowContract()) return SDValue(); + // Folding fadd (fmul x, y), (fmul x, y) -> fma x, y, (fmul x, y) is never + // beneficial. It does not reduce latency. It increases register pressure. It + // replaces an fadd with an fma which is a more complex instruction, so is + // likely to have a larger encoding, use more functional units, etc. + if (N0 == N1) + return SDValue(); + if (TLI.generateFMAsInMachineCombiner(VT, OptLevel)) return SDValue(); @@ -14621,14 +15243,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { bool Aggressive = TLI.enableAggressiveFMAFusion(VT); auto isFusedOp = [&](SDValue N) { - unsigned Opcode = N.getOpcode(); - return Opcode == ISD::FMA || Opcode == ISD::FMAD; + return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD); }; // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. 
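Why the FMA combine above insists on AllowFusionGlobally or hasAllowContract: fmul+fadd rounds twice while fma rounds once, so fusing can change the observable result. A small demonstration (plain C++17, not LLVM code; the inputs are chosen so the product's low bit is lost to rounding):

    #include <cmath>
    #include <cstdio>

    int main() {
      double X = 1.0 + 0x1p-30, Y = 1.0 - 0x1p-30, Z = -1.0;
      double Separate = X * Y + Z;      // fmul rounds X*Y to 1.0, then fadd gives 0.0
      double Fused = std::fma(X, Y, Z); // single rounding of X*Y + Z: -0x1p-60
      std::printf("separate = %a\nfused    = %a\n", Separate, Fused);
      return 0;
    }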
- auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { - if (N.getOpcode() != ISD::FMUL) + auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) { + if (!matcher.match(N, ISD::FMUL)) return false; return AllowFusionGlobally || N->getFlags().hasAllowContract(); }; @@ -14641,15 +15262,15 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), - N0.getOperand(1), N1); + return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), + N0.getOperand(1), N1); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), - N1.getOperand(1), N0); + return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), + N1.getOperand(1), N0); } // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E) @@ -14673,10 +15294,10 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue TmpFMA = FMA; while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) { SDValue FMul = TmpFMA->getOperand(2); - if (FMul.getOpcode() == ISD::FMUL && FMul.hasOneUse()) { + if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) { SDValue C = FMul.getOperand(0); SDValue D = FMul.getOperand(1); - SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E); + SDValue CDE = matcher.getNode(PreferredFusedOpcode, SL, VT, C, D, E); DAG.ReplaceAllUsesOfValueWith(FMul, CDE); // Replacing the inner FMul could cause the outer FMA to be simplified // away. @@ -14690,29 +15311,29 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // Look through FP_EXTEND nodes to do more combining. // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) - if (N0.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), - N1); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), N1); } } // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) // Note: Commutes FADD operands. 
- if (N1.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), - N0); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); } } @@ -14722,15 +15343,15 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), - Z)); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, X, Y, + matcher.getNode(PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, U), + matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; if (isFusedOp(N0)) { SDValue N02 = N0.getOperand(2); - if (N02.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N02, ISD::FP_EXTEND)) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, @@ -14749,12 +15370,13 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { // interesting for all targets, especially GPUs. auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) { - return DAG.getNode( - PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), - DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, U), - DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, X), + matcher.getNode(ISD::FP_EXTEND, SL, VT, Y), + matcher.getNode(PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, U), + matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z)); }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); @@ -14810,20 +15432,26 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { } /// Try to perform FMA combining on a given FSUB node. +template <class MatchContextClass> SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); - + MatchContextClass matcher(DAG, TLI, N); const TargetOptions &Options = DAG.getTarget().Options; + + bool UseVP = std::is_same_v<MatchContextClass, VPMatchContext>; + // Floating-point multiply-add with intermediate rounding. - bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N)); + // FIXME: Make isFMADLegal have specific behavior when using VPMatchContext. + // FIXME: Add VP_FMAD opcode. + bool HasFMAD = !UseVP && (LegalOperations && TLI.isFMADLegal(DAG, N)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); + (!LegalOperations || matcher.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. 
if (!HasFMAD && !HasFMA) @@ -14847,8 +15475,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. - auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { - if (N.getOpcode() != ISD::FMUL) + auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) { + if (!matcher.match(N, ISD::FMUL)) return false; return AllowFusionGlobally || N->getFlags().hasAllowContract(); }; @@ -14856,8 +15484,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) { if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), - XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z)); + return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0), + XY.getOperand(1), + matcher.getNode(ISD::FNEG, SL, VT, Z)); } return SDValue(); }; @@ -14866,9 +15495,10 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Note: Commutes FSUB operands. auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) { if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), - YZ.getOperand(1), X); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)), + YZ.getOperand(1), X); } return SDValue(); }; @@ -14893,44 +15523,46 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { } // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) - if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && + if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N00), N01, - DAG.getNode(ISD::FNEG, SL, VT, N1)); + return matcher.getNode(PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FNEG, SL, VT, N00), N01, + matcher.getNode(ISD::FNEG, SL, VT, N1)); } // Look through FP_EXTEND nodes to do more combining. // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) - if (N0.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1)); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), + matcher.getNode(ISD::FNEG, SL, VT, N1)); } } // fold (fsub x, (fpext (fmul y, z))) // -> (fma (fneg (fpext y)), (fpext z), x) // Note: Commutes FSUB operands. 
- if (N1.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N1, ISD::FP_EXTEND)) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N10.getValueType())) { - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); + matcher.getNode( + ISD::FNEG, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0); } } @@ -14940,19 +15572,20 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. - if (N0.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FNEG) { + if (matcher.match(N00, ISD::FNEG)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode( + return matcher.getNode( ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), + N1)); } } } @@ -14963,24 +15596,25 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. - if (N0.getOpcode() == ISD::FNEG) { + if (matcher.match(N0, ISD::FNEG)) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N00, ISD::FP_EXTEND)) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N000.getValueType())) { - return DAG.getNode( + return matcher.getNode( ISD::FNEG, SL, VT, - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), - N1)); + matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), + N1)); } } } - auto isReassociable = [Options](SDNode *N) { + auto isReassociable = [&Options](SDNode *N) { return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); }; @@ -14990,8 +15624,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { }; auto isFusedOp = [&](SDValue N) { - unsigned Opcode = N.getOpcode(); - return Opcode == ISD::FMA || Opcode == ISD::FMAD; + return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD); }; // More folding opportunities when target permits. 
@@ -15002,12 +15635,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { if (CanFuse && isFusedOp(N0) && isContractableAndReassociableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { - return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), - N0.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - N0.getOperand(2).getOperand(0), - N0.getOperand(2).getOperand(1), - DAG.getNode(ISD::FNEG, SL, VT, N1))); + return matcher.getNode( + PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), + matcher.getNode(PreferredFusedOpcode, SL, VT, + N0.getOperand(2).getOperand(0), + N0.getOperand(2).getOperand(1), + matcher.getNode(ISD::FNEG, SL, VT, N1))); } // fold (fsub x, (fma y, z, (fmul u, v))) @@ -15017,29 +15650,30 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N1->hasOneUse() && NoSignedZero) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); + matcher.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), + N1.getOperand(1), + matcher.getNode(PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FNEG, SL, VT, N20), N21, N0)); } // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (isFusedOp(N0) && N0->hasOneUse()) { SDValue N02 = N0.getOperand(2); - if (N02.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N02, ISD::FP_EXTEND)) { SDValue N020 = N02.getOperand(0); if (isContractableAndReassociableFMUL(N020) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N020.getValueType())) { - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode( + matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1))); + matcher.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), + matcher.getNode(ISD::FNEG, SL, VT, N1))); } } } @@ -15050,29 +15684,29 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
- if (N0.getOpcode() == ISD::FP_EXTEND) { + if (matcher.match(N0, ISD::FP_EXTEND)) { SDValue N00 = N0.getOperand(0); if (isFusedOp(N00)) { SDValue N002 = N00.getOperand(2); if (isContractableAndReassociableFMUL(N002) && TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT, N00.getValueType())) { - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), - DAG.getNode( + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), + matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), - DAG.getNode(ISD::FNEG, SL, VT, N1))); + matcher.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), + matcher.getNode(ISD::FNEG, SL, VT, N1))); } } } // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) - if (isFusedOp(N1) && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND && + if (isFusedOp(N1) && matcher.match(N1.getOperand(2), ISD::FP_EXTEND) && N1->hasOneUse()) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableAndReassociableFMUL(N120) && @@ -15080,13 +15714,15 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0)); + matcher.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), + N1.getOperand(1), + matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FNEG, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N1200)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0)); } } @@ -15096,7 +15732,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. 
- if (N1.getOpcode() == ISD::FP_EXTEND && isFusedOp(N1.getOperand(0))) { + if (matcher.match(N1, ISD::FP_EXTEND) && isFusedOp(N1.getOperand(0))) { SDValue CvtSrc = N1.getOperand(0); SDValue N100 = CvtSrc.getOperand(0); SDValue N101 = CvtSrc.getOperand(1); @@ -15106,15 +15742,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); - return DAG.getNode( + return matcher.getNode( PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), - DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, - DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)), - DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0)); + matcher.getNode(ISD::FNEG, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N100)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N101), + matcher.getNode( + PreferredFusedOpcode, SL, VT, + matcher.getNode(ISD::FNEG, SL, VT, + matcher.getNode(ISD::FP_EXTEND, SL, VT, N1020)), + matcher.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0)); } } } @@ -15217,6 +15854,17 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVP_FADD(SDNode *N) { + SelectionDAG::FlagInserter FlagsInserter(DAG, N); + + // FADD -> FMA combines: + if (SDValue Fused = visitFADDForFMACombine<VPMatchContext>(N)) { + AddToWorklist(Fused.getNode()); + return Fused; + } + return SDValue(); +} + SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -15394,10 +16042,15 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { DAG.getConstantFP(4.0, DL, VT)); } } + + // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL, + VT, N0, N1, Flags)) + return SD; } // enable-unsafe-fp-math // FADD -> FMA combines: - if (SDValue Fused = visitFADDForFMACombine(N)) { + if (SDValue Fused = visitFADDForFMACombine<EmptyMatchContext>(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -15507,7 +16160,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1); // FSUB -> FMA combines: - if (SDValue Fused = visitFSUBForFMACombine(N)) { + if (SDValue Fused = visitFSUBForFMACombine<EmptyMatchContext>(N)) { AddToWorklist(Fused.getNode()); return Fused; } @@ -15568,6 +16221,11 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts); } + + // Fold fmul(vecreduce(x), vecreduce(y)) -> vecreduce(fmul(x, y)) + if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FMUL, ISD::FMUL, DL, + VT, N0, N1, Flags)) + return SD; } // fold (fmul X, 2.0) -> (fadd X, X) @@ -15653,7 +16311,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitFMA(SDNode *N) { +template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); @@ -15664,6 +16322,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { const TargetOptions &Options = DAG.getTarget().Options; // FMA nodes have flags that propagate to the created nodes. 
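Editor's note: this chunk also introduces reassociateReduction, folding fadd(vecreduce(x), vecreduce(y)) into vecreduce(fadd(x, y)) under the unsafe-math guard, trading two horizontal reductions for one. A scalar sketch of the rewrite and of why the guard exists (the fold reorders FP additions):

```cpp
// reassociateReduction in essence: two horizontal reductions plus a
// scalar fadd become one elementwise fadd plus a single reduction. The
// rewrite reassociates FP adds, hence the reassoc/unsafe-math gate.
#include <array>
#include <cstdio>
#include <numeric>

int main() {
  std::array<float, 8> X{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<float, 8> Y{8, 7, 6, 5, 4, 3, 2, 1};

  // Before the fold: reduce X, reduce Y, then add the scalars.
  float Before = std::accumulate(X.begin(), X.end(), 0.0f) +
                 std::accumulate(Y.begin(), Y.end(), 0.0f);

  // After the fold: add elementwise, reduce once.
  std::array<float, 8> Sum;
  for (std::size_t I = 0; I != Sum.size(); ++I)
    Sum[I] = X[I] + Y[I];
  float After = std::accumulate(Sum.begin(), Sum.end(), 0.0f);

  // Equal here (72), but only guaranteed equal when reassociation is legal.
  std::printf("%g %g\n", Before, After);
}
```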
SelectionDAG::FlagInserter FlagsInserter(DAG, N); + MatchContextClass matcher(DAG, TLI, N); bool CanReassociate = Options.UnsafeFPMath || N->getFlags().hasAllowReassociation(); @@ -15672,7 +16331,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1) && isa<ConstantFPSDNode>(N2)) { - return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); + return matcher.getNode(ISD::FMA, DL, VT, N0, N1, N2); } // (-N0 * -N1) + N2 --> (N0 * N1) + N2 @@ -15688,7 +16347,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1); if (NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper || CostN1 == TargetLowering::NegatibleCost::Cheaper)) - return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); + return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); } // FIXME: use fast math flags instead of Options.UnsafeFPMath @@ -15699,70 +16358,74 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return N2; } + // FIXME: Support splat of constant. if (N0CFP && N0CFP->isExactlyValue(1.0)) - return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); + return matcher.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) - return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); + return matcher.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && !DAG.isConstantFPBuildVectorOrConstantFP(N1)) - return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); + return matcher.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); if (CanReassociate) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) - if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && + if (matcher.match(N2, ISD::FMUL) && N0 == N2.getOperand(0) && DAG.isConstantFPBuildVectorOrConstantFP(N1) && DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { - return DAG.getNode(ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1))); + return matcher.getNode( + ISD::FMUL, DL, VT, N0, + matcher.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1))); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) - if (N0.getOpcode() == ISD::FMUL && + if (matcher.match(N0, ISD::FMUL) && DAG.isConstantFPBuildVectorOrConstantFP(N1) && DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { - return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), - DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)), - N2); + return matcher.getNode( + ISD::FMA, DL, VT, N0.getOperand(0), + matcher.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)), N2); } } // (fma x, -1, y) -> (fadd (fneg x), y) + // FIXME: Support splat of constant. 
if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) - return DAG.getNode(ISD::FADD, DL, VT, N0, N2); + return matcher.getNode(ISD::FADD, DL, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { - SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); + SDValue RHSNeg = matcher.getNode(ISD::FNEG, DL, VT, N0); AddToWorklist(RHSNeg.getNode()); - return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); + return matcher.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } // fma (fneg x), K, y -> fma x -K, y - if (N0.getOpcode() == ISD::FNEG && + if (matcher.match(N0, ISD::FNEG) && (TLI.isOperationLegal(ISD::ConstantFP, VT) || - (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT, - ForCodeSize)))) { - return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), - DAG.getNode(ISD::FNEG, DL, VT, N1), N2); + (N1.hasOneUse() && + !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT, ForCodeSize)))) { + return matcher.getNode(ISD::FMA, DL, VT, N0.getOperand(0), + matcher.getNode(ISD::FNEG, DL, VT, N1), N2); } } + // FIXME: Support splat of constant. if (CanReassociate) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { - return DAG.getNode( - ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT))); + return matcher.getNode(ISD::FMUL, DL, VT, N0, + matcher.getNode(ISD::FADD, DL, VT, N1, + DAG.getConstantFP(1.0, DL, VT))); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) - if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { - return DAG.getNode( - ISD::FMUL, DL, VT, N0, - DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT))); + if (N1CFP && matcher.match(N2, ISD::FNEG) && N2.getOperand(0) == N0) { + return matcher.getNode(ISD::FMUL, DL, VT, N0, + matcher.getNode(ISD::FADD, DL, VT, N1, + DAG.getConstantFP(-1.0, DL, VT))); } } @@ -15771,7 +16434,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { if (!TLI.isFNegFree(VT)) if (SDValue Neg = TLI.getCheaperNegatedExpression( SDValue(N, 0), DAG, LegalOperations, ForCodeSize)) - return DAG.getNode(ISD::FNEG, DL, VT, Neg); + return matcher.getNode(ISD::FNEG, DL, VT, Neg); return SDValue(); } @@ -16043,27 +16706,30 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) { /// copysign(x, fp_extend(y)) -> copysign(x, y) /// copysign(x, fp_round(y)) -> copysign(x, y) -static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { - SDValue N1 = N->getOperand(1); - if ((N1.getOpcode() == ISD::FP_EXTEND || - N1.getOpcode() == ISD::FP_ROUND)) { - EVT N1VT = N1->getValueType(0); - EVT N1Op0VT = N1->getOperand(0).getValueType(); +/// Operands to the functions are the type of X and Y respectively. +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(EVT XTy, EVT YTy) { + // Always fold no-op FP casts. + if (XTy == YTy) + return true; - // Always fold no-op FP casts. - if (N1VT == N1Op0VT) - return true; + // Do not optimize out type conversion of f128 type yet. + // For some targets like x86_64, configuration is changed to keep one f128 + // value in one SSE register, but instruction selection cannot handle + // FCOPYSIGN on SSE registers yet. + if (YTy == MVT::f128) + return false; - // Do not optimize out type conversion of f128 type yet. - // For some targets like x86_64, configuration is changed to keep one f128 - // value in one SSE register, but instruction selection cannot handle - // FCOPYSIGN on SSE registers yet. 
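Editor's note: the CanCombineFCOPYSIGN_EXTEND_ROUND rework that begins here factors the test into an EVT-based overload so visitFP_ROUND can reuse it below. The underlying identity is that copysign reads only the sign of its second operand, and FP extend/round preserve that sign, so the cast on Y can be dropped. A standard-library illustration:

```cpp
// copysign(x, fp_extend(y)) == copysign(x, y): only y's sign bit is read,
// and FP extend/round preserve it (even for negative zero), so the cast
// on the sign operand is redundant.
#include <cmath>
#include <cstdio>

int main() {
  float Y = -0.0f; // the sign survives even in negative zero
  double WithCast = std::copysign(2.0, static_cast<double>(Y));
  double Direct = std::copysign(2.0, Y); // mixed overload, same sign bit
  std::printf("%g %g\n", WithCast, Direct); // -2 -2
}
```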
- if (N1Op0VT == MVT::f128) - return false; + return !YTy.isVector() || EnableVectorFCopySignExtendRound; +} - return !N1Op0VT.isVector() || EnableVectorFCopySignExtendRound; - } - return false; +static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::FP_EXTEND && + N1.getOpcode() != ISD::FP_ROUND) + return false; + EVT N1VT = N1->getValueType(0); + EVT N1Op0VT = N1->getOperand(0).getValueType(); + return CanCombineFCOPYSIGN_EXTEND_ROUND(N1VT, N1Op0VT); } SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { @@ -16399,6 +17065,10 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { const bool NIsTrunc = N->getConstantOperandVal(1) == 1; const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1; + // Avoid folding legal fp_rounds into non-legal ones. + if (!hasOperation(ISD::FP_ROUND, VT)) + return SDValue(); + // Skip this folding if it results in an fp_round from f80 to f16. // // f80 to f16 always generates an expensive (and as yet, unimplemented) @@ -16423,7 +17093,13 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) - if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) { + // Note: From a legality perspective, this is a two step transform. First, + // we duplicate the fp_round to the arguments of the copysign, then we + // eliminate the fp_round on Y. The second step requires an additional + // predicate to match the implementation above. + if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() && + CanCombineFCOPYSIGN_EXTEND_ROUND(VT, + N0.getValueType())) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); @@ -16529,6 +17205,15 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFFREXP(SDNode *N) { + SDValue N0 = N->getOperand(0); + + // fold (ffrexp c1) -> ffrexp(c1) + if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) + return DAG.getNode(ISD::FFREXP, SDLoc(N), N->getVTList(), N0); + return SDValue(); +} + SDValue DAGCombiner::visitFFLOOR(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -16618,6 +17303,13 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { } } + if (SDValue SD = reassociateReduction( + PropagatesNaN + ? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM) + : (IsMin ? ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX), + Opc, SDLoc(N), VT, N0, N1, Flags)) + return SD; + return SDValue(); } @@ -16656,6 +17348,55 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { N1->getOperand(0), N2); } + // Variant of the previous fold where there is a SETCC in between: + // BRCOND(SETCC(FREEZE(X), CONST, Cond)) + // => + // BRCOND(FREEZE(SETCC(X, CONST, Cond))) + // => + // BRCOND(SETCC(X, CONST, Cond)) + // This is correct if FREEZE(X) has one use and SETCC(FREEZE(X), CONST, Cond) + // isn't equivalent to true or false. + // For example, SETCC(FREEZE(X), -128, SETULT) cannot be folded to + // FREEZE(SETCC(X, -128, SETULT)) because X can be poison. + if (N1->getOpcode() == ISD::SETCC && N1.hasOneUse()) { + SDValue S0 = N1->getOperand(0), S1 = N1->getOperand(1); + ISD::CondCode Cond = cast<CondCodeSDNode>(N1->getOperand(2))->get(); + ConstantSDNode *S0C = dyn_cast<ConstantSDNode>(S0); + ConstantSDNode *S1C = dyn_cast<ConstantSDNode>(S1); + bool Updated = false; + + // Is 'X Cond C' always true or false? 
+ auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) { + bool False = (Cond == ISD::SETULT && C->isZero()) || + (Cond == ISD::SETLT && C->isMinSignedValue()) || + (Cond == ISD::SETUGT && C->isAllOnes()) || + (Cond == ISD::SETGT && C->isMaxSignedValue()); + bool True = (Cond == ISD::SETULE && C->isAllOnes()) || + (Cond == ISD::SETLE && C->isMaxSignedValue()) || + (Cond == ISD::SETUGE && C->isZero()) || + (Cond == ISD::SETGE && C->isMinSignedValue()); + return True || False; + }; + + if (S0->getOpcode() == ISD::FREEZE && S0.hasOneUse() && S1C) { + if (!IsAlwaysTrueOrFalse(Cond, S1C)) { + S0 = S0->getOperand(0); + Updated = true; + } + } + if (S1->getOpcode() == ISD::FREEZE && S1.hasOneUse() && S0C) { + if (!IsAlwaysTrueOrFalse(ISD::getSetCCSwappedOperands(Cond), S0C)) { + S1 = S1->getOperand(0); + Updated = true; + } + } + + if (Updated) + return DAG.getNode( + ISD::BRCOND, SDLoc(N), MVT::Other, Chain, + DAG.getSetCC(SDLoc(N1), N1->getValueType(0), S0, S1, Cond), N2); + } + // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. @@ -17288,11 +18029,53 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { return false; } +StoreSDNode *DAGCombiner::getUniqueStoreFeeding(LoadSDNode *LD, + int64_t &Offset) { + SDValue Chain = LD->getOperand(0); + + // Look through CALLSEQ_START. + if (Chain.getOpcode() == ISD::CALLSEQ_START) + Chain = Chain->getOperand(0); + + StoreSDNode *ST = nullptr; + SmallVector<SDValue, 8> Aliases; + if (Chain.getOpcode() == ISD::TokenFactor) { + // Look for unique store within the TokenFactor. + for (SDValue Op : Chain->ops()) { + StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode()); + if (!Store) + continue; + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); + if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) + continue; + // Make sure the store is not aliased with any nodes in TokenFactor. + GatherAllAliases(Store, Chain, Aliases); + if (Aliases.empty() || + (Aliases.size() == 1 && Aliases.front().getNode() == Store)) + ST = Store; + break; + } + } else { + StoreSDNode *Store = dyn_cast<StoreSDNode>(Chain.getNode()); + if (Store) { + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG); + if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) + ST = Store; + } + } + + return ST; +} + SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (OptLevel == CodeGenOpt::None || !LD->isSimple()) return SDValue(); SDValue Chain = LD->getOperand(0); - StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode()); + int64_t Offset; + + StoreSDNode *ST = getUniqueStoreFeeding(LD, Offset); // TODO: Relax this restriction for unordered atomics (see D66309) if (!ST || !ST->isSimple() || ST->getAddressSpace() != LD->getAddressSpace()) return SDValue(); @@ -17309,8 +18092,8 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { // 2. The store is scalable and the load is fixed width. We could // potentially support a limited number of cases here, but there has been // no cost-benefit analysis to prove it's worth it. 
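Editor's note: the IsAlwaysTrueOrFalse lambda above exists because freeze(x) CMP C may only be rewritten to freeze(x CMP C) when the compare is not a tautology; against a boundary constant the result is fixed regardless of x, and hoisting would hide x's poison. A standalone 32-bit analogue (the Cond enum is an illustrative stand-in for ISD::CondCode):

```cpp
// Standalone analogue of IsAlwaysTrueOrFalse for 32-bit constants: a
// compare against a boundary constant has a fixed result, so rewriting
// freeze(x) CMP C into freeze(x CMP C) would be unsound for it.
#include <cstdint>
#include <cstdio>
#include <limits>

enum class Cond { ULT, ULE, SLT, SLE, UGT, UGE, SGT, SGE };

bool isAlwaysTrueOrFalse(Cond C, uint32_t K) {
  int32_t S = static_cast<int32_t>(K);
  bool AlwaysFalse =
      (C == Cond::ULT && K == 0) ||                                   // x <u 0
      (C == Cond::SLT && S == std::numeric_limits<int32_t>::min()) || // x <s MIN
      (C == Cond::UGT && K == UINT32_MAX) ||                          // x >u ~0
      (C == Cond::SGT && S == std::numeric_limits<int32_t>::max());   // x >s MAX
  bool AlwaysTrue =
      (C == Cond::ULE && K == UINT32_MAX) ||
      (C == Cond::SLE && S == std::numeric_limits<int32_t>::max()) ||
      (C == Cond::UGE && K == 0) ||
      (C == Cond::SGE && S == std::numeric_limits<int32_t>::min());
  return AlwaysTrue || AlwaysFalse;
}

int main() {
  std::printf("%d\n", isAlwaysTrueOrFalse(Cond::ULT, 0));           // 1: always false
  std::printf("%d\n", isAlwaysTrueOrFalse(Cond::ULT, 128));         // 0: depends on x
  std::printf("%d\n", isAlwaysTrueOrFalse(Cond::SGE, 0x80000000u)); // 1: always true
}
```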
- bool LdStScalable = LDMemType.isScalableVector(); - if (LdStScalable != STMemType.isScalableVector()) + bool LdStScalable = LDMemType.isScalableVT(); + if (LdStScalable != STMemType.isScalableVT()) return SDValue(); // If we are dealing with scalable vectors on a big endian platform the @@ -17320,12 +18103,6 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (LdStScalable && DAG.getDataLayout().isBigEndian()) return SDValue(); - BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); - BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); - int64_t Offset; - if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset)) - return SDValue(); - // Normalize for Endianness. After this Offset=0 will denote that the least // significant bit in the loaded value maps to the least significant bit in // the stored value). With Offset=n (for n > 0) the loaded value starts at the @@ -17682,7 +18459,7 @@ struct LoadedSlice { /// Get the size of the slice to be loaded in bytes. unsigned getLoadedSize() const { - unsigned SliceSize = getUsedBits().countPopulation(); + unsigned SliceSize = getUsedBits().popcount(); assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); return SliceSize / 8; } @@ -17867,9 +18644,9 @@ static bool areUsedBitsDense(const APInt &UsedBits) { return true; // Get rid of the unused bits on the right. - APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); + APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countr_zero()); // Get rid of the unused bits on the left. - if (NarrowedUsedBits.countLeadingZeros()) + if (NarrowedUsedBits.countl_zero()) NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); // Check that the chunk of bits is completely used. return NarrowedUsedBits.isAllOnes(); @@ -18125,14 +18902,14 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { // 0 and the bits being kept are 1. Use getSExtValue so that leading bits // follow the sign bit for uniformity. uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue(); - unsigned NotMaskLZ = countLeadingZeros(NotMask); + unsigned NotMaskLZ = llvm::countl_zero(NotMask); if (NotMaskLZ & 7) return Result; // Must be multiple of a byte. - unsigned NotMaskTZ = countTrailingZeros(NotMask); + unsigned NotMaskTZ = llvm::countr_zero(NotMask); if (NotMaskTZ & 7) return Result; // Must be multiple of a byte. if (NotMaskLZ == 64) return Result; // All zero mask. // See if we have a continuous run of bits. If so, we have 0*1+0* - if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) + if (llvm::countr_one(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) return Result; // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. @@ -18199,6 +18976,11 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo, UseTruncStore = true; else return SDValue(); + + // Can't do this for indexed stores. + if (St->isIndexed()) + return SDValue(); + // Check that the target doesn't think this is a bad idea. 
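Editor's note: the mechanical renames in this chunk (countPopulation to popcount, countTrailingZeros to countr_zero, countLeadingZeros to countl_zero, and so on) track the C++20 `<bit>` naming that the LLVM support classes adopted. The areUsedBitsDense logic maps directly onto those primitives; a minimal 64-bit sketch:

```cpp
// areUsedBitsDense, restated with C++20 <bit>: strip the zeros on the
// right, then the mask is "dense" iff what remains is one contiguous
// run of ones (a power of two minus one).
#include <bit>
#include <cstdint>
#include <cstdio>

bool usedBitsDense(uint64_t Used) {
  if (Used == 0)
    return false;
  uint64_t Narrowed = Used >> std::countr_zero(Used);
  return (Narrowed & (Narrowed + 1)) == 0; // all-ones check
}

int main() {
  std::printf("%d\n", usedBitsDense(0x00FF0000)); // 1: single contiguous run
  std::printf("%d\n", usedBitsDense(0x00FF00F0)); // 0: two separate runs
}
```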
if (St->getMemOperand() && !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, @@ -18309,8 +19091,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { Imm ^= APInt::getAllOnes(BitWidth); if (Imm == 0 || Imm.isAllOnes()) return SDValue(); - unsigned ShAmt = Imm.countTrailingZeros(); - unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1; + unsigned ShAmt = Imm.countr_zero(); + unsigned MSB = BitWidth - Imm.countl_zero() - 1; unsigned NewBW = NextPowerOf2(MSB - ShAmt); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); // The narrowing should be profitable, the load/store operation should be @@ -18527,6 +19309,30 @@ SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes, return DAG.getTokenFactor(StoreDL, Chains); } +bool DAGCombiner::hasSameUnderlyingObj(ArrayRef<MemOpLink> StoreNodes) { + const Value *UnderlyingObj = nullptr; + for (const auto &MemOp : StoreNodes) { + const MachineMemOperand *MMO = MemOp.MemNode->getMemOperand(); + // Pseudo value like stack frame has its own frame index and size, should + // not use the first store's frame index for other frames. + if (MMO->getPseudoValue()) + return false; + + if (!MMO->getValue()) + return false; + + const Value *Obj = getUnderlyingObject(MMO->getValue()); + + if (UnderlyingObj && UnderlyingObj != Obj) + return false; + + if (!UnderlyingObj) + UnderlyingObj = Obj; + } + + return true; +} + bool DAGCombiner::mergeStoresOfConstantsOrVecElts( SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc) { @@ -18678,13 +19484,21 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); + bool CanReusePtrInfo = hasSameUnderlyingObj(StoreNodes); // make sure we use trunc store if it's necessary to be legal. + // When generate the new widen store, if the first store's pointer info can + // not be reused, discard the pointer info except the address space because + // now the widen store can not be represented by the original pointer info + // which is for the narrow memory object. SDValue NewStore; if (!UseTrunc) { - NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - FirstInChain->getAlign(), *Flags, AAInfo); + NewStore = DAG.getStore( + NewChain, DL, StoredVal, FirstInChain->getBasePtr(), + CanReusePtrInfo + ? FirstInChain->getPointerInfo() + : MachinePointerInfo(FirstInChain->getPointerInfo().getAddrSpace()), + FirstInChain->getAlign(), *Flags, AAInfo); } else { // Must be realized as a trunc store EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); @@ -18695,8 +19509,11 @@ bool DAGCombiner::mergeStoresOfConstantsOrVecElts( LegalizedStoredValTy); NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, - FirstInChain->getAlign(), *Flags, AAInfo); + CanReusePtrInfo + ? FirstInChain->getPointerInfo() + : MachinePointerInfo(FirstInChain->getPointerInfo().getAddrSpace()), + StoredVal.getValueType() /*TVT*/, FirstInChain->getAlign(), *Flags, + AAInfo); } // Replace all merged stores with the new store. @@ -18749,6 +19566,8 @@ void DAGCombiner::getStoreMergeCandidates( // Don't mix temporal stores with non-temporal stores. 
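Editor's note: the new hasSameUnderlyingObj above lets merged stores keep the first store's MachinePointerInfo only when every store provably targets the same object; a pseudo value or a missing IR value forces the conservative, address-space-only pointer info. A toy version of the scan, with plain pointers standing in for MachineMemOperand values:

```cpp
// Sketch of the hasSameUnderlyingObj scan: pointer-info reuse is allowed
// only when every merged store resolves to one underlying object, and an
// unknown or pseudo base (modeled as nullptr) forces the conservative path.
#include <cstdio>
#include <vector>

struct MemOp {
  const void *UnderlyingObj; // nullptr: pseudo value / no IR value
};

bool hasSameUnderlyingObj(const std::vector<MemOp> &Ops) {
  const void *Obj = nullptr;
  for (const MemOp &Op : Ops) {
    if (!Op.UnderlyingObj)
      return false; // can't prove anything about this access
    if (Obj && Obj != Op.UnderlyingObj)
      return false; // two distinct objects
    Obj = Op.UnderlyingObj;
  }
  return true;
}

int main() {
  int A[4], B[4];
  std::printf("%d\n", hasSameUnderlyingObj({{A}, {A}, {A}}));  // 1
  std::printf("%d\n", hasSameUnderlyingObj({{A}, {B}}));       // 0
  std::printf("%d\n", hasSameUnderlyingObj({{A}, {nullptr}})); // 0
}
```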
if (St->isNonTemporal() != Other->isNonTemporal()) return false; + if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*St, *Other)) + return false; SDValue OtherBC = peekThroughBitcasts(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) @@ -18774,6 +19593,9 @@ void DAGCombiner::getStoreMergeCandidates( // Don't mix temporal loads with non-temporal loads. if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal()) return false; + if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*cast<LoadSDNode>(Val), + *OtherLd)) + return false; if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; break; @@ -19042,11 +19864,9 @@ bool DAGCombiner::tryStoreMergeOfConstants( } } - // We only use vectors if the constant is known to be zero or the - // target allows it and the function is not marked with the - // noimplicitfloat attribute. - if ((!NonZero || - TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && + // We only use vectors if the target allows it and the function is not + // marked with the noimplicitfloat attribute. + if (TLI.storeOfVectorConstantIsCheap(!NonZero, MemVT, i + 1, FirstStoreAS) && AllowVectors) { // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; @@ -19389,6 +20209,7 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, // using the first's chain is acceptable. SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); + bool CanReusePtrInfo = hasSameUnderlyingObj(StoreNodes); AddToWorklist(NewStoreChain.getNode()); MachineMemOperand::Flags LdMMOFlags = @@ -19397,10 +20218,14 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, if (IsNonTemporalLoad) LdMMOFlags |= MachineMemOperand::MONonTemporal; + LdMMOFlags |= TLI.getTargetMMOFlags(*FirstLoad); + MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore ? MachineMemOperand::MONonTemporal : MachineMemOperand::MONone; + StMMOFlags |= TLI.getTargetMMOFlags(*StoreNodes[0].MemNode); + SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad( @@ -19418,7 +20243,9 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, } NewStore = DAG.getStore( NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags); + CanReusePtrInfo ? FirstInChain->getPointerInfo() + : MachinePointerInfo(FirstStoreAS), + FirstStoreAlign, StMMOFlags); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); @@ -19428,8 +20255,10 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes, FirstLoadAlign, LdMMOFlags); NewStore = DAG.getTruncStore( NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), JointMemOpVT, - FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags()); + CanReusePtrInfo ? FirstInChain->getPointerInfo() + : MachinePointerInfo(FirstStoreAS), + JointMemOpVT, FirstInChain->getAlign(), + FirstInChain->getMemOperand()->getFlags()); } // Transfer chain users from old loads to the new load. @@ -19465,7 +20294,7 @@ bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) { // store since we know <vscale x 16 x i8> is exactly twice as large as // <vscale x 8 x i8>). Until then, bail out for scalable vectors. 
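Editor's note: candidate filtering in getStoreMergeCandidates now also requires target-specific MMO flags to agree (the new areTwoSDNodeTargetMMOFlagsMergeable hook), alongside the existing non-temporal match. The shape of that predicate in isolation; the field names here are invented stand-ins:

```cpp
// Stores merge only if their memory-operand properties agree, now
// including opaque target-defined MMO flag bits.
#include <cstdint>
#include <cstdio>

struct MemFlags {
  bool NonTemporal;
  uint64_t TargetFlags; // opaque target-specific bits
};

bool mergeable(const MemFlags &A, const MemFlags &B) {
  return A.NonTemporal == B.NonTemporal && A.TargetFlags == B.TargetFlags;
}

int main() {
  std::printf("%d\n", mergeable({false, 0}, {false, 0})); // 1
  std::printf("%d\n", mergeable({true, 0}, {false, 0}));  // 0: temporal mix
  std::printf("%d\n", mergeable({false, 1}, {false, 0})); // 0: target bits differ
}
```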
EVT MemVT = St->getMemoryVT(); - if (MemVT.isScalableVector()) + if (MemVT.isScalableVT()) return false; if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) return false; @@ -19647,6 +20476,62 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { } } +// (store (insert_vector_elt (load p), x, i), p) -> (store x, p+offset) +// +// If a store of a load with an element inserted into it has no other +// uses in between the chain, then we can consider the vector store +// dead and replace it with just the single scalar element store. +SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { + SDLoc DL(ST); + SDValue Value = ST->getValue(); + SDValue Ptr = ST->getBasePtr(); + SDValue Chain = ST->getChain(); + if (Value.getOpcode() != ISD::INSERT_VECTOR_ELT || !Value.hasOneUse()) + return SDValue(); + + SDValue Elt = Value.getOperand(1); + SDValue Idx = Value.getOperand(2); + + // If the element isn't byte sized then we can't compute an offset + EVT EltVT = Elt.getValueType(); + if (!EltVT.isByteSized()) + return SDValue(); + + auto *Ld = dyn_cast<LoadSDNode>(Value.getOperand(0)); + if (!Ld || Ld->getBasePtr() != Ptr || + ST->getMemoryVT() != Ld->getMemoryVT() || !ST->isSimple() || + !ISD::isNormalStore(ST) || + Ld->getAddressSpace() != ST->getAddressSpace() || + !Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) + return SDValue(); + + unsigned IsFast; + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + Elt.getValueType(), ST->getAddressSpace(), + ST->getAlign(), ST->getMemOperand()->getFlags(), + &IsFast) || + !IsFast) + return SDValue(); + EVT PtrVT = Ptr.getValueType(); + + SDValue Offset = + DAG.getNode(ISD::MUL, DL, PtrVT, Idx, + DAG.getConstant(EltVT.getSizeInBits() / 8, DL, PtrVT)); + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Offset); + MachinePointerInfo PointerInfo(ST->getAddressSpace()); + + // If the offset is a known constant then try to recover the pointer + // info + if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) { + unsigned COffset = CIdx->getSExtValue() * EltVT.getSizeInBits() / 8; + NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(COffset), DL); + PointerInfo = ST->getPointerInfo().getWithOffset(COffset); + } + + return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(), + ST->getMemOperand()->getFlags()); +} + SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast<StoreSDNode>(N); SDValue Chain = ST->getChain(); @@ -19768,9 +20653,13 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } // If this is a load followed by a store to the same location, then the store - // is dead/noop. + // is dead/noop. Peek through any truncates if canCombineTruncStore failed. + // TODO: Add big-endian truncate support with test coverage. // TODO: Can relax for unordered atomics (see D66309) - if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) { + SDValue TruncVal = DAG.getDataLayout().isLittleEndian() + ? 
peekThroughTruncates(Value)
+ : Value;
+ if (auto *Ld = dyn_cast<LoadSDNode>(TruncVal)) {
 if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
 ST->isUnindexed() && ST->isSimple() &&
 Ld->getAddressSpace() == ST->getAddressSpace() &&
@@ -19782,6 +20671,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 }
 }
+ // Try scalarizing vector stores of loads where we only change one element
+ if (SDValue NewST = replaceStoreOfInsertLoad(ST))
+ return NewST;
+
 // TODO: Can relax for unordered atomics (see D66309)
 if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
 if (ST->isUnindexed() && ST->isSimple() &&
@@ -19796,22 +20689,32 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
 !ST1->getBasePtr().isUndef() &&
- // BaseIndexOffset and the code below requires knowing the size
- // of a vector, so bail out if MemoryVT is scalable.
- !ST->getMemoryVT().isScalableVector() &&
- !ST1->getMemoryVT().isScalableVector() &&
 ST->getAddressSpace() == ST1->getAddressSpace()) {
- const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
- const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
- unsigned STBitSize = ST->getMemoryVT().getFixedSizeInBits();
- unsigned ChainBitSize = ST1->getMemoryVT().getFixedSizeInBits();
- // If this is a store who's preceding store to a subset of the current
- // location and no one other node is chained to that store we can
- // effectively drop the store. Do not remove stores to undef as they may
- // be used as data sinks.
- if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
- CombineTo(ST1, ST1->getChain());
- return SDValue();
+ // If we consider two stores and one smaller in size is a scalable
+ // vector type and another one a bigger size store with a fixed type,
+ // then we could not allow the scalable store removal because we don't
+ // know its final size in the end.
+ if (ST->getMemoryVT().isScalableVector() ||
+ ST1->getMemoryVT().isScalableVector()) {
+ if (ST1->getBasePtr() == Ptr &&
+ TypeSize::isKnownLE(ST1->getMemoryVT().getStoreSize(),
+ ST->getMemoryVT().getStoreSize())) {
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
+ } else {
+ const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
+ const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
+ // If the preceding store writes to a subset of the current store's
+ // location and no other node is chained to that store, we can
+ // effectively drop the earlier store. Do not remove stores to undef
+ // as they may be used as data sinks.
+ if (STBase.contains(DAG, ST->getMemoryVT().getFixedSizeInBits(),
+ ChainBase,
+ ST1->getMemoryVT().getFixedSizeInBits())) {
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
 }
 }
 }
@@ -20183,6 +21086,99 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
 return DAG.getBitcast(VT, Shuf);
 }
+// Combine insert(shuffle(load, <u,0,1,2>), load, 0) into a single load if
+// possible and the new load will be quick. We use more loads but fewer
+// shuffles and inserts.
+SDValue DAGCombiner::combineInsertEltToLoad(SDNode *N, unsigned InsIndex) {
+ EVT VT = N->getValueType(0);
+
+ // InsIndex is expected to be the first or last lane.
+ if (!VT.isFixedLengthVector() ||
+ (InsIndex != 0 && InsIndex != VT.getVectorNumElements() - 1))
+ return SDValue();
+
+ // Look for a shuffle with the mask u,0,1,2,3,4,5,6 or 1,2,3,4,5,6,7,u
+ // depending on the InsIndex.
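Editor's note: replaceStoreOfInsertLoad, added in the previous hunk, rewrites store(insert_vector_elt(load p, x, i), p) into a single scalar store at p + i * sizeof(element) when the chain permits. The memory effect, demonstrated on plain arrays:

```cpp
// Storing a just-loaded vector back with one lane replaced is equivalent
// to one scalar store at p + i * sizeof(element).
#include <cstdio>
#include <cstring>

void storeWholeVector(float *P, unsigned I, float X) {
  float V[4];
  std::memcpy(V, P, sizeof V); // (load p)
  V[I] = X;                    // (insert_vector_elt v, x, i)
  std::memcpy(P, V, sizeof V); // (store v, p)
}

void storeScalarOnly(float *P, unsigned I, float X) {
  P[I] = X;                    // (store x, p + i*4)
}

int main() {
  float A[4] = {1, 2, 3, 4}, B[4] = {1, 2, 3, 4};
  storeWholeVector(A, 2, 9.0f);
  storeScalarOnly(B, 2, 9.0f);
  std::printf("%g %g %g %g\n", A[0], A[2], B[0], B[2]); // 1 9 1 9
}
```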
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0)); + SDValue Scalar = N->getOperand(1); + if (!Shuffle || !all_of(enumerate(Shuffle->getMask()), [&](auto P) { + return InsIndex == P.index() || P.value() < 0 || + (InsIndex == 0 && P.value() == (int)P.index() - 1) || + (InsIndex == VT.getVectorNumElements() - 1 && + P.value() == (int)P.index() + 1); + })) + return SDValue(); + + // We optionally skip over an extend so long as both loads are extended in the + // same way from the same type. + unsigned Extend = 0; + if (Scalar.getOpcode() == ISD::ZERO_EXTEND || + Scalar.getOpcode() == ISD::SIGN_EXTEND || + Scalar.getOpcode() == ISD::ANY_EXTEND) { + Extend = Scalar.getOpcode(); + Scalar = Scalar.getOperand(0); + } + + auto *ScalarLoad = dyn_cast<LoadSDNode>(Scalar); + if (!ScalarLoad) + return SDValue(); + + SDValue Vec = Shuffle->getOperand(0); + if (Extend) { + if (Vec.getOpcode() != Extend) + return SDValue(); + Vec = Vec.getOperand(0); + } + auto *VecLoad = dyn_cast<LoadSDNode>(Vec); + if (!VecLoad || Vec.getValueType().getScalarType() != Scalar.getValueType()) + return SDValue(); + + int EltSize = ScalarLoad->getValueType(0).getScalarSizeInBits(); + if (EltSize == 0 || EltSize % 8 != 0 || !ScalarLoad->isSimple() || + !VecLoad->isSimple() || VecLoad->getExtensionType() != ISD::NON_EXTLOAD || + ScalarLoad->getExtensionType() != ISD::NON_EXTLOAD || + ScalarLoad->getAddressSpace() != VecLoad->getAddressSpace()) + return SDValue(); + + // Check that the offset between the pointers to produce a single continuous + // load. + if (InsIndex == 0) { + if (!DAG.areNonVolatileConsecutiveLoads(ScalarLoad, VecLoad, EltSize / 8, + -1)) + return SDValue(); + } else { + if (!DAG.areNonVolatileConsecutiveLoads( + VecLoad, ScalarLoad, VT.getVectorNumElements() * EltSize / 8, -1)) + return SDValue(); + } + + // And that the new unaligned load will be fast. + unsigned IsFast = 0; + Align NewAlign = commonAlignment(VecLoad->getAlign(), EltSize / 8); + if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + Vec.getValueType(), VecLoad->getAddressSpace(), + NewAlign, VecLoad->getMemOperand()->getFlags(), + &IsFast) || + !IsFast) + return SDValue(); + + // Calculate the new Ptr and create the new load. + SDLoc DL(N); + SDValue Ptr = ScalarLoad->getBasePtr(); + if (InsIndex != 0) + Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), VecLoad->getBasePtr(), + DAG.getConstant(EltSize / 8, DL, Ptr.getValueType())); + MachinePointerInfo PtrInfo = + InsIndex == 0 ? ScalarLoad->getPointerInfo() + : VecLoad->getPointerInfo().getWithOffset(EltSize / 8); + + SDValue Load = DAG.getLoad(VecLoad->getValueType(0), DL, + ScalarLoad->getChain(), Ptr, PtrInfo, NewAlign); + DAG.makeEquivalentMemoryOrdering(ScalarLoad, Load.getValue(1)); + DAG.makeEquivalentMemoryOrdering(VecLoad, Load.getValue(1)); + return Extend ? DAG.getNode(Extend, DL, VT, Load) : Load; +} + SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); @@ -20254,6 +21250,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) return Shuf; + if (SDValue Shuf = combineInsertEltToLoad(N, Elt)) + return Shuf; + // Attempt to convert an insert_vector_elt chain into a legal build_vector. if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) { // vXi1 vector - we don't need to recurse. 
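Editor's note: the adjacency requirement at the heart of combineInsertEltToLoad, reduced to plain pointer arithmetic. Inserting at lane 0, the scalar must sit exactly one element before the vector; inserting at the last lane, exactly one element past its end. The real check (areNonVolatileConsecutiveLoads) is additionally alias- and chain-aware; this is only a sketch of the geometry:

```cpp
#include <cstdint>
#include <cstdio>

bool formsSingleLoad(uintptr_t ScalarPtr, uintptr_t VecPtr, unsigned EltBytes,
                     unsigned NumElts, bool InsertAtLaneZero) {
  if (InsertAtLaneZero)
    return ScalarPtr + EltBytes == VecPtr;         // scalar | vector body
  return VecPtr + NumElts * EltBytes == ScalarPtr; // vector body | scalar
}

int main() {
  float Buf[8];
  auto P = reinterpret_cast<uintptr_t>(Buf);
  std::printf("%d\n", formsSingleLoad(P, P + 4, 4, 4, true));   // 1: contiguous
  std::printf("%d\n", formsSingleLoad(P, P + 8, 4, 4, true));   // 0: a gap
  std::printf("%d\n", formsSingleLoad(P + 16, P, 4, 4, false)); // 1: at the end
}
```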
@@ -20349,6 +21348,20 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return NewShuffle; } + // If all insertions are zero value, try to convert to AND mask. + // TODO: Do this for -1 with OR mask? + if (!LegalOperations && llvm::isNullConstant(InVal) && + all_of(Ops, [InVal](SDValue Op) { return !Op || Op == InVal; }) && + count_if(Ops, [InVal](SDValue Op) { return Op == InVal; }) >= 2) { + SDValue Zero = DAG.getConstant(0, DL, MaxEltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, MaxEltVT); + SmallVector<SDValue, 8> Mask(NumElts); + for (unsigned I = 0; I != NumElts; ++I) + Mask[I] = Ops[I] ? Zero : AllOnes; + return DAG.getNode(ISD::AND, DL, VT, CurVec, + DAG.getBuildVector(VT, DL, Mask)); + } + // Failed to find a match in the chain - bail. break; } @@ -20701,8 +21714,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // extract_vector_elt (build_vector x, y), 1 -> y if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) || VecOp.getOpcode() == ISD::SPLAT_VECTOR) && - TLI.isTypeLegal(VecVT) && - (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) { + TLI.isTypeLegal(VecVT)) { assert((VecOp.getOpcode() != ISD::BUILD_VECTOR || VecVT.isFixedLengthVector()) && "BUILD_VECTOR used for scalable vectors"); @@ -20711,12 +21723,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { SDValue Elt = VecOp.getOperand(IndexVal); EVT InEltVT = Elt.getValueType(); - // Sometimes build_vector's scalar input types do not match result type. - if (ScalarVT == InEltVT) - return Elt; + if (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT) || + isNullConstant(Elt)) { + // Sometimes build_vector's scalar input types do not match result type. + if (ScalarVT == InEltVT) + return Elt; - // TODO: It may be useful to truncate if free if the build_vector implicitly - // converts. + // TODO: It may be useful to truncate if free if the build_vector + // implicitly converts. + } } if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations)) @@ -21025,9 +22040,10 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. EVT OutScalarTy = VT.getScalarType(); - bool ValidTypes = SourceType != MVT::Other && - isPowerOf2_32(OutScalarTy.getSizeInBits()) && - isPowerOf2_32(SourceType.getSizeInBits()); + bool ValidTypes = + SourceType != MVT::Other && + llvm::has_single_bit<uint32_t>(OutScalarTy.getSizeInBits()) && + llvm::has_single_bit<uint32_t>(SourceType.getSizeInBits()); // Create a new simpler BUILD_VECTOR sequence which other optimizations can // turn into a single shuffle instruction. @@ -21157,7 +22173,7 @@ SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) { } // Only cast if the size is the same - if (Src.getValueType().getSizeInBits() != VT.getSizeInBits()) + if (!Src || Src.getValueType().getSizeInBits() != VT.getSizeInBits()) return SDValue(); return DAG.getBitcast(VT, Src); @@ -21359,10 +22375,9 @@ static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) { // the source vector. The high bits map to zero. We will use a zero vector // as the 2nd source operand of the shuffle, so use the 1st element of // that vector (mask value is number-of-elements) for the high bits. - if (i % ZextRatio == 0) - ShufMask[i] = Extract.getConstantOperandVal(1); - else - ShufMask[i] = NumMaskElts; + int Low = DAG.getDataLayout().isBigEndian() ? 
(ZextRatio - 1) : 0; + ShufMask[i] = (i % ZextRatio == Low) ? Extract.getConstantOperandVal(1) + : NumMaskElts; } // Undef elements of the build vector remain undef because we initialize @@ -21917,7 +22932,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { EVT OpVT = N->getOperand(0).getValueType(); // If the operands are legal vectors, leave them alone. - if (TLI.isTypeLegal(OpVT)) + if (TLI.isTypeLegal(OpVT) || OpVT.isScalableVector()) return SDValue(); SDLoc DL(N); @@ -22273,7 +23288,13 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If the input is a concat_vectors, just make a larger concat by padding // with smaller undefs. - if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) { + // + // Legalizing in AArch64TargetLowering::LowerCONCAT_VECTORS() and combining + // here could cause an infinite loop. That legalizing happens when LegalDAG + // is true and input of AArch64TargetLowering::LowerCONCAT_VECTORS() is + // scalable. + if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse() && + !(LegalDAG && In.getValueType().isScalableVector())) { unsigned NumOps = N->getNumOperands() * In.getNumOperands(); SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end()); Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType())); @@ -22767,10 +23788,6 @@ static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N, continue; } - // Profitability check: only deal with extractions from the first subvector. - if (OpSubvecIdx != 0) - return SDValue(); - const std::pair<SDValue, int> DemandedSubvector = std::make_pair(Op, OpSubvecIdx); @@ -22800,6 +23817,14 @@ static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N, if (DemandedSubvectors.empty()) return DAG.getUNDEF(NarrowVT); + // Profitability check: only deal with extractions from the first subvector + // unless the mask becomes an identity mask. + if (!ShuffleVectorInst::isIdentityMask(NewMask) || + any_of(NewMask, [](int M) { return M < 0; })) + for (auto &DemandedSubvector : DemandedSubvectors) + if (DemandedSubvector.second != 0) + return SDValue(); + // We still perform the exact same EXTRACT_SUBVECTOR, just on different // operand[s]/index[es], so there is no point in checking for it's legality. @@ -22975,7 +24000,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { if (NumElems == 1) { SDValue Src = V->getOperand(IdxVal); if (EltVT != Src.getValueType()) - Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); + Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Src); return DAG.getBitcast(NVT, Src); } @@ -23450,9 +24475,7 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SDValue N0 = peekThroughBitcasts(SVN->getOperand(0)); unsigned Opcode = N0.getOpcode(); - if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && - Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && - Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) + if (!ISD::isExtVecInRegOpcode(Opcode)) return SDValue(); SDValue N00 = N0.getOperand(0); @@ -23518,7 +24541,7 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf, assert((unsigned)Idx < NumElts && "Out-of-bounds shuffle indice?"); DemandedElts.setBit(Idx); } - assert(DemandedElts.countPopulation() > 1 && "Is a splat shuffle already?"); + assert(DemandedElts.popcount() > 1 && "Is a splat shuffle already?"); APInt UndefElts; if (DAG.isSplatValue(Shuf->getOperand(0), DemandedElts, UndefElts)) { // Even if all demanded elements are splat, some of them could be undef. 
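Editor's note: the relaxed profitability rule in foldExtractSubvectorFromShuffleVector above now tolerates extractions from later subvectors when the rewritten mask is a pure identity; the extra any_of rejects -1 (undef) lanes that ShuffleVectorInst::isIdentityMask would otherwise accept. The two conditions, combined in a standalone helper:

```cpp
// The new mask must be the exact identity with no undef entries.
#include <cstdio>
#include <vector>

bool isStrictIdentityMask(const std::vector<int> &Mask) {
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I)
    if (Mask[I] != I) // also rejects -1 (undef) lanes
      return false;
  return true;
}

int main() {
  std::printf("%d\n", isStrictIdentityMask({0, 1, 2, 3}));  // 1
  std::printf("%d\n", isStrictIdentityMask({0, -1, 2, 3})); // 0: undef lane
  std::printf("%d\n", isStrictIdentityMask({1, 0, 2, 3}));  // 0: permuted
}
```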
@@ -24072,8 +25095,8 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { bool IsInLaneMask = true; ArrayRef<int> Mask = SVN->getMask(); SmallVector<int, 16> ClearMask(NumElts, -1); - APInt DemandedLHS = APInt::getNullValue(NumElts); - APInt DemandedRHS = APInt::getNullValue(NumElts); + APInt DemandedLHS = APInt::getZero(NumElts); + APInt DemandedRHS = APInt::getZero(NumElts); for (int I = 0; I != (int)NumElts; ++I) { int M = Mask[I]; if (M < 0) @@ -24086,12 +25109,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { } } // TODO: Should we try to mask with N1 as well? - if (!IsInLaneMask && - (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) && - (DemandedLHS.isNullValue() || - DAG.MaskedVectorIsZero(N0, DemandedLHS)) && - (DemandedRHS.isNullValue() || - DAG.MaskedVectorIsZero(N1, DemandedRHS))) { + if (!IsInLaneMask && (!DemandedLHS.isZero() || !DemandedRHS.isZero()) && + (DemandedLHS.isZero() || DAG.MaskedVectorIsZero(N0, DemandedLHS)) && + (DemandedRHS.isZero() || DAG.MaskedVectorIsZero(N1, DemandedRHS))) { SDLoc DL(N); EVT IntVT = VT.changeVectorElementTypeToInteger(); EVT IntSVT = VT.getVectorElementType().changeTypeToInteger(); @@ -24771,6 +25791,17 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVP_FSUB(SDNode *N) { + SelectionDAG::FlagInserter FlagsInserter(DAG, N); + + // FSUB -> FMA combines: + if (SDValue Fused = visitFSUBForFMACombine<VPMatchContext>(N)) { + AddToWorklist(Fused.getNode()); + return Fused; + } + return SDValue(); +} + SDValue DAGCombiner::visitVPOp(SDNode *N) { if (N->getOpcode() == ISD::VP_GATHER) @@ -24792,8 +25823,17 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) { ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode()); // This is the only generic VP combine we support for now. - if (!AreAllEltsDisabled) + if (!AreAllEltsDisabled) { + switch (N->getOpcode()) { + case ISD::VP_FADD: + return visitVP_FADD(N); + case ISD::VP_FSUB: + return visitVP_FSUB(N); + case ISD::VP_FMA: + return visitFMA<VPMatchContext>(N); + } return SDValue(); + } // Binary operations can be replaced by UNDEF. if (ISD::isVPBinaryOp(N->getOpcode())) @@ -24814,6 +25854,97 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitGET_FPENV_MEM(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue Ptr = N->getOperand(1); + EVT MemVT = cast<FPStateAccessSDNode>(N)->getMemoryVT(); + + // Check if the memory, where FP state is written to, is used only in a single + // load operation. + LoadSDNode *LdNode = nullptr; + for (auto *U : Ptr->uses()) { + if (U == N) + continue; + if (auto *Ld = dyn_cast<LoadSDNode>(U)) { + if (LdNode && LdNode != Ld) + return SDValue(); + LdNode = Ld; + continue; + } + return SDValue(); + } + if (!LdNode || !LdNode->isSimple() || LdNode->isIndexed() || + !LdNode->getOffset().isUndef() || LdNode->getMemoryVT() != MemVT || + !LdNode->getChain().reachesChainWithoutSideEffects(SDValue(N, 0))) + return SDValue(); + + // Check if the loaded value is used only in a store operation. 
+ StoreSDNode *StNode = nullptr; + for (auto I = LdNode->use_begin(), E = LdNode->use_end(); I != E; ++I) { + SDUse &U = I.getUse(); + if (U.getResNo() == 0) { + if (auto *St = dyn_cast<StoreSDNode>(U.getUser())) { + if (StNode) + return SDValue(); + StNode = St; + } else { + return SDValue(); + } + } + } + if (!StNode || !StNode->isSimple() || StNode->isIndexed() || + !StNode->getOffset().isUndef() || StNode->getMemoryVT() != MemVT || + !StNode->getChain().reachesChainWithoutSideEffects(SDValue(LdNode, 1))) + return SDValue(); + + // Create new node GET_FPENV_MEM, which uses the store address to write FP + // environment. + SDValue Res = DAG.getGetFPEnv(Chain, SDLoc(N), StNode->getBasePtr(), MemVT, + StNode->getMemOperand()); + CombineTo(StNode, Res, false); + return Res; +} + +SDValue DAGCombiner::visitSET_FPENV_MEM(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue Ptr = N->getOperand(1); + EVT MemVT = cast<FPStateAccessSDNode>(N)->getMemoryVT(); + + // Check if the address of FP state is used also in a store operation only. + StoreSDNode *StNode = nullptr; + for (auto *U : Ptr->uses()) { + if (U == N) + continue; + if (auto *St = dyn_cast<StoreSDNode>(U)) { + if (StNode && StNode != St) + return SDValue(); + StNode = St; + continue; + } + return SDValue(); + } + if (!StNode || !StNode->isSimple() || StNode->isIndexed() || + !StNode->getOffset().isUndef() || StNode->getMemoryVT() != MemVT || + !Chain.reachesChainWithoutSideEffects(SDValue(StNode, 0))) + return SDValue(); + + // Check if the stored value is loaded from some location and the loaded + // value is used only in the store operation. + SDValue StValue = StNode->getValue(); + auto *LdNode = dyn_cast<LoadSDNode>(StValue); + if (!LdNode || !LdNode->isSimple() || LdNode->isIndexed() || + !LdNode->getOffset().isUndef() || LdNode->getMemoryVT() != MemVT || + !StNode->getChain().reachesChainWithoutSideEffects(SDValue(LdNode, 1))) + return SDValue(); + + // Create new node SET_FPENV_MEM, which uses the load address to read FP + // environment. + SDValue Res = + DAG.getSetFPEnv(LdNode->getChain(), SDLoc(N), LdNode->getBasePtr(), MemVT, + LdNode->getMemOperand()); + return Res; +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> @@ -24960,8 +26091,6 @@ SDValue DAGCombiner::SimplifyVCastOp(SDNode *N, const SDLoc &DL) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); - EVT SrcVT = N0->getValueType(0); - EVT SrcEltVT = SrcVT.getVectorElementType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // TODO: promote operation might be also good here? 
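Editor's note: both FP-environment combines (visitGET_FPENV_MEM above and visitSET_FPENV_MEM) hinge on the same scan: succeed only if a value has exactly one user and that user is the expected kind of memory node. The pattern in isolation; Use and Kind are stand-ins for SDNode use lists:

```cpp
#include <cstdio>
#include <vector>

enum Kind { Load, Store, Other };
struct Use { Kind K; };

const Use *getUniqueUserOfKind(const std::vector<Use> &Users, Kind Wanted) {
  const Use *Found = nullptr;
  for (const Use &U : Users) {
    if (U.K != Wanted)
      return nullptr; // a foreign user defeats the fold
    if (Found)
      return nullptr; // more than one user: bail out
    Found = &U;
  }
  return Found;
}

int main() {
  std::vector<Use> One{{Store}}, Two{{Store}, {Store}}, Mixed{{Store}, {Other}};
  std::printf("%d %d %d\n", getUniqueUserOfKind(One, Store) != nullptr,
              getUniqueUserOfKind(Two, Store) != nullptr,
              getUniqueUserOfKind(Mixed, Store) != nullptr); // 1 0 0
}
```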
@@ -24971,7 +26100,9 @@ SDValue DAGCombiner::SimplifyVCastOp(SDNode *N, const SDLoc &DL) { (N0.getOpcode() == ISD::SPLAT_VECTOR || TLI.isExtractVecEltCheap(VT, Index0)) && TLI.isOperationLegalOrCustom(Opcode, EltVT) && - TLI.preferScalarizeSplat(Opcode)) { + TLI.preferScalarizeSplat(N)) { + EVT SrcVT = N0.getValueType(); + EVT SrcEltVT = SrcVT.getVectorElementType(); SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC); @@ -25588,14 +26719,14 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { SDValue AndLHS = N0->getOperand(0); auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1)); - if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { + if (ConstAndRHS && ConstAndRHS->getAPIntValue().popcount() == 1) { // Shift the tested bit over the sign bit. const APInt &AndMask = ConstAndRHS->getAPIntValue(); unsigned ShCt = AndMask.getBitWidth() - 1; if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) { SDValue ShlAmt = - DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS), - getShiftAmountTy(AndLHS.getValueType())); + DAG.getConstant(AndMask.countl_zero(), SDLoc(AndLHS), + getShiftAmountTy(AndLHS.getValueType())); SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt); // Now arithmetic right shift it all the way over, so the result is @@ -25991,7 +27122,7 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, Reciprocal)) { AddToWorklist(Est.getNode()); - if (Iterations) + if (Iterations > 0) Est = UseOneConstNR ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); @@ -26334,7 +27465,7 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) { // BaseIndexOffset assumes that offsets are fixed-size, which // is not valid for scalable vectors where the offsets are // scaled by `vscale`, so bail out early. - if (St->getMemoryVT().isScalableVector()) + if (St->getMemoryVT().isScalableVT()) return false; // Add ST's interval. diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 2f2ae6e29855..f0affce7b6b8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -59,6 +59,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" @@ -95,7 +96,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -454,8 +454,7 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) { if (!TLI.isTypeLegal(VT)) { // MVT::i1 is special. Allow AND, OR, or XOR because they // don't require additional zeroing, which makes them easy. 
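Editor's note: the FastISel hunk that follows swaps the open-coded AND/OR/XOR opcode test for ISD::isBitwiseLogicOp. An equivalent standalone predicate, with the reason MVT::i1 is special-cased for exactly these opcodes:

```cpp
// For a 1-bit value, AND/OR/XOR need no extra zero-extension, unlike
// e.g. ADD, so i1 can be let through for exactly these opcodes.
#include <cstdio>

enum Opcode { Add, And, Or, Xor, Shl };

bool isBitwiseLogicOp(Opcode Op) { return Op == And || Op == Or || Op == Xor; }

int main() {
  std::printf("%d %d %d\n", isBitwiseLogicOp(And), isBitwiseLogicOp(Xor),
              isBitwiseLogicOp(Add)); // 1 1 0
}
```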
- if (VT == MVT::i1 && (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
- ISDOpcode == ISD::XOR))
+ if (VT == MVT::i1 && ISD::isBitwiseLogicOp(ISDOpcode))
 VT = TLI.getTypeToTransformTo(I->getContext(), VT);
 else
 return false;
 }
@@ -894,7 +893,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
 bool FastISel::selectXRayCustomEvent(const CallInst *I) {
 const auto &Triple = TM.getTargetTriple();
- if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
 return true; // don't do anything to this instruction.
 SmallVector<MachineOperand, 8> Ops;
 Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
@@ -913,7 +912,7 @@ bool FastISel::selectXRayTypedEvent(const CallInst *I) {
 const auto &Triple = TM.getTargetTriple();
- if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
 return true; // don't do anything to this instruction.
 SmallVector<MachineOperand, 8> Ops;
 Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
@@ -1209,6 +1208,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
 return true;
 }
+ if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
+ return true;
+
 const Value *Address = DI->getAddress();
 if (!Address || isa<UndefValue>(Address)) {
 LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
@@ -1216,13 +1218,6 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
 return true;
 }
- // Byval arguments with frame indices were already handled after argument
- // lowering and before isel.
- const auto *Arg =
- dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
- if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
- return true;
-
 std::optional<MachineOperand> Op;
 if (Register Reg = lookUpRegForValue(Address))
 Op = MachineOperand::CreateReg(Reg, false);
@@ -1277,60 +1272,85 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
 const DbgValueInst *DI = cast<DbgValueInst>(II);
 const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
 const Value *V = DI->getValue();
- assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&
+ DIExpression *Expr = DI->getExpression();
+ DILocalVariable *Var = DI->getVariable();
+ assert(Var->isValidLocationForIntrinsic(MIMD.getDL()) &&
 "Expected inlined-at fields to agree");
 if (!V || isa<UndefValue>(V) || DI->hasArgList()) {
 // DI is either undef or cannot produce a valid DBG_VALUE, so produce an
 // undef DBG_VALUE to terminate any prior location.
 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, false, 0U,
- DI->getVariable(), DI->getExpression());
- } else if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ Var, Expr);
+ return true;
+ }
+ if (const auto *CI = dyn_cast<ConstantInt>(V)) {
 // See if there's an expression to constant-fold.
- DIExpression *Expr = DI->getExpression(); if (Expr) std::tie(Expr, CI) = Expr->constantFold(CI); if (CI->getBitWidth() > 64) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II) .addCImm(CI) .addImm(0U) - .addMetadata(DI->getVariable()) + .addMetadata(Var) .addMetadata(Expr); else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II) .addImm(CI->getZExtValue()) .addImm(0U) - .addMetadata(DI->getVariable()) + .addMetadata(Var) .addMetadata(Expr); - } else if (const auto *CF = dyn_cast<ConstantFP>(V)) { + return true; + } + if (const auto *CF = dyn_cast<ConstantFP>(V)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II) .addFPImm(CF) .addImm(0U) - .addMetadata(DI->getVariable()) - .addMetadata(DI->getExpression()); - } else if (Register Reg = lookUpRegForValue(V)) { + .addMetadata(Var) + .addMetadata(Expr); + return true; + } + if (const auto *Arg = dyn_cast<Argument>(V); + Arg && Expr && Expr->isEntryValue()) { + // As per the Verifier, this case is only valid for swift async Args. + assert(Arg->hasAttribute(Attribute::AttrKind::SwiftAsync)); + + Register Reg = getRegForValue(Arg); + for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins()) + if (Reg == VirtReg || Reg == PhysReg) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, + false /*IsIndirect*/, PhysReg, Var, Expr); + return true; + } + + LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but " + "couldn't find a physical register\n" + << *DI << "\n"); + return true; + } + if (Register Reg = lookUpRegForValue(V)) { // FIXME: This does not handle register-indirect values at offset 0. if (!FuncInfo.MF->useDebugInstrRef()) { bool IsIndirect = false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, IsIndirect, - Reg, DI->getVariable(), DI->getExpression()); - } else { - // If using instruction referencing, produce this as a DBG_INSTR_REF, - // to be later patched up by finalizeDebugInstrRefs. - SmallVector<MachineOperand, 1> MOs({MachineOperand::CreateReg( - /* Reg */ Reg, /* isDef */ false, /* isImp */ false, - /* isKill */ false, /* isDead */ false, - /* isUndef */ false, /* isEarlyClobber */ false, - /* SubReg */ 0, /* isDebug */ true)}); - SmallVector<uint64_t, 2> Ops({dwarf::DW_OP_LLVM_arg, 0}); - auto *NewExpr = DIExpression::prependOpcodes(DI->getExpression(), Ops); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), - TII.get(TargetOpcode::DBG_INSTR_REF), /*IsIndirect*/ false, MOs, - DI->getVariable(), NewExpr); + Reg, Var, Expr); + return true; } - } else { - // We don't know how to handle other cases, so we drop. - LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); + // If using instruction referencing, produce this as a DBG_INSTR_REF, + // to be later patched up by finalizeDebugInstrRefs. + SmallVector<MachineOperand, 1> MOs({MachineOperand::CreateReg( + /* Reg */ Reg, /* isDef */ false, /* isImp */ false, + /* isKill */ false, /* isDead */ false, + /* isUndef */ false, /* isEarlyClobber */ false, + /* SubReg */ 0, /* isDebug */ true)}); + SmallVector<uint64_t, 2> Ops({dwarf::DW_OP_LLVM_arg, 0}); + auto *NewExpr = DIExpression::prependOpcodes(Expr, Ops); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), + TII.get(TargetOpcode::DBG_INSTR_REF), /*IsIndirect*/ false, MOs, + Var, NewExpr); + return true; } + // We don't know how to handle other cases, so we drop. 
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n"); return true; } case Intrinsic::dbg_label: { diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index c18cd39ed296..1d0a03ccfcdc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -13,7 +13,7 @@ #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/ADT/APInt.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -83,7 +83,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - DA = DAG->getDivergenceAnalysis(); + UA = DAG->getUniformityInfo(); // Check whether the function can return without sret-demotion. SmallVector<ISD::OutputArg, 4> Outs; @@ -128,20 +128,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { Type *Ty = AI->getAllocatedType(); - Align TyPrefAlign = MF->getDataLayout().getPrefTypeAlign(Ty); - // The "specified" alignment is the alignment written on the alloca, - // or the preferred alignment of the type if none is specified. - // - // (Unspecified alignment on allocas will be going away soon.) - Align SpecifiedAlign = AI->getAlign(); - - // If the preferred alignment of the type is higher than the specified - // alignment of the alloca, promote the alignment, as long as it doesn't - // require realigning the stack. - // - // FIXME: Do we really want to second-guess the IR in isel? - Align Alignment = - std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign); + Align Alignment = AI->getAlign(); // Static allocas can be folded into the initial stack frame // adjustment. For targets that don't realign the stack, don't @@ -165,9 +152,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, false, AI); } - // Scalable vectors may need a special StackID to distinguish - // them from other (fixed size) stack objects. - if (isa<ScalableVectorType>(Ty)) + // Scalable vectors and structures that contain scalable vectors may + // need a special StackID to distinguish them from other (fixed size) + // stack objects. 
+ if (Ty->isScalableTy()) MF->getFrameInfo().setStackID(FrameIndex, TFI->getStackIDForScalableVectors()); @@ -305,18 +293,18 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { for (WinEHHandlerType &H : TBME.HandlerArray) { if (H.Handler) - H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()]; + H.Handler = MBBMap[cast<const BasicBlock *>(H.Handler)]; } } for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap) if (UME.Cleanup) - UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()]; + UME.Cleanup = MBBMap[cast<const BasicBlock *>(UME.Cleanup)]; for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) { - const auto *BB = UME.Handler.get<const BasicBlock *>(); + const auto *BB = cast<const BasicBlock *>(UME.Handler); UME.Handler = MBBMap[BB]; } for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) { - const auto *BB = CME.Handler.get<const BasicBlock *>(); + const auto *BB = cast<const BasicBlock *>(CME.Handler); CME.Handler = MBBMap[BB]; } } else if (Personality == EHPersonality::Wasm_CXX) { @@ -326,18 +314,18 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Map all BB references in the Wasm EH data to MBBs. DenseMap<BBOrMBB, BBOrMBB> SrcToUnwindDest; for (auto &KV : EHInfo.SrcToUnwindDest) { - const auto *Src = KV.first.get<const BasicBlock *>(); - const auto *Dest = KV.second.get<const BasicBlock *>(); + const auto *Src = cast<const BasicBlock *>(KV.first); + const auto *Dest = cast<const BasicBlock *>(KV.second); SrcToUnwindDest[MBBMap[Src]] = MBBMap[Dest]; } EHInfo.SrcToUnwindDest = std::move(SrcToUnwindDest); DenseMap<BBOrMBB, SmallPtrSet<BBOrMBB, 4>> UnwindDestToSrcs; for (auto &KV : EHInfo.UnwindDestToSrcs) { - const auto *Dest = KV.first.get<const BasicBlock *>(); + const auto *Dest = cast<const BasicBlock *>(KV.first); UnwindDestToSrcs[MBBMap[Dest]] = SmallPtrSet<BBOrMBB, 4>(); for (const auto P : KV.second) UnwindDestToSrcs[MBBMap[Dest]].insert( - MBBMap[P.get<const BasicBlock *>()]); + MBBMap[cast<const BasicBlock *>(P)]); } EHInfo.UnwindDestToSrcs = std::move(UnwindDestToSrcs); } @@ -361,6 +349,7 @@ void FunctionLoweringInfo::clear() { StatepointStackSlots.clear(); StatepointRelocationMaps.clear(); PreferredExtendType.clear(); + PreprocessedDbgDeclares.clear(); } /// CreateReg - Allocate a single virtual register for the given type. 
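The two FunctionLoweringInfo hunks above replace the member-style PointerUnion::get<const BasicBlock *>() accessor with the free-function llvm::cast<> form that LLVM's casting machinery now supports for PointerUnion. A rough, self-contained sketch of that pattern (the stub types and map below are illustrative stand-ins, not code from this patch):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/Support/Casting.h"

struct alignas(8) BB {};   // stand-in for const BasicBlock
struct alignas(8) MBB {};  // stand-in for MachineBasicBlock
using HandlerT = llvm::PointerUnion<BB *, MBB *>;

// Mirrors UME.Handler = MBBMap[cast<const BasicBlock *>(UME.Handler)];
// cast<> asserts the union currently holds the BB* alternative, exactly
// like the old Handler.get<BB *>() spelling did.
void remap(HandlerT &Handler, llvm::DenseMap<BB *, MBB *> &MBBMap) {
  Handler = MBBMap[llvm::cast<BB *>(Handler)];
}

isa<> and dyn_cast<> work on a PointerUnion the same way, which is what makes the mechanical .get<T>() to cast<T>(...) rewrite in these hunks possible.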
@@ -394,8 +383,8 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) { } Register FunctionLoweringInfo::CreateRegs(const Value *V) { - return CreateRegs(V->getType(), DA && DA->isDivergent(V) && - !TLI->requiresUniformRegister(*MF, V)); + return CreateRegs(V->getType(), UA && UA->isDivergent(V) && + !TLI->requiresUniformRegister(*MF, V)); } /// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the @@ -517,7 +506,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) { return; } DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits); - DestLOI.Known = KnownBits::commonBits(DestLOI.Known, SrcLOI->Known); + DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 338172e4e10a..4e7895c0b3cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -1078,6 +1078,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, if (Flags.hasNoFPExcept()) MI->setFlag(MachineInstr::MIFlag::NoFPExcept); + + if (Flags.hasUnpredictable()) + MI->setFlag(MachineInstr::MIFlag::Unpredictable); } // Emit all of the actual operands of this instruction, adding them to the @@ -1161,6 +1164,13 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, } } + // Add rounding control registers as implicit def for function call. + if (II.isCall() && MF->getFunction().hasFnAttribute(Attribute::StrictFP)) { + ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters(); + for (MCPhysReg Reg : RCRegs) + UsedRegs.push_back(Reg); + } + // Finally mark unused registers as dead. if (!UsedRegs.empty() || !II.implicit_defs().empty() || II.hasOptionalDef()) MIB->setPhysRegsDeadExcept(UsedRegs, *TRI); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index c3106216a060..61fc31715d71 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -41,7 +42,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -133,8 +133,11 @@ private: SDValue N1, SDValue N2, ArrayRef<int> Mask) const; - SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); + std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, + TargetLowering::ArgListTy &&Args, bool isSigned); + std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); + void ExpandFrexpLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall LC, SmallVectorImpl<SDValue> &Results); void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, @@ -172,6 +175,9 @@ private: SDValue ExpandFCOPYSIGN(SDNode *Node) const; SDValue ExpandFABS(SDNode *Node) const; SDValue ExpandFNEG(SDNode *Node) const; + SDValue expandLdexp(SDNode *Node) const; + SDValue expandFrexp(SDNode *Node) const; 
+ SDValue ExpandLegalINT_TO_FP(SDNode *Node, SDValue &Chain); void PromoteLegalINT_TO_FP(SDNode *N, const SDLoc &dl, SmallVectorImpl<SDValue> &Results); @@ -880,8 +886,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // If the source type is not legal, see if there is a legal extload to // an intermediate type that we can then extend further. EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); - if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT? - TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) { + if ((LoadVT.isFloatingPoint() == SrcVT.isFloatingPoint()) && + (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT? + TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT))) { // If we are loading a legal type, this is a non-extload followed by a // full extend. ISD::LoadExtType MidExtType = @@ -999,6 +1006,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Action != TargetLowering::Promote) Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other); break; + case ISD::SET_FPENV: + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getOperand(1).getValueType()); + break; case ISD::FP_TO_FP16: case ISD::FP_TO_BF16: case ISD::SINT_TO_FP: @@ -1199,6 +1210,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: case ISD::IS_FPCLASS: Action = TLI.getOperationAction( Node->getOpcode(), Node->getOperand(0).getValueType()); @@ -1546,7 +1559,7 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State, auto &DataLayout = DAG.getDataLayout(); // Store the float to memory, then load the sign part out as an integer. - MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8); + MVT LoadTy = TLI.getRegisterType(MVT::i8); // First create a temporary that is aligned for both the load and store. SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy); int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); @@ -2015,23 +2028,14 @@ SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) { return DAG.getSplatBuildVector(VT, DL, SplatVal); } -// Expand a node into a call to a libcall. If the result value -// does not fit into a register, return the lo part and set the hi part to the -// by-reg argument. If it does fit into a single register, return the result -// and leave the Hi part unset. -SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, +// Expand a node into a call to a libcall, returning the value as the first +// result and the chain as the second. If the result value does not fit into a +// register, return the lo part and set the hi part to the by-reg argument in +// the first. If it does fit into a single register, return the result and +// leave the Hi part unset. 
+std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, + TargetLowering::ArgListTy &&Args, bool isSigned) { - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - for (const SDValue &Op : Node->op_values()) { - EVT ArgVT = Op.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op; - Entry.Ty = ArgTy; - Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); - Entry.IsZExt = !TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); - Args.push_back(Entry); - } SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); @@ -2070,11 +2074,69 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, if (!CallInfo.second.getNode()) { LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump(&DAG)); // It's a tailcall, return the chain (which is the DAG root). - return DAG.getRoot(); + return {DAG.getRoot(), DAG.getRoot()}; } LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump(&DAG)); - return CallInfo.first; + return CallInfo; +} + +std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, + bool isSigned) { + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (const SDValue &Op : Node->op_values()) { + EVT ArgVT = Op.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Op; + Entry.Ty = ArgTy; + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned); + Entry.IsZExt = !Entry.IsSExt; + Args.push_back(Entry); + } + + return ExpandLibCall(LC, Node, std::move(Args), isSigned); +} + +void SelectionDAGLegalize::ExpandFrexpLibCall( + SDNode *Node, SmallVectorImpl<SDValue> &Results) { + SDLoc dl(Node); + EVT VT = Node->getValueType(0); + EVT ExpVT = Node->getValueType(1); + + SDValue FPOp = Node->getOperand(0); + + EVT ArgVT = FPOp.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + TargetLowering::ArgListEntry FPArgEntry; + FPArgEntry.Node = FPOp; + FPArgEntry.Ty = ArgTy; + + SDValue StackSlot = DAG.CreateStackTemporary(ExpVT); + TargetLowering::ArgListEntry PtrArgEntry; + PtrArgEntry.Node = StackSlot; + PtrArgEntry.Ty = PointerType::get(*DAG.getContext(), + DAG.getDataLayout().getAllocaAddrSpace()); + + TargetLowering::ArgListTy Args = {FPArgEntry, PtrArgEntry}; + + RTLIB::Libcall LC = RTLIB::getFREXP(VT); + auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false); + + // FIXME: Get type of int for libcall declaration and cast + + int FrameIdx = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + auto PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + + SDValue LoadExp = DAG.getLoad(ExpVT, dl, Chain, StackSlot, PtrInfo); + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + LoadExp.getValue(1), DAG.getRoot()); + DAG.setRoot(OutputChain); + + Results.push_back(Call); + Results.push_back(LoadExp); } void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, @@ -2095,7 +2157,7 @@ void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, Results.push_back(Tmp.first); Results.push_back(Tmp.second); } else { - SDValue Tmp = ExpandLibCall(LC, Node, false); + SDValue Tmp = ExpandLibCall(LC, Node, false).first; Results.push_back(Tmp); } } @@ -2129,7 +2191,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned, case MVT::i64: LC = Call_I64; break; case MVT::i128: LC = Call_I128; break; } - return ExpandLibCall(LC, Node, 
isSigned); + return ExpandLibCall(LC, Node, isSigned).first; } /// Expand the node to a libcall based on first argument type (for instance @@ -2309,6 +2371,237 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo())); } +SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { + SDLoc dl(Node); + EVT VT = Node->getValueType(0); + SDValue X = Node->getOperand(0); + SDValue N = Node->getOperand(1); + EVT ExpVT = N.getValueType(); + EVT AsIntVT = VT.changeTypeToInteger(); + if (AsIntVT == EVT()) // TODO: How to handle f80? + return SDValue(); + + if (Node->getOpcode() == ISD::STRICT_FLDEXP) // TODO + return SDValue(); + + SDNodeFlags NSW; + NSW.setNoSignedWrap(true); + SDNodeFlags NUW_NSW; + NUW_NSW.setNoUnsignedWrap(true); + NUW_NSW.setNoSignedWrap(true); + + EVT SetCCVT = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ExpVT); + const fltSemantics &FltSem = SelectionDAG::EVTToAPFloatSemantics(VT); + + const APFloat::ExponentType MaxExpVal = APFloat::semanticsMaxExponent(FltSem); + const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem); + const int Precision = APFloat::semanticsPrecision(FltSem); + + const SDValue MaxExp = DAG.getConstant(MaxExpVal, dl, ExpVT); + const SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT); + + const SDValue DoubleMaxExp = DAG.getConstant(2 * MaxExpVal, dl, ExpVT); + + const APFloat One(FltSem, "1.0"); + APFloat ScaleUpK = scalbn(One, MaxExpVal, APFloat::rmNearestTiesToEven); + + // Offset by precision to avoid denormal range. + APFloat ScaleDownK = + scalbn(One, MinExpVal + Precision, APFloat::rmNearestTiesToEven); + + // TODO: Should really introduce control flow and use a block for the > + // MaxExp, < MinExp cases + + // First, handle exponents Exp > MaxExp and scale down. 
+ SDValue NGtMaxExp = DAG.getSetCC(dl, SetCCVT, N, MaxExp, ISD::SETGT); + + SDValue DecN0 = DAG.getNode(ISD::SUB, dl, ExpVT, N, MaxExp, NSW); + SDValue ClampMaxVal = DAG.getConstant(3 * MaxExpVal, dl, ExpVT); + SDValue ClampN_Big = DAG.getNode(ISD::SMIN, dl, ExpVT, N, ClampMaxVal); + SDValue DecN1 = + DAG.getNode(ISD::SUB, dl, ExpVT, ClampN_Big, DoubleMaxExp, NSW); + + SDValue ScaleUpTwice = + DAG.getSetCC(dl, SetCCVT, N, DoubleMaxExp, ISD::SETUGT); + + const SDValue ScaleUpVal = DAG.getConstantFP(ScaleUpK, dl, VT); + SDValue ScaleUp0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleUpVal); + SDValue ScaleUp1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleUp0, ScaleUpVal); + + SDValue SelectN_Big = + DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleUpTwice, DecN1, DecN0); + SDValue SelectX_Big = + DAG.getNode(ISD::SELECT, dl, VT, ScaleUpTwice, ScaleUp1, ScaleUp0); + + // Now handle exponents Exp < MinExp + SDValue NLtMinExp = DAG.getSetCC(dl, SetCCVT, N, MinExp, ISD::SETLT); + + SDValue Increment0 = DAG.getConstant(-(MinExpVal + Precision), dl, ExpVT); + SDValue Increment1 = DAG.getConstant(-2 * (MinExpVal + Precision), dl, ExpVT); + + SDValue IncN0 = DAG.getNode(ISD::ADD, dl, ExpVT, N, Increment0, NUW_NSW); + + SDValue ClampMinVal = + DAG.getConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT); + SDValue ClampN_Small = DAG.getNode(ISD::SMAX, dl, ExpVT, N, ClampMinVal); + SDValue IncN1 = + DAG.getNode(ISD::ADD, dl, ExpVT, ClampN_Small, Increment1, NSW); + + const SDValue ScaleDownVal = DAG.getConstantFP(ScaleDownK, dl, VT); + SDValue ScaleDown0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleDownVal); + SDValue ScaleDown1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleDown0, ScaleDownVal); + + SDValue ScaleDownTwice = DAG.getSetCC( + dl, SetCCVT, N, DAG.getConstant(2 * MinExpVal + Precision, dl, ExpVT), + ISD::SETULT); + + SDValue SelectN_Small = + DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleDownTwice, IncN1, IncN0); + SDValue SelectX_Small = + DAG.getNode(ISD::SELECT, dl, VT, ScaleDownTwice, ScaleDown1, ScaleDown0); + + // Now combine the two out of range exponent handling cases with the base + // case. + SDValue NewX = DAG.getNode( + ISD::SELECT, dl, VT, NGtMaxExp, SelectX_Big, + DAG.getNode(ISD::SELECT, dl, VT, NLtMinExp, SelectX_Small, X)); + + SDValue NewN = DAG.getNode( + ISD::SELECT, dl, ExpVT, NGtMaxExp, SelectN_Big, + DAG.getNode(ISD::SELECT, dl, ExpVT, NLtMinExp, SelectN_Small, N)); + + SDValue BiasedN = DAG.getNode(ISD::ADD, dl, ExpVT, NewN, MaxExp, NSW); + + SDValue ExponentShiftAmt = + DAG.getShiftAmountConstant(Precision - 1, ExpVT, dl); + SDValue CastExpToValTy = DAG.getZExtOrTrunc(BiasedN, dl, AsIntVT); + + SDValue AsInt = DAG.getNode(ISD::SHL, dl, AsIntVT, CastExpToValTy, + ExponentShiftAmt, NUW_NSW); + SDValue AsFP = DAG.getNode(ISD::BITCAST, dl, VT, AsInt); + return DAG.getNode(ISD::FMUL, dl, VT, NewX, AsFP); +} + +SDValue SelectionDAGLegalize::expandFrexp(SDNode *Node) const { + SDLoc dl(Node); + SDValue Val = Node->getOperand(0); + EVT VT = Val.getValueType(); + EVT ExpVT = Node->getValueType(1); + EVT AsIntVT = VT.changeTypeToInteger(); + if (AsIntVT == EVT()) // TODO: How to handle f80? + return SDValue(); + + const fltSemantics &FltSem = SelectionDAG::EVTToAPFloatSemantics(VT); + const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem); + const unsigned Precision = APFloat::semanticsPrecision(FltSem); + const unsigned BitSize = VT.getScalarSizeInBits(); + + // TODO: Could introduce control flow and skip over the denormal handling. 
+ + // scale_up = fmul value, scalbn(1.0, precision + 1) + // extracted_exp = (bitcast value to uint) >> precision - 1 + // biased_exp = extracted_exp + min_exp + // extracted_fract = (bitcast value to uint) & (fract_mask | sign_mask) + // + // is_denormal = val < smallest_normalized + // computed_fract = is_denormal ? scale_up : extracted_fract + // computed_exp = is_denormal ? biased_exp + (-precision - 1) : biased_exp + // + // result_0 = (!isfinite(val) || iszero(val)) ? val : computed_fract + // result_1 = (!isfinite(val) || iszero(val)) ? 0 : computed_exp + + SDValue NegSmallestNormalizedInt = DAG.getConstant( + APFloat::getSmallestNormalized(FltSem, true).bitcastToAPInt(), dl, + AsIntVT); + + SDValue SmallestNormalizedInt = DAG.getConstant( + APFloat::getSmallestNormalized(FltSem, false).bitcastToAPInt(), dl, + AsIntVT); + + // Masks out the exponent bits. + SDValue ExpMask = + DAG.getConstant(APFloat::getInf(FltSem).bitcastToAPInt(), dl, AsIntVT); + + // Mask out the exponent part of the value. + // + // e.g, for f32 FractSignMaskVal = 0x807fffff + APInt FractSignMaskVal = APInt::getBitsSet(BitSize, 0, Precision - 1); + FractSignMaskVal.setBit(BitSize - 1); // Set the sign bit + + APInt SignMaskVal = APInt::getSignedMaxValue(BitSize); + SDValue SignMask = DAG.getConstant(SignMaskVal, dl, AsIntVT); + + SDValue FractSignMask = DAG.getConstant(FractSignMaskVal, dl, AsIntVT); + + const APFloat One(FltSem, "1.0"); + // Scale a possible denormal input. + // e.g., for f64, 0x1p+54 + APFloat ScaleUpKVal = + scalbn(One, Precision + 1, APFloat::rmNearestTiesToEven); + + SDValue ScaleUpK = DAG.getConstantFP(ScaleUpKVal, dl, VT); + SDValue ScaleUp = DAG.getNode(ISD::FMUL, dl, VT, Val, ScaleUpK); + + EVT SetCCVT = + TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + SDValue AsInt = DAG.getNode(ISD::BITCAST, dl, AsIntVT, Val); + + SDValue Abs = DAG.getNode(ISD::AND, dl, AsIntVT, AsInt, SignMask); + + SDValue AddNegSmallestNormal = + DAG.getNode(ISD::ADD, dl, AsIntVT, Abs, NegSmallestNormalizedInt); + SDValue DenormOrZero = DAG.getSetCC(dl, SetCCVT, AddNegSmallestNormal, + NegSmallestNormalizedInt, ISD::SETULE); + + SDValue IsDenormal = + DAG.getSetCC(dl, SetCCVT, Abs, SmallestNormalizedInt, ISD::SETULT); + + SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT); + SDValue Zero = DAG.getConstant(0, dl, ExpVT); + + SDValue ScaledAsInt = DAG.getNode(ISD::BITCAST, dl, AsIntVT, ScaleUp); + SDValue ScaledSelect = + DAG.getNode(ISD::SELECT, dl, AsIntVT, IsDenormal, ScaledAsInt, AsInt); + + SDValue ExpMaskScaled = + DAG.getNode(ISD::AND, dl, AsIntVT, ScaledAsInt, ExpMask); + + SDValue ScaledValue = + DAG.getNode(ISD::SELECT, dl, AsIntVT, IsDenormal, ExpMaskScaled, Abs); + + // Extract the exponent bits. 
+ SDValue ExponentShiftAmt = + DAG.getShiftAmountConstant(Precision - 1, AsIntVT, dl); + SDValue ShiftedExp = + DAG.getNode(ISD::SRL, dl, AsIntVT, ScaledValue, ExponentShiftAmt); + SDValue Exp = DAG.getSExtOrTrunc(ShiftedExp, dl, ExpVT); + + SDValue NormalBiasedExp = DAG.getNode(ISD::ADD, dl, ExpVT, Exp, MinExp); + SDValue DenormalOffset = DAG.getConstant(-Precision - 1, dl, ExpVT); + SDValue DenormalExpBias = + DAG.getNode(ISD::SELECT, dl, ExpVT, IsDenormal, DenormalOffset, Zero); + + SDValue MaskedFractAsInt = + DAG.getNode(ISD::AND, dl, AsIntVT, ScaledSelect, FractSignMask); + const APFloat Half(FltSem, "0.5"); + SDValue FPHalf = DAG.getConstant(Half.bitcastToAPInt(), dl, AsIntVT); + SDValue Or = DAG.getNode(ISD::OR, dl, AsIntVT, MaskedFractAsInt, FPHalf); + SDValue MaskedFract = DAG.getNode(ISD::BITCAST, dl, VT, Or); + + SDValue ComputedExp = + DAG.getNode(ISD::ADD, dl, ExpVT, NormalBiasedExp, DenormalExpBias); + + SDValue Result0 = + DAG.getNode(ISD::SELECT, dl, VT, DenormOrZero, Val, MaskedFract); + + SDValue Result1 = + DAG.getNode(ISD::SELECT, dl, ExpVT, DenormOrZero, Zero, ComputedExp); + + return DAG.getMergeValues({Result0, Result1}, dl); +} + /// This function is responsible for legalizing a /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are @@ -2365,10 +2658,10 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node, SDValue Load = DAG.getLoad(MVT::f64, dl, MemChain, StackSlot, MachinePointerInfo()); // FP constant to bias correct the final result - SDValue Bias = DAG.getConstantFP(isSigned ? - BitsToDouble(0x4330000080000000ULL) : - BitsToDouble(0x4330000000000000ULL), - dl, MVT::f64); + SDValue Bias = DAG.getConstantFP( + isSigned ? llvm::bit_cast<double>(0x4330000080000000ULL) + : llvm::bit_cast<double>(0x4330000000000000ULL), + dl, MVT::f64); // Subtract the bias and get the final result. SDValue Sub; SDValue Result; @@ -2696,6 +2989,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { if ((Tmp1 = TLI.expandABS(Node, DAG))) Results.push_back(Tmp1); break; + case ISD::ABDS: + case ISD::ABDU: + if ((Tmp1 = TLI.expandABD(Node, DAG))) + Results.push_back(Tmp1); + break; case ISD::CTPOP: if ((Tmp1 = TLI.expandCTPOP(Node, DAG))) Results.push_back(Tmp1); @@ -3241,6 +3539,36 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } break; } + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: { + EVT VT = Node->getValueType(0); + RTLIB::Libcall LC = RTLIB::getLDEXP(VT); + // Use the LibCall instead, it is very likely faster + // FIXME: Use separate LibCall action. + if (TLI.getLibcallName(LC)) + break; + + if (SDValue Expanded = expandLdexp(Node)) { + Results.push_back(Expanded); + if (Node->getOpcode() == ISD::STRICT_FLDEXP) + Results.push_back(Expanded.getValue(1)); + } + + break; + } + case ISD::FFREXP: { + RTLIB::Libcall LC = RTLIB::getFREXP(Node->getValueType(0)); + // Use the LibCall instead, it is very likely faster + // FIXME: Use separate LibCall action. + if (TLI.getLibcallName(LC)) + break; + + if (SDValue Expanded = expandFrexp(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + } + break; + } case ISD::FMAD: llvm_unreachable("Illegal fmad should never be formed"); @@ -3477,13 +3805,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { // if we were allowed to generate libcalls to division functions of illegal // type. But we cannot do that. 
llvm_unreachable("Cannot expand DIVFIX!"); - case ISD::ADDCARRY: - case ISD::SUBCARRY: { + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: { SDValue LHS = Node->getOperand(0); SDValue RHS = Node->getOperand(1); SDValue Carry = Node->getOperand(2); - bool IsAdd = Node->getOpcode() == ISD::ADDCARRY; + bool IsAdd = Node->getOpcode() == ISD::UADDO_CARRY; // Initial add of the 2 operands. unsigned Op = IsAdd ? ISD::ADD : ISD::SUB; @@ -3628,9 +3956,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } else { // We test only the i1 bit. Skip the AND if UNDEF or another AND. if (Tmp2.isUndef() || - (Tmp2.getOpcode() == ISD::AND && - isa<ConstantSDNode>(Tmp2.getOperand(1)) && - cast<ConstantSDNode>(Tmp2.getOperand(1))->getZExtValue() == 1)) + (Tmp2.getOpcode() == ISD::AND && isOneConstant(Tmp2.getOperand(1)))) Tmp3 = Tmp2; else Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2, @@ -3864,6 +4190,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Results.push_back(TLI.expandVecReduce(Node, DAG)); break; case ISD::GLOBAL_OFFSET_TABLE: @@ -4029,6 +4357,9 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMIN_F80, RTLIB::FMIN_F128, RTLIB::FMIN_PPCF128, Results); break; + // FIXME: We do not have libcalls for FMAXIMUM and FMINIMUM. So, we cannot use + // libcall legalization for these nodes, but there is no default expansion for + // these nodes either (see PR63267 for example). case ISD::FMAXNUM: case ISD::STRICT_FMAXNUM: ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64, @@ -4135,6 +4466,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::ROUNDEVEN_F128, RTLIB::ROUNDEVEN_PPCF128, Results); break; + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: + ExpandFPLibCall(Node, RTLIB::LDEXP_F32, RTLIB::LDEXP_F64, RTLIB::LDEXP_F80, + RTLIB::LDEXP_F128, RTLIB::LDEXP_PPCF128, Results); + break; + case ISD::FFREXP: { + ExpandFrexpLibCall(Node, Results); + break; + } case ISD::FPOWI: case ISD::STRICT_FPOWI: { RTLIB::Libcall LC = RTLIB::getPOWI(Node->getSimpleValueType(0)); @@ -4241,7 +4581,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { break; case ISD::FP16_TO_FP: if (Node->getValueType(0) == MVT::f32) { - Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false)); + Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false).first); } break; case ISD::STRICT_FP16_TO_FP: { @@ -4259,14 +4599,14 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::Libcall LC = RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16"); - Results.push_back(ExpandLibCall(LC, Node, false)); + Results.push_back(ExpandLibCall(LC, Node, false).first); break; } case ISD::FP_TO_BF16: { RTLIB::Libcall LC = RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::bf16); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_bf16"); - Results.push_back(ExpandLibCall(LC, Node, false)); + Results.push_back(ExpandLibCall(LC, Node, false).first); break; } case ISD::STRICT_SINT_TO_FP: @@ -4381,7 +4721,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { Results.push_back( ExpandLibCall(RTLIB::getFPEXT(Node->getOperand(0).getValueType(), Node->getValueType(0)), - Node, false)); + Node, false).first); break; } case ISD::STRICT_FP_EXTEND: @@ -4447,16 +4787,39 @@ void 
SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { default: llvm_unreachable("LibCall explicitly requested, but not available"); case MVT::i32: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false)); + Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false).first); break; case MVT::i64: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false)); + Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false).first); break; case MVT::i128: - Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false)); + Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false).first); break; } break; + case ISD::RESET_FPENV: { + // It is legalized to call 'fesetenv(FE_DFL_ENV)'. On most targets + // FE_DFL_ENV is defined as '((const fenv_t *) -1)' in glibc. + SDValue Ptr = DAG.getIntPtrConstant(-1LL, dl); + SDValue Chain = Node->getOperand(0); + Results.push_back( + DAG.makeStateFunctionCall(RTLIB::FESETENV, Ptr, Chain, dl)); + break; + } + case ISD::GET_FPENV_MEM: { + SDValue Chain = Node->getOperand(0); + SDValue EnvPtr = Node->getOperand(1); + Results.push_back( + DAG.makeStateFunctionCall(RTLIB::FEGETENV, EnvPtr, Chain, dl)); + break; + } + case ISD::SET_FPENV_MEM: { + SDValue Chain = Node->getOperand(0); + SDValue EnvPtr = Node->getOperand(1); + Results.push_back( + DAG.makeStateFunctionCall(RTLIB::FESETENV, EnvPtr, Chain, dl)); + break; + } } // Replace the original node with the legalized result. @@ -4785,6 +5148,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FREM: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: case ISD::FPOW: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); @@ -4841,6 +5206,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(Tmp4.getValue(1)); break; case ISD::FCOPYSIGN: + case ISD::FLDEXP: case ISD::FPOWI: { Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = Node->getOperand(1); @@ -4867,6 +5233,17 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { Results.push_back(Tmp3); Results.push_back(Tmp3.getValue(1)); break; + case ISD::FFREXP: { + Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); + Tmp2 = DAG.getNode(ISD::FFREXP, dl, {NVT, Node->getValueType(1)}, Tmp1); + + Results.push_back( + DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true))); + + Results.push_back(Tmp2.getValue(1)); + break; + } case ISD::FFLOOR: case ISD::FCEIL: case ISD::FRINT: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index f1e80ce7e037..7e035d21ef71 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -59,7 +59,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { dbgs() << "SoftenFloatResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to soften the result of this operator!"); + report_fatal_error("Do not know how to soften the result of this " + "operator!"); case ISD::ARITH_FENCE: R = SoftenFloatRes_ARITH_FENCE(N); break; case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break; @@ -107,10 +108,16 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = 
SoftenFloatRes_FP16_TO_FP(N); break; + case ISD::BF16_TO_FP: R = SoftenFloatRes_BF16_TO_FP(N); break; case ISD::STRICT_FPOW: case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break; case ISD::STRICT_FPOWI: - case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break; + case ISD::FPOWI: + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break; + case ISD::FFREXP: + R = SoftenFloatRes_FFREXP(N); + break; case ISD::STRICT_FREM: case ISD::FREM: R = SoftenFloatRes_FREM(N); break; case ISD::STRICT_FRINT: @@ -142,6 +149,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: R = SoftenFloatRes_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: @@ -510,10 +519,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { return BitConvertToInteger(Op); } - // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's - // entirely possible for both f16 and f32 to be legal, so use the fully - // hard-float FP_EXTEND rather than FP16_TO_FP. - if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { + // There's only a libcall for f16 -> f32 and shifting is only valid for bf16 + // -> f32, so proceed in two stages. Also, it's entirely possible for both + // f16 and f32 to be legal, so use the fully hard-float FP_EXTEND rather + // than FP16_TO_FP. + if ((Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) && + N->getValueType(0) != MVT::f32) { if (IsStrict) { Op = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), { MVT::f32, MVT::Other }, { Chain, Op }); @@ -523,6 +534,9 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } } + if (Op.getValueType() == MVT::bf16) + return SoftenFloatRes_BF16_TO_FP(N); + RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); TargetLowering::MakeLibCallOptions CallOptions; @@ -555,6 +569,21 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { return TLI.makeLibCall(DAG, LC, NVT, Res32, CallOptions, SDLoc(N)).first; } +// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special +// nodes? +SDValue DAGTypeLegalizer::SoftenFloatRes_BF16_TO_FP(SDNode *N) { + assert(N->getValueType(0) == MVT::f32 && + "Can only soften BF16_TO_FP with f32 result"); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); + SDValue Op = N->getOperand(0); + SDLoc DL(N); + Op = DAG.getNode(ISD::ANY_EXTEND, DL, NVT, + DAG.getNode(ISD::BITCAST, DL, MVT::i16, Op)); + SDValue Res = DAG.getNode(ISD::SHL, DL, NVT, Op, + DAG.getShiftAmountConstant(16, NVT, DL)); + return Res; +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { bool IsStrict = N->isStrictFPOpcode(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); @@ -582,13 +611,17 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { RTLIB::POW_PPCF128)); } -SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) { bool IsStrict = N->isStrictFPOpcode(); unsigned Offset = IsStrict ? 
1 : 0; assert((N->getOperand(1 + Offset).getValueType() == MVT::i16 || N->getOperand(1 + Offset).getValueType() == MVT::i32) && "Unsupported power type!"); - RTLIB::Libcall LC = RTLIB::getPOWI(N->getValueType(0)); + bool IsPowI = + N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI; + + RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0)) + : RTLIB::getLDEXP(N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi."); if (!TLI.getLibcallName(LC)) { // Some targets don't have a powi libcall; use pow instead. @@ -621,6 +654,45 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { return Tmp.first; } +SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) { + assert(!N->isStrictFPOpcode() && "strictfp not implemented for frexp"); + EVT VT0 = N->getValueType(0); + EVT VT1 = N->getValueType(1); + RTLIB::Libcall LC = RTLIB::getFREXP(VT0); + + if (DAG.getLibInfo().getIntSize() != VT1.getSizeInBits()) { + // If the exponent does not match with sizeof(int) a libcall would use the + // wrong type for the argument. + // TODO: Should be able to handle mismatches. + DAG.getContext()->emitError("ffrexp exponent does not match sizeof(int)"); + return DAG.getUNDEF(N->getValueType(0)); + } + + EVT NVT0 = TLI.getTypeToTransformTo(*DAG.getContext(), VT0); + SDValue StackSlot = DAG.CreateStackTemporary(VT1); + + SDLoc DL(N); + + TargetLowering::MakeLibCallOptions CallOptions; + SDValue Ops[2] = {GetSoftenedFloat(N->getOperand(0)), StackSlot}; + EVT OpsVT[2] = {VT0, StackSlot.getValueType()}; + + // TODO: setTypeListBeforeSoften can't properly express multiple return types, + // but we only really need to handle the 0th one for softening anyway. + CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true); + + auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL, + /*Chain=*/SDValue()); + int FrameIdx = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + auto PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + + SDValue LoadExp = DAG.getLoad(VT1, DL, Chain, StackSlot, PtrInfo); + + ReplaceValueWith(SDValue(N, 1), LoadExp); + return ReturnVal; +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::REM_F32, @@ -828,7 +900,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { dbgs() << "SoftenFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to soften this operator's operand!"); + report_fatal_error("Do not know how to soften this operator's operand!"); case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; @@ -1199,7 +1271,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { dbgs() << "ExpandFloatResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to expand the result of this operator!"); + report_fatal_error("Do not know how to expand the result of this " + "operator!"); case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; @@ -1253,6 +1326,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break; case ISD::STRICT_FPOWI: case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break; + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: ExpandFloatRes_FLDEXP(N, Lo, Hi); break; case ISD::FREEZE: 
ExpandFloatRes_FREEZE(N, Lo, Hi); break; case ISD::STRICT_FRINT: case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break; @@ -1548,6 +1623,11 @@ void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N, ExpandFloatRes_Binary(N, RTLIB::getPOWI(N->getValueType(0)), Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FLDEXP(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary(N, RTLIB::getLDEXP(N->getValueType(0)), Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0) == MVT::ppcf128 && @@ -1785,7 +1865,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) { dbgs() << "ExpandFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to expand this operator's operand!"); + report_fatal_error("Do not know how to expand this operator's operand!"); case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break; @@ -2106,7 +2186,7 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { dbgs() << "PromoteFloatOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to promote this operator's operand!"); + report_fatal_error("Do not know how to promote this operator's operand!"); case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break; case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break; @@ -2245,7 +2325,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { dbgs() << "PromoteFloatResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to promote this operator's result!"); + report_fatal_error("Do not know how to promote this operator's result!"); case ISD::BITCAST: R = PromoteFloatRes_BITCAST(N); break; case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break; @@ -2289,7 +2369,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMA: // FMA is same as FMAD case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break; - case ISD::FPOWI: R = PromoteFloatRes_FPOWI(N); break; + case ISD::FPOWI: + case ISD::FLDEXP: R = PromoteFloatRes_ExpOp(N); break; + case ISD::FFREXP: R = PromoteFloatRes_FFREXP(N); break; case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break; case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break; @@ -2304,6 +2386,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: R = PromoteFloatRes_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: @@ -2458,7 +2542,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) { } // Promote the Float (first) operand and retain the Integer (second) operand -SDValue DAGTypeLegalizer::PromoteFloatRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteFloatRes_ExpOp(SDNode *N) { EVT VT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); SDValue Op0 = GetPromotedFloat(N->getOperand(0)); @@ -2467,6 +2551,17 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FPOWI(SDNode *N) { return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1); } +SDValue DAGTypeLegalizer::PromoteFloatRes_FFREXP(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue Op = GetPromotedFloat(N->getOperand(0)); + SDValue Res = + DAG.getNode(N->getOpcode(), 
SDLoc(N), {NVT, N->getValueType(1)}, Op); + + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + // Explicit operation to reduce precision. Reduce the value to half precision // and promote it back to the legal type. SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) { @@ -2608,7 +2703,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { dbgs() << "SoftPromoteHalfResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to soft promote this operator's result!"); + report_fatal_error("Do not know how to soft promote this operator's " + "result!"); case ISD::BITCAST: R = SoftPromoteHalfRes_BITCAST(N); break; case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break; @@ -2655,7 +2751,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FMA: // FMA is same as FMAD case ISD::FMAD: R = SoftPromoteHalfRes_FMAD(N); break; - case ISD::FPOWI: R = SoftPromoteHalfRes_FPOWI(N); break; + case ISD::FPOWI: + case ISD::FLDEXP: R = SoftPromoteHalfRes_ExpOp(N); break; case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break; case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break; @@ -2668,6 +2765,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: R = SoftPromoteHalfRes_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: @@ -2767,7 +2866,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) { return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } -SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ExpOp(SDNode *N) { EVT OVT = N->getValueType(0); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT); SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0)); @@ -2916,7 +3015,8 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) { dbgs() << "SoftPromoteHalfOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to soft promote this operator's operand!"); + report_fatal_error("Do not know how to soft promote this operator's " + "operand!"); case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break; case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index c9ce9071a25d..df5878fcdf2e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -20,6 +20,7 @@ #include "LegalizeTypes.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -54,7 +55,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { dbgs() << "PromoteIntegerResult #" << ResNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to promote this operator!"); + report_fatal_error("Do not know how to promote this operator!"); case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break; case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break; case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break; @@ -115,6 +116,10 
@@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VECTOR_SHUFFLE(N); break; case ISD::VECTOR_SPLICE: Res = PromoteIntRes_VECTOR_SPLICE(N); break; + case ISD::VECTOR_INTERLEAVE: + case ISD::VECTOR_DEINTERLEAVE: + Res = PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(N); + return; case ISD::INSERT_VECTOR_ELT: Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break; case ISD::BUILD_VECTOR: @@ -134,7 +139,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break; case ISD::SIGN_EXTEND: + case ISD::VP_SIGN_EXTEND: case ISD::ZERO_EXTEND: + case ISD::VP_ZERO_EXTEND: case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break; case ISD::VP_FP_TO_SINT: @@ -191,8 +198,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ADDE: case ISD::SUBE: - case ISD::ADDCARRY: - case ISD::SUBCARRY: Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break; + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: Res = PromoteIntRes_UADDSUBO_CARRY(N, ResNo); break; case ISD::SADDO_CARRY: case ISD::SSUBO_CARRY: Res = PromoteIntRes_SADDSUBO_CARRY(N, ResNo); break; @@ -279,6 +286,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::IS_FPCLASS: Res = PromoteIntRes_IS_FPCLASS(N); break; + case ISD::FFREXP: + Res = PromoteIntRes_FFREXP(N); + break; } // If the result is null then the sub-method took care of registering it. @@ -753,8 +763,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!"); // If the result and operand types are the same after promotion, simplify - // to an in-register extension. - if (NVT == Res.getValueType()) { + // to an in-register extension. Unless this is a VP_*_EXTEND. + if (NVT == Res.getValueType() && N->getNumOperands() == 1) { // The high bits are not guaranteed to be anything. Insert an extend. if (N->getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res, @@ -767,6 +777,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) { } // Otherwise, just extend the original operand all the way to the larger type. + if (N->getNumOperands() != 1) { + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0), + N->getOperand(1), N->getOperand(2)); + } return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } @@ -1023,14 +1039,8 @@ static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS, if (VT.isVector()) WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, VT.getVectorElementCount()); - if (Signed) { - LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT); - RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT); - } else { - LHS = DAG.getZExtOrTrunc(LHS, dl, WideVT); - RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT); - } - + LHS = DAG.getExtOrTrunc(Signed, LHS, dl, WideVT); + RHS = DAG.getExtOrTrunc(Signed, RHS, dl, WideVT); SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale, DAG); assert(Res && "Expanding DIVFIX with wide type failed?"); @@ -1177,16 +1187,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) { // Get the SETCC result using the canonical SETCC type. 
SDValue SetCC; if (N->isStrictFPOpcode()) { - EVT VTs[] = {SVT, MVT::Other}; + SDVTList VTs = DAG.getVTList({SVT, MVT::Other}); SDValue Opers[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)}; - SetCC = DAG.getNode(N->getOpcode(), dl, VTs, Opers); + SetCC = DAG.getNode(N->getOpcode(), dl, VTs, Opers, N->getFlags()); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), SetCC.getValue(1)); } else SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0), - N->getOperand(1), N->getOperand(2)); + N->getOperand(1), N->getOperand(2), N->getFlags()); // Convert to the expected type. return DAG.getSExtOrTrunc(SetCC, dl, NVT); @@ -1200,6 +1210,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_IS_FPCLASS(SDNode *N) { return DAG.getNode(ISD::IS_FPCLASS, DL, NResVT, Arg, Test); } +SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1)); + EVT VT = N->getValueType(0); + + SDLoc dl(N); + SDValue Res = + DAG.getNode(N->getOpcode(), dl, DAG.getVTList(VT, NVT), N->getOperand(0)); + + ReplaceValueWith(SDValue(N, 0), Res); + return Res.getValue(1); +} + SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) { SDValue LHS = GetPromotedInteger(N->getOperand(0)); SDValue RHS = N->getOperand(1); @@ -1445,23 +1467,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) { return Res; } -// Handle promotion for the ADDE/SUBE/ADDCARRY/SUBCARRY nodes. Notice that +// Handle promotion for the ADDE/SUBE/UADDO_CARRY/USUBO_CARRY nodes. Notice that // the third operand of ADDE/SUBE nodes is carry flag, which differs from -// the ADDCARRY/SUBCARRY nodes in that the third operand is carry Boolean. -SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo) { +// the UADDO_CARRY/USUBO_CARRY nodes in that the third operand is carry Boolean. +SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO_CARRY(SDNode *N, + unsigned ResNo) { if (ResNo == 1) return PromoteIntRes_Overflow(N); // We need to sign-extend the operands so the carry value computed by the // wide operation will be equivalent to the carry value computed by the // narrow operation. - // An ADDCARRY can generate carry only if any of the operands has its + // An UADDO_CARRY can generate carry only if any of the operands has its // most significant bit set. Sign extension propagates the most significant // bit into the higher bits which means the extra bit that the narrow // addition would need (i.e. the carry) will be propagated through the higher // bits of the wide addition. - // A SUBCARRY can generate borrow only if LHS < RHS and this property will be - // preserved by sign extension. + // A USUBO_CARRY can generate borrow only if LHS < RHS and this property will + // be preserved by sign extension. 
SDValue LHS = SExtPromotedInteger(N->getOperand(0)); SDValue RHS = SExtPromotedInteger(N->getOperand(1)); @@ -1629,7 +1652,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": "; N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to promote this operator's operand!"); + report_fatal_error("Do not know how to promote this operator's operand!"); case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break; case ISD::ATOMIC_STORE: @@ -1655,6 +1678,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::VP_SETCC: case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break; case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break; + case ISD::VP_SIGN_EXTEND: Res = PromoteIntOp_VP_SIGN_EXTEND(N); break; case ISD::VP_SINT_TO_FP: case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break; case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break; @@ -1676,6 +1700,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; + case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break; case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_SUBVECTOR: Res = PromoteIntOp_INSERT_SUBVECTOR(N); break; @@ -1690,8 +1715,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SADDO_CARRY: case ISD::SSUBO_CARRY: - case ISD::ADDCARRY: - case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break; + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: Res = PromoteIntOp_ADDSUBO_CARRY(N, OpNo); break; case ISD::FRAMEADDR: case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break; @@ -1706,10 +1731,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::SDIVFIXSAT: case ISD::UDIVFIX: case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break; - case ISD::FPOWI: - case ISD::STRICT_FPOWI: Res = PromoteIntOp_FPOWI(N); break; - + case ISD::STRICT_FPOWI: + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: Res = PromoteIntOp_ExpOp(N); break; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: case ISD::VECREDUCE_AND: @@ -2005,6 +2030,23 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) { Op, DAG.getValueType(N->getOperand(0).getValueType())); } +SDValue DAGTypeLegalizer::PromoteIntOp_VP_SIGN_EXTEND(SDNode *N) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Op = GetPromotedInteger(N->getOperand(0)); + // FIXME: There is no VP_ANY_EXTEND yet. + Op = DAG.getNode(ISD::VP_ZERO_EXTEND, dl, VT, Op, N->getOperand(1), + N->getOperand(2)); + unsigned Diff = + VT.getScalarSizeInBits() - N->getOperand(0).getScalarValueSizeInBits(); + SDValue ShAmt = DAG.getShiftAmountConstant(Diff, VT, dl); + // FIXME: There is no VP_SIGN_EXTEND_INREG so use a pair of shifts. 
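Since there is no VP_SIGN_EXTEND_INREG, the code above synthesizes it from VP_SHL and VP_ASHR. Per lane that is the classic two-shift sign extension; an i8-in-i32 scalar model (standalone C++, assuming arithmetic right shift on signed types, guaranteed since C++20):

#include <cassert>
#include <cstdint>

static int32_t signExtendInReg8(uint32_t X) {
  const unsigned Diff = 32 - 8;        // the ShAmt computed above
  return int32_t(X << Diff) >> Diff;   // shl then ashr
}

int main() {
  assert(signExtendInReg8(0x000000FFu) == -1);
  assert(signExtendInReg8(0x0000007Fu) == 127);
  assert(signExtendInReg8(0xDEADBE80u) == -128); // stale high bits discarded
}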
+ SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShAmt, N->getOperand(1), + N->getOperand(2)); + return DAG.getNode(ISD::VP_ASHR, dl, VT, Shl, ShAmt, N->getOperand(1), + N->getOperand(2)); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) { if (N->getOpcode() == ISD::VP_SINT_TO_FP) return SDValue(DAG.UpdateNodeOperands(N, @@ -2156,7 +2198,20 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) { return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType()); } -SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) { +SDValue DAGTypeLegalizer::PromoteIntOp_VP_ZERO_EXTEND(SDNode *N) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Op = GetPromotedInteger(N->getOperand(0)); + // FIXME: There is no VP_ANY_EXTEND yet. + Op = DAG.getNode(ISD::VP_ZERO_EXTEND, dl, VT, Op, N->getOperand(1), + N->getOperand(2)); + APInt Imm = APInt::getLowBitsSet(VT.getScalarSizeInBits(), + N->getOperand(0).getScalarValueSizeInBits()); + return DAG.getNode(ISD::VP_AND, dl, VT, Op, DAG.getConstant(Imm, dl, VT), + N->getOperand(1), N->getOperand(2)); +} + +SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo) { assert(OpNo == 2 && "Don't know how to promote this operand!"); SDValue LHS = N->getOperand(0); @@ -2193,26 +2248,29 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) { 0); } -SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntOp_ExpOp(SDNode *N) { bool IsStrict = N->isStrictFPOpcode(); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); - // The integer operand is the last operand in FPOWI (so the result and - // floating point operand is already type legalized). + bool IsPowI = + N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI; + + // The integer operand is the last operand in FPOWI (or FLDEXP) (so the result + // and floating point operand is already type legalized). + RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0)) + : RTLIB::getLDEXP(N->getValueType(0)); + + if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) { + SDValue Op = SExtPromotedInteger(N->getOperand(1)); + return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0); + } // We can't just promote the exponent type in FPOWI, since we want to lower // the node to a libcall and we if we promote to a type larger than // sizeof(int) the libcall might not be according to the targets ABI. Instead // we rewrite to a libcall here directly, letting makeLibCall handle promotion // if the target accepts it according to shouldSignExtendTypeInLibCall. - RTLIB::Libcall LC = RTLIB::getPOWI(N->getValueType(0)); - assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi."); - if (!TLI.getLibcallName(LC)) { - // Some targets don't have a powi libcall; use pow instead. - // FIXME: Implement this if some target needs it. - DAG.getContext()->emitError("Don't know how to promote fpowi to fpow"); - return DAG.getUNDEF(N->getValueType(0)); - } + unsigned OpOffset = IsStrict ? 1 : 0; // The exponent should fit in a sizeof(int) type for the libcall to be valid. 
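PromoteIntOp_ExpOp treats FPOWI and FLDEXP alike because both take one floating-point operand plus one integer exponent, and both may end up as libcalls whose exponent must fit in a C int. Their semantics as a runnable reference (plain C++; powiRef is an illustrative helper for non-negative exponents):

#include <cassert>
#include <cmath>

static double powiRef(double X, int N) { // x^n by repeated multiplication
  double R = 1.0;
  for (int I = 0; I < N; ++I)
    R *= X;
  return R;
}

int main() {
  assert(std::ldexp(1.5, 3) == 12.0); // ldexp(x, n) == x * 2^n, exact here
  assert(powiRef(2.0, 10) == 1024.0); // powi(x, n) == x^n
}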
assert(DAG.getLibInfo().getIntSize() == @@ -2290,16 +2348,40 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) { // An i1 vecreduce_or is equivalent to vecreduce_umax, use that instead if // vecreduce_or is not legal else if (Opcode == ISD::VECREDUCE_OR && OrigEltVT == MVT::i1 && - !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_OR, InVT) && - TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMAX, InVT)) + !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_OR, InVT) && + TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMAX, InVT)) { Opcode = ISD::VECREDUCE_UMAX; + // Can't use promoteTargetBoolean here because we still need + // to either sign_ext or zero_ext in the undefined case. + switch (TLI.getBooleanContents(InVT)) { + case TargetLoweringBase::UndefinedBooleanContent: + case TargetLoweringBase::ZeroOrOneBooleanContent: + Op = ZExtPromotedInteger(N->getOperand(0)); + break; + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + Op = SExtPromotedInteger(N->getOperand(0)); + break; + } + } // An i1 vecreduce_and is equivalent to vecreduce_umin, use that instead if // vecreduce_and is not legal else if (Opcode == ISD::VECREDUCE_AND && OrigEltVT == MVT::i1 && - !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_AND, InVT) && - TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMIN, InVT)) + !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_AND, InVT) && + TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMIN, InVT)) { Opcode = ISD::VECREDUCE_UMIN; + // Can't use promoteTargetBoolean here because we still need + // to either sign_ext or zero_ext in the undefined case. + switch (TLI.getBooleanContents(InVT)) { + case TargetLoweringBase::UndefinedBooleanContent: + case TargetLoweringBase::ZeroOrOneBooleanContent: + Op = ZExtPromotedInteger(N->getOperand(0)); + break; + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + Op = SExtPromotedInteger(N->getOperand(0)); + break; + } + } if (ResVT.bitsGE(EltVT)) return DAG.getNode(Opcode, SDLoc(N), ResVT, Op); @@ -2512,8 +2594,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::ADDE: case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break; - case ISD::ADDCARRY: - case ISD::SUBCARRY: ExpandIntRes_ADDSUBCARRY(N, Lo, Hi); break; + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: ExpandIntRes_UADDSUBO_CARRY(N, Lo, Hi); break; case ISD::SADDO_CARRY: case ISD::SSUBO_CARRY: ExpandIntRes_SADDSUBO_CARRY(N, Lo, Hi); break; @@ -2874,48 +2956,118 @@ static std::pair<ISD::CondCode, ISD::NodeType> getExpandedMinMaxOps(int Op) { void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc DL(N); - ISD::NodeType LoOpc; - ISD::CondCode CondC; - std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode()); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - // Expand the subcomponents. - SDValue LHSL, LHSH, RHSL, RHSH; - GetExpandedInteger(LHS, LHSL, LHSH); - GetExpandedInteger(RHS, RHSL, RHSH); - - // Value types - EVT NVT = LHSL.getValueType(); - EVT CCT = getSetCCResultType(NVT); - // If the upper halves are all sign bits, then we can perform the MINMAX on // the lower half and sign-extend the result to the upper half. 
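The sign-bit fast path that follows, restated at scalar width: if both operands of a signed min/max fit in the low half (all upper bits are sign bits), the low-half result sign-extended is the full answer. An i16-split-as-two-i8 check (standalone C++, not LLVM code):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t smaxViaLowHalf(int16_t A, int16_t B) {
  // Precondition: both values have more than 8 sign bits, i.e. fit in i8.
  int8_t Lo = std::max(int8_t(A), int8_t(B)); // MINMAX on the low half only
  return int16_t(Lo);                         // SRA-style sign extension
}

int main() {
  for (int A = -128; A < 128; ++A)
    for (int B = -128; B < 128; ++B)
      assert(smaxViaLowHalf(int16_t(A), int16_t(B)) == std::max(A, B));
}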
-  unsigned NumHalfBits = NVT.getScalarSizeInBits();
+  unsigned NumBits = N->getValueType(0).getScalarSizeInBits();
+  unsigned NumHalfBits = NumBits / 2;
   if (DAG.ComputeNumSignBits(LHS) > NumHalfBits &&
       DAG.ComputeNumSignBits(RHS) > NumHalfBits) {
+    SDValue LHSL, LHSH, RHSL, RHSH;
+    GetExpandedInteger(LHS, LHSL, LHSH);
+    GetExpandedInteger(RHS, RHSL, RHSH);
+    EVT NVT = LHSL.getValueType();
+
     Lo = DAG.getNode(N->getOpcode(), DL, NVT, LHSL, RHSL);
     Hi = DAG.getNode(ISD::SRA, DL, NVT, Lo,
                      DAG.getShiftAmountConstant(NumHalfBits - 1, NVT, DL));
     return;
   }
-  // Hi part is always the same op
-  Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
+  // The Lo of smin(X, -1) is LHSL if X is negative. Otherwise it's -1.
+  // The Lo of smax(X, 0) is 0 if X is negative. Otherwise it's LHSL.
+  if ((N->getOpcode() == ISD::SMAX && isNullConstant(RHS)) ||
+      (N->getOpcode() == ISD::SMIN && isAllOnesConstant(RHS))) {
+    SDValue LHSL, LHSH, RHSL, RHSH;
+    GetExpandedInteger(LHS, LHSL, LHSH);
+    GetExpandedInteger(RHS, RHSL, RHSH);
+    EVT NVT = LHSL.getValueType();
+    EVT CCT = getSetCCResultType(NVT);
-  // We need to know whether to select Lo part that corresponds to 'winning'
-  // Hi part or if Hi parts are equal.
-  SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC);
-  SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ);
+    SDValue HiNeg =
+        DAG.getSetCC(DL, CCT, LHSH, DAG.getConstant(0, DL, NVT), ISD::SETLT);
+    if (N->getOpcode() == ISD::SMIN) {
+      Lo = DAG.getSelect(DL, NVT, HiNeg, LHSL, DAG.getConstant(-1, DL, NVT));
+    } else {
+      Lo = DAG.getSelect(DL, NVT, HiNeg, DAG.getConstant(0, DL, NVT), LHSL);
+    }
+    Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
+    return;
+  }
-  // Lo part corresponding to the 'winning' Hi part
-  SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
+  const APInt *RHSVal = nullptr;
+  if (auto *RHSConst = dyn_cast<ConstantSDNode>(RHS))
+    RHSVal = &RHSConst->getAPIntValue();
-  // Recursed Lo part if Hi parts are equal, this uses unsigned version
-  SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
+  // The high half of MIN/MAX is always just the MIN/MAX of the
+  // high halves of the operands. Expand this way if it appears profitable.
+  if (RHSVal && (N->getOpcode() == ISD::UMIN || N->getOpcode() == ISD::UMAX) &&
+      (RHSVal->countLeadingOnes() >= NumHalfBits ||
+       RHSVal->countLeadingZeros() >= NumHalfBits)) {
+    SDValue LHSL, LHSH, RHSL, RHSH;
+    GetExpandedInteger(LHS, LHSL, LHSH);
+    GetExpandedInteger(RHS, RHSL, RHSH);
+    EVT NVT = LHSL.getValueType();
+    EVT CCT = getSetCCResultType(NVT);
+
+    ISD::NodeType LoOpc;
+    ISD::CondCode CondC;
+    std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode());
+
+    Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
+    // We need to know whether to select Lo part that corresponds to 'winning'
+    // Hi part or if Hi parts are equal.
+    SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC);
+    SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ);
-  Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
+    // Lo part corresponding to the 'winning' Hi part
+    SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
+
+    // Recursed Lo part if Hi parts are equal, this uses unsigned version
+    SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
+
+    Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
+    return;
+  }
+
+  // Expand to "a < b ? a : b" etc. Prefer ge/le if that simplifies
+  // the compare.
+ ISD::CondCode Pred; + switch (N->getOpcode()) { + default: llvm_unreachable("How did we get here?"); + case ISD::SMAX: + if (RHSVal && RHSVal->countTrailingZeros() >= NumHalfBits) + Pred = ISD::SETGE; + else + Pred = ISD::SETGT; + break; + case ISD::SMIN: + if (RHSVal && RHSVal->countTrailingOnes() >= NumHalfBits) + Pred = ISD::SETLE; + else + Pred = ISD::SETLT; + break; + case ISD::UMAX: + if (RHSVal && RHSVal->countTrailingZeros() >= NumHalfBits) + Pred = ISD::SETUGE; + else + Pred = ISD::SETUGT; + break; + case ISD::UMIN: + if (RHSVal && RHSVal->countTrailingOnes() >= NumHalfBits) + Pred = ISD::SETULE; + else + Pred = ISD::SETULT; + break; + } + EVT VT = N->getValueType(0); + EVT CCT = getSetCCResultType(VT); + SDValue Cond = DAG.getSetCC(DL, CCT, LHS, RHS, Pred); + SDValue Result = DAG.getSelect(DL, VT, Cond, LHS, RHS); + SplitInteger(Result, Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, @@ -2931,7 +3083,7 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, SDValue HiOps[3] = { LHSH, RHSH }; bool HasOpCarry = TLI.isOperationLegalOrCustom( - N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY, + N->getOpcode() == ISD::ADD ? ISD::UADDO_CARRY : ISD::USUBO_CARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (HasOpCarry) { SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT)); @@ -2940,13 +3092,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, HiOps[2] = Lo.getValue(1); Hi = DAG.computeKnownBits(HiOps[2]).isZero() ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2)) - : DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps); + : DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps); } else { Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps); HiOps[2] = Lo.getValue(1); Hi = DAG.computeKnownBits(HiOps[2]).isZero() ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2)) - : DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps); + : DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps); } return; } @@ -3014,8 +3166,22 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, if (N->getOpcode() == ISD::ADD) { Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps); Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2)); - SDValue Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], - ISD::SETULT); + SDValue Cmp; + // Special case: X+1 has a carry out if X+1==0. This may reduce the live + // range of X. We assume comparing with 0 is cheap. 
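The special case introduced here (continued just below) rests on two small identities that are easy to verify directly: X + 1 carries out exactly when the sum wraps to zero, and X + ~0 carries out exactly when X is nonzero. Checked at 64 bits (standalone C++):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Samples[] = {0, 1, 2, 0x7FFFFFFFFFFFFFFFull,
                              0x8000000000000000ull, ~0ull - 1, ~0ull};
  for (uint64_t X : Samples) {
    bool CarryPlus1 = (X + 1) < X;        // generic rule: sum < LHS
    assert(CarryPlus1 == (X + 1 == 0));   // the cheap compare against zero
    bool CarryPlusM1 = (X + ~0ull) < X;
    assert(CarryPlusM1 == (X != 0));
  }
}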
+ if (isOneConstant(LoOps[1])) + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, + DAG.getConstant(0, dl, NVT), ISD::SETEQ); + else if (isAllOnesConstant(LoOps[1])) { + if (isAllOnesConstant(HiOps[1])) + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0], + DAG.getConstant(0, dl, NVT), ISD::SETEQ); + else + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0], + DAG.getConstant(0, dl, NVT), ISD::SETNE); + } else + Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0], + ISD::SETULT); SDValue Carry; if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) @@ -3024,7 +3190,10 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N, Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT), DAG.getConstant(0, dl, NVT)); - Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); + if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) + Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry); + else + Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry); } else { Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps); Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2)); @@ -3101,12 +3270,12 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, ISD::CondCode Cond; switch(N->getOpcode()) { case ISD::UADDO: - CarryOp = ISD::ADDCARRY; + CarryOp = ISD::UADDO_CARRY; NoCarryOp = ISD::ADD; Cond = ISD::SETULT; break; case ISD::USUBO: - CarryOp = ISD::SUBCARRY; + CarryOp = ISD::USUBO_CARRY; NoCarryOp = ISD::SUB; Cond = ISD::SETUGT; break; @@ -3137,9 +3306,22 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, SDValue Sum = DAG.getNode(NoCarryOp, dl, LHS.getValueType(), LHS, RHS); SplitInteger(Sum, Lo, Hi); - // Calculate the overflow: addition overflows iff a + b < a, and subtraction - // overflows iff a - b > a. - Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond); + if (N->getOpcode() == ISD::UADDO && isOneConstant(RHS)) { + // Special case: uaddo X, 1 overflowed if X+1 == 0. We can detect this + // with (Lo | Hi) == 0. + SDValue Or = DAG.getNode(ISD::OR, dl, Lo.getValueType(), Lo, Hi); + Ovf = DAG.getSetCC(dl, N->getValueType(1), Or, + DAG.getConstant(0, dl, Lo.getValueType()), ISD::SETEQ); + } else if (N->getOpcode() == ISD::UADDO && isAllOnesConstant(RHS)) { + // Special case: uaddo X, -1 overflows if X == 0. + Ovf = + DAG.getSetCC(dl, N->getValueType(1), LHS, + DAG.getConstant(0, dl, LHS.getValueType()), ISD::SETNE); + } else { + // Calculate the overflow: addition overflows iff a + b < a, and + // subtraction overflows iff a - b > a. + Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond); + } } // Legalized the flag result - switch anything that used the old flag to @@ -3147,8 +3329,8 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N, ReplaceValueWith(SDValue(N, 1), Ovf); } -void DAGTypeLegalizer::ExpandIntRes_ADDSUBCARRY(SDNode *N, - SDValue &Lo, SDValue &Hi) { +void DAGTypeLegalizer::ExpandIntRes_UADDSUBO_CARRY(SDNode *N, SDValue &Lo, + SDValue &Hi) { // Expand the subcomponents. SDValue LHSL, LHSH, RHSL, RHSH; SDLoc dl(N); @@ -3177,8 +3359,8 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO_CARRY(SDNode *N, SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1)); // We need to use an unsigned carry op for the lo part. - unsigned CarryOp = N->getOpcode() == ISD::SADDO_CARRY ? ISD::ADDCARRY - : ISD::SUBCARRY; + unsigned CarryOp = + N->getOpcode() == ISD::SADDO_CARRY ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY; Lo = DAG.getNode(CarryOp, dl, VTList, { LHSL, RHSL, N->getOperand(2) }); Hi = DAG.getNode(N->getOpcode(), dl, VTList, { LHSH, RHSH, Lo.getValue(1) }); @@ -3308,14 +3490,14 @@ void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) { return; } - // If we have SUBCARRY, use the expanded form of the sra+xor+sub sequence we - // use in LegalizeDAG. The SUB part of the expansion is based on - // ExpandIntRes_ADDSUB which also uses SUBCARRY/USUBO after checking that - // SUBCARRY is LegalOrCustom. Each of the pieces here can be further expanded - // if needed. Shift expansion has a special case for filling with sign bits - // so that we will only end up with one SRA. + // If we have USUBO_CARRY, use the expanded form of the sra+xor+sub sequence + // we use in LegalizeDAG. The SUB part of the expansion is based on + // ExpandIntRes_ADDSUB which also uses USUBO_CARRY/USUBO after checking that + // USUBO_CARRY is LegalOrCustom. Each of the pieces here can be further + // expanded if needed. Shift expansion has a special case for filling with + // sign bits so that we will only end up with one SRA. bool HasSubCarry = TLI.isOperationLegalOrCustom( - ISD::SUBCARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); + ISD::USUBO_CARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT)); if (HasSubCarry) { SDValue Sign = DAG.getNode( ISD::SRA, dl, NVT, Hi, @@ -3324,7 +3506,7 @@ void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) { Lo = DAG.getNode(ISD::XOR, dl, NVT, Lo, Sign); Hi = DAG.getNode(ISD::XOR, dl, NVT, Hi, Sign); Lo = DAG.getNode(ISD::USUBO, dl, VTList, Lo, Sign); - Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, Hi, Sign, Lo.getValue(1)); + Hi = DAG.getNode(ISD::USUBO_CARRY, dl, VTList, Hi, Sign, Lo.getValue(1)); return; } @@ -4956,8 +5138,7 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS, ConstantSDNode *LoCmpC = dyn_cast<ConstantSDNode>(LoCmp.getNode()); ConstantSDNode *HiCmpC = dyn_cast<ConstantSDNode>(HiCmp.getNode()); - bool EqAllowed = (CCCode == ISD::SETLE || CCCode == ISD::SETGE || - CCCode == ISD::SETUGE || CCCode == ISD::SETULE); + bool EqAllowed = ISD::isTrueWhenEqual(CCCode); // FIXME: Is the HiCmpC->isOne() here correct for // ZeroOrNegativeOneBooleanContent. @@ -5088,9 +5269,10 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) { GetExpandedInteger(LHS, LHSLo, LHSHi); GetExpandedInteger(RHS, RHSLo, RHSHi); - // Expand to a SUBE for the low part and a smaller SETCCCARRY for the high. + // Expand to a USUBO_CARRY for the low part and a SETCCCARRY for the high. 
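The SETCCCARRY expansion described in the comment above (its code follows) computes a wide unsigned comparison as a borrow chain: USUBO_CARRY on the low halves, then the borrow out of the high-half subtract is the final truth value. A scalar model for u64 < u64 via u32 halves (standalone C++):

#include <cassert>
#include <cstdint>

static bool ult64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  bool BorrowLo = ALo < BLo;                 // borrow out of the low USUBO
  // The borrow out of (AHi - BHi - BorrowLo) decides the whole comparison.
  return uint64_t(AHi) < uint64_t(BHi) + BorrowLo;
}

int main() {
  const uint64_t Samples[] = {0, 1, 0xFFFFFFFFull, 0x100000000ull,
                              0xDEADBEEFCAFEBABEull, ~0ull};
  for (uint64_t A : Samples)
    for (uint64_t B : Samples)
      assert(ult64ViaHalves(A, B) == (A < B));
}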
SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType()); - SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry); + SDValue LowCmp = + DAG.getNode(ISD::USUBO_CARRY, dl, VTList, LHSLo, RHSLo, Carry); return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi, LowCmp.getValue(1), Cond); } @@ -5293,6 +5475,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) { return DAG.getNode(ISD::VECTOR_SPLICE, dl, OutVT, V0, V1, N->getOperand(2)); } +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N) { + SDLoc dl(N); + + SDValue V0 = GetPromotedInteger(N->getOperand(0)); + SDValue V1 = GetPromotedInteger(N->getOperand(1)); + EVT ResVT = V0.getValueType(); + SDValue Res = DAG.getNode(N->getOpcode(), dl, + DAG.getVTList(ResVT, ResVT), V0, V1); + SetPromotedInteger(SDValue(N, 0), Res.getValue(0)); + SetPromotedInteger(SDValue(N, 1), Res.getValue(1)); + return SDValue(); +} + SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) { EVT OutVT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 5e0349593139..328939e44dcb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -245,8 +245,7 @@ bool DAGTypeLegalizer::run() { // types are illegal. for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) { EVT ResultVT = N->getValueType(i); - LLVM_DEBUG(dbgs() << "Analyzing result type: " << ResultVT.getEVTString() - << "\n"); + LLVM_DEBUG(dbgs() << "Analyzing result type: " << ResultVT << "\n"); switch (getTypeAction(ResultVT)) { case TargetLowering::TypeLegal: LLVM_DEBUG(dbgs() << "Legal result type\n"); @@ -716,7 +715,6 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) { auto &OpIdEntry = PromotedIntegers[getTableId(Op)]; assert((OpIdEntry == 0) && "Node is already promoted!"); OpIdEntry = getTableId(Result); - Result->setFlags(Op->getFlags()); DAG.transferDbgValues(Op, Result); } @@ -989,10 +987,7 @@ void DAGTypeLegalizer::GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi) { SDLoc dl(Pair); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType()); - Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair, - DAG.getIntPtrConstant(0, dl)); - Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair, - DAG.getIntPtrConstant(1, dl)); + std::tie(Lo, Hi) = DAG.SplitScalar(Pair, dl, NVT, NVT); } /// Build an integer with low bits Lo and high bits Hi. 
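JoinIntegers, whose body follows, is the zext/shl/or idiom; the hunk below only changes how the shift-amount type is chosen. As plain arithmetic (standalone C++):

#include <cassert>
#include <cstdint>

static uint64_t joinIntegers(uint32_t Lo, uint32_t Hi) {
  return (uint64_t(Hi) << 32) | uint64_t(Lo); // zext Lo; extend, shl, or Hi
}

int main() {
  assert(joinIntegers(0xCAFEBABEu, 0xDEADBEEFu) == 0xDEADBEEFCAFEBABEull);
}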
@@ -1005,7 +1000,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { EVT NVT = EVT::getIntegerVT(*DAG.getContext(), LVT.getSizeInBits() + HVT.getSizeInBits()); - EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout(), false); + EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo); Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi); Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index b97e44a01319..db8f61eee606 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -307,6 +307,7 @@ private: SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N); SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N); SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N); + SDValue PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N); SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N); SDValue PromoteIntRes_ScalarOp(SDNode *N); SDValue PromoteIntRes_STEP_VECTOR(SDNode *N); @@ -331,6 +332,7 @@ private: SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); + SDValue PromoteIntRes_FFREXP(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_Select(SDNode *N); SDValue PromoteIntRes_SELECT_CC(SDNode *N); @@ -345,7 +347,7 @@ private: SDValue PromoteIntRes_SRL(SDNode *N); SDValue PromoteIntRes_TRUNCATE(SDNode *N); SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo); - SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo); + SDValue PromoteIntRes_UADDSUBO_CARRY(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_SADDSUBO_CARRY(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); @@ -383,6 +385,7 @@ private: SDValue PromoteIntOp_Shift(SDNode *N); SDValue PromoteIntOp_FunnelShift(SDNode *N); SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N); + SDValue PromoteIntOp_VP_SIGN_EXTEND(SDNode *N); SDValue PromoteIntOp_SINT_TO_FP(SDNode *N); SDValue PromoteIntOp_STRICT_SINT_TO_FP(SDNode *N); SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo); @@ -390,15 +393,16 @@ private: SDValue PromoteIntOp_UINT_TO_FP(SDNode *N); SDValue PromoteIntOp_STRICT_UINT_TO_FP(SDNode *N); SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); + SDValue PromoteIntOp_VP_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); - SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FIX(SDNode *N); - SDValue PromoteIntOp_FPOWI(SDNode *N); + SDValue PromoteIntOp_ExpOp(SDNode *N); SDValue PromoteIntOp_VECREDUCE(SDNode *N); SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); @@ -447,7 +451,7 @@ private: void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi); - void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi); + void 
ExpandIntRes_UADDSUBO_CARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_SADDSUBO_CARRY (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -558,9 +562,11 @@ private: SDValue SoftenFloatRes_FNEG(SDNode *N); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); + SDValue SoftenFloatRes_BF16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); SDValue SoftenFloatRes_FPOW(SDNode *N); - SDValue SoftenFloatRes_FPOWI(SDNode *N); + SDValue SoftenFloatRes_ExpOp(SDNode *N); + SDValue SoftenFloatRes_FFREXP(SDNode *N); SDValue SoftenFloatRes_FREEZE(SDNode *N); SDValue SoftenFloatRes_FREM(SDNode *N); SDValue SoftenFloatRes_FRINT(SDNode *N); @@ -638,6 +644,7 @@ private: void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FLDEXP (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -687,7 +694,8 @@ private: SDValue PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteFloatRes_FCOPYSIGN(SDNode *N); SDValue PromoteFloatRes_FMAD(SDNode *N); - SDValue PromoteFloatRes_FPOWI(SDNode *N); + SDValue PromoteFloatRes_ExpOp(SDNode *N); + SDValue PromoteFloatRes_FFREXP(SDNode *N); SDValue PromoteFloatRes_FP_ROUND(SDNode *N); SDValue PromoteFloatRes_LOAD(SDNode *N); SDValue PromoteFloatRes_SELECT(SDNode *N); @@ -728,7 +736,7 @@ private: SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N); SDValue SoftPromoteHalfRes_FMAD(SDNode *N); - SDValue SoftPromoteHalfRes_FPOWI(SDNode *N); + SDValue SoftPromoteHalfRes_ExpOp(SDNode *N); SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N); SDValue SoftPromoteHalfRes_LOAD(SDNode *N); SDValue SoftPromoteHalfRes_SELECT(SDNode *N); @@ -781,7 +789,7 @@ private: SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); - SDValue ScalarizeVecRes_FPOWI(SDNode *N); + SDValue ScalarizeVecRes_ExpOp(SDNode *N); SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N); SDValue ScalarizeVecRes_LOAD(LoadSDNode *N); SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N); @@ -795,6 +803,7 @@ private: SDValue ScalarizeVecRes_IS_FPCLASS(SDNode *N); SDValue ScalarizeVecRes_FIX(SDNode *N); + SDValue ScalarizeVecRes_FFREXP(SDNode *N, unsigned ResNo); // Vector Operand Scalarization: <1 x ty> -> ty. 
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo); @@ -843,6 +852,7 @@ private: void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_FFREXP(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -857,8 +867,7 @@ private: void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi); - void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); @@ -875,6 +884,8 @@ private: void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N); + void SplitVecRes_VECTOR_INTERLEAVE(SDNode *N); void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -901,7 +912,7 @@ private: SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N); SDValue SplitVecOp_VSETCC(SDNode *N); SDValue SplitVecOp_FP_ROUND(SDNode *N); - SDValue SplitVecOp_FCOPYSIGN(SDNode *N); + SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N); SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); //===--------------------------------------------------------------------===// @@ -942,6 +953,7 @@ private: // Widen Vector Result Promotion. void WidenVectorResult(SDNode *N, unsigned ResNo); SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo); + SDValue WidenVecRes_AssertZext(SDNode* N); SDValue WidenVecRes_BITCAST(SDNode* N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); @@ -976,7 +988,7 @@ private: SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N); SDValue WidenVecRes_FCOPYSIGN(SDNode *N); SDValue WidenVecRes_IS_FPCLASS(SDNode *N); - SDValue WidenVecRes_POWI(SDNode *N); + SDValue WidenVecRes_ExpOp(SDNode *N); SDValue WidenVecRes_Unary(SDNode *N); SDValue WidenVecRes_InregOp(SDNode *N); @@ -1001,11 +1013,12 @@ private: SDValue WidenVecOp_Convert(SDNode *N); SDValue WidenVecOp_FP_TO_XINT_SAT(SDNode *N); - SDValue WidenVecOp_FCOPYSIGN(SDNode *N); + SDValue WidenVecOp_UnrollVectorOp(SDNode *N); SDValue WidenVecOp_IS_FPCLASS(SDNode *N); SDValue WidenVecOp_VECREDUCE(SDNode *N); SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N); SDValue WidenVecOp_VP_REDUCE(SDNode *N); + SDValue WidenVecOp_ExpOp(SDNode *N); /// Helper function to generate a set of operations to perform /// a vector operation for a wider type. @@ -1071,6 +1084,7 @@ private: // Generic Result Splitting. 
void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi); + void SplitVecRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 21b5255c8f72..296242c00401 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -571,6 +571,16 @@ void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) { Hi = DAG.getUNDEF(HiVT); } +void DAGTypeLegalizer::SplitVecRes_AssertZext(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDValue L, H; + SDLoc dl(N); + GetSplitOp(N->getOperand(0), L, H); + + Lo = DAG.getNode(ISD::AssertZext, dl, L.getValueType(), L, N->getOperand(1)); + Hi = DAG.getNode(ISD::AssertZext, dl, H.getValueType(), H, N->getOperand(1)); +} + void DAGTypeLegalizer::SplitRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue L, H; SDLoc dl(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index e245b3cb4c6d..3862fd241897 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" @@ -38,7 +39,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include <cassert> #include <cstdint> #include <iterator> @@ -296,7 +296,16 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { if (Op.getOpcode() == ISD::STRICT_SINT_TO_FP || Op.getOpcode() == ISD::STRICT_UINT_TO_FP) ValVT = Node->getOperand(1).getValueType(); - Action = TLI.getOperationAction(Node->getOpcode(), ValVT); + if (Op.getOpcode() == ISD::STRICT_FSETCC || + Op.getOpcode() == ISD::STRICT_FSETCCS) { + MVT OpVT = Node->getOperand(1).getSimpleValueType(); + ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(3))->get(); + Action = TLI.getCondCodeAction(CCCode, OpVT); + if (Action == TargetLowering::Legal) + Action = TLI.getOperationAction(Node->getOpcode(), OpVT); + } else { + Action = TLI.getOperationAction(Node->getOpcode(), ValVT); + } // If we're asked to expand a strict vector floating-point operation, // by default we're going to simply unroll it. 
That is usually the // best approach, except in the case where the resulting strict (scalar) @@ -368,6 +377,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FLDEXP: case ISD::FPOWI: case ISD::FPOW: case ISD::FLOG: @@ -402,6 +412,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::SMULO: case ISD::UMULO: case ISD::FCANONICALIZE: + case ISD::FFREXP: case ISD::SADDSAT: case ISD::UADDSAT: case ISD::SSUBSAT: @@ -441,6 +452,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -454,7 +467,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get(); Action = TLI.getCondCodeAction(CCCode, OpVT); if (Action == TargetLowering::Legal) - Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); + Action = TLI.getOperationAction(Node->getOpcode(), OpVT); break; } @@ -785,6 +798,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { return; } break; + case ISD::ABDS: + case ISD::ABDU: + if (SDValue Expanded = TLI.expandABD(Node, DAG)) { + Results.push_back(Expanded); + return; + } + break; case ISD::BITREVERSE: ExpandBITREVERSE(Node, Results); return; @@ -943,6 +963,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { case ISD::VECREDUCE_FMUL: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Results.push_back(TLI.expandVecReduce(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: @@ -958,7 +980,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { return; } - Results.push_back(DAG.UnrollVectorOp(Node)); + SDValue Unrolled = DAG.UnrollVectorOp(Node); + for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I) + Results.push_back(Unrolled.getValue(I)); } SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) { @@ -1304,11 +1328,11 @@ SDValue VectorLegalizer::ExpandVP_SELECT(SDNode *Node) { return DAG.UnrollVectorOp(Node); SDValue Ones = DAG.getAllOnesConstant(DL, VT); - SDValue NotMask = DAG.getNode(ISD::VP_XOR, DL, VT, Mask, Ones, Mask, EVL); + SDValue NotMask = DAG.getNode(ISD::VP_XOR, DL, VT, Mask, Ones, Ones, EVL); - Op1 = DAG.getNode(ISD::VP_AND, DL, VT, Op1, Mask, Mask, EVL); - Op2 = DAG.getNode(ISD::VP_AND, DL, VT, Op2, NotMask, Mask, EVL); - return DAG.getNode(ISD::VP_OR, DL, VT, Op1, Op2, Mask, EVL); + Op1 = DAG.getNode(ISD::VP_AND, DL, VT, Op1, Mask, Ones, EVL); + Op2 = DAG.getNode(ISD::VP_AND, DL, VT, Op2, NotMask, Ones, EVL); + return DAG.getNode(ISD::VP_OR, DL, VT, Op1, Op2, Ones, EVL); } SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) { @@ -1516,39 +1540,54 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node, SmallVectorImpl<SDValue> &Results) { bool NeedInvert = false; bool IsVP = Node->getOpcode() == ISD::VP_SETCC; - SDLoc dl(Node); - MVT OpVT = Node->getOperand(0).getSimpleValueType(); - ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get(); + bool IsStrict = Node->getOpcode() == ISD::STRICT_FSETCC || + Node->getOpcode() == ISD::STRICT_FSETCCS; + bool IsSignaling = Node->getOpcode() == ISD::STRICT_FSETCCS; + unsigned Offset = IsStrict ? 1 : 0; + + SDValue Chain = IsStrict ? 
Node->getOperand(0) : SDValue(); + SDValue LHS = Node->getOperand(0 + Offset); + SDValue RHS = Node->getOperand(1 + Offset); + SDValue CC = Node->getOperand(2 + Offset); + + MVT OpVT = LHS.getSimpleValueType(); + ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get(); if (TLI.getCondCodeAction(CCCode, OpVT) != TargetLowering::Expand) { + if (IsStrict) { + UnrollStrictFPOp(Node, Results); + return; + } Results.push_back(UnrollVSETCC(Node)); return; } - SDValue Chain; - SDValue LHS = Node->getOperand(0); - SDValue RHS = Node->getOperand(1); - SDValue CC = Node->getOperand(2); SDValue Mask, EVL; if (IsVP) { - Mask = Node->getOperand(3); - EVL = Node->getOperand(4); + Mask = Node->getOperand(3 + Offset); + EVL = Node->getOperand(4 + Offset); } + SDLoc dl(Node); bool Legalized = TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), LHS, RHS, CC, Mask, - EVL, NeedInvert, dl, Chain); + EVL, NeedInvert, dl, Chain, IsSignaling); if (Legalized) { // If we expanded the SETCC by swapping LHS and RHS, or by inverting the // condition code, create a new SETCC node. if (CC.getNode()) { - if (!IsVP) - LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC, - Node->getFlags()); - else + if (IsStrict) { + LHS = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(), + {Chain, LHS, RHS, CC}, Node->getFlags()); + Chain = LHS.getValue(1); + } else if (IsVP) { LHS = DAG.getNode(ISD::VP_SETCC, dl, Node->getValueType(0), {LHS, RHS, CC, Mask, EVL}, Node->getFlags()); + } else { + LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC, + Node->getFlags()); + } } // If we expanded the SETCC by inverting the condition code, then wrap @@ -1560,6 +1599,8 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node, LHS = DAG.getVPLogicalNOT(dl, LHS, Mask, EVL, LHS->getValueType(0)); } } else { + assert(!IsStrict && "Don't know how to expand for strict nodes."); + // Otherwise, SETCC for the given comparison type must be completely // illegal; expand it into a SELECT_CC. 
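The ExpandVP_SELECT fix a little above now passes the all-ones constant as the mask operand of the VP logic ops it emits. The per-lane identity it relies on is select(m, a, b) == (a & m) | (b & ~m) whenever m is all zeros or all ones (standalone C++, not LLVM code):

#include <cassert>
#include <cstdint>

static uint32_t selectByMask(uint32_t M, uint32_t A, uint32_t B) {
  uint32_t NotM = M ^ ~0u;        // the VP_XOR against the all-ones constant
  return (A & M) | (B & NotM);
}

int main() {
  assert(selectByMask(~0u, 7, 9) == 7); // true lane picks Op1
  assert(selectByMask(0u, 7, 9) == 9);  // false lane picks Op2
}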
EVT VT = Node->getValueType(0); @@ -1571,6 +1612,8 @@ void VectorLegalizer::ExpandSETCC(SDNode *Node, } Results.push_back(LHS); + if (IsStrict) + Results.push_back(Chain); } void VectorLegalizer::ExpandUADDSUBO(SDNode *Node, @@ -1618,6 +1661,12 @@ void VectorLegalizer::ExpandStrictFPOp(SDNode *Node, return; } + if (Node->getOpcode() == ISD::STRICT_FSETCC || + Node->getOpcode() == ISD::STRICT_FSETCCS) { + ExpandSETCC(Node, Results); + return; + } + UnrollStrictFPOp(Node, Results); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index af5ea1ce5f45..8c117c1c74dc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -57,7 +57,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; - case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break; + case ISD::FPOWI: R = ScalarizeVecRes_ExpOp(N); break; case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break; case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break; case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break; @@ -113,7 +113,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FCANONICALIZE: R = ScalarizeVecRes_UnaryOp(N); break; - + case ISD::FFREXP: + R = ScalarizeVecRes_FFREXP(N, ResNo); + break; case ISD::ADD: case ISD::AND: case ISD::FADD: @@ -126,6 +128,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::FLDEXP: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -142,6 +145,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FREM: case ISD::FSUB: case ISD::MUL: + case ISD::MULHS: + case ISD::MULHU: case ISD::OR: case ISD::SDIV: case ISD::SREM: @@ -221,6 +226,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) { Op2, N->getFlags()); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_FFREXP(SDNode *N, unsigned ResNo) { + assert(N->getValueType(0).getVectorNumElements() == 1 && + "Unexpected vector type!"); + SDValue Elt = GetScalarizedVector(N->getOperand(0)); + + EVT VT0 = N->getValueType(0); + EVT VT1 = N->getValueType(1); + SDLoc dl(N); + + SDNode *ScalarNode = + DAG.getNode(N->getOpcode(), dl, + {VT0.getScalarType(), VT1.getScalarType()}, Elt) + .getNode(); + + // Replace the other vector result not being explicitly scalarized here. 
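ScalarizeVecRes_FFREXP, continued below, has to keep two coupled results alive: the fraction and the integer exponent. The contract it models is libm's frexp, shown here as a runnable reference (plain C++):

#include <cassert>
#include <cmath>

int main() {
  int Exp = 0;
  double Frac = std::frexp(8.0, &Exp);  // 8.0 == 0.5 * 2^4
  assert(Frac == 0.5 && Exp == 4);
  Frac = std::frexp(-0.75, &Exp);       // magnitude already in [0.5, 1)
  assert(Frac == -0.75 && Exp == 0);
}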
+ unsigned OtherNo = 1 - ResNo; + EVT OtherVT = N->getValueType(OtherNo); + if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) { + SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo)); + } else { + SDValue OtherVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, OtherVT, + SDValue(ScalarNode, OtherNo)); + ReplaceValueWith(SDValue(N, OtherNo), OtherVal); + } + + return SDValue(ScalarNode, ResNo); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) { EVT VT = N->getValueType(0).getVectorElementType(); unsigned NumOpers = N->getNumOperands(); @@ -348,10 +381,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) { N->getOperand(1)); } -SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) { +SDValue DAGTypeLegalizer::ScalarizeVecRes_ExpOp(SDNode *N) { SDValue Op = GetScalarizedVector(N->getOperand(0)); - return DAG.getNode(ISD::FPOWI, SDLoc(N), - Op.getValueType(), Op, N->getOperand(1)); + return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op, + N->getOperand(1)); } SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) { @@ -695,6 +728,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Res = ScalarizeVecOp_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: @@ -948,6 +983,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; + case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break; case ISD::VSELECT: case ISD::SELECT: case ISD::VP_MERGE: @@ -959,8 +995,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break; case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break; case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break; - case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break; - case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break; + case ISD::FPOWI: + case ISD::FLDEXP: + case ISD::FCOPYSIGN: SplitVecRes_FPOp_MultiType(N, Lo, Hi); break; case ISD::IS_FPCLASS: SplitVecRes_IS_FPCLASS(N, Lo, Hi); break; case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break; case ISD::SPLAT_VECTOR: @@ -1000,6 +1037,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VECTOR_SPLICE: SplitVecRes_VECTOR_SPLICE(N, Lo, Hi); break; + case ISD::VECTOR_DEINTERLEAVE: + SplitVecRes_VECTOR_DEINTERLEAVE(N); + return; + case ISD::VECTOR_INTERLEAVE: + SplitVecRes_VECTOR_INTERLEAVE(N); + return; case ISD::VAARG: SplitVecRes_VAARG(N, Lo, Hi); break; @@ -1069,6 +1112,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FCANONICALIZE: SplitVecRes_UnaryOp(N, Lo, Hi); break; + case ISD::FFREXP: + SplitVecRes_FFREXP(N, ResNo, Lo, Hi); + break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -1456,16 +1502,11 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MPI, SmallestAlign); } -void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, - SDValue &Hi) { - SDLoc dl(N); - GetSplitVector(N->getOperand(0), Lo, Hi); - Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1)); - Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, 
N->getOperand(1)); -} - -void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, - SDValue &Hi) { +// Handle splitting an FP where the second operand does not match the first +// type. The second operand may be a scalar, or a vector that has exactly as +// many elements as the first +void DAGTypeLegalizer::SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, + SDValue &Hi) { SDValue LHSLo, LHSHi; GetSplitVector(N->getOperand(0), LHSLo, LHSHi); SDLoc DL(N); @@ -1473,14 +1514,18 @@ void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue RHSLo, RHSHi; SDValue RHS = N->getOperand(1); EVT RHSVT = RHS.getValueType(); - if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) - GetSplitVector(RHS, RHSLo, RHSHi); - else - std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); - + if (RHSVT.isVector()) { + if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector) + GetSplitVector(RHS, RHSLo, RHSHi); + else + std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS)); - Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo); - Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi); + Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHSLo); + Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHSHi); + } else { + Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHS); + Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHS); + } } void DAGTypeLegalizer::SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, @@ -2284,6 +2329,42 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, Hi = DAG.getNode(Opcode, dl, HiVT, {Hi, MaskHi, EVLHi}, Flags); } +void DAGTypeLegalizer::SplitVecRes_FFREXP(SDNode *N, unsigned ResNo, + SDValue &Lo, SDValue &Hi) { + SDLoc dl(N); + auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0)); + auto [LoVT1, HiVT1] = DAG.GetSplitDestVTs(N->getValueType(1)); + + // If the input also splits, handle it directly for a compile time speedup. + // Otherwise split it by hand. + EVT InVT = N->getOperand(0).getValueType(); + if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) + GetSplitVector(N->getOperand(0), Lo, Hi); + else + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + + Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo); + Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi); + Lo->setFlags(N->getFlags()); + Hi->setFlags(N->getFlags()); + + SDNode *HiNode = Hi.getNode(); + SDNode *LoNode = Lo.getNode(); + + // Replace the other vector result not being explicitly split here. 
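SplitVecRes_FPOp_MultiType above allows the second operand to be a scalar that is simply reused by both halves. Lane-wise that is trivially sound; a check with ldexp and a shared scalar exponent (standalone C++, not LLVM code):

#include <cassert>
#include <cmath>

int main() {
  const double In[4] = {1.0, -2.0, 0.5, 3.0};
  const int Exp = 3;                 // scalar second operand, shared
  double Whole[4], Halves[4];
  for (int I = 0; I < 4; ++I)        // unsplit reference
    Whole[I] = std::ldexp(In[I], Exp);
  for (int I = 0; I < 2; ++I)        // "Lo" half
    Halves[I] = std::ldexp(In[I], Exp);
  for (int I = 2; I < 4; ++I)        // "Hi" half
    Halves[I] = std::ldexp(In[I], Exp);
  for (int I = 0; I < 4; ++I)
    assert(Whole[I] == Halves[I]);
}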
+ unsigned OtherNo = 1 - ResNo; + EVT OtherVT = N->getValueType(OtherNo); + if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) { + SetSplitVector(SDValue(N, OtherNo), SDValue(LoNode, OtherNo), + SDValue(HiNode, OtherNo)); + } else { + SDValue OtherVal = + DAG.getNode(ISD::CONCAT_VECTORS, dl, OtherVT, SDValue(LoNode, OtherNo), + SDValue(HiNode, OtherNo)); + ReplaceValueWith(SDValue(N, OtherNo), OtherVal); + } +} + void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi) { SDLoc dl(N); @@ -2377,7 +2458,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, EVT EltVT = NewVT.getVectorElementType(); SmallVector<SDValue> Ops(NewElts, DAG.getUNDEF(EltVT)); for (unsigned I = 0; I < NewElts; ++I) { - if (Mask[I] == UndefMaskElem) + if (Mask[I] == PoisonMaskElem) continue; unsigned Idx = Mask[I]; if (Idx >= NewElts) @@ -2417,11 +2498,11 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, // Use shuffles operands instead of shuffles themselves. // 1. Adjust mask. for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (Inputs[SrcRegIdx].isUndef()) { - Idx = UndefMaskElem; + Idx = PoisonMaskElem; continue; } auto *Shuffle = @@ -2429,8 +2510,8 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, if (!Shuffle || !is_contained(P.second, SrcRegIdx)) continue; int MaskElt = Shuffle->getMaskElt(Idx % NewElts); - if (MaskElt == UndefMaskElem) { - Idx = UndefMaskElem; + if (MaskElt == PoisonMaskElem) { + Idx = PoisonMaskElem; continue; } Idx = MaskElt % NewElts + @@ -2449,11 +2530,11 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, // Check if any concat_vectors can be simplified. SmallBitVector UsedSubVector(2 * std::size(Inputs)); for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (Inputs[SrcRegIdx].isUndef()) { - Idx = UndefMaskElem; + Idx = PoisonMaskElem; continue; } TargetLowering::LegalizeTypeAction TypeAction = @@ -2483,7 +2564,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, if (!Pairs.empty() && Pairs.front().size() > 1) { // Adjust mask. for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; auto *It = find_if( @@ -2525,14 +2606,14 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, !Shuffle->getOperand(1).isUndef()) { // Find the only used operand, if possible. for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (SrcRegIdx != I) continue; int MaskElt = Shuffle->getMaskElt(Idx % NewElts); - if (MaskElt == UndefMaskElem) { - Idx = UndefMaskElem; + if (MaskElt == PoisonMaskElem) { + Idx = PoisonMaskElem; continue; } int OpIdx = MaskElt / NewElts; @@ -2558,14 +2639,14 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, // Found that operand is used already. // 1. Fix the mask for the reused operand. 
for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (SrcRegIdx != I) continue; int MaskElt = Shuffle->getMaskElt(Idx % NewElts); - if (MaskElt == UndefMaskElem) { - Idx = UndefMaskElem; + if (MaskElt == PoisonMaskElem) { + Idx = PoisonMaskElem; continue; } int MaskIdx = MaskElt / NewElts; @@ -2582,7 +2663,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, Inputs[I] = Shuffle->getOperand(Op); // Adjust mask. for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (SrcRegIdx != I) @@ -2616,11 +2697,11 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, auto &&UniqueConstantVec = UniqueConstantInputs.takeVector(); unsigned ConstNum = UniqueConstantVec.size(); for (int &Idx : Mask) { - if (Idx == UndefMaskElem) + if (Idx == PoisonMaskElem) continue; unsigned SrcRegIdx = Idx / NewElts; if (Inputs[SrcRegIdx].isUndef()) { - Idx = UndefMaskElem; + Idx = PoisonMaskElem; continue; } const auto It = find(UniqueConstantVec, Inputs[SrcRegIdx]); @@ -2649,7 +2730,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, // Build a shuffle mask for the output, discovering on the fly which // input vectors to use as shuffle operands. unsigned FirstMaskIdx = High * NewElts; - SmallVector<int> Mask(NewElts * std::size(Inputs), UndefMaskElem); + SmallVector<int> Mask(NewElts * std::size(Inputs), PoisonMaskElem); copy(ArrayRef(OrigMask).slice(FirstMaskIdx, NewElts), Mask.begin()); assert(!Output && "Expected default initialized initial value."); TryPeekThroughShufflesInputs(Mask); @@ -2768,6 +2849,37 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL)); } +void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) { + + SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi; + GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); + GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); + EVT VT = Op0Lo.getValueType(); + SDLoc DL(N); + SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, + DAG.getVTList(VT, VT), Op0Lo, Op0Hi); + SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, + DAG.getVTList(VT, VT), Op1Lo, Op1Hi); + + SetSplitVector(SDValue(N, 0), ResLo.getValue(0), ResHi.getValue(0)); + SetSplitVector(SDValue(N, 1), ResLo.getValue(1), ResHi.getValue(1)); +} + +void DAGTypeLegalizer::SplitVecRes_VECTOR_INTERLEAVE(SDNode *N) { + SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi; + GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi); + GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi); + EVT VT = Op0Lo.getValueType(); + SDLoc DL(N); + SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, + DAG.getVTList(VT, VT), Op0Lo, Op1Lo), + DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, + DAG.getVTList(VT, VT), Op0Hi, Op1Hi)}; + + SetSplitVector(SDValue(N, 0), Res[0].getValue(0), Res[0].getValue(1)); + SetSplitVector(SDValue(N, 1), Res[1].getValue(0), Res[1].getValue(1)); +} + //===----------------------------------------------------------------------===// // Operand Vector Splitting //===----------------------------------------------------------------------===// @@ -2808,7 +2920,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FP_ROUND: case ISD::VP_FP_ROUND: case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break; - case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break; + case ISD::FCOPYSIGN: Res = 
SplitVecOp_FPOpDifferentTypes(N); break; case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; @@ -2862,6 +2974,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::FTRUNC: Res = SplitVecOp_UnaryOp(N); break; + case ISD::FLDEXP: + Res = SplitVecOp_FPOpDifferentTypes(N); + break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: @@ -2882,6 +2997,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Res = SplitVecOp_VECREDUCE(N, OpNo); break; case ISD::VECREDUCE_SEQ_FADD: @@ -3807,10 +3924,12 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } -SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { - // The result (and the first input) has a legal vector type, but the second - // input needs splitting. - +// Split a vector type in an FP binary operation where the second operand has a +// different type from the first. +// +// The result (and the first input) has a legal vector type, but the second +// input needs splitting. +SDValue DAGTypeLegalizer::SplitVecOp_FPOpDifferentTypes(SDNode *N) { SDLoc DL(N); EVT LHSLoVT, LHSHiVT; @@ -3826,8 +3945,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) { SDValue RHSLo, RHSHi; std::tie(RHSLo, RHSHi) = DAG.SplitVector(N->getOperand(1), DL); - SDValue Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLoVT, LHSLo, RHSLo); - SDValue Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHiVT, LHSHi, RHSHi); + SDValue Lo = DAG.getNode(N->getOpcode(), DL, LHSLoVT, LHSLo, RHSLo); + SDValue Hi = DAG.getNode(N->getOpcode(), DL, LHSHiVT, LHSHi, RHSHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Lo, Hi); } @@ -3885,9 +4004,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to widen the result of this operator!"); + report_fatal_error("Do not know how to widen the result of this operator!"); case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break; + case ISD::AssertZext: Res = WidenVecRes_AssertZext(N); break; case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break; @@ -4036,8 +4156,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { Res = WidenVecRes_IS_FPCLASS(N); break; + case ISD::FLDEXP: case ISD::FPOWI: - Res = WidenVecRes_POWI(N); + if (!unrollExpandedOp()) + Res = WidenVecRes_ExpOp(N); break; case ISD::ANY_EXTEND_VECTOR_INREG: @@ -4394,10 +4516,18 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 1; i < NumOpers; ++i) { SDValue Oper = N->getOperand(i); - if (Oper.getValueType().isVector()) { - assert(Oper.getValueType() == N->getValueType(0) && - "Invalid operand type to widen!"); - Oper = GetWidenedVector(Oper); + EVT OpVT = Oper.getValueType(); + if (OpVT.isVector()) { + if (getTypeAction(OpVT) == TargetLowering::TypeWidenVector) + Oper = GetWidenedVector(Oper); + else { + EVT WideOpVT = + EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(), + WidenVT.getVectorElementCount()); + Oper = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + DAG.getUNDEF(WideOpVT), Oper, + DAG.getVectorIdxConstant(0, dl)); + } } 
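The SplitVecRes_VECTOR_DEINTERLEAVE and SplitVecRes_VECTOR_INTERLEAVE handlers added a little further up split each operand in half, emit two half-width interleaves, and stitch the pieces back together per result number. The algebraic identity they rely on can be checked standalone; the following is an illustrative C++ sketch only (interleave and concat are hypothetical helpers mirroring ISD::VECTOR_INTERLEAVE's two-result shape, not LLVM APIs):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // interleave(A, B) produces the sequence A0,B0,A1,B1,... delivered as
    // two result vectors of the same length as A and B (hypothetical helper
    // mirroring ISD::VECTOR_INTERLEAVE's two results).
    static std::pair<std::vector<int>, std::vector<int>>
    interleave(const std::vector<int> &A, const std::vector<int> &B) {
      std::vector<int> Flat;
      for (std::size_t I = 0; I < A.size(); ++I) {
        Flat.push_back(A[I]);
        Flat.push_back(B[I]);
      }
      return {std::vector<int>(Flat.begin(), Flat.begin() + A.size()),
              std::vector<int>(Flat.begin() + A.size(), Flat.end())};
    }

    static std::vector<int> concat(std::vector<int> X, const std::vector<int> &Y) {
      X.insert(X.end(), Y.begin(), Y.end());
      return X;
    }

    int main() {
      std::vector<int> A = {0, 1, 2, 3}, B = {10, 11, 12, 13};
      auto Whole = interleave(A, B);
      // Split both operands in half, as GetSplitVector does.
      std::vector<int> ALo(A.begin(), A.begin() + 2), AHi(A.begin() + 2, A.end());
      std::vector<int> BLo(B.begin(), B.begin() + 2), BHi(B.begin() + 2, B.end());
      auto Lo = interleave(ALo, BLo); // becomes the halves of result 0
      auto Hi = interleave(AHi, BHi); // becomes the halves of result 1
      assert(Whole.first == concat(Lo.first, Lo.second));
      assert(Whole.second == concat(Hi.first, Hi.second));
      std::puts("interleave split identity holds");
    }

The same per-half reasoning justifies the deinterleave case: the even and odd lanes of each operand half can be separated independently and then concatenated.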
InOps.push_back(Oper); @@ -4415,9 +4545,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 0; i < NumOpers; ++i) { SDValue Op = InOps[i]; - if (Op.getValueType().isVector()) - Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op, + EVT OpVT = Op.getValueType(); + if (OpVT.isVector()) { + EVT OpExtractVT = + EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(), + VT.getVectorElementCount()); + Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpExtractVT, Op, DAG.getVectorIdxConstant(Idx, dl)); + } EOps.push_back(Op); } @@ -4441,8 +4576,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { for (unsigned i = 0; i < NumOpers; ++i) { SDValue Op = InOps[i]; - if (Op.getValueType().isVector()) - Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op, + EVT OpVT = Op.getValueType(); + if (OpVT.isVector()) + Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + OpVT.getVectorElementType(), Op, DAG.getVectorIdxConstant(Idx, dl)); EOps.push_back(Op); @@ -4751,11 +4888,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_IS_FPCLASS(SDNode *N) { N->getFlags()); } -SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_ExpOp(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); - SDValue ShOp = N->getOperand(1); - return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp); + SDValue RHS = N->getOperand(1); + SDValue ExpOp = RHS.getValueType().isVector() ? GetWidenedVector(RHS) : RHS; + + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ExpOp); } SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { @@ -4763,7 +4902,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue InOp = GetWidenedVector(N->getOperand(0)); if (N->getNumOperands() == 1) - return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp); + return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, N->getFlags()); assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); assert(N->isVPOpcode() && "Expected VP opcode"); @@ -4863,7 +5002,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenSize / InEltVT.getSizeInBits()); } else { - NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumParts); + // For big endian systems, using the promoted input scalar type + // to produce the scalar_to_vector would put the desired bits into + // the least significant byte(s) of the wider element zero. This + // will mean that the users of the result vector are using incorrect + // bits. Use the original input type instead. Although either input + // type can be used on little endian systems, for consistency we + // use the original type there as well. 
+ EVT OrigInVT = N->getOperand(0).getValueType(); + NewNumParts = WidenSize / OrigInVT.getSizeInBits(); + NewInVT = EVT::getVectorVT(*DAG.getContext(), OrigInVT, NewNumParts); } if (TLI.isTypeLegal(NewInVT)) { @@ -5080,6 +5228,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) { return DAG.getBuildVector(WidenVT, dl, Ops); } +SDValue DAGTypeLegalizer::WidenVecRes_AssertZext(SDNode *N) { + SDValue InOp = ModifyToType( + N->getOperand(0), + TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), true); + return DAG.getNode(ISD::AssertZext, SDLoc(N), InOp.getValueType(), InOp, + N->getOperand(1)); +} + SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { SDValue InOp = GetWidenedVector(N->getOperand(0)); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), @@ -5105,30 +5261,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { return SDValue(); } - SDValue Result; - SmallVector<SDValue, 16> LdChain; // Chain for the series of load - if (ExtType != ISD::NON_EXTLOAD) - Result = GenWidenVectorExtLoads(LdChain, LD, ExtType); - else - Result = GenWidenVectorLoads(LdChain, LD); - - if (Result) { - // If we generate a single load, we can use that for the chain. Otherwise, - // build a factor node to remember the multiple loads are independent and - // chain to that. - SDValue NewChain; - if (LdChain.size() == 1) - NewChain = LdChain[0]; - else - NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain); - - // Modified the chain - switch anything that used the old chain to use - // the new one. - ReplaceValueWith(SDValue(N, 1), NewChain); - - return Result; - } - // Generate a vector-predicated load if it is custom/legal on the target. To // avoid possible recursion, only do this if the widened mask type is legal. // FIXME: Not all targets may support EVL in VP_LOAD. These will have been @@ -5138,15 +5270,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT); EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideVT.getVectorElementCount()); - if (ExtType == ISD::NON_EXTLOAD && WideVT.isScalableVector() && + if (ExtType == ISD::NON_EXTLOAD && TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) && TLI.isTypeLegal(WideMaskVT)) { SDLoc DL(N); SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT); - MVT EVLVT = TLI.getVPExplicitVectorLengthTy(); - unsigned NumVTElts = LdVT.getVectorMinNumElements(); - SDValue EVL = - DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts)); + SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(), + LdVT.getVectorElementCount()); const auto *MMO = LD->getMemOperand(); SDValue NewLoad = DAG.getLoadVP(WideVT, DL, LD->getChain(), LD->getBasePtr(), Mask, EVL, @@ -5160,6 +5290,30 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { return NewLoad; } + SDValue Result; + SmallVector<SDValue, 16> LdChain; // Chain for the series of load + if (ExtType != ISD::NON_EXTLOAD) + Result = GenWidenVectorExtLoads(LdChain, LD, ExtType); + else + Result = GenWidenVectorLoads(LdChain, LD); + + if (Result) { + // If we generate a single load, we can use that for the chain. Otherwise, + // build a factor node to remember the multiple loads are independent and + // chain to that. 
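The endianness comment in the WidenVecRes_BITCAST hunk just above is easiest to see with concrete bytes. Below is a standalone simulation, independent of any target code; bytesOf is a hypothetical helper that simply spells out both byte orders:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Spell out the byte layout of a W-byte integer under either byte order
    // (bytesOf is a hypothetical helper for this illustration).
    static std::vector<uint8_t> bytesOf(uint64_t V, int W, bool BigEndian) {
      std::vector<uint8_t> B(W);
      for (int I = 0; I < W; ++I) {
        int Shift = BigEndian ? (W - 1 - I) : I;
        B[I] = uint8_t(V >> (8 * Shift));
      }
      return B;
    }

    int main() {
      uint16_t In = 0xABCD; // the original i16 input of the BITCAST
      for (bool BE : {false, true}) {
        auto Orig = bytesOf(In, 2, BE);               // scalar_to_vector of i16
        auto Promoted = bytesOf(uint32_t(In), 4, BE); // of the promoted i32
        std::printf("%s endian: orig leads with %02x%02x, promoted with %02x%02x\n",
                    BE ? "big" : "little", unsigned(Orig[0]), unsigned(Orig[1]),
                    unsigned(Promoted[0]), unsigned(Promoted[1]));
      }
      // Little endian: both layouts lead with CD AB, so either element type
      // works. Big endian: the promoted element leads with 00 00 (the zero
      // extension), so a narrow-element view of element zero reads the wrong
      // bytes; hence the hunk above uses the original input type.
    }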
+ SDValue NewChain; + if (LdChain.size() == 1) + NewChain = LdChain[0]; + else + NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain); + + // Modified the chain - switch anything that used the old chain to use + // the new one. + ReplaceValueWith(SDValue(N, 1), NewChain); + + return Result; + } + report_fatal_error("Unable to widen vector load"); } @@ -5780,7 +5934,7 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to widen this operator's operand!"); + report_fatal_error("Do not know how to widen this operator's operand!"); case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break; case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break; @@ -5800,7 +5954,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break; case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break; - case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; + case ISD::FLDEXP: + case ISD::FCOPYSIGN: Res = WidenVecOp_UnrollVectorOp(N); break; case ISD::IS_FPCLASS: Res = WidenVecOp_IS_FPCLASS(N); break; case ISD::ANY_EXTEND: @@ -5843,6 +5998,8 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: Res = WidenVecOp_VECREDUCE(N); break; case ISD::VECREDUCE_SEQ_FADD: @@ -5947,7 +6104,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) { } } -SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecOp_UnrollVectorOp(SDNode *N) { // The result (and first input) is legal, but the second input is illegal. // We can't do much to fix that, so just unroll and let the extracts off of // the second input be widened as needed later. @@ -6192,14 +6349,6 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { if (ST->isTruncatingStore()) return TLI.scalarizeVectorStore(ST, DAG); - SmallVector<SDValue, 16> StChain; - if (GenWidenVectorStores(StChain, ST)) { - if (StChain.size() == 1) - return StChain[0]; - - return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); - } - // Generate a vector-predicated store if it is custom/legal on the target. // To avoid possible recursion, only do this if the widened mask type is // legal. @@ -6211,23 +6360,29 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT); EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideVT.getVectorElementCount()); - if (WideVT.isScalableVector() && - TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) && + + if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) && TLI.isTypeLegal(WideMaskVT)) { // Widen the value. 
SDLoc DL(N); StVal = GetWidenedVector(StVal); SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT); - MVT EVLVT = TLI.getVPExplicitVectorLengthTy(); - unsigned NumVTElts = StVT.getVectorMinNumElements(); - SDValue EVL = - DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts)); + SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(), + StVT.getVectorElementCount()); return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), DAG.getUNDEF(ST->getBasePtr().getValueType()), Mask, - EVL, StVal.getValueType(), ST->getMemOperand(), + EVL, StVT, ST->getMemOperand(), ST->getAddressingMode()); } + SmallVector<SDValue, 16> StChain; + if (GenWidenVectorStores(StChain, ST)) { + if (StChain.size() == 1) + return StChain[0]; + + return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain); + } + report_fatal_error("Unable to widen vector store"); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index 9fcf692babdc..c31b971e7fc3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -230,7 +230,7 @@ public: bool isEmitted() const { return Emitted; } /// clearIsEmitted - Reset Emitted flag, for certain special cases where - /// dbg.addr is emitted twice. + /// SDDbgValue is emitted twice. DBG_INSTR_REF depends on this behaviour. void clearIsEmitted() { Emitted = false; } LLVM_DUMP_METHOD void dump() const; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 2d93adea6b9b..5b01743d23e0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -69,7 +69,7 @@ private: /// LiveRegDefs - A set of physical registers and their definition /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. - unsigned NumLiveRegs; + unsigned NumLiveRegs = 0u; std::vector<SUnit*> LiveRegDefs; std::vector<unsigned> LiveRegCycles; diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index c252046ef10b..458f50c54824 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" @@ -45,7 +46,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> @@ -156,16 +156,16 @@ private: unsigned CurCycle = 0; /// MinAvailableCycle - Cycle of the soonest available instruction. - unsigned MinAvailableCycle; + unsigned MinAvailableCycle = ~0u; /// IssueCount - Count instructions issued in this cycle /// Currently valid only for bottom-up scheduling. - unsigned IssueCount; + unsigned IssueCount = 0u; /// LiveRegDefs - A set of physical registers and their definition /// that are "live". These nodes must be scheduled before any other nodes that /// modifies the registers can be scheduled. 
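Stepping back to the WidenVecOp_STORE hunk above (and its WidenVecRes_LOAD counterpart): the VP path is now tried before the generic widening fallback, the scalable-only restriction is gone, and the explicit vector length comes from getElementCount on the original type's element count, so only the original lanes touch memory even though the value operand was widened. A rough standalone sketch of those semantics; ElementCountSim, evlFor, and vpStore are made-up names for illustration, not the LLVM API:

    #include <cassert>
    #include <cstdio>

    // Hypothetical stand-ins: evlFor mirrors getElementCount's meaning for
    // the explicit vector length (EVL), and vpStore writes exactly EVL
    // lanes, like a VP_STORE with an all-ones mask.
    struct ElementCountSim {
      unsigned KnownMin;
      bool Scalable;
    };

    static unsigned evlFor(ElementCountSim EC, unsigned VScale) {
      return EC.Scalable ? VScale * EC.KnownMin : EC.KnownMin;
    }

    static void vpStore(int *Mem, const int *WideVal, unsigned EVL) {
      for (unsigned I = 0; I < EVL; ++I)
        Mem[I] = WideVal[I];
    }

    int main() {
      assert(evlFor({4, /*Scalable=*/false}, /*VScale=*/2) == 4);
      assert(evlFor({4, /*Scalable=*/true}, /*VScale=*/2) == 8);
      int Mem[4] = {-1, -1, -1, -1};
      int Wide[4] = {7, 8, 9, 42}; // lane 3 is widening padding
      vpStore(Mem, Wide, /*EVL=*/3); // store the original 3 lanes only
      assert(Mem[3] == -1);          // the padding lane never hit memory
      std::puts("EVL-limited store leaves trailing memory untouched");
    }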
- unsigned NumLiveRegs; + unsigned NumLiveRegs = 0u; std::unique_ptr<SUnit*[]> LiveRegDefs; std::unique_ptr<SUnit*[]> LiveRegGens; @@ -1744,12 +1744,12 @@ protected: bool SrcOrder; // SUnits - The SUnits for the current graph. - std::vector<SUnit> *SUnits; + std::vector<SUnit> *SUnits = nullptr; MachineFunction &MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - const TargetLowering *TLI; + const TargetInstrInfo *TII = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const TargetLowering *TLI = nullptr; ScheduleDAGRRList *scheduleDAG = nullptr; // SethiUllmanNumbers - The SethiUllman number for each node. diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 2e1fd1e8a758..0579c1664d5c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -667,7 +667,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use, // This copy is a liveout value. It is likely coalesced, so reduce the // latency so not to penalize the def. // FIXME: need target specific adjustment here? - Latency = (Latency > 1) ? Latency - 1 : 1; + Latency = Latency - 1; } if (Latency >= 0) dep.setLatency(Latency); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 99bbaeb19182..439ccfdc3275 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -16,10 +16,10 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/MachineValueType.h" #include <cassert> #include <string> #include <vector> diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9a3609bc183b..5c1b19eba1c1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -17,11 +17,11 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" @@ -35,6 +35,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -61,12 +62,12 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -200,10 +201,10 @@ bool ISD::isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly) { SDValue NotZero = N->getOperand(i); unsigned EltSize = 
N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) { - if (CN->getAPIntValue().countTrailingOnes() < EltSize) + if (CN->getAPIntValue().countr_one() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) { - if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize) + if (CFPN->getValueAPF().bitcastToAPInt().countr_one() < EltSize) return false; } else return false; @@ -244,10 +245,10 @@ bool ISD::isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly) { // constants are. unsigned EltSize = N->getValueType(0).getScalarSizeInBits(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) { - if (CN->getAPIntValue().countTrailingZeros() < EltSize) + if (CN->getAPIntValue().countr_zero() < EltSize) return false; } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) { - if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize) + if (CFPN->getValueAPF().bitcastToAPInt().countr_zero() < EltSize) return false; } else return false; @@ -454,6 +455,10 @@ ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) { case ISD::VECREDUCE_FMIN: case ISD::VP_REDUCE_FMIN: return ISD::FMINNUM; + case ISD::VECREDUCE_FMAXIMUM: + return ISD::FMAXIMUM; + case ISD::VECREDUCE_FMINIMUM: + return ISD::FMINIMUM; } } @@ -516,6 +521,31 @@ std::optional<unsigned> ISD::getVPExplicitVectorLengthIdx(unsigned Opcode) { } } +std::optional<unsigned> ISD::getBaseOpcodeForVP(unsigned VPOpcode, + bool hasFPExcept) { + // FIXME: Return strict opcodes in case of fp exceptions. + switch (VPOpcode) { + default: + return std::nullopt; +#define BEGIN_REGISTER_VP_SDNODE(VPOPC, ...) case ISD::VPOPC: +#define VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) return ISD::SDOPC; +#define END_REGISTER_VP_SDNODE(VPOPC) break; +#include "llvm/IR/VPIntrinsics.def" + } + return std::nullopt; +} + +unsigned ISD::getVPForBaseOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("can not translate this Opcode to VP."); +#define BEGIN_REGISTER_VP_SDNODE(VPOPC, ...) break; +#define VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) case ISD::SDOPC: +#define END_REGISTER_VP_SDNODE(VPOPC) return ISD::VPOPC; +#include "llvm/IR/VPIntrinsics.def" + } +} + ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) { switch (ExtType) { case ISD::EXTLOAD: @@ -866,12 +896,6 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(AT->getMemOperand()->getFlags()); break; } - case ISD::PREFETCH: { - const MemSDNode *PF = cast<MemSDNode>(N); - ID.AddInteger(PF->getPointerInfo().getAddrSpace()); - ID.AddInteger(PF->getMemOperand()->getFlags()); - break; - } case ISD::VECTOR_SHUFFLE: { const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements(); @@ -890,14 +914,20 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { case ISD::AssertAlign: ID.AddInteger(cast<AssertAlignSDNode>(N)->getAlign().value()); break; + case ISD::PREFETCH: + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + // Handled by MemIntrinsicSDNode check after the switch. + break; } // end switch (N->getOpcode()) - // Target specific memory nodes could also have address spaces and flags + // MemIntrinsic nodes could also have subclass data, address spaces, and flags // to check. 
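The new ISD::getBaseOpcodeForVP and ISD::getVPForBaseOpcode above derive both directions of the opcode mapping by re-expanding VPIntrinsics.def with different macro bodies. Here is a toy version of that X-macro technique, with invented opcodes and a single list macro standing in for the .def file (the real code redefines BEGIN_REGISTER_VP_SDNODE and friends instead):

    #include <cstdio>
    #include <optional>

    enum Opcode { OP_ADD, OP_MUL, VP_ADD, VP_MUL };

    #define VP_OPCODES(X) \
      X(VP_ADD, OP_ADD)   \
      X(VP_MUL, OP_MUL)

    static std::optional<Opcode> baseForVP(Opcode VPOpc) {
      switch (VPOpc) {
    #define CASE(VP, BASE) case VP: return BASE;
        VP_OPCODES(CASE)
    #undef CASE
      default:
        return std::nullopt; // no functional equivalent registered
      }
    }

    static std::optional<Opcode> vpForBase(Opcode Base) {
      switch (Base) {
    #define CASE(VP, BASE) case BASE: return VP;
        VP_OPCODES(CASE)
    #undef CASE
      default:
        return std::nullopt;
      }
    }

    int main() {
      std::printf("base(VP_ADD)=%d vp(OP_MUL)=%d\n",
                  int(*baseForVP(VP_ADD)), int(*vpForBase(OP_MUL)));
    }

Expanding one list under two macro definitions keeps the two switches in sync by construction, which is the point of routing both functions through the same .def file.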
- if (N->isTargetMemoryOpcode()) { - const MemSDNode *MN = cast<MemSDNode>(N); + if (auto *MN = dyn_cast<MemIntrinsicSDNode>(N)) { + ID.AddInteger(MN->getRawSubclassData()); ID.AddInteger(MN->getPointerInfo().getAddrSpace()); ID.AddInteger(MN->getMemOperand()->getFlags()); + ID.AddInteger(MN->getMemoryVT().getRawBits()); } } @@ -1285,8 +1315,8 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL) void SelectionDAG::init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE, Pass *PassPtr, const TargetLibraryInfo *LibraryInfo, - LegacyDivergenceAnalysis *Divergence, - ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin, + UniformityInfo *NewUA, ProfileSummaryInfo *PSIin, + BlockFrequencyInfo *BFIin, FunctionVarLocs const *VarLocs) { MF = &NewMF; SDAGISelPass = PassPtr; @@ -1295,7 +1325,7 @@ void SelectionDAG::init(MachineFunction &NewMF, TSI = getSubtarget().getSelectionDAGInfo(); LibInfo = LibraryInfo; Context = &MF->getFunction().getContext(); - DA = Divergence; + UA = NewUA; PSI = PSIin; BFI = BFIin; FnVarLocs = VarLocs; @@ -1910,6 +1940,34 @@ SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) { return SDValue(CondCodeNodes[Cond], 0); } +SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm, + bool ConstantFold) { + assert(MulImm.getBitWidth() == VT.getSizeInBits() && + "APInt size does not match type size!"); + + if (ConstantFold) { + const MachineFunction &MF = getMachineFunction(); + auto Attr = MF.getFunction().getFnAttribute(Attribute::VScaleRange); + if (Attr.isValid()) { + unsigned VScaleMin = Attr.getVScaleRangeMin(); + if (std::optional<unsigned> VScaleMax = Attr.getVScaleRangeMax()) + if (*VScaleMax == VScaleMin) + return getConstant(MulImm * VScaleMin, DL, VT); + } + } + + return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT)); +} + +SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, + bool ConstantFold) { + if (EC.isScalable()) + return getVScale(DL, VT, + APInt(VT.getSizeInBits(), EC.getKnownMinValue())); + + return getConstant(EC.getKnownMinValue(), DL, VT); +} + SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) { APInt One(ResVT.getScalarSizeInBits(), 1); return getStepVector(DL, ResVT, One); @@ -2128,7 +2186,7 @@ SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) { return SDValue(E, 0); auto *N = newSDNode<RegisterSDNode>(RegNo, VT); - N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, DA); + N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA); CSEMap.InsertNode(N, IP); InsertNode(N); return SDValue(N, 0); @@ -2381,6 +2439,16 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, ISD::CondCode Cond, const SDLoc &dl) { EVT OpVT = N1.getValueType(); + auto GetUndefBooleanConstant = [&]() { + if (VT.getScalarType() == MVT::i1 || + TLI->getBooleanContents(OpVT) == + TargetLowering::UndefinedBooleanContent) + return getUNDEF(VT); + // ZeroOrOne / ZeroOrNegative require specific values for the high bits, + // so we cannot use getUNDEF(). Return zero instead. + return getConstant(0, dl, VT); + }; + // These setcc operations always fold. switch (Cond) { default: break; @@ -2410,12 +2478,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, // icmp eq/ne X, undef -> undef. if ((N1.isUndef() || N2.isUndef()) && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) - return getUNDEF(VT); + return GetUndefBooleanConstant(); // If both operands are undef, we can return undef for int comparison. // icmp undef, undef -> undef. 
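GetUndefBooleanConstant above exists because, for targets whose boolean contents are ZeroOrOne or ZeroOrNegativeOne, a setcc result may be consumed as a full bit mask, so folding a comparison involving undef to undef could produce a value that is neither legal boolean. A small illustration of why only the constant 0 (or a genuine undef, where permitted) is safe; selectByMask is a hypothetical stand-in for such mask-style consumers:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical consumer of a ZeroOrNegativeOne boolean: the comparison
    // result is used directly as a select mask, so it must be 0 or ~0.
    static uint32_t selectByMask(uint32_t Mask, uint32_t T, uint32_t F) {
      return (T & Mask) | (F & ~Mask);
    }

    int main() {
      // Legal boolean values select one arm cleanly.
      std::printf("%u %u\n", selectByMask(~0u, 1, 2), selectByMask(0u, 1, 2));
      // An arbitrary bit pattern (what "undef" may become) mixes both arms:
      std::printf("%#x\n",
                  unsigned(selectByMask(0x00ff00ffu, 0xaaaaaaaau, 0x55555555u)));
      // Hence the fold returns the constant 0 rather than undef whenever
      // the target requires specific high bits in a true/false value.
    }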
if (N1.isUndef() && N2.isUndef()) - return getUNDEF(VT); + return GetUndefBooleanConstant(); // icmp X, X -> true/false // icmp X, undef -> true/false because undef could be X. @@ -2441,34 +2509,34 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, switch (Cond) { default: break; case ISD::SETEQ: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETNE: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpLessThan, dl, VT, OpVT); case ISD::SETLT: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT, OpVT); case ISD::SETGT: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl, VT, OpVT); case ISD::SETLE: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan || R==APFloat::cmpEqual, dl, VT, OpVT); case ISD::SETGE: if (R==APFloat::cmpUnordered) - return getUNDEF(VT); + return GetUndefBooleanConstant(); [[fallthrough]]; case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan || R==APFloat::cmpEqual, dl, VT, OpVT); @@ -2513,7 +2581,7 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2, case 1: // Known true. return getBoolConstant(true, dl, VT, OpVT); case 2: // Undefined. - return getUNDEF(VT); + return GetUndefBooleanConstant(); } } @@ -2567,7 +2635,7 @@ APInt SelectionDAG::computeVectorKnownZeroElements(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); assert(DemandedElts.getBitWidth() == NumElts && "Unexpected demanded mask."); - APInt KnownZeroElements = APInt::getNullValue(NumElts); + APInt KnownZeroElements = APInt::getZero(NumElts); for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) { if (!DemandedElts[EltIdx]) continue; // Don't query elements that are not demanded. @@ -2661,8 +2729,8 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, } case ISD::VECTOR_SHUFFLE: { // Check if this is a shuffle node doing a splat or a shuffle of a splat. - APInt DemandedLHS = APInt::getNullValue(NumElts); - APInt DemandedRHS = APInt::getNullValue(NumElts); + APInt DemandedLHS = APInt::getZero(NumElts); + APInt DemandedRHS = APInt::getZero(NumElts); ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask(); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; @@ -2689,7 +2757,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, // TODO: Handle source ops splats with undefs. 
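The VECTOR_SHUFFLE case of isSplatValue above routes each demanded result element to a demanded element of one shuffle operand before recursing. The bookkeeping is compact enough to restate standalone; plain bool vectors stand in for the APInt element masks in this sketch:

    #include <cstdio>
    #include <vector>

    int main() {
      const int NumElts = 4;
      std::vector<int> Mask = {0, 0, 5, -1}; // -1 marks an undef/poison lane
      std::vector<bool> DemandedElts = {true, true, true, false};
      std::vector<bool> DemandedLHS(NumElts), DemandedRHS(NumElts);
      for (int I = 0; I < NumElts; ++I) {
        int M = Mask[I];
        if (!DemandedElts[I] || M < 0)
          continue; // not demanded, or no source element at all
        if (M < NumElts)
          DemandedLHS[M] = true; // element comes from operand 0
        else
          DemandedRHS[M - NumElts] = true; // ... or from operand 1
      }
      for (int I = 0; I < NumElts; ++I)
        std::printf("lhs[%d]=%d rhs[%d]=%d\n", I, int(DemandedLHS[I]), I,
                    int(DemandedRHS[I]));
      // Only lhs[0] and rhs[1] end up demanded here, so the splat analysis
      // recurses into just those source elements.
    }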
auto CheckSplatSrc = [&](SDValue Src, const APInt &SrcElts) { APInt SrcUndefs; - return (SrcElts.countPopulation() == 1) || + return (SrcElts.popcount() == 1) || (isSplatValue(Src, SrcElts, SrcUndefs, Depth + 1) && (SrcElts & SrcUndefs).isZero()); }; @@ -2808,7 +2876,7 @@ SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) { SplatIdx = 0; return getUNDEF(VT); } - SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); + SplatIdx = (UndefElts & DemandedElts).countr_one(); } return V; } @@ -3005,7 +3073,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } // Known bits are the values that are shared by every demanded element. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); // If we don't know any bits, early out. if (Known.isUnknown()) @@ -3028,7 +3096,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, if (!!DemandedLHS) { SDValue LHS = Op.getOperand(0); Known2 = computeKnownBits(LHS, DemandedLHS, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } // If we don't know any bits, early out. if (Known.isUnknown()) @@ -3036,10 +3104,16 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, if (!!DemandedRHS) { SDValue RHS = Op.getOperand(1); Known2 = computeKnownBits(RHS, DemandedRHS, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } break; } + case ISD::VSCALE: { + const Function &F = getMachineFunction().getFunction(); + const APInt &Multiplier = Op.getConstantOperandAPInt(0); + Known = getVScaleRange(&F, BitWidth).multiply(Multiplier).toKnownBits(); + break; + } case ISD::CONCAT_VECTORS: { if (Op.getValueType().isScalableVector()) break; @@ -3054,7 +3128,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, if (!!DemandedSub) { SDValue Sub = Op.getOperand(i); Known2 = computeKnownBits(Sub, DemandedSub, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } // If we don't know any bits, early out. if (Known.isUnknown()) @@ -3084,7 +3158,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } if (!!DemandedSrcElts) { Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } break; } @@ -3174,8 +3248,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, if (DemandedElts[i]) { unsigned Shifts = IsLE ? i : NumElts - 1 - i; unsigned Offset = (Shifts % SubScale) * BitWidth; - Known = KnownBits::commonBits(Known, - Known2.extractBits(BitWidth, Offset)); + Known = Known.intersectWith(Known2.extractBits(BitWidth, Offset)); // If we don't know any bits, early out. if (Known.isUnknown()) break; @@ -3273,7 +3346,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth+1); // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); break; case ISD::SELECT_CC: Known = computeKnownBits(Op.getOperand(3), DemandedElts, Depth+1); @@ -3283,7 +3356,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1); // Only known if known in both the LHS and RHS. 
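Several hunks above replace KnownBits::commonBits with the equivalent intersectWith; for select-like nodes the rule is that a bit is known only if both possible arms agree on it. A concrete check of that rule with two 4-bit arms (illustrative C++ only):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t T = 0b1010, F = 0b1110; // the two select arms (4-bit values)
      uint8_t AgreeOne = T & F;                            // 1 in both arms
      uint8_t AgreeZero = uint8_t(~T) & uint8_t(~F) & 0xf; // 0 in both arms
      for (uint8_t R : {T, F}) { // the select produces one arm or the other
        assert((R & AgreeOne) == AgreeOne); // agreed ones are always set
        assert((R & AgreeZero) == 0);       // agreed zeros are always clear
      }
      std::printf("known ones=%#x known zeros=%#x\n", unsigned(AgreeOne),
                  unsigned(AgreeZero));
    }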
- Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); break; case ISD::SMULO: case ISD::UMULO: @@ -3334,7 +3407,6 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known = KnownBits::ashr(Known, Known2); - // TODO: Add minimum shift high known sign bits. break; case ISD::FSHL: case ISD::FSHR: @@ -3364,8 +3436,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known2.One.lshrInPlace(Amt); Known2.Zero.lshrInPlace(Amt); } - Known.One |= Known2.One; - Known.Zero |= Known2.Zero; + Known = Known.unionWith(Known2); } break; case ISD::SHL_PARTS: @@ -3588,9 +3659,18 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, // All bits are zero except the low bit. Known.Zero.setBitsFrom(1); break; + case ISD::ADD: + case ISD::SUB: { + SDNodeFlags Flags = Op.getNode()->getFlags(); + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known = KnownBits::computeForAddSub(Op.getOpcode() == ISD::ADD, + Flags.hasNoSignedWrap(), Known, Known2); + break; + } case ISD::USUBO: case ISD::SSUBO: - case ISD::SUBCARRY: + case ISD::USUBO_CARRY: case ISD::SSUBO_CARRY: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. @@ -3601,13 +3681,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; } [[fallthrough]]; - case ISD::SUB: case ISD::SUBC: { assert(Op.getResNo() == 0 && "We only compute knownbits for the difference here."); // TODO: Compute influence of the carry operand. - if (Opcode == ISD::SUBCARRY || Opcode == ISD::SSUBO_CARRY) + if (Opcode == ISD::USUBO_CARRY || Opcode == ISD::SSUBO_CARRY) break; Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); @@ -3618,7 +3697,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, } case ISD::UADDO: case ISD::SADDO: - case ISD::ADDCARRY: + case ISD::UADDO_CARRY: case ISD::SADDO_CARRY: if (Op.getResNo() == 1) { // If we know the result of a setcc has the top bits zero, use this info. @@ -3629,17 +3708,16 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; } [[fallthrough]]; - case ISD::ADD: case ISD::ADDC: case ISD::ADDE: { assert(Op.getResNo() == 0 && "We only compute knownbits for the sum here."); - // With ADDE and ADDCARRY, a carry bit may be added in. + // With ADDE and UADDO_CARRY, a carry bit may be added in. KnownBits Carry(1); if (Opcode == ISD::ADDE) // Can't track carry from glue, set carry to unknown. Carry.resetAll(); - else if (Opcode == ISD::ADDCARRY || Opcode == ISD::SADDO_CARRY) + else if (Opcode == ISD::UADDO_CARRY || Opcode == ISD::SADDO_CARRY) // TODO: Compute known bits for the carry operand. Not sure if it is worth // the trouble (how often will we find a known carry bit). 
And I haven't // tested this very much yet, but something like this might work: @@ -3657,7 +3735,13 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, case ISD::UDIV: { Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - Known = KnownBits::udiv(Known, Known2); + Known = KnownBits::udiv(Known, Known2, Op->getFlags().hasExact()); + break; + } + case ISD::SDIV: { + Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known = KnownBits::sdiv(Known, Known2, Op->getFlags().hasExact()); break; } case ISD::SREM: { @@ -3735,11 +3819,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.Zero.setAllBits(); if (DemandedVal) { Known2 = computeKnownBits(InVal, Depth + 1); - Known = KnownBits::commonBits(Known, Known2.zextOrTrunc(BitWidth)); + Known = Known.intersectWith(Known2.zextOrTrunc(BitWidth)); } if (!!DemandedVecElts) { Known2 = computeKnownBits(InVec, DemandedVecElts, Depth + 1); - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } break; } @@ -3897,38 +3981,87 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, return Known; } -SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, - SDValue N1) const { +/// Convert ConstantRange OverflowResult into SelectionDAG::OverflowKind. +static SelectionDAG::OverflowKind mapOverflowResult(ConstantRange::OverflowResult OR) { + switch (OR) { + case ConstantRange::OverflowResult::MayOverflow: + return SelectionDAG::OFK_Sometime; + case ConstantRange::OverflowResult::AlwaysOverflowsLow: + case ConstantRange::OverflowResult::AlwaysOverflowsHigh: + return SelectionDAG::OFK_Always; + case ConstantRange::OverflowResult::NeverOverflows: + return SelectionDAG::OFK_Never; + } + llvm_unreachable("Unknown OverflowResult"); +} + +SelectionDAG::OverflowKind +SelectionDAG::computeOverflowForSignedAdd(SDValue N0, SDValue N1) const { // X + 0 never overflow if (isNullConstant(N1)) return OFK_Never; - KnownBits N1Known = computeKnownBits(N1); - if (N1Known.Zero.getBoolValue()) { - KnownBits N0Known = computeKnownBits(N0); + // If both operands each have at least two sign bits, the addition + // cannot overflow. + if (ComputeNumSignBits(N0) > 1 && ComputeNumSignBits(N1) > 1) + return OFK_Never; - bool overflow; - (void)N0Known.getMaxValue().uadd_ov(N1Known.getMaxValue(), overflow); - if (!overflow) - return OFK_Never; - } + // TODO: Add ConstantRange::signedAddMayOverflow handling. + return OFK_Sometime; +} + +SelectionDAG::OverflowKind +SelectionDAG::computeOverflowForUnsignedAdd(SDValue N0, SDValue N1) const { + // X + 0 never overflow + if (isNullConstant(N1)) + return OFK_Never; // mulhi + 1 never overflow + KnownBits N1Known = computeKnownBits(N1); if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && - (N1Known.getMaxValue() & 0x01) == N1Known.getMaxValue()) + N1Known.getMaxValue().ult(2)) return OFK_Never; - if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { - KnownBits N0Known = computeKnownBits(N0); + KnownBits N0Known = computeKnownBits(N0); + if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1 && + N0Known.getMaxValue().ult(2)) + return OFK_Never; - if ((N0Known.getMaxValue() & 0x01) == N0Known.getMaxValue()) - return OFK_Never; - } + // Fallback to ConstantRange::unsignedAddMayOverflow handling. 
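The new unsigned-add overflow logic classifies by range endpoints, as the mapOverflowResult helper above encodes and the ConstantRange fallback continued just below applies: if even the minimal sum wraps, the addition always overflows; if even the maximal sum fits, it never does. The same classification on explicit 8-bit ranges, as a standalone analogue (unsignedAddOverflow is a hypothetical helper, not the ConstantRange API):

    #include <cstdint>
    #include <cstdio>

    enum class OverflowKind { Never, Sometime, Always };

    // Classify unsigned add of two inclusive, non-wrapping 8-bit ranges,
    // mirroring how ConstantRange::unsignedAddMayOverflow decides.
    static OverflowKind unsignedAddOverflow(uint8_t ALo, uint8_t AHi,
                                            uint8_t BLo, uint8_t BHi) {
      unsigned MinSum = unsigned(ALo) + BLo;
      unsigned MaxSum = unsigned(AHi) + BHi;
      if (MinSum > 0xff)
        return OverflowKind::Always; // even the smallest sum wraps
      if (MaxSum <= 0xff)
        return OverflowKind::Never; // even the largest sum fits
      return OverflowKind::Sometime;
    }

    int main() {
      // Prints 0 (Never), 2 (Always), 1 (Sometime) given the enum order.
      std::printf("%d %d %d\n",
                  int(unsignedAddOverflow(0, 10, 0, 10)),
                  int(unsignedAddOverflow(200, 210, 100, 120)),
                  int(unsignedAddOverflow(0, 255, 0, 1)));
    }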
+ ConstantRange N0Range = ConstantRange::fromKnownBits(N0Known, false); + ConstantRange N1Range = ConstantRange::fromKnownBits(N1Known, false); + return mapOverflowResult(N0Range.unsignedAddMayOverflow(N1Range)); +} + +SelectionDAG::OverflowKind +SelectionDAG::computeOverflowForSignedSub(SDValue N0, SDValue N1) const { + // X - 0 never overflow + if (isNullConstant(N1)) + return OFK_Never; + + // If both operands each have at least two sign bits, the subtraction + // cannot overflow. + if (ComputeNumSignBits(N0) > 1 && ComputeNumSignBits(N1) > 1) + return OFK_Never; + + // TODO: Add ConstantRange::signedSubMayOverflow handling. + return OFK_Sometime; +} + +SelectionDAG::OverflowKind +SelectionDAG::computeOverflowForUnsignedSub(SDValue N0, SDValue N1) const { + // X - 0 never overflow + if (isNullConstant(N1)) + return OFK_Never; + // TODO: Add ConstantRange::unsignedSubMayOverflow handling. return OFK_Sometime; } -bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { +bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const { + if (Depth >= MaxRecursionDepth) + return false; // Limit search depth. + EVT OpVT = Val.getValueType(); unsigned BitWidth = OpVT.getScalarSizeInBits(); @@ -3970,15 +4103,12 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // vscale(power-of-two) is a power-of-two for some targets if (Val.getOpcode() == ISD::VSCALE && getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() && - isKnownToBeAPowerOfTwo(Val.getOperand(0))) + isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1)) return true; // More could be done here, though the above checks are enough // to handle some common cases. - - // Fall back to computeKnownBits to catch other known cases. - KnownBits Known = computeKnownBits(Val); - return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); + return false; } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { @@ -4041,14 +4171,20 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, continue; SDValue SrcOp = Op.getOperand(i); - Tmp2 = ComputeNumSignBits(SrcOp, Depth + 1); + // BUILD_VECTOR can implicitly truncate sources, we handle this specially + // for constant nodes to ensure we only look at the sign bits. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SrcOp)) { + APInt T = C->getAPIntValue().trunc(VTBits); + Tmp2 = T.getNumSignBits(); + } else { + Tmp2 = ComputeNumSignBits(SrcOp, Depth + 1); - // BUILD_VECTOR can implicitly truncate sources, we must handle this. - if (SrcOp.getValueSizeInBits() != VTBits) { - assert(SrcOp.getValueSizeInBits() > VTBits && - "Expected BUILD_VECTOR implicit truncation"); - unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; - Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1); + if (SrcOp.getValueSizeInBits() != VTBits) { + assert(SrcOp.getValueSizeInBits() > VTBits && + "Expected BUILD_VECTOR implicit truncation"); + unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits; + Tmp2 = (Tmp2 > ExtraBits ? 
Tmp2 - ExtraBits : 1); + } } Tmp = std::min(Tmp, Tmp2); } @@ -4225,11 +4361,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::SADDO: case ISD::UADDO: case ISD::SADDO_CARRY: - case ISD::ADDCARRY: + case ISD::UADDO_CARRY: case ISD::SSUBO: case ISD::USUBO: case ISD::SSUBO_CARRY: - case ISD::SUBCARRY: + case ISD::USUBO_CARRY: case ISD::SMULO: case ISD::UMULO: if (Op.getResNo() != 1) @@ -4733,6 +4869,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::AssertSext: case ISD::AssertZext: case ISD::FREEZE: + case ISD::CONCAT_VECTORS: case ISD::INSERT_SUBVECTOR: case ISD::AND: case ISD::OR: @@ -4753,6 +4890,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::BITCAST: case ISD::BUILD_VECTOR: + case ISD::BUILD_PAIR: return false; case ISD::ADD: @@ -4771,6 +4909,13 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()); + case ISD::INSERT_VECTOR_ELT:{ + // Ensure that the element index is in bounds. + EVT VecVT = Op.getOperand(0).getValueType(); + KnownBits KnownIdx = computeKnownBits(Op.getOperand(2), Depth + 1); + return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements()); + } + default: // Allow the target to implement this method for its nodes. if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN || @@ -4835,7 +4980,8 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FROUND: case ISD::FROUNDEVEN: case ISD::FRINT: - case ISD::FNEARBYINT: { + case ISD::FNEARBYINT: + case ISD::FLDEXP: { if (SNaN) return true; return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); @@ -4918,13 +5064,28 @@ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const { "Floating point type expected"); // If the value is a constant, we can obviously see if it is a zero or not. - // TODO: Add BuildVector support. if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) return !C->isZero(); + + // Return false if we find any zero in a vector. + if (Op->getOpcode() == ISD::BUILD_VECTOR || + Op->getOpcode() == ISD::SPLAT_VECTOR) { + for (const SDValue &OpVal : Op->op_values()) { + if (OpVal.isUndef()) + return false; + if (auto *C = dyn_cast<ConstantFPSDNode>(OpVal)) + if (C->isZero()) + return false; + } + return true; + } return false; } -bool SelectionDAG::isKnownNeverZero(SDValue Op) const { +bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const { + if (Depth >= MaxRecursionDepth) + return false; // Limit search depth. + assert(!Op.getValueType().isFloatingPoint() && "Floating point types unsupported - use isKnownNeverZeroFloat"); @@ -4933,24 +5094,105 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op) const { [](ConstantSDNode *C) { return !C->isZero(); })) return true; - // TODO: Recognize more cases here. + // TODO: Recognize more cases here. Most of the cases are also incomplete to + // some degree. 
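Two of the rules in the expanded isKnownNeverZero switch that follows below, OR with a known-nonzero operand and SHL with no unsigned wrap, are simple enough to validate exhaustively at a narrow width. A brute-force check on i8 (standalone sketch):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      for (unsigned X = 0; X < 256; ++X) {
        // OR: any operand known nonzero makes the result nonzero.
        for (unsigned Y = 1; Y < 256; ++Y)
          assert(uint8_t(X | Y) != 0);
        // SHL with no unsigned wrap: a nonzero value stays nonzero.
        for (unsigned S = 0; S < 8; ++S) {
          bool NUW = (uint8_t(X << S) >> S) == X; // no set bit shifted out
          if (X != 0 && NUW)
            assert(uint8_t(X << S) != 0);
        }
      }
      std::puts("OR / SHL-nuw never-zero rules hold exhaustively on i8");
    }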
switch (Op.getOpcode()) { - default: break; + default: + break; + case ISD::OR: - if (isKnownNeverZero(Op.getOperand(1)) || - isKnownNeverZero(Op.getOperand(0))) + return isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::VSELECT: + case ISD::SELECT: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(2), Depth + 1); + + case ISD::SHL: + if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + + // 1 << X is never zero. TODO: This can be expanded if we can bound X. + // The expression is really !Known.One[BitWidth-MaxLog2(Known):0].isZero() + if (computeKnownBits(Op.getOperand(0), Depth + 1).One[0]) return true; break; + + case ISD::UADDSAT: + case ISD::UMAX: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::UMIN: + return isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::ROTL: + case ISD::ROTR: + case ISD::BITREVERSE: + case ISD::BSWAP: + case ISD::CTPOP: + case ISD::ABS: + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + + case ISD::SRA: + case ISD::SRL: + if (Op->getFlags().hasExact()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + // Signed >> X is never zero. TODO: This can be expanded if we can bound X. + // The expression is really + // !Known.One[SignBit:SignBit-(BitWidth-MaxLog2(Known))].isZero() + if (computeKnownBits(Op.getOperand(0), Depth + 1).isNegative()) + return true; + break; + + case ISD::UDIV: + case ISD::SDIV: + // div exact can only produce a zero if the dividend is zero. + // TODO: For udiv this is also true if Op1 u<= Op0 + if (Op->getFlags().hasExact()) + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + break; + + case ISD::ADD: + if (Op->getFlags().hasNoUnsignedWrap()) + if (isKnownNeverZero(Op.getOperand(1), Depth + 1) || + isKnownNeverZero(Op.getOperand(0), Depth + 1)) + return true; + // TODO: There are a lot more cases we can prove for add. + break; + + case ISD::SUB: { + if (isNullConstant(Op.getOperand(0))) + return isKnownNeverZero(Op.getOperand(1), Depth + 1); + + std::optional<bool> ne = + KnownBits::ne(computeKnownBits(Op.getOperand(0), Depth + 1), + computeKnownBits(Op.getOperand(1), Depth + 1)); + return ne && *ne; } - return false; + case ISD::MUL: + if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap()) + if (isKnownNeverZero(Op.getOperand(1), Depth + 1) && + isKnownNeverZero(Op.getOperand(0), Depth + 1)) + return true; + break; + + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return isKnownNeverZero(Op.getOperand(0), Depth + 1); + } + + return computeKnownBits(Op, Depth).isNonZero(); } bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const { // Check the obvious case. if (A == B) return true; - // For for negative and positive zero. + // For negative and positive zero. 
if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) if (CA->isZero() && CB->isZero()) return true; @@ -4986,6 +5228,10 @@ static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) { SDValue Other) { if (SDValue NotOperand = getBitwiseNotOperand(Not, Mask, /* AllowUndefs */ true)) { + if (NotOperand->getOpcode() == ISD::ZERO_EXTEND || + NotOperand->getOpcode() == ISD::TRUNCATE) + NotOperand = NotOperand->getOperand(0); + if (Other == NotOperand) return true; if (Other->getOpcode() == ISD::AND) @@ -4994,6 +5240,13 @@ static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) { } return false; }; + + if (A->getOpcode() == ISD::ZERO_EXTEND || A->getOpcode() == ISD::TRUNCATE) + A = A->getOperand(0); + + if (B->getOpcode() == ISD::ZERO_EXTEND || B->getOpcode() == ISD::TRUNCATE) + B = B->getOperand(0); + if (A->getOpcode() == ISD::AND) return MatchNoCommonBitsPattern(A->getOperand(0), A->getOperand(1), B) || MatchNoCommonBitsPattern(A->getOperand(1), A->getOperand(0), B); @@ -5159,23 +5412,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue Operand) { + SDValue N1) { SDNodeFlags Flags; if (Inserter) Flags = Inserter->getFlags(); - return getNode(Opcode, DL, VT, Operand, Flags); + return getNode(Opcode, DL, VT, N1, Flags); } SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, - SDValue Operand, const SDNodeFlags Flags) { - assert(Operand.getOpcode() != ISD::DELETED_NODE && - "Operand is DELETED_NODE!"); + SDValue N1, const SDNodeFlags Flags) { + assert(N1.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!"); // Constant fold unary operations with an integer constant operand. Even // opaque constant will be folded, because the folding of unary operations // doesn't create new constants with different values. Nevertheless, the // opaque flag is preserved during folding to prevent future folding with // other constants. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { const APInt &Val = C->getAPIntValue(); switch (Opcode) { default: break; @@ -5191,7 +5443,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, C->isTargetOpcode(), C->isOpaque()); case ISD::ANY_EXTEND: // Some targets like RISCV prefer to sign extend some types. 
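The haveNoCommonBitsSetCommutative change above peeks through ZERO_EXTEND and TRUNCATE when matching the (and (not X), Mask) pattern. The fact this analysis ultimately feeds is that operands with disjoint set bits make ADD and OR interchangeable, which is cheap to confirm exhaustively on i8:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      for (unsigned A = 0; A < 256; ++A) {
        for (unsigned B = 0; B < 256; ++B)
          if ((A & B) == 0) // no common set bits
            assert(uint8_t(A + B) == uint8_t(A | B));
        for (unsigned M = 0; M < 256; ++M)
          assert(((A & ~M) & M) == 0); // (and (not X), Mask) can never
      }                                // overlap Mask itself
      std::puts("disjoint bits make ADD and OR interchangeable on i8");
    }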
- if (TLI->isSExtCheaperThanZExt(Operand.getValueType(), VT)) + if (TLI->isSExtCheaperThanZExt(N1.getValueType(), VT)) return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT, C->isTargetOpcode(), C->isOpaque()); return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT, @@ -5225,15 +5477,15 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTPOP: - return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(), + return getConstant(Val.popcount(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: - return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(), + return getConstant(Val.countl_zero(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: - return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(), + return getConstant(Val.countr_zero(), DL, VT, C->isTargetOpcode(), C->isOpaque()); case ISD::FP16_TO_FP: case ISD::BF16_TO_FP: { @@ -5249,7 +5501,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getConstantFP(FPV, DL, VT); } case ISD::STEP_VECTOR: { - if (SDValue V = FoldSTEP_VECTOR(DL, VT, Operand, *this)) + if (SDValue V = FoldSTEP_VECTOR(DL, VT, N1, *this)) return V; break; } @@ -5257,7 +5509,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } // Constant fold unary operations with a floating point constant operand. - if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) { + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N1)) { APFloat V = C->getValueAPF(); // make copy switch (Opcode) { case ISD::FNEG: @@ -5354,262 +5606,250 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::CTPOP: { - SDValue Ops = {Operand}; + SDValue Ops = {N1}; if (SDValue Fold = FoldConstantArithmetic(Opcode, DL, VT, Ops)) return Fold; } } - unsigned OpOpcode = Operand.getNode()->getOpcode(); + unsigned OpOpcode = N1.getNode()->getOpcode(); switch (Opcode) { case ISD::STEP_VECTOR: assert(VT.isScalableVector() && "STEP_VECTOR can only be used with scalable types"); assert(OpOpcode == ISD::TargetConstant && - VT.getVectorElementType() == Operand.getValueType() && + VT.getVectorElementType() == N1.getValueType() && "Unexpected step operand"); break; case ISD::FREEZE: - assert(VT == Operand.getValueType() && "Unexpected VT!"); - if (isGuaranteedNotToBeUndefOrPoison(Operand, /*PoisonOnly*/ false, + assert(VT == N1.getValueType() && "Unexpected VT!"); + if (isGuaranteedNotToBeUndefOrPoison(N1, /*PoisonOnly*/ false, /*Depth*/ 1)) - return Operand; + return N1; break; case ISD::TokenFactor: case ISD::MERGE_VALUES: case ISD::CONCAT_VECTORS: - return Operand; // Factor, merge or concat of one node? No need. + return N1; // Factor, merge or concat of one node? No need. case ISD::BUILD_VECTOR: { // Attempt to simplify BUILD_VECTOR. - SDValue Ops[] = {Operand}; + SDValue Ops[] = {N1}; if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this)) return V; break; } case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node"); case ISD::FP_EXTEND: - assert(VT.isFloatingPoint() && - Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); - if (Operand.getValueType() == VT) return Operand; // noop conversion. 
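The constant folds above now use the renamed APInt bit-counting helpers: popcount, countl_zero, countr_zero, and countr_one in place of countPopulation, countLeadingZeros, countTrailingZeros, and countTrailingOnes. A minimal standalone use, assuming the imported LLVM headers and libSupport are available to build against:

    // Compile against the imported LLVM tree, e.g.:
    //   clang++ demo.cpp $(llvm-config --cxxflags --ldflags --libs support)
    #include "llvm/ADT/APInt.h"
    #include <cstdio>

    int main() {
      llvm::APInt V(16, 0b0000'1111'0000'0111);
      // Prints popcount=7 countl_zero=4 countr_zero=0 countr_one=3.
      std::printf("popcount=%u countl_zero=%u countr_zero=%u countr_one=%u\n",
                  V.popcount(), V.countl_zero(), V.countr_zero(),
                  V.countr_one());
      return 0;
    }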
- assert((!VT.isVector() || - VT.getVectorElementCount() == - Operand.getValueType().getVectorElementCount()) && + assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() && + "Invalid FP cast!"); + if (N1.getValueType() == VT) return N1; // noop conversion. + assert((!VT.isVector() || VT.getVectorElementCount() == + N1.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); - assert(Operand.getValueType().bitsLT(VT) && - "Invalid fpext node, dst < src!"); - if (Operand.isUndef()) + assert(N1.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!"); + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - if (Operand.isUndef()) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: // [us]itofp(undef) = 0, because the result value is bounded. - if (Operand.isUndef()) + if (N1.isUndef()) return getConstantFP(0.0, DL, VT); break; case ISD::SIGN_EXTEND: - assert(VT.isInteger() && Operand.getValueType().isInteger() && + assert(VT.isInteger() && N1.getValueType().isInteger() && "Invalid SIGN_EXTEND!"); - assert(VT.isVector() == Operand.getValueType().isVector() && + assert(VT.isVector() == N1.getValueType().isVector() && "SIGN_EXTEND result type type should be vector iff the operand " "type is vector!"); - if (Operand.getValueType() == VT) return Operand; // noop extension - assert((!VT.isVector() || - VT.getVectorElementCount() == - Operand.getValueType().getVectorElementCount()) && + if (N1.getValueType() == VT) return N1; // noop extension + assert((!VT.isVector() || VT.getVectorElementCount() == + N1.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); - assert(Operand.getValueType().bitsLT(VT) && - "Invalid sext node, dst < src!"); + assert(N1.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) - return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + return getNode(OpOpcode, DL, VT, N1.getOperand(0)); if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; case ISD::ZERO_EXTEND: - assert(VT.isInteger() && Operand.getValueType().isInteger() && + assert(VT.isInteger() && N1.getValueType().isInteger() && "Invalid ZERO_EXTEND!"); - assert(VT.isVector() == Operand.getValueType().isVector() && + assert(VT.isVector() == N1.getValueType().isVector() && "ZERO_EXTEND result type type should be vector iff the operand " "type is vector!"); - if (Operand.getValueType() == VT) return Operand; // noop extension - assert((!VT.isVector() || - VT.getVectorElementCount() == - Operand.getValueType().getVectorElementCount()) && + if (N1.getValueType() == VT) return N1; // noop extension + assert((!VT.isVector() || VT.getVectorElementCount() == + N1.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); - assert(Operand.getValueType().bitsLT(VT) && - "Invalid zext node, dst < src!"); - if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) - return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); + assert(N1.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); + if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) + return getNode(ISD::ZERO_EXTEND, DL, VT, N1.getOperand(0)); if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. 
return getConstant(0, DL, VT); break; case ISD::ANY_EXTEND: - assert(VT.isInteger() && Operand.getValueType().isInteger() && + assert(VT.isInteger() && N1.getValueType().isInteger() && "Invalid ANY_EXTEND!"); - assert(VT.isVector() == Operand.getValueType().isVector() && + assert(VT.isVector() == N1.getValueType().isVector() && "ANY_EXTEND result type type should be vector iff the operand " "type is vector!"); - if (Operand.getValueType() == VT) return Operand; // noop extension - assert((!VT.isVector() || - VT.getVectorElementCount() == - Operand.getValueType().getVectorElementCount()) && + if (N1.getValueType() == VT) return N1; // noop extension + assert((!VT.isVector() || VT.getVectorElementCount() == + N1.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); - assert(Operand.getValueType().bitsLT(VT) && - "Invalid anyext node, dst < src!"); + assert(N1.getValueType().bitsLT(VT) && "Invalid anyext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) - return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + return getNode(OpOpcode, DL, VT, N1.getOperand(0)); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunc x)) -> x if (OpOpcode == ISD::TRUNCATE) { - SDValue OpOp = Operand.getOperand(0); + SDValue OpOp = N1.getOperand(0); if (OpOp.getValueType() == VT) { - transferDbgValues(Operand, OpOp); + transferDbgValues(N1, OpOp); return OpOp; } } break; case ISD::TRUNCATE: - assert(VT.isInteger() && Operand.getValueType().isInteger() && + assert(VT.isInteger() && N1.getValueType().isInteger() && "Invalid TRUNCATE!"); - assert(VT.isVector() == Operand.getValueType().isVector() && + assert(VT.isVector() == N1.getValueType().isVector() && "TRUNCATE result type type should be vector iff the operand " "type is vector!"); - if (Operand.getValueType() == VT) return Operand; // noop truncate - assert((!VT.isVector() || - VT.getVectorElementCount() == - Operand.getValueType().getVectorElementCount()) && + if (N1.getValueType() == VT) return N1; // noop truncate + assert((!VT.isVector() || VT.getVectorElementCount() == + N1.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); - assert(Operand.getValueType().bitsGT(VT) && - "Invalid truncate node, src < dst!"); + assert(N1.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); + return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. 
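      // For (trunc (ext X)): re-extend when X is narrower than VT,
      // re-truncate when it is wider, and return X unchanged when the
      // widths already match.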
- if (Operand.getOperand(0).getValueType().getScalarType() - .bitsLT(VT.getScalarType())) - return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); - if (Operand.getOperand(0).getValueType().bitsGT(VT)) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); - return Operand.getOperand(0); + if (N1.getOperand(0).getValueType().getScalarType().bitsLT( + VT.getScalarType())) + return getNode(OpOpcode, DL, VT, N1.getOperand(0)); + if (N1.getOperand(0).getValueType().bitsGT(VT)) + return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0)); + return N1.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes) - return getVScale(DL, VT, Operand.getConstantOperandAPInt(0)); + return getVScale(DL, VT, + N1.getConstantOperandAPInt(0).trunc(VT.getSizeInBits())); break; case ISD::ANY_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: assert(VT.isVector() && "This DAG node is restricted to vector types."); - assert(Operand.getValueType().bitsLE(VT) && + assert(N1.getValueType().bitsLE(VT) && "The input must be the same size or smaller than the result."); assert(VT.getVectorMinNumElements() < - Operand.getValueType().getVectorMinNumElements() && + N1.getValueType().getVectorMinNumElements() && "The destination vector type must have fewer lanes than the input."); break; case ISD::ABS: - assert(VT.isInteger() && VT == Operand.getValueType() && - "Invalid ABS!"); + assert(VT.isInteger() && VT == N1.getValueType() && "Invalid ABS!"); if (OpOpcode == ISD::UNDEF) return getConstant(0, DL, VT); break; case ISD::BSWAP: - assert(VT.isInteger() && VT == Operand.getValueType() && - "Invalid BSWAP!"); + assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // bswap(bswap(X)) -> X. if (OpOpcode == ISD::BSWAP) - return Operand.getOperand(0); + return N1.getOperand(0); break; case ISD::BITREVERSE: - assert(VT.isInteger() && VT == Operand.getValueType() && - "Invalid BITREVERSE!"); + assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BITREVERSE!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::BITCAST: - assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && + assert(VT.getSizeInBits() == N1.getValueSizeInBits() && "Cannot BITCAST between types of different sizes!"); - if (VT == Operand.getValueType()) return Operand; // noop conversion. - if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) - return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0)); + if (VT == N1.getValueType()) return N1; // noop conversion. 
+ if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) + return getNode(ISD::BITCAST, DL, VT, N1.getOperand(0)); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: - assert(VT.isVector() && !Operand.getValueType().isVector() && - (VT.getVectorElementType() == Operand.getValueType() || + assert(VT.isVector() && !N1.getValueType().isVector() && + (VT.getVectorElementType() == N1.getValueType() || (VT.getVectorElementType().isInteger() && - Operand.getValueType().isInteger() && - VT.getVectorElementType().bitsLE(Operand.getValueType()))) && + N1.getValueType().isInteger() && + VT.getVectorElementType().bitsLE(N1.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(Operand.getOperand(1)) && - Operand.getConstantOperandVal(1) == 0 && - Operand.getOperand(0).getValueType() == VT) - return Operand.getOperand(0); + isa<ConstantSDNode>(N1.getOperand(1)) && + N1.getConstantOperandVal(1) == 0 && + N1.getOperand(0).getValueType() == VT) + return N1.getOperand(0); break; case ISD::FNEG: // Negation of an unknown bag of bits is still completely undefined. if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); - if (OpOpcode == ISD::FNEG) // --X -> X - return Operand.getOperand(0); + if (OpOpcode == ISD::FNEG) // --X -> X + return N1.getOperand(0); break; case ISD::FABS: - if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) - return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); + if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) + return getNode(ISD::FABS, DL, VT, N1.getOperand(0)); break; case ISD::VSCALE: - assert(VT == Operand.getValueType() && "Unexpected VT!"); + assert(VT == N1.getValueType() && "Unexpected VT!"); break; case ISD::CTPOP: - if (Operand.getValueType().getScalarType() == MVT::i1) - return Operand; + if (N1.getValueType().getScalarType() == MVT::i1) + return N1; break; case ISD::CTLZ: case ISD::CTTZ: - if (Operand.getValueType().getScalarType() == MVT::i1) - return getNOT(DL, Operand, Operand.getValueType()); + if (N1.getValueType().getScalarType() == MVT::i1) + return getNOT(DL, N1, N1.getValueType()); break; case ISD::VECREDUCE_ADD: - if (Operand.getValueType().getScalarType() == MVT::i1) - return getNode(ISD::VECREDUCE_XOR, DL, VT, Operand); + if (N1.getValueType().getScalarType() == MVT::i1) + return getNode(ISD::VECREDUCE_XOR, DL, VT, N1); break; case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: - if (Operand.getValueType().getScalarType() == MVT::i1) - return getNode(ISD::VECREDUCE_OR, DL, VT, Operand); + if (N1.getValueType().getScalarType() == MVT::i1) + return getNode(ISD::VECREDUCE_OR, DL, VT, N1); break; case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_UMIN: - if (Operand.getValueType().getScalarType() == MVT::i1) - return getNode(ISD::VECREDUCE_AND, DL, VT, Operand); + if (N1.getValueType().getScalarType() == MVT::i1) + return getNode(ISD::VECREDUCE_AND, DL, VT, N1); break; } SDNode *N; SDVTList VTs = getVTList(VT); - SDValue Ops[] = {Operand}; + SDValue Ops[] = {N1}; if (VT != MVT::Glue) { // Don't CSE flag producing nodes FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTs, Ops); @@ -5710,6 +5950,10 @@ static std::optional<APInt> FoldValue(unsigned Opcode, const APInt &C1, APInt C2Ext = C2.zext(FullWidth); return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1); } + case ISD::ABDS: + return APIntOps::smax(C1, C2) - APIntOps::smin(C1, C2); + 
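  // ABDU is the same fold in the unsigned ordering: subtracting min from max
  // under the matching comparison yields the absolute difference (modulo 2^n).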
case ISD::ABDU: + return APIntOps::umax(C1, C2) - APIntOps::umin(C1, C2); } return std::nullopt; } @@ -6678,7 +6922,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::VECTOR_SHUFFLE: llvm_unreachable("should use getVectorShuffle constructor!"); case ISD::VECTOR_SPLICE: { - if (cast<ConstantSDNode>(N3)->isNullValue()) + if (cast<ConstantSDNode>(N3)->isZero()) return N1; break; } @@ -6745,6 +6989,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1.getValueType() == VT) return N1; break; + case ISD::VP_TRUNCATE: + case ISD::VP_SIGN_EXTEND: + case ISD::VP_ZERO_EXTEND: + // Don't create noop casts. + if (N1.getValueType() == VT) + return N1; + break; } // Memoize node if it doesn't produce a flag. @@ -7042,7 +7293,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, AAMDNodes NewAAInfo = AAInfo; NewAAInfo.TBAA = NewAAInfo.TBAAStruct = nullptr; - const Value *SrcVal = SrcPtrInfo.V.dyn_cast<const Value *>(); + const Value *SrcVal = dyn_cast_if_present<const Value *>(SrcPtrInfo.V); bool isConstant = AA && SrcVal && AA->pointsToConstantMemory(MemoryLocation(SrcVal, Size, AAInfo)); @@ -7321,8 +7572,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst); if (FI && !MFI.isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; - bool IsZeroVal = - isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero(); + bool IsZeroVal = isNullConstant(Src); unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize); if (!TLI.findOptimalMemOpLowering( @@ -7870,7 +8120,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, assert((Opcode == ISD::INTRINSIC_VOID || Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::PREFETCH || - ((int)Opcode <= std::numeric_limits<int>::max() && + (Opcode <= (unsigned)std::numeric_limits<int>::max() && (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) && "Opcode is not a memory-accessing opcode!"); @@ -7883,6 +8133,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, Opcode, dl.getIROrder(), VTList, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); ID.AddInteger(MMO->getFlags()); + ID.AddInteger(MemVT.getRawBits()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO); @@ -8307,7 +8558,7 @@ SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM, SDValue Ops[] = {Chain, Ptr, Offset, Mask, EVL}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::VP_LOAD, VTs, Ops); - ID.AddInteger(VT.getRawBits()); + ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData<VPLoadSDNode>( dl.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); @@ -9051,6 +9302,60 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, return V; } +SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, + EVT MemVT, MachineMemOperand *MMO) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + SDVTList VTs = getVTList(MVT::Other); + SDValue Ops[] = {Chain, Ptr}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::GET_FPENV_MEM, VTs, Ops); + ID.AddInteger(MemVT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData<FPStateAccessSDNode>( + ISD::GET_FPENV_MEM, dl.getIROrder(), VTs, MemVT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + 
ID.AddInteger(MMO->getFlags()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) + return SDValue(E, 0); + + auto *N = newSDNode<FPStateAccessSDNode>(ISD::GET_FPENV_MEM, dl.getIROrder(), + dl.getDebugLoc(), VTs, MemVT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + +SDValue SelectionDAG::getSetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, + EVT MemVT, MachineMemOperand *MMO) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + SDVTList VTs = getVTList(MVT::Other); + SDValue Ops[] = {Chain, Ptr}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::SET_FPENV_MEM, VTs, Ops); + ID.AddInteger(MemVT.getRawBits()); + ID.AddInteger(getSyntheticNodeSubclassData<FPStateAccessSDNode>( + ISD::SET_FPENV_MEM, dl.getIROrder(), VTs, MemVT, MMO)); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + ID.AddInteger(MMO->getFlags()); + void *IP = nullptr; + if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) + return SDValue(E, 0); + + auto *N = newSDNode<FPStateAccessSDNode>(ISD::SET_FPENV_MEM, dl.getIROrder(), + dl.getDebugLoc(), VTs, MemVT, MMO); + createOperands(N, Ops); + + CSEMap.InsertNode(N, IP); + InsertNode(N); + SDValue V(N, 0); + NewSDValueDbgMsg(V, "Creating new node: ", this); + return V; +} + SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) { // select undef, T, F --> T (if T is a constant), otherwise F // select, ?, undef, F --> F @@ -9348,6 +9653,23 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, "Binary operator types must match!"); break; } + case ISD::FFREXP: { + assert(VTList.NumVTs == 2 && Ops.size() == 1 && "Invalid ffrexp op!"); + assert(VTList.VTs[0].isFloatingPoint() && VTList.VTs[1].isInteger() && + VTList.VTs[0] == Ops[0].getValueType() && "frexp type mismatch"); + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Ops[0])) { + int FrexpExp; + APFloat FrexpMant = + frexp(C->getValueAPF(), FrexpExp, APFloat::rmNearestTiesToEven); + SDValue Result0 = getConstantFP(FrexpMant, DL, VTList.VTs[0]); + SDValue Result1 = + getConstant(FrexpMant.isFinite() ? 
FrexpExp : 0, DL, VTList.VTs[1]); + return getNode(ISD::MERGE_VALUES, DL, VTList, {Result0, Result1}, Flags); + } + + break; + } case ISD::STRICT_FP_EXTEND: assert(VTList.NumVTs == 2 && Ops.size() == 2 && "Invalid STRICT_FP_EXTEND!"); @@ -9357,8 +9679,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, "STRICT_FP_EXTEND result type should be vector iff the operand " "type is vector!"); assert((!VTList.VTs[0].isVector() || - VTList.VTs[0].getVectorNumElements() == - Ops[1].getValueType().getVectorNumElements()) && + VTList.VTs[0].getVectorElementCount() == + Ops[1].getValueType().getVectorElementCount()) && "Vector element count mismatch!"); assert(Ops[1].getValueType().bitsLT(VTList.VTs[0]) && "Invalid fpext node, dst <= src!"); @@ -9369,8 +9691,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, "STRICT_FP_ROUND result type should be vector iff the operand " "type is vector!"); assert((!VTList.VTs[0].isVector() || - VTList.VTs[0].getVectorNumElements() == - Ops[1].getValueType().getVectorNumElements()) && + VTList.VTs[0].getVectorElementCount() == + Ops[1].getValueType().getVectorElementCount()) && "Vector element count mismatch!"); assert(VTList.VTs[0].isFloatingPoint() && Ops[1].getValueType().isFloatingPoint() && @@ -10247,8 +10569,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { case ISD::ADD: SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); - if (!isConstantIntBuildVectorOrConstantInt(N0) && - isConstantIntBuildVectorOrConstantInt(N1)) { + if (!isa<ConstantSDNode>(N0) && isa<ConstantSDNode>(N1)) { uint64_t Offset = N.getConstantOperandVal(1); // Rewrite an ADD constant node into a DIExpression. Since we are @@ -10594,11 +10915,11 @@ public: bool SelectionDAG::calculateDivergence(SDNode *N) { if (TLI->isSDNodeAlwaysUniform(N)) { - assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, DA) && + assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) && "Conflicting divergence information!"); return false; } - if (TLI->isSDNodeSourceOfDivergence(N, FLI, DA)) + if (TLI->isSDNodeSourceOfDivergence(N, FLI, UA)) return true; for (const auto &Op : N->ops()) { if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent()) @@ -10975,6 +11296,12 @@ SDValue llvm::peekThroughExtractSubvectors(SDValue V) { return V; } +SDValue llvm::peekThroughTruncates(SDValue V) { + while (V.getOpcode() == ISD::TRUNCATE) + V = V.getOperand(0); + return V; +} + bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) { if (V.getOpcode() != ISD::XOR) return false; @@ -10982,7 +11309,7 @@ bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) { unsigned NumBits = V.getScalarValueSizeInBits(); ConstantSDNode *C = isConstOrConstSplat(V, AllowUndefs, /*AllowTruncation*/ true); - return C && (C->getAPIntValue().countTrailingOnes() >= NumBits); + return C && (C->getAPIntValue().countr_one() >= NumBits); } ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, bool AllowUndefs, @@ -11394,16 +11721,11 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, } SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { - assert(N->getNumValues() == 1 && - "Can't unroll a vector with multiple results!"); - EVT VT = N->getValueType(0); - unsigned NE = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - SDLoc dl(N); + unsigned NE = VT.getVectorNumElements(); - SmallVector<SDValue, 8> Scalars; - SmallVector<SDValue, 4> Operands(N->getNumOperands()); + SDLoc dl(N); // If ResNE is 0, fully unroll the vector op. 
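  // Otherwise clamp the number of lanes to unroll to ResNE.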
if (ResNE == 0) @@ -11411,6 +11733,40 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { else if (NE > ResNE) NE = ResNE; + if (N->getNumValues() == 2) { + SmallVector<SDValue, 8> Scalars0, Scalars1; + SmallVector<SDValue, 4> Operands(N->getNumOperands()); + EVT VT1 = N->getValueType(1); + EVT EltVT1 = VT1.getVectorElementType(); + + unsigned i; + for (i = 0; i != NE; ++i) { + for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { + SDValue Operand = N->getOperand(j); + EVT OperandVT = Operand.getValueType(); + + // A vector operand; extract a single element. + EVT OperandEltVT = OperandVT.getVectorElementType(); + Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, + Operand, getVectorIdxConstant(i, dl)); + } + + SDValue EltOp = getNode(N->getOpcode(), dl, {EltVT, EltVT1}, Operands); + Scalars0.push_back(EltOp); + Scalars1.push_back(EltOp.getValue(1)); + } + + SDValue Vec0 = getBuildVector(VT, dl, Scalars0); + SDValue Vec1 = getBuildVector(VT1, dl, Scalars1); + return getMergeValues({Vec0, Vec1}, dl); + } + + assert(N->getNumValues() == 1 && + "Can't unroll a vector with multiple results!"); + + SmallVector<SDValue, 8> Scalars; + SmallVector<SDValue, 4> Operands(N->getNumOperands()); + unsigned i; for (i= 0; i != NE; ++i) { for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { @@ -11533,7 +11889,7 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD, int64_t Offset = 0; if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset)) - return (Dist * Bytes == Offset); + return (Dist * (int64_t)Bytes == Offset); return false; } @@ -11573,6 +11929,21 @@ MaybeAlign SelectionDAG::InferPtrAlign(SDValue Ptr) const { return std::nullopt; } +/// Split the scalar node with EXTRACT_ELEMENT using the provided +/// VTs and return the low/high part. +std::pair<SDValue, SDValue> SelectionDAG::SplitScalar(const SDValue &N, + const SDLoc &DL, + const EVT &LoVT, + const EVT &HiVT) { + assert(!LoVT.isVector() && !HiVT.isVector() && !N.getValueType().isVector() && + "Split node must be a scalar type"); + SDValue Lo = + getNode(ISD::EXTRACT_ELEMENT, DL, LoVT, N, getIntPtrConstant(0, DL)); + SDValue Hi = + getNode(ISD::EXTRACT_ELEMENT, DL, HiVT, N, getIntPtrConstant(1, DL)); + return std::make_pair(Lo, Hi); +} + /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type /// which is split (or expanded) into two not necessarily identical pieces. std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const { @@ -11786,7 +12157,7 @@ SDValue BuildVectorSDNode::getSplatValue(const APInt &DemandedElts, } if (!Splatted) { - unsigned FirstDemandedIdx = DemandedElts.countTrailingZeros(); + unsigned FirstDemandedIdx = DemandedElts.countr_zero(); assert(getOperand(FirstDemandedIdx).isUndef() && "Can only have a splat without a constant for all undefs."); return getOperand(FirstDemandedIdx); @@ -11908,7 +12279,7 @@ bool BuildVectorSDNode::getConstantRawBits( // Extract raw src bits. 
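  // One APInt of SrcEltSizeInBits per source operand, with undef operands
  // recorded in a parallel bit vector.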
SmallVector<APInt> SrcBitElements(NumSrcOps, - APInt::getNullValue(SrcEltSizeInBits)); + APInt::getZero(SrcEltSizeInBits)); BitVector SrcUndeElements(NumSrcOps, false); for (unsigned I = 0; I != NumSrcOps; ++I) { @@ -11946,7 +12317,7 @@ void BuildVectorSDNode::recastRawBits(bool IsLittleEndian, unsigned NumDstOps = (NumSrcOps * SrcEltSizeInBits) / DstEltSizeInBits; DstUndefElements.clear(); DstUndefElements.resize(NumDstOps, false); - DstBitElements.assign(NumDstOps, APInt::getNullValue(DstEltSizeInBits)); + DstBitElements.assign(NumDstOps, APInt::getZero(DstEltSizeInBits)); // Concatenate src elements constant bits together into dst element. if (SrcEltSizeInBits <= DstEltSizeInBits) { @@ -12093,7 +12464,7 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) { Node->NumOperands = Vals.size(); Node->OperandList = Ops; if (!TLI->isSDNodeAlwaysUniform(Node)) { - IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, DA); + IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA); Node->SDNodeBits.IsDivergent = IsDivergent; } checkForCycles(Node); @@ -12147,9 +12518,53 @@ SDValue SelectionDAG::getNeutralElement(unsigned Opcode, const SDLoc &DL, return getConstantFP(NeutralAF, DL, VT); } + case ISD::FMINIMUM: + case ISD::FMAXIMUM: { + // Neutral element for fminimum is Inf or FLT_MAX, depending on FMF. + const fltSemantics &Semantics = EVTToAPFloatSemantics(VT); + APFloat NeutralAF = !Flags.hasNoInfs() ? APFloat::getInf(Semantics) + : APFloat::getLargest(Semantics); + if (Opcode == ISD::FMAXIMUM) + NeutralAF.changeSign(); + + return getConstantFP(NeutralAF, DL, VT); + } + } } +/// Helper used to make a call to a library function that has one argument of +/// pointer type. +/// +/// Such functions include 'fegetmode', 'fesetenv' and some others, which are +/// used to get or set floating-point state. They have one argument of pointer +/// type, which points to the memory region containing bits of the +/// floating-point state. The value returned by such function is ignored in the +/// created call. +/// +/// \param LibFunc Reference to library function (value of RTLIB::Libcall). +/// \param Ptr Pointer used to save/load state. +/// \param InChain Ingoing token chain. +/// \returns Outgoing chain token. +SDValue SelectionDAG::makeStateFunctionCall(unsigned LibFunc, SDValue Ptr, + SDValue InChain, + const SDLoc &DLoc) { + assert(InChain.getValueType() == MVT::Other && "Expected token chain"); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Ptr; + Entry.Ty = Ptr.getValueType().getTypeForEVT(*getContext()); + Args.push_back(Entry); + RTLIB::Libcall LC = static_cast<RTLIB::Libcall>(LibFunc); + SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC), + TLI->getPointerTy(getDataLayout())); + TargetLowering::CallLoweringInfo CLI(*this); + CLI.setDebugLoc(DLoc).setChain(InChain).setLibCallee( + TLI->getLibcallCallingConv(LC), Type::getVoidTy(*getContext()), Callee, + std::move(Args)); + return TLI->LowerCallTo(CLI).second; +} + void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { assert(From && To && "Invalid SDNode; empty source SDValue?"); auto I = SDEI.find(From); @@ -12158,8 +12573,90 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { // Use of operator[] on the DenseMap may cause an insertion, which invalidates // the iterator, hence the need to make a copy to prevent a use-after-free. 
- NodeExtraInfo Copy = I->second; - SDEI[To] = std::move(Copy); + NodeExtraInfo NEI = I->second; + if (LLVM_LIKELY(!NEI.PCSections)) { + // No deep copy required for the types of extra info set. + // + // FIXME: Investigate if other types of extra info also need deep copy. This + // depends on the types of nodes they can be attached to: if some extra info + // is only ever attached to nodes where a replacement To node is always the + // node where later use and propagation of the extra info has the intended + // semantics, no deep copy is required. + SDEI[To] = std::move(NEI); + return; + } + + // We need to copy NodeExtraInfo to all _new_ nodes that are being introduced + // through the replacement of From with To. Otherwise, replacements of a node + // (From) with more complex nodes (To and its operands) may result in lost + // extra info where the root node (To) is insignificant in further propagating + // and using extra info when further lowering to MIR. + // + // In the first step pre-populate the visited set with the nodes reachable + // from the old From node. This avoids copying NodeExtraInfo to parts of the + // DAG that is not new and should be left untouched. + SmallVector<const SDNode *> Leafs{From}; // Leafs reachable with VisitFrom. + DenseSet<const SDNode *> FromReach; // The set of nodes reachable from From. + auto VisitFrom = [&](auto &&Self, const SDNode *N, int MaxDepth) { + if (MaxDepth == 0) { + // Remember this node in case we need to increase MaxDepth and continue + // populating FromReach from this node. + Leafs.emplace_back(N); + return; + } + if (!FromReach.insert(N).second) + return; + for (const SDValue &Op : N->op_values()) + Self(Self, Op.getNode(), MaxDepth - 1); + }; + + // Copy extra info to To and all its transitive operands (that are new). + SmallPtrSet<const SDNode *, 8> Visited; + auto DeepCopyTo = [&](auto &&Self, const SDNode *N) { + if (FromReach.contains(N)) + return true; + if (!Visited.insert(N).second) + return true; + if (getEntryNode().getNode() == N) + return false; + for (const SDValue &Op : N->op_values()) { + if (!Self(Self, Op.getNode())) + return false; + } + // Copy only if entry node was not reached. + SDEI[N] = NEI; + return true; + }; + + // We first try with a lower MaxDepth, assuming that the path to common + // operands between From and To is relatively short. This significantly + // improves performance in the common case. The initial MaxDepth is big + // enough to avoid retry in the common case; the last MaxDepth is large + // enough to avoid having to use the fallback below (and protects from + // potential stack exhaustion from recursion). + for (int PrevDepth = 0, MaxDepth = 16; MaxDepth <= 1024; + PrevDepth = MaxDepth, MaxDepth *= 2, Visited.clear()) { + // StartFrom is the previous (or initial) set of leafs reachable at the + // previous maximum depth. + SmallVector<const SDNode *> StartFrom; + std::swap(StartFrom, Leafs); + for (const SDNode *N : StartFrom) + VisitFrom(VisitFrom, N, MaxDepth - PrevDepth); + if (LLVM_LIKELY(DeepCopyTo(DeepCopyTo, To))) + return; + // This should happen very rarely (reached the entry node). + LLVM_DEBUG(dbgs() << __func__ << ": MaxDepth=" << MaxDepth << " too low\n"); + assert(!Leafs.empty()); + } + + // This should not happen - but if it did, that means the subgraph reachable + // from From has depth greater or equal to maximum MaxDepth, and VisitFrom() + // could not visit all reachable common operands. Consequently, we were able + // to reach the entry node. 
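+  // Report, then fall back to a shallow copy rather than silently dropping
+  // the extra info.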
+ errs() << "warning: incomplete propagation of SelectionDAG::NodeExtraInfo\n"; + assert(false && "From subgraph too complex - increase max. MaxDepth?"); + // Best-effort fallback if assertions disabled. + SDEI[To] = std::move(NEI); } #ifndef NDEBUG diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 0bdfdac6a65f..9595da9d0d8a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -19,21 +19,21 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GCMetadata.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -67,6 +67,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/InlineAsm.h" @@ -96,6 +97,7 @@ #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/Local.h" #include <cstddef> #include <iterator> @@ -416,6 +418,10 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, return Val; if (PartEVT.isInteger() && ValueVT.isFloatingPoint()) return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + + // Vector/Vector bitcast (e.g. <2 x bfloat> -> <2 x half>). + if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) + return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); } // Promoted vector extract @@ -495,7 +501,6 @@ getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V, CallConv); - unsigned PartBits = PartVT.getSizeInBits(); unsigned OrigNumParts = NumParts; assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) && "Copying to an illegal type!"); @@ -511,6 +516,7 @@ getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, return; } + unsigned PartBits = PartVT.getSizeInBits(); if (NumParts * PartBits > ValueVT.getSizeInBits()) { // If the parts cover more bits than the value has, promote the value. 
if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { @@ -621,6 +627,8 @@ static SDValue widenVectorToPartType(SelectionDAG &DAG, SDValue Val, return SDValue(); EVT ValueVT = Val.getValueType(); + EVT PartEVT = PartVT.getVectorElementType(); + EVT ValueEVT = ValueVT.getVectorElementType(); ElementCount PartNumElts = PartVT.getVectorElementCount(); ElementCount ValueNumElts = ValueVT.getVectorElementCount(); @@ -628,9 +636,18 @@ static SDValue widenVectorToPartType(SelectionDAG &DAG, SDValue Val, // fixed/scalable properties. If a target needs to widen a fixed-length type // to a scalable one, it should be possible to use INSERT_SUBVECTOR below. if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) || - PartNumElts.isScalable() != ValueNumElts.isScalable() || - PartVT.getVectorElementType() != ValueVT.getVectorElementType()) + PartNumElts.isScalable() != ValueNumElts.isScalable()) + return SDValue(); + + // Have a try for bf16 because some targets share its ABI with fp16. + if (ValueEVT == MVT::bf16 && PartEVT == MVT::f16) { + assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) && + "Cannot widen to illegal type"); + Val = DAG.getNode(ISD::BITCAST, DL, + ValueVT.changeVectorElementType(MVT::f16), Val); + } else if (PartEVT != ValueEVT) { return SDValue(); + } // Widening a scalable vector to another scalable vector is done by inserting // the vector into a larger undef one. @@ -638,12 +655,11 @@ static SDValue widenVectorToPartType(SelectionDAG &DAG, SDValue Val, return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT), Val, DAG.getVectorIdxConstant(0, DL)); - EVT ElementVT = PartVT.getVectorElementType(); // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in // undef elements. SmallVector<SDValue, 16> Ops; DAG.ExtractVectorElements(Val, Ops); - SDValue EltUndef = DAG.getUNDEF(ElementVT); + SDValue EltUndef = DAG.getUNDEF(PartEVT); Ops.append((PartNumElts - ValueNumElts).getFixedValue(), EltUndef); // FIXME: Use CONCAT for 2x -> 4x. @@ -833,7 +849,7 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI, SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, const SDLoc &dl, SDValue &Chain, - SDValue *Flag, const Value *V) const { + SDValue *Glue, const Value *V) const { // A Value with type {} or [0 x %t] needs no registers. 
if (ValueVTs.empty()) return SDValue(); @@ -855,11 +871,11 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, Parts.resize(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue P; - if (!Flag) { + if (!Glue) { P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT); } else { - P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag); - *Flag = P.getValue(2); + P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Glue); + *Glue = P.getValue(2); } Chain = P.getValue(1); @@ -918,7 +934,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, } void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, - const SDLoc &dl, SDValue &Chain, SDValue *Flag, + const SDLoc &dl, SDValue &Chain, SDValue *Glue, const Value *V, ISD::NodeType PreferredExtendType) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -947,18 +963,18 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SmallVector<SDValue, 8> Chains(NumRegs); for (unsigned i = 0; i != NumRegs; ++i) { SDValue Part; - if (!Flag) { + if (!Glue) { Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]); } else { - Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag); - *Flag = Part.getValue(1); + Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Glue); + *Glue = Part.getValue(1); } Chains[i] = Part.getValue(0); } - if (NumRegs == 1 || Flag) - // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is + if (NumRegs == 1 || Glue) + // If NumRegs > 1 && Glue is used then the use of the last CopyToReg is // flagged to it. That is the CopyToReg nodes and the user are considered // a single scheduling unit. If we create a TokenFactor and return it as // chain, then the TokenFactor is both a predecessor (operand) of the @@ -1050,6 +1066,8 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, Context = DAG.getContext(); LPadToCallSiteMap.clear(); SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout()); + AssignmentTrackingEnabled = isAssignmentTrackingEnabled( + *DAG.getMachineFunction().getFunction().getParent()); } void SelectionDAGBuilder::clear() { @@ -1144,8 +1162,13 @@ void SelectionDAGBuilder::visit(const Instruction &I) { It != End; ++It) { auto *Var = FnVarLocs->getDILocalVariable(It->VariableID); dropDanglingDebugInfo(Var, It->Expr); - if (!handleDebugValue(It->V, Var, It->Expr, It->DL, SDNodeOrder, - /*IsVariadic=*/false)) + if (It->Values.isKillLocation(It->Expr)) { + handleKillDebugValue(Var, It->Expr, It->DL, SDNodeOrder); + continue; + } + SmallVector<Value *> Values(It->Values.location_ops()); + if (!handleDebugValue(Values, Var, It->Expr, It->DL, SDNodeOrder, + It->Values.hasArgList())) addDanglingDebugInfo(It, SDNodeOrder); } } @@ -1205,27 +1228,46 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { } } +static bool handleDanglingVariadicDebugInfo(SelectionDAG &DAG, + DILocalVariable *Variable, + DebugLoc DL, unsigned Order, + RawLocationWrapper Values, + DIExpression *Expression) { + if (!Values.hasArgList()) + return false; + // For variadic dbg_values we will now insert an undef. + // FIXME: We can potentially recover these! 
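  // Build one constant-undef location per operand so the variable remains
  // described (as a kill) rather than disappearing entirely.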
+ SmallVector<SDDbgOperand, 2> Locs; + for (const Value *V : Values.location_ops()) { + auto *Undef = UndefValue::get(V->getType()); + Locs.push_back(SDDbgOperand::fromConst(Undef)); + } + SDDbgValue *SDV = DAG.getDbgValueList(Variable, Expression, Locs, {}, + /*IsIndirect=*/false, DL, Order, + /*IsVariadic=*/true); + DAG.AddDbgValue(SDV, /*isParameter=*/false); + return true; +} + void SelectionDAGBuilder::addDanglingDebugInfo(const VarLocInfo *VarLoc, unsigned Order) { - DanglingDebugInfoMap[VarLoc->V].emplace_back(VarLoc, Order); + if (!handleDanglingVariadicDebugInfo( + DAG, + const_cast<DILocalVariable *>(DAG.getFunctionVarLocs() + ->getVariable(VarLoc->VariableID) + .getVariable()), + VarLoc->DL, Order, VarLoc->Values, VarLoc->Expr)) { + DanglingDebugInfoMap[VarLoc->Values.getVariableLocationOp(0)].emplace_back( + VarLoc, Order); + } } void SelectionDAGBuilder::addDanglingDebugInfo(const DbgValueInst *DI, unsigned Order) { // We treat variadic dbg_values differently at this stage. - if (DI->hasArgList()) { - // For variadic dbg_values we will now insert an undef. - // FIXME: We can potentially recover these! - SmallVector<SDDbgOperand, 2> Locs; - for (const Value *V : DI->getValues()) { - auto Undef = UndefValue::get(V->getType()); - Locs.push_back(SDDbgOperand::fromConst(Undef)); - } - SDDbgValue *SDV = DAG.getDbgValueList( - DI->getVariable(), DI->getExpression(), Locs, {}, - /*IsIndirect=*/false, DI->getDebugLoc(), Order, /*IsVariadic=*/true); - DAG.AddDbgValue(SDV, /*isParameter=*/false); - } else { + if (!handleDanglingVariadicDebugInfo( + DAG, DI->getVariable(), DI->getDebugLoc(), Order, + DI->getWrappedLocation(), DI->getExpression())) { // TODO: Dangling debug info will eventually either be resolved or produce // an Undef DBG_VALUE. However in the resolution case, a gap may appear // between the original dbg.value location and its resolved DBG_VALUE, @@ -1382,6 +1424,17 @@ void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) { << "\n"); } +void SelectionDAGBuilder::handleKillDebugValue(DILocalVariable *Var, + DIExpression *Expr, + DebugLoc DbgLoc, + unsigned Order) { + Value *Poison = PoisonValue::get(Type::getInt1Ty(*Context)); + DIExpression *NewExpr = + const_cast<DIExpression *>(DIExpression::convertToUndefExpression(Expr)); + handleDebugValue(Poison, Var, NewExpr, DbgLoc, Order, + /*IsVariadic*/ false); +} + bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values, DILocalVariable *Var, DIExpression *Expr, DebugLoc DbgLoc, @@ -1569,7 +1622,7 @@ SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) { // If we already have an SDValue for this value, use it. SDValue &N = NodeMap[V]; if (N.getNode()) { - if (isa<ConstantSDNode>(N) || isa<ConstantFPSDNode>(N)) { + if (isIntOrFPConstant(N)) { // Remove the debug location from the node as the node is about to be used // in a location which may differ from the original debug location. This // is relevant to Constant and ConstantFP nodes because they can appear @@ -1606,7 +1659,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { TLI.getPointerTy(DAG.getDataLayout(), AS)); } - if (match(C, m_VScale(DAG.getDataLayout()))) + if (match(C, m_VScale())) return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)); if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) @@ -1976,8 +2029,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { // registers the usual way. 
SmallVector<EVT, 1> PtrValueVTs; ComputeValueVTs(TLI, DL, - F->getReturnType()->getPointerTo( - DAG.getDataLayout().getAllocaAddrSpace()), + PointerType::get(F->getContext(), + DAG.getDataLayout().getAllocaAddrSpace()), PtrValueVTs); SDValue RetPtr = @@ -1987,7 +2040,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { SmallVector<EVT, 4> ValueVTs, MemVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &MemVTs, - &Offsets); + &Offsets, 0); unsigned NumValues = ValueVTs.size(); SmallVector<SDValue, 4> Chains(NumValues); @@ -2123,7 +2176,8 @@ void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) { DenseMap<const Value *, Register>::iterator VMI = FuncInfo.ValueMap.find(V); if (VMI != FuncInfo.ValueMap.end()) { - assert(!V->use_empty() && "Unused value assigned virtual registers!"); + assert((!V->use_empty() || isa<CallBrInst>(V)) && + "Unused value assigned virtual registers!"); CopyValueToVirtualRegister(V, VMI->second); } } @@ -2424,10 +2478,12 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) { // If this is not a fall-through branch or optimizations are switched off, // emit the branch. - if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) - DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), - MVT::Other, getControlRoot(), - DAG.getBasicBlock(Succ0MBB))); + if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) { + auto Br = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, + getControlRoot(), DAG.getBasicBlock(Succ0MBB)); + setValue(&I, Br); + DAG.setRoot(Br); + } return; } @@ -2901,14 +2957,13 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB, // would need to be to shift a 1 bit in that position. Cmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), - ShiftOp, DAG.getConstant(countTrailingZeros(B.Mask), dl, VT), + ShiftOp, DAG.getConstant(llvm::countr_zero(B.Mask), dl, VT), ISD::SETEQ); } else if (PopCount == BB.Range) { // There is only one zero bit in the range, test for it directly. Cmp = DAG.getSetCC( dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), - ShiftOp, DAG.getConstant(countTrailingOnes(B.Mask), dl, VT), - ISD::SETNE); + ShiftOp, DAG.getConstant(llvm::countr_one(B.Mask), dl, VT), ISD::SETNE); } else { // Make desired shift SDValue SwitchVal = DAG.getNode(ISD::SHL, dl, VT, @@ -2950,6 +3005,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { // catchswitch for successors. MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)]; const BasicBlock *EHPadBB = I.getSuccessor(1); + MachineBasicBlock *EHPadMBB = FuncInfo.MBBMap[EHPadBB]; // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. @@ -2974,6 +3030,10 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { case Intrinsic::seh_scope_begin: case Intrinsic::seh_try_end: case Intrinsic::seh_scope_end: + if (EHPadMBB) + // a block referenced by EH table + // so dtor-funclet not removed by opts + EHPadMBB->setMachineBlockAddressTaken(); break; case Intrinsic::experimental_patchpoint_void: case Intrinsic::experimental_patchpoint_i64: @@ -3338,6 +3398,9 @@ void SelectionDAGBuilder::visitSelect(const User &I) { if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) Flags.copyFMF(*FPOp); + Flags.setUnpredictable( + cast<SelectInst>(I).getMetadata(LLVMContext::MD_unpredictable)); + // Min/max matching is only viable if all output VTs are the same. 
if (all_equal(ValueVTs)) { EVT VT = ValueVTs[0]; @@ -3355,6 +3418,9 @@ void SelectionDAGBuilder::visitSelect(const User &I) { bool UseScalarMinMax = VT.isVector() && !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT); + // ValueTracking's select pattern matching does not account for -0.0, + // so we can't lower to FMINIMUM/FMAXIMUM because those nodes specify that + // -0.0 is less than +0.0. Value *LHS, *RHS; auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS); ISD::NodeType Opc = ISD::DELETED_NODE; @@ -3366,34 +3432,26 @@ void SelectionDAGBuilder::visitSelect(const User &I) { case SPF_FMINNUM: switch (SPR.NaNBehavior) { case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); - case SPNB_RETURNS_NAN: Opc = ISD::FMINIMUM; break; + case SPNB_RETURNS_NAN: break; case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break; - case SPNB_RETURNS_ANY: { - if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT)) + case SPNB_RETURNS_ANY: + if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT) || + (UseScalarMinMax && + TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()))) Opc = ISD::FMINNUM; - else if (TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT)) - Opc = ISD::FMINIMUM; - else if (UseScalarMinMax) - Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ? - ISD::FMINNUM : ISD::FMINIMUM; break; } - } break; case SPF_FMAXNUM: switch (SPR.NaNBehavior) { case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?"); - case SPNB_RETURNS_NAN: Opc = ISD::FMAXIMUM; break; + case SPNB_RETURNS_NAN: break; case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break; case SPNB_RETURNS_ANY: - - if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT)) + if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT) || + (UseScalarMinMax && + TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()))) Opc = ISD::FMAXNUM; - else if (TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT)) - Opc = ISD::FMAXIMUM; - else if (UseScalarMinMax) - Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ? 
- ISD::FMAXNUM : ISD::FMAXIMUM; break; } break; @@ -4123,7 +4181,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Type *Ty = I.getType(); SmallVector<EVT, 4> ValueVTs, MemVTs; SmallVector<uint64_t, 4> Offsets; - ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets); + ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets, 0); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -4196,7 +4254,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { Chains[ChainI] = L.getValue(1); if (MemVTs[i] != ValueVTs[i]) - L = DAG.getZExtOrTrunc(L, dl, ValueVTs[i]); + L = DAG.getPtrExtOrTrunc(L, dl, ValueVTs[i]); Values[i] = L; } @@ -4222,7 +4280,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) { SmallVector<uint64_t, 4> Offsets; const Value *SrcV = I.getOperand(0); ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &Offsets); + SrcV->getType(), ValueVTs, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -4258,7 +4316,7 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { SmallVector<EVT, 4> ValueVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty, - ValueVTs, &Offsets); + ValueVTs, &Offsets, 0); assert(ValueVTs.size() == 1 && Offsets[0] == 0 && "expect a single EVT for swifterror"); @@ -4295,7 +4353,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) { SmallVector<EVT, 4> ValueVTs, MemVTs; SmallVector<uint64_t, 4> Offsets; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), - SrcV->getType(), ValueVTs, &MemVTs, &Offsets); + SrcV->getType(), ValueVTs, &MemVTs, &Offsets, 0); unsigned NumValues = ValueVTs.size(); if (NumValues == 0) return; @@ -4447,11 +4505,13 @@ static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index, if (BasePtr->getType()->isVectorTy() || !IndexVal->getType()->isVectorTy()) return false; - uint64_t ScaleVal = DL.getTypeAllocSize(GEP->getResultElementType()); + TypeSize ScaleVal = DL.getTypeAllocSize(GEP->getResultElementType()); + if (ScaleVal.isScalable()) + return false; // Target may not support the required addressing mode. if (ScaleVal != 1 && - !TLI.isLegalScaleForGatherScatter(ScaleVal, ElemSize)) + !TLI.isLegalScaleForGatherScatter(ScaleVal.getFixedValue(), ElemSize)) return false; Base = SDB->getValue(BasePtr); @@ -4919,8 +4979,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, Result = lowerRangeToAssertZExt(DAG, I, Result); MaybeAlign Alignment = I.getRetAlign(); - if (!Alignment) - Alignment = F->getAttributes().getRetAlignment(); + // Insert `assertalign` node if there's an alignment. if (InsertAssertAlign && Alignment) { Result = @@ -5504,13 +5563,8 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL, PromVT = EVT::getVectorVT(Ctx, PromVT, VT.getVectorElementCount()); } else llvm_unreachable("Wrong VT for DIVFIX?"); - if (Signed) { - LHS = DAG.getSExtOrTrunc(LHS, DL, PromVT); - RHS = DAG.getSExtOrTrunc(RHS, DL, PromVT); - } else { - LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT); - RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT); - } + LHS = DAG.getExtOrTrunc(Signed, LHS, DL, PromVT); + RHS = DAG.getExtOrTrunc(Signed, RHS, DL, PromVT); EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout()); // For saturating operations, we need to shift up the LHS to get the // proper saturation width, and then shift down again afterwards. 
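The expandDivFix hunk above collapses the paired sext/zext calls into DAG.getExtOrTrunc(Signed, ...), so a single flag now selects the extension kind during the promotion. As a rough illustration of the widening recipe used here (promote both operands, pre-shift the dividend by the scale, divide), the following is a minimal standalone C++ sketch; divFix is a hypothetical name, and the saturating shift-up/shift-down handled by the surrounding code is deliberately omitted:

#include <cstdint>
#include <cstdio>

// Fixed-point division by widening: promote both operands to a wider type,
// sign- or zero-extending depending on Signed (the analogue of
// getExtOrTrunc), scale the dividend up, then divide. The result is
// truncated back to the original width, with no saturation.
static int32_t divFix(int32_t LHS, int32_t RHS, unsigned Scale, bool Signed) {
  if (Signed) {
    int64_t L = static_cast<int64_t>(LHS) * (static_cast<int64_t>(1) << Scale);
    return static_cast<int32_t>(L / static_cast<int64_t>(RHS));
  }
  uint64_t L = static_cast<uint64_t>(static_cast<uint32_t>(LHS)) << Scale;
  return static_cast<int32_t>(L / static_cast<uint32_t>(RHS));
}

int main() {
  // Q16.16 example: 2.5 / 2.0 == 1.25, i.e. 0x28000 / 0x20000 -> 0x14000.
  std::printf("0x%x\n", static_cast<unsigned>(
                            divFix(0x28000, 0x20000, 16, /*Signed=*/true)));
  return 0;
}

In the real lowering the pre-shift also has to reach the saturation width so that the later shift back down can detect overflow, which is what the comment at the end of the hunk above refers to.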
@@ -5767,6 +5821,26 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( if (!Op) return false; + // If the expression refers to the entry value of an Argument, use the + // corresponding livein physical register. As per the Verifier, this is only + // allowed for swiftasync Arguments. + if (Op->isReg() && Expr->isEntryValue()) { + assert(Arg->hasAttribute(Attribute::AttrKind::SwiftAsync)); + auto OpReg = Op->getReg(); + for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins()) + if (OpReg == VirtReg || OpReg == PhysReg) { + SDDbgValue *SDV = DAG.getVRegDbgValue( + Variable, Expr, PhysReg, + Kind != FuncArgumentDbgValueKind::Value /*is indirect*/, DL, + SDNodeOrder); + DAG.AddDbgValue(SDV, false /*treat as dbg.declare byval parameter*/); + return true; + } + LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but " + "couldn't find a physical register\n"); + return true; + } + assert(Variable->isValidLocationForIntrinsic(DL) && "Expected inlined-at fields to agree"); MachineInstr *NewMI = nullptr; @@ -5873,7 +5947,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitTargetIntrinsic(I, Intrinsic); return; case Intrinsic::vscale: { - match(&I, m_VScale(DAG.getDataLayout())); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getVScale(sdl, VT, APInt(VT.getSizeInBits(), 1))); return; @@ -6092,14 +6165,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, DAG.setRoot(Res.getValue(1)); return; } - case Intrinsic::dbg_addr: case Intrinsic::dbg_declare: { - // Debug intrinsics are handled seperately in assignment tracking mode. - if (isAssignmentTrackingEnabled(*I.getFunction()->getParent())) + const auto &DI = cast<DbgDeclareInst>(I); + // Debug intrinsics are handled separately in assignment tracking mode. + // Some intrinsics are handled right after Argument lowering. + if (AssignmentTrackingEnabled || + FuncInfo.PreprocessedDbgDeclares.count(&DI)) return; - // Assume dbg.addr and dbg.declare can not currently use DIArgList, i.e. - // they are non-variadic. - const auto &DI = cast<DbgVariableIntrinsic>(I); + // Assume dbg.declare can not currently use DIArgList, i.e. + // it is non-variadic. assert(!DI.hasArgList() && "Only dbg.value should currently use DIArgList"); DILocalVariable *Variable = DI.getVariable(); DIExpression *Expression = DI.getExpression(); @@ -6118,37 +6192,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, bool isParameter = Variable->isParameter() || isa<Argument>(Address); - // Check if this variable can be described by a frame index, typically - // either as a static alloca or a byval parameter. - int FI = std::numeric_limits<int>::max(); - if (const auto *AI = - dyn_cast<AllocaInst>(Address->stripInBoundsConstantOffsets())) { - if (AI->isStaticAlloca()) { - auto I = FuncInfo.StaticAllocaMap.find(AI); - if (I != FuncInfo.StaticAllocaMap.end()) - FI = I->second; - } - } else if (const auto *Arg = dyn_cast<Argument>( - Address->stripInBoundsConstantOffsets())) { - FI = FuncInfo.getArgumentFrameIndex(Arg); - } - - // llvm.dbg.addr is control dependent and always generates indirect - // DBG_VALUE instructions. llvm.dbg.declare is handled as a frame index in - // the MachineFunction variable table. 
- if (FI != std::numeric_limits<int>::max()) { - if (Intrinsic == Intrinsic::dbg_addr) { - SDDbgValue *SDV = DAG.getFrameIndexDbgValue( - Variable, Expression, FI, getRoot().getNode(), /*IsIndirect*/ true, - dl, SDNodeOrder); - DAG.AddDbgValue(SDV, isParameter); - } else { - LLVM_DEBUG(dbgs() << "Skipping " << DI - << " (variable info stashed in MF side table)\n"); - } - return; - } - SDValue &N = NodeMap[Address]; if (!N.getNode() && isa<Argument>(Address)) // Check unused arguments map. @@ -6198,13 +6241,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::dbg_assign: { // Debug intrinsics are handled seperately in assignment tracking mode. - assert(isAssignmentTrackingEnabled(*I.getFunction()->getParent()) && - "expected assignment tracking to be enabled"); - return; + if (AssignmentTrackingEnabled) + return; + // If assignment tracking hasn't been enabled then fall through and treat + // the dbg.assign as a dbg.value. + [[fallthrough]]; } case Intrinsic::dbg_value: { // Debug intrinsics are handled seperately in assignment tracking mode. - if (isAssignmentTrackingEnabled(*I.getFunction()->getParent())) + if (AssignmentTrackingEnabled) return; const DbgValueInst &DI = cast<DbgValueInst>(I); assert(DI.getVariable() && "Missing variable"); @@ -6212,11 +6257,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, DILocalVariable *Variable = DI.getVariable(); DIExpression *Expression = DI.getExpression(); dropDanglingDebugInfo(Variable, Expression); - SmallVector<Value *, 4> Values(DI.getValues()); - if (Values.empty()) + + if (DI.isKillLocation()) { + handleKillDebugValue(Variable, Expression, DI.getDebugLoc(), SDNodeOrder); return; + } - if (llvm::is_contained(Values, nullptr)) + SmallVector<Value *, 4> Values(DI.getValues()); + if (Values.empty()) return; bool IsVariadic = DI.hasArgList(); @@ -6413,6 +6461,20 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags)); return; + case Intrinsic::ldexp: + setValue(&I, DAG.getNode(ISD::FLDEXP, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; + case Intrinsic::frexp: { + SmallVector<EVT, 2> ValueVTs; + ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); + SDVTList VTs = DAG.getVTList(ValueVTs); + setValue(&I, + DAG.getNode(ISD::FFREXP, sdl, VTs, getValue(I.getArgOperand(0)))); + return; + } case Intrinsic::arithmetic_fence: { setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl, getValue(I.getArgOperand(0)).getValueType(), @@ -6515,7 +6577,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const DataLayout DLayout = DAG.getDataLayout(); EVT DestVT = TLI.getValueType(DLayout, I.getType()); EVT ArgVT = TLI.getValueType(DLayout, I.getArgOperand(0)->getType()); - unsigned Test = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + FPClassTest Test = static_cast<FPClassTest>( + cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); MachineFunction &MF = DAG.getMachineFunction(); const Function &F = MF.getFunction(); SDValue Op = getValue(I.getArgOperand(0)); @@ -6536,6 +6599,64 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, V); return; } + case Intrinsic::get_fpenv: { + const DataLayout DLayout = DAG.getDataLayout(); + EVT EnvVT = TLI.getValueType(DLayout, I.getType()); + Align TempAlign = DAG.getEVTAlign(EnvVT); + SDValue Chain = getRoot(); + // Use GET_FPENV if it is legal or 
custom. Otherwise use memory-based node + // and temporary storage in stack. + if (TLI.isOperationLegalOrCustom(ISD::GET_FPENV, EnvVT)) { + Res = DAG.getNode( + ISD::GET_FPENV, sdl, + DAG.getVTList(TLI.getValueType(DAG.getDataLayout(), I.getType()), + MVT::Other), + Chain); + } else { + SDValue Temp = DAG.CreateStackTemporary(EnvVT, TempAlign.value()); + int SPFI = cast<FrameIndexSDNode>(Temp.getNode())->getIndex(); + auto MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, + TempAlign); + Chain = DAG.getGetFPEnv(Chain, sdl, Temp, EnvVT, MMO); + Res = DAG.getLoad(EnvVT, sdl, Chain, Temp, MPI); + } + setValue(&I, Res); + DAG.setRoot(Res.getValue(1)); + return; + } + case Intrinsic::set_fpenv: { + const DataLayout DLayout = DAG.getDataLayout(); + SDValue Env = getValue(I.getArgOperand(0)); + EVT EnvVT = Env.getValueType(); + Align TempAlign = DAG.getEVTAlign(EnvVT); + SDValue Chain = getRoot(); + // If SET_FPENV is custom or legal, use it. Otherwise use loading + // environment from memory. + if (TLI.isOperationLegalOrCustom(ISD::SET_FPENV, EnvVT)) { + Chain = DAG.getNode(ISD::SET_FPENV, sdl, MVT::Other, Chain, Env); + } else { + // Allocate space in stack, copy environment bits into it and use this + // memory in SET_FPENV_MEM. + SDValue Temp = DAG.CreateStackTemporary(EnvVT, TempAlign.value()); + int SPFI = cast<FrameIndexSDNode>(Temp.getNode())->getIndex(); + auto MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + Chain = DAG.getStore(Chain, sdl, Env, Temp, MPI, TempAlign, + MachineMemOperand::MOStore); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, + TempAlign); + Chain = DAG.getSetFPEnv(Chain, sdl, Temp, EnvVT, MMO); + } + DAG.setRoot(Chain); + return; + } + case Intrinsic::reset_fpenv: + DAG.setRoot(DAG.getNode(ISD::RESET_FPENV, sdl, MVT::Other, getRoot())); + return; case Intrinsic::pcmarker: { SDValue Tmp = getValue(I.getArgOperand(0)); DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp)); @@ -7020,6 +7141,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, llvm_unreachable("instrprof failed to lower a cover"); case Intrinsic::instrprof_increment: llvm_unreachable("instrprof failed to lower an increment"); + case Intrinsic::instrprof_timestamp: + llvm_unreachable("instrprof failed to lower a timestamp"); case Intrinsic::instrprof_value_profile: llvm_unreachable("instrprof failed to lower a value profiling call"); case Intrinsic::localescape: { @@ -7093,10 +7216,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::xray_customevent: { // Here we want to make sure that the intrinsic behaves as if it has a - // specific calling convention, and only for x86_64. - // FIXME: Support other platforms later. + // specific calling convention. const auto &Triple = DAG.getTarget().getTargetTriple(); - if (Triple.getArch() != Triple::x86_64) + if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64) return; SmallVector<SDValue, 8> Ops; @@ -7123,10 +7245,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } case Intrinsic::xray_typedevent: { // Here we want to make sure that the intrinsic behaves as if it has a - // specific calling convention, and only for x86_64. - // FIXME: Support other platforms later. 
+ // specific calling convention. const auto &Triple = DAG.getTarget().getTargetTriple(); - if (Triple.getArch() != Triple::x86_64) + if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64) return; SmallVector<SDValue, 8> Ops; @@ -7174,6 +7295,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_fmax: case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_fmaximum: + case Intrinsic::vector_reduce_fminimum: visitVectorReduce(I, Intrinsic); return; @@ -7285,6 +7408,40 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, SetCC); return; } + case Intrinsic::experimental_get_vector_length: { + assert(cast<ConstantInt>(I.getOperand(1))->getSExtValue() > 0 && + "Expected positive VF"); + unsigned VF = cast<ConstantInt>(I.getOperand(1))->getZExtValue(); + bool IsScalable = cast<ConstantInt>(I.getOperand(2))->isOne(); + + SDValue Count = getValue(I.getOperand(0)); + EVT CountVT = Count.getValueType(); + + if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) { + visitTargetIntrinsic(I, Intrinsic); + return; + } + + // Expand to a umin between the trip count and the maximum elements the type + // can hold. + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Extend the trip count to at least the result VT. + if (CountVT.bitsLT(VT)) { + Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count); + CountVT = VT; + } + + SDValue MaxEVL = DAG.getElementCount(sdl, CountVT, + ElementCount::get(VF, IsScalable)); + + SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL); + // Clip to the result type if needed. + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin); + + setValue(&I, Trunc); + return; + } case Intrinsic::vector_insert: { SDValue Vec = getValue(I.getOperand(0)); SDValue SubVec = getValue(I.getOperand(1)); @@ -7324,6 +7481,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::experimental_vector_splice: visitVectorSplice(I); return; + case Intrinsic::callbr_landingpad: + visitCallBrLandingPad(I); + return; + case Intrinsic::experimental_vector_interleave2: + visitVectorInterleave(I); + return; + case Intrinsic::experimental_vector_deinterleave2: + visitVectorDeinterleave(I); + return; } } @@ -7442,12 +7608,12 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { std::optional<unsigned> ResOPC; switch (VPIntrin.getIntrinsicID()) { case Intrinsic::vp_ctlz: { - bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(3))->isOne(); + bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(1))->isOne(); ResOPC = IsZeroUndef ? ISD::VP_CTLZ_ZERO_UNDEF : ISD::VP_CTLZ; break; } case Intrinsic::vp_cttz: { - bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(3))->isOne(); + bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(1))->isOne(); ResOPC = IsZeroUndef ? 
ISD::VP_CTTZ_ZERO_UNDEF : ISD::VP_CTTZ; break; } @@ -7472,21 +7638,21 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { return *ResOPC; } -void SelectionDAGBuilder::visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues) { +void SelectionDAGBuilder::visitVPLoad( + const VPIntrinsic &VPIntrin, EVT VT, + const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); Value *PtrOperand = VPIntrin.getArgOperand(0); MaybeAlign Alignment = VPIntrin.getPointerAlignment(); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range); SDValue LD; - bool AddToChain = true; // Do not serialize variable-length loads of constant memory with // anything. if (!Alignment) Alignment = DAG.getEVTAlign(VT); MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); - AddToChain = !AA || !AA->pointsToConstantMemory(ML); + bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, @@ -7498,8 +7664,9 @@ void SelectionDAGBuilder::visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, setValue(&VPIntrin, LD); } -void SelectionDAGBuilder::visitVPGather(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues) { +void SelectionDAGBuilder::visitVPGather( + const VPIntrinsic &VPIntrin, EVT VT, + const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(0); @@ -7539,8 +7706,8 @@ void SelectionDAGBuilder::visitVPGather(const VPIntrinsic &VPIntrin, EVT VT, setValue(&VPIntrin, LD); } -void SelectionDAGBuilder::visitVPStore(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues) { +void SelectionDAGBuilder::visitVPStore( + const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); Value *PtrOperand = VPIntrin.getArgOperand(1); EVT VT = OpValues[0].getValueType(); @@ -7561,8 +7728,8 @@ void SelectionDAGBuilder::visitVPStore(const VPIntrinsic &VPIntrin, setValue(&VPIntrin, ST); } -void SelectionDAGBuilder::visitVPScatter(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues) { +void SelectionDAGBuilder::visitVPScatter( + const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(1); @@ -7604,7 +7771,8 @@ void SelectionDAGBuilder::visitVPScatter(const VPIntrinsic &VPIntrin, } void SelectionDAGBuilder::visitVPStridedLoad( - const VPIntrinsic &VPIntrin, EVT VT, SmallVectorImpl<SDValue> &OpValues) { + const VPIntrinsic &VPIntrin, EVT VT, + const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); Value *PtrOperand = VPIntrin.getArgOperand(0); MaybeAlign Alignment = VPIntrin.getPointerAlignment(); @@ -7629,7 +7797,7 @@ void SelectionDAGBuilder::visitVPStridedLoad( } void SelectionDAGBuilder::visitVPStridedStore( - const VPIntrinsic &VPIntrin, SmallVectorImpl<SDValue> &OpValues) { + const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) { SDLoc DL = getCurSDLoc(); Value *PtrOperand = VPIntrin.getArgOperand(1); EVT VT = OpValues[0].getValueType(); @@ -7790,10 +7958,8 @@ void SelectionDAGBuilder::visitVectorPredicationIntrinsic( case ISD::VP_CTLZ_ZERO_UNDEF: case ISD::VP_CTTZ: case 
ISD::VP_CTTZ_ZERO_UNDEF: { - // Pop is_zero_poison operand for cp.ctlz/cttz or - // is_int_min_poison operand for vp.abs. - OpValues.pop_back(); - SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues); + SDValue Result = + DAG.getNode(Opcode, DL, VTs, {OpValues[0], OpValues[2], OpValues[3]}); setValue(&VPIntrin, Result); break; } @@ -8068,10 +8234,7 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I, bool IsSigned) { EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(), I.getType(), true); - if (IsSigned) - Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT); - else - Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT); + Value = DAG.getExtOrTrunc(IsSigned, Value, getCurSDLoc(), VT); setValue(&I, Value); } @@ -8206,14 +8369,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) { // DAG::getMemcpy needs Alignment to be defined. Align Alignment = std::min(DstAlign, SrcAlign); - bool isVol = false; SDLoc sdl = getCurSDLoc(); // In the mempcpy context we need to pass in a false value for isTailCall // because the return pointer needs to be adjusted by the size of // the copied memory. - SDValue Root = isVol ? getRoot() : getMemoryRoot(); - SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, isVol, false, + SDValue Root = getMemoryRoot(); + SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, false, false, /*isTailCall=*/false, MachinePointerInfo(I.getArgOperand(0)), MachinePointerInfo(I.getArgOperand(1)), @@ -8498,6 +8660,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FEXP2)) return; break; + case LibFunc_ldexp: + case LibFunc_ldexpf: + case LibFunc_ldexpl: + if (visitBinaryFloatCall(I, ISD::FLDEXP)) + return; + break; case LibFunc_memcmp: if (visitMemCmpBCmpCall(I)) return; @@ -8897,7 +9065,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, // We won't need to flush pending loads if this asm doesn't touch // memory and is nonvolatile. - SDValue Flag, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot(); + SDValue Glue, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot(); bool EmitEHLabels = isa<InvokeInst>(Call); if (EmitEHLabels) { @@ -9124,7 +9292,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, SDLoc dl = getCurSDLoc(); // Use the produced MatchedRegs object to - MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, &Call); + MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call); MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, true, OpInfo.getMatchedOperand(), dl, DAG, AsmNodeOperands); @@ -9202,10 +9370,6 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, } if (OpInfo.ConstraintType == TargetLowering::C_Address) { - assert(InOperandVal.getValueType() == - TLI.getPointerTy(DAG.getDataLayout()) && - "Address operands expect pointer values"); - unsigned ConstraintID = TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode); assert(ConstraintID != InlineAsm::Constraint_Unknown && @@ -9258,7 +9422,7 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, SDLoc dl = getCurSDLoc(); - OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, + OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call); OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0, @@ -9278,12 +9442,12 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, // Finish up input operands. Set the input chain and add the flag last. 
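A rough standalone sketch of the extend-or-truncate semantics that the new DAG.getExtOrTrunc(IsSigned, ...) call in processIntegerCallValue expresses; extOrTrunc here is a hypothetical stand-in on plain integers, not LLVM API:

#include <cassert>
#include <cstdint>

// Extend or truncate a srcBits-wide value to dstBits, signed or unsigned,
// in one helper (the value is assumed zero-padded above srcBits on entry).
uint64_t extOrTrunc(bool isSigned, uint64_t v, unsigned srcBits,
                    unsigned dstBits) {
  if (dstBits <= srcBits)                    // truncate: keep the low dstBits
    return dstBits == 64 ? v : v & ((1ULL << dstBits) - 1);
  if (!isSigned)                             // zero-extend: already padded
    return v;
  uint64_t signBit = 1ULL << (srcBits - 1);  // sign-extend: replicate sign bit
  uint64_t ext = (v ^ signBit) - signBit;
  return dstBits == 64 ? ext : ext & ((1ULL << dstBits) - 1);
}

int main() {
  assert(extOrTrunc(false, 0xFF, 8, 16) == 0x00FF); // zext i8 -> i16
  assert(extOrTrunc(true, 0xFF, 8, 16) == 0xFFFF);  // sext i8 -1 -> i16
  assert(extOrTrunc(true, 0x1234, 16, 8) == 0x34);  // trunc i16 -> i8
}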
AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; - if (Flag.getNode()) AsmNodeOperands.push_back(Flag); + if (Glue.getNode()) AsmNodeOperands.push_back(Glue); unsigned ISDOpc = IsCallBr ? ISD::INLINEASM_BR : ISD::INLINEASM; Chain = DAG.getNode(ISDOpc, getCurSDLoc(), DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); - Flag = Chain.getValue(1); + Glue = Chain.getValue(1); // Do additional work to generate outputs. @@ -9341,11 +9505,11 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call, case TargetLowering::C_Register: case TargetLowering::C_RegisterClass: Val = OpInfo.AssignedRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), - Chain, &Flag, &Call); + Chain, &Glue, &Call); break; case TargetLowering::C_Immediate: case TargetLowering::C_Other: - Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(), + Val = TLI.LowerAsmOutputForConstraint(Chain, Glue, getCurSDLoc(), OpInfo, DAG); break; case TargetLowering::C_Memory: @@ -9576,7 +9740,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value."); - SDValue Chain, InFlag, Callee; + SDValue Chain, InGlue, Callee; SmallVector<SDValue, 32> Ops; SDLoc DL = getCurSDLoc(); @@ -9593,11 +9757,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); - InFlag = Chain.getValue(1); + InGlue = Chain.getValue(1); // Add the STACKMAP operands, starting with DAG house-keeping. Ops.push_back(Chain); - Ops.push_back(InFlag); + Ops.push_back(InGlue); // Add the <id>, <numShadowBytes> operands. // @@ -9621,9 +9785,9 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // Create the STACKMAP node. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(ISD::STACKMAP, DL, NodeTys, Ops); - InFlag = Chain.getValue(1); + InGlue = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, DL); + Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL); // Stackmaps don't generate values, so nothing goes into the NodeMap. @@ -9847,6 +10011,12 @@ void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, case Intrinsic::vector_reduce_fmin: Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); break; + case Intrinsic::vector_reduce_fmaximum: + Res = DAG.getNode(ISD::VECREDUCE_FMAXIMUM, dl, VT, Op1, SDFlags); + break; + case Intrinsic::vector_reduce_fminimum: + Res = DAG.getNode(ISD::VECREDUCE_FMINIMUM, dl, VT, Op1, SDFlags); + break; default: llvm_unreachable("Unhandled vector reduce intrinsic"); } @@ -9880,7 +10050,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector<EVT, 4> RetTys; SmallVector<uint64_t, 4> Offsets; auto &DL = CLI.DAG.getDataLayout(); - ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets); + ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets, 0); if (CLI.IsPostTypeLegalization) { // If we are lowering a libcall after legalization, split the return type. @@ -10200,7 +10370,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // The instruction result is the result of loading from the // hidden sret parameter. 
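The VECREDUCE_FMAXIMUM/VECREDUCE_FMINIMUM nodes added above differ from the existing fmax/fmin reductions in NaN and signed-zero handling. A standalone sketch of the scalar ordering, as the LangRef describes llvm.maximum versus llvm.maxnum; maxnum and maximum here are local helpers, not LLVM API:

#include <cassert>
#include <cmath>

// llvm.vector.reduce.fmax follows maxnum: a NaN operand is ignored when the
// other operand is a number. The new llvm.vector.reduce.fmaximum follows
// IEEE-754 maximum: NaN propagates and -0.0 orders below +0.0.
double maxnum(double a, double b) { return std::fmax(a, b); }

double maximum(double a, double b) {
  if (std::isnan(a) || std::isnan(b))
    return NAN;                         // propagate NaN
  if (a == 0.0 && b == 0.0)
    return std::signbit(a) ? b : a;     // prefer +0.0 over -0.0
  return a > b ? a : b;
}

int main() {
  assert(maxnum(1.0, NAN) == 1.0);            // NaN suppressed
  assert(std::isnan(maximum(1.0, NAN)));      // NaN propagated
  assert(!std::signbit(maximum(-0.0, 0.0)));  // +0.0 wins over -0.0
}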
SmallVector<EVT, 1> PVTs; - Type *PtrRetTy = OrigRetTy->getPointerTo(DL.getAllocaAddrSpace()); + Type *PtrRetTy = + PointerType::get(OrigRetTy->getContext(), DL.getAllocaAddrSpace()); ComputeValueVTs(*this, DL, PtrRetTy, PVTs); assert(PVTs.size() == 1 && "Pointers should fit in one register"); @@ -10452,9 +10623,9 @@ static void tryToElideArgumentCopy( DenseMap<int, int> &ArgCopyElisionFrameIndexMap, SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs, ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg, - SDValue ArgVal, bool &ArgHasUses) { + ArrayRef<SDValue> ArgVals, bool &ArgHasUses) { // Check if this is a load from a fixed stack object. - auto *LNode = dyn_cast<LoadSDNode>(ArgVal); + auto *LNode = dyn_cast<LoadSDNode>(ArgVals[0]); if (!LNode) return; auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()); @@ -10497,7 +10668,8 @@ static void tryToElideArgumentCopy( MFI.setIsImmutableObjectIndex(FixedIndex, false); AllocaIndex = FixedIndex; ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex}); - Chains.push_back(ArgVal.getValue(1)); + for (SDValue ArgVal : ArgVals) + Chains.push_back(ArgVal.getValue(1)); // Avoid emitting code for the store implementing the copy. const StoreInst *SI = ArgCopyIter->second.second; @@ -10527,8 +10699,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Put in an sret pointer parameter before all the other parameters. SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(*TLI, DAG.getDataLayout(), - F.getReturnType()->getPointerTo( - DAG.getDataLayout().getAllocaAddrSpace()), + PointerType::get(F.getContext(), + DAG.getDataLayout().getAllocaAddrSpace()), ValueVTs); // NOTE: Assuming that a pointer will never break down to more than one VT @@ -10721,8 +10893,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // from the sret argument into it. SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(*TLI, DAG.getDataLayout(), - F.getReturnType()->getPointerTo( - DAG.getDataLayout().getAllocaAddrSpace()), + PointerType::get(F.getContext(), + DAG.getDataLayout().getAllocaAddrSpace()), ValueVTs); MVT VT = ValueVTs[0].getSimpleVT(); MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); @@ -10758,9 +10930,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // Elide the copying store if the target loaded this argument from a // suitable fixed stack object. if (Ins[i].Flags.isCopyElisionCandidate()) { + unsigned NumParts = 0; + for (EVT VT : ValueVTs) + NumParts += TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), + F.getCallingConv(), VT); + tryToElideArgumentCopy(*FuncInfo, Chains, ArgCopyElisionFrameIndexMap, ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg, - InVals[i], ArgHasUses); + ArrayRef(&InVals[i], NumParts), ArgHasUses); } // If this argument is unused then remember its value. It is used to generate @@ -10872,12 +11049,12 @@ void SelectionDAGISel::LowerArguments(const Function &F) { // If any argument copy elisions occurred and we have debug info, update the // stale frame indices used in the dbg.declare variable info table. 
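The point of the copy-elision change above is that one IR argument can lower to several consecutive entries in InVals, so tryToElideArgumentCopy now receives the whole run rather than just the first part. A toy model of the NumParts bookkeeping, under the simplifying assumption that a register covers RegBits bits; numRegistersFor stands in for getNumRegistersForCallingConv:

#include <cassert>
#include <span>
#include <vector>

// How many registers a value of `bits` bits occupies when each register
// holds regBits bits (a crude stand-in for the calling-convention query).
unsigned numRegistersFor(unsigned bits, unsigned regBits) {
  return (bits + regBits - 1) / regBits;
}

int main() {
  std::vector<unsigned> valueVTBits = {128, 64}; // e.g. an i128 piece + an i64
  const unsigned RegBits = 64;
  unsigned NumParts = 0;
  for (unsigned Bits : valueVTBits)
    NumParts += numRegistersFor(Bits, RegBits);
  assert(NumParts == 3);

  std::vector<int> InVals = {10, 11, 12, 13, 14}; // all lowered parts in order
  unsigned i = 1;                                 // first part of this argument
  std::span<int> ArgVals(&InVals[i], NumParts);   // the run handed to the helper
  assert(ArgVals.size() == 3 && ArgVals[0] == 11);
}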
- MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo(); - if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) { - for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) { - auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot); + if (!ArgCopyElisionFrameIndexMap.empty()) { + for (MachineFunction::VariableDbgInfo &VI : + MF->getInStackSlotVariableDbgInfo()) { + auto I = ArgCopyElisionFrameIndexMap.find(VI.getStackSlot()); if (I != ArgCopyElisionFrameIndexMap.end()) - VI.Slot = I->second; + VI.updateStackSlot(I->second); } } @@ -11554,6 +11731,62 @@ void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) { setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask)); } +void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) { + auto DL = getCurSDLoc(); + SDValue InVec = getValue(I.getOperand(0)); + EVT OutVT = + InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + + unsigned OutNumElts = OutVT.getVectorMinNumElements(); + + // ISD Node needs the input vectors split into two equal parts + SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getVectorIdxConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec, + DAG.getVectorIdxConstant(OutNumElts, DL)); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines. + if (OutVT.isFixedLengthVector()) { + SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(0, 2, OutNumElts)); + SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi, + createStrideMask(1, 2, OutNumElts)); + SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc()); + setValue(&I, Res); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, + DAG.getVTList(OutVT, OutVT), Lo, Hi); + setValue(&I, Res); +} + +void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) { + auto DL = getCurSDLoc(); + EVT InVT = getValue(I.getOperand(0)).getValueType(); + SDValue InVec0 = getValue(I.getOperand(0)); + SDValue InVec1 = getValue(I.getOperand(1)); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing + // legalisation and combines. + if (OutVT.isFixedLengthVector()) { + unsigned NumElts = InVT.getVectorMinNumElements(); + SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1); + setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT), + createInterleaveMask(NumElts, 2))); + return; + } + + SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, + DAG.getVTList(InVT, InVT), InVec0, InVec1); + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0), + Res.getValue(1)); + setValue(&I, Res); +} + void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) { SmallVector<EVT, 4> ValueVTs; ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(), @@ -11599,3 +11832,113 @@ void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) { Mask.push_back(Idx + i); setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask)); } + +// Consider the following MIR after SelectionDAG, which produces output in +// phyregs in the first case or virtregs in the second case. 
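The fixed-length paths of visitVectorDeinterleave/visitVectorInterleave above are plain shuffles. A self-contained sketch of what createStrideMask and createInterleaveMask produce, and why one shuffle inverts the other; shuffle, strideMask and interleaveMask are local helpers that mirror, not call, the LLVM ones:

#include <cassert>
#include <vector>

// A two-input shuffle modeled as indexing into the concatenated sources.
std::vector<int> shuffle(const std::vector<int> &Concat,
                         const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int Idx : Mask)
    Out.push_back(Concat[Idx]);
  return Out;
}

std::vector<int> strideMask(int Start, int Stride, int N) {
  std::vector<int> M;
  for (int i = 0; i < N; ++i)
    M.push_back(Start + i * Stride);  // Start, Start+Stride, ...
  return M;
}

std::vector<int> interleaveMask(int N) {
  std::vector<int> M;
  for (int i = 0; i < N; ++i) {
    M.push_back(i);      // lane i of the first source
    M.push_back(N + i);  // lane i of the second source
  }
  return M;
}

int main() {
  // Deinterleave {a0,b0,a1,b1,...}: stride-2 shuffles pick even/odd lanes.
  std::vector<int> In = {10, 20, 11, 21, 12, 22, 13, 23};
  std::vector<int> Even = shuffle(In, strideMask(0, 2, 4));
  std::vector<int> Odd = shuffle(In, strideMask(1, 2, 4));
  std::vector<int> ExpectedEven = {10, 11, 12, 13};
  std::vector<int> ExpectedOdd = {20, 21, 22, 23};
  assert(Even == ExpectedEven && Odd == ExpectedOdd);
  // Interleave undoes it: concat the halves and apply 0, N, 1, N+1, ...
  std::vector<int> Concat = Even;
  Concat.insert(Concat.end(), Odd.begin(), Odd.end());
  assert(shuffle(Concat, interleaveMask(4)) == In);
}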
+// +// INLINEASM_BR ..., implicit-def $ebx, ..., implicit-def $edx +// %5:gr32 = COPY $ebx +// %6:gr32 = COPY $edx +// %1:gr32 = COPY %6:gr32 +// %0:gr32 = COPY %5:gr32 +// +// INLINEASM_BR ..., def %5:gr32, ..., def %6:gr32 +// %1:gr32 = COPY %6:gr32 +// %0:gr32 = COPY %5:gr32 +// +// Given %0, we'd like to return $ebx in the first case and %5 in the second. +// Given %1, we'd like to return $edx in the first case and %6 in the second. +// +// If a callbr has outputs, it will have a single mapping in FuncInfo.ValueMap +// to a single virtreg (such as %0). The remaining outputs monotonically +// increase in virtreg number from there. If a callbr has no outputs, then it +// should not have a corresponding callbr landingpad; in fact, the callbr +// landingpad would not even be able to refer to such a callbr. +static Register FollowCopyChain(MachineRegisterInfo &MRI, Register Reg) { + MachineInstr *MI = MRI.def_begin(Reg)->getParent(); + // There is definitely at least one copy. + assert(MI->getOpcode() == TargetOpcode::COPY && + "start of copy chain MUST be COPY"); + Reg = MI->getOperand(1).getReg(); + MI = MRI.def_begin(Reg)->getParent(); + // There may be an optional second copy. + if (MI->getOpcode() == TargetOpcode::COPY) { + assert(Reg.isVirtual() && "expected COPY of virtual register"); + Reg = MI->getOperand(1).getReg(); + assert(Reg.isPhysical() && "expected COPY of physical register"); + MI = MRI.def_begin(Reg)->getParent(); + } + // The start of the chain must be an INLINEASM_BR. + assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR && + "end of copy chain MUST be INLINEASM_BR"); + return Reg; +} + +// We must do this walk rather than the simpler +// setValue(&I, getCopyFromRegs(CBR, CBR->getType())); +// otherwise we will end up with copies of virtregs only valid along direct +// edges. +void SelectionDAGBuilder::visitCallBrLandingPad(const CallInst &I) { + SmallVector<EVT, 8> ResultVTs; + SmallVector<SDValue, 8> ResultValues; + const auto *CBR = + cast<CallBrInst>(I.getParent()->getUniquePredecessor()->getTerminator()); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + + unsigned InitialDef = FuncInfo.ValueMap[CBR]; + SDValue Chain = DAG.getRoot(); + + // Re-parse the asm constraints string. + TargetLowering::AsmOperandInfoVector TargetConstraints = + TLI.ParseConstraints(DAG.getDataLayout(), TRI, *CBR); + for (auto &T : TargetConstraints) { + SDISelAsmOperandInfo OpInfo(T); + if (OpInfo.Type != InlineAsm::isOutput) + continue; + + // Pencil in OpInfo.ConstraintType and OpInfo.ConstraintVT based on the + // individual constraint. + TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); + + switch (OpInfo.ConstraintType) { + case TargetLowering::C_Register: + case TargetLowering::C_RegisterClass: { + // Fill in OpInfo.AssignedRegs.Regs. + getRegistersForValue(DAG, getCurSDLoc(), OpInfo, OpInfo); + + // getRegistersForValue may produce 1 to many registers based on whether + // the OpInfo.ConstraintVT is legal on the target or not. + for (size_t i = 0, e = OpInfo.AssignedRegs.Regs.size(); i != e; ++i) { + Register OriginalDef = FollowCopyChain(MRI, InitialDef++); + if (Register::isPhysicalRegister(OriginalDef)) + FuncInfo.MBB->addLiveIn(OriginalDef); + // Update the assigned registers to use the original defs. 
+ OpInfo.AssignedRegs.Regs[i] = OriginalDef; + } + + SDValue V = OpInfo.AssignedRegs.getCopyFromRegs( + DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, CBR); + ResultValues.push_back(V); + ResultVTs.push_back(OpInfo.ConstraintVT); + break; + } + case TargetLowering::C_Other: { + SDValue Flag; + SDValue V = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(), + OpInfo, DAG); + ++InitialDef; + ResultValues.push_back(V); + ResultVTs.push_back(OpInfo.ConstraintVT); + break; + } + default: + break; + } + } + SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(), + DAG.getVTList(ResultVTs), ResultValues); + setValue(&I, V); +} diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index bf2111013461..f2496f24973a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/CodeGen/TargetLowering.h" @@ -30,7 +31,6 @@ #include "llvm/Support/BranchProbability.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -119,25 +119,25 @@ class SelectionDAGBuilder { : Info(VarLoc), SDNodeOrder(SDNO) {} DILocalVariable *getVariable(const FunctionVarLocs *Locs) const { - if (Info.is<VarLocTy>()) - return Locs->getDILocalVariable(Info.get<VarLocTy>()->VariableID); - return Info.get<DbgValTy>()->getVariable(); + if (isa<VarLocTy>(Info)) + return Locs->getDILocalVariable(cast<VarLocTy>(Info)->VariableID); + return cast<DbgValTy>(Info)->getVariable(); } DIExpression *getExpression() const { - if (Info.is<VarLocTy>()) - return Info.get<VarLocTy>()->Expr; - return Info.get<DbgValTy>()->getExpression(); + if (isa<VarLocTy>(Info)) + return cast<VarLocTy>(Info)->Expr; + return cast<DbgValTy>(Info)->getExpression(); } Value *getVariableLocationOp(unsigned Idx) const { assert(Idx == 0 && "Dangling variadic debug values not supported yet"); - if (Info.is<VarLocTy>()) - return Info.get<VarLocTy>()->V; - return Info.get<DbgValTy>()->getVariableLocationOp(Idx); + if (isa<VarLocTy>(Info)) + return cast<VarLocTy>(Info)->Values.getVariableLocationOp(Idx); + return cast<DbgValTy>(Info)->getVariableLocationOp(Idx); } DebugLoc getDebugLoc() const { - if (Info.is<VarLocTy>()) - return Info.get<VarLocTy>()->DL; - return Info.get<DbgValTy>()->getDebugLoc(); + if (isa<VarLocTy>(Info)) + return cast<VarLocTy>(Info)->DL; + return cast<DbgValTy>(Info)->getDebugLoc(); } unsigned getSDNodeOrder() const { return SDNodeOrder; } @@ -175,6 +175,10 @@ class SelectionDAGBuilder { /// We defer handling these until we do see it. MapVector<const Value*, DanglingDebugInfoVector> DanglingDebugInfoMap; + /// Cache the module flag for whether we should use debug-info assignment + /// tracking. + bool AssignmentTrackingEnabled = false; + public: /// Loads are not emitted to the program immediately. We bunch them up and /// then emit token factor nodes when possible. 
This allows us to get simple @@ -243,7 +247,7 @@ public: SelectionDAG &DAG; AAResults *AA = nullptr; AssumptionCache *AC = nullptr; - const TargetLibraryInfo *LibInfo; + const TargetLibraryInfo *LibInfo = nullptr; class SDAGSwitchLowering : public SwitchCG::SwitchLowering { public: @@ -257,7 +261,7 @@ public: } private: - SelectionDAGBuilder *SDB; + SelectionDAGBuilder *SDB = nullptr; }; // Data related to deferred switch lowerings. Used to construct additional @@ -279,7 +283,7 @@ public: SwiftErrorValueTracking &SwiftError; /// Garbage collection metadata for the function. - GCFunctionInfo *GFI; + GCFunctionInfo *GFI = nullptr; /// Map a landing pad to the call site indexes. DenseMap<MachineBasicBlock *, SmallVector<unsigned, 4>> LPadToCallSiteMap; @@ -288,7 +292,7 @@ public: /// a tail call. In this case, no subsequent DAG nodes should be created. bool HasTailCall = false; - LLVMContext *Context; + LLVMContext *Context = nullptr; SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, SwiftErrorValueTracking &swifterror, CodeGenOpt::Level ol) @@ -372,6 +376,10 @@ public: DIExpression *Expr, DebugLoc DbgLoc, unsigned Order, bool IsVariadic); + /// Create a record for a kill location debug intrinsic. + void handleKillDebugValue(DILocalVariable *Var, DIExpression *Expr, + DebugLoc DbgLoc, unsigned Order); + /// Evict any dangling debug information, attempting to salvage it first. void resolveOrClearDbgInfo(); @@ -534,6 +542,7 @@ private: // These all get lowered before this pass. void visitInvoke(const InvokeInst &I); void visitCallBr(const CallBrInst &I); + void visitCallBrLandingPad(const CallInst &I); void visitResume(const ResumeInst &I); void visitUnary(const User &I, unsigned Opcode); @@ -620,17 +629,17 @@ private: void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPStore(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPGather(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPScatter(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPStridedLoad(const VPIntrinsic &VPIntrin, EVT VT, - SmallVectorImpl<SDValue> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPStridedStore(const VPIntrinsic &VPIntrin, - SmallVectorImpl<SDValue> &OpValues); + const SmallVectorImpl<SDValue> &OpValues); void visitVPCmp(const VPCmpIntrinsic &VPIntrin); void visitVectorPredicationIntrinsic(const VPIntrinsic &VPIntrin); @@ -648,6 +657,8 @@ private: void visitVectorReduce(const CallInst &I, unsigned Intrinsic); void visitVectorReverse(const CallInst &I); void visitVectorSplice(const CallInst &I); + void visitVectorInterleave(const CallInst &I); + void visitVectorDeinterleave(const CallInst &I); void visitStepVector(const CallInst &I); void visitUserOp1(const Instruction &I) { @@ -669,7 +680,6 @@ private: /// EmitFuncArgumentDbgValue. enum class FuncArgumentDbgValueKind { Value, // This was originally a llvm.dbg.value. - Addr, // This was originally a llvm.dbg.addr. Declare, // This was originally a llvm.dbg.declare. }; @@ -760,7 +770,7 @@ struct RegsForValue { /// updates them for the output Chain/Flag. 
If the Flag pointer is NULL, no /// flag is used. SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo, - const SDLoc &dl, SDValue &Chain, SDValue *Flag, + const SDLoc &dl, SDValue &Chain, SDValue *Glue, const Value *V = nullptr) const; /// Emit a series of CopyToReg nodes that copies the specified value into the @@ -769,7 +779,7 @@ struct RegsForValue { /// flag is used. If V is not nullptr, then it is used in printing better /// diagnostic messages on error. void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl, - SDValue &Chain, SDValue *Flag, const Value *V = nullptr, + SDValue &Chain, SDValue *Glue, const Value *V = nullptr, ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const; /// Add this value to the specified inlineasm node operand list. This adds the diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index fe4261291fc5..03a1ead5bbb4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -40,7 +41,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -283,6 +283,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UMIN: return "umin"; case ISD::UMAX: return "umax"; + case ISD::FLDEXP: return "fldexp"; + case ISD::STRICT_FLDEXP: return "strict_fldexp"; + case ISD::FFREXP: return "ffrexp"; case ISD::FPOWI: return "fpowi"; case ISD::STRICT_FPOWI: return "strict_fpowi"; case ISD::SETCC: return "setcc"; @@ -297,6 +300,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CONCAT_VECTORS: return "concat_vectors"; case ISD::INSERT_SUBVECTOR: return "insert_subvector"; case ISD::EXTRACT_SUBVECTOR: return "extract_subvector"; + case ISD::VECTOR_DEINTERLEAVE: return "vector_deinterleave"; + case ISD::VECTOR_INTERLEAVE: return "vector_interleave"; case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector"; case ISD::VECTOR_SHUFFLE: return "vector_shuffle"; case ISD::VECTOR_SPLICE: return "vector_splice"; @@ -307,7 +312,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CARRY_FALSE: return "carry_false"; case ISD::ADDC: return "addc"; case ISD::ADDE: return "adde"; - case ISD::ADDCARRY: return "addcarry"; + case ISD::UADDO_CARRY: return "uaddo_carry"; case ISD::SADDO_CARRY: return "saddo_carry"; case ISD::SADDO: return "saddo"; case ISD::UADDO: return "uaddo"; @@ -317,7 +322,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::UMULO: return "umulo"; case ISD::SUBC: return "subc"; case ISD::SUBE: return "sube"; - case ISD::SUBCARRY: return "subcarry"; + case ISD::USUBO_CARRY: return "usubo_carry"; case ISD::SSUBO_CARRY: return "ssubo_carry"; case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; @@ -429,6 +434,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { // Floating point environment manipulation case 
ISD::GET_ROUNDING: return "get_rounding"; case ISD::SET_ROUNDING: return "set_rounding"; + case ISD::GET_FPENV: return "get_fpenv"; + case ISD::SET_FPENV: return "set_fpenv"; + case ISD::RESET_FPENV: return "reset_fpenv"; + case ISD::GET_FPENV_MEM: return "get_fpenv_mem"; + case ISD::SET_FPENV_MEM: return "set_fpenv_mem"; // Bit manipulation case ISD::ABS: return "abs"; @@ -491,6 +501,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; + case ISD::VECREDUCE_FMAXIMUM: return "vecreduce_fmaximum"; + case ISD::VECREDUCE_FMINIMUM: return "vecreduce_fminimum"; case ISD::STACKMAP: return "stackmap"; case ISD::PATCHPOINT: @@ -698,7 +710,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { else OS << "<null>"; } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) { - OS << ":" << N->getVT().getEVTString(); + OS << ":" << N->getVT(); } else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) { OS << "<"; @@ -713,7 +725,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { case ISD::ZEXTLOAD: OS << ", zext"; break; } if (doExt) - OS << " from " << LD->getMemoryVT().getEVTString(); + OS << " from " << LD->getMemoryVT(); const char *AM = getIndexedModeName(LD->getAddressingMode()); if (*AM) @@ -725,7 +737,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { printMemOperand(OS, *ST->getMemOperand(), G); if (ST->isTruncatingStore()) - OS << ", trunc to " << ST->getMemoryVT().getEVTString(); + OS << ", trunc to " << ST->getMemoryVT(); const char *AM = getIndexedModeName(ST->getAddressingMode()); if (*AM) @@ -745,7 +757,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { case ISD::ZEXTLOAD: OS << ", zext"; break; } if (doExt) - OS << " from " << MLd->getMemoryVT().getEVTString(); + OS << " from " << MLd->getMemoryVT(); const char *AM = getIndexedModeName(MLd->getAddressingMode()); if (*AM) @@ -760,7 +772,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { printMemOperand(OS, *MSt->getMemOperand(), G); if (MSt->isTruncatingStore()) - OS << ", trunc to " << MSt->getMemoryVT().getEVTString(); + OS << ", trunc to " << MSt->getMemoryVT(); const char *AM = getIndexedModeName(MSt->getAddressingMode()); if (*AM) @@ -782,7 +794,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { case ISD::ZEXTLOAD: OS << ", zext"; break; } if (doExt) - OS << " from " << MGather->getMemoryVT().getEVTString(); + OS << " from " << MGather->getMemoryVT(); auto Signed = MGather->isIndexSigned() ? "signed" : "unsigned"; auto Scaled = MGather->isIndexScaled() ? "scaled" : "unscaled"; @@ -794,7 +806,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { printMemOperand(OS, *MScatter->getMemOperand(), G); if (MScatter->isTruncatingStore()) - OS << ", trunc to " << MScatter->getMemoryVT().getEVTString(); + OS << ", trunc to " << MScatter->getMemoryVT(); auto Signed = MScatter->isIndexSigned() ? "signed" : "unsigned"; auto Scaled = MScatter->isIndexScaled() ? "scaled" : "unscaled"; @@ -849,6 +861,12 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { Dbg->print(OS); } else if (getHasDebugValue()) OS << " [NoOfDbgValues>0]"; + + if (const auto *MD = G ? 
G->getPCSections(this) : nullptr) { + OS << " [pcsections "; + MD->printAsOperand(OS, G->getMachineFunction().getFunction().getParent()); + OS << ']'; + } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 902f46115557..35abd990f968 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -25,13 +25,12 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" #include "llvm/CodeGen/CodeGenCommonISel.h" #include "llvm/CodeGen/FastISel.h" @@ -49,6 +48,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SchedulerRegistry.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -60,6 +60,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -67,6 +68,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstIterator.h" @@ -91,7 +93,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/Timer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -425,9 +426,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); - CurDAG->init(*MF, *ORE, this, LibInfo, - getAnalysisIfAvailable<LegacyDivergenceAnalysis>(), PSI, BFI, - FnVarLocs); + UniformityInfo *UA = nullptr; + if (auto *UAPass = getAnalysisIfAvailable<UniformityInfoWrapperPass>()) + UA = &UAPass->getUniformityInfo(); + CurDAG->init(*MF, *ORE, this, LibInfo, UA, PSI, BFI, FnVarLocs); FuncInfo->set(Fn, *MF, CurDAG); SwiftError->setFunction(*MF); @@ -1291,6 +1293,43 @@ bool SelectionDAGISel::PrepareEHLandingPad() { return true; } +// Mark and Report IPToState for each Block under IsEHa +void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) { + MachineModuleInfo &MMI = MF->getMMI(); + llvm::WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo(); + if (!EHInfo) + return; + for (auto MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = &*MBBI; + const BasicBlock *BB = MBB->getBasicBlock(); + int State = EHInfo->BlockToStateMap[BB]; + if (BB->getFirstMayFaultInst()) { + // Report IP range only for blocks with Faulty inst + auto MBBb = MBB->getFirstNonPHI(); + MachineInstr *MIb = &*MBBb; + if (MIb->isTerminator()) + continue; + + // Insert EH Labels + MCSymbol *BeginLabel = 
MMI.getContext().createTempSymbol(); + MCSymbol *EndLabel = MMI.getContext().createTempSymbol(); + EHInfo->addIPToStateRange(State, BeginLabel, EndLabel); + BuildMI(*MBB, MBBb, SDB->getCurDebugLoc(), + TII->get(TargetOpcode::EH_LABEL)) + .addSym(BeginLabel); + auto MBBe = MBB->instr_end(); + MachineInstr *MIe = &*(--MBBe); + // insert before (possible multiple) terminators + while (MIe->isTerminator()) + MIe = &*(--MBBe); + ++MBBe; + BuildMI(*MBB, MBBe, SDB->getCurDebugLoc(), + TII->get(TargetOpcode::EH_LABEL)) + .addSym(EndLabel); + } + } +} + /// isFoldedOrDeadInstruction - Return true if the specified instruction is /// side-effect free and is either dead or folded into a generated instruction. /// Return false if it needs to be emitted. @@ -1303,9 +1342,42 @@ static bool isFoldedOrDeadInstruction(const Instruction *I, !FuncInfo.isExportedInst(I); // Exported instrs must be computed. } -static void processDbgDeclare(FunctionLoweringInfo &FuncInfo, +static bool processIfEntryValueDbgDeclare(FunctionLoweringInfo &FuncInfo, + const Value *Arg, DIExpression *Expr, + DILocalVariable *Var, + DebugLoc DbgLoc) { + if (!Expr->isEntryValue() || !isa<Argument>(Arg)) + return false; + + auto ArgIt = FuncInfo.ValueMap.find(Arg); + if (ArgIt == FuncInfo.ValueMap.end()) + return false; + Register ArgVReg = ArgIt->getSecond(); + + // Find the corresponding livein physical register to this argument. + for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins()) + if (VirtReg == ArgVReg) { + FuncInfo.MF->setVariableDbgInfo(Var, Expr, PhysReg, DbgLoc); + LLVM_DEBUG(dbgs() << "processDbgDeclare: setVariableDbgInfo Var=" << *Var + << ", Expr=" << *Expr << ", MCRegister=" << PhysReg + << ", DbgLoc=" << DbgLoc << "\n"); + return true; + } + return false; +} + +static bool processDbgDeclare(FunctionLoweringInfo &FuncInfo, const Value *Address, DIExpression *Expr, DILocalVariable *Var, DebugLoc DbgLoc) { + if (!Address) { + LLVM_DEBUG(dbgs() << "processDbgDeclares skipping " << *Var + << " (bad address)\n"); + return false; + } + + if (processIfEntryValueDbgDeclare(FuncInfo, Address, Expr, Var, DbgLoc)) + return true; + MachineFunction *MF = FuncInfo.MF; const DataLayout &DL = MF->getDataLayout(); @@ -1329,7 +1401,7 @@ static void processDbgDeclare(FunctionLoweringInfo &FuncInfo, FI = FuncInfo.getArgumentFrameIndex(Arg); if (FI == std::numeric_limits<int>::max()) - return; + return false; if (Offset.getBoolValue()) Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, @@ -1339,24 +1411,17 @@ static void processDbgDeclare(FunctionLoweringInfo &FuncInfo, << ", Expr=" << *Expr << ", FI=" << FI << ", DbgLoc=" << DbgLoc << "\n"); MF->setVariableDbgInfo(Var, Expr, FI, DbgLoc); + return true; } /// Collect llvm.dbg.declare information. This is done after argument lowering /// in case the declarations refer to arguments. 
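The interesting step in processIfEntryValueDbgDeclare above is the livein scan: the virtual register an argument was mapped to is matched against the function's (physreg, virtreg) livein pairs to recover the physical register for the entry-value location. A minimal standalone version of that lookup, with plain ints standing in for Register:

#include <cassert>
#include <optional>
#include <utility>
#include <vector>

// Given the livein pairs {phys, virt}, find the physical register that was
// copied into the argument's virtual register at function entry.
std::optional<int>
liveinPhysRegFor(const std::vector<std::pair<int, int>> &Liveins,
                 int ArgVReg) {
  for (auto [PhysReg, VirtReg] : Liveins)
    if (VirtReg == ArgVReg)
      return PhysReg;
  return std::nullopt; // not a livein-backed argument
}

int main() {
  std::vector<std::pair<int, int>> Liveins = {{7, 1000}, {8, 1001}};
  assert(liveinPhysRegFor(Liveins, 1001) == 8);
  assert(!liveinPhysRegFor(Liveins, 1002));
}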
static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) { - for (const BasicBlock &BB : *FuncInfo.Fn) { - for (const Instruction &I : BB) { - if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) { - Value *Address = DI->getAddress(); - if (!Address) { - LLVM_DEBUG(dbgs() << "processDbgDeclares skipping " << *DI - << " (bad address)\n"); - continue; - } - processDbgDeclare(FuncInfo, Address, DI->getExpression(), - DI->getVariable(), DI->getDebugLoc()); - } - } + for (const auto &I : instructions(*FuncInfo.Fn)) { + const auto *DI = dyn_cast<DbgDeclareInst>(&I); + if (DI && processDbgDeclare(FuncInfo, DI->getAddress(), DI->getExpression(), + DI->getVariable(), DI->getDebugLoc())) + FuncInfo.PreprocessedDbgDeclares.insert(DI); } } @@ -1367,9 +1432,11 @@ static void processSingleLocVars(FunctionLoweringInfo &FuncInfo, FunctionVarLocs const *FnVarLocs) { for (auto It = FnVarLocs->single_locs_begin(), End = FnVarLocs->single_locs_end(); - It != End; ++It) - processDbgDeclare(FuncInfo, It->V, It->Expr, + It != End; ++It) { + assert(!It->Values.hasArgList() && "Single loc variadic ops not supported"); + processDbgDeclare(FuncInfo, It->Values.getVariableLocationOp(0), It->Expr, FnVarLocs->getDILocalVariable(It->VariableID), It->DL); + } } void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { @@ -1408,7 +1475,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { Fn.getSubprogram(), &Fn.getEntryBlock()); R << "FastISel didn't lower all arguments: " - << ore::NV("Prototype", Fn.getType()); + << ore::NV("Prototype", Fn.getFunctionType()); reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); // Use SelectionDAG argument lowering @@ -1646,6 +1713,10 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { ElidedArgCopyInstrs.clear(); } + // AsynchEH: Report Block State under -AsynchEH + if (Fn.getParent()->getModuleFlag("eh-asynch")) + reportIPToStateForBlocks(MF); + SP.copyToMachineFrameInfo(MF->getFrameInfo()); SwiftError->propagateVRegs(); @@ -2273,7 +2344,7 @@ void SelectionDAGISel::Select_STACKMAP(SDNode *N) { // Stash the chain and glue operands so we can move them to the end. SDValue Chain = *It++; - SDValue InFlag = *It++; + SDValue InGlue = *It++; // <id> operand. SDValue ID = *It++; @@ -2290,7 +2361,7 @@ void SelectionDAGISel::Select_STACKMAP(SDNode *N) { pushStackMapLiveVariable(Ops, *It, DL); Ops.push_back(Chain); - Ops.push_back(InFlag); + Ops.push_back(InGlue); SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Glue); CurDAG->SelectNodeTo(N, TargetOpcode::STACKMAP, NodeTys, Ops); @@ -3240,7 +3311,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, if (CaseSize == 0) break; // Otherwise, execute the case we found. 
- LLVM_DEBUG(dbgs() << " TypeSwitch[" << EVT(CurNodeVT).getEVTString() + LLVM_DEBUG(dbgs() << " TypeSwitch[" << CurNodeVT << "] from " << SwitchStart << " to " << MatcherIndex << '\n'); continue; diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp index 57bfe344dbab..5afd05648772 100644 --- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -42,7 +43,6 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include <cassert> @@ -258,8 +258,7 @@ static bool willLowerDirectly(SDValue Incoming) { if (Incoming.getValueType().getSizeInBits() > 64) return false; - return (isa<ConstantSDNode>(Incoming) || isa<ConstantFPSDNode>(Incoming) || - Incoming.isUndef()); + return isIntOrFPConstant(Incoming) || Incoming.isUndef(); } /// Try to find existing copies of the incoming values in stack slots used for @@ -490,7 +489,7 @@ lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot, Ops.push_back(std::get<0>(Res)); if (auto *MMO = std::get<2>(Res)) MemRefs.push_back(MMO); - Chain = std::get<1>(Res);; + Chain = std::get<1>(Res); Builder.DAG.setRoot(Chain); } @@ -1250,7 +1249,7 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) { // All the reloads are independent and are reading memory only modified by // statepoints (i.e. no other aliasing stores); informing SelectionDAG of - // this this let's CSE kick in for free and allows reordering of + // this lets CSE kick in for free and allows reordering of // instructions if possible. The lowering for statepoint sets the root, // so this is ordering all reloads with the either // a) the statepoint node itself, or diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 8d4c8802f71c..a84d35a6ea4e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -504,6 +504,11 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op, SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); + // Early-out if we've ended up calling an undemanded node, leave this to + // constant folding. + if (DemandedBits.isZero() || DemandedElts.isZero()) + return false; + // Do target-specific constant optimization. if (targetShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO)) return TLO.New.getNode(); @@ -552,18 +557,19 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op, /// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be /// generalized for targets with other types of implicit widening casts. 
bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, - const APInt &Demanded, + const APInt &DemandedBits, TargetLoweringOpt &TLO) const { assert(Op.getNumOperands() == 2 && "ShrinkDemandedOp only supports binary operators!"); assert(Op.getNode()->getNumValues() == 1 && "ShrinkDemandedOp only supports nodes with one result!"); + EVT VT = Op.getValueType(); SelectionDAG &DAG = TLO.DAG; SDLoc dl(Op); // Early return, as this function cannot handle vector types. - if (Op.getValueType().isVector()) + if (VT.isVector()) return false; // Don't do this if the node has another user, which may require the @@ -574,21 +580,18 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, // Search for the smallest integer type with free casts to and from // Op's type. For expedience, just check power-of-2 integer types. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned DemandedSize = Demanded.getActiveBits(); - unsigned SmallVTBits = DemandedSize; - if (!isPowerOf2_32(SmallVTBits)) - SmallVTBits = NextPowerOf2(SmallVTBits); - for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) { + unsigned DemandedSize = DemandedBits.getActiveBits(); + for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize); + SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) { EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), SmallVTBits); - if (TLI.isTruncateFree(Op.getValueType(), SmallVT) && - TLI.isZExtFree(SmallVT, Op.getValueType())) { + if (TLI.isTruncateFree(VT, SmallVT) && TLI.isZExtFree(SmallVT, VT)) { // We found a type with free casts. SDValue X = DAG.getNode( Op.getOpcode(), dl, SmallVT, DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)), DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1))); assert(DemandedSize <= SmallVTBits && "Narrowed below demanded bits?"); - SDValue Z = DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(), X); + SDValue Z = DAG.getNode(ISD::ANY_EXTEND, dl, VT, X); return TLO.CombineTo(Op, Z); } } @@ -773,7 +776,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( unsigned ShAmt = MaxSA->getZExtValue(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); - unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros(); + unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero(); if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits)) return Op0; } @@ -805,7 +808,8 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( SDValue Op0 = Op.getOperand(0); EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); unsigned ExBits = ExVT.getScalarSizeInBits(); - if (DemandedBits.getActiveBits() <= ExBits) + if (DemandedBits.getActiveBits() <= ExBits && + shouldRemoveRedundantExtend(Op)) return Op0; // If the input is already sign extended, just drop the extension. unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); @@ -856,15 +860,6 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( // If we don't demand the inserted subvector, return the base vector. if (DemandedSubElts == 0) return Vec; - // If this simply widens the lowest subvector, see if we can do it earlier. - // TODO: REMOVE ME - SimplifyMultipleUseDemandedBits shouldn't be creating - // general nodes like this. 
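The rewritten width search above seeds the loop with llvm::bit_ceil(DemandedSize) instead of the manual isPowerOf2_32/NextPowerOf2 dance; C++20's std::bit_ceil has the same meaning, e.g.:

#include <bit>
#include <cassert>

int main() {
  static_assert(std::bit_ceil(17u) == 32u); // smallest power of two >= 17
  static_assert(std::bit_ceil(32u) == 32u); // already a power of two
  // Candidate narrow widths for a 64-bit op demanding 17 active bits:
  // just 32, since the next power of two is already the full width.
  for (unsigned W = std::bit_ceil(17u); W < 64; W <<= 1)
    assert(W == 32);
}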
- if (Idx == 0 && Vec.isUndef()) { - if (SDValue NewSub = SimplifyMultipleUseDemandedBits( - Sub, DemandedBits, DemandedSubElts, DAG, Depth + 1)) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), NewSub, Op.getOperand(2)); - } break; } case ISD::VECTOR_SHUFFLE: { @@ -954,33 +949,30 @@ static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG, SDValue ExtOpA = Add.getOperand(0); SDValue ExtOpB = Add.getOperand(1); - auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) { + SDValue Add2; + auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3, SDValue A) { ConstantSDNode *ConstOp; - if ((ConstOp = isConstOrConstSplat(Op1, DemandedElts)) && - ConstOp->isOne()) { - ExtOpA = Op2; - ExtOpB = Op3; - return true; - } if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) && ConstOp->isOne()) { ExtOpA = Op1; ExtOpB = Op3; + Add2 = A; return true; } if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) && ConstOp->isOne()) { ExtOpA = Op1; ExtOpB = Op2; + Add2 = A; return true; } return false; }; bool IsCeil = (ExtOpA.getOpcode() == ISD::ADD && - MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) || + MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB, ExtOpA)) || (ExtOpB.getOpcode() == ISD::ADD && - MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA)); + MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA, ExtOpB)); // If the shift is signed (sra): // - Needs >= 2 sign bit for both operands. @@ -1040,11 +1032,25 @@ static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG, EVT VT = Op.getValueType(); unsigned MinWidth = std::max<unsigned>(VT.getScalarSizeInBits() - KnownBits, 8); - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), PowerOf2Ceil(MinWidth)); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_ceil(MinWidth)); if (VT.isVector()) NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount()); - if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) - return SDValue(); + if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) { + // If we could not transform, and (both) adds are nuw/nsw, we can use the + // larger type size to do the transform. + if (!TLI.isOperationLegalOrCustom(AVGOpc, VT)) + return SDValue(); + + if (DAG.computeOverflowForAdd(IsSigned, Add.getOperand(0), + Add.getOperand(1)) == + SelectionDAG::OFK_Never && + (!Add2 || DAG.computeOverflowForAdd(IsSigned, Add2.getOperand(0), + Add2.getOperand(1)) == + SelectionDAG::OFK_Never)) + NVT = VT; + else + return SDValue(); + } SDLoc DL(Op); SDValue ResultAVG = @@ -1198,7 +1204,7 @@ bool TargetLowering::SimplifyDemandedBits( return true; if (!!DemandedVecElts) - Known = KnownBits::commonBits(Known, KnownVec); + Known = Known.intersectWith(KnownVec); return false; } @@ -1226,9 +1232,9 @@ bool TargetLowering::SimplifyDemandedBits( Known.Zero.setAllBits(); Known.One.setAllBits(); if (!!DemandedSubElts) - Known = KnownBits::commonBits(Known, KnownSub); + Known = Known.intersectWith(KnownSub); if (!!DemandedSrcElts) - Known = KnownBits::commonBits(Known, KnownSrc); + Known = Known.intersectWith(KnownSrc); // Attempt to avoid multi-use src if we don't need anything from it. if (!DemandedBits.isAllOnes() || !DemandedSubElts.isAllOnes() || @@ -1290,7 +1296,7 @@ bool TargetLowering::SimplifyDemandedBits( return true; // Known bits are shared by every demanded subvector element. 
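The KnownBits::commonBits -> intersectWith rename used throughout this hunk computes the bits known in *both* operands: a bit stays known-zero only if both sides know it zero, known-one only if both know it one. A toy 8-bit version to make the mask algebra concrete; KnownBitsToy is illustrative, not the llvm::KnownBits API:

#include <cassert>
#include <cstdint>

struct KnownBitsToy {
  uint8_t Zero = 0, One = 0; // known-zero / known-one masks
  KnownBitsToy intersectWith(KnownBitsToy RHS) const {
    return {static_cast<uint8_t>(Zero & RHS.Zero),   // zero in both
            static_cast<uint8_t>(One & RHS.One)};    // one in both
  }
};

int main() {
  KnownBitsToy A{0b11110000, 0b00001111}; // fully known value 0x0F
  KnownBitsToy B{0b11111100, 0b00000011}; // fully known value 0x03
  KnownBitsToy C = A.intersectWith(B);
  assert(C.Zero == 0b11110000 && C.One == 0b00000011); // only common bits
}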
if (!!DemandedSubElts) - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } break; } @@ -1314,13 +1320,13 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Op0, DemandedBits, DemandedLHS, Known2, TLO, Depth + 1)) return true; - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } if (!!DemandedRHS) { if (SimplifyDemandedBits(Op1, DemandedBits, DemandedRHS, Known2, TLO, Depth + 1)) return true; - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); } // Attempt to avoid multi-use ops if we don't need anything from them. @@ -1622,7 +1628,7 @@ bool TargetLowering::SimplifyDemandedBits( return true; // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); break; case ISD::VSELECT: if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, DemandedElts, @@ -1635,7 +1641,7 @@ bool TargetLowering::SimplifyDemandedBits( assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); break; case ISD::SELECT_CC: if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, Known, TLO, @@ -1652,7 +1658,7 @@ bool TargetLowering::SimplifyDemandedBits( return true; // Only known if known in both the LHS and RHS. - Known = KnownBits::commonBits(Known, Known2); + Known = Known.intersectWith(Known2); break; case ISD::SETCC: { SDValue Op0 = Op.getOperand(0); @@ -1724,12 +1730,9 @@ bool TargetLowering::SimplifyDemandedBits( unsigned InnerBits = InnerVT.getScalarSizeInBits(); if (ShAmt < InnerBits && DemandedBits.getActiveBits() <= InnerBits && isTypeDesirableForOp(ISD::SHL, InnerVT)) { - EVT ShTy = getShiftAmountTy(InnerVT, DL); - if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits())) - ShTy = InnerVT; - SDValue NarrowShl = - TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp, - TLO.DAG.getConstant(ShAmt, dl, ShTy)); + SDValue NarrowShl = TLO.DAG.getNode( + ISD::SHL, dl, InnerVT, InnerOp, + TLO.DAG.getShiftAmountConstant(ShAmt, InnerVT, dl)); return TLO.CombineTo( Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl)); } @@ -1748,7 +1751,7 @@ bool TargetLowering::SimplifyDemandedBits( if (InnerShAmt < ShAmt && InnerShAmt < InnerBits && DemandedBits.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) && - DemandedBits.countTrailingZeros() >= ShAmt) { + DemandedBits.countr_zero() >= ShAmt) { SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl, ShiftVT); SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, @@ -1771,7 +1774,7 @@ bool TargetLowering::SimplifyDemandedBits( Known.Zero.setLowBits(ShAmt); // Attempt to avoid multi-use ops if we don't need anything from them. - if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) { SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); if (DemandedOp0) { @@ -1789,7 +1792,7 @@ bool TargetLowering::SimplifyDemandedBits( // This is a variable shift, so we can't shift the demand mask by a known // amount. But if we are not demanding high bits, then we are not // demanding those bits from the pre-shifted operand either. 
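Several hunks here also move from countTrailingZeros/countLeadingZeros to the <bit>-style countr_zero/countl_zero names. The recurring "BitWidth - countr_zero(DemandedBits)" idiom computes how many of the top bits anyone actually reads, which is what makes the sra-elimination above legal when the operand has at least that many sign bits; in standalone C++20 terms:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  uint32_t DemandedBits = 0xFFFF0000; // only the high half is read
  // Trailing zeros in the demanded mask are bits nobody reads.
  unsigned NumHiDemanded = 32 - std::countr_zero(DemandedBits);
  assert(NumHiDemanded == 16);
  assert(std::countl_zero(DemandedBits) == 0); // the top bit is demanded
}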
- if (unsigned CTLZ = DemandedBits.countLeadingZeros()) { + if (unsigned CTLZ = DemandedBits.countl_zero()) { APInt DemandedFromOp(APInt::getLowBitsSet(BitWidth, BitWidth - CTLZ)); if (SimplifyDemandedBits(Op0, DemandedFromOp, DemandedElts, Known, TLO, Depth + 1)) { @@ -1814,7 +1817,7 @@ bool TargetLowering::SimplifyDemandedBits( unsigned ShAmt = MaxSA->getZExtValue(); unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); - unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros(); + unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero(); if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits)) return TLO.CombineTo(Op, Op0); } @@ -1865,6 +1868,27 @@ bool TargetLowering::SimplifyDemandedBits( if (Op->getFlags().hasExact()) InDemandedMask.setLowBits(ShAmt); + // Narrow shift to lower half - similar to ShrinkDemandedOp. + // (srl i64:x, K) -> (i64 zero_extend (srl (i32 (trunc i64:x)), K)) + if ((BitWidth % 2) == 0 && !VT.isVector() && + ((InDemandedMask.countLeadingZeros() >= (BitWidth / 2)) || + TLO.DAG.MaskedValueIsZero( + Op0, APInt::getHighBitsSet(BitWidth, BitWidth / 2)))) { + EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2); + if (isNarrowingProfitable(VT, HalfVT) && + isTypeDesirableForOp(ISD::SRL, HalfVT) && + isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && + (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT))) { + SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0); + SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant( + ShAmt, HalfVT, dl, TLO.LegalTypes()); + SDValue NewShift = + TLO.DAG.getNode(ISD::SRL, dl, HalfVT, NewOp, NewShiftAmt); + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift)); + } + } + // Compute the new bits that are at the top now. if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) @@ -1876,7 +1900,7 @@ bool TargetLowering::SimplifyDemandedBits( Known.Zero.setHighBits(ShAmt); // Attempt to avoid multi-use ops if we don't need anything from them. - if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) { + if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) { SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1); if (DemandedOp0) { @@ -1884,6 +1908,10 @@ bool TargetLowering::SimplifyDemandedBits( return TLO.CombineTo(Op, NewOp); } } + } else { + // Use generic knownbits computation as it has support for non-uniform + // shift amounts. + Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); } break; } @@ -1894,7 +1922,7 @@ bool TargetLowering::SimplifyDemandedBits( // If we only want bits that already match the signbit then we don't need // to shift. - unsigned NumHiDemandedBits = BitWidth - DemandedBits.countTrailingZeros(); + unsigned NumHiDemandedBits = BitWidth - DemandedBits.countr_zero(); if (TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1) >= NumHiDemandedBits) return TLO.CombineTo(Op, Op0); @@ -1926,7 +1954,7 @@ bool TargetLowering::SimplifyDemandedBits( // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. 
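// Illustrative sketch (not from the patch): the narrowing rewrite above,
// (srl i64:x, K) -> (zero_extend (srl (trunc x), K)), in scalar form for the
// guard where the upper half of the source is known zero:
#include <cassert>
#include <cstdint>
static uint64_t narrow_srl(uint64_t x, unsigned k) {
  assert(k < 32 && (x >> 32) == 0 && "guards established by the combine");
  // The 32-bit shift produces the same demanded bits as the 64-bit one.
  return static_cast<uint32_t>(x) >> k;
}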
- if (DemandedBits.countLeadingZeros() < ShAmt) + if (DemandedBits.countl_zero() < ShAmt) InDemandedMask.setSignBit(); if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, @@ -1939,7 +1967,7 @@ bool TargetLowering::SimplifyDemandedBits( // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. if (Known.Zero[BitWidth - ShAmt - 1] || - DemandedBits.countLeadingZeros() >= ShAmt) { + DemandedBits.countl_zero() >= ShAmt) { SDNodeFlags Flags; Flags.setExact(Op->getFlags().hasExact()); return TLO.CombineTo( @@ -2003,8 +2031,7 @@ bool TargetLowering::SimplifyDemandedBits( Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt)); Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt); - Known.One |= Known2.One; - Known.Zero |= Known2.Zero; + Known = Known.unionWith(Known2); // Attempt to avoid multi-use ops if we don't need anything from them. if (!Demanded0.isAllOnes() || !Demanded1.isAllOnes() || @@ -2059,12 +2086,12 @@ bool TargetLowering::SimplifyDemandedBits( // See if we don't demand either half of the rotated bits. if ((!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT)) && - DemandedBits.countTrailingZeros() >= (IsROTL ? Amt : RevAmt)) { + DemandedBits.countr_zero() >= (IsROTL ? Amt : RevAmt)) { Op1 = TLO.DAG.getConstant(IsROTL ? Amt : RevAmt, dl, Op1.getValueType()); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, Op1)); } if ((!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT)) && - DemandedBits.countLeadingZeros() >= (IsROTL ? RevAmt : Amt)) { + DemandedBits.countl_zero() >= (IsROTL ? RevAmt : Amt)) { Op1 = TLO.DAG.getConstant(IsROTL ? RevAmt : Amt, dl, Op1.getValueType()); return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1)); } @@ -2120,8 +2147,8 @@ bool TargetLowering::SimplifyDemandedBits( // If the only bits demanded come from one byte of the bswap result, // just shift the input byte into position to eliminate the bswap. - unsigned NLZ = DemandedBits.countLeadingZeros(); - unsigned NTZ = DemandedBits.countTrailingZeros(); + unsigned NLZ = DemandedBits.countl_zero(); + unsigned NTZ = DemandedBits.countr_zero(); // Round NTZ down to the next byte. If we have 11 trailing zeros, then // we need all the bits down to bit 8. Likewise, round NLZ. If we @@ -2557,6 +2584,15 @@ bool TargetLowering::SimplifyDemandedBits( if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, KnownSrcBits, TLO, Depth + 1)) return true; + + // Attempt to avoid multi-use ops if we don't need anything from them. + if (!DemandedSrcBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) { + if (SDValue DemandedSrc = SimplifyMultipleUseDemandedBits( + Src, DemandedSrcBits, DemandedSrcElts, TLO.DAG, Depth + 1)) { + SDValue NewOp = TLO.DAG.getBitcast(VT, DemandedSrc); + return TLO.CombineTo(Op, NewOp); + } + } } // If this is a bitcast, let computeKnownBits handle it. Only do this on a @@ -2572,9 +2608,9 @@ bool TargetLowering::SimplifyDemandedBits( // The LSB of X*Y is set only if (X & 1) == 1 and (Y & 1) == 1. // If we demand exactly one bit N and we have "X * (C' << N)" where C' is // odd (has LSB set), then the left-shifted low bit of X is the answer. 
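// Illustrative sketch (not from the patch): the rotate simplifications above.
// rotl(x,a) = (x << a) | (x >> (32-a)), and the second term only populates
// result bits below a, so once DemandedBits has at least a trailing zeros the
// rotate degrades to a plain shift (and symmetrically to srl for the high
// side):
#include <cstdint>
static uint32_t rotl32(uint32_t x, unsigned a) { // assumes 0 < a < 32
  return (x << a) | (x >> (32 - a));
}
static uint32_t rotl32_low_bits_dead(uint32_t x, unsigned a) {
  return x << a; // agrees with rotl32(x, a) on every result bit >= a
}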
- unsigned CTZ = DemandedBits.countTrailingZeros(); + unsigned CTZ = DemandedBits.countr_zero(); ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1), DemandedElts); - if (C && C->getAPIntValue().countTrailingZeros() == CTZ) { + if (C && C->getAPIntValue().countr_zero() == CTZ) { EVT ShiftAmtTy = getShiftAmountTy(VT, TLO.DAG.getDataLayout()); SDValue AmtC = TLO.DAG.getConstant(CTZ, dl, ShiftAmtTy); SDValue Shl = TLO.DAG.getNode(ISD::SHL, dl, VT, Op.getOperand(0), AmtC); @@ -2596,11 +2632,12 @@ bool TargetLowering::SimplifyDemandedBits( // of the highest bit demanded of them. SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); SDNodeFlags Flags = Op.getNode()->getFlags(); - unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros(); + unsigned DemandedBitsLZ = DemandedBits.countl_zero(); APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); - if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO, + KnownBits KnownOp0, KnownOp1; + if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, KnownOp0, TLO, Depth + 1) || - SimplifyDemandedBits(Op1, LoMask, DemandedElts, Known2, TLO, + SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO, Depth + 1) || // See if the operation should be performed at a smaller bit width. ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) { @@ -2697,7 +2734,14 @@ bool TargetLowering::SimplifyDemandedBits( } } - [[fallthrough]]; + if (Op.getOpcode() == ISD::MUL) { + Known = KnownBits::mul(KnownOp0, KnownOp1); + } else { // Op.getOpcode() is either ISD::ADD or ISD::SUB. + Known = KnownBits::computeForAddSub(Op.getOpcode() == ISD::ADD, + Flags.hasNoSignedWrap(), KnownOp0, + KnownOp1); + } + break; } default: // We also ask the target about intrinsics (which could be specific to it). @@ -3914,8 +3958,7 @@ SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck( SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift( EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const { - assert(isConstOrConstSplat(N1C) && - isConstOrConstSplat(N1C)->getAPIntValue().isZero() && + assert(isConstOrConstSplat(N1C) && isConstOrConstSplat(N1C)->isZero() && "Should be a comparison with 0."); assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Valid only for [in]equality comparisons."); @@ -4081,8 +4124,12 @@ static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT, ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, CTVT); SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne); SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add); - SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond); SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond); + // Its not uncommon for known-never-zero X to exist in (ctpop X) eq/ne 1, so + // check before the emit a potentially unnecessary op. + if (DAG.isKnownNeverZero(CTOp)) + return RHS; + SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond); unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR; return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS); } @@ -4219,12 +4266,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, bool N1ConstOrSplat = isConstOrConstSplat(N1, /*AllowUndefs*/ false, /*AllowTruncate*/ true); - // Ensure that the constant occurs on the RHS and fold constant comparisons. + // Canonicalize toward having the constant on the RHS. // TODO: Handle non-splat vector constants. All undef causes trouble. 
// FIXME: We can't yet fold constant scalable vector splats, so avoid an // infinite loop here when we encounter one. ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond); - if (N0ConstOrSplat && (!OpVT.isScalableVector() || !N1ConstOrSplat) && + if (N0ConstOrSplat && !N1ConstOrSplat && (DCI.isBeforeLegalizeOps() || isCondCodeLegal(SwappedCC, N0.getSimpleValueType()))) return DAG.getSetCC(dl, VT, N1, N0, SwappedCC); @@ -4275,7 +4322,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // zero. if (N0.getOpcode() == ISD::SRL && (C1.isZero() || C1.isOne()) && N0.getOperand(0).getOpcode() == ISD::CTLZ && - isPowerOf2_32(N0.getScalarValueSizeInBits())) { + llvm::has_single_bit<uint32_t>(N0.getScalarValueSizeInBits())) { if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) { if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && ShAmt->getAPIntValue() == Log2_32(N0.getScalarValueSizeInBits())) { @@ -4315,7 +4362,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // DAGCombine turns costly ZExts into ANDs if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) if ((C->getAPIntValue()+1).isPowerOf2()) { - MinBits = C->getAPIntValue().countTrailingOnes(); + MinBits = C->getAPIntValue().countr_one(); PreExt = N0->getOperand(0); } } else if (N0->getOpcode() == ISD::SIGN_EXTEND) { @@ -4336,7 +4383,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } // Figure out how many bits we need to preserve this constant. - unsigned ReqdBits = Signed ? C1.getMinSignedBits() : C1.getActiveBits(); + unsigned ReqdBits = Signed ? C1.getSignificantBits() : C1.getActiveBits(); // Make sure we're not losing bits from the constant. if (MinBits > 0 && @@ -4510,7 +4557,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // If the constant doesn't fit into the number of bits for the source of // the sign extension, it is impossible for both sides to be equal. - if (C1.getMinSignedBits() > ExtSrcTyBits) + if (C1.getSignificantBits() > ExtSrcTyBits) return DAG.getBoolConstant(Cond == ISD::SETNE, dl, VT, OpVT); assert(ExtDstTy == N0.getOperand(0).getValueType() && @@ -4744,8 +4791,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // For example, when high 32-bits of i64 X are known clear: // all bits clear: (X | (Y<<32)) == 0 --> (X | Y) == 0 // all bits set: (X | (Y<<32)) == -1 --> (X & Y) == -1 - bool CmpZero = N1C->getAPIntValue().isZero(); - bool CmpNegOne = N1C->getAPIntValue().isAllOnes(); + bool CmpZero = N1C->isZero(); + bool CmpNegOne = N1C->isAllOnes(); if ((CmpZero || CmpNegOne) && N0.hasOneUse()) { // Match or(lo,shl(hi,bw/2)) pattern. 
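// Illustrative sketch (not from the patch): the concat-compare fold this
// lambda feeds. or(lo, shl(hi, bw/2)) reassembles a double-wide value, so
// comparing it against all-zeros or all-ones only needs a half-width logic
// op plus a half-width compare:
#include <cstdint>
static bool concat_eq_zero(uint32_t x, uint32_t y) {
  // ((uint64_t)x | ((uint64_t)y << 32)) == 0   <=>   (x | y) == 0
  return (x | y) == 0;
}
static bool concat_eq_allones(uint32_t x, uint32_t y) {
  // ((uint64_t)x | ((uint64_t)y << 32)) == ~0ull   <=>   (x & y) == ~0u
  return (x & y) == ~0u;
}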
auto IsConcat = [&](SDValue V, SDValue &Lo, SDValue &Hi) { @@ -4866,7 +4913,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } - if (C1.getMinSignedBits() <= 64 && + if (C1.getSignificantBits() <= 64 && !isLegalICmpImmediate(C1.getSExtValue())) { EVT ShiftTy = getShiftAmountTy(ShValTy, Layout, !DCI.isBeforeLegalize()); // (X & -256) == 256 -> (X >> 8) == 1 @@ -4875,7 +4922,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { const APInt &AndRHSC = AndRHS->getAPIntValue(); if (AndRHSC.isNegatedPowerOf2() && (AndRHSC & C1) == C1) { - unsigned ShiftBits = AndRHSC.countTrailingZeros(); + unsigned ShiftBits = AndRHSC.countr_zero(); if (!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) { SDValue Shift = DAG.getNode(ISD::SRL, dl, ShValTy, N0.getOperand(0), @@ -4896,14 +4943,14 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, APInt NewC = C1; ISD::CondCode NewCond = Cond; if (AdjOne) { - ShiftBits = C1.countTrailingOnes(); + ShiftBits = C1.countr_one(); NewC = NewC + 1; NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; } else { - ShiftBits = C1.countTrailingZeros(); + ShiftBits = C1.countr_zero(); } NewC.lshrInPlace(ShiftBits); - if (ShiftBits && NewC.getMinSignedBits() <= 64 && + if (ShiftBits && NewC.getSignificantBits() <= 64 && isLegalICmpImmediate(NewC.getSExtValue()) && !TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) { SDValue Shift = DAG.getNode(ISD::SRL, dl, ShValTy, N0, @@ -4980,6 +5027,23 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, return DAG.getSetCC(dl, VT, N0, N1, NewCond); } + // ~X > ~Y --> Y > X + // ~X < ~Y --> Y < X + // ~X < C --> X > ~C + // ~X > C --> X < ~C + if ((isSignedIntSetCC(Cond) || isUnsignedIntSetCC(Cond)) && + N0.getValueType().isInteger()) { + if (isBitwiseNot(N0)) { + if (isBitwiseNot(N1)) + return DAG.getSetCC(dl, VT, N1.getOperand(0), N0.getOperand(0), Cond); + + if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { + SDValue Not = DAG.getNOT(dl, N1, OpVT); + return DAG.getSetCC(dl, VT, Not, N0.getOperand(0), Cond); + } + } + } + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.getValueType().isInteger()) { if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB || @@ -5225,7 +5289,7 @@ const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const { } SDValue TargetLowering::LowerAsmOutputForConstraint( - SDValue &Chain, SDValue &Flag, const SDLoc &DL, + SDValue &Chain, SDValue &Glue, const SDLoc &DL, const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { return SDValue(); } @@ -5308,10 +5372,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } -void TargetLowering::CollectTargetIntrinsicOperands(const CallInst &I, - SmallVectorImpl<SDValue> &Ops, - SelectionDAG &DAG) const { - return; +void TargetLowering::CollectTargetIntrinsicOperands( + const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const { } std::pair<unsigned, const TargetRegisterClass *> @@ -5782,7 +5844,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N, if (C->isZero()) return false; APInt Divisor = C->getAPIntValue(); - unsigned Shift = Divisor.countTrailingZeros(); + unsigned Shift = Divisor.countr_zero(); if (Shift) { Divisor.ashrInPlace(Shift); UseSRA = true; @@ -5972,6 +6034,20 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y); return SDValue(LoHi.getNode(), 
1); } + // If type twice as wide legal, widen and use a mul plus a shift. + unsigned Size = VT.getScalarSizeInBits(); + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); + if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { + X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X); + Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y); + Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); + Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, + DAG.getShiftAmountConstant(EltBits, WideVT, dl)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); + } return SDValue(); }; @@ -6045,9 +6121,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros(); // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros in // the dividend exceeds the leading zeros for the divisor. - LeadingZeros = - std::min(LeadingZeros, - cast<ConstantSDNode>(N1)->getAPIntValue().countLeadingZeros()); + LeadingZeros = std::min( + LeadingZeros, cast<ConstantSDNode>(N1)->getAPIntValue().countl_zero()); } bool UseNPQ = false, UsePreShift = false, UsePostShift = false; @@ -6146,6 +6221,20 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y); return SDValue(LoHi.getNode(), 1); } + // If type twice as wide legal, widen and use a mul plus a shift. + unsigned Size = VT.getScalarSizeInBits(); + EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2); + if (VT.isVector()) + WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, + VT.getVectorElementCount()); + if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { + X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X); + Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y); + Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y); + Y = DAG.getNode(ISD::SRL, dl, WideVT, Y, + DAG.getShiftAmountConstant(EltBits, WideVT, dl)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Y); + } return SDValue(); // No mulhu or equivalent }; @@ -6298,7 +6387,7 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, AllComparisonsWithNonZerosAreTautological &= TautologicalLane; // Decompose D into D0 * 2^K - unsigned K = D.countTrailingZeros(); + unsigned K = D.countr_zero(); assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate."); APInt D0 = D.lshr(K); @@ -6540,7 +6629,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, AllDivisorsAreOnes &= D.isOne(); // Decompose D into D0 * 2^K - unsigned K = D.countTrailingZeros(); + unsigned K = D.countr_zero(); assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate."); APInt D0 = D.lshr(K); @@ -6696,9 +6785,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, // NOTE: we avoid letting illegal types through even if we're before legalize // ops – legalization has a hard time producing good code for the code that // follows. 
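// Illustrative sketch (not from the patch): the widened MULH fallback added
// above for both BuildSDIV and BuildUDIV, in scalar form for 32-bit elements.
// One double-width multiply plus a shift recovers the high half that
// MULHU/MULHS or the *MUL_LOHI nodes would have produced:
#include <cstdint>
static uint32_t mulhu_ref(uint32_t x, uint32_t y) {
  return static_cast<uint32_t>((uint64_t(x) * uint64_t(y)) >> 32);
}
static int32_t mulhs_ref(int32_t x, int32_t y) {
  return static_cast<int32_t>((int64_t(x) * int64_t(y)) >> 32);
}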
- if (!isOperationLegalOrCustom(ISD::SETEQ, VT) || + if (!isOperationLegalOrCustom(ISD::SETCC, SETCCVT) || !isOperationLegalOrCustom(ISD::AND, VT) || - !isOperationLegalOrCustom(Cond, VT) || + !isCondCodeLegalOrCustom(Cond, VT.getSimpleVT()) || !isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) return SDValue(); @@ -6748,20 +6837,23 @@ SDValue TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, EVT VT = Op.getValueType(); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); - // Testing it with denormal inputs to avoid wrong estimate. - if (Mode.Input == DenormalMode::IEEE) { - // This is specifically a check for the handling of denormal inputs, - // not the result. - // Test = fabs(X) < SmallestNormal - const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); - APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); - SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); - SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); - return DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); + // This is specifically a check for the handling of denormal inputs, not the + // result. + if (Mode.Input == DenormalMode::PreserveSign || + Mode.Input == DenormalMode::PositiveZero) { + // Test = X == 0.0 + return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); } - // Test = X == 0.0 - return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); + + // Testing it with denormal inputs to avoid wrong estimate. + // + // Test = fabs(X) < SmallestNormal + const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); + APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); + SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); + SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); + return DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); } SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, @@ -6769,7 +6861,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, NegatibleCost &Cost, unsigned Depth) const { // fneg is removable even if it has multiple uses. - if (Op.getOpcode() == ISD::FNEG) { + if (Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::VP_FNEG) { Cost = NegatibleCost::Cheaper; return Op.getOperand(0); } @@ -7212,7 +7304,7 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next, Merge(Lo, Hi)); else - Next = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(VT, BoolType), Next, + Next = DAG.getNode(ISD::UADDO_CARRY, dl, DAG.getVTList(VT, BoolType), Next, Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType)); SDValue Carry = Next.getValue(1); @@ -7226,7 +7318,7 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl, Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero, Carry); else - Hi = DAG.getNode(ISD::ADDCARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi, + Hi = DAG.getNode(ISD::UADDO_CARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi, Zero, Carry); Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi)); @@ -7330,7 +7422,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, // If the divisor is even, shift it until it becomes odd. unsigned TrailingZeros = 0; if (!Divisor[0]) { - TrailingZeros = Divisor.countTrailingZeros(); + TrailingZeros = Divisor.countr_zero(); Divisor.lshrInPlace(TrailingZeros); } @@ -7342,14 +7434,10 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, // then add in the carry. 
// TODO: If we can't split it in half, we might be able to split into 3 or // more pieces using a smaller bit width. - if (HalfMaxPlus1.urem(Divisor).isOneValue()) { + if (HalfMaxPlus1.urem(Divisor).isOne()) { assert(!LL == !LH && "Expected both input halves or no input halves!"); - if (!LL) { - LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0), - DAG.getIntPtrConstant(0, dl)); - LH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, N->getOperand(0), - DAG.getIntPtrConstant(1, dl)); - } + if (!LL) + std::tie(LL, LH) = DAG.SplitScalar(N->getOperand(0), dl, HiLoVT, HiLoVT); // Shift the input by the number of TrailingZeros in the divisor. The // shifted out bits will be added to the remainder later. @@ -7372,13 +7460,13 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)); } - // Use addcarry if we can, otherwise use a compare to detect overflow. + // Use uaddo_carry if we can, otherwise use a compare to detect overflow. EVT SetCCType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT); - if (isOperationLegalOrCustom(ISD::ADDCARRY, HiLoVT)) { + if (isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) { SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType); Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH); - Sum = DAG.getNode(ISD::ADDCARRY, dl, VTList, Sum, + Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, Sum, DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1)); } else { Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH); @@ -7423,10 +7511,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, DAG.getConstant(MulFactor, dl, VT)); // Split the quotient into low and high parts. - SDValue QuotL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient, - DAG.getIntPtrConstant(0, dl)); - SDValue QuotH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HiLoVT, Quotient, - DAG.getIntPtrConstant(1, dl)); + SDValue QuotL, QuotH; + std::tie(QuotL, QuotH) = DAG.SplitScalar(Quotient, dl, HiLoVT, HiLoVT); Result.push_back(QuotL); Result.push_back(QuotH); } @@ -7915,7 +8001,7 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result, // -0.0. This will be added to +0.0 and produce -0.0 which is incorrect. SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); SDValue TwoP84PlusTwoP52 = DAG.getConstantFP( - BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT); + llvm::bit_cast<double>(UINT64_C(0x4530000000100000)), dl, DstVT); SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT); SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT); SDValue HiShift = DAG.getConstant(32, dl, ShiftVT); @@ -7988,14 +8074,19 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, } // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that - // instead if there are no NaNs. - if (Node->getFlags().hasNoNaNs()) { + // instead if there are no NaNs and there can't be an incompatible zero + // compare: at least one operand isn't +/-0, or there are no signed-zeros. + if ((Node->getFlags().hasNoNaNs() || + (DAG.isKnownNeverNaN(Node->getOperand(0)) && + DAG.isKnownNeverNaN(Node->getOperand(1)))) && + (Node->getFlags().hasNoSignedZeros() || + DAG.isKnownNeverZeroFloat(Node->getOperand(0)) || + DAG.isKnownNeverZeroFloat(Node->getOperand(1)))) { unsigned IEEE2018Op = Node->getOpcode() == ISD::FMINNUM ? 
ISD::FMINIMUM : ISD::FMAXIMUM; - if (isOperationLegalOrCustom(IEEE2018Op, VT)) { + if (isOperationLegalOrCustom(IEEE2018Op, VT)) return DAG.getNode(IEEE2018Op, dl, VT, Node->getOperand(0), Node->getOperand(1), Node->getFlags()); - } } if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG)) @@ -8004,15 +8095,39 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, return SDValue(); } +/// Returns a true value if if this FPClassTest can be performed with an ordered +/// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns +/// std::nullopt if it cannot be performed as a compare with 0. +static std::optional<bool> isFCmpEqualZero(FPClassTest Test, + const fltSemantics &Semantics, + const MachineFunction &MF) { + FPClassTest OrderedMask = Test & ~fcNan; + FPClassTest NanTest = Test & fcNan; + bool IsOrdered = NanTest == fcNone; + bool IsUnordered = NanTest == fcNan; + + // Skip cases that are testing for only a qnan or snan. + if (!IsOrdered && !IsUnordered) + return std::nullopt; + + if (OrderedMask == fcZero && + MF.getDenormalMode(Semantics).Input == DenormalMode::IEEE) + return IsOrdered; + if (OrderedMask == (fcZero | fcSubnormal) && + MF.getDenormalMode(Semantics).inputsAreZero()) + return IsOrdered; + return std::nullopt; +} + SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, - unsigned Test, SDNodeFlags Flags, + FPClassTest Test, SDNodeFlags Flags, const SDLoc &DL, SelectionDAG &DAG) const { EVT OperandVT = Op.getValueType(); assert(OperandVT.isFloatingPoint()); // Degenerated cases. - if (Test == 0) + if (Test == fcNone) return DAG.getBoolConstant(false, DL, ResultVT, OperandVT); if ((Test & fcAllFlags) == fcAllFlags) return DAG.getBoolConstant(true, DL, ResultVT, OperandVT); @@ -8028,7 +8143,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, // Some checks may be represented as inversion of simpler check, for example // "inf|normal|subnormal|zero" => !"nan". bool IsInverted = false; - if (unsigned InvertedCheck = getInvertedFPClassTest(Test)) { + if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test)) { IsInverted = true; Test = InvertedCheck; } @@ -8043,13 +8158,40 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, // exceptions are ignored. if (Flags.hasNoFPExcept() && isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) { - if (Test == fcZero) + ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ; + ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ; + + if (std::optional<bool> IsCmp0 = + isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction()); + IsCmp0 && (isCondCodeLegalOrCustom( + *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode, + OperandVT.getScalarType().getSimpleVT()))) { + + // If denormals could be implicitly treated as 0, this is not equivalent + // to a compare with 0 since it will also be true for denormals. return DAG.getSetCC(DL, ResultVT, Op, DAG.getConstantFP(0.0, DL, OperandVT), - IsInverted ? ISD::SETUNE : ISD::SETOEQ); - if (Test == fcNan) + *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode); + } + + if (Test == fcNan && + isCondCodeLegalOrCustom(IsInverted ? ISD::SETO : ISD::SETUO, + OperandVT.getScalarType().getSimpleVT())) { return DAG.getSetCC(DL, ResultVT, Op, Op, IsInverted ? ISD::SETO : ISD::SETUO); + } + + if (Test == fcInf && + isCondCodeLegalOrCustom(IsInverted ? 
ISD::SETUNE : ISD::SETOEQ, + OperandVT.getScalarType().getSimpleVT()) && + isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType())) { + // isinf(x) --> fabs(x) == inf + SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op); + SDValue Inf = + DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT); + return DAG.getSetCC(DL, ResultVT, Abs, Inf, + IsInverted ? ISD::SETUNE : ISD::SETOEQ); + } } // In the general case use integer operations. @@ -8071,7 +8213,7 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf; APInt QNaNBitMask = APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1); - APInt InvertionMask = APInt::getAllOnesValue(ResultVT.getScalarSizeInBits()); + APInt InvertionMask = APInt::getAllOnes(ResultVT.getScalarSizeInBits()); SDValue ValueMaskV = DAG.getConstant(ValueMask, DL, IntVT); SDValue SignBitV = DAG.getConstant(SignBit, DL, IntVT); @@ -8129,6 +8271,18 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, } appendResult(PartialRes); + if (FPClassTest PartialCheck = Test & (fcZero | fcSubnormal)) { + // fcZero | fcSubnormal => test all exponent bits are 0 + // TODO: Handle sign bit specific cases + if (PartialCheck == (fcZero | fcSubnormal)) { + SDValue ExpBits = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, ExpMaskV); + SDValue ExpIsZero = + DAG.getSetCC(DL, ResultVT, ExpBits, ZeroV, ISD::SETEQ); + appendResult(ExpIsZero); + Test &= ~PartialCheck & fcAllFlags; + } + } + // Check for individual classes. if (unsigned PartialCheck = Test & fcZero) { @@ -8141,6 +8295,19 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, appendResult(PartialRes); } + if (unsigned PartialCheck = Test & fcSubnormal) { + // issubnormal(V) ==> unsigned(abs(V) - 1) < (all mantissa bits set) + // issubnormal(V) && V>0 ==> unsigned(V - 1) < (all mantissa bits set) + SDValue V = (PartialCheck == fcPosSubnormal) ? OpAsInt : AbsV; + SDValue MantissaV = DAG.getConstant(AllOneMantissa, DL, IntVT); + SDValue VMinusOneV = + DAG.getNode(ISD::SUB, DL, IntVT, V, DAG.getConstant(1, DL, IntVT)); + PartialRes = DAG.getSetCC(DL, ResultVT, VMinusOneV, MantissaV, ISD::SETULT); + if (PartialCheck == fcNegSubnormal) + PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); + appendResult(PartialRes); + } + if (unsigned PartialCheck = Test & fcInf) { if (PartialCheck == fcPosInf) PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, InfV, ISD::SETEQ); @@ -8185,19 +8352,6 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, appendResult(PartialRes); } - if (unsigned PartialCheck = Test & fcSubnormal) { - // issubnormal(V) ==> unsigned(abs(V) - 1) < (all mantissa bits set) - // issubnormal(V) && V>0 ==> unsigned(V - 1) < (all mantissa bits set) - SDValue V = (PartialCheck == fcPosSubnormal) ? 
OpAsInt : AbsV; - SDValue MantissaV = DAG.getConstant(AllOneMantissa, DL, IntVT); - SDValue VMinusOneV = - DAG.getNode(ISD::SUB, DL, IntVT, V, DAG.getConstant(1, DL, IntVT)); - PartialRes = DAG.getSetCC(DL, ResultVT, VMinusOneV, MantissaV, ISD::SETULT); - if (PartialCheck == fcNegSubnormal) - PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV); - appendResult(PartialRes); - } - if (unsigned PartialCheck = Test & fcNormal) { // isnormal(V) ==> (0 < exp < max_exp) ==> (unsigned(exp-1) < (max_exp-1)) APInt ExpLSB = ExpMask & ~(ExpMask.shl(1)); @@ -8609,6 +8763,38 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::SUB, dl, VT, Shift, Xor); } +SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue LHS = DAG.getFreeze(N->getOperand(0)); + SDValue RHS = DAG.getFreeze(N->getOperand(1)); + bool IsSigned = N->getOpcode() == ISD::ABDS; + + // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs)) + // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs)) + unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX; + unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN; + if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) { + SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS); + SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS); + return DAG.getNode(ISD::SUB, dl, VT, Max, Min); + } + + // abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs)) + if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT)) + return DAG.getNode(ISD::OR, dl, VT, + DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS), + DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS)); + + // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)) + EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT; + SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC); + return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS), + DAG.getNode(ISD::SUB, dl, VT, RHS, LHS)); +} + SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -8796,8 +8982,7 @@ SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const { Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT)); - APInt Shift(Sz, 1); - Shift <<= J; + APInt Shift = APInt::getOneBitSet(Sz, J); Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT)); Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2); } @@ -9494,10 +9679,21 @@ SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const { SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); EVT VT = Op0.getValueType(); + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); unsigned Opcode = Node->getOpcode(); SDLoc DL(Node); + // umax(x,1) --> sub(x,cmpeq(x,0)) iff cmp result is allbits + if (Opcode == ISD::UMAX && llvm::isOneOrOneSplat(Op1, true) && BoolVT == VT && + getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) { + Op0 = DAG.getFreeze(Op0); + SDValue Zero = DAG.getConstant(0, DL, VT); + return DAG.getNode(ISD::SUB, DL, VT, Op0, + DAG.getSetCC(DL, VT, Op0, Zero, ISD::SETEQ)); + } + // umin(x,y) -> sub(x,usubsat(x,y)) + // TODO: Missing freeze(Op0)? 
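// Illustrative sketch (not from the patch): the two unsigned rewrites around
// this point, in scalar form. usubsat is the saturating subtract (0 on
// underflow); the umax(x,1) trick relies on setcc producing all-ones
// booleans, which is exactly the ZeroOrNegativeOneBooleanContent guard above.
#include <cstdint>
static uint32_t usubsat32(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }
static uint32_t umin_ref(uint32_t x, uint32_t y) {
  return x - usubsat32(x, y); // subtract 0 when x <= y, else x - y, leaving y
}
static uint32_t umax1_ref(uint32_t x) {
  uint32_t allones_if_zero = (x == 0) ? ~0u : 0u;
  return x - allones_if_zero; // 0 - (-1) == 1; nonzero x passes through
}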
if (Opcode == ISD::UMIN && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::USUBSAT, VT)) { return DAG.getNode(ISD::SUB, DL, VT, Op0, @@ -9505,30 +9701,59 @@ SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const { } // umax(x,y) -> add(x,usubsat(y,x)) + // TODO: Missing freeze(Op0)? if (Opcode == ISD::UMAX && isOperationLegal(ISD::ADD, VT) && isOperationLegal(ISD::USUBSAT, VT)) { return DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getNode(ISD::USUBSAT, DL, VT, Op1, Op0)); } - // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B - ISD::CondCode CC; - switch (Opcode) { - default: llvm_unreachable("How did we get here?"); - case ISD::SMAX: CC = ISD::SETGT; break; - case ISD::SMIN: CC = ISD::SETLT; break; - case ISD::UMAX: CC = ISD::SETUGT; break; - case ISD::UMIN: CC = ISD::SETULT; break; - } - // FIXME: Should really try to split the vector in case it's legal on a // subvector. if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) return DAG.UnrollVectorOp(Node); - EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); - SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); - return DAG.getSelect(DL, VT, Cond, Op0, Op1); + // Attempt to find an existing SETCC node that we can reuse. + // TODO: Do we need a generic doesSETCCNodeExist? + // TODO: Missing freeze(Op0)/freeze(Op1)? + auto buildMinMax = [&](ISD::CondCode PrefCC, ISD::CondCode AltCC, + ISD::CondCode PrefCommuteCC, + ISD::CondCode AltCommuteCC) { + SDVTList BoolVTList = DAG.getVTList(BoolVT); + for (ISD::CondCode CC : {PrefCC, AltCC}) { + if (DAG.doesNodeExist(ISD::SETCC, BoolVTList, + {Op0, Op1, DAG.getCondCode(CC)})) { + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); + return DAG.getSelect(DL, VT, Cond, Op0, Op1); + } + } + for (ISD::CondCode CC : {PrefCommuteCC, AltCommuteCC}) { + if (DAG.doesNodeExist(ISD::SETCC, BoolVTList, + {Op0, Op1, DAG.getCondCode(CC)})) { + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); + return DAG.getSelect(DL, VT, Cond, Op1, Op0); + } + } + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, PrefCC); + return DAG.getSelect(DL, VT, Cond, Op0, Op1); + }; + + // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B + // -> Y = (A < B) ? B : A + // -> Y = (A >= B) ? A : B + // -> Y = (A <= B) ? B : A + switch (Opcode) { + case ISD::SMAX: + return buildMinMax(ISD::SETGT, ISD::SETGE, ISD::SETLT, ISD::SETLE); + case ISD::SMIN: + return buildMinMax(ISD::SETLT, ISD::SETLE, ISD::SETGT, ISD::SETGE); + case ISD::UMAX: + return buildMinMax(ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE); + case ISD::UMIN: + return buildMinMax(ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE); + } + + llvm_unreachable("How did we get here?"); } SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const { @@ -9607,6 +9832,37 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const { return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff); } + if (Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) { + APInt MinVal = APInt::getSignedMinValue(BitWidth); + APInt MaxVal = APInt::getSignedMaxValue(BitWidth); + + KnownBits KnownLHS = DAG.computeKnownBits(LHS); + KnownBits KnownRHS = DAG.computeKnownBits(RHS); + + // If either of the operand signs are known, then they are guaranteed to + // only saturate in one direction. If non-negative they will saturate + // towards SIGNED_MAX, if negative they will saturate towards SIGNED_MIN. 
+ // + // In the case of ISD::SSUBSAT, 'x - y' is equivalent to 'x + (-y)', so the + // sign of 'y' has to be flipped. + + bool LHSIsNonNegative = KnownLHS.isNonNegative(); + bool RHSIsNonNegative = Opcode == ISD::SADDSAT ? KnownRHS.isNonNegative() + : KnownRHS.isNegative(); + if (LHSIsNonNegative || RHSIsNonNegative) { + SDValue SatMax = DAG.getConstant(MaxVal, dl, VT); + return DAG.getSelect(dl, VT, Overflow, SatMax, SumDiff); + } + + bool LHSIsNegative = KnownLHS.isNegative(); + bool RHSIsNegative = Opcode == ISD::SADDSAT ? KnownRHS.isNegative() + : KnownRHS.isNonNegative(); + if (LHSIsNegative || RHSIsNegative) { + SDValue SatMin = DAG.getConstant(MinVal, dl, VT); + return DAG.getSelect(dl, VT, Overflow, SatMin, SumDiff); + } + } + // Overflow ? (SumDiff >> BW) ^ MinVal : SumDiff APInt MinVal = APInt::getSignedMinValue(BitWidth); SDValue SatMin = DAG.getConstant(MinVal, dl, VT); @@ -9892,8 +10148,8 @@ void TargetLowering::expandUADDSUBO( SDValue RHS = Node->getOperand(1); bool IsAdd = Node->getOpcode() == ISD::UADDO; - // If ADD/SUBCARRY is legal, use that instead. - unsigned OpcCarry = IsAdd ? ISD::ADDCARRY : ISD::SUBCARRY; + // If UADDO_CARRY/SUBO_CARRY is legal, use that instead. + unsigned OpcCarry = IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY; if (isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) { SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1)); SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(), @@ -9919,6 +10175,11 @@ void TargetLowering::expandUADDSUBO( SetCC = DAG.getSetCC(dl, SetCCType, Result, DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ); + } else if (IsAdd && isAllOnesConstant(RHS)) { + // Special case: uaddo X, -1 overflows if X != 0. + SetCC = + DAG.getSetCC(dl, SetCCType, LHS, + DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE); } else { ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT; SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC); @@ -10271,8 +10532,10 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node, // Otherwise, select 0 if Src is NaN. SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC(dl, Src, Src, ZeroInt, FpToInt, - ISD::CondCode::SETUO); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); + SDValue IsNan = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::CondCode::SETUO); + return DAG.getSelect(dl, DstVT, IsNan, ZeroInt, FpToInt); } SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT); @@ -10286,13 +10549,16 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node, SDValue Select = FpToInt; + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT); + // If Src ULT MinFloat, select MinInt. In particular, this also selects // MinInt if Src is NaN. - Select = DAG.getSelectCC(dl, Src, MinFloatNode, MinIntNode, Select, - ISD::CondCode::SETULT); + SDValue ULT = DAG.getSetCC(dl, SetCCVT, Src, MinFloatNode, ISD::SETULT); + Select = DAG.getSelect(dl, DstVT, ULT, MinIntNode, Select); // If Src OGT MaxFloat, select MaxInt. - Select = DAG.getSelectCC(dl, Src, MaxFloatNode, MaxIntNode, Select, - ISD::CondCode::SETOGT); + SDValue OGT = DAG.getSetCC(dl, SetCCVT, Src, MaxFloatNode, ISD::SETOGT); + Select = DAG.getSelect(dl, DstVT, OGT, MaxIntNode, Select); // In the unsigned case we are done, because we mapped NaN to MinInt, which // is already zero. @@ -10301,7 +10567,8 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node, // Otherwise, select 0 if Src is NaN. 
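// Illustrative sketch (not from the patch): the saturating fp->int select
// chain being rebuilt above, as reference logic for f32 -> signed i32. The
// clamp thresholds are the exact float values bracketing the destination
// range: -2^31 is representable in f32, and 2147483520 is the largest float
// not exceeding 2^31 - 1.
#include <cmath>
#include <cstdint>
static int32_t fptosi_sat_ref(float x) {
  if (std::isnan(x))
    return 0;                     // final select: NaN -> 0
  if (x < -2147483648.0f)
    return INT32_MIN;             // Src ULT MinFloat -> MinInt
  if (x > 2147483520.0f)
    return INT32_MAX;             // Src OGT MaxFloat -> MaxInt
  return static_cast<int32_t>(x); // in range: plain conversion
}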
SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); + SDValue IsNan = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::CondCode::SETUO); + return DAG.getSelect(dl, DstVT, IsNan, ZeroInt, Select); } SDValue TargetLowering::expandVectorSplice(SDNode *Node, diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 2411b1ad5203..4b1d3637a746 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -53,6 +53,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" @@ -97,6 +98,9 @@ STATISTIC(NumCandidatesDropped, static cl::opt<cl::boolOrDefault> EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden, cl::desc("enable the shrink-wrapping pass")); +static cl::opt<bool> EnablePostShrinkWrapOpt( + "enable-shrink-wrap-region-split", cl::init(true), cl::Hidden, + cl::desc("enable splitting of the restore block if possible")); namespace { @@ -110,44 +114,44 @@ namespace { class ShrinkWrap : public MachineFunctionPass { /// Hold callee-saved information. RegisterClassInfo RCI; - MachineDominatorTree *MDT; - MachinePostDominatorTree *MPDT; + MachineDominatorTree *MDT = nullptr; + MachinePostDominatorTree *MPDT = nullptr; /// Current safe point found for the prologue. /// The prologue will be inserted before the first instruction /// in this basic block. - MachineBasicBlock *Save; + MachineBasicBlock *Save = nullptr; /// Current safe point found for the epilogue. /// The epilogue will be inserted before the first terminator instruction /// in this basic block. - MachineBasicBlock *Restore; + MachineBasicBlock *Restore = nullptr; /// Hold the information of the basic block frequency. /// Use to check the profitability of the new points. - MachineBlockFrequencyInfo *MBFI; + MachineBlockFrequencyInfo *MBFI = nullptr; /// Hold the loop information. Used to determine if Save and Restore /// are in the same loop. - MachineLoopInfo *MLI; + MachineLoopInfo *MLI = nullptr; // Emit remarks. MachineOptimizationRemarkEmitter *ORE = nullptr; /// Frequency of the Entry block. - uint64_t EntryFreq; + uint64_t EntryFreq = 0; /// Current opcode for frame setup. - unsigned FrameSetupOpcode; + unsigned FrameSetupOpcode = ~0u; /// Current opcode for frame destroy. - unsigned FrameDestroyOpcode; + unsigned FrameDestroyOpcode = ~0u; /// Stack pointer register, used by llvm.{savestack,restorestack} Register SP; /// Entry block. - const MachineBasicBlock *Entry; + const MachineBasicBlock *Entry = nullptr; using SetOfRegs = SmallSetVector<unsigned, 16>; @@ -155,12 +159,18 @@ class ShrinkWrap : public MachineFunctionPass { mutable SetOfRegs CurrentCSRs; /// Current MachineFunction. - MachineFunction *MachineFunc; + MachineFunction *MachineFunc = nullptr; + + /// Is `true` for block numbers where we can guarantee no stack access + /// or computation of stack-relative addresses on any CFG path including + /// the block itself. + BitVector StackAddressUsedBlockInfo; /// Check if \p MI uses or defines a callee-saved register or /// a frame index. If this is the case, this means \p MI must happen /// after Save and before Restore. 
- bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const; + bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS, + bool StackAddressUsed) const; const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const { if (CurrentCSRs.empty()) { @@ -184,6 +194,32 @@ class ShrinkWrap : public MachineFunctionPass { /// this call. void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS); + // Try to find safe point based on dominance and block frequency without + // any change in IR. + bool performShrinkWrapping( + const ReversePostOrderTraversal<MachineBasicBlock *> &RPOT, + RegScavenger *RS); + + /// This function tries to split the restore point if doing so can shrink the + /// save point further. \return True if restore point is split. + bool postShrinkWrapping(bool HasCandidate, MachineFunction &MF, + RegScavenger *RS); + + /// This function analyzes if the restore point can split to create a new + /// restore point. This function collects + /// 1. Any preds of current restore that are reachable by callee save/FI + /// blocks + /// - indicated by DirtyPreds + /// 2. Any preds of current restore that are not DirtyPreds - indicated by + /// CleanPreds + /// Both sets should be non-empty for considering restore point split. + bool checkIfRestoreSplittable( + const MachineBasicBlock *CurRestore, + const DenseSet<const MachineBasicBlock *> &ReachableByDirty, + SmallVectorImpl<MachineBasicBlock *> &DirtyPreds, + SmallVectorImpl<MachineBasicBlock *> &CleanPreds, + const TargetInstrInfo *TII, RegScavenger *RS); + /// Initialize the pass for \p MF. void init(MachineFunction &MF) { RCI.runOnMachineFunction(MF); @@ -257,15 +293,32 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) -bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, - RegScavenger *RS) const { - // This prevents premature stack popping when occurs a indirect stack - // access. It is overly aggressive for the moment. - // TODO: - Obvious non-stack loads and store, such as global values, - // are known to not access the stack. - // - Further, data dependency and alias analysis can validate - // that load and stores never derive from the stack pointer. - if (MI.mayLoadOrStore()) +bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS, + bool StackAddressUsed) const { + /// Check if \p Op is known to access an address not on the function's stack . + /// At the moment, accesses where the underlying object is a global, function + /// argument, or jump table are considered non-stack accesses. Note that the + /// caller's stack may get accessed when passing an argument via the stack, + /// but not the stack of the current function. + /// + auto IsKnownNonStackPtr = [](MachineMemOperand *Op) { + if (Op->getValue()) { + const Value *UO = getUnderlyingObject(Op->getValue()); + if (!UO) + return false; + if (auto *Arg = dyn_cast<Argument>(UO)) + return !Arg->hasPassPointeeByValueCopyAttr(); + return isa<GlobalValue>(UO); + } + if (const PseudoSourceValue *PSV = Op->getPseudoValue()) + return PSV->isJumpTable(); + return false; + }; + // Load/store operations may access the stack indirectly when we previously + // computed an address to a stack location. 
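// Illustrative sketch (not from the patch): the forward-reachability walk that
// markAllReachable / collectBlocksReachableByDirty perform, reduced to a
// generic worklist over an adjacency list (hypothetical int block ids, not the
// MachineBasicBlock API):
#include <unordered_set>
#include <vector>
static void markAllReachable(const std::vector<std::vector<int>> &succs,
                             int start, std::unordered_set<int> &visited) {
  std::vector<int> worklist(succs[start].begin(), succs[start].end());
  visited.insert(start);
  while (!worklist.empty()) {
    int bb = worklist.back();
    worklist.pop_back();
    if (!visited.insert(bb).second)
      continue; // already visited
    worklist.insert(worklist.end(), succs[bb].begin(), succs[bb].end());
  }
}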
+ if (StackAddressUsed && MI.mayLoadOrStore() && + (MI.isCall() || MI.hasUnmodeledSideEffects() || MI.memoperands_empty() || + !all_of(MI.memoperands(), IsKnownNonStackPtr))) return true; if (MI.getOpcode() == FrameSetupOpcode || @@ -320,18 +373,314 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, /// Helper function to find the immediate (post) dominator. template <typename ListOfBBs, typename DominanceAnalysis> static MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs, - DominanceAnalysis &Dom) { + DominanceAnalysis &Dom, bool Strict = true) { MachineBasicBlock *IDom = &Block; for (MachineBasicBlock *BB : BBs) { IDom = Dom.findNearestCommonDominator(IDom, BB); if (!IDom) break; } - if (IDom == &Block) + if (Strict && IDom == &Block) return nullptr; return IDom; } +static bool isAnalyzableBB(const TargetInstrInfo &TII, + MachineBasicBlock &Entry) { + // Check if the block is analyzable. + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + return !TII.analyzeBranch(Entry, TBB, FBB, Cond); +} + +/// Determines if any predecessor of MBB is on the path from block that has use +/// or def of CSRs/FI to MBB. +/// ReachableByDirty: All blocks reachable from block that has use or def of +/// CSR/FI. +static bool +hasDirtyPred(const DenseSet<const MachineBasicBlock *> &ReachableByDirty, + const MachineBasicBlock &MBB) { + for (const MachineBasicBlock *PredBB : MBB.predecessors()) + if (ReachableByDirty.count(PredBB)) + return true; + return false; +} + +/// Derives the list of all the basic blocks reachable from MBB. +static void markAllReachable(DenseSet<const MachineBasicBlock *> &Visited, + const MachineBasicBlock &MBB) { + SmallVector<MachineBasicBlock *, 4> Worklist(MBB.succ_begin(), + MBB.succ_end()); + Visited.insert(&MBB); + while (!Worklist.empty()) { + MachineBasicBlock *SuccMBB = Worklist.pop_back_val(); + if (!Visited.insert(SuccMBB).second) + continue; + Worklist.append(SuccMBB->succ_begin(), SuccMBB->succ_end()); + } +} + +/// Collect blocks reachable by use or def of CSRs/FI. +static void collectBlocksReachableByDirty( + const DenseSet<const MachineBasicBlock *> &DirtyBBs, + DenseSet<const MachineBasicBlock *> &ReachableByDirty) { + for (const MachineBasicBlock *MBB : DirtyBBs) { + if (ReachableByDirty.count(MBB)) + continue; + // Mark all offsprings as reachable. + markAllReachable(ReachableByDirty, *MBB); + } +} + +/// \return true if there is a clean path from SavePoint to the original +/// Restore. +static bool +isSaveReachableThroughClean(const MachineBasicBlock *SavePoint, + ArrayRef<MachineBasicBlock *> CleanPreds) { + DenseSet<const MachineBasicBlock *> Visited; + SmallVector<MachineBasicBlock *, 4> Worklist(CleanPreds.begin(), + CleanPreds.end()); + while (!Worklist.empty()) { + MachineBasicBlock *CleanBB = Worklist.pop_back_val(); + if (CleanBB == SavePoint) + return true; + if (!Visited.insert(CleanBB).second || !CleanBB->pred_size()) + continue; + Worklist.append(CleanBB->pred_begin(), CleanBB->pred_end()); + } + return false; +} + +/// This function updates the branches post restore point split. +/// +/// Restore point has been split. +/// Old restore point: MBB +/// New restore point: NMBB +/// Any basic block(say BBToUpdate) which had a fallthrough to MBB +/// previously should +/// 1. Fallthrough to NMBB iff NMBB is inserted immediately above MBB in the +/// block layout OR +/// 2. Branch unconditionally to NMBB iff NMBB is inserted at any other place. 
+static void updateTerminator(MachineBasicBlock *BBToUpdate, + MachineBasicBlock *NMBB, + const TargetInstrInfo *TII) { + DebugLoc DL = BBToUpdate->findBranchDebugLoc(); + // if NMBB isn't the new layout successor for BBToUpdate, insert unconditional + // branch to it + if (!BBToUpdate->isLayoutSuccessor(NMBB)) + TII->insertUnconditionalBranch(*BBToUpdate, NMBB, DL); +} + +/// This function splits the restore point and returns new restore point/BB. +/// +/// DirtyPreds: Predessors of \p MBB that are ReachableByDirty +/// +/// Decision has been made to split the restore point. +/// old restore point: \p MBB +/// new restore point: \p NMBB +/// This function makes the necessary block layout changes so that +/// 1. \p NMBB points to \p MBB unconditionally +/// 2. All dirtyPreds that previously pointed to \p MBB point to \p NMBB +static MachineBasicBlock * +tryToSplitRestore(MachineBasicBlock *MBB, + ArrayRef<MachineBasicBlock *> DirtyPreds, + const TargetInstrInfo *TII) { + MachineFunction *MF = MBB->getParent(); + + // get the list of DirtyPreds who have a fallthrough to MBB + // before the block layout change. This is just to ensure that if the NMBB is + // inserted after MBB, then we create unconditional branch from + // DirtyPred/CleanPred to NMBB + SmallPtrSet<MachineBasicBlock *, 8> MBBFallthrough; + for (MachineBasicBlock *BB : DirtyPreds) + if (BB->getFallThrough(false) == MBB) + MBBFallthrough.insert(BB); + + MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock(); + // Insert this block at the end of the function. Inserting in between may + // interfere with control flow optimizer decisions. + MF->insert(MF->end(), NMBB); + + for (const MachineBasicBlock::RegisterMaskPair &LI : MBB->liveins()) + NMBB->addLiveIn(LI.PhysReg); + + TII->insertUnconditionalBranch(*NMBB, MBB, DebugLoc()); + + // After splitting, all predecessors of the restore point should be dirty + // blocks. + for (MachineBasicBlock *SuccBB : DirtyPreds) + SuccBB->ReplaceUsesOfBlockWith(MBB, NMBB); + + NMBB->addSuccessor(MBB); + + for (MachineBasicBlock *BBToUpdate : MBBFallthrough) + updateTerminator(BBToUpdate, NMBB, TII); + + return NMBB; +} + +/// This function undoes the restore point split done earlier. +/// +/// DirtyPreds: All predecessors of \p NMBB that are ReachableByDirty. +/// +/// Restore point was split and the change needs to be unrolled. Make necessary +/// changes to reset restore point from \p NMBB to \p MBB. +static void rollbackRestoreSplit(MachineFunction &MF, MachineBasicBlock *NMBB, + MachineBasicBlock *MBB, + ArrayRef<MachineBasicBlock *> DirtyPreds, + const TargetInstrInfo *TII) { + // For a BB, if NMBB is fallthrough in the current layout, then in the new + // layout a. BB should fallthrough to MBB OR b. BB should undconditionally + // branch to MBB + SmallPtrSet<MachineBasicBlock *, 8> NMBBFallthrough; + for (MachineBasicBlock *BB : DirtyPreds) + if (BB->getFallThrough(false) == NMBB) + NMBBFallthrough.insert(BB); + + NMBB->removeSuccessor(MBB); + for (MachineBasicBlock *SuccBB : DirtyPreds) + SuccBB->ReplaceUsesOfBlockWith(NMBB, MBB); + + NMBB->erase(NMBB->begin(), NMBB->end()); + NMBB->eraseFromParent(); + + for (MachineBasicBlock *BBToUpdate : NMBBFallthrough) + updateTerminator(BBToUpdate, MBB, TII); +} + +// A block is deemed fit for restore point split iff there exist +// 1. DirtyPreds - preds of CurRestore reachable from use or def of CSR/FI +// 2. 
CleanPreds - preds of CurRestore that arent DirtyPreds +bool ShrinkWrap::checkIfRestoreSplittable( + const MachineBasicBlock *CurRestore, + const DenseSet<const MachineBasicBlock *> &ReachableByDirty, + SmallVectorImpl<MachineBasicBlock *> &DirtyPreds, + SmallVectorImpl<MachineBasicBlock *> &CleanPreds, + const TargetInstrInfo *TII, RegScavenger *RS) { + for (const MachineInstr &MI : *CurRestore) + if (useOrDefCSROrFI(MI, RS, /*StackAddressUsed=*/true)) + return false; + + for (MachineBasicBlock *PredBB : CurRestore->predecessors()) { + if (!isAnalyzableBB(*TII, *PredBB)) + return false; + + if (ReachableByDirty.count(PredBB)) + DirtyPreds.push_back(PredBB); + else + CleanPreds.push_back(PredBB); + } + + return !(CleanPreds.empty() || DirtyPreds.empty()); +} + +bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF, + RegScavenger *RS) { + if (!EnablePostShrinkWrapOpt) + return false; + + MachineBasicBlock *InitSave = nullptr; + MachineBasicBlock *InitRestore = nullptr; + + if (HasCandidate) { + InitSave = Save; + InitRestore = Restore; + } else { + InitRestore = nullptr; + InitSave = &MF.front(); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHFuncletEntry()) + return false; + if (MBB.isReturnBlock()) { + // Do not support multiple restore points. + if (InitRestore) + return false; + InitRestore = &MBB; + } + } + } + + if (!InitSave || !InitRestore || InitRestore == InitSave || + !MDT->dominates(InitSave, InitRestore) || + !MPDT->dominates(InitRestore, InitSave)) + return false; + + // Bail out of the optimization if any of the basic block is target of + // INLINEASM_BR instruction + for (MachineBasicBlock &MBB : MF) + if (MBB.isInlineAsmBrIndirectTarget()) + return false; + + DenseSet<const MachineBasicBlock *> DirtyBBs; + for (MachineBasicBlock &MBB : MF) { + if (MBB.isEHPad()) { + DirtyBBs.insert(&MBB); + continue; + } + for (const MachineInstr &MI : MBB) + if (useOrDefCSROrFI(MI, RS, /*StackAddressUsed=*/true)) { + DirtyBBs.insert(&MBB); + break; + } + } + + // Find blocks reachable from the use or def of CSRs/FI. + DenseSet<const MachineBasicBlock *> ReachableByDirty; + collectBlocksReachableByDirty(DirtyBBs, ReachableByDirty); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + SmallVector<MachineBasicBlock *, 2> DirtyPreds; + SmallVector<MachineBasicBlock *, 2> CleanPreds; + if (!checkIfRestoreSplittable(InitRestore, ReachableByDirty, DirtyPreds, + CleanPreds, TII, RS)) + return false; + + // Trying to reach out to the new save point which dominates all dirty blocks. + MachineBasicBlock *NewSave = + FindIDom<>(**DirtyPreds.begin(), DirtyPreds, *MDT, false); + + while (NewSave && (hasDirtyPred(ReachableByDirty, *NewSave) || + EntryFreq < MBFI->getBlockFreq(NewSave).getFrequency() || + /*Entry freq has been observed more than a loop block in + some cases*/ + MLI->getLoopFor(NewSave))) + NewSave = FindIDom<>(**NewSave->pred_begin(), NewSave->predecessors(), *MDT, + false); + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + if (!NewSave || NewSave == InitSave || + isSaveReachableThroughClean(NewSave, CleanPreds) || + !TFI->canUseAsPrologue(*NewSave)) + return false; + + // Now we know that splitting a restore point can isolate the restore point + // from clean blocks and doing so can shrink the save point. + MachineBasicBlock *NewRestore = + tryToSplitRestore(InitRestore, DirtyPreds, TII); + + // Make sure if the new restore point is valid as an epilogue, depending on + // targets. 
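// Illustrative sketch (not from the patch): the shape of the save-point
// search above. The helpers are hypothetical stand-ins for
// FindIDom<>(..., *MDT, false), hasDirtyPred, and the MBFI/MLI profitability
// checks; the real walk climbs immediate dominators until the candidate is
// clean, no hotter than the entry block, and outside any loop.
struct BB;
BB *idom(BB *);            // stand-in for the non-strict FindIDom walk
bool hasDirtyPred(BB *);   // any predecessor reachable from a CSR/FI use?
bool tooHotOrInLoop(BB *); // freq above entry freq, or inside a loop
static BB *findNewSave(BB *FirstDirtyPred) {
  BB *Cand = idom(FirstDirtyPred);
  while (Cand && (hasDirtyPred(Cand) || tooHotOrInLoop(Cand)))
    Cand = idom(Cand);
  return Cand; // null or the old save point => the split is abandoned
}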
+  if (!TFI->canUseAsEpilogue(*NewRestore)) {
+    rollbackRestoreSplit(MF, NewRestore, InitRestore, DirtyPreds, TII);
+    return false;
+  }
+
+  Save = NewSave;
+  Restore = NewRestore;
+
+  MDT->runOnMachineFunction(MF);
+  MPDT->runOnMachineFunction(MF);
+
+  assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) &&
+         "Incorrect save or restore point due to dominance relations");
+  assert((!MLI->getLoopFor(Save) && !MLI->getLoopFor(Restore)) &&
+         "Unexpected save or restore point in a loop");
+  assert((EntryFreq >= MBFI->getBlockFreq(Save).getFrequency() &&
+          EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+         "Incorrect save or restore point based on block frequency");
+  return true;
+}
+
 void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
                                          RegScavenger *RS) {
   // Get rid of the easy cases first.
@@ -356,7 +705,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
   // terminator.
   if (Restore == &MBB) {
     for (const MachineInstr &Terminator : MBB.terminators()) {
-      if (!useOrDefCSROrFI(Terminator, RS))
+      if (!useOrDefCSROrFI(Terminator, RS, /*StackAddressUsed=*/true))
         continue;
       // One of the terminators needs to happen before the restore point.
       if (MBB.succ_empty()) {
@@ -463,47 +812,24 @@ static bool giveUpWithRemarks(MachineOptimizationRemarkEmitter *ORE,
   return false;
 }

-bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
-  if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
-
-  init(MF);
-
-  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
-  if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
-    // If MF is irreducible, a block may be in a loop without
-    // MachineLoopInfo reporting it. I.e., we may use the
-    // post-dominance property in loops, which lead to incorrect
-    // results. Moreover, we may miss that the prologue and
-    // epilogue are not in the same loop, leading to unbalanced
-    // construction/deconstruction of the stack frame.
-    return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
-                             "Irreducible CFGs are not supported yet.",
-                             MF.getFunction().getSubprogram(), &MF.front());
-  }
-
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  std::unique_ptr<RegScavenger> RS(
-      TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
-
-  for (MachineBasicBlock &MBB : MF) {
-    LLVM_DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' '
-                      << MBB.getName() << '\n');
+bool ShrinkWrap::performShrinkWrapping(
+    const ReversePostOrderTraversal<MachineBasicBlock *> &RPOT,
+    RegScavenger *RS) {
+  for (MachineBasicBlock *MBB : RPOT) {
+    LLVM_DEBUG(dbgs() << "Look into: " << printMBBReference(*MBB) << '\n');

-    if (MBB.isEHFuncletEntry())
+    if (MBB->isEHFuncletEntry())
       return giveUpWithRemarks(ORE, "UnsupportedEHFunclets",
                                "EH Funclets are not supported yet.",
-                               MBB.front().getDebugLoc(), &MBB);
+                               MBB->front().getDebugLoc(), MBB);
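Two structural invariants govern every candidate pair, and the asserts above restate them after the split: the save block must dominate the restore block, the restore block must post-dominate the save block, and neither may sit in a loop. A standalone sketch of that predicate, assuming only the stock dominator, post-dominator, and loop analyses:

#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
using namespace llvm;

static bool isValidSaveRestorePair(MachineBasicBlock *Save,
                                   MachineBasicBlock *Restore,
                                   MachineDominatorTree &MDT,
                                   MachinePostDominatorTree &MPDT,
                                   const MachineLoopInfo &MLI) {
  // Every path from the entry to a CSR/FI access goes through Save, and
  // every path from such an access to an exit goes through Restore.
  if (!MDT.dominates(Save, Restore) || !MPDT.dominates(Restore, Save))
    return false;
  // A save or restore point inside a loop would run once per iteration.
  return !MLI.getLoopFor(Save) && !MLI.getLoopFor(Restore);
}

Note that postShrinkWrapping re-runs MDT/MPDT before asserting precisely because the split added a new block to the CFG.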
-    if (MBB.isEHPad() || MBB.isInlineAsmBrIndirectTarget()) {
+    if (MBB->isEHPad() || MBB->isInlineAsmBrIndirectTarget()) {
       // Push the prologue and epilogue outside of the region that may throw (or
       // jump out via inlineasm_br), by making sure that all the landing pads
       // are at least at the boundary of the save and restore points. The
       // problem is that a basic block can jump out from the middle in these
       // cases, which we do not handle.
-      updateSaveRestorePoints(MBB, RS.get());
+      updateSaveRestorePoints(*MBB, RS);
       if (!ArePointsInteresting()) {
         LLVM_DEBUG(dbgs() << "EHPad/inlineasm_br prevents shrink-wrapping\n");
         return false;
@@ -511,22 +837,37 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
       continue;
     }

-    for (const MachineInstr &MI : MBB) {
-      if (!useOrDefCSROrFI(MI, RS.get()))
-        continue;
-      // Save (resp. restore) point must dominate (resp. post dominate)
-      // MI. Look for the proper basic block for those.
-      updateSaveRestorePoints(MBB, RS.get());
-      // If we are at a point where we cannot improve the placement of
-      // save/restore instructions, just give up.
-      if (!ArePointsInteresting()) {
-        LLVM_DEBUG(dbgs() << "No Shrink wrap candidate found\n");
-        return false;
+    bool StackAddressUsed = false;
+    // Check if we found any stack accesses in the predecessors. We are not
+    // doing a full dataflow analysis here to keep things simple but just
+    // rely on a reverse post-order traversal (RPOT) to guarantee predecessors
+    // are already processed except for loops (and accept the conservative
+    // result for loops).
+    for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+      if (StackAddressUsedBlockInfo.test(Pred->getNumber())) {
+        StackAddressUsed = true;
+        break;
       }
-      // No need to look for other instructions, this basic block
-      // will already be part of the handled region.
-      break;
     }
+
+    for (const MachineInstr &MI : *MBB) {
+      if (useOrDefCSROrFI(MI, RS, StackAddressUsed)) {
+        // Save (resp. restore) point must dominate (resp. post dominate)
+        // MI. Look for the proper basic block for those.
+        updateSaveRestorePoints(*MBB, RS);
+        // If we are at a point where we cannot improve the placement of
+        // save/restore instructions, just give up.
+        if (!ArePointsInteresting()) {
+          LLVM_DEBUG(dbgs() << "No Shrink wrap candidate found\n");
+          return false;
+        }
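The predecessor scan above is the whole dataflow story: a single pass in reverse post-order with one bit per block. A minimal sketch of the scheme, with a hypothetical blockEstablishesFact() predicate standing in for useOrDefCSROrFI:

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static void propagateFact(MachineFunction &MF,
                          bool (*blockEstablishesFact)(const MachineBasicBlock &)) {
  // One bit per block, conservatively initialized to true.
  BitVector FactPerBlock(MF.getNumBlockIDs(), true);
  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
  for (MachineBasicBlock *MBB : RPOT) {
    bool Fact = blockEstablishesFact(*MBB);
    // In RPOT, all non-back-edge predecessors were already visited.
    for (const MachineBasicBlock *Pred : MBB->predecessors())
      Fact |= FactPerBlock.test(Pred->getNumber());
    FactPerBlock[MBB->getNumber()] = Fact;
  }
}

Initializing the bits to true is what makes loops safe: a back-edge predecessor has not been visited yet, so reading its bit yields the conservative answer, while fully processed predecessors contribute exact results.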
+        // No need to look for other instructions, this basic block
+        // will already be part of the handled region.
+        StackAddressUsed = true;
+        break;
+      }
+    }
+    StackAddressUsedBlockInfo[MBB->getNumber()] = StackAddressUsed;
   }
   if (!ArePointsInteresting()) {
     // If the points are not interesting at this point, then they must be null
@@ -540,13 +881,13 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: "
                     << EntryFreq << '\n');

-  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  const TargetFrameLowering *TFI =
+      MachineFunc->getSubtarget().getFrameLowering();
   do {
     LLVM_DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
-                      << Save->getNumber() << ' ' << Save->getName() << ' '
+                      << printMBBReference(*Save) << ' '
                       << MBFI->getBlockFreq(Save).getFrequency()
-                      << "\nRestore: " << Restore->getNumber() << ' '
-                      << Restore->getName() << ' '
+                      << "\nRestore: " << printMBBReference(*Restore) << ' '
                       << MBFI->getBlockFreq(Restore).getFrequency() << '\n');

     bool IsSaveCheap, TargetCanUseSaveAsPrologue = false;
@@ -570,24 +911,61 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
         break;
       NewBB = Restore;
     }
-    updateSaveRestorePoints(*NewBB, RS.get());
+    updateSaveRestorePoints(*NewBB, RS);
   } while (Save && Restore);

   if (!ArePointsInteresting()) {
     ++NumCandidatesDropped;
     return false;
   }
+  return true;
+}
+
+bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+  init(MF);
+
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+  if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
+    // If MF is irreducible, a block may be in a loop without
+    // MachineLoopInfo reporting it. I.e., we may use the
+    // post-dominance property in loops, which leads to incorrect
+    // results. Moreover, we may miss that the prologue and
+    // epilogue are not in the same loop, leading to unbalanced
+    // construction/deconstruction of the stack frame.
+    return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
+                             "Irreducible CFGs are not supported yet.",
+                             MF.getFunction().getSubprogram(), &MF.front());
+  }
+
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  std::unique_ptr<RegScavenger> RS(
+      TRI->requiresRegisterScavenging(MF) ?
new RegScavenger() : nullptr); + + bool Changed = false; + + StackAddressUsedBlockInfo.resize(MF.getNumBlockIDs(), true); + bool HasCandidate = performShrinkWrapping(RPOT, RS.get()); + StackAddressUsedBlockInfo.clear(); + Changed = postShrinkWrapping(HasCandidate, MF, RS.get()); + if (!HasCandidate && !Changed) + return false; + if (!ArePointsInteresting()) + return Changed; LLVM_DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: " - << Save->getNumber() << ' ' << Save->getName() - << "\nRestore: " << Restore->getNumber() << ' ' - << Restore->getName() << '\n'); + << printMBBReference(*Save) << ' ' + << "\nRestore: " << printMBBReference(*Restore) << '\n'); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setSavePoint(Save); MFI.setRestorePoint(Restore); ++NumCandidates; - return false; + return Changed; } bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) { diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 3fed707a9eb1..d09953e76a80 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -38,21 +38,21 @@ STATISTIC(NumSpilled, "Number of registers live across unwind edges"); namespace { class SjLjEHPrepare : public FunctionPass { - IntegerType *DataTy; - Type *doubleUnderDataTy; - Type *doubleUnderJBufTy; - Type *FunctionContextTy; + IntegerType *DataTy = nullptr; + Type *doubleUnderDataTy = nullptr; + Type *doubleUnderJBufTy = nullptr; + Type *FunctionContextTy = nullptr; FunctionCallee RegisterFn; FunctionCallee UnregisterFn; - Function *BuiltinSetupDispatchFn; - Function *FrameAddrFn; - Function *StackAddrFn; - Function *StackRestoreFn; - Function *LSDAAddrFn; - Function *CallSiteFn; - Function *FuncCtxFn; - AllocaInst *FuncCtx; - const TargetMachine *TM; + Function *BuiltinSetupDispatchFn = nullptr; + Function *FrameAddrFn = nullptr; + Function *StackAddrFn = nullptr; + Function *StackRestoreFn = nullptr; + Function *LSDAAddrFn = nullptr; + Function *CallSiteFn = nullptr; + Function *FuncCtxFn = nullptr; + AllocaInst *FuncCtx = nullptr; + const TargetMachine *TM = nullptr; public: static char ID; // Pass identification, replacement for typeid diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp index ee3a0164564e..47ee36971d0e 100644 --- a/llvm/lib/CodeGen/SlotIndexes.cpp +++ b/llvm/lib/CodeGen/SlotIndexes.cpp @@ -215,7 +215,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, --MBBI; else pastStart = true; - } else if (MI && mi2iMap.find(MI) == mi2iMap.end()) { + } else if (MI && !mi2iMap.contains(MI)) { if (MBBI != Begin) --MBBI; else @@ -232,7 +232,7 @@ void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, for (MachineBasicBlock::iterator I = End; I != Begin;) { --I; MachineInstr &MI = *I; - if (!MI.isDebugOrPseudoInstr() && mi2iMap.find(&MI) == mi2iMap.end()) + if (!MI.isDebugOrPseudoInstr() && !mi2iMap.contains(&MI)) insertMachineInstrInMaps(MI); } } diff --git a/llvm/lib/CodeGen/SpillPlacement.h b/llvm/lib/CodeGen/SpillPlacement.h index d2273a163025..bd37d85c6c0d 100644 --- a/llvm/lib/CodeGen/SpillPlacement.h +++ b/llvm/lib/CodeGen/SpillPlacement.h @@ -42,15 +42,15 @@ class MachineLoopInfo; class SpillPlacement : public MachineFunctionPass { struct Node; - const MachineFunction *MF; - const EdgeBundles *bundles; - const MachineLoopInfo *loops; - const MachineBlockFrequencyInfo *MBFI; + const MachineFunction *MF = nullptr; + const EdgeBundles *bundles = nullptr; + const MachineLoopInfo *loops = nullptr; + const MachineBlockFrequencyInfo 
*MBFI = nullptr; Node *nodes = nullptr; // Nodes that are active in the current computation. Owned by the prepare() // caller. - BitVector *ActiveNodes; + BitVector *ActiveNodes = nullptr; // Nodes with active links. Populated by scanActiveBundles. SmallVector<unsigned, 8> Linked; diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index 92e820c9d3d8..83964eced597 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -514,10 +514,10 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) { VFP = ValueForcePair(nullptr, true); } -SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg, - MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, - unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { - const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); +SlotIndex SplitEditor::buildSingleSubRegCopy( + Register FromReg, Register ToReg, MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertBefore, unsigned SubIdx, + LiveInterval &DestLI, bool Late, SlotIndex Def, const MCInstrDesc &Desc) { bool FirstCopy = !Def.isValid(); MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) @@ -536,7 +536,8 @@ SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg, SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, LaneBitmask LaneMask, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { - const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); + const MCInstrDesc &Desc = + TII.get(TII.getLiveRangeSplitOpcode(FromReg, *MBB.getParent())); SlotIndexes &Indexes = *LIS.getSlotIndexes(); if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { // The full vreg is copied. @@ -564,7 +565,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, SlotIndex Def; for (unsigned BestIdx : SubIndexes) { Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, - DestLI, Late, Def); + DestLI, Late, Def, Desc); } BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); @@ -1365,7 +1366,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) { // The point we want to extend is 0d to 16e not 16r in this case, but if // we use 16r here we will extend nothing because that already contained // in [16e, 32d). - unsigned OpIdx = MI->getOperandNo(&MO); + unsigned OpIdx = MO.getOperandNo(); unsigned DefOpIdx = MI->findTiedOperandIdx(OpIdx); const MachineOperand &DefOp = MI->getOperand(DefOpIdx); IsEarlyClobber = DefOp.isEarlyClobber(); @@ -1584,7 +1585,9 @@ bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI, if (BI.LiveIn && BI.LiveOut) return true; // No point in isolating a copy. It has no register class constraints. - if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike()) + MachineInstr *MI = LIS.getInstructionFromIndex(BI.FirstInstr); + bool copyLike = TII.isCopyInstr(*MI) || MI->isSubregToReg(); + if (copyLike) return false; // Finally, don't isolate an end point that was created by earlier splits. return isOriginalEndpoint(BI.FirstInstr); diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h index 5a3428a5e91f..1174e392e4e4 100644 --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -151,13 +151,13 @@ private: /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where /// the live range has a gap. 
-  unsigned NumGapBlocks;
+  unsigned NumGapBlocks = 0u;

   /// ThroughBlocks - Block numbers where CurLI is live through without uses.
   BitVector ThroughBlocks;

   /// NumThroughBlocks - Number of live-through blocks.
-  unsigned NumThroughBlocks;
+  unsigned NumThroughBlocks = 0u;

   // Summarize statistics by counting instructions using CurLI.
   void analyzeUses();
@@ -428,8 +428,11 @@ private:
                       bool Late, unsigned RegIdx);

   SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg,
-      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
-      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def);
+                                  MachineBasicBlock &MB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  unsigned SubIdx, LiveInterval &DestLI,
+                                  bool Late, SlotIndex Def,
+                                  const MCInstrDesc &Desc);

 public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 11c6bdc69956..66b9086e1d88 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -407,8 +407,8 @@ namespace {
 /// StackColoring - A machine pass for merging disjoint stack allocations,
 /// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
 class StackColoring : public MachineFunctionPass {
-  MachineFrameInfo *MFI;
-  MachineFunction *MF;
+  MachineFrameInfo *MFI = nullptr;
+  MachineFunction *MF = nullptr;

   /// A class representing liveness information for a single basic block.
   /// Each bit in the BitVector represents the liveness property
@@ -448,7 +448,7 @@ class StackColoring : public MachineFunctionPass {
   VNInfo::Allocator VNInfoAllocator;

   /// SlotIndex analysis object.
-  SlotIndexes *Indexes;
+  SlotIndexes *Indexes = nullptr;

   /// The list of lifetime markers found. These markers are to be removed
   /// once the coloring is done.
@@ -935,12 +935,13 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {

   // Remap debug information that refers to stack slots.
   for (auto &VI : MF->getVariableDbgInfo()) {
-    if (!VI.Var)
+    if (!VI.Var || !VI.inStackSlot())
       continue;
-    if (SlotRemap.count(VI.Slot)) {
+    int Slot = VI.getStackSlot();
+    if (SlotRemap.count(Slot)) {
       LLVM_DEBUG(dbgs() << "Remapping debug info for ["
                         << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
-      VI.Slot = SlotRemap[VI.Slot];
+      VI.updateStackSlot(SlotRemap[Slot]);
       FixedDbg++;
     }
   }
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 3a48dd5b0a03..5d3903ed84ce 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -210,8 +210,9 @@ struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
     SlotDbgMap SlotDebugMap;

     // add variables to the map
-    for (MachineFunction::VariableDbgInfo &DI : MF.getVariableDbgInfo())
-      SlotDebugMap[DI.Slot].insert(DI.Var);
+    for (MachineFunction::VariableDbgInfo &DI :
+         MF.getInStackSlotVariableDbgInfo())
+      SlotDebugMap[DI.getStackSlot()].insert(DI.Var);

     // Then add all the spills that have debug data
     for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
index b83c56903133..778ac1f5701c 100644
--- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -48,7 +48,7 @@ namespace {
 /// information provided by this pass is optional and not required by the
 /// aforementioned intrinsic to function.
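The StackColoring hunk above rewrites variable debug info through the new inStackSlot()/getStackSlot() accessors so that locations stay valid after slots are merged; otherwise a variable would keep pointing at a frame index that no longer exists. The remap step itself is a plain map lookup. A hypothetical helper capturing it:

#include "llvm/ADT/DenseMap.h"

// Sketch: a frame index is rewritten only if the coloring merged its slot
// into another one; unmerged slots keep their original index.
static int remapSlot(const llvm::DenseMap<int, int> &SlotRemap, int Slot) {
  auto It = SlotRemap.find(Slot);
  return It == SlotRemap.end() ? Slot : It->second;
}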
class StackMapLiveness : public MachineFunctionPass { - const TargetRegisterInfo *TRI; + const TargetRegisterInfo *TRI = nullptr; LivePhysRegs LiveRegs; public: diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index bb7a51e49edb..f9115e434878 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -149,7 +149,7 @@ unsigned StatepointOpers::getGCPointerMap( bool StatepointOpers::isFoldableReg(Register Reg) const { unsigned FoldableAreaStart = getVarIdx(); for (const MachineOperand &MO : MI->uses()) { - if (MI->getOperandNo(&MO) >= FoldableAreaStart) + if (MO.getOperandNo() >= FoldableAreaStart) break; if (MO.isReg() && MO.getReg() == Reg) return false; @@ -193,9 +193,12 @@ unsigned StackMaps::getNextMetaArgIdx(const MachineInstr *MI, unsigned CurIdx) { /// Go up the super-register chain until we hit a valid dwarf register number. static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) { - int RegNum = TRI->getDwarfRegNum(Reg, false); - for (MCSuperRegIterator SR(Reg, TRI); SR.isValid() && RegNum < 0; ++SR) - RegNum = TRI->getDwarfRegNum(*SR, false); + int RegNum; + for (MCPhysReg SR : TRI->superregs_inclusive(Reg)) { + RegNum = TRI->getDwarfRegNum(SR, false); + if (RegNum >= 0) + break; + } assert(RegNum >= 0 && "Invalid Dwarf register number."); return (unsigned)RegNum; @@ -389,7 +392,7 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const { break; } I->Size = std::max(I->Size, II->Size); - if (TRI->isSuperRegister(I->Reg, II->Reg)) + if (I->Reg && TRI->isSuperRegister(I->Reg, II->Reg)) I->Reg = II->Reg; II->Reg = 0; // mark for deletion. } diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 46685f7b8208..387b653f8815 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -15,9 +15,9 @@ #include "llvm/CodeGen/StackProtector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/Passes.h" @@ -30,6 +30,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -96,7 +97,7 @@ bool StackProtector::runOnFunction(Function &Fn) { SSPBufferSize = Fn.getFnAttributeAsParsedInteger( "stack-protector-buffer-size", DefaultSSPBufferSize); - if (!RequiresStackProtector()) + if (!requiresStackProtector(F, &Layout)) return false; // TODO(etienneb): Functions with funclets are not correctly supported now. @@ -121,9 +122,9 @@ bool StackProtector::runOnFunction(Function &Fn) { /// \param [out] IsLarge is set to true if a protectable array is found and /// it is "large" ( >= ssp-buffer-size). In the case of a structure with /// multiple arrays, this gets set if any of them is large. 
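The getDwarfRegNum rewrite above relies on the fact that many sub-registers (the x86 high-byte registers, for instance) have no DWARF number of their own, while some register further up the chain does. The same fallback as a reusable sketch, assuming the superregs_inclusive() iteration introduced elsewhere in this commit:

#include "llvm/CodeGen/TargetRegisterInfo.h"

// superregs_inclusive visits Reg itself first, then increasingly wide
// super-registers, so this returns the narrowest register in the chain
// that has a DWARF mapping, or -1 if none does.
static int dwarfRegNumOrSuper(const llvm::TargetRegisterInfo &TRI,
                              llvm::MCPhysReg Reg) {
  for (llvm::MCPhysReg SR : TRI.superregs_inclusive(Reg)) {
    int Num = TRI.getDwarfRegNum(SR, /*isEH=*/false);
    if (Num >= 0)
      return Num;
  }
  return -1;
}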
-bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, - bool Strong, - bool InStruct) const { +static bool ContainsProtectableArray(Type *Ty, Module *M, unsigned SSPBufferSize, + bool &IsLarge, bool Strong, + bool InStruct) { if (!Ty) return false; if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { @@ -132,7 +133,7 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, // add stack protectors unless the array is a character array. // However, in strong mode any array, regardless of type and size, // triggers a protector. - if (!Strong && (InStruct || !Trip.isOSDarwin())) + if (!Strong && (InStruct || !Triple(M->getTargetTriple()).isOSDarwin())) return false; } @@ -154,7 +155,7 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, bool NeedsProtector = false; for (Type *ET : ST->elements()) - if (ContainsProtectableArray(ET, IsLarge, Strong, true)) { + if (ContainsProtectableArray(ET, M, SSPBufferSize, IsLarge, Strong, true)) { // If the element is a protectable array and is large (>= SSPBufferSize) // then we are done. If the protectable array is not large, then // keep looking in case a subsequent element is a large array. @@ -166,8 +167,10 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, return NeedsProtector; } -bool StackProtector::HasAddressTaken(const Instruction *AI, - TypeSize AllocSize) { +/// Check whether a stack allocation has its address taken. +static bool HasAddressTaken(const Instruction *AI, TypeSize AllocSize, + Module *M, + SmallPtrSet<const PHINode *, 16> &VisitedPHIs) { const DataLayout &DL = M->getDataLayout(); for (const User *U : AI->users()) { const auto *I = cast<Instruction>(U); @@ -221,14 +224,14 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // assume the scalable value is of minimum size. TypeSize NewAllocSize = TypeSize::Fixed(AllocSize.getKnownMinValue()) - OffsetSize; - if (HasAddressTaken(I, NewAllocSize)) + if (HasAddressTaken(I, NewAllocSize, M, VisitedPHIs)) return true; break; } case Instruction::BitCast: case Instruction::Select: case Instruction::AddrSpaceCast: - if (HasAddressTaken(I, AllocSize)) + if (HasAddressTaken(I, AllocSize, M, VisitedPHIs)) return true; break; case Instruction::PHI: { @@ -236,7 +239,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // they are only visited once. const auto *PN = cast<PHINode>(I); if (VisitedPHIs.insert(PN).second) - if (HasAddressTaken(PN, AllocSize)) + if (HasAddressTaken(PN, AllocSize, M, VisitedPHIs)) return true; break; } @@ -282,10 +285,19 @@ static const CallInst *findStackProtectorIntrinsic(Function &F) { /// functions with aggregates that contain any buffer regardless of type and /// size, and functions that contain stack-based variables that have had their /// address taken. -bool StackProtector::RequiresStackProtector() { +bool StackProtector::requiresStackProtector(Function *F, SSPLayoutMap *Layout) { + Module *M = F->getParent(); bool Strong = false; bool NeedsProtector = false; + // The set of PHI nodes visited when determining if a variable's reference has + // been taken. This set is maintained to ensure we don't visit the same PHI + // node multiple times. 
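HasAddressTaken recurses through an alloca's users, and PHI nodes can make that use graph cyclic; the VisitedPHIs set declared just below is what bounds the recursion. The guard pattern in isolation, with a hypothetical escapes() predicate standing in for the real leaf checks (calls, stores of the pointer, and so on):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool escapes(const Instruction *I); // hypothetical leaf predicate

static bool walkUsers(const Instruction *I,
                      SmallPtrSetImpl<const PHINode *> &Visited) {
  for (const User *U : I->users()) {
    const auto *UI = cast<Instruction>(U);
    if (const auto *PN = dyn_cast<PHINode>(UI)) {
      // insert().second is false on a repeat visit, so a cycle of PHIs
      // is entered exactly once instead of recursing forever.
      if (Visited.insert(PN).second && walkUsers(PN, Visited))
        return true;
    } else if (escapes(UI)) {
      return true;
    }
  }
  return false;
}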
+ SmallPtrSet<const PHINode *, 16> VisitedPHIs; + + unsigned SSPBufferSize = F->getFnAttributeAsParsedInteger( + "stack-protector-buffer-size", DefaultSSPBufferSize); + if (F->hasFnAttribute(Attribute::SafeStack)) return false; @@ -295,6 +307,8 @@ bool StackProtector::RequiresStackProtector() { OptimizationRemarkEmitter ORE(F); if (F->hasFnAttribute(Attribute::StackProtectReq)) { + if (!Layout) + return true; ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F) << "Stack protection applied to function " @@ -324,21 +338,27 @@ bool StackProtector::RequiresStackProtector() { if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) { // A call to alloca with size >= SSPBufferSize requires // stack protectors. - Layout.insert(std::make_pair(AI, - MachineFrameInfo::SSPLK_LargeArray)); + if (!Layout) + return true; + Layout->insert( + std::make_pair(AI, MachineFrameInfo::SSPLK_LargeArray)); ORE.emit(RemarkBuilder); NeedsProtector = true; } else if (Strong) { // Require protectors for all alloca calls in strong mode. - Layout.insert(std::make_pair(AI, - MachineFrameInfo::SSPLK_SmallArray)); + if (!Layout) + return true; + Layout->insert( + std::make_pair(AI, MachineFrameInfo::SSPLK_SmallArray)); ORE.emit(RemarkBuilder); NeedsProtector = true; } } else { // A call to alloca with a variable size requires protectors. - Layout.insert(std::make_pair(AI, - MachineFrameInfo::SSPLK_LargeArray)); + if (!Layout) + return true; + Layout->insert( + std::make_pair(AI, MachineFrameInfo::SSPLK_LargeArray)); ORE.emit(RemarkBuilder); NeedsProtector = true; } @@ -346,10 +366,13 @@ bool StackProtector::RequiresStackProtector() { } bool IsLarge = false; - if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) { - Layout.insert(std::make_pair(AI, IsLarge - ? MachineFrameInfo::SSPLK_LargeArray - : MachineFrameInfo::SSPLK_SmallArray)); + if (ContainsProtectableArray(AI->getAllocatedType(), M, SSPBufferSize, + IsLarge, Strong, false)) { + if (!Layout) + return true; + Layout->insert(std::make_pair( + AI, IsLarge ? MachineFrameInfo::SSPLK_LargeArray + : MachineFrameInfo::SSPLK_SmallArray)); ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I) << "Stack protection applied to function " @@ -361,10 +384,14 @@ bool StackProtector::RequiresStackProtector() { continue; } - if (Strong && HasAddressTaken(AI, M->getDataLayout().getTypeAllocSize( - AI->getAllocatedType()))) { + if (Strong && + HasAddressTaken( + AI, M->getDataLayout().getTypeAllocSize(AI->getAllocatedType()), + M, VisitedPHIs)) { ++NumAddrTaken; - Layout.insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf)); + if (!Layout) + return true; + Layout->insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf)); ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I) @@ -455,18 +482,15 @@ bool StackProtector::InsertStackProtectors() { if (&BB == FailBB) continue; Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator()); - if (!CheckLoc && !DisableCheckNoReturn) { - for (auto &Inst : BB) { - auto *CB = dyn_cast<CallBase>(&Inst); - if (!CB) - continue; - if (!CB->doesNotReturn()) - continue; - // Do stack check before non-return calls (e.g: __cxa_throw) - CheckLoc = CB; - break; - } - } + if (!CheckLoc && !DisableCheckNoReturn) + for (auto &Inst : BB) + if (auto *CB = dyn_cast<CallBase>(&Inst)) + // Do stack check before noreturn calls that aren't nounwind (e.g: + // __cxa_throw). 
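As the hunks above show, requiresStackProtector now doubles as a cheap predicate: passing Layout == nullptr turns it into a query that returns at the first protectable allocation instead of recording every one. The contract in miniature, with hypothetical helpers and a stand-in for StackProtector's own layout-map typedef:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

using LayoutMap = DenseMap<const AllocaInst *, MachineFrameInfo::SSPLayoutKind>;
SmallVector<AllocaInst *, 8> protectableAllocas(Function &F); // hypothetical

static bool scanAllocas(Function &F, LayoutMap *Layout) {
  bool NeedsProtector = false;
  for (AllocaInst *AI : protectableAllocas(F)) {
    if (!Layout)
      return true; // Query mode: one hit is enough.
    Layout->insert(std::make_pair(AI, MachineFrameInfo::SSPLK_LargeArray));
    NeedsProtector = true; // Analysis mode: keep recording.
  }
  return NeedsProtector;
}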
+ if (CB->doesNotReturn() && !CB->doesNotThrow()) { + CheckLoc = CB; + break; + } if (!CheckLoc) continue; @@ -594,18 +618,19 @@ BasicBlock *StackProtector::CreateFailBB() { if (F->getSubprogram()) B.SetCurrentDebugLocation( DILocation::get(Context, 0, 0, F->getSubprogram())); + FunctionCallee StackChkFail; + SmallVector<Value *, 1> Args; if (Trip.isOSOpenBSD()) { - FunctionCallee StackChkFail = M->getOrInsertFunction( - "__stack_smash_handler", Type::getVoidTy(Context), - Type::getInt8PtrTy(Context)); - - B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH")); + StackChkFail = M->getOrInsertFunction("__stack_smash_handler", + Type::getVoidTy(Context), + Type::getInt8PtrTy(Context)); + Args.push_back(B.CreateGlobalStringPtr(F->getName(), "SSH")); } else { - FunctionCallee StackChkFail = + StackChkFail = M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context)); - - B.CreateCall(StackChkFail, {}); } + cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn); + B.CreateCall(StackChkFail, Args); B.CreateUnreachable(); return FailBB; } diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index b8c750688914..6d933ab12041 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -58,10 +59,10 @@ STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated"); namespace { class StackSlotColoring : public MachineFunctionPass { - LiveStacks* LS; - MachineFrameInfo *MFI; - const TargetInstrInfo *TII; - const MachineBlockFrequencyInfo *MBFI; + LiveStacks *LS = nullptr; + MachineFrameInfo *MFI = nullptr; + const TargetInstrInfo *TII = nullptr; + const MachineBlockFrequencyInfo *MBFI = nullptr; // SSIntervals - Spill slot intervals. std::vector<LiveInterval*> SSIntervals; @@ -90,8 +91,50 @@ namespace { // UsedColors - "Colors" that have been assigned. This is per stack ID SmallVector<BitVector, 2> UsedColors; + // Join all intervals sharing one color into a single LiveIntervalUnion to + // speedup range overlap test. + class ColorAssignmentInfo { + // Single liverange (used to avoid creation of LiveIntervalUnion). + LiveInterval *SingleLI = nullptr; + // LiveIntervalUnion to perform overlap test. + LiveIntervalUnion *LIU = nullptr; + // LiveIntervalUnion has a parameter in its constructor so doing this + // dirty magic. + uint8_t LIUPad[sizeof(LiveIntervalUnion)]; + + public: + ~ColorAssignmentInfo() { + if (LIU) + LIU->~LiveIntervalUnion(); // Dirty magic again. + } + + // Return true if LiveInterval overlaps with any + // intervals that have already been assigned to this color. + bool overlaps(LiveInterval *LI) const { + if (LIU) + return LiveIntervalUnion::Query(*LI, *LIU).checkInterference(); + return SingleLI ? SingleLI->overlaps(*LI) : false; + } + + // Add new LiveInterval to this color. 
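The LIUPad buffer and the explicit ~LiveIntervalUnion() call in the StackSlotColoring hunk are the lazy placement-new idiom: construction is deferred until a second interval actually shares the color, avoiding a LiveIntervalUnion (whose constructor takes an allocator) for the common single-interval case. A freestanding sketch with the alignment made explicit; assume emplace() runs at most once:

#include <new>

template <typename T> class LazySlot {
  alignas(T) unsigned char Buf[sizeof(T)]; // raw storage, no T yet
  T *Ptr = nullptr;

public:
  template <typename... Args> T &emplace(Args &&...A) {
    Ptr = new (Buf) T(static_cast<Args &&>(A)...); // construct in place
    return *Ptr;
  }
  ~LazySlot() {
    if (Ptr)
      Ptr->~T(); // mirror of the manual call in ~ColorAssignmentInfo()
  }
};

std::optional<T> is the usual alternative when T is movable or cheap to default-construct; the manual buffer sidesteps both requirements.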
+ void add(LiveInterval *LI, LiveIntervalUnion::Allocator &Alloc) { + assert(!overlaps(LI)); + if (LIU) { + LIU->unify(*LI, *LI); + } else if (SingleLI) { + LIU = new (LIUPad) LiveIntervalUnion(Alloc); + LIU->unify(*SingleLI, *SingleLI); + LIU->unify(*LI, *LI); + SingleLI = nullptr; + } else + SingleLI = LI; + } + }; + + LiveIntervalUnion::Allocator LIUAlloc; + // Assignments - Color to intervals mapping. - SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments; + SmallVector<ColorAssignmentInfo, 16> Assignments; public: static char ID; // Pass identification @@ -116,7 +159,6 @@ namespace { private: void InitializeSlots(); void ScanForSpillSlotRefs(MachineFunction &MF); - bool OverlapWithAssignments(LiveInterval *li, int Color) const; int ColorSlot(LiveInterval *li); bool ColorSlots(MachineFunction &MF); void RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping, @@ -247,19 +289,6 @@ void StackSlotColoring::InitializeSlots() { NextColors[I] = AllColors[I].find_first(); } -/// OverlapWithAssignments - Return true if LiveInterval overlaps with any -/// LiveIntervals that have already been assigned to the specified color. -bool -StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const { - const SmallVectorImpl<LiveInterval *> &OtherLIs = Assignments[Color]; - for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) { - LiveInterval *OtherLI = OtherLIs[i]; - if (OtherLI->overlaps(*li)) - return true; - } - return false; -} - /// ColorSlot - Assign a "color" (stack slot) to the specified stack slot. int StackSlotColoring::ColorSlot(LiveInterval *li) { int Color = -1; @@ -272,7 +301,7 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) { // Check if it's possible to reuse any of the used colors. Color = UsedColors[StackID].find_first(); while (Color != -1) { - if (!OverlapWithAssignments(li, Color)) { + if (!Assignments[Color].overlaps(li)) { Share = true; ++NumEliminated; break; @@ -298,7 +327,7 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) { assert(MFI->getStackID(Color) == MFI->getStackID(FI)); // Record the assignment. - Assignments[Color].push_back(li); + Assignments[Color].add(li, LIUAlloc); LLVM_DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n"); // Change size and alignment of the allocated slot. If there are multiple @@ -515,8 +544,6 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) { OrigSizes.clear(); AllColors.clear(); UsedColors.clear(); - for (unsigned i = 0, e = Assignments.size(); i != e; ++i) - Assignments[i].clear(); Assignments.clear(); return Changed; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 865add28f781..5ed67bd0a121 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -427,7 +427,13 @@ void TailDuplicator::duplicateInstruction( } else { // For mapped registers that do not have sub-registers, simply // restrict their class to match the original one. - ConstrRC = MRI->constrainRegClass(VI->second.Reg, OrigRC); + + // We don't want debug instructions affecting the resulting code so + // if we're cloning a debug instruction then just use MappedRC + // rather than constraining the register class further. + ConstrRC = NewMI.isDebugInstr() + ? 
MappedRC + : MRI->constrainRegClass(VI->second.Reg, OrigRC); } if (ConstrRC) { @@ -436,16 +442,13 @@ void TailDuplicator::duplicateInstruction( MO.setReg(VI->second.Reg); // We have Reg -> VI.Reg:VI.SubReg, so if Reg is used with a // sub-register, we need to compose the sub-register indices. - MO.setSubReg(TRI->composeSubRegIndices(MO.getSubReg(), - VI->second.SubReg)); + MO.setSubReg( + TRI->composeSubRegIndices(VI->second.SubReg, MO.getSubReg())); } else { // The direct replacement is not possible, due to failing register // class constraints. An explicit COPY is necessary. Create one - // that can be reused - auto *NewRC = MI->getRegClassConstraint(i, TII, TRI); - if (NewRC == nullptr) - NewRC = OrigRC; - Register NewReg = MRI->createVirtualRegister(NewRC); + // that can be reused. + Register NewReg = MRI->createVirtualRegister(OrigRC); BuildMI(*PredBB, NewMI, NewMI.getDebugLoc(), TII->get(TargetOpcode::COPY), NewReg) .addReg(VI->second.Reg, 0, VI->second.SubReg); @@ -1016,13 +1019,11 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, DenseMap<Register, RegSubRegPair> LocalVRMap; SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos; - MachineBasicBlock::iterator I = TailBB->begin(); // Process PHI instructions first. - while (I != TailBB->end() && I->isPHI()) { + for (MachineInstr &MI : make_early_inc_range(TailBB->phis())) { // Replace the uses of the def of the PHI with the register coming // from PredBB. - MachineInstr *MI = &*I++; - processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false); + processPHI(&MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false); } appendCopies(PredBB, CopyInfos, Copies); } diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index 9430e86fe44d..48a2094f5d45 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -17,7 +17,6 @@ #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/MC/MCAsmInfo.h" @@ -131,16 +130,6 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF, } } -unsigned TargetFrameLowering::getStackAlignmentSkew( - const MachineFunction &MF) const { - // When HHVM function is called, the stack is skewed as the return address - // is removed from the stack before we enter the function. - if (LLVM_UNLIKELY(MF.getFunction().getCallingConv() == CallingConv::HHVM)) - return MF.getTarget().getAllocaPointerSize(); - - return 0; -} - bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP( const MachineFunction &MF) const { if (!hasFP(MF)) diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 0f6cf11ca9d1..09dcddc17b06 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" #include "llvm/CodeGen/StackMaps.h" @@ -439,8 +440,9 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB, // If the COPY instruction in MI can be folded to a stack operation, return // the register class to use. 
static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
+                                              const TargetInstrInfo &TII,
                                               unsigned FoldIdx) {
-  assert(MI.isCopy() && "MI must be a COPY instruction");
+  assert(TII.isCopyInstr(MI) && "MI must be a COPY instruction");
   if (MI.getNumOperands() != 2)
     return nullptr;
   assert(FoldIdx < 2 && "FoldIdx refers to a nonexistent operand");
@@ -629,10 +631,10 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   }

   // Straight COPY may fold as load/store.
-  if (!MI.isCopy() || Ops.size() != 1)
+  if (!isCopyInstr(MI) || Ops.size() != 1)
     return nullptr;

-  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+  const TargetRegisterClass *RC = canFoldCopy(MI, *this, Ops[0]);
   if (!RC)
     return nullptr;
@@ -695,6 +697,61 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
   return NewMI;
 }

+/// transferImplicitOperands - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it. Copy any implicit
+/// operands from MI to the replacement instruction.
+static void transferImplicitOperands(MachineInstr *MI,
+                                     const TargetRegisterInfo *TRI) {
+  MachineBasicBlock::iterator CopyMI = MI;
+  --CopyMI;
+
+  Register DstReg = MI->getOperand(0).getReg();
+  for (const MachineOperand &MO : MI->implicit_operands()) {
+    CopyMI->addOperand(MO);
+
+    // Be conservative about preserving kills when subregister defs are
+    // involved. If there was an implicit kill of a super-register overlapping
+    // the copy result, we would kill the subregisters that previous copies
+    // defined.
+
+    if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg()))
+      CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false);
+  }
+}
+
+void TargetInstrInfo::lowerCopy(MachineInstr *MI,
+                                const TargetRegisterInfo *TRI) const {
+  if (MI->allDefsAreDead()) {
+    MI->setDesc(get(TargetOpcode::KILL));
+    return;
+  }
+
+  MachineOperand &DstMO = MI->getOperand(0);
+  MachineOperand &SrcMO = MI->getOperand(1);
+
+  bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+  if (IdentityCopy || SrcMO.isUndef()) {
+    // No need to insert an identity copy instruction, but replace with a KILL
+    // if liveness is changed.
+    if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
+      // We must make sure the super-register gets killed. Replace the
+      // instruction with KILL.
+      MI->setDesc(get(TargetOpcode::KILL));
+      return;
+    }
+    // Vanilla identity copy.
+    MI->eraseFromParent();
+    return;
+  }
+
+  copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), DstMO.getReg(),
+              SrcMO.getReg(), SrcMO.isKill());
+
+  if (MI->getNumOperands() > 2)
+    transferImplicitOperands(MI, TRI);
+  MI->eraseFromParent();
+  return;
+}
+
 bool TargetInstrInfo::hasReassociableOperands(
     const MachineInstr &Inst, const MachineBasicBlock *MBB) const {
   const MachineOperand &Op1 = Inst.getOperand(1);
@@ -1016,6 +1073,17 @@ void TargetInstrInfo::reassociateOps(
   InsInstrs.push_back(MIB2);
   DelInstrs.push_back(&Prev);
   DelInstrs.push_back(&Root);
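Before the debug-number handling that follows, it is worth checking the algebra: the rewrite described in the next comment preserves C by associativity while producing a different B. A self-contained check with integer '+' standing in for the target opcode:

#include <cassert>

int main() {
  int A = 7, X = 11, Y = 13;
  int B = A + X, C = B + Y;    // before: B = A op X; C = B op Y
  int B2 = X + Y, C2 = A + B2; // after:  B = X op Y; C = A op B
  assert(C == C2);             // C is preserved by associativity...
  assert(B != B2);             // ...but B is not, so only C keeps its
                               // debug-instr number.
  return 0;
}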
+
+  // We transformed:
+  //   B = A op X (Prev)
+  //   C = B op Y (Root)
+  // Into:
+  //   B = X op Y (MIB1)
+  //   C = A op B (MIB2)
+  // C has the same value as before, B doesn't; as such, keep the debug number
+  // of C but not of B.
+  if (unsigned OldRootNum = Root.peekDebugInstrNum())
+    MIB2.getInstr()->setDebugInstrNum(OldRootNum);
 }

 void TargetInstrInfo::genAlternativeCodeSequence(
@@ -1037,18 +1105,20 @@ void TargetInstrInfo::genAlternativeCodeSequence(
     Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
     break;
   default:
-    break;
+    llvm_unreachable("Unknown pattern for machine combiner");
   }

   // Don't reassociate if Prev and Root are in different blocks.
   if (Prev->getParent() != Root.getParent())
     return;

-  assert(Prev && "Unknown pattern for machine combiner");
-
   reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
 }

+MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const {
+  return MachineTraceStrategy::TS_MinInstrCount;
+}
+
 bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
     const MachineInstr &MI) const {
   const MachineFunction &MF = *MI.getMF();
@@ -1329,11 +1399,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
     if (Reg == DestReg)
       return ParamLoadedValue(*DestSrc->Source, Expr);

-    // Cases where super- or sub-registers needs to be described should
-    // be handled by the target's hook implementation.
-    assert(!TRI->isSuperOrSubRegisterEq(Reg, DestReg) &&
-           "TargetInstrInfo::describeLoadedValue can't describe super- or "
-           "sub-regs for copy instructions");
+    // If the target's hook couldn't describe this copy, give up.
     return std::nullopt;
   } else if (auto RegImm = isAddImmediate(MI, Reg)) {
     Register SrcReg = RegImm->Reg;
@@ -1555,15 +1621,107 @@ void TargetInstrInfo::mergeOutliningCandidateAttributes(
     F.addFnAttr(Attribute::NoUnwind);
 }

+outliner::InstrType TargetInstrInfo::getOutliningType(
+    MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+  MachineInstr &MI = *MIT;
+
+  // NOTE: MI.isMetaInstruction() will match CFI_INSTRUCTION, but some targets
+  // have support for outlining those. Special-case that here.
+  if (MI.isCFIInstruction())
+    // Just go right to the target implementation.
+    return getOutliningTypeImpl(MIT, Flags);
+
+  // Be conservative about inline assembly.
+  if (MI.isInlineAsm())
+    return outliner::InstrType::Illegal;
+
+  // Labels generally can't safely be outlined.
+  if (MI.isLabel())
+    return outliner::InstrType::Illegal;
+
+  // Don't let debug instructions impact analysis.
+  if (MI.isDebugInstr())
+    return outliner::InstrType::Invisible;
+
+  // Some other special cases.
+  switch (MI.getOpcode()) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::LIFETIME_START:
+  case TargetOpcode::LIFETIME_END:
+    return outliner::InstrType::Invisible;
+  default:
+    break;
+  }
+
+  // Is this a terminator for a basic block?
+  if (MI.isTerminator()) {
+    // If this is a branch to another block, we can't outline it.
+    if (!MI.getParent()->succ_empty())
+      return outliner::InstrType::Illegal;
+
+    // Don't outline if the branch is not unconditional.
+    if (isPredicated(MI))
+      return outliner::InstrType::Illegal;
+  }
+
+  // Make sure none of the operands of this instruction do anything that
+  // might break if they're moved outside their current function.
+  // This includes MachineBasicBlock references, BlockAddresses,
+  // Constant pool indices and jump table indices.
+  //
+  // A quick note on MO_TargetIndex:
+  // This doesn't seem to be used in any of the architectures that the
+  // MachineOutliner supports, but it was still filtered out in all of them.
+  // There was one exception (RISC-V), but MO_TargetIndex also isn't used there.
+ // As such, this check is removed both here and in the target-specific + // implementations. Instead, we assert to make sure this doesn't + // catch anyone off-guard somewhere down the line. + for (const MachineOperand &MOP : MI.operands()) { + // If you hit this assertion, please remove it and adjust + // `getOutliningTypeImpl` for your target appropriately if necessary. + // Adding the assertion back to other supported architectures + // would be nice too :) + assert(!MOP.isTargetIndex() && "This isn't used quite yet!"); + + // CFI instructions should already have been filtered out at this point. + assert(!MOP.isCFIIndex() && "CFI instructions handled elsewhere!"); + + // PrologEpilogInserter should've already run at this point. + assert(!MOP.isFI() && "FrameIndex instructions should be gone by now!"); + + if (MOP.isMBB() || MOP.isBlockAddress() || MOP.isCPI() || MOP.isJTI()) + return outliner::InstrType::Illegal; + } + + // If we don't know, delegate to the target-specific hook. + return getOutliningTypeImpl(MIT, Flags); +} + bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { // Some instrumentations create special TargetOpcode at the start which // expands to special code sequences which must be present. auto First = MBB.getFirstNonDebugInstr(); - if (First != MBB.end() && - (First->getOpcode() == TargetOpcode::FENTRY_CALL || - First->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER)) + if (First == MBB.end()) + return true; + + if (First->getOpcode() == TargetOpcode::FENTRY_CALL || + First->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER) + return false; + + // Some instrumentations create special pseudo-instructions at or just before + // the end that must be present. + auto Last = MBB.getLastNonDebugInstr(); + if (Last->getOpcode() == TargetOpcode::PATCHABLE_RET || + Last->getOpcode() == TargetOpcode::PATCHABLE_TAIL_CALL) return false; + if (Last != First && Last->isReturn()) { + --Last; + if (Last->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_EXIT || + Last->getOpcode() == TargetOpcode::PATCHABLE_TAIL_CALL) + return false; + } return true; } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index da8b87babc2d..badb7fe53333 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -29,6 +28,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" @@ -49,10 +49,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -209,6 +209,18 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { if (TT.isOSOpenBSD()) { setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); } + + if (TT.isOSWindows() && 
!TT.isOSCygMing()) { + setLibcallName(RTLIB::LDEXP_F32, nullptr); + setLibcallName(RTLIB::LDEXP_F80, nullptr); + setLibcallName(RTLIB::LDEXP_F128, nullptr); + setLibcallName(RTLIB::LDEXP_PPCF128, nullptr); + + setLibcallName(RTLIB::FREXP_F32, nullptr); + setLibcallName(RTLIB::FREXP_F80, nullptr); + setLibcallName(RTLIB::FREXP_F128, nullptr); + setLibcallName(RTLIB::FREXP_PPCF128, nullptr); + } } /// GetFPLibCall - Helper to return the right libcall for the given floating @@ -498,6 +510,16 @@ RTLIB::Libcall RTLIB::getPOWI(EVT RetVT) { POWI_PPCF128); } +RTLIB::Libcall RTLIB::getLDEXP(EVT RetVT) { + return getFPLibCall(RetVT, LDEXP_F32, LDEXP_F64, LDEXP_F80, LDEXP_F128, + LDEXP_PPCF128); +} + +RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) { + return getFPLibCall(RetVT, FREXP_F32, FREXP_F64, FREXP_F80, FREXP_F128, + FREXP_PPCF128); +} + RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order, MVT VT) { unsigned ModeN, ModelN; @@ -724,7 +746,9 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { // with the Target-specific changes necessary. MaxAtomicSizeInBitsSupported = 1024; - MaxDivRemBitWidthSupported = llvm::IntegerType::MAX_INT_BITS; + // Assume that even with libcalls, no target supports wider than 128 bit + // division. + MaxDivRemBitWidthSupported = 128; MaxLargeFPConvertBitWidthSupported = llvm::IntegerType::MAX_INT_BITS; @@ -819,8 +843,8 @@ void TargetLoweringBase::initActions() { ISD::SMULO, ISD::UMULO}, VT, Expand); - // ADDCARRY operations default to expand - setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY, ISD::SETCCCARRY, + // Carry-using overflow operations default to expand. + setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY, ISD::SETCCCARRY, ISD::SADDO_CARRY, ISD::SSUBO_CARRY}, VT, Expand); @@ -843,7 +867,9 @@ void TargetLoweringBase::initActions() { setOperationAction({ISD::BITREVERSE, ISD::PARITY}, VT, Expand); // These library functions default to expand. - setOperationAction({ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI}, VT, Expand); + setOperationAction( + {ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI, ISD::FLDEXP, ISD::FFREXP}, + VT, Expand); // These operations default to expand for vector types. if (VT.isVector()) @@ -867,16 +893,22 @@ void TargetLoweringBase::initActions() { ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_FMAX, - ISD::VECREDUCE_FMIN, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL}, + ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM, + ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL}, VT, Expand); // Named vector shuffles default to expand. setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); - // VP_SREM/UREM default to expand. - // TODO: Expand all VP intrinsics. - setOperationAction(ISD::VP_SREM, VT, Expand); - setOperationAction(ISD::VP_UREM, VT, Expand); + // VP operations default to expand. +#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \ + setOperationAction(ISD::SDOPC, VT, Expand); +#include "llvm/IR/VPIntrinsics.def" + + // FP environment operations default to expand. + setOperationAction(ISD::GET_FPENV, VT, Expand); + setOperationAction(ISD::SET_FPENV, VT, Expand); + setOperationAction(ISD::RESET_FPENV, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. 
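The VPIntrinsics.def include above is an X-macro: the includer defines BEGIN_REGISTER_VP_SDNODE, the .def file expands it once per VP node, and every VP opcode thereby defaults to Expand from a single statement. A minimal self-contained illustration of the pattern (hypothetical opcode list, not the real .def file):

#include <cstdio>

#define FOR_EACH_VP_OPCODE(X) X(VP_ADD) X(VP_SUB) X(VP_SREM)

enum Opcode {
#define HANDLE(Name) Name,
  FOR_EACH_VP_OPCODE(HANDLE)
#undef HANDLE
};

int main() {
  // Expands to one statement per opcode, just as the setOperationAction
  // line above expands once per node registered in VPIntrinsics.def.
#define HANDLE(Name) std::printf("default action for %s: Expand\n", #Name);
  FOR_EACH_VP_OPCODE(HANDLE)
#undef HANDLE
  return 0;
}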
@@ -907,6 +939,9 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand); setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); + + setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Expand); + setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Expand); } MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL, @@ -1137,8 +1172,7 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, unsigned LaneSizeInBits = NewVT.getScalarSizeInBits(); // Convert sizes such as i33 to i64. - if (!isPowerOf2_32(LaneSizeInBits)) - LaneSizeInBits = NextPowerOf2(LaneSizeInBits); + LaneSizeInBits = llvm::bit_ceil(LaneSizeInBits); MVT DestVT = TLI->getRegisterType(NewVT); RegisterVT = DestVT; @@ -1627,7 +1661,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, if (EVT(DestVT).bitsLT(NewVT)) { // Value is expanded, e.g. i64 -> i16. TypeSize NewVTSize = NewVT.getSizeInBits(); // Convert sizes such as i33 to i64. - if (!isPowerOf2_32(NewVTSize.getKnownMinValue())) + if (!llvm::has_single_bit<uint32_t>(NewVTSize.getKnownMinValue())) NewVTSize = NewVTSize.coefficientNextPowerOf2(); return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); } @@ -1691,7 +1725,7 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, // conventions. The frontend should mark functions whose return values // require promoting with signext or zeroext attributes. if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { - MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); + MVT MinVT = TLI.getRegisterType(MVT::i32); if (VT.bitsLT(MinVT)) VT = MinVT; } @@ -1976,9 +2010,10 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const { "__stack_chk_guard"); // FreeBSD has "__stack_chk_guard" defined externally on libc.so - if (TM.getRelocationModel() == Reloc::Static && + if (M.getDirectAccessExternalData() && !TM.getTargetTriple().isWindowsGNUEnvironment() && - !TM.getTargetTriple().isOSFreeBSD()) + !TM.getTargetTriple().isOSFreeBSD() && + !TM.getTargetTriple().isOSDarwin()) GV->setDSOLocal(true); } } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index e760564779c2..3994552884c4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -16,7 +16,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Triple.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" @@ -65,12 +64,17 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" #include <cassert> #include <string> using namespace llvm; using namespace dwarf; +static cl::opt<bool> JumpTableInFunctionSection( + "jumptable-in-function-section", cl::Hidden, cl::init(false), + cl::desc("Putting Jump Table in function section")); + static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags, StringRef &Section) { SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags; @@ -182,26 +186,14 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx, // The small model guarantees static code/data size < 4GB, but not where it // will be in memory. Most of these could end up >2GB away so even a signed // pc-relative 32-bit address is insufficient, theoretically. 
- if (isPositionIndependent()) { - // ILP32 uses sdata4 instead of sdata8 - if (TgtM.getTargetTriple().getEnvironment() == Triple::GNUILP32) { - PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | - dwarf::DW_EH_PE_sdata4; - LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; - TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | - dwarf::DW_EH_PE_sdata4; - } else { - PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | - dwarf::DW_EH_PE_sdata8; - LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8; - TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | - dwarf::DW_EH_PE_sdata8; - } - } else { - PersonalityEncoding = dwarf::DW_EH_PE_absptr; - LSDAEncoding = dwarf::DW_EH_PE_absptr; - TTypeEncoding = dwarf::DW_EH_PE_absptr; - } + // + // Use DW_EH_PE_indirect even for -fno-pic to avoid copy relocations. + LSDAEncoding = dwarf::DW_EH_PE_pcrel | + (TgtM.getTargetTriple().getEnvironment() == Triple::GNUILP32 + ? dwarf::DW_EH_PE_sdata4 + : dwarf::DW_EH_PE_sdata8); + PersonalityEncoding = LSDAEncoding | dwarf::DW_EH_PE_indirect; + TTypeEncoding = LSDAEncoding | dwarf::DW_EH_PE_indirect; break; case Triple::lanai: LSDAEncoding = dwarf::DW_EH_PE_absptr; @@ -591,14 +583,7 @@ static const MCSymbolELF *getLinkedToSymbol(const GlobalObject *GO, if (!MD) return nullptr; - const MDOperand &Op = MD->getOperand(0); - if (!Op.get()) - return nullptr; - - auto *VM = dyn_cast<ValueAsMetadata>(Op); - if (!VM) - report_fatal_error("MD_associated operand is not ValueAsMetadata"); - + auto *VM = cast<ValueAsMetadata>(MD->getOperand(0).get()); auto *OtherGV = dyn_cast<GlobalValue>(VM->getValue()); return OtherGV ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGV)) : nullptr; } @@ -629,21 +614,21 @@ static unsigned getEntrySizeForKind(SectionKind Kind) { /// Return the section prefix name used by options FunctionsSections and /// DataSections. -static StringRef getSectionPrefixForGlobal(SectionKind Kind) { +static StringRef getSectionPrefixForGlobal(SectionKind Kind, bool IsLarge) { if (Kind.isText()) return ".text"; if (Kind.isReadOnly()) - return ".rodata"; + return IsLarge ? ".lrodata" : ".rodata"; if (Kind.isBSS()) - return ".bss"; + return IsLarge ? ".lbss" : ".bss"; if (Kind.isThreadData()) return ".tdata"; if (Kind.isThreadBSS()) return ".tbss"; if (Kind.isData()) - return ".data"; + return IsLarge ? ".ldata" : ".data"; if (Kind.isReadOnlyWithRel()) - return ".data.rel.ro"; + return IsLarge ? ".ldata.rel.ro" : ".data.rel.ro"; llvm_unreachable("Unknown section kind"); } @@ -665,7 +650,10 @@ getELFSectionNameForGlobal(const GlobalObject *GO, SectionKind Kind, Name = ".rodata.cst"; Name += utostr(EntrySize); } else { - Name = getSectionPrefixForGlobal(Kind); + bool IsLarge = false; + if (isa<GlobalVariable>(GO)) + IsLarge = TM.isLargeData(); + Name = getSectionPrefixForGlobal(Kind, IsLarge); } bool HasPrefix = false; @@ -867,6 +855,12 @@ static MCSectionELF *selectELFSectionForGlobal( Group = C->getName(); IsComdat = C->getSelectionKind() == Comdat::Any; } + if (isa<GlobalVariable>(GO)) { + if (TM.isLargeData()) { + assert(TM.getTargetTriple().getArch() == Triple::x86_64); + Flags |= ELF::SHF_X86_64_LARGE; + } + } // Get the section entry size based on the kind. 
unsigned EntrySize = getEntrySizeForKind(Kind); @@ -1217,11 +1211,12 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx, MCSection *TargetLoweringObjectFileMachO::getStaticDtorSection( unsigned Priority, const MCSymbol *KeySym) const { - // TODO(yln): Remove -lower-global-dtors-via-cxa-atexit fallback flag - // (LowerGlobalDtorsViaCxaAtExit) and always issue a fatal error here. - if (TM->Options.LowerGlobalDtorsViaCxaAtExit) - report_fatal_error("@llvm.global_dtors should have been lowered already"); return StaticDtorSection; + // In userspace, we lower global destructors via atexit(), but kernel/kext + // environments do not provide this function so we still need to support the + // legacy way here. + // See the -disable-atexit-based-global-dtor-lowering CodeGen flag for more + // context. } void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer, @@ -1282,6 +1277,20 @@ MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal( StringRef SectionName = GO->getSection(); + const GlobalVariable *GV = dyn_cast<GlobalVariable>(GO); + if (GV && GV->hasImplicitSection()) { + auto Attrs = GV->getAttributes(); + if (Attrs.hasAttribute("bss-section") && Kind.isBSS()) { + SectionName = Attrs.getAttribute("bss-section").getValueAsString(); + } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) { + SectionName = Attrs.getAttribute("rodata-section").getValueAsString(); + } else if (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) { + SectionName = Attrs.getAttribute("relro-section").getValueAsString(); + } else if (Attrs.hasAttribute("data-section") && Kind.isData()) { + SectionName = Attrs.getAttribute("data-section").getValueAsString(); + } + } + const Function *F = dyn_cast<Function>(GO); if (F && F->hasFnAttribute("implicit-section-name")) { SectionName = F->getFnAttribute("implicit-section-name").getValueAsString(); @@ -1411,6 +1420,11 @@ MCSection *TargetLoweringObjectFileMachO::getSectionForConstant( return ReadOnlySection; // .const } +MCSection *TargetLoweringObjectFileMachO::getSectionForCommandLines() const { + return getContext().getMachOSection("__TEXT", "__command_line", 0, + SectionKind::getReadOnly()); +} + const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference( const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM, MachineModuleInfo *MMI, MCStreamer &Streamer) const { @@ -1796,6 +1810,19 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable( COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID); } +bool TargetLoweringObjectFileCOFF::shouldPutJumpTableInFunctionSection( + bool UsesLabelDifference, const Function &F) const { + if (TM->getTargetTriple().getArch() == Triple::x86_64) { + if (!JumpTableInFunctionSection) { + // We can always create relative relocations, so use another section + // that can be marked non-executable. 
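// The Mach-O getExplicitSectionGlobal hunk above lets per-global
// attributes ("bss-section", "rodata-section", "relro-section",
// "data-section") override the section choice. A condensed sketch of that
// lookup, assuming the LLVM headers (the helper name is hypothetical):
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/SectionKind.h"

llvm::StringRef pickSectionName(const llvm::GlobalVariable &GV,
                                llvm::SectionKind Kind,
                                llvm::StringRef Default) {
  // A global the frontend tagged with a matching section attribute is
  // redirected to the named section; otherwise the default stands.
  auto Attrs = GV.getAttributes();
  if (Kind.isBSS() && Attrs.hasAttribute("bss-section"))
    return Attrs.getAttribute("bss-section").getValueAsString();
  if (Kind.isReadOnly() && Attrs.hasAttribute("rodata-section"))
    return Attrs.getAttribute("rodata-section").getValueAsString();
  if (Kind.isData() && Attrs.hasAttribute("data-section"))
    return Attrs.getAttribute("data-section").getValueAsString();
  return Default;
}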
+      return false; +    } +  } +  return TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( +      UsesLabelDifference, F); +} + void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer, Module &M) const { emitLinkerDirectives(Streamer, M); @@ -2152,7 +2179,7 @@ static MCSectionWasm *selectWasmSectionForGlobal( } bool UniqueSectionNames = TM.getUniqueSectionNames(); - SmallString<128> Name = getSectionPrefixForGlobal(Kind); + SmallString<128> Name = getSectionPrefixForGlobal(Kind, /*IsLarge=*/false); if (const auto *F = dyn_cast<Function>(GO)) { const auto &OptionalPrefix = F->getSectionPrefix(); @@ -2335,8 +2362,11 @@ MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal( XCOFF::StorageMappingClass MappingClass; if (Kind.isText()) MappingClass = XCOFF::XMC_PR; - else if (Kind.isData() || Kind.isReadOnlyWithRel() || Kind.isBSS()) + else if (Kind.isData() || Kind.isBSS()) MappingClass = XCOFF::XMC_RW; + else if (Kind.isReadOnlyWithRel()) + MappingClass = + TM.Options.XCOFFReadOnlyPointers ? XCOFF::XMC_RO : XCOFF::XMC_RW; else if (Kind.isReadOnly()) MappingClass = XCOFF::XMC_RO; else @@ -2421,9 +2451,18 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal( return TextSection; } - // TODO: We may put Kind.isReadOnlyWithRel() under option control, because - // user may want to have read-only data with relocations placed into a - // read-only section by the compiler. + if (TM.Options.XCOFFReadOnlyPointers && Kind.isReadOnlyWithRel()) { + if (!TM.getDataSections()) + report_fatal_error( + "ReadOnlyPointers is supported only if data sections is turned on"); + + SmallString<128> Name; + getNameWithPrefix(Name, GO, TM); + return getContext().getXCOFFSection( + Name, SectionKind::getReadOnly(), + XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD)); + } + // For BSS kind, zero initialized data must be emitted to the .data section // because external linkage control sections that get mapped to the .bss // section will be linked as tentative definitions, which is only appropriate diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 3127328c363e..98ea2f21b3c8 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -42,6 +42,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/Threading.h" +#include "llvm/Support/VirtualFileSystem.h" +#include "llvm/Support/WithColor.h" #include "llvm/Target/CGPassBuilderOption.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" @@ -99,6 +101,9 @@ static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden, cl::desc("Disable Copy Propagation pass")); static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining", cl::Hidden, cl::desc("Disable Partial Libcall Inlining")); +static cl::opt<bool> DisableAtExitBasedGlobalDtorLowering( + "disable-atexit-based-global-dtor-lowering", cl::Hidden, + cl::desc("For MachO, disable atexit()-based global destructor lowering")); static cl::opt<bool> EnableImplicitNullChecks( "enable-implicit-null-checks", cl::desc("Fold null checks into faulting memory operations"), @@ -168,12 +173,6 @@ static cl::opt<GlobalISelAbortMode> EnableGlobalISelAbort( clEnumValN(GlobalISelAbortMode::DisableWithDiag, "2", "Disable the abort but emit a diagnostic on failure"))); -// An option that disables inserting FS-AFDO discriminators before emit. -// This is mainly for debugging and tuning purpose.
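// With the XCOFF hunks above, read-only data that needs relocations (say,
// a constant table of function pointers) keeps a read-only mapping class
// when TM.Options.XCOFFReadOnlyPointers is set, instead of being demoted
// to a read-write csect. A simplified self-contained sketch of that
// decision (hypothetical helper, not the LLVM code):
enum class XCOFFClass { XMC_PR, XMC_RO, XMC_RW };

XCOFFClass classify(bool Text, bool Data, bool BSS, bool ReadOnlyWithRel,
                    bool ReadOnlyPointers) {
  if (Text)
    return XCOFFClass::XMC_PR;
  if (Data || BSS)
    return XCOFFClass::XMC_RW;
  if (ReadOnlyWithRel) // relocated read-only data: option-dependent
    return ReadOnlyPointers ? XCOFFClass::XMC_RO : XCOFFClass::XMC_RW;
  return XCOFFClass::XMC_RO; // plain read-only data
}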
-static cl::opt<bool> -    FSNoFinalDiscrim("fs-no-final-discrim", cl::init(false), cl::Hidden, -                     cl::desc("Do not insert FS-AFDO discriminators before " -                              "emit.")); // Disable MIRProfileLoader before RegAlloc. This is for debugging and // tuning purpose. static cl::opt<bool> DisableRAFSProfileLoader( @@ -878,7 +877,7 @@ void TargetPassConfig::addIRPasses() { // For MachO, lower @llvm.global_dtors into @llvm.global_ctors with // __cxa_atexit() calls to avoid emitting the deprecated __mod_term_func. if (TM->getTargetTriple().isOSBinFormatMachO() && -      TM->Options.LowerGlobalDtorsViaCxaAtExit) +      !DisableAtExitBasedGlobalDtorLowering) addPass(createLowerGlobalDtorsLegacyPass()); // Make sure that no unreachable blocks are instruction selected. @@ -977,6 +976,8 @@ void TargetPassConfig::addISelPrepare() { if (requiresCodeGenSCCOrder()) addPass(new DummyCGSCCPass); +  addPass(createCallBrPass()); + // Add both the safe stack and the stack protection passes: each of them will // only protect functions that have corresponding attributes. addPass(createSafeStackPass()); @@ -1082,8 +1083,8 @@ bool TargetPassConfig::addISelPasses() { if (TM->useEmulatedTLS()) addPass(createLowerEmuTLSPass()); -  addPass(createPreISelIntrinsicLoweringPass()); PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis())); +  addPass(createPreISelIntrinsicLoweringPass()); addPass(createExpandLargeDivRemPass()); addPass(createExpandLargeFpConvertPass()); addIRPasses(); @@ -1149,9 +1150,9 @@ void TargetPassConfig::addMachinePasses() { sampleprof::FSDiscriminatorPass::Pass1)); const std::string ProfileFile = getFSProfileFile(TM); if (!ProfileFile.empty() && !DisableRAFSProfileLoader) -      addPass( -          createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM), -                                     sampleprof::FSDiscriminatorPass::Pass1)); +      addPass(createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM), +                                         sampleprof::FSDiscriminatorPass::Pass1, +                                         nullptr)); } // Run register allocation and passes that are tightly coupled with it, @@ -1219,14 +1220,6 @@ void TargetPassConfig::addMachinePasses() { addPass(&XRayInstrumentationID); addPass(&PatchableFunctionID); -  if (EnableFSDiscriminator && !FSNoFinalDiscrim) -    // Add FS discriminators here so that all the instruction duplicates -    // in different BBs get their own discriminators. With this, we can "sum" -    // the SampleFDO counters instead of using MAX. This will improve the -    // SampleFDO profile quality. -    addPass(createMIRAddFSDiscriminatorsPass( -        sampleprof::FSDiscriminatorPass::PassLast)); - addPreEmitPass(); if (TM->Options.EnableIPRA) @@ -1252,6 +1245,10 @@ void TargetPassConfig::addMachinePasses() { addPass(createMachineOutlinerPass(RunOnAllFunctions)); } + if (EnableFSDiscriminator) + addPass(createMIRAddFSDiscriminatorsPass( + sampleprof::FSDiscriminatorPass::PassLast)); + // Machine function splitter uses the basic block sections feature. Both // cannot be enabled at the same time. Basic block sections takes precedence.
// FIXME: In principle, BasicBlockSection::Labels and splitting can be used @@ -1264,9 +1261,25 @@ void TargetPassConfig::addMachinePasses() { addPass(llvm::createBasicBlockSectionsPass()); } else if (TM->Options.EnableMachineFunctionSplitter || EnableMachineFunctionSplitter) { +    const std::string ProfileFile = getFSProfileFile(TM); +    if (!ProfileFile.empty()) { +      if (EnableFSDiscriminator) { +        addPass(createMIRProfileLoaderPass( +            ProfileFile, getFSRemappingFile(TM), +            sampleprof::FSDiscriminatorPass::PassLast, nullptr)); +      } else { +        // Sample profile is given, but FSDiscriminator is not +        // enabled, this may result in performance regression. +        WithColor::warning() +            << "Using AutoFDO without FSDiscriminator for MFS may regress " +               "performance."; +      } +    } addPass(createMachineFunctionSplitterPass()); } + addPostBBSections(); + if (!DisableCFIFixup && TM->Options.EnableCFIFixup) addPass(createCFIFixup()); @@ -1525,9 +1538,9 @@ void TargetPassConfig::addBlockPlacement() { sampleprof::FSDiscriminatorPass::Pass2)); const std::string ProfileFile = getFSProfileFile(TM); if (!ProfileFile.empty() && !DisableLayoutFSProfileLoader) -      addPass( -          createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM), -                                     sampleprof::FSDiscriminatorPass::Pass2)); +      addPass(createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM), +                                         sampleprof::FSDiscriminatorPass::Pass2, +                                         nullptr)); } if (addPass(&MachineBlockPlacementID)) { // Run a separate pass to collect block placement statistics. diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index a41d5999d961..77d2dfcf2323 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -33,7 +34,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Printable.h" #include "llvm/Support/raw_ostream.h" @@ -79,8 +79,8 @@ bool TargetRegisterInfo::shouldRegionSplitForVirtReg( void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet, MCRegister Reg) const { -  for (MCSuperRegIterator AI(Reg, this, true); AI.isValid(); ++AI) -    RegisterSet.set(*AI); +  for (MCPhysReg SR : superregs_inclusive(Reg)) +    RegisterSet.set(SR); } bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, @@ -90,9 +90,9 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, for (unsigned Reg : RegisterSet.set_bits()) { if (Checked[Reg]) continue; -    for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) { -      if (!RegisterSet[*SR] && !is_contained(Exceptions, Reg)) { -        dbgs() << "Error: Super register " << printReg(*SR, this) +    for (MCPhysReg SR : superregs(Reg)) { +      if (!RegisterSet[SR] && !is_contained(Exceptions, Reg)) { +        dbgs() << "Error: Super register " << printReg(SR, this) << " of reserved register " << printReg(Reg, this) << " is not reserved.\n"; return false; @@ -100,7 +100,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet, // We transitively check superregs.
So we can remember this for later // to avoid compile-time explosion in deep register hierarchies. -      Checked.set(*SR); +      Checked.set(SR); } } return true; @@ -281,7 +281,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A, const TargetRegisterInfo *TRI) { for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32) if (unsigned Common = *A++ & *B++) -      return TRI->getRegClass(I + countTrailingZeros(Common)); +      return TRI->getRegClass(I + llvm::countr_zero(Common)); return nullptr; } @@ -424,8 +424,8 @@ bool TargetRegisterInfo::getRegAllocationHints( SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); -  const std::pair<Register, SmallVector<Register, 4>> &Hints_MRI = -      MRI.getRegAllocationHints(VirtReg); +  const std::pair<unsigned, SmallVector<Register, 4>> &Hints_MRI = +      MRI.getRegAllocationHints(VirtReg); SmallSet<Register, 32> HintedRegs; // First hint may be a target hint. diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 8cb3667aea28..c3ea76bf8cea 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -87,18 +87,18 @@ static cl::opt<unsigned> MaxDataFlowEdge( namespace { class TwoAddressInstructionPass : public MachineFunctionPass { -  MachineFunction *MF; -  const TargetInstrInfo *TII; -  const TargetRegisterInfo *TRI; -  const InstrItineraryData *InstrItins; -  MachineRegisterInfo *MRI; -  LiveVariables *LV; -  LiveIntervals *LIS; -  AliasAnalysis *AA; -  CodeGenOpt::Level OptLevel; +  MachineFunction *MF = nullptr; +  const TargetInstrInfo *TII = nullptr; +  const TargetRegisterInfo *TRI = nullptr; +  const InstrItineraryData *InstrItins = nullptr; +  MachineRegisterInfo *MRI = nullptr; +  LiveVariables *LV = nullptr; +  LiveIntervals *LIS = nullptr; +  AliasAnalysis *AA = nullptr; +  CodeGenOpt::Level OptLevel = CodeGenOpt::None; // The current basic block being processed. -  MachineBasicBlock *MBB; +  MachineBasicBlock *MBB = nullptr; // Keep track of the distance of a MI from the start of the current basic block. DenseMap<MachineInstr*, unsigned> DistanceMap; @@ -198,8 +198,6 @@ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) -static bool isPlainlyKilled(MachineInstr *MI, Register Reg, LiveIntervals *LIS); /// Return the MachineInstr* if it is the single def of the Reg in current BB. static MachineInstr *getSingleDef(Register Reg, MachineBasicBlock *BB, const MachineRegisterInfo *MRI) { @@ -287,7 +285,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, /// Test if the given register value, which is used by the /// given instruction, is killed by the given instruction. -static bool isPlainlyKilled(MachineInstr *MI, Register Reg, +static bool isPlainlyKilled(const MachineInstr *MI, Register Reg, LiveIntervals *LIS) { if (LIS && Reg.isVirtual() && !LIS->isNotInMIMap(*MI)) { // FIXME: Sometimes tryInstructionTransform() will add instructions and @@ -311,6 +309,12 @@ static bool isPlainlyKilled(MachineInstr *MI, Register Reg, return MI->killsRegister(Reg); } +/// Test if the register used by the given operand is killed by the operand's +/// instruction.
+static bool isPlainlyKilled(const MachineOperand &MO, LiveIntervals *LIS) { + return MO.isKill() || isPlainlyKilled(MO.getParent(), MO.getReg(), LIS); +} + /// Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. @@ -404,7 +408,7 @@ findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB, } if (UseMI.isCommutable()) { unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex; - unsigned Src2 = UseMI.getOperandNo(UseOp); + unsigned Src2 = UseOp->getOperandNo(); if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) { MachineOperand &MO = UseMI.getOperand(Src1); if (MO.isReg() && MO.isUse() && @@ -693,10 +697,8 @@ bool TwoAddressInstructionPass::convertInstTo3Addr( assert(NewMI->getNumExplicitDefs() == 1); // Find the old and new def location. - auto OldIt = mi->defs().begin(); - auto NewIt = NewMI->defs().begin(); - unsigned OldIdx = mi->getOperandNo(OldIt); - unsigned NewIdx = NewMI->getOperandNo(NewIt); + unsigned OldIdx = mi->defs().begin()->getOperandNo(); + unsigned NewIdx = NewMI->defs().begin()->getOperandNo(); // Record that one def has been replaced by the other. unsigned NewInstrNum = NewMI->getDebugInstrNum(); @@ -863,8 +865,7 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill( Defs.push_back(MOReg); else { Uses.push_back(MOReg); - if (MOReg != Reg && (MO.isKill() || - (LIS && isPlainlyKilled(MI, MOReg, LIS)))) + if (MOReg != Reg && isPlainlyKilled(MO, LIS)) Kills.push_back(MOReg); } } @@ -915,8 +916,7 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill( } else { if (regOverlapsSet(Defs, MOReg, TRI)) return false; - bool isKill = - MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)); + bool isKill = isPlainlyKilled(MO, LIS); if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) || regOverlapsSet(Kills, MOReg, TRI))) // Don't want to extend other live ranges and update kills. @@ -1044,7 +1044,7 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI( continue; if (isDefTooClose(MOReg, DI->second, MI)) return false; - bool isKill = MO.isKill() || (LIS && isPlainlyKilled(KillMI, MOReg, LIS)); + bool isKill = isPlainlyKilled(MO, LIS); if (MOReg == Reg && !isKill) return false; Uses.push_back(MOReg); @@ -1086,8 +1086,7 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI( if (regOverlapsSet(Kills, MOReg, TRI)) // Don't want to extend other live ranges and update kills. return false; - if (&OtherMI != MI && MOReg == Reg && - !(MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS)))) + if (&OtherMI != MI && MOReg == Reg && !isPlainlyKilled(MO, LIS)) // We can't schedule across a use of the register in question. return false; } else { @@ -1533,8 +1532,8 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, S.addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI)); } } else { - for (MCRegUnitIterator Unit(RegA, TRI); Unit.isValid(); ++Unit) { - if (LiveRange *LR = LIS->getCachedRegUnit(*Unit)) { + for (MCRegUnit Unit : TRI->regunits(RegA)) { + if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) { VNInfo *VNI = LR->getNextValue(LastCopyIdx, LIS->getVNInfoAllocator()); LR->addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI)); @@ -1566,8 +1565,8 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, if (AllUsesCopied) { LaneBitmask RemainingUses = LaneBitmask::getNone(); // Replace other (un-tied) uses of regB with LastCopiedReg. 
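// The processTiedPairs hunk just above also migrates from the old
// MCRegUnitIterator protocol to the TRI->regunits() range (the same
// pattern recurs in VirtRegMap.cpp further below). A minimal sketch of
// the new idiom, assuming the LLVM headers; the helper name is
// hypothetical:
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

void dropCachedUnits(llvm::LiveIntervals &LIS, llvm::MCRegister PhysReg,
                     const llvm::TargetRegisterInfo &TRI) {
  // Each physical register covers one or more register units; the range
  // replaces the old isValid()/operator++ iterator loop.
  for (llvm::MCRegUnit Unit : TRI.regunits(PhysReg))
    LIS.removeRegUnit(Unit);
}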
- for (MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { + for (MachineOperand &MO : MI->all_uses()) { + if (MO.getReg() == RegB) { if (MO.getSubReg() == SubRegB && !IsEarlyClobber) { if (MO.isKill()) { MO.setIsKill(false); @@ -1619,8 +1618,8 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. - for (MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { + for (MachineOperand &MO : MI->all_uses()) { + if (MO.getReg() == RegB) { MO.setIsKill(true); break; } diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp index e6c0b3242d67..426292345a14 100644 --- a/llvm/lib/CodeGen/TypePromotion.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -235,8 +235,6 @@ bool TypePromotionImpl::isSource(Value *V) { return true; else if (isa<LoadInst>(V)) return true; - else if (isa<BitCastInst>(V)) - return true; else if (auto *Call = dyn_cast<CallInst>(V)) return Call->hasRetAttr(Attribute::AttrKind::ZExt); else if (auto *Trunc = dyn_cast<TruncInst>(V)) @@ -724,8 +722,9 @@ bool TypePromotionImpl::isSupportedValue(Value *V) { case Instruction::Ret: case Instruction::Load: case Instruction::Trunc: - case Instruction::BitCast: return isSupportedType(I); + case Instruction::BitCast: + return I->getOperand(0)->getType() == I->getType(); case Instruction::ZExt: return isSupportedType(I->getOperand(0)); case Instruction::ICmp: @@ -960,8 +959,8 @@ bool TypePromotionImpl::run(Function &F, const TargetMachine *TM, if (isa<ZExtInst>(&I) && isa<PHINode>(I.getOperand(0)) && isa<IntegerType>(I.getType()) && BBIsInLoop(&BB)) { - LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << I.getOperand(0) - << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " + << *I.getOperand(0) << "\n"); EVT ZExtVT = TLI->getValueType(DL, I.getType()); Instruction *Phi = static_cast<Instruction *>(I.getOperand(0)); auto PromoteWidth = ZExtVT.getFixedSizeInBits(); diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index 5e8514f525e9..f17450d264ba 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -120,16 +120,14 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { while (BB.succ_begin() != BB.succ_end()) { MachineBasicBlock* succ = *BB.succ_begin(); - MachineBasicBlock::iterator start = succ->begin(); - while (start != succ->end() && start->isPHI()) { - for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2) - if (start->getOperand(i).isMBB() && - start->getOperand(i).getMBB() == &BB) { - start->removeOperand(i); - start->removeOperand(i-1); + for (MachineInstr &Phi : succ->phis()) { + for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) { + if (Phi.getOperand(i).isMBB() && + Phi.getOperand(i).getMBB() == &BB) { + Phi.removeOperand(i); + Phi.removeOperand(i - 1); } - - start++; + } } BB.removeSuccessor(BB.succ_begin()); @@ -152,18 +150,18 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // Prune unneeded PHI entries. 
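// The TwoAddressInstructionPass hunks above swap manual operand scans
// guarded by MO.isReg() && MO.isUse() for the filtered MI.all_uses()
// range; MI.all_defs() is the def-side counterpart. A minimal sketch,
// assuming the LLVM headers (hypothetical helper name):
#include "llvm/CodeGen/MachineInstr.h"

void clearKillFlagsFor(llvm::MachineInstr &MI, llvm::Register RegB) {
  // all_uses() yields only register use operands, so no isReg()/isUse()
  // test is needed in the loop body.
  for (llvm::MachineOperand &MO : MI.all_uses())
    if (MO.getReg() == RegB && MO.isKill())
      MO.setIsKill(false);
}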
SmallPtrSet<MachineBasicBlock*, 8> preds(BB.pred_begin(), BB.pred_end()); - MachineBasicBlock::iterator phi = BB.begin(); - while (phi != BB.end() && phi->isPHI()) { - for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2) - if (!preds.count(phi->getOperand(i).getMBB())) { - phi->removeOperand(i); - phi->removeOperand(i-1); + for (MachineInstr &Phi : make_early_inc_range(BB.phis())) { + for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) { + if (!preds.count(Phi.getOperand(i).getMBB())) { + Phi.removeOperand(i); + Phi.removeOperand(i - 1); ModifiedPHI = true; } + } - if (phi->getNumOperands() == 3) { - const MachineOperand &Input = phi->getOperand(1); - const MachineOperand &Output = phi->getOperand(0); + if (Phi.getNumOperands() == 3) { + const MachineOperand &Input = Phi.getOperand(1); + const MachineOperand &Output = Phi.getOperand(0); Register InputReg = Input.getReg(); Register OutputReg = Output.getReg(); assert(Output.getSubReg() == 0 && "Cannot have output subregister"); @@ -182,16 +180,13 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // insert a COPY instead of simply replacing the output // with the input. const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); - BuildMI(BB, BB.getFirstNonPHI(), phi->getDebugLoc(), + BuildMI(BB, BB.getFirstNonPHI(), Phi.getDebugLoc(), TII->get(TargetOpcode::COPY), OutputReg) .addReg(InputReg, getRegState(Input), InputSub); } - phi++->eraseFromParent(); + Phi.eraseFromParent(); } - continue; } - - ++phi; } } diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp index 88460971338c..fc1cbfefb0db 100644 --- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -209,7 +209,7 @@ void VLIWMachineScheduler::schedule() { Topo.InitDAGTopologicalSorting(); // Postprocess the DAG to add platform-specific artificial dependencies. - postprocessDAG(); + postProcessDAG(); SmallVector<SUnit *, 8> TopRoots, BotRoots; findRootsAndBiasEdges(TopRoots, BotRoots); diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 608434800bc3..d514e1642e29 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TypeSize.h" #include "llvm/Support/WithColor.h" @@ -173,9 +174,20 @@ std::string EVT::getEVTString() const { case MVT::Untyped: return "Untyped"; case MVT::funcref: return "funcref"; case MVT::externref: return "externref"; + case MVT::aarch64svcount: + return "aarch64svcount"; + case MVT::spirvbuiltin: + return "spirvbuiltin"; } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void EVT::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif + /// getTypeForEVT - This method returns an LLVM type corresponding to the /// specified EVT. For integer types, this returns an unsigned type. Note /// that this will abort for types that cannot be represented. 
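// The UnreachableMachineBlockElim rewrite above rests on two range
// helpers: MachineBasicBlock::phis(), which visits only the leading PHI
// instructions, and make_early_inc_range, which advances the iterator
// before each body run so the current instruction can be erased safely.
// A minimal sketch, assuming the LLVM headers:
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

void eraseAllPhis(llvm::MachineBasicBlock &BB) {
  for (llvm::MachineInstr &Phi : llvm::make_early_inc_range(BB.phis()))
    Phi.eraseFromParent(); // safe: the range already stepped past Phi
}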
@@ -202,14 +214,12 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::f128: return Type::getFP128Ty(Context); case MVT::ppcf128: return Type::getPPC_FP128Ty(Context); case MVT::x86mmx: return Type::getX86_MMXTy(Context); + case MVT::aarch64svcount: + return TargetExtType::get(Context, "aarch64.svcount"); case MVT::x86amx: return Type::getX86_AMXTy(Context); case MVT::i64x8: return IntegerType::get(Context, 512); - case MVT::externref: - // pointer to opaque struct in addrspace(10) - return PointerType::get(StructType::create(Context), 10); - case MVT::funcref: - // pointer to i8 addrspace(20) - return PointerType::get(Type::getInt8Ty(Context), 20); + case MVT::externref: return Type::getWasm_ExternrefTy(Context); + case MVT::funcref: return Type::getWasm_FuncrefTy(Context); case MVT::v1i1: return FixedVectorType::get(Type::getInt1Ty(Context), 1); case MVT::v2i1: @@ -561,6 +571,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { /// pointers as MVT::iPTR. If HandleUnknown is true, unknown types are returned /// as Other, otherwise they are invalid. MVT MVT::getVT(Type *Ty, bool HandleUnknown){ + assert(Ty != nullptr && "Invalid type"); switch (Ty->getTypeID()) { default: if (HandleUnknown) return MVT(MVT::Other); @@ -575,6 +586,16 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){ case Type::DoubleTyID: return MVT(MVT::f64); case Type::X86_FP80TyID: return MVT(MVT::f80); case Type::X86_MMXTyID: return MVT(MVT::x86mmx); + case Type::TargetExtTyID: { + TargetExtType *TargetExtTy = cast<TargetExtType>(Ty); + if (TargetExtTy->getName() == "aarch64.svcount") + return MVT(MVT::aarch64svcount); + else if (TargetExtTy->getName().starts_with("spirv.")) + return MVT(MVT::spirvbuiltin); + if (HandleUnknown) + return MVT(MVT::Other); + llvm_unreachable("Unknown target ext type!"); + } case Type::X86_AMXTyID: return MVT(MVT::x86amx); case Type::FP128TyID: return MVT(MVT::f128); case Type::PPC_FP128TyID: return MVT(MVT::ppcf128); @@ -607,3 +628,15 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){ } } } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void MVT::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif + +void MVT::print(raw_ostream &OS) const { + OS << EVT(*this).getEVTString(); +} + diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp index f80b06d7e9b7..a816bd5b52de 100644 --- a/llvm/lib/CodeGen/VirtRegMap.cpp +++ b/llvm/lib/CodeGen/VirtRegMap.cpp @@ -116,10 +116,10 @@ bool VirtRegMap::hasPreferredPhys(Register VirtReg) const { } bool VirtRegMap::hasKnownPreference(Register VirtReg) const { - std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(VirtReg); - if (Register::isPhysicalRegister(Hint.second)) + std::pair<unsigned, Register> Hint = MRI->getRegAllocationHint(VirtReg); + if (Hint.second.isPhysical()) return true; - if (Register::isVirtualRegister(Hint.second)) + if (Hint.second.isVirtual()) return hasPhys(Hint.second); return false; } @@ -181,14 +181,14 @@ LLVM_DUMP_METHOD void VirtRegMap::dump() const { namespace { class VirtRegRewriter : public MachineFunctionPass { - MachineFunction *MF; - const TargetRegisterInfo *TRI; - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - SlotIndexes *Indexes; - LiveIntervals *LIS; - VirtRegMap *VRM; - LiveDebugVariables *DebugVars; + MachineFunction *MF = nullptr; + const TargetRegisterInfo *TRI = nullptr; + const TargetInstrInfo *TII = nullptr; + MachineRegisterInfo *MRI = nullptr; + SlotIndexes *Indexes = nullptr; + LiveIntervals *LIS = nullptr; + VirtRegMap *VRM = 
nullptr; + LiveDebugVariables *DebugVars = nullptr; DenseSet<Register> RewriteRegs; bool ClearVirtRegs; @@ -514,8 +514,8 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI, SlotIndex MIIndex = LIS->getInstructionIndex(MI); SlotIndex BeforeMIUses = MIIndex.getBaseIndex(); SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex(); - for (MCRegUnitIterator Unit(SuperPhysReg, TRI); Unit.isValid(); ++Unit) { - const LiveRange &UnitRange = LIS->getRegUnit(*Unit); + for (MCRegUnit Unit : TRI->regunits(SuperPhysReg)) { + const LiveRange &UnitRange = LIS->getRegUnit(Unit); // If the regunit is live both before and after MI, // we assume it is live through. // Generally speaking, this is not true, because something like @@ -633,9 +633,8 @@ void VirtRegRewriter::rewrite() { // Don't bother maintaining accurate LiveIntervals for registers which were // already allocated. for (Register PhysReg : RewriteRegs) { - for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); - ++Units) { - LIS->removeRegUnit(*Units); + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LIS->removeRegUnit(Unit); } } } diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 361f185243b1..cc04807e8455 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -80,6 +80,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WasmEHFuncInfo.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/InitializePasses.h" @@ -209,6 +210,12 @@ bool WasmEHPrepare::prepareEHPads(Function &F) { if (CatchPads.empty() && CleanupPads.empty()) return false; + if (!F.hasPersonalityFn() || + !isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) { + report_fatal_error("Function '" + F.getName() + + "' does not have a correct Wasm personality function " + "'__gxx_wasm_personality_v0'"); + } assert(F.hasPersonalityFn() && "Personality function not found"); // __wasm_lpad_context global variable. diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index dfca2be0a114..11597b119893 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -18,12 +18,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Triple.h" -#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -31,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -216,6 +216,127 @@ static void calculateStateNumbersForInvokes(const Function *Fn, } } +// See comments below for calculateSEHStateForAsynchEH(). 
+// State - incoming State of normal paths +struct WorkItem { +  const BasicBlock *Block; +  int State; +  WorkItem(const BasicBlock *BB, int St) { +    Block = BB; +    State = St; +  } +}; +void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, +                                        WinEHFuncInfo &EHInfo) { +  SmallVector<struct WorkItem *, 8> WorkList; +  struct WorkItem *WI = new WorkItem(BB, State); +  WorkList.push_back(WI); + +  while (!WorkList.empty()) { +    WI = WorkList.pop_back_val(); +    const BasicBlock *BB = WI->Block; +    int State = WI->State; +    delete WI; +    if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) +      continue; // skip blocks already visited by lower State + +    const llvm::Instruction *I = BB->getFirstNonPHI(); +    const llvm::Instruction *TI = BB->getTerminator(); +    if (I->isEHPad()) +      State = EHInfo.EHPadStateMap[I]; +    EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting + +    if ((isa<CleanupReturnInst>(TI) || isa<CatchReturnInst>(TI)) && State > 0) { +      // Retrieve the new State +      State = EHInfo.CxxUnwindMap[State].ToState; // Retrieve next State +    } else if (isa<InvokeInst>(TI)) { +      auto *Call = cast<CallBase>(TI); +      const Function *Fn = Call->getCalledFunction(); +      if (Fn && Fn->isIntrinsic() && +          (Fn->getIntrinsicID() == Intrinsic::seh_scope_begin || +           Fn->getIntrinsicID() == Intrinsic::seh_try_begin)) +        // Retrieve the new State from seh_scope_begin +        State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)]; +      else if (Fn && Fn->isIntrinsic() && +               (Fn->getIntrinsicID() == Intrinsic::seh_scope_end || +                Fn->getIntrinsicID() == Intrinsic::seh_try_end)) { +        // In case of conditional ctor, let's retrieve State from Invoke +        State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)]; +        // end of current state, retrieve new state from UnwindMap +        State = EHInfo.CxxUnwindMap[State].ToState; +      } +    } +    // Continue pushing successors into the worklist +    for (auto *SuccBB : successors(BB)) { +      WI = new WorkItem(SuccBB, State); +      WorkList.push_back(WI); +    } +  } +} + +// The central theory of this routine is based on the following: +// A _try scope is always a SEME (Single Entry Multiple Exits) region +// as jumping into a _try is not allowed. +// The single entry must start with a seh_try_begin() invoke with a +// correct State number that is the initial state of the SEME. +// Through control-flow, state number is propagated into all blocks. +// Side exits marked by seh_try_end() will unwind to parent state via +// existing SEHUnwindMap[]. +// Side exits can ONLY jump into parent scopes (lower state number). +// Thus, when a block inherits various states from its predecessors, +// the lowest State trumps the others. +// If some exits flow to unreachable, propagation on those paths terminates, +// not affecting remaining blocks.
+void llvm::calculateSEHStateForAsynchEH(const BasicBlock *BB, int State, +                                        WinEHFuncInfo &EHInfo) { +  SmallVector<struct WorkItem *, 8> WorkList; +  struct WorkItem *WI = new WorkItem(BB, State); +  WorkList.push_back(WI); + +  while (!WorkList.empty()) { +    WI = WorkList.pop_back_val(); +    const BasicBlock *BB = WI->Block; +    int State = WI->State; +    delete WI; +    if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) +      continue; // skip blocks already visited by lower State + +    const llvm::Instruction *I = BB->getFirstNonPHI(); +    const llvm::Instruction *TI = BB->getTerminator(); +    if (I->isEHPad()) +      State = EHInfo.EHPadStateMap[I]; +    EHInfo.BlockToStateMap[BB] = State; // Record state + +    if (isa<CatchPadInst>(I) && isa<CatchReturnInst>(TI)) { +      const Constant *FilterOrNull = cast<Constant>( +          cast<CatchPadInst>(I)->getArgOperand(0)->stripPointerCasts()); +      const Function *Filter = dyn_cast<Function>(FilterOrNull); +      if (!Filter || !Filter->getName().startswith("__IsLocalUnwind")) +        State = EHInfo.SEHUnwindMap[State].ToState; // Retrieve next State +    } else if ((isa<CleanupReturnInst>(TI) || isa<CatchReturnInst>(TI)) && +               State > 0) { +      // Retrieve the new State. +      State = EHInfo.SEHUnwindMap[State].ToState; // Retrieve next State +    } else if (isa<InvokeInst>(TI)) { +      auto *Call = cast<CallBase>(TI); +      const Function *Fn = Call->getCalledFunction(); +      if (Fn && Fn->isIntrinsic() && +          Fn->getIntrinsicID() == Intrinsic::seh_try_begin) +        // Retrieve the new State from seh_try_begin +        State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)]; +      else if (Fn && Fn->isIntrinsic() && +               Fn->getIntrinsicID() == Intrinsic::seh_try_end) +        // end of current state, retrieve new state from UnwindMap +        State = EHInfo.SEHUnwindMap[State].ToState; +    } +    // Continue pushing successors into the worklist +    for (auto *SuccBB : successors(BB)) { +      WI = new WorkItem(SuccBB, State); +      WorkList.push_back(WI); +    } +  } +} + // Given BB which ends in an unwind edge, return the EHPad that this BB belongs // to. If the unwind edge came from an invoke, return null. static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB, @@ -276,6 +397,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, for (const auto *CatchPad : Handlers) { FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow; +    FuncInfo.EHPadStateMap[CatchPad] = CatchLow; for (const User *U : CatchPad->users()) { const auto *UserI = cast<Instruction>(U); if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) { @@ -384,6 +506,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, // Everything in the __try block uses TryState as its parent state.
FuncInfo.EHPadStateMap[CatchSwitch] = TryState; +  FuncInfo.EHPadStateMap[CatchPad] = TryState; LLVM_DEBUG(dbgs() << "Assigning state #" << TryState << " to BB " << CatchPadBB->getName() << '\n'); for (const BasicBlock *PredBlock : predecessors(BB)) @@ -464,6 +587,12 @@ void llvm::calculateSEHStateNumbers(const Function *Fn, } calculateStateNumbersForInvokes(Fn, FuncInfo); + +  bool IsEHa = Fn->getParent()->getModuleFlag("eh-asynch"); +  if (IsEHa) { +    const BasicBlock *EntryBB = &(Fn->getEntryBlock()); +    calculateSEHStateForAsynchEH(EntryBB, -1, FuncInfo); +  } } void llvm::calculateWinCXXEHStateNumbers(const Function *Fn, @@ -482,6 +611,12 @@ } calculateStateNumbersForInvokes(Fn, FuncInfo); + +  bool IsEHa = Fn->getParent()->getModuleFlag("eh-asynch"); +  if (IsEHa) { +    const BasicBlock *EntryBB = &(Fn->getEntryBlock()); +    calculateCXXStateForAsynchEH(EntryBB, -1, FuncInfo); +  } } static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState, @@ -602,7 +737,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // so visit pads in descendant-most to ancestor-most order. for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) { const Instruction *Pad = -        Entry.Handler.get<const BasicBlock *>()->getFirstNonPHI(); +        cast<const BasicBlock *>(Entry.Handler)->getFirstNonPHI(); // For most pads, the TryParentState is the state associated with the // unwind dest of exceptional exits from it. const BasicBlock *UnwindDest; @@ -638,8 +773,8 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, int UserUnwindState = FuncInfo.ClrEHUnwindMap[UserState].TryParentState; if (UserUnwindState != -1) -        UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState] -                             .Handler.get<const BasicBlock *>(); +        UserUnwindDest = cast<const BasicBlock *>( +            FuncInfo.ClrEHUnwindMap[UserUnwindState].Handler); } // Not having an unwind dest for this user might indicate that it @@ -1253,4 +1388,9 @@ void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II, LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd); } +void WinEHFuncInfo::addIPToStateRange(int State, MCSymbol* InvokeBegin, +                                      MCSymbol* InvokeEnd) { +  LabelToStateMap[InvokeBegin] = std::make_pair(State, InvokeEnd); +} + WinEHFuncInfo::WinEHFuncInfo() = default; diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index 13f45ae048bb..d40725838c94 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -15,7 +15,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -29,6 +28,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -226,6 +226,7 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { case Triple::ArchType::thumb: case Triple::ArchType::aarch64: case Triple::ArchType::hexagon: +  case Triple::ArchType::loongarch64: case Triple::ArchType::mips: case Triple::ArchType::mipsel: case Triple::ArchType::mips64:
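// calculateCXXStateForAsynchEH and calculateSEHStateForAsynchEH above
// share one fixed-point walk: push (block, state) pairs, skip a block
// already reached with an equal or lower state, otherwise record the
// state and propagate it to every successor. A condensed, self-contained
// sketch of that walk with hypothetical types (the real routines thread
// EH-pad and seh_* intrinsic handling through the middle):
#include <map>
#include <utility>
#include <vector>

struct Block { std::vector<Block *> Succs; };

void propagate(Block *Entry, int InitState,
               std::map<Block *, int> &BlockToState) {
  std::vector<std::pair<Block *, int>> WorkList{{Entry, InitState}};
  while (!WorkList.empty()) {
    auto [BB, State] = WorkList.back();
    WorkList.pop_back();
    auto It = BlockToState.find(BB);
    if (It != BlockToState.end() && It->second <= State)
      continue; // lowest incoming state wins; this bounds the revisits
    BlockToState[BB] = State;
    for (Block *Succ : BB->Succs)
      WorkList.emplace_back(Succ, State);
  }
}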