author:    Dimitry Andric <dim@FreeBSD.org>    2018-02-24 21:27:30 +0000
committer: Dimitry Andric <dim@FreeBSD.org>    2018-02-24 21:27:30 +0000
commit:    0f8e52dfc671bf6e2c09c8a28062ec76237954ea (patch)
tree:      03012a05e4c16a3dd809c281777acd1d9fe4a127
parent:    3c315f3a8e8f326948fc789f146794ecd33cc540 (diff)
Vendor import of llvm release_60 branch r325932 (tag: vendor/llvm/llvm-release_60-r325932)
Notes:
svn path=/vendor/llvm/dist-release_60/; revision=329931
svn path=/vendor/llvm/llvm-release_60-r325932/; revision=329932; tag=vendor/llvm/llvm-release_60-r325932
35 files changed, 867 insertions, 227 deletions
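The most visible IR-level change in this import is the fast-math-flags rework described in docs/ReleaseNotes.rst below: the old all-or-nothing 'fast' flag is split into individual bits such as 'reassoc' and 'afn', serialized through the new FastMathMap layout in LLVMBitCodes.h. As a rough illustration of the distinction (a hypothetical example, not part of this commit), the flags look like this in LLVM IR:

    define float @fmf_example(float %x, float %y) {
      ; 'fast' still implies every individual flag (reassoc nnan ninf nsz arcp contract afn)
      %all = fadd fast float %x, %y
      ; 'reassoc' permits only floating-point reassociation
      %re = fadd reassoc float %x, %y
      ; 'afn' permits approximating a math-library function such as sqrt
      %rt = call afn float @llvm.sqrt.f32(float %y)
      %t = fadd float %all, %re
      %r = fadd float %t, %rt
      ret float %r
    }
    declare float @llvm.sqrt.f32(float)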
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 949ec85c270b..f2bbdc871add 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -5,12 +5,6 @@ LLVM 6.0.0 Release Notes
 .. contents::
     :local:
 
-.. warning::
-   These are in-progress notes for the upcoming LLVM 6 release.
-   Release notes for previous releases can be found on
-   `the Download Page <http://releases.llvm.org/download.html>`_.
-
-
 Introduction
 ============
@@ -26,11 +20,6 @@ have questions or comments, the `LLVM Developer's Mailing List
 <http://lists.llvm.org/mailman/listinfo/llvm-dev>`_ is a good place to send
 them.
 
-Note that if you are reading this file from a Subversion checkout or the main
-LLVM web page, this document applies to the *next* release, not the current
-one. To see the release notes for a specific release, please see the `releases
-page <http://llvm.org/releases/>`_.
-
 Non-comprehensive list of changes in this release
 =================================================
 .. NOTE
@@ -56,6 +45,9 @@ Non-comprehensive list of changes in this release
 
 * Significantly improved quality of CodeView debug info for Windows.
 
+* Preliminary support for Sanitizers and sibling features on X86(_64) NetBSD
+  (ASan, UBsan, TSan, MSan, SafeStack, libFuzzer).
+
 * Note..
 
 .. NOTE
@@ -71,6 +63,15 @@ Non-comprehensive list of changes in this release
 Changes to the LLVM IR
 ----------------------
 
+* The fast-math-flags (FMF) have been updated. Previously, the 'fast' flag
+  indicated that floating-point reassociation was allowed and all other flags
+  were set too. The 'fast' flag still exists, but there is a new flag called
+  'reassoc' to indicate specifically that reassociation is allowed. A new bit
+  called 'afn' was also added to selectively allow approximations for common
+  mathlib functions like square-root. The new flags provide more flexibility
+  to enable/disable specific floating-point optimizations. Making the
+  optimizer respond appropriately to these flags is an ongoing effort.
+
 Changes to the AArch64 Target
 -----------------------------
@@ -112,8 +113,44 @@ Changes to the Hexagon Target
 Changes to the MIPS Target
 --------------------------
 
-During this release ...
-
+Fixed numerous bugs:
+
+* fpowi on MIPS64 giving incorrect results when used with a negative integer.
+* Usage of the asm 'c' constraint with the wrong datatype causing an
+  assert/crash.
+* Fixed a conversion bug when using the DSP ASE.
+* Fixed an inconsistency where objects were not marked as using the microMIPS
+  ASE when the micromips function attribute or the ".set micromips" directive
+  was used.
+* Reordered the MIPSR6 specific hazard scheduler pass to after the delay slot
+  filler, fixing a class of rare edge case bugs where the delay slot filler
+  would violate ISA restrictions.
+* Fixed a crash when using a type of unknown size with gp relative addressing.
+* Corrected the j macro for microMIPS.
+* Corrected the encoding of movep for microMIPS32r6.
+* Fixed an issue with the usage of insert instructions having an invalid set of
+  operands.
+* Fixed an issue where TLS symbols were not marked as such.
+* Enabled the usage of register scavenging with MSA, due to its shorter offsets
+  for loads and stores.
+* Corrected the ELF headers when using the DSP ASE.
+
+New features:
+
+* The long branch pass now generates some R6 specific instructions when
+  targeting MIPSR6.
+* The delay slot filler now performs more branch conversions if delay slots
+  cannot be filled.
+* The MIPS MT ASE is now fully supported.
+* Added support for the ``lapc`` pseudo instruction.
+* Improved the selection of multiple instructions (``dext``, ``nmadd``,
+  ``nmsub``).
+* Further improved microMIPS codesize reduction.
+
+Deprecation notices:
+
+* microMIPS64R6 support has been deprecated since 5.0, and has now been
+  completely removed.
 
 Changes to the PowerPC Target
 -----------------------------
@@ -132,11 +169,43 @@ During this release the SystemZ target has:
 Changes to the X86 Target
 -------------------------
 
-During this release ...
+During this release the X86 target has:
 
-* Got support for enabling SjLj exception handling on platforms where it
+* Added support for enabling SjLj exception handling on platforms where it
   isn't the default.
 
+* Added intrinsics for Intel Extensions: VAES, GFNI, VPCLMULQDQ, AVX512VBMI2, AVX512BITALG, AVX512VNNI.
+
+* Added support for Intel Icelake CPU.
+
+* Fixed some X87 codegen bugs.
+
+* Added instruction scheduling information for Intel Sandy Bridge, Ivy Bridge, Haswell, Broadwell, and Skylake CPUs.
+
+* Improved scheduler model for AMD Jaguar CPUs.
+
+* Improved llvm-mc's disassembler for some EVEX encoded instructions.
+
+* Added support for i8 and i16 vector signed/unsigned min/max horizontal reductions.
+
+* Improved codegen for memory comparisons.
+
+* Improved codegen for i32 vector multiplies.
+
+* Improved codegen for scalar integer absolute values.
+
+* Improved codegen for vector integer rotations (XOP and AVX512).
+
+* Improved codegen of data being transferred between GPRs and K-registers.
+
+* Improved codegen for vector truncations.
+
+* Improved folding of address computations into gather/scatter instructions.
+
+* Gained initial support for recognizing variable shuffles from vector element
+  extracts and inserts.
+
+* Improved documentation for SSE/AVX intrinsics in *intrin.h header files.
+
 Changes to the AMDGPU Target
 -----------------------------
diff --git a/docs/index.rst b/docs/index.rst
index 47c2f0473931..de53b0df6906 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,11 +1,6 @@
 Overview
 ========
 
-.. warning::
-
-   If you are using a released version of LLVM, see `the download page
-   <http://llvm.org/releases/>`_ to find your documentation.
-
 The LLVM compiler infrastructure supports a wide range of projects, from
 industrial strength compilers to specialized JIT applications to small
 research projects.
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index 70194c043479..01419d7ae2bf 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -395,6 +395,20 @@ enum OverflowingBinaryOperatorOptionalFlags {
   OBO_NO_SIGNED_WRAP = 1
 };
 
+/// FastMath Flags
+/// This is a fixed layout derived from the bitcode emitted by LLVM 5.0
+/// intended to decouple the in-memory representation from the serialization.
+enum FastMathMap {
+  UnsafeAlgebra = (1 << 0),  // Legacy
+  NoNaNs = (1 << 1),
+  NoInfs = (1 << 2),
+  NoSignedZeros = (1 << 3),
+  AllowReciprocal = (1 << 4),
+  AllowContract = (1 << 5),
+  ApproxFunc = (1 << 6),
+  AllowReassoc = (1 << 7)
+};
+
 /// PossiblyExactOperatorOptionalFlags - Flags for serializing
 /// PossiblyExactOperator's SubclassOptionalData contents.
enum PossiblyExactOperatorOptionalFlags { PEO_EXACT = 0 }; diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h index 34d14abc9645..dac8d1a80050 100644 --- a/include/llvm/MC/MCAsmMacro.h +++ b/include/llvm/MC/MCAsmMacro.h @@ -33,6 +33,6 @@ public: MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) : Name(N), Body(B), Parameters(std::move(P)) {} }; -}; // namespace llvm +} // namespace llvm #endif diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index 750666136507..fb53647112f9 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" @@ -172,15 +173,25 @@ public: Value *Left, Value *Right); /// Returns true if Phi is a reduction of type Kind and adds it to the - /// RecurrenceDescriptor. + /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are + /// non-null, the minimal bit width needed to compute the reduction will be + /// computed. static bool AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, - RecurrenceDescriptor &RedDes); - - /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor is - /// returned in RedDes. + RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, + AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr); + + /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor + /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are + /// non-null, the minimal bit width needed to compute the reduction will be + /// computed. static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes); + RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, + AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr); /// Returns true if Phi is a first-order recurrence. A first-order recurrence /// is a non-reduction recurrence relation in which the value of the @@ -218,24 +229,6 @@ public: /// Returns true if the recurrence kind is an arithmetic kind. static bool isArithmeticRecurrenceKind(RecurrenceKind Kind); - /// Determines if Phi may have been type-promoted. If Phi has a single user - /// that ANDs the Phi with a type mask, return the user. RT is updated to - /// account for the narrower bit width represented by the mask, and the AND - /// instruction is added to CI. - static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, - SmallPtrSetImpl<Instruction *> &Visited, - SmallPtrSetImpl<Instruction *> &CI); - - /// Returns true if all the source operands of a recurrence are either - /// SExtInsts or ZExtInsts. This function is intended to be used with - /// lookThroughAnd to determine if the recurrence has been type-promoted. The - /// source operands are added to CI, and IsSigned is updated to indicate if - /// all source operands are SExtInsts. - static bool getSourceExtensionKind(Instruction *Start, Instruction *Exit, - Type *RT, bool &IsSigned, - SmallPtrSetImpl<Instruction *> &Visited, - SmallPtrSetImpl<Instruction *> &CI); - /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. 
Type *getRecurrenceType() { return RecurrenceType; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 10b5c74e378b..bfff7afb5b4e 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -205,6 +205,11 @@ static cl::opt<unsigned> cl::desc("Max coefficients in AddRec during evolving"), cl::init(16)); +static cl::opt<bool> VersionUnknown( + "scev-version-unknown", cl::Hidden, + cl::desc("Use predicated scalar evolution to version SCEVUnknowns"), + cl::init(false)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -11467,6 +11472,8 @@ private: // couldn't create an AddRec for it, or couldn't add the predicate), we just // return \p Expr. const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) { + if (!VersionUnknown) + return Expr; if (!isa<PHINode>(Expr->getValue())) return Expr; Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>> diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 95291a1caf9a..945ac4515368 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1046,19 +1046,21 @@ static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) { static FastMathFlags getDecodedFastMathFlags(unsigned Val) { FastMathFlags FMF; - if (0 != (Val & FastMathFlags::AllowReassoc)) + if (0 != (Val & bitc::UnsafeAlgebra)) + FMF.setFast(); + if (0 != (Val & bitc::AllowReassoc)) FMF.setAllowReassoc(); - if (0 != (Val & FastMathFlags::NoNaNs)) + if (0 != (Val & bitc::NoNaNs)) FMF.setNoNaNs(); - if (0 != (Val & FastMathFlags::NoInfs)) + if (0 != (Val & bitc::NoInfs)) FMF.setNoInfs(); - if (0 != (Val & FastMathFlags::NoSignedZeros)) + if (0 != (Val & bitc::NoSignedZeros)) FMF.setNoSignedZeros(); - if (0 != (Val & FastMathFlags::AllowReciprocal)) + if (0 != (Val & bitc::AllowReciprocal)) FMF.setAllowReciprocal(); - if (0 != (Val & FastMathFlags::AllowContract)) + if (0 != (Val & bitc::AllowContract)) FMF.setAllowContract(true); - if (0 != (Val & FastMathFlags::ApproxFunc)) + if (0 != (Val & bitc::ApproxFunc)) FMF.setApproxFunc(); return FMF; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index a7201ed97350..7bf37857eb97 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1330,19 +1330,19 @@ static uint64_t getOptimizationFlags(const Value *V) { Flags |= 1 << bitc::PEO_EXACT; } else if (const auto *FPMO = dyn_cast<FPMathOperator>(V)) { if (FPMO->hasAllowReassoc()) - Flags |= FastMathFlags::AllowReassoc; + Flags |= bitc::AllowReassoc; if (FPMO->hasNoNaNs()) - Flags |= FastMathFlags::NoNaNs; + Flags |= bitc::NoNaNs; if (FPMO->hasNoInfs()) - Flags |= FastMathFlags::NoInfs; + Flags |= bitc::NoInfs; if (FPMO->hasNoSignedZeros()) - Flags |= FastMathFlags::NoSignedZeros; + Flags |= bitc::NoSignedZeros; if (FPMO->hasAllowReciprocal()) - Flags |= FastMathFlags::AllowReciprocal; + Flags |= bitc::AllowReciprocal; if (FPMO->hasAllowContract()) - Flags |= FastMathFlags::AllowContract; + Flags |= bitc::AllowContract; if (FPMO->hasApproxFunc()) - Flags |= FastMathFlags::ApproxFunc; + Flags |= bitc::ApproxFunc; } return Flags; @@ -3183,7 +3183,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS 
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_BINOP_FLAGS_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 5723f8fcf5bb..d968688911eb 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -4,7 +4,8 @@ if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ ) endif() if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc. - set(system_libs ${system_libs} psapi shell32 ole32 uuid) + # advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc. + set(system_libs ${system_libs} psapi shell32 ole32 uuid advapi32) elseif( CMAKE_HOST_UNIX ) if( HAVE_LIBRT ) set(system_libs ${system_libs} rt) diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 2bb9e381073a..0bc5b395499e 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -133,16 +133,21 @@ AArch64InstructionSelector::AArch64InstructionSelector( // for each class in the bank. static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, - const RegisterBankInfo &RBI) { + const RegisterBankInfo &RBI, + bool GetAllRegSet = false) { if (RB.getID() == AArch64::GPRRegBankID) { if (Ty.getSizeInBits() <= 32) - return &AArch64::GPR32RegClass; + return GetAllRegSet ? &AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; if (Ty.getSizeInBits() == 64) - return &AArch64::GPR64RegClass; + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; return nullptr; } if (RB.getID() == AArch64::FPRRegBankID) { + if (Ty.getSizeInBits() <= 16) + return &AArch64::FPR16RegClass; if (Ty.getSizeInBits() == 32) return &AArch64::FPR32RegClass; if (Ty.getSizeInBits() == 64) @@ -310,19 +315,46 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } +static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, unsigned SrcReg) { + // Copies from gpr32 to fpr16 need to use a sub-register copy. 
+ unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY)) + .addDef(CopyReg) + .addUse(SrcReg); + unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) + .addDef(SubRegCopy) + .addUse(CopyReg, 0, AArch64::hsub); + + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(SubRegCopy); + return true; +} + static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) && + !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( + MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); + if (SrcRC == &AArch64::GPR32allRegClass) + return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + } assert(I.isCopy() && "Generic operators do not allow physical registers"); return true; } const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - unsigned SrcReg = I.getOperand(1).getReg(); + (void)DstSize; const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && @@ -340,26 +372,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, "Copy with different width?!"); assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) && "GPRs cannot get more than 64-bit width values"); - const TargetRegisterClass *RC = nullptr; - - if (RegBank.getID() == AArch64::FPRRegBankID) { - if (DstSize <= 16) - RC = &AArch64::FPR16RegClass; - else if (DstSize <= 32) - RC = &AArch64::FPR32RegClass; - else if (DstSize <= 64) - RC = &AArch64::FPR64RegClass; - else if (DstSize <= 128) - RC = &AArch64::FPR128RegClass; - else { - DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); - return false; + + const TargetRegisterClass *RC = getRegClassForTypeOnBank( + MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true); + if (!RC) { + DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); + return false; + } + + if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg); + const TargetRegisterClass *SrcRC = + RegClassOrBank.dyn_cast<const TargetRegisterClass *>(); + const RegisterBank *RB = nullptr; + if (!SrcRC) { + RB = RegClassOrBank.get<const RegisterBank *>(); + SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true); + } + // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG. 
+ if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) { + unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG)) + .addDef(PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::hsub); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + } else if (RC == &AArch64::FPR16RegClass && + SrcRC == &AArch64::GPR32allRegClass) { + selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); } - } else { - assert(RegBank.getID() == AArch64::GPRRegBankID && - "Bitcast for the flags?"); - RC = - DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass; } // No need to constrain SrcReg. It will get constrained when @@ -795,15 +839,23 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } case TargetOpcode::G_EXTRACT: { LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned SrcSize = SrcTy.getSizeInBits(); // Larger extracts are vectors, same-size extracts should be something else // by now (either split up or simplified to a COPY). if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) return false; - I.setDesc(TII.get(AArch64::UBFMXri)); + I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + Ty.getSizeInBits() - 1); + if (SrcSize < 64) { + assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && + "unexpected G_EXTRACT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), TII.get(AArch64::COPY)) @@ -818,17 +870,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_INSERT: { LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned DstSize = DstTy.getSizeInBits(); + (void)DstSize; // Larger inserts are vectors, same-size ones should be something else by // now (split up or turned into COPYs). if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) return false; - I.setDesc(TII.get(AArch64::BFMXri)); + I.setDesc(TII.get(DstSize == 64 ? 
AArch64::BFMXri : AArch64::BFMWri)); unsigned LSB = I.getOperand(3).getImm(); unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); - I.getOperand(3).setImm((64 - LSB) % 64); + I.getOperand(3).setImm((DstSize - LSB) % DstSize); MachineInstrBuilder(MF, I).addImm(Width - 1); + if (DstSize < 64) { + assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && + "unexpected G_INSERT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, I.getIterator(), I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 2c127d787260..654b96f792b1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3797,7 +3797,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } - BuildMI(*MBB, Inst, Inst.getDebugLoc(), + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) .add(*VAddr) // vaddr .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc @@ -3806,12 +3807,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) .addImm(0) // slc .addImm(0) // tfe - .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()); + .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) + .getInstr(); MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), VDst); addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); Inst.eraseFromParent(); + + // Legalize all operands other than the offset. Notably, convert the srsrc + // into SGPRs using v_readfirstlane if needed. + legalizeOperands(*NewInstr); continue; } } diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index fc638829378a..1d10ef9acfba 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -454,13 +454,16 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { return true; } + // FREM is always a call. + if (J->getOpcode() == Instruction::FRem) + return true; + if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: case Instruction::FDiv: - case Instruction::FRem: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index ba97982e3330..cc4c8823c3da 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -740,7 +740,13 @@ class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, def : SkylakeServerProc<"skylake-avx512">; def : SkylakeServerProc<"skx">; // Legacy alias. -def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [ +def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [ + FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, FeatureVBMI, FeatureIFMA, FeatureSHA diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 6f26f7f5cd19..c790de3505f3 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1643,11 +1643,25 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } } + auto canMergeSelectThroughBinop = [](BinaryOperator *BO) { + // The select might be preventing a division by 0. 
+    switch (BO->getOpcode()) {
+    default:
+      return true;
+    case Instruction::SRem:
+    case Instruction::URem:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+      return false;
+    }
+  };
+
   // Try to simplify a binop sandwiched between 2 selects with the same
   // condition.
   // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
   BinaryOperator *TrueBO;
-  if (match(TrueVal, m_OneUse(m_BinOp(TrueBO)))) {
+  if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
+      canMergeSelectThroughBinop(TrueBO)) {
     if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
       if (TrueBOSI->getCondition() == CondVal) {
         TrueBO->setOperand(0, TrueBOSI->getTrueValue());
@@ -1666,7 +1680,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
 
   // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
   BinaryOperator *FalseBO;
-  if (match(FalseVal, m_OneUse(m_BinOp(FalseBO)))) {
+  if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
+      canMergeSelectThroughBinop(FalseBO)) {
     if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
       if (FalseBOSI->getCondition() == CondVal) {
         FalseBO->setOperand(0, FalseBOSI->getFalseValue());
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 4ea935793b80..946474fef062 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -97,7 +97,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT,
                   const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
-                 const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
+                 const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE, bool FreeInLoop);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -855,10 +855,16 @@ static Instruction *sinkThroughTriviallyReplacablePHI(
   return New;
 }
 
-static bool canSplitPredecessors(PHINode *PN) {
+static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
   BasicBlock *BB = PN->getParent();
   if (!BB->canSplitPredecessors())
     return false;
+  // It's not impossible to split EHPad blocks, but if BlockColors already
+  // exist it requires updating BlockColors for all offspring blocks
+  // accordingly. By skipping such corner cases, we can make updating
+  // BlockColors after splitting predecessors fairly simple.
+ if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad()) + return false; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *BBPred = *PI; if (isa<IndirectBrInst>(BBPred->getTerminator())) @@ -868,7 +874,8 @@ static bool canSplitPredecessors(PHINode *PN) { } static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, - LoopInfo *LI, const Loop *CurLoop) { + LoopInfo *LI, const Loop *CurLoop, + LoopSafetyInfo *SafetyInfo) { #ifndef NDEBUG SmallVector<BasicBlock *, 32> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); @@ -910,13 +917,21 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // LE: // %p = phi [%p1, %LE.split], [%p2, %LE.split2] // + auto &BlockColors = SafetyInfo->BlockColors; SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB)); while (!PredBBs.empty()) { BasicBlock *PredBB = *PredBBs.begin(); assert(CurLoop->contains(PredBB) && "Expect all predecessors are in the loop"); - if (PN->getBasicBlockIndex(PredBB) >= 0) - SplitBlockPredecessors(ExitBB, PredBB, ".split.loop.exit", DT, LI, true); + if (PN->getBasicBlockIndex(PredBB) >= 0) { + BasicBlock *NewPred = SplitBlockPredecessors( + ExitBB, PredBB, ".split.loop.exit", DT, LI, true); + // Since we do not allow splitting EH-block with BlockColors in + // canSplitPredecessors(), we can simply assign predecessor's color to + // the new block. + if (!BlockColors.empty()) + BlockColors[NewPred] = BlockColors[PredBB]; + } PredBBs.remove(PredBB); } } @@ -927,7 +942,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, /// position, and may either delete it or move it to outside of the loop. /// static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, - const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + const Loop *CurLoop, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE, bool FreeInLoop) { DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); ORE->emit([&]() { @@ -975,12 +990,12 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, if (isTriviallyReplacablePHI(*PN, I)) continue; - if (!canSplitPredecessors(PN)) + if (!canSplitPredecessors(PN, SafetyInfo)) return Changed; // Split predecessors of the PHI so that we can make users trivially // replacable. - splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop); + splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo); // Should rebuild the iterators, as they may be invalidated by // splitPredecessorsOfLoopExit(). 
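To see why the LICM change above threads LoopSafetyInfo into canSplitPredecessors() and splitPredecessorsOfLoopExit(), consider a minimal sketch (illustrative IR, not from this commit) of the sinking case involved: the only use of a loop-invariant value is an exit-block phi with two loop predecessors, so sinking %inv first requires splitting %exit's predecessors into the ".split.loop.exit" blocks mentioned in the code, and with this patch each new block inherits its predecessor's funclet color:

    define i32 @licm_sink_example(i32 %n, i1 %c1, i1 %c2) {
    entry:
      br label %loop
    loop:
      ; %inv is loop-invariant and only used outside the loop, so it is sinkable
      %inv = mul i32 %n, 5
      br i1 %c1, label %exit, label %latch
    latch:
      br i1 %c2, label %exit, label %loop
    exit:
      ; two loop predecessors: %p is not trivially replaceable until they are split
      %p = phi i32 [ %inv, %loop ], [ %inv, %latch ]
      ret i32 %p
    }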
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index a5a305ef582b..0a357f4b5004 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -30,6 +31,7 @@
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 using namespace llvm;
@@ -77,10 +79,13 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) {
   return false;
 }
 
-Instruction *
-RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
-                                     SmallPtrSetImpl<Instruction *> &Visited,
-                                     SmallPtrSetImpl<Instruction *> &CI) {
+/// Determines if Phi may have been type-promoted. If Phi has a single user
+/// that ANDs the Phi with a type mask, return the user. RT is updated to
+/// account for the narrower bit width represented by the mask, and the AND
+/// instruction is added to CI.
+static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT,
+                                   SmallPtrSetImpl<Instruction *> &Visited,
+                                   SmallPtrSetImpl<Instruction *> &CI) {
   if (!Phi->hasOneUse())
     return Phi;
@@ -101,70 +106,92 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
   return Phi;
 }
 
-bool RecurrenceDescriptor::getSourceExtensionKind(
-    Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned,
-    SmallPtrSetImpl<Instruction *> &Visited,
-    SmallPtrSetImpl<Instruction *> &CI) {
+/// Compute the minimal bit width needed to represent a reduction whose exit
+/// instruction is given by Exit.
+static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
+                                                     DemandedBits *DB,
+                                                     AssumptionCache *AC,
+                                                     DominatorTree *DT) {
+  bool IsSigned = false;
+  const DataLayout &DL = Exit->getModule()->getDataLayout();
+  uint64_t MaxBitWidth = DL.getTypeSizeInBits(Exit->getType());
+
+  if (DB) {
+    // Use the demanded bits analysis to determine the bits that are live out
+    // of the exit instruction, rounding up to the nearest power of two. If the
+    // use of demanded bits results in a smaller bit width, we know the value
+    // must be positive (i.e., IsSigned = false), because if this were not the
+    // case, the sign bit would have been demanded.
+    auto Mask = DB->getDemandedBits(Exit);
+    MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();
+  }
+
+  if (MaxBitWidth == DL.getTypeSizeInBits(Exit->getType()) && AC && DT) {
+    // If demanded bits wasn't able to limit the bit width, we can try to use
+    // value tracking instead. This can be the case, for example, if the value
+    // may be negative.
+    auto NumSignBits = ComputeNumSignBits(Exit, DL, 0, AC, nullptr, DT);
+    auto NumTypeBits = DL.getTypeSizeInBits(Exit->getType());
+    MaxBitWidth = NumTypeBits - NumSignBits;
+    KnownBits Bits = computeKnownBits(Exit, DL);
+    if (!Bits.isNonNegative()) {
+      // If the value is not known to be non-negative, we set IsSigned to true,
+      // meaning that we will use sext instructions instead of zext
+      // instructions to restore the original type.
+      IsSigned = true;
+      if (!Bits.isNegative())
+        // If the value is not known to be negative, we don't know what the
+        // upper bit is, and therefore, we don't know what kind of extend we
+        // will need. In this case, just increase the bit width by one bit and
+        // use sext.
+        ++MaxBitWidth;
+    }
+  }
+  if (!isPowerOf2_64(MaxBitWidth))
+    MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+  return std::make_pair(Type::getIntNTy(Exit->getContext(), MaxBitWidth),
+                        IsSigned);
+}
+
+/// Collect cast instructions that can be ignored in the vectorizer's cost
+/// model, given a reduction exit value and the minimal type in which the
+/// reduction can be represented.
+static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
+                                 Type *RecurrenceType,
+                                 SmallPtrSetImpl<Instruction *> &Casts) {
   SmallVector<Instruction *, 8> Worklist;
-  bool FoundOneOperand = false;
-  unsigned DstSize = RT->getPrimitiveSizeInBits();
+  SmallPtrSet<Instruction *, 8> Visited;
   Worklist.push_back(Exit);
 
-  // Traverse the instructions in the reduction expression, beginning with the
-  // exit value.
   while (!Worklist.empty()) {
-    Instruction *I = Worklist.pop_back_val();
-    for (Use &U : I->operands()) {
-
-      // Terminate the traversal if the operand is not an instruction, or we
-      // reach the starting value.
-      Instruction *J = dyn_cast<Instruction>(U.get());
-      if (!J || J == Start)
-        continue;
-
-      // Otherwise, investigate the operation if it is also in the expression.
-      if (Visited.count(J)) {
-        Worklist.push_back(J);
+    Instruction *Val = Worklist.pop_back_val();
+    Visited.insert(Val);
+    if (auto *Cast = dyn_cast<CastInst>(Val))
+      if (Cast->getSrcTy() == RecurrenceType) {
+        // If the source type of a cast instruction is equal to the recurrence
+        // type, it will be eliminated, and should be ignored in the vectorizer
+        // cost model.
+        Casts.insert(Cast);
         continue;
       }
 
-      // If the operand is not in Visited, it is not a reduction operation, but
-      // it does feed into one. Make sure it is either a single-use sign- or
-      // zero-extend instruction.
-      CastInst *Cast = dyn_cast<CastInst>(J);
-      bool IsSExtInst = isa<SExtInst>(J);
-      if (!Cast || !Cast->hasOneUse() || !(isa<ZExtInst>(J) || IsSExtInst))
-        return false;
-
-      // Ensure the source type of the extend is no larger than the reduction
-      // type. It is not necessary for the types to be identical.
-      unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
-      if (SrcSize > DstSize)
-        return false;
-
-      // Furthermore, ensure that all such extends are of the same kind.
-      if (FoundOneOperand) {
-        if (IsSigned != IsSExtInst)
-          return false;
-      } else {
-        FoundOneOperand = true;
-        IsSigned = IsSExtInst;
-      }
-
-      // Lastly, if the source type of the extend matches the reduction type,
-      // add the extend to CI so that we can avoid accounting for it in the
-      // cost model.
-      if (SrcSize == DstSize)
-        CI.insert(Cast);
-    }
+    // Add all operands to the work list if they are loop-varying values that
+    // we haven't yet visited.
+    for (Value *O : cast<User>(Val)->operands())
+      if (auto *I = dyn_cast<Instruction>(O))
+        if (TheLoop->contains(I) && !Visited.count(I))
+          Worklist.push_back(I);
   }
-  return true;
 }
 
 bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
                                            Loop *TheLoop, bool HasFunNoNaNAttr,
-                                           RecurrenceDescriptor &RedDes) {
+                                           RecurrenceDescriptor &RedDes,
+                                           DemandedBits *DB,
+                                           AssumptionCache *AC,
+                                           DominatorTree *DT) {
   if (Phi->getNumIncomingValues() != 2)
     return false;
@@ -353,14 +380,49 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
-  // If we think Phi may have been type-promoted, we also need to ensure that
-  // all source operands of the reduction are either SExtInsts or ZEstInsts. If
-  // so, we will be able to evaluate the reduction in the narrower bit width.
-  if (Start != Phi)
-    if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType,
-                                IsSigned, VisitedInsts, CastInsts))
+  if (Start != Phi) {
+    // If the starting value is not the same as the phi node, we speculatively
+    // looked through an 'and' instruction when evaluating a potential
+    // arithmetic reduction to determine if it may have been type-promoted.
+    //
+    // We now compute the minimal bit width that is required to represent the
+    // reduction. If this is the same width that was indicated by the 'and', we
+    // can represent the reduction in the smaller type. The 'and' instruction
+    // will be eliminated since it will essentially be a cast instruction that
+    // can be ignored in the cost model. If we compute a different type than we
+    // did when evaluating the 'and', the 'and' will not be eliminated, and we
+    // will end up with different kinds of operations in the recurrence
+    // expression (e.g., RK_IntegerAND, RK_IntegerADD). We give up if this is
+    // the case.
+    //
+    // The vectorizer relies on InstCombine to perform the actual
+    // type-shrinking. It does this by inserting instructions to truncate the
+    // exit value of the reduction to the width indicated by RecurrenceType and
+    // then extend this value back to the original width. If IsSigned is false,
+    // a 'zext' instruction will be generated; otherwise, a 'sext' will be
+    // used.
+    //
+    // TODO: We should not rely on InstCombine to rewrite the reduction in the
+    //       smaller type. We should just generate a correctly typed expression
+    //       to begin with.
+    Type *ComputedType;
+    std::tie(ComputedType, IsSigned) =
+        computeRecurrenceType(ExitInstruction, DB, AC, DT);
+    if (ComputedType != RecurrenceType)
       return false;
 
+    // The recurrence expression will be represented in a narrower type. If
+    // there are any cast instructions that will be unnecessary, collect them
+    // in CastInsts. Note that the 'and' instruction was already included in
+    // this list.
+    //
+    // TODO: A better way to represent this may be to tag in some way all the
+    //       instructions that are a part of the reduction. The vectorizer cost
+    //       model could then apply the recurrence type to these instructions,
+    //       without needing a white list of instructions to ignore.
+    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
+  }
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
@@ -480,47 +542,57 @@ bool RecurrenceDescriptor::hasMultipleUsesOf( return false; } bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes) { + RecurrenceDescriptor &RedDes, + DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); bool HasFunNoNaNAttr = F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found a MUL reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, - RedDes)) { + if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes, + DB, AC, DT)) { DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an float MINMAX reduction PHI." 
                 << *Phi << "\n");
     return true;
   }
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 64f206ea92eb..5bcf0c0a7ba6 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1542,9 +1542,10 @@ public:
       const TargetTransformInfo *TTI,
       std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
       OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
-      LoopVectorizeHints *H)
+      LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
       : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
-        ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
+        ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H),
+        DB(DB), AC(AC) {}
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
@@ -1833,6 +1834,14 @@ private:
   /// Used to emit an analysis of any legality issues.
   LoopVectorizeHints *Hints;
 
+  /// The demanded bits analysis is used to compute the minimum type size in
+  /// which a reduction can be computed.
+  DemandedBits *DB;
+
+  /// The assumption cache analysis is used to compute the minimum type size in
+  /// which a reduction can be computed.
+  AssumptionCache *AC;
+
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<const Instruction *, 8> MaskedOp;
@@ -5300,7 +5309,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       }
 
       RecurrenceDescriptor RedDes;
-      if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
+      if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+                                               DT)) {
         if (RedDes.hasUnsafeAlgebra())
           Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
         AllowedExit.insert(RedDes.getLoopExitInstr());
@@ -8514,7 +8524,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   // Check if it is legal to vectorize the loop.
   LoopVectorizationRequirements Requirements(*ORE);
   LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
-                                &Requirements, &Hints);
+                                &Requirements, &Hints, DB, AC);
   if (!LVL.canVectorize()) {
     DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
     emitMissedWarning(F, L, Hints, ORE);
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index 6c47a853e24a..e9313dfba870 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -612,9 +612,7 @@ define void @fastmathflags(float %op1, float %op2) {
   %f.arcp = fadd arcp float %op1, %op2
   ; CHECK: %f.arcp = fadd arcp float %op1, %op2
   %f.fast = fadd fast float %op1, %op2
-  ; 'fast' used to be its own bit, but this changed in Oct 2017.
-  ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'.
-  ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2
+  ; CHECK: %f.fast = fadd fast float %op1, %op2
   ret void
 }
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index 55844e5c4986..82fc99055357 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -656,9 +656,7 @@ define void @fastmathflags(float %op1, float %op2) {
   %f.arcp = fadd arcp float %op1, %op2
   ; CHECK: %f.arcp = fadd arcp float %op1, %op2
   %f.fast = fadd fast float %op1, %op2
-  ; 'fast' used to be its own bit, but this changed in Oct 2017.
- ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll index a7fa20f2bc08..2e70a380d10e 100644 --- a/test/Bitcode/compatibility-3.8.ll +++ b/test/Bitcode/compatibility-3.8.ll @@ -687,9 +687,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -702,9 +700,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'aml' bits set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll index c456fefe9d40..7c84daa7d3c4 100644 --- a/test/Bitcode/compatibility-3.9.ll +++ b/test/Bitcode/compatibility-3.9.ll @@ -758,9 +758,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -773,9 +771,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll index 68446a7d5b0a..9e34d48c95f7 100644 --- a/test/Bitcode/compatibility-4.0.ll +++ b/test/Bitcode/compatibility-4.0.ll @@ -757,10 +757,8 @@ define void @fastmathflags(float %op1, float %op2) { ; CHECK: %f.nsz = fadd nsz float %op1, %op2 %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. 
%f.fast = fadd fast float %op1, %op2 - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -773,9 +771,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-5.0.ll b/test/Bitcode/compatibility-5.0.ll index cdadc032d87b..a4b3fca82b7b 100644 --- a/test/Bitcode/compatibility-5.0.ll +++ b/test/Bitcode/compatibility-5.0.ll @@ -765,9 +765,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.contract = fadd contract float %op1, %op2 ; CHECK: %f.contract = fadd contract float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp contract float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -780,9 +778,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp contract float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir b/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir new file mode 100644 index 000000000000..fd1998037d38 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-unknown-unknown -o - -global-isel -verify-machineinstrs -run-pass=instruction-select %s | FileCheck %s + +# PR36345 +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-arm-none-eabi" + + ; Function Attrs: noinline nounwind optnone + define void @fp16_to_gpr([2 x half], [2 x half]* %addr) { + ret void + } + + define void @gpr_to_fp16() { + ret void + } + + define void @gpr_to_fp16_physreg() { + ret void + } +... 
+--- +name: fp16_to_gpr +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } + - { id: 8, class: gpr } + - { id: 9, class: gpr } + - { id: 10, class: gpr } + - { id: 11, class: gpr } + - { id: 12, class: gpr } +body: | + bb.1 (%ir-block.1): + liveins: %h0, %h1, %x0 + + ; CHECK-LABEL: name: fp16_to_gpr + ; CHECK: liveins: %h0, %h1, %x0 + ; CHECK: [[COPY:%[0-9]+]]:fpr16 = COPY %h0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY %h1 + ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[SUBREG_TO_REG]] + ; CHECK: [[BFMWri:%[0-9]+]]:gpr32 = BFMWri [[DEF]], [[COPY2]], 0, 15 + ; CHECK: [[SUBREG_TO_REG1:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK: [[COPY3:%[0-9]+]]:gpr32 = COPY [[SUBREG_TO_REG1]] + ; CHECK: [[BFMWri1:%[0-9]+]]:gpr32 = BFMWri [[BFMWri]], [[COPY3]], 16, 15 + ; CHECK: [[COPY4:%[0-9]+]]:gpr32 = COPY [[BFMWri1]] + ; CHECK: [[COPY5:%[0-9]+]]:gpr64sp = COPY %x0 + ; CHECK: STRWui [[COPY4]], [[COPY5]], 0 :: (store 4 into %ir.addr, align 2) + ; CHECK: RET_ReallyLR + %1:fpr(s16) = COPY %h0 + %2:fpr(s16) = COPY %h1 + %3:gpr(s32) = G_IMPLICIT_DEF + %11:gpr(s16) = COPY %1(s16) + %4:gpr(s32) = G_INSERT %3, %11(s16), 0 + %12:gpr(s16) = COPY %2(s16) + %5:gpr(s32) = G_INSERT %4, %12(s16), 16 + %0:gpr(s32) = COPY %5(s32) + %6:gpr(p0) = COPY %x0 + G_STORE %0(s32), %6(p0) :: (store 4 into %ir.addr, align 2) + RET_ReallyLR + +... + +--- +name: gpr_to_fp16 +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: %w0 + + ; CHECK-LABEL: name: gpr_to_fp16 + ; CHECK: liveins: %w0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:fpr16 = COPY [[COPY2]].hsub + ; CHECK: [[COPY4:%[0-9]+]]:fpr16 = COPY [[COPY3]] + ; CHECK: %h0 = COPY [[COPY4]] + ; CHECK: RET_ReallyLR implicit %h0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s16) = G_TRUNC %0(s32) + %2:fpr(s16) = COPY %1(s16) + %h0 = COPY %2(s16) + RET_ReallyLR implicit %h0 + +... +--- +name: gpr_to_fp16_physreg +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +body: | + bb.1 (%ir-block.0): + liveins: %w0 + + ; CHECK-LABEL: name: gpr_to_fp16_physreg + ; CHECK: liveins: %w0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:fpr16 = COPY [[COPY2]].hsub + ; CHECK: %h0 = COPY [[COPY3]] + ; CHECK: RET_ReallyLR implicit %h0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s16) = G_TRUNC %0(s32) + %h0 = COPY %1(s16) + RET_ReallyLR implicit %h0 + +... 
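The new MIR test above exercises copies between the GPR and FPR16 register banks. IR along the following lines (an illustrative sketch, not part of the test suite) gives rise to such copies, because half values are returned in %h0 while the integer computation happens in 32-bit GPRs:

    define half @gpr_to_fp16_ir(i32 %w) {
      %t = trunc i32 %w to i16
      ; same-width bitcast: the i16 value computed in a GPR must end up in an FPR16
      %h = bitcast i16 %t to half
      ret half %h
    }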
diff --git a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir index 33b483511065..1980048eb456 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir @@ -1,8 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s --- -# CHECK-LABEL: name: insert_gprs -name: insert_gprs +name: insert_gprx legalized: true regBankSelected: true @@ -10,26 +10,56 @@ body: | bb.0: liveins: %x0 + ; CHECK-LABEL: name: insert_gprx + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[DEF:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 + ; CHECK: [[BFMXri:%[0-9]+]]:gpr64 = BFMXri [[DEF]], [[SUBREG_TO_REG]], 0, 31 + ; CHECK: [[SUBREG_TO_REG1:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 + ; CHECK: [[BFMXri1:%[0-9]+]]:gpr64 = BFMXri [[DEF]], [[SUBREG_TO_REG1]], 51, 31 + ; CHECK: %x0 = COPY [[BFMXri]] + ; CHECK: %x1 = COPY [[BFMXri1]] %0:gpr(s32) = COPY %w0 %1:gpr(s64) = G_IMPLICIT_DEF - ; CHECK: body: - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 - ; CHECK: %2:gpr64 = BFMXri %1, [[TMP]], 0, 31 %2:gpr(s64) = G_INSERT %1, %0, 0 - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 - ; CHECK: %3:gpr64 = BFMXri %1, [[TMP]], 51, 31 %3:gpr(s64) = G_INSERT %1, %0, 13 %x0 = COPY %2 %x1 = COPY %3 ... +--- +name: insert_gprw +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: %w0, %w1 + ; CHECK-LABEL: name: insert_gprw + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK: [[BFMWri:%[0-9]+]]:gpr32 = BFMWri [[DEF]], [[COPY1]], 0, 15 + ; CHECK: [[BFMWri1:%[0-9]+]]:gpr32 = BFMWri [[BFMWri]], [[COPY2]], 16, 15 + ; CHECK: [[COPY3:%[0-9]+]]:gpr32all = COPY [[BFMWri1]] + ; CHECK: %w0 = COPY [[COPY3]] + %1:gpr(s32) = COPY %w0 + %2:gpr(s32) = COPY %w1 + %3:gpr(s16) = G_TRUNC %1(s32) + %4:gpr(s16) = G_TRUNC %1(s32) + %5:gpr(s32) = G_IMPLICIT_DEF + %6:gpr(s32) = G_INSERT %5, %3(s16), 0 + %7:gpr(s32) = G_INSERT %6, %4(s16), 16 + %0:gpr(s32) = COPY %7(s32) + %w0 = COPY %0 +... --- -# CHECK-LABEL: name: extract_gprs name: extract_gprs legalized: true regBankSelected: true @@ -38,17 +68,49 @@ body: | bb.0: liveins: %x0 + ; CHECK-LABEL: name: extract_gprs + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY %x0 + ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 0, 31 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[UBFMXri]].sub_32 + ; CHECK: [[UBFMXri1:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 13, 44 + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[UBFMXri1]].sub_32 + ; CHECK: %w0 = COPY [[COPY1]] + ; CHECK: %w1 = COPY [[COPY2]] %0:gpr(s64) = COPY %x0 - ; CHECK: body: - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = UBFMXri %0, 0, 31 - ; CHECK: %1:gpr32 = COPY [[TMP]].sub_32 %1:gpr(s32) = G_EXTRACT %0, 0 - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = UBFMXri %0, 13, 44 - ; CHECK: %2:gpr32 = COPY [[TMP]].sub_32 %2:gpr(s32) = G_EXTRACT %0, 13 %w0 = COPY %1 %w1 = COPY %2 ... 
+
+---
+name: extract_gprw
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: %w0
+
+    ; CHECK-LABEL: name: extract_gprw
+    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0
+    ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY]], 0, 15
+    ; CHECK: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri [[COPY]], 15, 30
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY [[UBFMWri]]
+    ; CHECK: [[COPY2:%[0-9]+]]:fpr16 = COPY [[COPY1]].hsub
+    ; CHECK: %h0 = COPY [[COPY2]]
+    ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]]
+    ; CHECK: [[COPY4:%[0-9]+]]:fpr16 = COPY [[COPY3]].hsub
+    ; CHECK: %h1 = COPY [[COPY4]]
+    %0:gpr(s32) = COPY %w0
+
+    %1:gpr(s16) = G_EXTRACT %0, 0
+
+    %2:gpr(s16) = G_EXTRACT %0, 15
+
+    %h0 = COPY %1
+    %h1 = COPY %2
+...
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 420c7b80b8d3..adf22323ae65 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -261,8 +261,42 @@ main_body:
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted
+; GCN: v_readfirstlane
+define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), i32) #0 {
+main_body:
+  %descptr = bitcast [0 x i8] addrspace(2)* %0 to <4 x i32> addrspace(2)*, !amdgpu.uniform !0
+  br label %.outer_loop_header
+
+ret_block:                                        ; preds = %.inner_loop_header, %.outer_loop_body
+  ret void
+
+.outer_loop_header:
+  br label %.inner_loop_header
+
+.inner_loop_header:                               ; preds = %.inner_loop_body, %.outer_loop_header
+  %loopctr.1 = phi i32 [ 0, %.outer_loop_header ], [ %loopctr.2, %.inner_loop_body ]
+  %loopctr.2 = add i32 %loopctr.1, 1
+  %inner_br1 = icmp slt i32 %loopctr.2, 10
+  br i1 %inner_br1, label %.inner_loop_body, label %ret_block
+
+.inner_loop_body:
+  %descriptor = load <4 x i32>, <4 x i32> addrspace(2)* %descptr, align 16, !invariant.load !0
+  %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0)
+  %inner_br2 = icmp uge i32 %1, 10
+  br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
+
+.outer_loop_body:
+  %offset = shl i32 %loopctr.2, 6
+  %load2result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 %offset)
+  %outer_br = fcmp ueq float %load2result, 0x0
+  br i1 %outer_br, label %.outer_loop_header, label %ret_block
+}
+
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+
+!0 = !{}
diff --git a/test/CodeGen/PowerPC/pr36292.ll b/test/CodeGen/PowerPC/pr36292.ll
new file mode 100644
index 000000000000..a171918b9e07
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr36292.ll
@@ -0,0 +1,46 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \
+; RUN:   FileCheck %s --implicit-check-not=mtctr --implicit-check-not=bdnz
+$test = comdat any
+
+; No CTR loop due to frem (since it is always a call).
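+;
+; A minimal sketch of the pattern (illustrative; assuming float operands, so
+; the libcall is fmodf, while double operands would give fmod):
+;
+;   %r = frem float %x, %y   ; no PPC instruction for this; it is lowered to
+;                            ; a libcall: bl fmodf
+;
+; Because the loop body then contains a call, the CTR register is clobbered,
+; so the loop cannot be converted into a mtctr/bdnz hardware loop.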
+define void @test() #0 comdat {
+; CHECK-LABEL: test:
+; CHECK: ld 29, 0(3)
+; CHECK: ld 30, 40(1)
+; CHECK: xxlxor 31, 31, 31
+; CHECK: cmpld 30, 29
+; CHECK-NEXT: bge- 0, .LBB0_2
+; CHECK-NEXT: .p2align 5
+; CHECK-NEXT: .LBB0_1: # %bounds.ok
+; CHECK: fmr 1, 31
+; CHECK-NEXT: lfsx 2, 0, 3
+; CHECK-NEXT: bl fmodf
+; CHECK-NEXT: nop
+; CHECK-NEXT: addi 30, 30, 1
+; CHECK-NEXT: stfsx 1, 0, 3
+; CHECK-NEXT: cmpld 30, 29
+; CHECK-NEXT: blt+ 0, .LBB0_1
+; CHECK-NEXT: .LBB0_2: # %bounds.fail
+; CHECK-NEXT: std 30, 40(1)
+  %pos = alloca i64, align 8
+  br label %forcond
+
+forcond:                                          ; preds = %bounds.ok, %0
+  %1 = load i64, i64* %pos
+  %.len1 = load i64, i64* undef
+  %bounds.cmp = icmp ult i64 %1, %.len1
+  br i1 %bounds.cmp, label %bounds.ok, label %bounds.fail
+
+bounds.ok:                                        ; preds = %forcond
+  %2 = load float, float* undef
+  %3 = frem float 0.000000e+00, %2
+  store float %3, float* undef
+  %4 = load i64, i64* %pos
+  %5 = add i64 %4, 1
+  store i64 %5, i64* %pos
+  br label %forcond
+
+bounds.fail:                                      ; preds = %forcond
+  unreachable
+}
+
diff --git a/test/CodeGen/X86/clwb.ll b/test/CodeGen/X86/clwb.ll
index 0bbb14917f7f..e5906c6ce68c 100644
--- a/test/CodeGen/X86/clwb.ll
+++ b/test/CodeGen/X86/clwb.ll
@@ -1,5 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: clwb is available on Skylake Server, not available on the newer
+; NOTE: Cannon Lake arch, but available again on the even newer Ice Lake arch.
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clwb | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: not llc < %s -mtriple=i686-apple-darwin -mcpu=cannonlake 2>&1 | FileCheck %s --check-prefix=CNL
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=icelake | FileCheck %s
+
+; CNL: LLVM ERROR: Cannot select: intrinsic %llvm.x86.clwb
 
 define void @clwb(i8* %p) nounwind {
 ; CHECK-LABEL: clwb:
diff --git a/test/Transforms/InstCombine/pr36362.ll b/test/Transforms/InstCombine/pr36362.ll
new file mode 100644
index 000000000000..412691543a15
--- /dev/null
+++ b/test/Transforms/InstCombine/pr36362.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S %s | FileCheck %s
+
+; We shouldn't remove the select before the srem (it guarantees a non-zero divisor when %a is false).
+define i32 @foo(i1 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[A:%.*]], i32 [[B:%.*]], i32 -1
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[C:%.*]], [[SEL1]]
+; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[A]], i32 [[REM]], i32 0
+; CHECK-NEXT: ret i32 [[SEL2]]
+;
+  %sel1 = select i1 %a, i32 %b, i32 -1
+  %rem = srem i32 %c, %sel1
+  %sel2 = select i1 %a, i32 %rem, i32 0
+  ret i32 %sel2
+}
+
diff --git a/test/Transforms/LICM/sinking.ll b/test/Transforms/LICM/sinking.ll
index b28eea0bc2aa..6d747877c58e 100644
--- a/test/Transforms/LICM/sinking.ll
+++ b/test/Transforms/LICM/sinking.ll
@@ -670,6 +670,67 @@ try.cont:
   ret void
 }
 
+; The sinkable call should be sunk into an exit block split. After splitting
+; the exit block, BlockColors for the new blocks should be added properly so
+; that we can access a valid ColorVector.
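+; (Illustration based on the checks below: %sinkableCall is loop-invariant
+; and read-only, so LICM sinks it out of %Loop; since its only use is the phi
+; in %Out along the edge from %Loop, that exit edge is split, creating
+; %Out.split.loop.exit, and the freshly created block needs a BlockColors
+; entry before the sunk call can be placed in it.)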
+;
+; CHECK-LABEL: @test21_pr36184
+; CHECK-LABEL: Loop
+; CHECK-NOT: %sinkableCall
+; CHECK-LABEL: Out.split.loop.exit
+; CHECK: %sinkableCall
+define i32 @test21_pr36184(i8* %P) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  br label %Loop
+
+Loop:
+  %sinkableCall = call i32 @strlen( i8* %P ) readonly
+  br i1 undef, label %ContLoop, label %Out
+
+ContLoop:
+  br i1 undef, label %Loop, label %Out
+
+Out:
+  %idx = phi i32 [ %sinkableCall, %Loop ], [ 0, %ContLoop ]
+  ret i32 %idx
+}
+
+; We do not support splitting a landingpad block if BlockColors is not empty.
+; CHECK-LABEL: @test22
+; CHECK-LABEL: while.body2
+; CHECK-LABEL: %mul
+; CHECK-NOT: lpadBB.split{{.*}}
+define void @test22(i1 %b, i32 %v1, i32 %v2) personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+  br label %while.cond
+while.cond:
+  br i1 %b, label %try.cont, label %while.body
+
+while.body:
+  invoke void @may_throw()
+          to label %while.body2 unwind label %lpadBB
+
+while.body2:
+  %v = call i32 @getv()
+  %mul = mul i32 %v, %v2
+  invoke void @may_throw2()
+          to label %while.cond unwind label %lpadBB
+lpadBB:
+  %.lcssa1 = phi i32 [ 0, %while.body ], [ %mul, %while.body2 ]
+  landingpad { i8*, i32 }
+          catch i8* null
+  br label %lpadBBSucc1
+
+lpadBBSucc1:
+  ret void
+
+try.cont:
+  ret void
+}
+
 declare void @may_throw()
 declare void @may_throw2()
 declare i32 @__CxxFrameHandler3(...)
diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index d9c9632be047..08d163fe6299 100644
--- a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -scev-version-unknown < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/LoopVectorize/pr35773.ll b/test/Transforms/LoopVectorize/pr35773.ll
index 362ece70b898..308bb393cc4e 100644
--- a/test/Transforms/LoopVectorize/pr35773.ll
+++ b/test/Transforms/LoopVectorize/pr35773.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -scev-version-unknown < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @a = common local_unnamed_addr global i32 0, align 4
 @b = common local_unnamed_addr global i8 0, align 1
diff --git a/test/Transforms/LoopVectorize/reduction-small-size.ll b/test/Transforms/LoopVectorize/reduction-small-size.ll
index b44beb8ce68f..879f1c3c5ad4 100644
--- a/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -14,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
 ;
-define void @PR34687(i1 %c, i32 %x, i32 %n) {
+define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
 entry:
   br label %for.body
 
@@ -36,5 +36,38 @@ if.end:
 
 for.end:
   %tmp2 = phi i32 [ %r.next, %if.end ]
-  ret void
+  %tmp3 = trunc i32 %tmp2 to i8
+  ret i8 %tmp3
+}
+
+; CHECK-LABEL: @PR35734(
+; CHECK: vector.ph:
+; CHECK: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 %y, i32 0
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
+; CHECK: [[TMP5:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i1>
+; CHECK-NEXT: [[TMP9]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR35734(i32 %x, i32 %y) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ %x, %entry ], [ %i.next, %for.body ]
+  %r = phi i32 [ %y, %entry ], [ %r.next, %for.body ]
+  %tmp0 = and i32 %r, 1
+  %r.next = add i32 %tmp0, -1
+  %i.next = add nsw i32 %i, 1
+  %cond = icmp sgt i32 %i, 77
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp1 = phi i32 [ %r.next, %for.body ]
+  ret i32 %tmp1
 }
diff --git a/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
index 4ddc6a652179..f7877245b0d4 100644
--- a/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
+++ b/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s -check-prefix=VF8
-; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix=VF1
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -scev-version-unknown < %s | FileCheck %s -check-prefix=VF8
+; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -scev-version-unknown < %s | FileCheck %s -check-prefix=VF1
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/tools/llvm-config/system-libs.windows.test b/test/tools/llvm-config/system-libs.windows.test
index 2c6e03afa2d9..09970cf68994 100644
--- a/test/tools/llvm-config/system-libs.windows.test
+++ b/test/tools/llvm-config/system-libs.windows.test
@@ -2,6 +2,6 @@ RUN: llvm-config --link-static --system-libs 2>&1 | FileCheck %s
 REQUIRES: static-libs
 REQUIRES: system-windows
 CHECK-NOT: -l
-CHECK: psapi.lib shell32.lib ole32.lib uuid.lib
+CHECK: psapi.lib shell32.lib ole32.lib uuid.lib advapi32.lib
 CHECK-NOT: error
 CHECK-NOT: warning