author | Dimitry Andric <dim@FreeBSD.org> | 2023-03-31 20:55:52 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-03-31 20:55:52 +0000
commit | 5bcd187b307a70f29854eb0c5ccdf30ff3770fe1 (patch)
tree | 005c0e9231b62275dc3a5d207b2550431858ce0a
parent | 11ee15ea4ee1ea5555f8d7ba1ec5ffe956df2a8c (diff)
Vendor import of llvm-project branch release/16.x llvmorg-16.0.0-45-g42d1b276f779. (vendor/llvm-project/llvmorg-16.0.0-45-g42d1b276f779)
40 files changed, 2428 insertions, 316 deletions
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 36d4bc2a700d..e99beb3a7636 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1052,7 +1052,7 @@ def err_lambda_template_parameter_list_empty : Error< // C++2b static lambdas def err_static_lambda: ExtWarn< "static lambdas are a C++2b extension">, InGroup<CXX2b>; -def warn_cxx20_compat_static_lambda: ExtWarn< +def warn_cxx20_compat_static_lambda : Warning< "static lambdas are incompatible with C++ standards before C++2b">, InGroup<CXXPre2bCompat>, DefaultIgnore; def err_static_mutable_lambda : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index de56e3e1566b..bfe582d8252f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9138,8 +9138,9 @@ def err_operator_overload_static : Error< def err_operator_overload_default_arg : Error< "parameter of overloaded %0 cannot have a default argument">; -def ext_subscript_overload : ExtWarn< - "overloaded %0 with %select{no|a defaulted|more than one}1 parameter is a C++2b extension">, InGroup<CXXPre2bCompat>, DefaultIgnore; +def ext_subscript_overload : Warning< + "overloaded %0 with %select{no|a defaulted|more than one}1 parameter is a " + "C++2b extension">, InGroup<CXXPre2bCompat>, DefaultIgnore; def error_subscript_overload : Error< "overloaded %0 cannot have %select{no|a defaulted|more than one}1 parameter before C++2b">; diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index f11751a76073..b85d5dc2d347 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -254,6 +254,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0), HW_FP(0) { + bool IsFreeBSD = Triple.isOSFreeBSD(); bool IsOpenBSD = Triple.isOSOpenBSD(); bool IsNetBSD = Triple.isOSNetBSD(); @@ -321,7 +322,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple, default: if (IsNetBSD) setABI("apcs-gnu"); - else if (IsOpenBSD) + else if (IsFreeBSD || IsOpenBSD) setABI("aapcs-linux"); else setABI("aapcs"); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 9a5af638c399..2230295ccd74 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -85,16 +85,7 @@ void freebsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, else CmdArgs.push_back("-mfpu=softvfp"); - switch (getToolChain().getTriple().getEnvironment()) { - case llvm::Triple::GNUEABIHF: - case llvm::Triple::GNUEABI: - case llvm::Triple::EABI: - CmdArgs.push_back("-meabi=5"); - break; - - default: - CmdArgs.push_back("-matpcs"); - } + CmdArgs.push_back("-meabi=5"); break; } case llvm::Triple::sparc: @@ -467,21 +458,6 @@ Tool *FreeBSD::buildAssembler() const { Tool *FreeBSD::buildLinker() const { return new tools::freebsd::Linker(*this); } -llvm::ExceptionHandling FreeBSD::GetExceptionModel(const ArgList &Args) const { - // FreeBSD uses SjLj exceptions on ARM oabi. 
- switch (getTriple().getEnvironment()) { - case llvm::Triple::GNUEABIHF: - case llvm::Triple::GNUEABI: - case llvm::Triple::EABI: - return llvm::ExceptionHandling::None; - default: - if (getTriple().getArch() == llvm::Triple::arm || - getTriple().getArch() == llvm::Triple::thumb) - return llvm::ExceptionHandling::SjLj; - return llvm::ExceptionHandling::None; - } -} - bool FreeBSD::HasNativeLLVMSupport() const { return true; } ToolChain::UnwindTableLevel diff --git a/clang/lib/Driver/ToolChains/FreeBSD.h b/clang/lib/Driver/ToolChains/FreeBSD.h index 9b24ef1a1e1c..cec67d84a2ce 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.h +++ b/clang/lib/Driver/ToolChains/FreeBSD.h @@ -78,8 +78,6 @@ public: void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - llvm::ExceptionHandling - GetExceptionModel(const llvm::opt::ArgList &Args) const override; UnwindTableLevel getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override; bool isPIEDefault(const llvm::opt::ArgList &Args) const override; diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index 79c08adb8fab..9678e30699c8 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -1562,7 +1562,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { const auto *OpDeleteType = OpDeleteQualType.getTypePtr()->castAs<FunctionProtoType>(); if (OpDeleteType->getNumParams() > DeleteArgs.size() && - S.getASTContext().hasSameType( + S.getASTContext().hasSameUnqualifiedType( OpDeleteType->getParamType(DeleteArgs.size()), FrameSize->getType())) DeleteArgs.push_back(FrameSize); @@ -1579,7 +1579,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { // So we are not forced to pass alignment to the deallocation function. if (S.getLangOpts().CoroAlignedAllocation && OpDeleteType->getNumParams() > DeleteArgs.size() && - S.getASTContext().hasSameType( + S.getASTContext().hasSameUnqualifiedType( OpDeleteType->getParamType(DeleteArgs.size()), FrameAlignment->getType())) DeleteArgs.push_back(FrameAlignment); diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 46948c12617c..49855305cecc 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -1849,8 +1849,12 @@ std::optional<SVal> RegionStoreManager::getSValFromInitListExpr( // Go to the nested initializer list. ILE = IL; } - llvm_unreachable( - "Unhandled InitListExpr sub-expressions or invalid offsets."); + + assert(ILE); + + // FIXME: Unhandeled InitListExpr sub-expression, possibly constructing an + // enum? 
+ return std::nullopt; } /// Returns an SVal, if possible, for the specified position in a string diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index ec860fdc4ff9..37b2b57c0c84 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -583,6 +583,7 @@ static void GetTls(uptr *addr, uptr *size) { *addr = (uptr)tcb->tcb_dtv[1]; } } +#else #error "Unknown OS" #endif } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp index c647ab107ec5..ac2afe42e269 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp @@ -231,8 +231,6 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { // Check that tool command lines are simple and that complete escaping is // unnecessary. CHECK(!internal_strchr(arg, '"') && "quotes in args unsupported"); - CHECK(!internal_strstr(arg, "\\\\") && - "double backslashes in args unsupported"); CHECK(arglen > 0 && arg[arglen - 1] != '\\' && "args ending in backslash and empty args unsupported"); command_line.append("\"%s\" ", arg); diff --git a/libcxx/include/__config b/libcxx/include/__config index ac6a1422bfe3..2f11f3b7d495 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -37,7 +37,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 16.0.1 == 16.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 160000 +# define _LIBCPP_VERSION 160001 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h index e1f590c65efe..ca3e8a59922d 100644 --- a/libcxx/include/__expected/expected.h +++ b/libcxx/include/__expected/expected.h @@ -292,7 +292,8 @@ private: "be reverted to the previous state in case an exception is thrown during the assignment."); _T2 __tmp(std::move(__oldval)); std::destroy_at(std::addressof(__oldval)); - __exception_guard __trans([&] { std::construct_at(std::addressof(__oldval), std::move(__tmp)); }); + auto __trans = + std::__make_exception_guard([&] { std::construct_at(std::addressof(__oldval), std::move(__tmp)); }); std::construct_at(std::addressof(__newval), std::forward<_Args>(__args)...); __trans.__complete(); } @@ -451,7 +452,7 @@ public: if constexpr (is_nothrow_move_constructible_v<_Err>) { _Err __tmp(std::move(__with_err.__union_.__unex_)); std::destroy_at(std::addressof(__with_err.__union_.__unex_)); - __exception_guard __trans([&] { + auto __trans = std::__make_exception_guard([&] { std::construct_at(std::addressof(__with_err.__union_.__unex_), std::move(__tmp)); }); std::construct_at(std::addressof(__with_err.__union_.__val_), std::move(__with_val.__union_.__val_)); @@ -464,7 +465,7 @@ public: "that it can be reverted to the previous state in case an exception is thrown during swap."); _Tp __tmp(std::move(__with_val.__union_.__val_)); std::destroy_at(std::addressof(__with_val.__union_.__val_)); - __exception_guard __trans([&] { + auto __trans = std::__make_exception_guard([&] { std::construct_at(std::addressof(__with_val.__union_.__val_), std::move(__tmp)); }); std::construct_at(std::addressof(__with_val.__union_.__unex_), 
std::move(__with_err.__union_.__unex_)); diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h index 0067780c3f5d..90aecb7d6ad2 100644 --- a/libcxx/include/__memory/uninitialized_algorithms.h +++ b/libcxx/include/__memory/uninitialized_algorithms.h @@ -421,7 +421,7 @@ constexpr void __allocator_construct_at_multidimensional(_Alloc& __alloc, _Tp* _ _Tp& __array = *__loc; // If an exception is thrown, destroy what we have constructed so far in reverse order. - __exception_guard __guard([&]() { + auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__elem_alloc, __array, __array + __i); }); @@ -461,7 +461,7 @@ constexpr void __allocator_construct_at_multidimensional(_Alloc& __alloc, _Tp* _ _Tp& __array = *__loc; // If an exception is thrown, destroy what we have constructed so far in reverse order. - __exception_guard __guard([&]() { + auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__elem_alloc, __array, __array + __i); }); for (; __i != extent_v<_Tp>; ++__i) { @@ -488,7 +488,7 @@ __uninitialized_allocator_fill_n_multidimensional(_Alloc& __alloc, _BidirIter __ _BidirIter __begin = __it; // If an exception is thrown, destroy what we have constructed so far in reverse order. - __exception_guard __guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); }); + auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); }); for (; __n != 0; --__n, ++__it) { std::__allocator_construct_at_multidimensional(__value_alloc, std::addressof(*__it), __value); } @@ -505,7 +505,7 @@ __uninitialized_allocator_value_construct_n_multidimensional(_Alloc& __alloc, _B _BidirIter __begin = __it; // If an exception is thrown, destroy what we have constructed so far in reverse order. - __exception_guard __guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); }); + auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); }); for (; __n != 0; --__n, ++__it) { std::__allocator_construct_at_multidimensional(__value_alloc, std::addressof(*__it)); } diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h index 2489502bcdaf..f7b9a0b408c1 100644 --- a/libcxx/include/__memory_resource/polymorphic_allocator.h +++ b/libcxx/include/__memory_resource/polymorphic_allocator.h @@ -98,7 +98,7 @@ public: template <class _Type, class... _CtorArgs> [[nodiscard]] _Type* new_object(_CtorArgs&&... 
__ctor_args) { _Type* __ptr = allocate_object<_Type>(); - __exception_guard __guard([&] { deallocate_object(__ptr); }); + auto __guard = std::__make_exception_guard([&] { deallocate_object(__ptr); }); construct(__ptr, std::forward<_CtorArgs>(__ctor_args)...); __guard.__complete(); return __ptr; diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h index 737d1a69c971..46f9359a5c0e 100644 --- a/libcxx/include/__utility/exception_guard.h +++ b/libcxx/include/__utility/exception_guard.h @@ -60,25 +60,26 @@ _LIBCPP_BEGIN_NAMESPACE_STD #ifndef _LIBCPP_NO_EXCEPTIONS template <class _Rollback> -struct __exception_guard { - __exception_guard() = delete; +struct __exception_guard_exceptions { + __exception_guard_exceptions() = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __exception_guard(_Rollback __rollback) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __exception_guard_exceptions(_Rollback __rollback) : __rollback_(std::move(__rollback)), __completed_(false) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __exception_guard(__exception_guard&& __other) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 + __exception_guard_exceptions(__exception_guard_exceptions&& __other) _NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value) : __rollback_(std::move(__other.__rollback_)), __completed_(__other.__completed_) { __other.__completed_ = true; } - __exception_guard(__exception_guard const&) = delete; - __exception_guard& operator=(__exception_guard const&) = delete; - __exception_guard& operator=(__exception_guard&&) = delete; + __exception_guard_exceptions(__exception_guard_exceptions const&) = delete; + __exception_guard_exceptions& operator=(__exception_guard_exceptions const&) = delete; + __exception_guard_exceptions& operator=(__exception_guard_exceptions&&) = delete; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __complete() _NOEXCEPT { __completed_ = true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__exception_guard() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__exception_guard_exceptions() { if (!__completed_) __rollback_(); } @@ -87,36 +88,46 @@ private: _Rollback __rollback_; bool __completed_; }; + +_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_exceptions); + +template <class _Rollback> +using __exception_guard = __exception_guard_exceptions<_Rollback>; #else // _LIBCPP_NO_EXCEPTIONS template <class _Rollback> -struct __exception_guard { - __exception_guard() = delete; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG explicit __exception_guard(_Rollback) {} +struct __exception_guard_noexceptions { + __exception_guard_noexceptions() = delete; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_NODEBUG explicit __exception_guard_noexceptions(_Rollback) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG __exception_guard(__exception_guard&& __other) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG + __exception_guard_noexceptions(__exception_guard_noexceptions&& __other) _NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value) : __completed_(__other.__completed_) { __other.__completed_ = true; } - __exception_guard(__exception_guard const&) = delete; - __exception_guard& operator=(__exception_guard const&) = delete; - __exception_guard& operator=(__exception_guard&&) = delete; + __exception_guard_noexceptions(__exception_guard_noexceptions const&) = delete; + 
__exception_guard_noexceptions& operator=(__exception_guard_noexceptions const&) = delete; + __exception_guard_noexceptions& operator=(__exception_guard_noexceptions&&) = delete; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG void __complete() _NOEXCEPT { __completed_ = true; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG ~__exception_guard() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG ~__exception_guard_noexceptions() { _LIBCPP_ASSERT(__completed_, "__exception_guard not completed with exceptions disabled"); } private: bool __completed_ = false; }; -#endif // _LIBCPP_NO_EXCEPTIONS -_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard); +_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_noexceptions); + +template <class _Rollback> +using __exception_guard = __exception_guard_noexceptions<_Rollback>; +#endif // _LIBCPP_NO_EXCEPTIONS template <class _Rollback> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __exception_guard<_Rollback> __make_exception_guard(_Rollback __rollback) { diff --git a/libunwind/include/libunwind.modulemap b/libunwind/include/libunwind.modulemap index 162fe1d279a3..775841ecb5f1 100644 --- a/libunwind/include/libunwind.modulemap +++ b/libunwind/include/libunwind.modulemap @@ -6,5 +6,8 @@ module libunwind [system] { module unwind [system] { header "__libunwind_config.h" header "unwind.h" + private textual header "unwind_arm_ehabi.h" + private textual header "unwind_itanium.h" + export * } diff --git a/libunwind/include/unwind.h b/libunwind/include/unwind.h index 26cdef22207e..b1775d3a3dec 100644 --- a/libunwind/include/unwind.h +++ b/libunwind/include/unwind.h @@ -56,9 +56,9 @@ typedef enum { typedef struct _Unwind_Context _Unwind_Context; // opaque #if defined(_LIBUNWIND_ARM_EHABI) -#include "unwind_arm_ehabi.h" +#include <unwind_arm_ehabi.h> #else -#include "unwind_itanium.h" +#include <unwind_itanium.h> #endif typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn) diff --git a/libunwind/src/DwarfInstructions.hpp b/libunwind/src/DwarfInstructions.hpp index 27432be56133..9962c2ffa0ca 100644 --- a/libunwind/src/DwarfInstructions.hpp +++ b/libunwind/src/DwarfInstructions.hpp @@ -224,7 +224,8 @@ int DwarfInstructions<A, R>::stepWithDwarf(A &addressSpace, pint_t pc, p &= ~0xfULL; // CFA is the bottom of the current stack frame. for (; p < cfa; p += 16) { - __asm__ __volatile__(".arch_extension memtag\n" + __asm__ __volatile__(".arch armv8.5-a\n" + ".arch_extension memtag\n" "stg %[Ptr], [%[Ptr]]\n" : : [Ptr] "r"(p) diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 2a472be943f3..543b19f7e72a 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -194,9 +194,20 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv) addi 4, 3, PPC64_OFFS_FP // load VS register +#ifdef __LITTLE_ENDIAN__ +// For little-endian targets, we need a swap since lxvd2x will load the register +// in the incorrect doubleword order. +// FIXME: when supporting targets older than Power9 on LE is no longer required, +// this can be changed to simply `lxv n, (16 * n)(4)`. 
#define PPC64_LVS(n) \ lxvd2x n, 0, 4 ;\ + xxswapd n, n ;\ addi 4, 4, 16 +#else +#define PPC64_LVS(n) \ + lxvd2x n, 0, 4 ;\ + addi 4, 4, 16 +#endif // restore the first 32 VS regs (and also all floating point regs) PPC64_LVS(0) @@ -232,9 +243,16 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv) PPC64_LVS(30) PPC64_LVS(31) +#ifdef __LITTLE_ENDIAN__ +#define PPC64_CLVS_RESTORE(n) \ + addi 4, 3, PPC64_OFFS_FP + n * 16 ;\ + lxvd2x n, 0, 4 ;\ + xxswapd n, n +#else #define PPC64_CLVS_RESTORE(n) \ addi 4, 3, PPC64_OFFS_FP + n * 16 ;\ lxvd2x n, 0, 4 +#endif #if !defined(_AIX) // use VRSAVE to conditionally restore the remaining VS regs, that are diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index 6c26b79877f6..79f5696a9888 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -351,9 +351,20 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) addi 4, 3, PPC64_OFFS_FP // store VS register +#ifdef __LITTLE_ENDIAN__ +// For little-endian targets, we need a swap since stxvd2x will store the +// register in the incorrect doubleword order. +// FIXME: when supporting targets older than Power9 on LE is no longer required +// this can be changed to simply `stxv n, 16 * n(4)`. #define PPC64_STVS(n) \ + xxswapd n, n ;\ stxvd2x n, 0, 4 ;\ addi 4, 4, 16 +#else +#define PPC64_STVS(n) \ + stxvd2x n, 0, 4 ;\ + addi 4, 4, 16 +#endif PPC64_STVS(0) PPC64_STVS(1) diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp index 01372cbdf29a..71aa596ea6ab 100644 --- a/lld/COFF/MinGW.cpp +++ b/lld/COFF/MinGW.cpp @@ -49,6 +49,9 @@ AutoExporter::AutoExporter( "libclang_rt.profile-x86_64", "libc++", "libc++abi", + "libFortran_main", + "libFortranRuntime", + "libFortranDecimal", "libunwind", "libmsvcrt", "libucrtbase", diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 9131367bf223..5b75738e070c 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -114,13 +114,13 @@ public: static unsigned RelocSymbol64(const ELFRelocation &rel); - static unsigned RelocOffset32(const ELFRelocation &rel); + static elf_addr RelocOffset32(const ELFRelocation &rel); - static unsigned RelocOffset64(const ELFRelocation &rel); + static elf_addr RelocOffset64(const ELFRelocation &rel); - static unsigned RelocAddend32(const ELFRelocation &rel); + static elf_sxword RelocAddend32(const ELFRelocation &rel); - static unsigned RelocAddend64(const ELFRelocation &rel); + static elf_sxword RelocAddend64(const ELFRelocation &rel); bool IsRela() { return (reloc.is<ELFRela *>()); } @@ -185,28 +185,28 @@ unsigned ELFRelocation::RelocSymbol64(const ELFRelocation &rel) { return ELFRela::RelocSymbol64(*rel.reloc.get<ELFRela *>()); } -unsigned ELFRelocation::RelocOffset32(const ELFRelocation &rel) { +elf_addr ELFRelocation::RelocOffset32(const ELFRelocation &rel) { if (rel.reloc.is<ELFRel *>()) return rel.reloc.get<ELFRel *>()->r_offset; else return rel.reloc.get<ELFRela *>()->r_offset; } -unsigned ELFRelocation::RelocOffset64(const ELFRelocation &rel) { +elf_addr ELFRelocation::RelocOffset64(const ELFRelocation &rel) { if (rel.reloc.is<ELFRel *>()) return rel.reloc.get<ELFRel *>()->r_offset; else return rel.reloc.get<ELFRela *>()->r_offset; } -unsigned ELFRelocation::RelocAddend32(const ELFRelocation &rel) { +elf_sxword ELFRelocation::RelocAddend32(const ELFRelocation &rel) { if (rel.reloc.is<ELFRel *>()) return 0; else return 
rel.reloc.get<ELFRela *>()->r_addend; } -unsigned ELFRelocation::RelocAddend64(const ELFRelocation &rel) { +elf_sxword ELFRelocation::RelocAddend64(const ELFRelocation &rel) { if (rel.reloc.is<ELFRel *>()) return 0; else @@ -2593,6 +2593,50 @@ ObjectFileELF::ParseTrampolineSymbols(Symtab *symbol_table, user_id_t start_id, rel_data, symtab_data, strtab_data); } +static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, + DataExtractor &debug_data, + Section *rel_section) { + Symbol *symbol = symtab->FindSymbolByID(ELFRelocation::RelocSymbol64(rel)); + if (symbol) { + addr_t value = symbol->GetAddressRef().GetFileAddress(); + DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer(); + // ObjectFileELF creates a WritableDataBuffer in CreateInstance. + WritableDataBuffer *data_buffer = + llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); + uint64_t *dst = reinterpret_cast<uint64_t *>( + data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel)); + uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); + memcpy(dst, &val_offset, sizeof(uint64_t)); + } +} + +static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, + DataExtractor &debug_data, + Section *rel_section, bool is_signed) { + Symbol *symbol = symtab->FindSymbolByID(ELFRelocation::RelocSymbol64(rel)); + if (symbol) { + addr_t value = symbol->GetAddressRef().GetFileAddress(); + value += ELFRelocation::RelocAddend32(rel); + if ((!is_signed && (value > UINT32_MAX)) || + (is_signed && + ((int64_t)value > INT32_MAX || (int64_t)value < INT32_MIN))) { + Log *log = GetLog(LLDBLog::Modules); + LLDB_LOGF(log, "Failed to apply debug info relocations"); + return; + } + uint32_t truncated_addr = (value & 0xFFFFFFFF); + DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer(); + // ObjectFileELF creates a WritableDataBuffer in CreateInstance. + WritableDataBuffer *data_buffer = + llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); + uint32_t *dst = reinterpret_cast<uint32_t *>( + data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset32(rel)); + memcpy(dst, &truncated_addr, sizeof(uint32_t)); + } +} + unsigned ObjectFileELF::ApplyRelocations( Symtab *symtab, const ELFHeader *hdr, const ELFSectionHeader *rel_hdr, const ELFSectionHeader *symtab_hdr, const ELFSectionHeader *debug_hdr, @@ -2656,55 +2700,50 @@ unsigned ObjectFileELF::ApplyRelocations( reloc_type(rel)); } } else { - switch (reloc_type(rel)) { - case R_AARCH64_ABS64: - case R_X86_64_64: { - symbol = symtab->FindSymbolByID(reloc_symbol(rel)); - if (symbol) { - addr_t value = symbol->GetAddressRef().GetFileAddress(); - DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer(); - // ObjectFileELF creates a WritableDataBuffer in CreateInstance. 
- WritableDataBuffer *data_buffer = - llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); - uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); - memcpy(dst, &val_offset, sizeof(uint64_t)); + switch (hdr->e_machine) { + case llvm::ELF::EM_AARCH64: + switch (reloc_type(rel)) { + case R_AARCH64_ABS64: + ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section); + break; + case R_AARCH64_ABS32: + ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true); + break; + default: + assert(false && "unexpected relocation type"); } break; - } - case R_X86_64_32: - case R_X86_64_32S: - case R_AARCH64_ABS32: { - symbol = symtab->FindSymbolByID(reloc_symbol(rel)); - if (symbol) { - addr_t value = symbol->GetAddressRef().GetFileAddress(); - value += ELFRelocation::RelocAddend32(rel); - if ((reloc_type(rel) == R_X86_64_32 && (value > UINT32_MAX)) || - (reloc_type(rel) == R_X86_64_32S && - ((int64_t)value > INT32_MAX && (int64_t)value < INT32_MIN)) || - (reloc_type(rel) == R_AARCH64_ABS32 && - ((int64_t)value > INT32_MAX && (int64_t)value < INT32_MIN))) { - Log *log = GetLog(LLDBLog::Modules); - LLDB_LOGF(log, "Failed to apply debug info relocations"); - break; - } - uint32_t truncated_addr = (value & 0xFFFFFFFF); - DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer(); - // ObjectFileELF creates a WritableDataBuffer in CreateInstance. - WritableDataBuffer *data_buffer = - llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint32_t *dst = reinterpret_cast<uint32_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset32(rel)); - memcpy(dst, &truncated_addr, sizeof(uint32_t)); + case llvm::ELF::EM_LOONGARCH: + switch (reloc_type(rel)) { + case R_LARCH_64: + ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section); + break; + case R_LARCH_32: + ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true); + break; + default: + assert(false && "unexpected relocation type"); + } + break; + case llvm::ELF::EM_X86_64: + switch (reloc_type(rel)) { + case R_X86_64_64: + ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section); + break; + case R_X86_64_32: + ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, + false); + break; + case R_X86_64_32S: + ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true); + break; + case R_X86_64_PC32: + default: + assert(false && "unexpected relocation type"); } break; - } - case R_X86_64_PC32: default: - assert(false && "unexpected relocation type"); + assert(false && "unsupported machine"); } } } diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h index f2052d82e7c1..415d30bbb5cf 100644 --- a/llvm/include/llvm/ADT/AddressRanges.h +++ b/llvm/include/llvm/ADT/AddressRanges.h @@ -28,7 +28,11 @@ public: uint64_t start() const { return Start; } uint64_t end() const { return End; } uint64_t size() const { return End - Start; } + uint64_t empty() const { return size() == 0; } bool contains(uint64_t Addr) const { return Start <= Addr && Addr < End; } + bool contains(const AddressRange &R) const { + return Start <= R.Start && R.End <= End; + } bool intersects(const AddressRange &R) const { return Start < R.End && R.Start < End; } @@ -45,101 +49,163 @@ private: uint64_t End = 0; }; -/// The AddressRanges class helps normalize address range collections. 
-/// This class keeps a sorted vector of AddressRange objects and can perform -/// insertions and searches efficiently. The address ranges are always sorted -/// and never contain any invalid or empty address ranges. -/// Intersecting([100,200), [150,300)) and adjacent([100,200), [200,300)) -/// address ranges are combined during insertion. -class AddressRanges { +/// The AddressRangesBase class presents the base functionality for the +/// normalized address ranges collection. This class keeps a sorted vector +/// of AddressRange-like objects and can perform searches efficiently. +/// The address ranges are always sorted and never contain any invalid, +/// empty or intersected address ranges. + +template <typename T> class AddressRangesBase { protected: - using Collection = SmallVector<AddressRange>; + using Collection = SmallVector<T>; Collection Ranges; public: void clear() { Ranges.clear(); } bool empty() const { return Ranges.empty(); } - bool contains(uint64_t Addr) const { return find(Addr) != Ranges.end(); } + bool contains(uint64_t Addr) const { + return find(Addr, Addr + 1) != Ranges.end(); + } bool contains(AddressRange Range) const { - return find(Range) != Ranges.end(); + return find(Range.start(), Range.end()) != Ranges.end(); } - std::optional<AddressRange> getRangeThatContains(uint64_t Addr) const { - Collection::const_iterator It = find(Addr); + void reserve(size_t Capacity) { Ranges.reserve(Capacity); } + size_t size() const { return Ranges.size(); } + + std::optional<T> getRangeThatContains(uint64_t Addr) const { + typename Collection::const_iterator It = find(Addr, Addr + 1); if (It == Ranges.end()) return std::nullopt; return *It; } - Collection::const_iterator insert(AddressRange Range); - void reserve(size_t Capacity) { Ranges.reserve(Capacity); } - size_t size() const { return Ranges.size(); } - bool operator==(const AddressRanges &RHS) const { - return Ranges == RHS.Ranges; - } - const AddressRange &operator[](size_t i) const { + + typename Collection::const_iterator begin() const { return Ranges.begin(); } + typename Collection::const_iterator end() const { return Ranges.end(); } + + const T &operator[](size_t i) const { assert(i < Ranges.size()); return Ranges[i]; } - Collection::const_iterator begin() const { return Ranges.begin(); } - Collection::const_iterator end() const { return Ranges.end(); } + + bool operator==(const AddressRangesBase<T> &RHS) const { + return Ranges == RHS.Ranges; + } protected: - Collection::const_iterator find(uint64_t Addr) const; - Collection::const_iterator find(AddressRange Range) const; + typename Collection::const_iterator find(uint64_t Start, uint64_t End) const { + if (Start >= End) + return Ranges.end(); + + auto It = + std::partition_point(Ranges.begin(), Ranges.end(), [=](const T &R) { + return AddressRange(R).start() <= Start; + }); + + if (It == Ranges.begin()) + return Ranges.end(); + + --It; + if (End > AddressRange(*It).end()) + return Ranges.end(); + + return It; + } }; -/// AddressRangesMap class maps values to the address ranges. -/// It keeps address ranges and corresponding values. If ranges -/// are combined during insertion, then combined range keeps -/// newly inserted value. -template <typename T> class AddressRangesMap : protected AddressRanges { +/// The AddressRanges class helps normalize address range collections. +/// This class keeps a sorted vector of AddressRange objects and can perform +/// insertions and searches efficiently. 
Intersecting([100,200), [150,300)) +/// and adjacent([100,200), [200,300)) address ranges are combined during +/// insertion. +class AddressRanges : public AddressRangesBase<AddressRange> { public: - void clear() { - Ranges.clear(); - Values.clear(); + Collection::const_iterator insert(AddressRange Range) { + if (Range.empty()) + return Ranges.end(); + + auto It = llvm::upper_bound(Ranges, Range); + auto It2 = It; + while (It2 != Ranges.end() && It2->start() <= Range.end()) + ++It2; + if (It != It2) { + Range = {Range.start(), std::max(Range.end(), std::prev(It2)->end())}; + It = Ranges.erase(It, It2); + } + if (It != Ranges.begin() && Range.start() <= std::prev(It)->end()) { + --It; + *It = {It->start(), std::max(It->end(), Range.end())}; + return It; + } + + return Ranges.insert(It, Range); } - bool empty() const { return AddressRanges::empty(); } - bool contains(uint64_t Addr) const { return AddressRanges::contains(Addr); } - bool contains(AddressRange Range) const { - return AddressRanges::contains(Range); - } - void insert(AddressRange Range, T Value) { - size_t InputSize = Ranges.size(); - Collection::const_iterator RangesIt = AddressRanges::insert(Range); - if (RangesIt == Ranges.end()) - return; +}; - // make Values match to Ranges. - size_t Idx = RangesIt - Ranges.begin(); - typename ValuesCollection::iterator ValuesIt = Values.begin() + Idx; - if (InputSize < Ranges.size()) - Values.insert(ValuesIt, T()); - else if (InputSize > Ranges.size()) - Values.erase(ValuesIt, ValuesIt + InputSize - Ranges.size()); - assert(Ranges.size() == Values.size()); - - // set value to the inserted or combined range. - Values[Idx] = Value; - } - size_t size() const { - assert(Ranges.size() == Values.size()); - return AddressRanges::size(); - } - std::optional<std::pair<AddressRange, T>> - getRangeValueThatContains(uint64_t Addr) const { - Collection::const_iterator It = find(Addr); - if (It == Ranges.end()) - return std::nullopt; +class AddressRangeValuePair { +public: + operator AddressRange() const { return Range; } - return std::make_pair(*It, Values[It - Ranges.begin()]); - } - std::pair<AddressRange, T> operator[](size_t Idx) const { - return std::make_pair(Ranges[Idx], Values[Idx]); - } + AddressRange Range; + int64_t Value = 0; +}; -protected: - using ValuesCollection = SmallVector<T>; - ValuesCollection Values; +inline bool operator==(const AddressRangeValuePair &LHS, + const AddressRangeValuePair &RHS) { + return LHS.Range == RHS.Range && LHS.Value == RHS.Value; +} + +/// AddressRangesMap class maps values to the address ranges. +/// It keeps normalized address ranges and corresponding values. +/// This class keeps a sorted vector of AddressRangeValuePair objects +/// and can perform insertions and searches efficiently. +/// Intersecting([100,200), [150,300)) ranges splitted into non-conflicting +/// parts([100,200), [200,300)). Adjacent([100,200), [200,300)) address +/// ranges are not combined during insertion. +class AddressRangesMap : public AddressRangesBase<AddressRangeValuePair> { +public: + void insert(AddressRange Range, int64_t Value) { + if (Range.empty()) + return; + + // Search for range which is less than or equal incoming Range. + auto It = std::partition_point(Ranges.begin(), Ranges.end(), + [=](const AddressRangeValuePair &R) { + return R.Range.start() <= Range.start(); + }); + + if (It != Ranges.begin()) + It--; + + while (!Range.empty()) { + // Inserted range does not overlap with any range. + // Store it into the Ranges collection. 
+ if (It == Ranges.end() || Range.end() <= It->Range.start()) { + Ranges.insert(It, {Range, Value}); + return; + } + + // Inserted range partially overlaps with current range. + // Store not overlapped part of inserted range. + if (Range.start() < It->Range.start()) { + It = Ranges.insert(It, {{Range.start(), It->Range.start()}, Value}); + It++; + Range = {It->Range.start(), Range.end()}; + continue; + } + + // Inserted range fully overlaps with current range. + if (Range.end() <= It->Range.end()) + return; + + // Inserted range partially overlaps with current range. + // Remove overlapped part from the inserted range. + if (Range.start() < It->Range.end()) + Range = {It->Range.end(), Range.end()}; + + It++; + } + } }; } // namespace llvm diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h index 5b0ea339c4d6..9c7f24e69d48 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h @@ -21,7 +21,7 @@ class DeclContext; /// Mapped value in the address map is the offset to apply to the /// linked address. -using RangesTy = AddressRangesMap<int64_t>; +using RangesTy = AddressRangesMap; // FIXME: Delete this structure. struct PatchLocation { diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 9f6e54377ede..d302d61894fa 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -1659,7 +1659,7 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit, DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(), OrigDwarf.getDWARFObj().getRangesSection(), OrigDwarf.isLittleEndian(), AddressSize); - std::optional<std::pair<AddressRange, int64_t>> CachedRange; + std::optional<AddressRangeValuePair> CachedRange; DWARFUnit &OrigUnit = Unit.getOrigUnit(); auto OrigUnitDie = OrigUnit.getUnitDIE(false); uint64_t UnitBaseAddress = @@ -1687,9 +1687,9 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit, } if (!CachedRange || - !CachedRange->first.contains(Range.StartAddress + BaseAddress)) - CachedRange = FunctionRanges.getRangeValueThatContains( - Range.StartAddress + BaseAddress); + !CachedRange->Range.contains(Range.StartAddress + BaseAddress)) + CachedRange = FunctionRanges.getRangeThatContains(Range.StartAddress + + BaseAddress); // All range entries should lie in the function range. if (!CachedRange) { @@ -1698,8 +1698,8 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit, } LinkedRanges.insert( - {Range.StartAddress + BaseAddress + CachedRange->second, - Range.EndAddress + BaseAddress + CachedRange->second}); + {Range.StartAddress + BaseAddress + CachedRange->Value, + Range.EndAddress + BaseAddress + CachedRange->Value}); } } @@ -1802,7 +1802,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit, // in NewRows. std::vector<DWARFDebugLine::Row> Seq; const auto &FunctionRanges = Unit.getFunctionRanges(); - std::optional<std::pair<AddressRange, int64_t>> CurrRange; + std::optional<AddressRangeValuePair> CurrRange; // FIXME: This logic is meant to generate exactly the same output as // Darwin's classic dsymutil. There is a nicer way to implement this @@ -1821,13 +1821,13 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit, // it is marked as end_sequence in the input (because in that // case, the relocation offset is accurate and that entry won't // serve as the start of another function). 
- if (!CurrRange || !CurrRange->first.contains(Row.Address.Address) || - (Row.Address.Address == CurrRange->first.end() && !Row.EndSequence)) { + if (!CurrRange || !CurrRange->Range.contains(Row.Address.Address) || + (Row.Address.Address == CurrRange->Range.end() && !Row.EndSequence)) { // We just stepped out of a known range. Insert a end_sequence // corresponding to the end of the range. uint64_t StopAddress = - CurrRange ? CurrRange->first.end() + CurrRange->second : -1ULL; - CurrRange = FunctionRanges.getRangeValueThatContains(Row.Address.Address); + CurrRange ? CurrRange->Range.end() + CurrRange->Value : -1ULL; + CurrRange = FunctionRanges.getRangeThatContains(Row.Address.Address); if (!CurrRange) { if (StopAddress != -1ULL) { // Try harder by looking in the Address ranges map. @@ -1836,9 +1836,9 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit, // for now do as dsymutil. // FIXME: Understand exactly what cases this addresses and // potentially remove it along with the Ranges map. - if (std::optional<std::pair<AddressRange, int64_t>> Range = - Ranges.getRangeValueThatContains(Row.Address.Address)) - StopAddress = Row.Address.Address + (*Range).second; + if (std::optional<AddressRangeValuePair> Range = + Ranges.getRangeThatContains(Row.Address.Address)) + StopAddress = Row.Address.Address + (*Range).Value; } } if (StopAddress != -1ULL && !Seq.empty()) { @@ -1863,7 +1863,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit, continue; // Relocate row address and add it to the current sequence. - Row.Address.Address += CurrRange->second; + Row.Address.Address += CurrRange->Value; Seq.emplace_back(Row); if (Row.EndSequence) @@ -2002,8 +2002,8 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File, // the function entry point, thus we can't just lookup the address // in the debug map. Use the AddressInfo's range map to see if the FDE // describes something that we can relocate. - std::optional<std::pair<AddressRange, int64_t>> Range = - Ranges.getRangeValueThatContains(Loc); + std::optional<AddressRangeValuePair> Range = + Ranges.getRangeThatContains(Loc); if (!Range) { // The +4 is to account for the size of the InitialLength field itself. InputOffset = EntryOffset + InitialLength + 4; @@ -2032,7 +2032,7 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File, // fields that will get reconstructed by emitFDE(). unsigned FDERemainingBytes = InitialLength - (4 + AddrSize); TheDwarfEmitter->emitFDE(IteratorInserted.first->getValue(), AddrSize, - Loc + Range->second, + Loc + Range->Value, FrameData.substr(InputOffset, FDERemainingBytes)); InputOffset += FDERemainingBytes; } diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index 5cad267fd845..ae79e8cb9066 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -402,10 +402,9 @@ void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit, // Linked addresses might end up in a different order. // Build linked address ranges. 
AddressRanges LinkedRanges; - for (size_t Idx = 0; Idx < FunctionRanges.size(); Idx++) + for (const AddressRangeValuePair &Range : FunctionRanges) LinkedRanges.insert( - {FunctionRanges[Idx].first.start() + FunctionRanges[Idx].second, - FunctionRanges[Idx].first.end() + FunctionRanges[Idx].second}); + {Range.Range.start() + Range.Value, Range.Range.end() + Range.Value}); if (!FunctionRanges.empty()) emitDwarfDebugArangesTable(Unit, LinkedRanges); diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp index 567d5a4dd47a..0ad3b7235e87 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp @@ -552,6 +552,7 @@ void link_ELF_aarch64(std::unique_ptr<LinkGraph> G, Config.PrePrunePasses.push_back(EHFrameEdgeFixer( ".eh_frame", 8, aarch64::Pointer32, aarch64::Pointer64, aarch64::Delta32, aarch64::Delta64, aarch64::NegDelta32)); + Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame")); // Add a mark-live pass. if (auto MarkLive = Ctx->getMarkLivePass(TT)) diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 7b9c55ff30a5..9342e10b5eda 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsWebAssembly.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -1125,6 +1126,40 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; } + case 'w': + if (Name.startswith("wasm.fma.")) { + rename(F); + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::wasm_relaxed_madd, F->getReturnType()); + return true; + } + if (Name.startswith("wasm.fms.")) { + rename(F); + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::wasm_relaxed_nmadd, F->getReturnType()); + return true; + } + if (Name.startswith("wasm.laneselect.")) { + rename(F); + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::wasm_relaxed_laneselect, + F->getReturnType()); + return true; + } + if (Name == "wasm.dot.i8x16.i7x16.signed") { + rename(F); + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::wasm_relaxed_dot_i8x16_i7x16_signed); + return true; + } + if (Name == "wasm.dot.i8x16.i7x16.add.signed") { + rename(F); + NewFn = Intrinsic::getDeclaration( + F->getParent(), Intrinsic::wasm_relaxed_dot_i8x16_i7x16_add_signed); + return true; + } + break; + case 'x': if (UpgradeX86IntrinsicFunction(F, Name, NewFn)) return true; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index eed29c25714b..0d074951cffc 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1685,6 +1685,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // keep one copy of each constant. MPM.addPass(ConstantMergePass()); + // Remove unused arguments from functions. + MPM.addPass(DeadArgumentEliminationPass()); + // Reduce the code after globalopt and ipsccp. Both can open up significant // simplification opportunities, and both can propagate functions through // function pointers. When this happens, we often have to resolve varargs @@ -1722,9 +1725,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // transform it to pass arguments by value instead of by reference. 
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); - // Remove unused arguments from functions. - MPM.addPass(DeadArgumentEliminationPass()); - FunctionPassManager FPM; // The IPO Passes may leave cruft around. Clean up after them. FPM.addPass(InstCombinePass()); diff --git a/llvm/lib/Support/AddressRanges.cpp b/llvm/lib/Support/AddressRanges.cpp deleted file mode 100644 index 187d5be00dae..000000000000 --- a/llvm/lib/Support/AddressRanges.cpp +++ /dev/null @@ -1,70 +0,0 @@ -//===- AddressRanges.cpp ----------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/AddressRanges.h" -#include "llvm/ADT/STLExtras.h" -#include <inttypes.h> - -using namespace llvm; - -AddressRanges::Collection::const_iterator -AddressRanges::insert(AddressRange Range) { - if (Range.size() == 0) - return Ranges.end(); - - auto It = llvm::upper_bound(Ranges, Range); - auto It2 = It; - while (It2 != Ranges.end() && It2->start() <= Range.end()) - ++It2; - if (It != It2) { - Range = {Range.start(), std::max(Range.end(), std::prev(It2)->end())}; - It = Ranges.erase(It, It2); - } - if (It != Ranges.begin() && Range.start() <= std::prev(It)->end()) { - --It; - *It = {It->start(), std::max(It->end(), Range.end())}; - return It; - } - - return Ranges.insert(It, Range); -} - -AddressRanges::Collection::const_iterator -AddressRanges::find(uint64_t Addr) const { - auto It = std::partition_point( - Ranges.begin(), Ranges.end(), - [=](const AddressRange &R) { return R.start() <= Addr; }); - - if (It == Ranges.begin()) - return Ranges.end(); - - --It; - if (Addr >= It->end()) - return Ranges.end(); - - return It; -} - -AddressRanges::Collection::const_iterator -AddressRanges::find(AddressRange Range) const { - if (Range.size() == 0) - return Ranges.end(); - - auto It = std::partition_point( - Ranges.begin(), Ranges.end(), - [=](const AddressRange &R) { return R.start() <= Range.start(); }); - - if (It == Ranges.begin()) - return Ranges.end(); - - --It; - if (Range.end() > It->end()) - return Ranges.end(); - - return It; -} diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index 9a873413db87..bde5f5db99e7 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -782,6 +782,17 @@ void BTFDebug::visitCompositeType(const DICompositeType *CTy, visitEnumType(CTy, TypeId); } +bool BTFDebug::IsForwardDeclCandidate(const DIType *Base) { + if (const auto *CTy = dyn_cast<DICompositeType>(Base)) { + auto CTag = CTy->getTag(); + if ((CTag == dwarf::DW_TAG_structure_type || + CTag == dwarf::DW_TAG_union_type) && + !CTy->getName().empty() && !CTy->isForwardDecl()) + return true; + } + return false; +} + /// Handle pointer, typedef, const, volatile, restrict and member types. 
void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, bool CheckPointer, bool SeenPointer) { @@ -796,20 +807,15 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, if (CheckPointer && SeenPointer) { const DIType *Base = DTy->getBaseType(); if (Base) { - if (const auto *CTy = dyn_cast<DICompositeType>(Base)) { - auto CTag = CTy->getTag(); - if ((CTag == dwarf::DW_TAG_structure_type || - CTag == dwarf::DW_TAG_union_type) && - !CTy->getName().empty() && !CTy->isForwardDecl()) { - /// Find a candidate, generate a fixup. Later on the struct/union - /// pointee type will be replaced with either a real type or - /// a forward declaration. - auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true); - auto &Fixup = FixupDerivedTypes[CTy]; - Fixup.push_back(std::make_pair(DTy, TypeEntry.get())); - TypeId = addType(std::move(TypeEntry), DTy); - return; - } + if (IsForwardDeclCandidate(Base)) { + /// Find a candidate, generate a fixup. Later on the struct/union + /// pointee type will be replaced with either a real type or + /// a forward declaration. + auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true); + auto &Fixup = FixupDerivedTypes[cast<DICompositeType>(Base)]; + Fixup.push_back(std::make_pair(DTy, TypeEntry.get())); + TypeId = addType(std::move(TypeEntry), DTy); + return; } } } @@ -844,6 +850,13 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId, visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer); } +/// Visit a type entry. CheckPointer is true if the type has +/// one of its predecessors as one struct/union member. SeenPointer +/// is true if CheckPointer is true and one of its predecessors +/// is a pointer. The goal of CheckPointer and SeenPointer is to +/// do pruning for struct/union types so some of these types +/// will not be emitted in BTF and rather forward declarations +/// will be generated. void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId, bool CheckPointer, bool SeenPointer) { if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) { @@ -888,6 +901,11 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId, if (DIToIdMap.find(BaseTy) != DIToIdMap.end()) { DTy = dyn_cast<DIDerivedType>(BaseTy); } else { + if (CheckPointer && DTy->getTag() == dwarf::DW_TAG_pointer_type) { + SeenPointer = true; + if (IsForwardDeclCandidate(BaseTy)) + break; + } uint32_t TmpTypeId; visitTypeEntry(BaseTy, TmpTypeId, CheckPointer, SeenPointer); break; diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h index aa982babd458..f0b42232f4d5 100644 --- a/llvm/lib/Target/BPF/BTFDebug.h +++ b/llvm/lib/Target/BPF/BTFDebug.h @@ -338,6 +338,9 @@ class BTFDebug : public DebugHandlerBase { void visitMapDefType(const DIType *Ty, uint32_t &TypeId); /// @} + /// Check whether the type is a forward declaration candidate or not. + bool IsForwardDeclCandidate(const DIType *Base); + /// Get the file content for the subprogram. Certain lines of the file /// later may be put into string table and referenced by line info. 
std::string populateFileContent(const DISubprogram *SP); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp index 379aaa713a00..88b926fce2aa 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp @@ -239,6 +239,16 @@ bool RISCVELFStreamer::requiresFixups(MCContext &C, const MCExpr *Value, if (B.isInSection() && B.getSection().getKind().isText()) return true; + // If A is undefined and B is defined, we should emit ADD/SUB for A-B. + // Unfortunately, A may be defined later, but this requiresFixups call has to + // eagerly make a decision. For now, emit ADD/SUB unless A is .L*. This + // heuristic handles many temporary label differences for .debug_* and + // .apple_types sections. + // + // TODO Implement delayed relocation decision. + if (!A.isInSection() && !A.isTemporary() && B.isInSection()) + return true; + // Support cross-section symbolic differences ... return A.isInSection() && B.isInSection() && A.getSection().getName() != B.getSection().getName(); diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 83bd2ff6acc3..55c56e76af6f 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -695,6 +695,7 @@ include "X86ScheduleSLM.td" include "X86ScheduleZnver1.td" include "X86ScheduleZnver2.td" include "X86ScheduleZnver3.td" +include "X86ScheduleZnver4.td" include "X86ScheduleBdVer2.td" include "X86ScheduleBtVer2.td" include "X86SchedSkylakeClient.td" @@ -1627,7 +1628,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features, ProcessorFeatures.ZN2Tuning>; def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features, ProcessorFeatures.ZN3Tuning>; -def : Proc<"znver4",ProcessorFeatures.ZN4Features, +def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features, ProcessorFeatures.ZN4Tuning>; def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA], diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 8fddd0037999..7e1c96a429eb 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -896,15 +896,15 @@ multiclass ATOMIC_LOGIC_OP<Format Form, string s> { multiclass ATOMIC_LOGIC_OP_RM<bits<8> Opc8, string s> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, SchedRW = [WriteBitTestSetRegRMW] in { - def 16rm : Ii8<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + def 16rm : I<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), !strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR16:$src2))]>, OpSize16, TB, LOCK; - def 32rm : Ii8<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + def 32rm : I<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), !strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR32:$src2))]>, OpSize32, TB, LOCK; - def 64rm : RIi8<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + def 64rm : RI<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), !strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR64:$src2))]>, TB, LOCK; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 561ba99db4af..f8660a9fa123 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -577,20 
+577,37 @@ let Predicates = [HasAVX, NoVLX] in { def : Pat<(alignedloadv8f16 addr:$src), (VMOVAPSrm addr:$src)>; + def : Pat<(alignedloadv8bf16 addr:$src), + (VMOVAPSrm addr:$src)>; def : Pat<(loadv8f16 addr:$src), (VMOVUPSrm addr:$src)>; + def : Pat<(loadv8bf16 addr:$src), + (VMOVUPSrm addr:$src)>; def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; def : Pat<(store (v8f16 VR128:$src), addr:$dst), (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8bf16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedloadv16f16 addr:$src), (VMOVAPSYrm addr:$src)>; + def : Pat<(alignedloadv16bf16 addr:$src), + (VMOVAPSYrm addr:$src)>; def : Pat<(loadv16f16 addr:$src), (VMOVUPSYrm addr:$src)>; + def : Pat<(loadv16bf16 addr:$src), + (VMOVUPSYrm addr:$src)>; def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v16f16 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16bf16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; } // Use movaps / movups for SSE integer load / store (one byte shorter). diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td index d2460e12b005..49ef6efc6aec 100644 --- a/llvm/lib/Target/X86/X86PfmCounters.td +++ b/llvm/lib/Target/X86/X86PfmCounters.td @@ -290,4 +290,17 @@ def ZnVer3PfmCounters : ProcPfmCounters { ]; } def : PfmCountersBinding<"znver3", ZnVer3PfmCounters>; -def : PfmCountersBinding<"znver4", ZnVer3PfmCounters>; + +def ZnVer4PfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cycles_not_in_halt">; + let UopsCounter = PfmCounter<"retired_ops">; + let IssueCounters = [ + PfmIssueCounter<"Zn4Int", "ops_type_dispatched_from_decoder:int_disp_retire_mode">, + PfmIssueCounter<"Zn4FPU", "ops_type_dispatched_from_decoder:fp_disp_retire_mode">, + PfmIssueCounter<"Zn4Load", "ls_dispatch:ld_dispatch">, + PfmIssueCounter<"Zn4Store", "ls_dispatch:store_dispatch">, + PfmIssueCounter<"Zn4Divider", "div_op_count">, + PfmIssueCounter<"Zn4AGU", "ls_dispatch:ld_st_dispatch + ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch"> + ]; +} +def : PfmCountersBinding<"znver4", ZnVer4PfmCounters>; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td new file mode 100644 index 000000000000..c3f08998419f --- /dev/null +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -0,0 +1,1957 @@ +//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver4 to support instruction +// scheduling and other instruction cost heuristics. +// Based on: +// * AMD Software Optimization Guide for AMD Family 19h Processors. 
+// https://www.amd.com/system/files/TechDocs/56665.zip +//===----------------------------------------------------------------------===// + +def Znver4Model : SchedMachineModel { + // AMD SOG 19h, 2.9.6 Dispatch + // The processor may dispatch up to 6 macro ops per cycle + // into the execution engine. + let IssueWidth = 6; + // AMD SOG 19h, 2.10.3 + // The retire control unit (RCU) tracks the completion status of all + // outstanding operations (integer, load/store, and floating-point) and is + // the final arbiter for exception processing and recovery. + // The unit can receive up to 6 macro ops dispatched per cycle and track up + // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. + let MicroOpBufferSize = 320; + // AMD SOG 19h, 2.9.1 Op Cache + // The op cache is organized as an associative cache with 64 sets and 8 ways. + // At each set-way intersection is an entry containing up to 8 macro ops. + // The maximum capacity of the op cache is 4K ops. + // Agner, 22.5 µop cache + // The size of the µop cache is big enough for holding most critical loops. + // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity, + // with large values here the compilation of certain loops + // ends up taking way too long. + // Ideally for znver4, we should have 6.75K. However, we don't add that + // considering the impact on compile time and prefer using default values + // instead. + // let LoopMicroOpBufferSize = 6750; + // AMD SOG 19h, 2.6.2 L1 Data Cache + // The L1 data cache has a 4- or 5-cycle integer load-to-use latency. + // AMD SOG 19h, 2.12 L1 Data Cache + // The AGU and LS pipelines are optimized for simple address generation modes. + // <...> and can achieve 4-cycle load-to-use integer load latency. + let LoadLatency = 4; + // AMD SOG 19h, 2.12 L1 Data Cache + // The AGU and LS pipelines are optimized for simple address generation modes. + // <...> and can achieve <...> 7-cycle load-to-use FP load latency. + int VecLoadLatency = 7; + // Latency of a simple store operation. + int StoreLatency = 1; + // FIXME: + let HighLatency = 25; // FIXME: any better choice? + // AMD SOG 19h, 2.8 Optimizing Branching + // The branch misprediction penalty is in the range from 11 to 18 cycles, + // <...>. The common case penalty is 13 cycles. + let MispredictPenalty = 13; + + let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass. + + let CompleteModel = 1; +} + +let SchedModel = Znver4Model in { + + +//===----------------------------------------------------------------------===// +// RCU +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.10.3 Retire Control Unit +// The unit can receive up to 6 macro ops dispatched per cycle and track up to +// 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...> +// The retire unit handles in-order commit of up to nine macro ops per cycle. +def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>; + +//===----------------------------------------------------------------------===// +// Integer Execution Unit +// + +// AMD SOG 19h, 2.4 Superscalar Organization +// The processor uses four decoupled independent integer scheduler queues, +// each one servicing one ALU pipeline and one or two other pipelines + +// +// Execution pipes +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.10.2 Execution Units +// The processor contains 4 general purpose integer execution pipes.
+// Each pipe has an ALU capable of general purpose integer operations. +def Zn4ALU0 : ProcResource<1>; +def Zn4ALU1 : ProcResource<1>; +def Zn4ALU2 : ProcResource<1>; +def Zn4ALU3 : ProcResource<1>; + +// AMD SOG 19h, 2.10.2 Execution Units +// There is also a separate branch execution unit. +def Zn4BRU1 : ProcResource<1>; + +// AMD SOG 19h, 2.10.2 Execution Units +// There are three Address Generation Units (AGUs) for all load and store +// address generation. There are also 3 store data movement units +// associated with the same schedulers as the AGUs. +def Zn4AGU0 : ProcResource<1>; +def Zn4AGU1 : ProcResource<1>; +def Zn4AGU2 : ProcResource<1>; + +// +// Execution Units +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.10.2 Execution Units +// ALU0 additionally has divide <...> execution capability. +defvar Zn4Divider = Zn4ALU0; + +// AMD SOG 19h, 2.10.2 Execution Units +// ALU0 additionally has <...> branch execution capability. +defvar Zn4BRU0 = Zn4ALU0; + +// Integer Multiplication issued on ALU1. +defvar Zn4Multiplier = Zn4ALU1; + +// Execution pipeline grouping +//===----------------------------------------------------------------------===// + +// General ALU operations +def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>; + +// General AGU operations +def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>; + +// Control flow: jumps, calls +def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>; + +// Everything that isn't control flow, but still needs to access CC register, +// namely: conditional moves, SETcc. +def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>; + +// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT + +// Simple bit twiddling: bit test, shift/rotate, bit extraction +def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>; + + +// +// Scheduling +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.10.3 Retire Control Unit +// The integer physical register file (PRF) consists of 224 registers. +def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0], + 6, // Max moves that can be eliminated per cycle. + 0>; // Restrict move elimination to zero regs. + +// anandtech, The integer scheduler has a 4*24 entry macro op capacity. +// AMD SOG 19h, 2.10.1 Schedulers +// The schedulers can receive up to six macro ops per cycle, with a limit of +// two per scheduler. Each scheduler can issue one micro op per cycle into +// each of its associated pipelines +def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0 + Zn4ALU1, Zn4AGU1, // scheduler 1 + Zn4ALU2, Zn4AGU2, // scheduler 2 + Zn4ALU3, Zn4BRU1 // scheduler 3 + ]> { + let BufferSize = !mul(4, 24); +} + + +//===----------------------------------------------------------------------===// +// Floating-Point Unit +// + +// AMD SOG 19h, 2.4 Superscalar Organization +// The processor uses <...> two decoupled independent floating point schedulers +// each servicing two FP pipelines and one store or FP-to-integer pipeline. + +// +// Execution pipes +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.10.1 Schedulers +// <...>, and six FPU pipes. 
+// Agner, 22.10 Floating point execution pipes +// There are six floating point/vector execution pipes, +def Zn4FP0 : ProcResource<1>; +def Zn4FP1 : ProcResource<1>; +def Zn4FP2 : ProcResource<1>; +def Zn4FP3 : ProcResource<1>; +def Zn4FP45 : ProcResource<2>; + +// +// Execution Units +//===----------------------------------------------------------------------===// +// AMD SOG 19h, 2.11.1 Floating Point Execution Resources + +// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) +defvar Zn4FPFMul0 = Zn4FP0; +defvar Zn4FPFMul1 = Zn4FP1; + +// (v)FADD* +defvar Zn4FPFAdd0 = Zn4FP2; +defvar Zn4FPFAdd1 = Zn4FP3; + +// All convert operations except pack/unpack +defvar Zn4FPFCvt0 = Zn4FP2; +defvar Zn4FPFCvt1 = Zn4FP3; + +// All Divide and Square Root except Reciprocal Approximation +// AMD SOG 19h, 2.11.1 Floating Point Execution Resources +// FDIV unit can support 2 simultaneous operations in flight +// even though it occupies a single pipe. +// FIXME: BufferSize=2 ? +defvar Zn4FPFDiv = Zn4FP1; + +// Moves and Logical operations on Floating Point Data Types +defvar Zn4FPFMisc0 = Zn4FP0; +defvar Zn4FPFMisc1 = Zn4FP1; +defvar Zn4FPFMisc2 = Zn4FP2; +defvar Zn4FPFMisc3 = Zn4FP3; + +// Integer Adds, Subtracts, and Compares +// Some complex VADD operations are not available in all pipes. +defvar Zn4FPVAdd0 = Zn4FP0; +defvar Zn4FPVAdd1 = Zn4FP1; +defvar Zn4FPVAdd2 = Zn4FP2; +defvar Zn4FPVAdd3 = Zn4FP3; + +// Integer Multiplies, SAD, Blendvb +defvar Zn4FPVMul0 = Zn4FP0; +defvar Zn4FPVMul1 = Zn4FP3; + +// Data Shuffles, Packs, Unpacks, Permute +// Some complex shuffle operations are only available in pipe1. +defvar Zn4FPVShuf = Zn4FP1; +defvar Zn4FPVShufAux = Zn4FP2; + +// Bit Shift Left/Right operations +defvar Zn4FPVShift0 = Zn4FP1; +defvar Zn4FPVShift1 = Zn4FP2; + +// Moves and Logical operations on Packed Integer Data Types +defvar Zn4FPVMisc0 = Zn4FP0; +defvar Zn4FPVMisc1 = Zn4FP1; +defvar Zn4FPVMisc2 = Zn4FP2; +defvar Zn4FPVMisc3 = Zn4FP3; + +// *AES* +defvar Zn4FPAES0 = Zn4FP0; +defvar Zn4FPAES1 = Zn4FP1; + +// *CLM* +defvar Zn4FPCLM0 = Zn4FP0; +defvar Zn4FPCLM1 = Zn4FP1; + +// Execution pipeline grouping +//===----------------------------------------------------------------------===// + +// AMD SOG 19h, 2.11 Floating-Point Unit +// Stores and floating point to general purpose register transfer +// have 2 dedicated pipelines (pipe 5 and 6). +def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>; + +// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) +def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>; + +// (v)FADD* +// Some complex VADD operations are not available in all pipes. 
+def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>; + +// All convert operations except pack/unpack +def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>; + +// All Divide and Square Root except Reciprocal Approximation +// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>; + +// Moves and Logical operations on Floating Point Data Types +def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>; + +// FIXUP and RANGE use FP01 pipelines +def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>; +def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>; +// SCALE instructions use FP23 pipelines +def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>; +def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>; + +// Loads, Stores and Move to General Register (EX) Operations +// AMD SOG 19h, 2.11 Floating-Point Unit +// Stores and floating point to general purpose register transfer +// have 2 dedicated pipelines (pipe 5 and 6). +defvar Zn4FPLd01 = Zn4FP45; + +// AMD SOG 19h, 2.11 Floating-Point Unit +// Note that FP stores are supported on two pipelines, +// but throughput is limited to one per cycle. +let Super = Zn4FP45 in +def Zn4FPSt : ProcResource<1>; + +// Integer Adds, Subtracts, and Compares +// Some complex VADD operations are not available in all pipes. +def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>; + +def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>; +def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>; + +// AVX512 Opmask pipelines +def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>; +def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>; + +// Integer Multiplies, SAD, Blendvb +def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>; + +// Data Shuffles, Packs, Unpacks, Permute +// Some complex shuffle operations are only available in pipe1. +def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>; + +// Bit Shift Left/Right operations +def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>; + +// Moves and Logical operations on Packed Integer Data Types +def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>; + +// *AES* +def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>; + +// *CLM* +def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>; + + +// +// Scheduling +//===----------------------------------------------------------------------===// + +// Agner, 21.8 Register renaming and out-of-order schedulers +// The floating point register file has 192 vector registers +// of 512b each in zen4. +def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1], + 6, // Max moves that can be eliminated per cycle. + 0>; // Restrict move elimination to zero regs. + +// AMD SOG 19h, 2.11 Floating-Point Unit +// The floating-point scheduler has a 2*32 entry macro op capacity. +// AMD SOG 19h, 2.11 Floating-Point Unit +// <...> the scheduler can issue 1 micro op per cycle for each pipe. +// FIXME: those are two separate schedulers, not a single big one. +def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0 + Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1 + ]> { + let BufferSize = !mul(2, 32); +} + +// AMD SOG 19h, 2.11 Floating-Point Unit +// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ) +// even if floating-point scheduler is full. +// FIXME: how to model this properly? 
+ + +//===----------------------------------------------------------------------===// +// Load-Store Unit +// + +// AMD SOG 19h, 2.12 Load-Store Unit +// The LS unit contains three largely independent pipe-lines +// enabling the execution of three 256-bit memory operations per cycle. +def Zn4LSU : ProcResource<3>; + +// AMD SOG 19h, 2.12 Load-Store Unit +// All three memory operations can be loads. +let Super = Zn4LSU in +def Zn4Load : ProcResource<3> { + // AMD SOG 19h, 2.12 Load-Store Unit + // The LS unit can process up to 72 out-of-order loads. + let BufferSize = 72; +} + +def Zn4LoadQueue : LoadQueue<Zn4Load>; + +// AMD SOG 19h, 2.12 Load-Store Unit +// A maximum of two of the memory operations can be stores. +let Super = Zn4LSU in +def Zn4Store : ProcResource<2> { + // AMD SOG 19h, 2.12 Load-Store Unit + // The LS unit utilizes a 64-entry store queue (STQ). + let BufferSize = 64; +} + +def Zn4StoreQueue : StoreQueue<Zn4Store>; + +//===----------------------------------------------------------------------===// +// Basic helper classes. +//===----------------------------------------------------------------------===// + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. + +multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts, + int Lat = 1, list<int> Res = [], int UOps = 1> { + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } +} + +multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat, + list<int> Res, int UOps, int LoadLat, int LoadUOps, + ProcResourceKind AGU, int LoadRes> { + defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>; + + defm : __Zn4WriteRes<SchedRW.Folded, + !listconcat([AGU, Zn4Load], ExePorts), + !add(Lat, LoadLat), + !if(!and(!empty(Res), !eq(LoadRes, 1)), + [], + !listconcat([1, LoadRes], + !if(!empty(Res), + !listsplat(1, !size(ExePorts)), + Res))), + !add(UOps, LoadUOps)>; +} + +// For classes without folded loads. +multiclass Zn4WriteResInt<SchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1> { + defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>; +} + +multiclass Zn4WriteResXMM<SchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1> { + defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>; +} + +multiclass Zn4WriteResYMM<SchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1> { + defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>; +} + +multiclass Zn4WriteResZMM<SchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1> { + defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>; +} + +// For classes with folded loads. 
+multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1, + int LoadUOps = 0, int LoadRes = 1> { + defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps, + Znver4Model.LoadLatency, + LoadUOps, Zn4AGU012, LoadRes>; +} + +multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1, + int LoadUOps = 0, int LoadRes = 1> { + defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps, + Znver4Model.VecLoadLatency, + LoadUOps, Zn4FPLd01, LoadRes>; +} + +multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 1, + int LoadUOps = 0, int LoadRes = 1> { + defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps, + Znver4Model.VecLoadLatency, + LoadUOps, Zn4FPLd01, LoadRes>; +} + +multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, int Lat = 1, + list<int> Res = [], int UOps = 2, + int LoadUOps = 0, int LoadRes = 1> { + defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps, + Znver4Model.VecLoadLatency, + LoadUOps, Zn4FPLd01, LoadRes>; +} + +//===----------------------------------------------------------------------===// +// Here be dragons. +//===----------------------------------------------------------------------===// + +def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>; + +def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>; +def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>; +def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>; + +// AMD SOG 19h, 2.11 Floating-Point Unit +// There is 1 cycle of added latency for a result to cross +// from F to I or I to F domain. +def : ReadAdvance<ReadInt2Fpu, -1>; + +// Instructions with both a load and a store folded are modeled as a folded +// load + WriteRMW. +defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>; + +// Loads, stores, and moves, not folded with other operations. +defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>; + +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>; + +def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> { + let Latency = !add(Znver4Model.LoadLatency, 1); + let ResourceCycles = [3, 1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>; + +defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>; +defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>; +defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>; + +// Treat misc copies as a move. 
+def : InstRW<[WriteMove], (instrs COPY)>; + +def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> { + let Latency = Znver4Model.LoadLatency; + let ResourceCycles = [1, 1, 4]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>; + +def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> { + let Latency = Znver4Model.StoreLatency; + let ResourceCycles = [4, 1, 1]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>; + +// Arithmetic. +defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op. + +def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [4]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32, + AND8i8, AND16i16, AND32i32, AND64i32, + OR8i8, OR16i16, OR32i32, OR64i32, + SUB8i8, SUB16i16, SUB32i32, SUB64i32, + XOR8i8, XOR16i16, XOR32i32, XOR64i32)>; + +def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [4]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>; + +def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [2]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>; + +def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> { + let Latency = 3; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr, + PEXT32rr, PEXT64rr)>; + +defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op. + +def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> { + let Latency = 1; + let ResourceCycles = [1, 1, 7, 1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>; + +// This is for simple LEAs with one or two input operands. +defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads. + +// This write is used for slow LEA instructions. +def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 2; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset), +// or an LEA with a `Scale` value different than 1. +def Zn4SlowLEAPredicate : MCSchedPredicate< + CheckAny<[ + // A 3-operand LEA (base, index, offset). + IsThreeOperandsLEAFn, + // An LEA with a "Scale" different than 1. + CheckAll<[ + CheckIsImmOperand<2>, + CheckNot<CheckImmOperand<2, 1>> + ]> + ]> +>; + +def Zn4WriteLEA : SchedWriteVariant<[ + SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>, + SchedVar<NoSchedPred, [WriteLEA]> +]>; + +def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; + +def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 2; // FIXME: not from llvm-exegesis + let ResourceCycles = [4]; + let NumMicroOps = 2; +} + +def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>; + +// Integer multiplication +defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication. +defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication. +defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate. 
+defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register. +defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication. +defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags. +defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate. +defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register. +defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication. +defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags. +defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate. +defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register. +defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part. +defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part. + +defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap. +defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap. + +defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap. + +def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 3; + let ResourceCycles = [12]; + let NumMicroOps = 3; +} +def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>; + +defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap. + +def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency); + let ResourceCycles = [1, 1, 12]; + let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2); +} +def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>; + +def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 3; // FIXME: not from llvm-exegesis + let ResourceCycles = [24]; + let NumMicroOps = 19; +} +def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>; + +def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 4; // FIXME: not from llvm-exegesis + let ResourceCycles = [59]; + let NumMicroOps = 28; +} +def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>; + +def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>; + +def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> { + let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis + let ResourceCycles = [1, 1, 2]; + let NumMicroOps = 5; +} +def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>; + +def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> { + let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis + let ResourceCycles = [1, 1, 2]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>; + +// Integer division. 
+// FIXME: uops for 8-bit division measures as 2. for others it's a guess. +// FIXME: latency for 8-bit division measures as 10. for others it's a guess. +defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>; +defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>; +defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>; +defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>; +defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>; +defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>; +defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>; +defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>; + +defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward. +defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse. + +defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count. + +def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [4]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>; + +defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count. + +def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 1; + let ResourceCycles = [4]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>; + +defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count. + +def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> { + let Latency = 2; + let ResourceCycles = [4]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>; + +defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move. +defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move. +defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code. +defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis +defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH. + +defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test +defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>; +defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>; + +defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set +defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>; +defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>; + +// Integer shifts and rotates. 
+defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; +defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; +defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; + +def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> { + let Latency = 1; + let ResourceCycles = [2]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1, + RCR8r1, RCR16r1, RCR32r1, RCR64r1)>; + +def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency); + let ResourceCycles = [1, 1, 2]; + let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1); +} +def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1, + RCR8m1, RCR16m1, RCR32m1, RCR64m1)>; + +def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> { + let Latency = 3; + let ResourceCycles = [6]; + let NumMicroOps = 7; +} +def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>; + +def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency); + let ResourceCycles = [1, 1, 8]; + let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3); +} +def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>; + +def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> { + let Latency = 4; + let ResourceCycles = [8]; + let NumMicroOps = 9; +} +def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>; + +def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency); + let ResourceCycles = [1, 1, 8]; + let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2); +} +def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>; + +defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; + +def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> { + let Latency = 3; + let ResourceCycles = [6]; + let NumMicroOps = 7; +} +def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>; + +def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency); + let ResourceCycles = [1, 1, 8]; + let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2); +} +def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>; + +def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> { + let Latency = 4; + let ResourceCycles = [8]; + let NumMicroOps = 9; +} +def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>; + +def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency); + let ResourceCycles = [1, 1, 8]; + let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2); +} +def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>; + +// Double shift instructions. 
+defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>; +defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>; +defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>; +defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>; + +// BMI1 BEXTR/BLS, BMI2 BZHI +defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; +defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>; +defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis + +// Floating point. This covers both scalar and vector operations. +defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>; +defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>; +defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>; +defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; + +def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> { + let Latency = 2; // FIXME: not from llvm-exegesis + let ResourceCycles = [1, 1]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr, + VMOVHPDmr, VMOVHPSmr)>; + +defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; + +defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>; +defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>; +defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>; +defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>; + +defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub. 
+ +def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { + let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis + let ResourceCycles = [1, 1, 24]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m, + SUB_FI16m, SUB_FI32m, + SUBR_FI16m, SUBR_FI32m, + MUL_FI16m, MUL_FI32m)>; + +def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { + let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis + let ResourceCycles = [1, 1, 62]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m, + DIVR_FI16m, DIVR_FI32m)>; + +defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM). +defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM). +defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM). +defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub. +defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM). +defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM). +defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM). +defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare. +defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 1, [1], 1>; // Floating point compare (XMM). +defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (YMM). +defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [4], 1>; // Floating point compare (ZMM). +defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare. +defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare (XMM). +defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (YMM). +defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [4], 1>; // Floating point double compare (ZMM). +defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87). +defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE). +defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication. +defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM). +defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM). +defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM). +defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication. +defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM). +defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM). +defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM). +defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division. 
+defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM). +defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM). +defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM). +defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division. +defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM). +defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM). +defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM). +defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root. +defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM). +defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM). +defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM). +defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root. +defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM). +defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM). +defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM). +defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root. +defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate. +defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM). +defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM). +defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM). +defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate. +defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM). +defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM). +defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM). +defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add. +defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM). +defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM). +defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [4], 1>; // Fused Multiply Add (ZMM). +defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product. +defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product. +defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM). +defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs. 
+defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding. +defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM). +defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM). + +defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals. +defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM). +defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM). +defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions. +defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM). +defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM). +defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles. +defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM). +defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM). +defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles. +defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM). +defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM). +defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends. +defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM). +defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM). +defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends. +defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM). +defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM). + +// Horizontal Add/Sub (float and integer) +defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>; +defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>; +defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>; +defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>; +defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>; +defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>; +defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>; + +// Vector integer operations. 
+defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; + +def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> { + let Latency = 4; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>; + +def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let ResourceCycles = [1, 1, 1]; + let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); +} +def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>; + +def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> { + let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let ResourceCycles = [1, 1, 1]; + let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); +} +def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>; + +defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>; +defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>; +defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>; +defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>; +defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>; + +defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>; +defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>; + +def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> { + let Latency = 1; + let ResourceCycles = [1, 2]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>; + +def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> { + let Latency = 1; + let ResourceCycles = [1, 4]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>; + +defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals. 
+ +def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> { + let Latency = 3; + let ResourceCycles = [1, 1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>; + +def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> { + let Latency = 3; + let ResourceCycles = [1, 1]; + let NumMicroOps = 2; +} +def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>; + +defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM). + +def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr, + PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr, + PAVGBrr, PAVGWrr, + PSIGNBrr, PSIGNDrr, PSIGNWrr, + VPABSBrr, VPABSDrr, VPABSWrr, + VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr, + VPAVGBrr, VPAVGWrr, + VPCMPEQQrr, + VPSIGNBrr, VPSIGNDrr, VPSIGNWrr, + PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>; + +def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr, + KANDBrr, KANDDrr, KANDQrr, KANDWrr, + KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr, + KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk, + KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk, + KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr, + KORBrr, KORDrr, KORQrr, KORWrr, + KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr, + KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr, + KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr, + KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr, + KXORBrr, KXORDrr, KXORQrr, KXORWrr)>; + +def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>; + +def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>; + +def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> { + // TODO: All align instructions are expected to be of 4 cycle latency + let Latency = 4; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri, + VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri) + >; +defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM). + +def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr, + VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr, + VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr, + VPAVGBYrr, VPAVGWYrr, + VPCMPEQQYrr, + VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>; + +defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM). + +defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals. +defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM). +defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM). 
+defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM). +defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions. +defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM). +defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM). +defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default). +defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM). +defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM). +defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM). +defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default). +defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM). +defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM). +defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM). +defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default). +defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM). +defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM). +defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM). +defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD. +defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM). +defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM). +defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles. +defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM). +defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM). +defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM). +defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles. +defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM). +defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM). +defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM). +defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends. +defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM). +defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM). +defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends. +defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM). 
+defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM). +defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW. +defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM). +defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM). +defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM). +defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD. +defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM). +defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM). +defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS. + +// Vector insert/extract operations. +defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element. +defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr. +defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store. + +// MOVMSK operations. +defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; +defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; +defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>; +defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>; + +// Conversion between integer and float. +defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer. +defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM). +defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM). +defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM). + +def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> { + let Latency = 1; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} +defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer. + +defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM). +defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM). +defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM). + +defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double. +defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM). +defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM). +defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM). + +def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> { + let Latency = 2; + let ResourceCycles = [6]; + let NumMicroOps = 2; +} + +defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float. +defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM). +defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM). +defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM). 
+
+def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
+  let Latency = 3;
+  let ResourceCycles = [1];
+  let NumMicroOps = 2;
+}
+
+defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
+defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
+defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
+defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
+
+defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
+defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
+defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
+
+defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
+defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
+defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
+
+def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
+
+def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
+  let ResourceCycles = [1, 1, 2];
+  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
+
+def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 1;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
+
+def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+  let ResourceCycles = [1, 1, 2];
+  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
+
+def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 2;
+  let ResourceCycles = [3];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
+
+def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
+  let ResourceCycles = [1, 1, 3];
+  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
+
+def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 3;
+  let ResourceCycles = [8];
+  let NumMicroOps = 4;
+}
+def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
+
+def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
+  let ResourceCycles = [1, 1, 8];
+  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
+
+def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 6;
+  let ResourceCycles = [8];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
+
+def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
+
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+// Packed Compare Explicit Length Strings, Return Mask
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+// Packed Compare Implicit Length Strings, Return Index
+defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
+// Packed Compare Explicit Length Strings, Return Index
+defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
+
+// AES instructions.
+defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
+defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
+defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+
+// EMMS/FEMMS
+defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+// Load/store MXCSR
+defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
+defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
+
+// Catch-all for expensive system instructions.
+defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
+
+def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 0; // FIXME: not from llvm-exegesis
+  let ResourceCycles = [1];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
+  let Latency = 10; // FIXME: not from llvm-exegesis
+  let ResourceCycles = [24];
+  let NumMicroOps = 18;
+}
+def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
+
+// AVX2.
+defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
+defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
+defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
+
+def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
+  let Latency = 3;
+  let ResourceCycles = [1];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
+
+def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
+  let ResourceCycles = [1, 1, 1];
+  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
+  let Latency = 7;
+  let ResourceCycles = [1];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
+
+def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
+  let ResourceCycles = [1, 1, 2];
+  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
+
+def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
+  let Latency = 6;
+  let ResourceCycles = [1];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
+
+def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
+  let ResourceCycles = [1, 1, 2];
+  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
+
+def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
+  let Latency = 5;
+  let ResourceCycles = [1];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
+
+def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
+  let ResourceCycles = [1, 1, 2];
+  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
+
+defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
+defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
+defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
+defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
+defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
+
+// Old microcoded instructions that nobody uses.
+defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
+
+// Fence instructions.
+defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
+
+def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
+  let Latency = 1;
+  let ResourceCycles = [30];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;
+
+def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
+
+// Nop, not very useful except that it provides a model for nops!
+defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Zero Cycle Move
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn4WriteZeroLatency : SchedWriteRes<[]> {
+  let Latency = 0;
+  let ResourceCycles = [];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
+                                            MOV64rr, MOV64rr_REV,
+                                            MOVSX32rr32)>;
+
+def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
+  let Latency = 0;
+  let ResourceCycles = [];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
+                                               XCHG64rr, XCHG64ar)>;
+
+defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
+
+defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;
+
+defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
+defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;
+
+def : IsOptimizableRegisterMove<[
+  InstructionEquivalenceClass<[
+    // GPR variants.
+    MOV32rr, MOV32rr_REV,
+    MOV64rr, MOV64rr_REV,
+    MOVSX32rr32,
+    XCHG32rr, XCHG32ar,
+    XCHG64rr, XCHG64ar,
+
+    // MMX variants.
+    // MMX moves are *NOT* eliminated.
+
+    // SSE variants.
+    MOVAPSrr, MOVAPSrr_REV,
+    MOVUPSrr, MOVUPSrr_REV,
+    MOVAPDrr, MOVAPDrr_REV,
+    MOVUPDrr, MOVUPDrr_REV,
+    MOVDQArr, MOVDQArr_REV,
+    MOVDQUrr, MOVDQUrr_REV,
+
+    // AVX variants.
+    VMOVAPSrr, VMOVAPSrr_REV,
+    VMOVUPSrr, VMOVUPSrr_REV,
+    VMOVAPDrr, VMOVAPDrr_REV,
+    VMOVUPDrr, VMOVUPDrr_REV,
+    VMOVDQArr, VMOVDQArr_REV,
+    VMOVDQUrr, VMOVDQUrr_REV,
+
+    // AVX YMM variants.
+    VMOVAPSYrr, VMOVAPSYrr_REV,
+    VMOVUPSYrr, VMOVUPSYrr_REV,
+    VMOVAPDYrr, VMOVAPDYrr_REV,
+    VMOVUPDYrr, VMOVUPDYrr_REV,
+    VMOVDQAYrr, VMOVDQAYrr_REV,
+    VMOVDQUYrr, VMOVDQUYrr_REV,
+  ], TruePred >
+]>;
+
+// FIXUP and RANGE Instructions
+def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
+  "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
+  "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
+  "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+  )>;
+
+// SCALE & REDUCE instructions
+def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
+  let Latency = 6;
+  let ResourceCycles = [6];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSCALErr], (instregex
+  "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
+  "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
+  )>;
+
+//BF16PS Instructions
+def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
+  let Latency = 6;
+  let ResourceCycles = [6];
+  let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteBF16], (instregex
+  "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
+  )>;
+
+// BUSD and VPMADD Instructions
+def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 4;
+  let ResourceCycles = [4];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
+  "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+  "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
+  )>;
+
+// SHIFT instructions
+def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHIFTrr], (instregex
+  "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
+  "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
+  "(V?)P(SLL|SRL|SRA)DQYri",
+  "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
+  "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
+  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
+  "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
+  "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
+  "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
+  )>;
+
+def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHIFTri], (instregex
+  "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
+  )>;
+
+// ALIGN Instructions
+def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGN], (instregex
+  "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+  )>;
+
+//PACK Instructions
+def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACK], (instregex
+  "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+  )>;
+
+// MAX and MIN Instructions
+def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteFCmp64], (instregex
+  "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
+  "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
+  "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
+  "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
+  )>;
+
+// MOV Instructions
+def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVS], (instregex
+  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
+  "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
+  "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
+  "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
+  "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
+  )>;
+
+def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 4;
+  let ResourceCycles = [4];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVSZ], (instregex
+  "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
+  )>;
+
+def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 5;
+  let ResourceCycles = [5];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVSrr], (instregex
+  "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
+  )>;
+
+
+//VPTEST Instructions
+def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 3;
+  let ResourceCycles = [3];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ128], (instregex
+  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
+  )>;
+
+def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 4;
+  let ResourceCycles = [4];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ256], (instregex
+  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
+  )>;
+
+def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 5;
+  let ResourceCycles = [5];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ], (instregex
+  "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
+  )>;
+
+// CONFLICT Instructions
+def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4CONFLICTZ128], (instregex
+  "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
+  )>;
+
+def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
+  let Latency = 6;
+  let ResourceCycles = [2,2,2];
+  let NumMicroOps = 4;
+}
+def : InstRW<[Zn4CONFLICTrr], (instregex
+  "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
+  )>;
+
+// RSQRT Instructions
+def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 5;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
+  "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
+  )>;
+
+
+// PERM Instructions
+def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMILP], (instregex
+  "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
+  )>;
+
+def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 3;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_128], (instregex
+  "VPERM(I2|T2)(PS|PD|W)128(rr|rrk|rrkz)",
+  "VPERM(I2|T2)(B|D|Q)128(rr|rrk|rrkz)"
+  )>;
+
+def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_128rr], (instregex
+  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
+  "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
+  )>;
+
+def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_256], (instregex
+  "VPERM(I2|T2)(PS|PD|W)256(rr|rrk|rrkz)",
+  "VPERMP(S|D)Z256(rr|rrk|rrkz)",
+  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
+  "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
+  "VPERM(I2|Q|T2)(B|D|Q)(Z?)256(rr|rrk|rrkz)",
+  "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
+  )>;
+
+def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
+  let Latency = 5;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2Z], (instregex
+  "VPERM(I2|T2)(PS|PD|W)(rr|rrk|rrkz)",
+  "VPERM(B|D|W)Z(rr|rrk|rrkz)",
+  "VPERM(I2|Q|T2)(B|D|Q)(Z?)(rr|rrk|rrkz)",
+  "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
+  "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
+  "VPERMP(S|D)Z(rr|rrk|rrkz)"
+  )>;
+
+// ALU SLOW Misc Instructions
+def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+  let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VecALUZSlow], (instrs
+  VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
+  VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
+  VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
+  VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
+  VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
+  VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
+  VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
+  VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
+  VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
+  VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
+  VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
+  VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
+  VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
+  VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
+  )>;
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn4WriteZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
+                                          XOR64rr, XOR64rr_REV,
+                                          SUB32rr, SUB32rr_REV,
+                                          SUB64rr, SUB64rr_REV)>;
+
+def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
+                                                CMP16rr, CMP16rr_REV,
+                                                CMP32rr, CMP32rr_REV,
+                                                CMP64rr, CMP64rr_REV)>;
+
+def Zn4WriteFZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+// NOTE: XORPSrr, XORPDrr are not zero-cycle!
+def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
+                                           VANDNPSrr, VANDNPDrr)>;
+
+def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+                                            VANDNPSYrr, VANDNPDYrr)>;
+
+def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+// NOTE: PXORrr,PANDNrr are not zero-cycle!
+def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
+
+def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
+
+def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+// PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
+def : InstRW<[Zn4WriteVZeroIdiomALUX],
+             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
+
+def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+  SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[Zn4WriteVZeroIdiomALUY],
+             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
+
+def : IsZeroIdiomFunction<[
+  // GPR Zero-idioms.
+  DepBreakingClass<[ XOR32rr, XOR32rr_REV,
+                     XOR64rr, XOR64rr_REV,
+                     SUB32rr, SUB32rr_REV,
+                     SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
+
+  // SSE XMM Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    XORPSrr, XORPDrr,
+    ANDNPSrr, ANDNPDrr,
+
+    // int variants.
+    PXORrr,
+    PANDNrr,
+    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+    PSUBSBrr, PSUBSWrr,
+    PSUBUSBrr, PSUBUSWrr,
+    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX XMM Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    VXORPSrr, VXORPDrr,
+    VANDNPSrr, VANDNPDrr,
+
+    // int variants.
+    VPXORrr,
+    VPANDNrr,
+    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+    VPSUBSBrr, VPSUBSWrr,
+    VPSUBUSBrr, VPSUBUSWrr,
+    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+  ], ZeroIdiomPredicate>,
+
+  // AVX YMM Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    VXORPSYrr, VXORPDYrr,
+    VANDNPSYrr, VANDNPDYrr,
+
+    // int variants.
+    VPXORYrr,
+    VPANDNYrr,
+    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+    VPSUBSBYrr, VPSUBSWYrr,
+    VPSUBUSBYrr, VPSUBUSWYrr,
+    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+  ], ZeroIdiomPredicate>,
+]>;
+
+def : IsDepBreakingFunction<[
+  // GPR
+  DepBreakingClass<[ SBB32rr, SBB32rr_REV,
+                     SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
+  DepBreakingClass<[ CMP8rr, CMP8rr_REV,
+                     CMP16rr, CMP16rr_REV,
+                     CMP32rr, CMP32rr_REV,
+                     CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
+  // SSE
+  DepBreakingClass<[
+    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX XMM
+  DepBreakingClass<[
+    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX YMM
+  DepBreakingClass<[
+    VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
+  ], ZeroIdiomPredicate>,
+]>;
+
+} // SchedModel
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index af98ecb122d6..22b985df9302 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -523,7 +523,7 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
   default:
     if (TT.isOSNetBSD())
       return "apcs-gnu";
-    if (TT.isOSOpenBSD())
+    if (TT.isOSFreeBSD() || TT.isOSOpenBSD())
       return "aapcs-linux";
     return "aapcs";
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1480a0ff9e2f..de3095852048 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -3184,16 +3184,6 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
     }
     break;
   }
-  case Instruction::And: {
-    const APInt *BOC;
-    if (match(BOp1, m_APInt(BOC))) {
-      // If we have ((X & C) == C), turn it into ((X & C) != 0).
-      if (C == *BOC && C.isPowerOf2())
-        return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
-                            BO, Constant::getNullValue(RHS->getType()));
-    }
-    break;
-  }
   case Instruction::UDiv:
     if (C.isZero()) {
       // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
@@ -5653,6 +5643,12 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
        }
      }
    }
+
+    // Op0 eq C_Pow2 -> Op0 ne 0 if Op0 is known to be C_Pow2 or zero.
+    if (Op1Known.isConstant() && Op1Known.getConstant().isPowerOf2() &&
+        (Op0Known & Op1Known) == Op0Known)
+      return new ICmpInst(CmpInst::getInversePredicate(Pred), Op0,
+                          ConstantInt::getNullValue(Op1->getType()));
     break;
   }
   case ICmpInst::ICMP_ULT: {
diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td
index de7f883d24a8..c6627c75157b 100644
--- a/llvm/tools/llvm-objdump/ObjdumpOpts.td
+++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td
@@ -145,10 +145,10 @@ def reloc : Flag<["--"], "reloc">,
 def : Flag<["-"], "r">, Alias<reloc>, HelpText<"Alias for --reloc">;
 def print_imm_hex : Flag<["--"], "print-imm-hex">,
-  HelpText<"Use hex format for immediate values">;
+  HelpText<"Use hex format for immediate values (default)">;
 def no_print_imm_hex : Flag<["--"], "no-print-imm-hex">,
-  HelpText<"Do not use hex format for immediate values (default)">;
+  HelpText<"Do not use hex format for immediate values">;
 def : Flag<["--"], "print-imm-hex=false">, Alias<no_print_imm_hex>;
 def private_headers : Flag<["--"], "private-headers">,