author     Dimitry Andric <dim@FreeBSD.org>  2023-03-31 20:55:52 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2023-03-31 20:55:52 +0000
commit     5bcd187b307a70f29854eb0c5ccdf30ff3770fe1 (patch)
tree       005c0e9231b62275dc3a5d207b2550431858ce0a
parent     11ee15ea4ee1ea5555f8d7ba1ec5ffe956df2a8c (diff)
Vendor import of llvm-project branch release/16.x llvmorg-16.0.0-45-g42d1b276f779. (vendor/llvm-project/llvmorg-16.0.0-45-g42d1b276f779)
-rw-r--r--  clang/include/clang/Basic/DiagnosticParseKinds.td           |    2
-rw-r--r--  clang/include/clang/Basic/DiagnosticSemaKinds.td            |    5
-rw-r--r--  clang/lib/Basic/Targets/ARM.cpp                             |    3
-rw-r--r--  clang/lib/Driver/ToolChains/FreeBSD.cpp                     |   26
-rw-r--r--  clang/lib/Driver/ToolChains/FreeBSD.h                       |    2
-rw-r--r--  clang/lib/Sema/SemaCoroutine.cpp                            |    4
-rw-r--r--  clang/lib/StaticAnalyzer/Core/RegionStore.cpp               |    8
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp |   1
-rw-r--r--  compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp |  2
-rw-r--r--  libcxx/include/__config                                     |    2
-rw-r--r--  libcxx/include/__expected/expected.h                        |    7
-rw-r--r--  libcxx/include/__memory/uninitialized_algorithms.h          |    8
-rw-r--r--  libcxx/include/__memory_resource/polymorphic_allocator.h    |    2
-rw-r--r--  libcxx/include/__utility/exception_guard.h                  |   47
-rw-r--r--  libunwind/include/libunwind.modulemap                       |    3
-rw-r--r--  libunwind/include/unwind.h                                  |    4
-rw-r--r--  libunwind/src/DwarfInstructions.hpp                         |    3
-rw-r--r--  libunwind/src/UnwindRegistersRestore.S                      |   18
-rw-r--r--  libunwind/src/UnwindRegistersSave.S                         |   11
-rw-r--r--  lld/COFF/MinGW.cpp                                          |    3
-rw-r--r--  lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp        |  143
-rw-r--r--  llvm/include/llvm/ADT/AddressRanges.h                       |  206
-rw-r--r--  llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h      |    2
-rw-r--r--  llvm/lib/DWARFLinker/DWARFLinker.cpp                        |   36
-rw-r--r--  llvm/lib/DWARFLinker/DWARFStreamer.cpp                      |    5
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp            |    1
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                                 |   35
-rw-r--r--  llvm/lib/Passes/PassBuilderPipelines.cpp                    |    6
-rw-r--r--  llvm/lib/Support/AddressRanges.cpp                          |   70
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.cpp                            |   46
-rw-r--r--  llvm/lib/Target/BPF/BTFDebug.h                              |    3
-rw-r--r--  llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp     |   10
-rw-r--r--  llvm/lib/Target/X86/X86.td                                  |    3
-rw-r--r--  llvm/lib/Target/X86/X86InstrCompiler.td                     |    6
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td                          |   17
-rw-r--r--  llvm/lib/Target/X86/X86PfmCounters.td                       |   15
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver4.td                    | 1957
-rw-r--r--  llvm/lib/TargetParser/ARMTargetParser.cpp                   |    2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp     |   16
-rw-r--r--  llvm/tools/llvm-objdump/ObjdumpOpts.td                      |    4
40 files changed, 2428 insertions, 316 deletions
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 36d4bc2a700d..e99beb3a7636 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1052,7 +1052,7 @@ def err_lambda_template_parameter_list_empty : Error<
// C++2b static lambdas
def err_static_lambda: ExtWarn<
"static lambdas are a C++2b extension">, InGroup<CXX2b>;
-def warn_cxx20_compat_static_lambda: ExtWarn<
+def warn_cxx20_compat_static_lambda : Warning<
"static lambdas are incompatible with C++ standards before C++2b">,
InGroup<CXXPre2bCompat>, DefaultIgnore;
def err_static_mutable_lambda : Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index de56e3e1566b..bfe582d8252f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9138,8 +9138,9 @@ def err_operator_overload_static : Error<
def err_operator_overload_default_arg : Error<
"parameter of overloaded %0 cannot have a default argument">;
-def ext_subscript_overload : ExtWarn<
- "overloaded %0 with %select{no|a defaulted|more than one}1 parameter is a C++2b extension">, InGroup<CXXPre2bCompat>, DefaultIgnore;
+def ext_subscript_overload : Warning<
+ "overloaded %0 with %select{no|a defaulted|more than one}1 parameter is a "
+ "C++2b extension">, InGroup<CXXPre2bCompat>, DefaultIgnore;
def error_subscript_overload : Error<
"overloaded %0 cannot have %select{no|a defaulted|more than one}1 parameter before C++2b">;
diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp
index f11751a76073..b85d5dc2d347 100644
--- a/clang/lib/Basic/Targets/ARM.cpp
+++ b/clang/lib/Basic/Targets/ARM.cpp
@@ -254,6 +254,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple,
const TargetOptions &Opts)
: TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0),
HW_FP(0) {
+ bool IsFreeBSD = Triple.isOSFreeBSD();
bool IsOpenBSD = Triple.isOSOpenBSD();
bool IsNetBSD = Triple.isOSNetBSD();
@@ -321,7 +322,7 @@ ARMTargetInfo::ARMTargetInfo(const llvm::Triple &Triple,
default:
if (IsNetBSD)
setABI("apcs-gnu");
- else if (IsOpenBSD)
+ else if (IsFreeBSD || IsOpenBSD)
setABI("aapcs-linux");
else
setABI("aapcs");
diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index 9a5af638c399..2230295ccd74 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -85,16 +85,7 @@ void freebsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
else
CmdArgs.push_back("-mfpu=softvfp");
- switch (getToolChain().getTriple().getEnvironment()) {
- case llvm::Triple::GNUEABIHF:
- case llvm::Triple::GNUEABI:
- case llvm::Triple::EABI:
- CmdArgs.push_back("-meabi=5");
- break;
-
- default:
- CmdArgs.push_back("-matpcs");
- }
+ CmdArgs.push_back("-meabi=5");
break;
}
case llvm::Triple::sparc:
@@ -467,21 +458,6 @@ Tool *FreeBSD::buildAssembler() const {
Tool *FreeBSD::buildLinker() const { return new tools::freebsd::Linker(*this); }
-llvm::ExceptionHandling FreeBSD::GetExceptionModel(const ArgList &Args) const {
- // FreeBSD uses SjLj exceptions on ARM oabi.
- switch (getTriple().getEnvironment()) {
- case llvm::Triple::GNUEABIHF:
- case llvm::Triple::GNUEABI:
- case llvm::Triple::EABI:
- return llvm::ExceptionHandling::None;
- default:
- if (getTriple().getArch() == llvm::Triple::arm ||
- getTriple().getArch() == llvm::Triple::thumb)
- return llvm::ExceptionHandling::SjLj;
- return llvm::ExceptionHandling::None;
- }
-}
-
bool FreeBSD::HasNativeLLVMSupport() const { return true; }
ToolChain::UnwindTableLevel
diff --git a/clang/lib/Driver/ToolChains/FreeBSD.h b/clang/lib/Driver/ToolChains/FreeBSD.h
index 9b24ef1a1e1c..cec67d84a2ce 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.h
+++ b/clang/lib/Driver/ToolChains/FreeBSD.h
@@ -78,8 +78,6 @@ public:
void AddHIPIncludeArgs(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args) const override;
- llvm::ExceptionHandling
- GetExceptionModel(const llvm::opt::ArgList &Args) const override;
UnwindTableLevel
getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override;
bool isPIEDefault(const llvm::opt::ArgList &Args) const override;
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 79c08adb8fab..9678e30699c8 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -1562,7 +1562,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() {
const auto *OpDeleteType =
OpDeleteQualType.getTypePtr()->castAs<FunctionProtoType>();
if (OpDeleteType->getNumParams() > DeleteArgs.size() &&
- S.getASTContext().hasSameType(
+ S.getASTContext().hasSameUnqualifiedType(
OpDeleteType->getParamType(DeleteArgs.size()), FrameSize->getType()))
DeleteArgs.push_back(FrameSize);
@@ -1579,7 +1579,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() {
// So we are not forced to pass alignment to the deallocation function.
if (S.getLangOpts().CoroAlignedAllocation &&
OpDeleteType->getNumParams() > DeleteArgs.size() &&
- S.getASTContext().hasSameType(
+ S.getASTContext().hasSameUnqualifiedType(
OpDeleteType->getParamType(DeleteArgs.size()),
FrameAlignment->getType()))
DeleteArgs.push_back(FrameAlignment);
diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
index 46948c12617c..49855305cecc 100644
--- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -1849,8 +1849,12 @@ std::optional<SVal> RegionStoreManager::getSValFromInitListExpr(
// Go to the nested initializer list.
ILE = IL;
}
- llvm_unreachable(
- "Unhandled InitListExpr sub-expressions or invalid offsets.");
+
+ assert(ILE);
+
+ // FIXME: Unhandled InitListExpr sub-expression, possibly constructing an
+ // enum?
+ return std::nullopt;
}
/// Returns an SVal, if possible, for the specified position in a string
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index ec860fdc4ff9..37b2b57c0c84 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -583,6 +583,7 @@ static void GetTls(uptr *addr, uptr *size) {
*addr = (uptr)tcb->tcb_dtv[1];
}
}
+#else
#error "Unknown OS"
#endif
}
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp
index c647ab107ec5..ac2afe42e269 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_win.cpp
@@ -231,8 +231,6 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
// Check that tool command lines are simple and that complete escaping is
// unnecessary.
CHECK(!internal_strchr(arg, '"') && "quotes in args unsupported");
- CHECK(!internal_strstr(arg, "\\\\") &&
- "double backslashes in args unsupported");
CHECK(arglen > 0 && arg[arglen - 1] != '\\' &&
"args ending in backslash and empty args unsupported");
command_line.append("\"%s\" ", arg);
diff --git a/libcxx/include/__config b/libcxx/include/__config
index ac6a1422bfe3..2f11f3b7d495 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -37,7 +37,7 @@
// _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
// Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 16.0.1 == 16.00.01), _LIBCPP_VERSION is
// defined to XXYYZZ.
-# define _LIBCPP_VERSION 160000
+# define _LIBCPP_VERSION 160001
# define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
# define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index e1f590c65efe..ca3e8a59922d 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -292,7 +292,8 @@ private:
"be reverted to the previous state in case an exception is thrown during the assignment.");
_T2 __tmp(std::move(__oldval));
std::destroy_at(std::addressof(__oldval));
- __exception_guard __trans([&] { std::construct_at(std::addressof(__oldval), std::move(__tmp)); });
+ auto __trans =
+ std::__make_exception_guard([&] { std::construct_at(std::addressof(__oldval), std::move(__tmp)); });
std::construct_at(std::addressof(__newval), std::forward<_Args>(__args)...);
__trans.__complete();
}
@@ -451,7 +452,7 @@ public:
if constexpr (is_nothrow_move_constructible_v<_Err>) {
_Err __tmp(std::move(__with_err.__union_.__unex_));
std::destroy_at(std::addressof(__with_err.__union_.__unex_));
- __exception_guard __trans([&] {
+ auto __trans = std::__make_exception_guard([&] {
std::construct_at(std::addressof(__with_err.__union_.__unex_), std::move(__tmp));
});
std::construct_at(std::addressof(__with_err.__union_.__val_), std::move(__with_val.__union_.__val_));
@@ -464,7 +465,7 @@ public:
"that it can be reverted to the previous state in case an exception is thrown during swap.");
_Tp __tmp(std::move(__with_val.__union_.__val_));
std::destroy_at(std::addressof(__with_val.__union_.__val_));
- __exception_guard __trans([&] {
+ auto __trans = std::__make_exception_guard([&] {
std::construct_at(std::addressof(__with_val.__union_.__val_), std::move(__tmp));
});
std::construct_at(std::addressof(__with_val.__union_.__unex_), std::move(__with_err.__union_.__unex_));
diff --git a/libcxx/include/__memory/uninitialized_algorithms.h b/libcxx/include/__memory/uninitialized_algorithms.h
index 0067780c3f5d..90aecb7d6ad2 100644
--- a/libcxx/include/__memory/uninitialized_algorithms.h
+++ b/libcxx/include/__memory/uninitialized_algorithms.h
@@ -421,7 +421,7 @@ constexpr void __allocator_construct_at_multidimensional(_Alloc& __alloc, _Tp* _
_Tp& __array = *__loc;
// If an exception is thrown, destroy what we have constructed so far in reverse order.
- __exception_guard __guard([&]() {
+ auto __guard = std::__make_exception_guard([&]() {
std::__allocator_destroy_multidimensional(__elem_alloc, __array, __array + __i);
});
@@ -461,7 +461,7 @@ constexpr void __allocator_construct_at_multidimensional(_Alloc& __alloc, _Tp* _
_Tp& __array = *__loc;
// If an exception is thrown, destroy what we have constructed so far in reverse order.
- __exception_guard __guard([&]() {
+ auto __guard = std::__make_exception_guard([&]() {
std::__allocator_destroy_multidimensional(__elem_alloc, __array, __array + __i);
});
for (; __i != extent_v<_Tp>; ++__i) {
@@ -488,7 +488,7 @@ __uninitialized_allocator_fill_n_multidimensional(_Alloc& __alloc, _BidirIter __
_BidirIter __begin = __it;
// If an exception is thrown, destroy what we have constructed so far in reverse order.
- __exception_guard __guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); });
+ auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); });
for (; __n != 0; --__n, ++__it) {
std::__allocator_construct_at_multidimensional(__value_alloc, std::addressof(*__it), __value);
}
@@ -505,7 +505,7 @@ __uninitialized_allocator_value_construct_n_multidimensional(_Alloc& __alloc, _B
_BidirIter __begin = __it;
// If an exception is thrown, destroy what we have constructed so far in reverse order.
- __exception_guard __guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); });
+ auto __guard = std::__make_exception_guard([&]() { std::__allocator_destroy_multidimensional(__value_alloc, __begin, __it); });
for (; __n != 0; --__n, ++__it) {
std::__allocator_construct_at_multidimensional(__value_alloc, std::addressof(*__it));
}
diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h
index 2489502bcdaf..f7b9a0b408c1 100644
--- a/libcxx/include/__memory_resource/polymorphic_allocator.h
+++ b/libcxx/include/__memory_resource/polymorphic_allocator.h
@@ -98,7 +98,7 @@ public:
template <class _Type, class... _CtorArgs>
[[nodiscard]] _Type* new_object(_CtorArgs&&... __ctor_args) {
_Type* __ptr = allocate_object<_Type>();
- __exception_guard __guard([&] { deallocate_object(__ptr); });
+ auto __guard = std::__make_exception_guard([&] { deallocate_object(__ptr); });
construct(__ptr, std::forward<_CtorArgs>(__ctor_args)...);
__guard.__complete();
return __ptr;
diff --git a/libcxx/include/__utility/exception_guard.h b/libcxx/include/__utility/exception_guard.h
index 737d1a69c971..46f9359a5c0e 100644
--- a/libcxx/include/__utility/exception_guard.h
+++ b/libcxx/include/__utility/exception_guard.h
@@ -60,25 +60,26 @@ _LIBCPP_BEGIN_NAMESPACE_STD
#ifndef _LIBCPP_NO_EXCEPTIONS
template <class _Rollback>
-struct __exception_guard {
- __exception_guard() = delete;
+struct __exception_guard_exceptions {
+ __exception_guard_exceptions() = delete;
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __exception_guard(_Rollback __rollback)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit __exception_guard_exceptions(_Rollback __rollback)
: __rollback_(std::move(__rollback)), __completed_(false) {}
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __exception_guard(__exception_guard&& __other)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+ __exception_guard_exceptions(__exception_guard_exceptions&& __other)
_NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value)
: __rollback_(std::move(__other.__rollback_)), __completed_(__other.__completed_) {
__other.__completed_ = true;
}
- __exception_guard(__exception_guard const&) = delete;
- __exception_guard& operator=(__exception_guard const&) = delete;
- __exception_guard& operator=(__exception_guard&&) = delete;
+ __exception_guard_exceptions(__exception_guard_exceptions const&) = delete;
+ __exception_guard_exceptions& operator=(__exception_guard_exceptions const&) = delete;
+ __exception_guard_exceptions& operator=(__exception_guard_exceptions&&) = delete;
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __complete() _NOEXCEPT { __completed_ = true; }
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__exception_guard() {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 ~__exception_guard_exceptions() {
if (!__completed_)
__rollback_();
}
@@ -87,36 +88,46 @@ private:
_Rollback __rollback_;
bool __completed_;
};
+
+_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_exceptions);
+
+template <class _Rollback>
+using __exception_guard = __exception_guard_exceptions<_Rollback>;
#else // _LIBCPP_NO_EXCEPTIONS
template <class _Rollback>
-struct __exception_guard {
- __exception_guard() = delete;
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG explicit __exception_guard(_Rollback) {}
+struct __exception_guard_noexceptions {
+ __exception_guard_noexceptions() = delete;
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
+ _LIBCPP_NODEBUG explicit __exception_guard_noexceptions(_Rollback) {}
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG __exception_guard(__exception_guard&& __other)
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG
+ __exception_guard_noexceptions(__exception_guard_noexceptions&& __other)
_NOEXCEPT_(is_nothrow_move_constructible<_Rollback>::value)
: __completed_(__other.__completed_) {
__other.__completed_ = true;
}
- __exception_guard(__exception_guard const&) = delete;
- __exception_guard& operator=(__exception_guard const&) = delete;
- __exception_guard& operator=(__exception_guard&&) = delete;
+ __exception_guard_noexceptions(__exception_guard_noexceptions const&) = delete;
+ __exception_guard_noexceptions& operator=(__exception_guard_noexceptions const&) = delete;
+ __exception_guard_noexceptions& operator=(__exception_guard_noexceptions&&) = delete;
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG void __complete() _NOEXCEPT {
__completed_ = true;
}
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG ~__exception_guard() {
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NODEBUG ~__exception_guard_noexceptions() {
_LIBCPP_ASSERT(__completed_, "__exception_guard not completed with exceptions disabled");
}
private:
bool __completed_ = false;
};
-#endif // _LIBCPP_NO_EXCEPTIONS
-_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard);
+_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(__exception_guard_noexceptions);
+
+template <class _Rollback>
+using __exception_guard = __exception_guard_noexceptions<_Rollback>;
+#endif // _LIBCPP_NO_EXCEPTIONS
template <class _Rollback>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __exception_guard<_Rollback> __make_exception_guard(_Rollback __rollback) {
diff --git a/libunwind/include/libunwind.modulemap b/libunwind/include/libunwind.modulemap
index 162fe1d279a3..775841ecb5f1 100644
--- a/libunwind/include/libunwind.modulemap
+++ b/libunwind/include/libunwind.modulemap
@@ -6,5 +6,8 @@ module libunwind [system] {
module unwind [system] {
header "__libunwind_config.h"
header "unwind.h"
+ private textual header "unwind_arm_ehabi.h"
+ private textual header "unwind_itanium.h"
+
export *
}
diff --git a/libunwind/include/unwind.h b/libunwind/include/unwind.h
index 26cdef22207e..b1775d3a3dec 100644
--- a/libunwind/include/unwind.h
+++ b/libunwind/include/unwind.h
@@ -56,9 +56,9 @@ typedef enum {
typedef struct _Unwind_Context _Unwind_Context; // opaque
#if defined(_LIBUNWIND_ARM_EHABI)
-#include "unwind_arm_ehabi.h"
+#include <unwind_arm_ehabi.h>
#else
-#include "unwind_itanium.h"
+#include <unwind_itanium.h>
#endif
typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)
diff --git a/libunwind/src/DwarfInstructions.hpp b/libunwind/src/DwarfInstructions.hpp
index 27432be56133..9962c2ffa0ca 100644
--- a/libunwind/src/DwarfInstructions.hpp
+++ b/libunwind/src/DwarfInstructions.hpp
@@ -224,7 +224,8 @@ int DwarfInstructions<A, R>::stepWithDwarf(A &addressSpace, pint_t pc,
p &= ~0xfULL;
// CFA is the bottom of the current stack frame.
for (; p < cfa; p += 16) {
- __asm__ __volatile__(".arch_extension memtag\n"
+ __asm__ __volatile__(".arch armv8.5-a\n"
+ ".arch_extension memtag\n"
"stg %[Ptr], [%[Ptr]]\n"
:
: [Ptr] "r"(p)
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 2a472be943f3..543b19f7e72a 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -194,9 +194,20 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv)
addi 4, 3, PPC64_OFFS_FP
// load VS register
+#ifdef __LITTLE_ENDIAN__
+// For little-endian targets, we need a swap since lxvd2x will load the register
+// in the incorrect doubleword order.
+// FIXME: when supporting targets older than Power9 on LE is no longer required,
+// this can be changed to simply `lxv n, (16 * n)(4)`.
#define PPC64_LVS(n) \
lxvd2x n, 0, 4 ;\
+ xxswapd n, n ;\
addi 4, 4, 16
+#else
+#define PPC64_LVS(n) \
+ lxvd2x n, 0, 4 ;\
+ addi 4, 4, 16
+#endif
// restore the first 32 VS regs (and also all floating point regs)
PPC64_LVS(0)
@@ -232,9 +243,16 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind15Registers_ppc646jumptoEv)
PPC64_LVS(30)
PPC64_LVS(31)
+#ifdef __LITTLE_ENDIAN__
+#define PPC64_CLVS_RESTORE(n) \
+ addi 4, 3, PPC64_OFFS_FP + n * 16 ;\
+ lxvd2x n, 0, 4 ;\
+ xxswapd n, n
+#else
#define PPC64_CLVS_RESTORE(n) \
addi 4, 3, PPC64_OFFS_FP + n * 16 ;\
lxvd2x n, 0, 4
+#endif
#if !defined(_AIX)
// use VRSAVE to conditionally restore the remaining VS regs, that are
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index 6c26b79877f6..79f5696a9888 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -351,9 +351,20 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
addi 4, 3, PPC64_OFFS_FP
// store VS register
+#ifdef __LITTLE_ENDIAN__
+// For little-endian targets, we need a swap since stxvd2x will store the
+// register in the incorrect doubleword order.
+// FIXME: when supporting targets older than Power9 on LE is no longer required
+// this can be changed to simply `stxv n, 16 * n(4)`.
#define PPC64_STVS(n) \
+ xxswapd n, n ;\
stxvd2x n, 0, 4 ;\
addi 4, 4, 16
+#else
+#define PPC64_STVS(n) \
+ stxvd2x n, 0, 4 ;\
+ addi 4, 4, 16
+#endif
PPC64_STVS(0)
PPC64_STVS(1)
diff --git a/lld/COFF/MinGW.cpp b/lld/COFF/MinGW.cpp
index 01372cbdf29a..71aa596ea6ab 100644
--- a/lld/COFF/MinGW.cpp
+++ b/lld/COFF/MinGW.cpp
@@ -49,6 +49,9 @@ AutoExporter::AutoExporter(
"libclang_rt.profile-x86_64",
"libc++",
"libc++abi",
+ "libFortran_main",
+ "libFortranRuntime",
+ "libFortranDecimal",
"libunwind",
"libmsvcrt",
"libucrtbase",
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index 9131367bf223..5b75738e070c 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -114,13 +114,13 @@ public:
static unsigned RelocSymbol64(const ELFRelocation &rel);
- static unsigned RelocOffset32(const ELFRelocation &rel);
+ static elf_addr RelocOffset32(const ELFRelocation &rel);
- static unsigned RelocOffset64(const ELFRelocation &rel);
+ static elf_addr RelocOffset64(const ELFRelocation &rel);
- static unsigned RelocAddend32(const ELFRelocation &rel);
+ static elf_sxword RelocAddend32(const ELFRelocation &rel);
- static unsigned RelocAddend64(const ELFRelocation &rel);
+ static elf_sxword RelocAddend64(const ELFRelocation &rel);
bool IsRela() { return (reloc.is<ELFRela *>()); }
@@ -185,28 +185,28 @@ unsigned ELFRelocation::RelocSymbol64(const ELFRelocation &rel) {
return ELFRela::RelocSymbol64(*rel.reloc.get<ELFRela *>());
}
-unsigned ELFRelocation::RelocOffset32(const ELFRelocation &rel) {
+elf_addr ELFRelocation::RelocOffset32(const ELFRelocation &rel) {
if (rel.reloc.is<ELFRel *>())
return rel.reloc.get<ELFRel *>()->r_offset;
else
return rel.reloc.get<ELFRela *>()->r_offset;
}
-unsigned ELFRelocation::RelocOffset64(const ELFRelocation &rel) {
+elf_addr ELFRelocation::RelocOffset64(const ELFRelocation &rel) {
if (rel.reloc.is<ELFRel *>())
return rel.reloc.get<ELFRel *>()->r_offset;
else
return rel.reloc.get<ELFRela *>()->r_offset;
}
-unsigned ELFRelocation::RelocAddend32(const ELFRelocation &rel) {
+elf_sxword ELFRelocation::RelocAddend32(const ELFRelocation &rel) {
if (rel.reloc.is<ELFRel *>())
return 0;
else
return rel.reloc.get<ELFRela *>()->r_addend;
}
-unsigned ELFRelocation::RelocAddend64(const ELFRelocation &rel) {
+elf_sxword ELFRelocation::RelocAddend64(const ELFRelocation &rel) {
if (rel.reloc.is<ELFRel *>())
return 0;
else
@@ -2593,6 +2593,50 @@ ObjectFileELF::ParseTrampolineSymbols(Symtab *symbol_table, user_id_t start_id,
rel_data, symtab_data, strtab_data);
}
+static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel,
+ DataExtractor &debug_data,
+ Section *rel_section) {
+ Symbol *symbol = symtab->FindSymbolByID(ELFRelocation::RelocSymbol64(rel));
+ if (symbol) {
+ addr_t value = symbol->GetAddressRef().GetFileAddress();
+ DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer();
+ // ObjectFileELF creates a WritableDataBuffer in CreateInstance.
+ WritableDataBuffer *data_buffer =
+ llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
+ uint64_t *dst = reinterpret_cast<uint64_t *>(
+ data_buffer->GetBytes() + rel_section->GetFileOffset() +
+ ELFRelocation::RelocOffset64(rel));
+ uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel);
+ memcpy(dst, &val_offset, sizeof(uint64_t));
+ }
+}
+
+static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel,
+ DataExtractor &debug_data,
+ Section *rel_section, bool is_signed) {
+ Symbol *symbol = symtab->FindSymbolByID(ELFRelocation::RelocSymbol64(rel));
+ if (symbol) {
+ addr_t value = symbol->GetAddressRef().GetFileAddress();
+ value += ELFRelocation::RelocAddend32(rel);
+ if ((!is_signed && (value > UINT32_MAX)) ||
+ (is_signed &&
+ ((int64_t)value > INT32_MAX || (int64_t)value < INT32_MIN))) {
+ Log *log = GetLog(LLDBLog::Modules);
+ LLDB_LOGF(log, "Failed to apply debug info relocations");
+ return;
+ }
+ uint32_t truncated_addr = (value & 0xFFFFFFFF);
+ DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer();
+ // ObjectFileELF creates a WritableDataBuffer in CreateInstance.
+ WritableDataBuffer *data_buffer =
+ llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
+ uint32_t *dst = reinterpret_cast<uint32_t *>(
+ data_buffer->GetBytes() + rel_section->GetFileOffset() +
+ ELFRelocation::RelocOffset32(rel));
+ memcpy(dst, &truncated_addr, sizeof(uint32_t));
+ }
+}
+
unsigned ObjectFileELF::ApplyRelocations(
Symtab *symtab, const ELFHeader *hdr, const ELFSectionHeader *rel_hdr,
const ELFSectionHeader *symtab_hdr, const ELFSectionHeader *debug_hdr,
@@ -2656,55 +2700,50 @@ unsigned ObjectFileELF::ApplyRelocations(
reloc_type(rel));
}
} else {
- switch (reloc_type(rel)) {
- case R_AARCH64_ABS64:
- case R_X86_64_64: {
- symbol = symtab->FindSymbolByID(reloc_symbol(rel));
- if (symbol) {
- addr_t value = symbol->GetAddressRef().GetFileAddress();
- DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer();
- // ObjectFileELF creates a WritableDataBuffer in CreateInstance.
- WritableDataBuffer *data_buffer =
- llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
- uint64_t *dst = reinterpret_cast<uint64_t *>(
- data_buffer->GetBytes() + rel_section->GetFileOffset() +
- ELFRelocation::RelocOffset64(rel));
- uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel);
- memcpy(dst, &val_offset, sizeof(uint64_t));
+ switch (hdr->e_machine) {
+ case llvm::ELF::EM_AARCH64:
+ switch (reloc_type(rel)) {
+ case R_AARCH64_ABS64:
+ ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section);
+ break;
+ case R_AARCH64_ABS32:
+ ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true);
+ break;
+ default:
+ assert(false && "unexpected relocation type");
}
break;
- }
- case R_X86_64_32:
- case R_X86_64_32S:
- case R_AARCH64_ABS32: {
- symbol = symtab->FindSymbolByID(reloc_symbol(rel));
- if (symbol) {
- addr_t value = symbol->GetAddressRef().GetFileAddress();
- value += ELFRelocation::RelocAddend32(rel);
- if ((reloc_type(rel) == R_X86_64_32 && (value > UINT32_MAX)) ||
- (reloc_type(rel) == R_X86_64_32S &&
- ((int64_t)value > INT32_MAX && (int64_t)value < INT32_MIN)) ||
- (reloc_type(rel) == R_AARCH64_ABS32 &&
- ((int64_t)value > INT32_MAX && (int64_t)value < INT32_MIN))) {
- Log *log = GetLog(LLDBLog::Modules);
- LLDB_LOGF(log, "Failed to apply debug info relocations");
- break;
- }
- uint32_t truncated_addr = (value & 0xFFFFFFFF);
- DataBufferSP &data_buffer_sp = debug_data.GetSharedDataBuffer();
- // ObjectFileELF creates a WritableDataBuffer in CreateInstance.
- WritableDataBuffer *data_buffer =
- llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
- uint32_t *dst = reinterpret_cast<uint32_t *>(
- data_buffer->GetBytes() + rel_section->GetFileOffset() +
- ELFRelocation::RelocOffset32(rel));
- memcpy(dst, &truncated_addr, sizeof(uint32_t));
+ case llvm::ELF::EM_LOONGARCH:
+ switch (reloc_type(rel)) {
+ case R_LARCH_64:
+ ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section);
+ break;
+ case R_LARCH_32:
+ ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true);
+ break;
+ default:
+ assert(false && "unexpected relocation type");
+ }
+ break;
+ case llvm::ELF::EM_X86_64:
+ switch (reloc_type(rel)) {
+ case R_X86_64_64:
+ ApplyELF64ABS64Relocation(symtab, rel, debug_data, rel_section);
+ break;
+ case R_X86_64_32:
+ ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section,
+ false);
+ break;
+ case R_X86_64_32S:
+ ApplyELF64ABS32Relocation(symtab, rel, debug_data, rel_section, true);
+ break;
+ case R_X86_64_PC32:
+ default:
+ assert(false && "unexpected relocation type");
}
break;
- }
- case R_X86_64_PC32:
default:
- assert(false && "unexpected relocation type");
+ assert(false && "unsupported machine");
}
}
}
diff --git a/llvm/include/llvm/ADT/AddressRanges.h b/llvm/include/llvm/ADT/AddressRanges.h
index f2052d82e7c1..415d30bbb5cf 100644
--- a/llvm/include/llvm/ADT/AddressRanges.h
+++ b/llvm/include/llvm/ADT/AddressRanges.h
@@ -28,7 +28,11 @@ public:
uint64_t start() const { return Start; }
uint64_t end() const { return End; }
uint64_t size() const { return End - Start; }
+ uint64_t empty() const { return size() == 0; }
bool contains(uint64_t Addr) const { return Start <= Addr && Addr < End; }
+ bool contains(const AddressRange &R) const {
+ return Start <= R.Start && R.End <= End;
+ }
bool intersects(const AddressRange &R) const {
return Start < R.End && R.Start < End;
}
@@ -45,101 +49,163 @@ private:
uint64_t End = 0;
};
-/// The AddressRanges class helps normalize address range collections.
-/// This class keeps a sorted vector of AddressRange objects and can perform
-/// insertions and searches efficiently. The address ranges are always sorted
-/// and never contain any invalid or empty address ranges.
-/// Intersecting([100,200), [150,300)) and adjacent([100,200), [200,300))
-/// address ranges are combined during insertion.
-class AddressRanges {
+/// The AddressRangesBase class presents the base functionality for the
+/// normalized address ranges collection. This class keeps a sorted vector
+/// of AddressRange-like objects and can perform searches efficiently.
+/// The address ranges are always sorted and never contain any invalid,
+/// empty or intersected address ranges.
+
+template <typename T> class AddressRangesBase {
protected:
- using Collection = SmallVector<AddressRange>;
+ using Collection = SmallVector<T>;
Collection Ranges;
public:
void clear() { Ranges.clear(); }
bool empty() const { return Ranges.empty(); }
- bool contains(uint64_t Addr) const { return find(Addr) != Ranges.end(); }
+ bool contains(uint64_t Addr) const {
+ return find(Addr, Addr + 1) != Ranges.end();
+ }
bool contains(AddressRange Range) const {
- return find(Range) != Ranges.end();
+ return find(Range.start(), Range.end()) != Ranges.end();
}
- std::optional<AddressRange> getRangeThatContains(uint64_t Addr) const {
- Collection::const_iterator It = find(Addr);
+ void reserve(size_t Capacity) { Ranges.reserve(Capacity); }
+ size_t size() const { return Ranges.size(); }
+
+ std::optional<T> getRangeThatContains(uint64_t Addr) const {
+ typename Collection::const_iterator It = find(Addr, Addr + 1);
if (It == Ranges.end())
return std::nullopt;
return *It;
}
- Collection::const_iterator insert(AddressRange Range);
- void reserve(size_t Capacity) { Ranges.reserve(Capacity); }
- size_t size() const { return Ranges.size(); }
- bool operator==(const AddressRanges &RHS) const {
- return Ranges == RHS.Ranges;
- }
- const AddressRange &operator[](size_t i) const {
+
+ typename Collection::const_iterator begin() const { return Ranges.begin(); }
+ typename Collection::const_iterator end() const { return Ranges.end(); }
+
+ const T &operator[](size_t i) const {
assert(i < Ranges.size());
return Ranges[i];
}
- Collection::const_iterator begin() const { return Ranges.begin(); }
- Collection::const_iterator end() const { return Ranges.end(); }
+
+ bool operator==(const AddressRangesBase<T> &RHS) const {
+ return Ranges == RHS.Ranges;
+ }
protected:
- Collection::const_iterator find(uint64_t Addr) const;
- Collection::const_iterator find(AddressRange Range) const;
+ typename Collection::const_iterator find(uint64_t Start, uint64_t End) const {
+ if (Start >= End)
+ return Ranges.end();
+
+ auto It =
+ std::partition_point(Ranges.begin(), Ranges.end(), [=](const T &R) {
+ return AddressRange(R).start() <= Start;
+ });
+
+ if (It == Ranges.begin())
+ return Ranges.end();
+
+ --It;
+ if (End > AddressRange(*It).end())
+ return Ranges.end();
+
+ return It;
+ }
};
-/// AddressRangesMap class maps values to the address ranges.
-/// It keeps address ranges and corresponding values. If ranges
-/// are combined during insertion, then combined range keeps
-/// newly inserted value.
-template <typename T> class AddressRangesMap : protected AddressRanges {
+/// The AddressRanges class helps normalize address range collections.
+/// This class keeps a sorted vector of AddressRange objects and can perform
+/// insertions and searches efficiently. Intersecting([100,200), [150,300))
+/// and adjacent([100,200), [200,300)) address ranges are combined during
+/// insertion.
+class AddressRanges : public AddressRangesBase<AddressRange> {
public:
- void clear() {
- Ranges.clear();
- Values.clear();
+ Collection::const_iterator insert(AddressRange Range) {
+ if (Range.empty())
+ return Ranges.end();
+
+ auto It = llvm::upper_bound(Ranges, Range);
+ auto It2 = It;
+ while (It2 != Ranges.end() && It2->start() <= Range.end())
+ ++It2;
+ if (It != It2) {
+ Range = {Range.start(), std::max(Range.end(), std::prev(It2)->end())};
+ It = Ranges.erase(It, It2);
+ }
+ if (It != Ranges.begin() && Range.start() <= std::prev(It)->end()) {
+ --It;
+ *It = {It->start(), std::max(It->end(), Range.end())};
+ return It;
+ }
+
+ return Ranges.insert(It, Range);
}
- bool empty() const { return AddressRanges::empty(); }
- bool contains(uint64_t Addr) const { return AddressRanges::contains(Addr); }
- bool contains(AddressRange Range) const {
- return AddressRanges::contains(Range);
- }
- void insert(AddressRange Range, T Value) {
- size_t InputSize = Ranges.size();
- Collection::const_iterator RangesIt = AddressRanges::insert(Range);
- if (RangesIt == Ranges.end())
- return;
+};
- // make Values match to Ranges.
- size_t Idx = RangesIt - Ranges.begin();
- typename ValuesCollection::iterator ValuesIt = Values.begin() + Idx;
- if (InputSize < Ranges.size())
- Values.insert(ValuesIt, T());
- else if (InputSize > Ranges.size())
- Values.erase(ValuesIt, ValuesIt + InputSize - Ranges.size());
- assert(Ranges.size() == Values.size());
-
- // set value to the inserted or combined range.
- Values[Idx] = Value;
- }
- size_t size() const {
- assert(Ranges.size() == Values.size());
- return AddressRanges::size();
- }
- std::optional<std::pair<AddressRange, T>>
- getRangeValueThatContains(uint64_t Addr) const {
- Collection::const_iterator It = find(Addr);
- if (It == Ranges.end())
- return std::nullopt;
+class AddressRangeValuePair {
+public:
+ operator AddressRange() const { return Range; }
- return std::make_pair(*It, Values[It - Ranges.begin()]);
- }
- std::pair<AddressRange, T> operator[](size_t Idx) const {
- return std::make_pair(Ranges[Idx], Values[Idx]);
- }
+ AddressRange Range;
+ int64_t Value = 0;
+};
-protected:
- using ValuesCollection = SmallVector<T>;
- ValuesCollection Values;
+inline bool operator==(const AddressRangeValuePair &LHS,
+ const AddressRangeValuePair &RHS) {
+ return LHS.Range == RHS.Range && LHS.Value == RHS.Value;
+}
+
+/// AddressRangesMap class maps values to the address ranges.
+/// It keeps normalized address ranges and corresponding values.
+/// This class keeps a sorted vector of AddressRangeValuePair objects
+/// and can perform insertions and searches efficiently.
+/// Intersecting([100,200), [150,300)) ranges are split into non-conflicting
+/// parts([100,200), [200,300)). Adjacent([100,200), [200,300)) address
+/// ranges are not combined during insertion.
+class AddressRangesMap : public AddressRangesBase<AddressRangeValuePair> {
+public:
+ void insert(AddressRange Range, int64_t Value) {
+ if (Range.empty())
+ return;
+
+ // Search for range which is less than or equal incoming Range.
+ auto It = std::partition_point(Ranges.begin(), Ranges.end(),
+ [=](const AddressRangeValuePair &R) {
+ return R.Range.start() <= Range.start();
+ });
+
+ if (It != Ranges.begin())
+ It--;
+
+ while (!Range.empty()) {
+ // Inserted range does not overlap with any range.
+ // Store it into the Ranges collection.
+ if (It == Ranges.end() || Range.end() <= It->Range.start()) {
+ Ranges.insert(It, {Range, Value});
+ return;
+ }
+
+ // Inserted range partially overlaps with current range.
+ // Store not overlapped part of inserted range.
+ if (Range.start() < It->Range.start()) {
+ It = Ranges.insert(It, {{Range.start(), It->Range.start()}, Value});
+ It++;
+ Range = {It->Range.start(), Range.end()};
+ continue;
+ }
+
+ // Inserted range fully overlaps with current range.
+ if (Range.end() <= It->Range.end())
+ return;
+
+ // Inserted range partially overlaps with current range.
+ // Remove overlapped part from the inserted range.
+ if (Range.start() < It->Range.end())
+ Range = {It->Range.end(), Range.end()};
+
+ It++;
+ }
+ }
};
} // namespace llvm
diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h
index 5b0ea339c4d6..9c7f24e69d48 100644
--- a/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h
+++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerCompileUnit.h
@@ -21,7 +21,7 @@ class DeclContext;
/// Mapped value in the address map is the offset to apply to the
/// linked address.
-using RangesTy = AddressRangesMap<int64_t>;
+using RangesTy = AddressRangesMap;
// FIXME: Delete this structure.
struct PatchLocation {
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 9f6e54377ede..d302d61894fa 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -1659,7 +1659,7 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(),
OrigDwarf.getDWARFObj().getRangesSection(),
OrigDwarf.isLittleEndian(), AddressSize);
- std::optional<std::pair<AddressRange, int64_t>> CachedRange;
+ std::optional<AddressRangeValuePair> CachedRange;
DWARFUnit &OrigUnit = Unit.getOrigUnit();
auto OrigUnitDie = OrigUnit.getUnitDIE(false);
uint64_t UnitBaseAddress =
@@ -1687,9 +1687,9 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
}
if (!CachedRange ||
- !CachedRange->first.contains(Range.StartAddress + BaseAddress))
- CachedRange = FunctionRanges.getRangeValueThatContains(
- Range.StartAddress + BaseAddress);
+ !CachedRange->Range.contains(Range.StartAddress + BaseAddress))
+ CachedRange = FunctionRanges.getRangeThatContains(Range.StartAddress +
+ BaseAddress);
// All range entries should lie in the function range.
if (!CachedRange) {
@@ -1698,8 +1698,8 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
}
LinkedRanges.insert(
- {Range.StartAddress + BaseAddress + CachedRange->second,
- Range.EndAddress + BaseAddress + CachedRange->second});
+ {Range.StartAddress + BaseAddress + CachedRange->Value,
+ Range.EndAddress + BaseAddress + CachedRange->Value});
}
}
@@ -1802,7 +1802,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// in NewRows.
std::vector<DWARFDebugLine::Row> Seq;
const auto &FunctionRanges = Unit.getFunctionRanges();
- std::optional<std::pair<AddressRange, int64_t>> CurrRange;
+ std::optional<AddressRangeValuePair> CurrRange;
// FIXME: This logic is meant to generate exactly the same output as
// Darwin's classic dsymutil. There is a nicer way to implement this
@@ -1821,13 +1821,13 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// it is marked as end_sequence in the input (because in that
// case, the relocation offset is accurate and that entry won't
// serve as the start of another function).
- if (!CurrRange || !CurrRange->first.contains(Row.Address.Address) ||
- (Row.Address.Address == CurrRange->first.end() && !Row.EndSequence)) {
+ if (!CurrRange || !CurrRange->Range.contains(Row.Address.Address) ||
+ (Row.Address.Address == CurrRange->Range.end() && !Row.EndSequence)) {
// We just stepped out of a known range. Insert a end_sequence
// corresponding to the end of the range.
uint64_t StopAddress =
- CurrRange ? CurrRange->first.end() + CurrRange->second : -1ULL;
- CurrRange = FunctionRanges.getRangeValueThatContains(Row.Address.Address);
+ CurrRange ? CurrRange->Range.end() + CurrRange->Value : -1ULL;
+ CurrRange = FunctionRanges.getRangeThatContains(Row.Address.Address);
if (!CurrRange) {
if (StopAddress != -1ULL) {
// Try harder by looking in the Address ranges map.
@@ -1836,9 +1836,9 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// for now do as dsymutil.
// FIXME: Understand exactly what cases this addresses and
// potentially remove it along with the Ranges map.
- if (std::optional<std::pair<AddressRange, int64_t>> Range =
- Ranges.getRangeValueThatContains(Row.Address.Address))
- StopAddress = Row.Address.Address + (*Range).second;
+ if (std::optional<AddressRangeValuePair> Range =
+ Ranges.getRangeThatContains(Row.Address.Address))
+ StopAddress = Row.Address.Address + (*Range).Value;
}
}
if (StopAddress != -1ULL && !Seq.empty()) {
@@ -1863,7 +1863,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
continue;
// Relocate row address and add it to the current sequence.
- Row.Address.Address += CurrRange->second;
+ Row.Address.Address += CurrRange->Value;
Seq.emplace_back(Row);
if (Row.EndSequence)
@@ -2002,8 +2002,8 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File,
// the function entry point, thus we can't just lookup the address
// in the debug map. Use the AddressInfo's range map to see if the FDE
// describes something that we can relocate.
- std::optional<std::pair<AddressRange, int64_t>> Range =
- Ranges.getRangeValueThatContains(Loc);
+ std::optional<AddressRangeValuePair> Range =
+ Ranges.getRangeThatContains(Loc);
if (!Range) {
// The +4 is to account for the size of the InitialLength field itself.
InputOffset = EntryOffset + InitialLength + 4;
@@ -2032,7 +2032,7 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File,
// fields that will get reconstructed by emitFDE().
unsigned FDERemainingBytes = InitialLength - (4 + AddrSize);
TheDwarfEmitter->emitFDE(IteratorInserted.first->getValue(), AddrSize,
- Loc + Range->second,
+ Loc + Range->Value,
FrameData.substr(InputOffset, FDERemainingBytes));
InputOffset += FDERemainingBytes;
}
diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
index 5cad267fd845..ae79e8cb9066 100644
--- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
@@ -402,10 +402,9 @@ void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit,
// Linked addresses might end up in a different order.
// Build linked address ranges.
AddressRanges LinkedRanges;
- for (size_t Idx = 0; Idx < FunctionRanges.size(); Idx++)
+ for (const AddressRangeValuePair &Range : FunctionRanges)
LinkedRanges.insert(
- {FunctionRanges[Idx].first.start() + FunctionRanges[Idx].second,
- FunctionRanges[Idx].first.end() + FunctionRanges[Idx].second});
+ {Range.Range.start() + Range.Value, Range.Range.end() + Range.Value});
if (!FunctionRanges.empty())
emitDwarfDebugArangesTable(Unit, LinkedRanges);
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
index 567d5a4dd47a..0ad3b7235e87 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
@@ -552,6 +552,7 @@ void link_ELF_aarch64(std::unique_ptr<LinkGraph> G,
Config.PrePrunePasses.push_back(EHFrameEdgeFixer(
".eh_frame", 8, aarch64::Pointer32, aarch64::Pointer64,
aarch64::Delta32, aarch64::Delta64, aarch64::NegDelta32));
+ Config.PrePrunePasses.push_back(EHFrameNullTerminator(".eh_frame"));
// Add a mark-live pass.
if (auto MarkLive = Ctx->getMarkLivePass(TT))
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7b9c55ff30a5..9342e10b5eda 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
@@ -1125,6 +1126,40 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
break;
}
+ case 'w':
+ if (Name.startswith("wasm.fma.")) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::wasm_relaxed_madd, F->getReturnType());
+ return true;
+ }
+ if (Name.startswith("wasm.fms.")) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::wasm_relaxed_nmadd, F->getReturnType());
+ return true;
+ }
+ if (Name.startswith("wasm.laneselect.")) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::wasm_relaxed_laneselect,
+ F->getReturnType());
+ return true;
+ }
+ if (Name == "wasm.dot.i8x16.i7x16.signed") {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::wasm_relaxed_dot_i8x16_i7x16_signed);
+ return true;
+ }
+ if (Name == "wasm.dot.i8x16.i7x16.add.signed") {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::wasm_relaxed_dot_i8x16_i7x16_add_signed);
+ return true;
+ }
+ break;
+
case 'x':
if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
return true;
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index eed29c25714b..0d074951cffc 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1685,6 +1685,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// keep one copy of each constant.
MPM.addPass(ConstantMergePass());
+ // Remove unused arguments from functions.
+ MPM.addPass(DeadArgumentEliminationPass());
+
// Reduce the code after globalopt and ipsccp. Both can open up significant
// simplification opportunities, and both can propagate functions through
// function pointers. When this happens, we often have to resolve varargs
@@ -1722,9 +1725,6 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// transform it to pass arguments by value instead of by reference.
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
- // Remove unused arguments from functions.
- MPM.addPass(DeadArgumentEliminationPass());
-
FunctionPassManager FPM;
// The IPO Passes may leave cruft around. Clean up after them.
FPM.addPass(InstCombinePass());
diff --git a/llvm/lib/Support/AddressRanges.cpp b/llvm/lib/Support/AddressRanges.cpp
deleted file mode 100644
index 187d5be00dae..000000000000
--- a/llvm/lib/Support/AddressRanges.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//===- AddressRanges.cpp ----------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/AddressRanges.h"
-#include "llvm/ADT/STLExtras.h"
-#include <inttypes.h>
-
-using namespace llvm;
-
-AddressRanges::Collection::const_iterator
-AddressRanges::insert(AddressRange Range) {
- if (Range.size() == 0)
- return Ranges.end();
-
- auto It = llvm::upper_bound(Ranges, Range);
- auto It2 = It;
- while (It2 != Ranges.end() && It2->start() <= Range.end())
- ++It2;
- if (It != It2) {
- Range = {Range.start(), std::max(Range.end(), std::prev(It2)->end())};
- It = Ranges.erase(It, It2);
- }
- if (It != Ranges.begin() && Range.start() <= std::prev(It)->end()) {
- --It;
- *It = {It->start(), std::max(It->end(), Range.end())};
- return It;
- }
-
- return Ranges.insert(It, Range);
-}
-
-AddressRanges::Collection::const_iterator
-AddressRanges::find(uint64_t Addr) const {
- auto It = std::partition_point(
- Ranges.begin(), Ranges.end(),
- [=](const AddressRange &R) { return R.start() <= Addr; });
-
- if (It == Ranges.begin())
- return Ranges.end();
-
- --It;
- if (Addr >= It->end())
- return Ranges.end();
-
- return It;
-}
-
-AddressRanges::Collection::const_iterator
-AddressRanges::find(AddressRange Range) const {
- if (Range.size() == 0)
- return Ranges.end();
-
- auto It = std::partition_point(
- Ranges.begin(), Ranges.end(),
- [=](const AddressRange &R) { return R.start() <= Range.start(); });
-
- if (It == Ranges.begin())
- return Ranges.end();
-
- --It;
- if (Range.end() > It->end())
- return Ranges.end();
-
- return It;
-}
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 9a873413db87..bde5f5db99e7 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -782,6 +782,17 @@ void BTFDebug::visitCompositeType(const DICompositeType *CTy,
visitEnumType(CTy, TypeId);
}
+bool BTFDebug::IsForwardDeclCandidate(const DIType *Base) {
+ if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
+ auto CTag = CTy->getTag();
+ if ((CTag == dwarf::DW_TAG_structure_type ||
+ CTag == dwarf::DW_TAG_union_type) &&
+ !CTy->getName().empty() && !CTy->isForwardDecl())
+ return true;
+ }
+ return false;
+}
+
/// Handle pointer, typedef, const, volatile, restrict and member types.
void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
@@ -796,20 +807,15 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
if (CheckPointer && SeenPointer) {
const DIType *Base = DTy->getBaseType();
if (Base) {
- if (const auto *CTy = dyn_cast<DICompositeType>(Base)) {
- auto CTag = CTy->getTag();
- if ((CTag == dwarf::DW_TAG_structure_type ||
- CTag == dwarf::DW_TAG_union_type) &&
- !CTy->getName().empty() && !CTy->isForwardDecl()) {
- /// Find a candidate, generate a fixup. Later on the struct/union
- /// pointee type will be replaced with either a real type or
- /// a forward declaration.
- auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true);
- auto &Fixup = FixupDerivedTypes[CTy];
- Fixup.push_back(std::make_pair(DTy, TypeEntry.get()));
- TypeId = addType(std::move(TypeEntry), DTy);
- return;
- }
+ if (IsForwardDeclCandidate(Base)) {
+ /// Find a candidate, generate a fixup. Later on the struct/union
+ /// pointee type will be replaced with either a real type or
+ /// a forward declaration.
+ auto TypeEntry = std::make_unique<BTFTypeDerived>(DTy, Tag, true);
+ auto &Fixup = FixupDerivedTypes[cast<DICompositeType>(Base)];
+ Fixup.push_back(std::make_pair(DTy, TypeEntry.get()));
+ TypeId = addType(std::move(TypeEntry), DTy);
+ return;
}
}
}
@@ -844,6 +850,13 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
visitTypeEntry(DTy->getBaseType(), TempTypeId, CheckPointer, SeenPointer);
}
+/// Visit a type entry. CheckPointer is true if the type has
+/// one of its predecessors as one struct/union member. SeenPointer
+/// is true if CheckPointer is true and one of its predecessors
+/// is a pointer. The goal of CheckPointer and SeenPointer is to
+/// do pruning for struct/union types so some of these types
+/// will not be emitted in BTF and rather forward declarations
+/// will be generated.
void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
@@ -888,6 +901,11 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
if (DIToIdMap.find(BaseTy) != DIToIdMap.end()) {
DTy = dyn_cast<DIDerivedType>(BaseTy);
} else {
+ if (CheckPointer && DTy->getTag() == dwarf::DW_TAG_pointer_type) {
+ SeenPointer = true;
+ if (IsForwardDeclCandidate(BaseTy))
+ break;
+ }
uint32_t TmpTypeId;
visitTypeEntry(BaseTy, TmpTypeId, CheckPointer, SeenPointer);
break;
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index aa982babd458..f0b42232f4d5 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -338,6 +338,9 @@ class BTFDebug : public DebugHandlerBase {
void visitMapDefType(const DIType *Ty, uint32_t &TypeId);
/// @}
+ /// Check whether the type is a forward declaration candidate or not.
+ bool IsForwardDeclCandidate(const DIType *Base);
+
/// Get the file content for the subprogram. Certain lines of the file
/// later may be put into string table and referenced by line info.
std::string populateFileContent(const DISubprogram *SP);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 379aaa713a00..88b926fce2aa 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -239,6 +239,16 @@ bool RISCVELFStreamer::requiresFixups(MCContext &C, const MCExpr *Value,
if (B.isInSection() && B.getSection().getKind().isText())
return true;
+ // If A is undefined and B is defined, we should emit ADD/SUB for A-B.
+ // Unfortunately, A may be defined later, but this requiresFixups call has to
+ // eagerly make a decision. For now, emit ADD/SUB unless A is .L*. This
+ // heuristic handles many temporary label differences for .debug_* and
+ // .apple_types sections.
+ //
+ // TODO Implement delayed relocation decision.
+ if (!A.isInSection() && !A.isTemporary() && B.isInSection())
+ return true;
+
// Support cross-section symbolic differences ...
return A.isInSection() && B.isInSection() &&
A.getSection().getName() != B.getSection().getName();
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 83bd2ff6acc3..55c56e76af6f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -695,6 +695,7 @@ include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleZnver2.td"
include "X86ScheduleZnver3.td"
+include "X86ScheduleZnver4.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
@@ -1627,7 +1628,7 @@ def : ProcModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features,
ProcessorFeatures.ZN2Tuning>;
def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
ProcessorFeatures.ZN3Tuning>;
-def : Proc<"znver4",ProcessorFeatures.ZN4Features,
+def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
ProcessorFeatures.ZN4Tuning>;
def : Proc<"geode", [FeatureX87, FeatureCX8, Feature3DNowA],
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 8fddd0037999..7e1c96a429eb 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -896,15 +896,15 @@ multiclass ATOMIC_LOGIC_OP<Format Form, string s> {
multiclass ATOMIC_LOGIC_OP_RM<bits<8> Opc8, string s> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteBitTestSetRegRMW] in {
- def 16rm : Ii8<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ def 16rm : I<Opc8, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
!strconcat(s, "{w}\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR16:$src2))]>,
OpSize16, TB, LOCK;
- def 32rm : Ii8<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ def 32rm : I<Opc8, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
!strconcat(s, "{l}\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR32:$src2))]>,
OpSize32, TB, LOCK;
- def 64rm : RIi8<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ def 64rm : RI<Opc8, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
!strconcat(s, "{q}\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (!cast<SDNode>("x86_rm_" # s) addr:$src1, GR64:$src2))]>,
TB, LOCK;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 561ba99db4af..f8660a9fa123 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -577,20 +577,37 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignedloadv8f16 addr:$src),
(VMOVAPSrm addr:$src)>;
+ def : Pat<(alignedloadv8bf16 addr:$src),
+ (VMOVAPSrm addr:$src)>;
def : Pat<(loadv8f16 addr:$src),
(VMOVUPSrm addr:$src)>;
+ def : Pat<(loadv8bf16 addr:$src),
+ (VMOVUPSrm addr:$src)>;
def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
(VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8f16 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
def : Pat<(alignedloadv16f16 addr:$src),
(VMOVAPSYrm addr:$src)>;
+ def : Pat<(alignedloadv16bf16 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
def : Pat<(loadv16f16 addr:$src),
(VMOVUPSYrm addr:$src)>;
+ def : Pat<(loadv16bf16 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v16f16 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index d2460e12b005..49ef6efc6aec 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -290,4 +290,17 @@ def ZnVer3PfmCounters : ProcPfmCounters {
];
}
def : PfmCountersBinding<"znver3", ZnVer3PfmCounters>;
-def : PfmCountersBinding<"znver4", ZnVer3PfmCounters>;
+
+def ZnVer4PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_ops">;
+ let IssueCounters = [
+ PfmIssueCounter<"Zn4Int", "ops_type_dispatched_from_decoder:int_disp_retire_mode">,
+ PfmIssueCounter<"Zn4FPU", "ops_type_dispatched_from_decoder:fp_disp_retire_mode">,
+ PfmIssueCounter<"Zn4Load", "ls_dispatch:ld_dispatch">,
+ PfmIssueCounter<"Zn4Store", "ls_dispatch:store_dispatch">,
+ PfmIssueCounter<"Zn4Divider", "div_op_count">,
+ PfmIssueCounter<"Zn4AGU", "ls_dispatch:ld_st_dispatch + ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch">
+ ];
+}
+def : PfmCountersBinding<"znver4", ZnVer4PfmCounters>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
new file mode 100644
index 000000000000..c3f08998419f
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -0,0 +1,1957 @@
+//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver4 to support instruction
+// scheduling and other instruction cost heuristics.
+// Based on:
+// * AMD Software Optimization Guide for AMD Family 19h Processors.
+// https://www.amd.com/system/files/TechDocs/56665.zip
+//===----------------------------------------------------------------------===//
+
+def Znver4Model : SchedMachineModel {
+ // AMD SOG 19h, 2.9.6 Dispatch
+ // The processor may dispatch up to 6 macro ops per cycle
+ // into the execution engine.
+ let IssueWidth = 6;
+ // AMD SOG 19h, 2.10.3
+ // The retire control unit (RCU) tracks the completion status of all
+ // outstanding operations (integer, load/store, and floating-point) and is
+ // the final arbiter for exception processing and recovery.
+ // The unit can receive up to 6 macro ops dispatched per cycle and track up
+ // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
+ let MicroOpBufferSize = 320;
+ // AMD SOG 19h, 2.9.1 Op Cache
+ // The op cache is organized as an associative cache with 64 sets and 8 ways.
+ // At each set-way intersection is an entry containing up to 8 macro ops.
+ // The maximum capacity of the op cache is 4K ops.
+ // Agner, 22.5 µop cache
+ // The size of the µop cache is big enough for holding most critical loops.
+  // FIXME: PR50584: MachineScheduler/PostRAScheduler have quadratic complexity,
+  // with large values here the compilation of certain loops
+  // ends up taking way too long.
+  // Ideally for znver4, we should have 6.75K. However, we don't use that
+  // value because of the compile-time impact, and prefer the default
+  // instead.
+ // let LoopMicroOpBufferSize = 6750;
+ // AMD SOG 19h, 2.6.2 L1 Data Cache
+  // The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
+ // AMD SOG 19h, 2.12 L1 Data Cache
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve 4-cycle load-to-use integer load latency.
+ let LoadLatency = 4;
+ // AMD SOG 19h, 2.12 L1 Data Cache
+ // The AGU and LS pipelines are optimized for simple address generation modes.
+ // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
+ int VecLoadLatency = 7;
+ // Latency of a simple store operation.
+ int StoreLatency = 1;
+ // FIXME:
+ let HighLatency = 25; // FIXME: any better choice?
+ // AMD SOG 19h, 2.8 Optimizing Branching
+ // The branch misprediction penalty is in the range from 11 to 18 cycles,
+ // <...>. The common case penalty is 13 cycles.
+ let MispredictPenalty = 13;
+
+ let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+ let CompleteModel = 1;
+}
+
+let SchedModel = Znver4Model in {
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.3 Retire Control Unit
+// The unit can receive up to 6 macro ops dispatched per cycle and track up to
+// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
+// The retire unit handles in-order commit of up to nine macro ops per cycle.
+def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Unit
+//
+
+// AMD SOG 19h, 2.4 Superscalar Organization
+// The processor uses four decoupled independent integer scheduler queues,
+// each one servicing one ALU pipeline and one or two other pipelines
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// The processor contains 4 general purpose integer execution pipes.
+// Each pipe has an ALU capable of general purpose integer operations.
+def Zn4ALU0 : ProcResource<1>;
+def Zn4ALU1 : ProcResource<1>;
+def Zn4ALU2 : ProcResource<1>;
+def Zn4ALU3 : ProcResource<1>;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// There is also a separate branch execution unit.
+def Zn4BRU1 : ProcResource<1>;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// There are three Address Generation Units (AGUs) for all load and store
+// address generation. There are also 3 store data movement units
+// associated with the same schedulers as the AGUs.
+def Zn4AGU0 : ProcResource<1>;
+def Zn4AGU1 : ProcResource<1>;
+def Zn4AGU2 : ProcResource<1>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// ALU0 additionally has divide <...> execution capability.
+defvar Zn4Divider = Zn4ALU0;
+
+// AMD SOG 19h, 2.10.2 Execution Units
+// ALU0 additionally has <...> branch execution capability.
+defvar Zn4BRU0 = Zn4ALU0;
+
+// Integer Multiplication issued on ALU1.
+defvar Zn4Multiplier = Zn4ALU1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// General ALU operations
+def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
+
+// General AGU operations
+def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
+
+// Control flow: jumps, calls
+def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
+
+// Everything that isn't control flow, but still needs to access CC register,
+// namely: conditional moves, SETcc.
+def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
+
+// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
+
+// Simple bit twiddling: bit test, shift/rotate, bit extraction
+def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.3 Retire Control Unit
+// The integer physical register file (PRF) consists of 224 registers.
+def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
+ 6, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
+// AMD SOG 19h, 2.10.1 Schedulers
+// The schedulers can receive up to six macro ops per cycle, with a limit of
+// two per scheduler. Each scheduler can issue one micro op per cycle into
+// each of its associated pipelines
+def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
+ Zn4ALU1, Zn4AGU1, // scheduler 1
+ Zn4ALU2, Zn4AGU2, // scheduler 2
+ Zn4ALU3, Zn4BRU1 // scheduler 3
+ ]> {
+ let BufferSize = !mul(4, 24);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Unit
+//
+
+// AMD SOG 19h, 2.4 Superscalar Organization
+// The processor uses <...> two decoupled independent floating point schedulers
+// each servicing two FP pipelines and one store or FP-to-integer pipeline.
+
+//
+// Execution pipes
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.10.1 Schedulers
+// <...>, and six FPU pipes.
+// Agner, 22.10 Floating point execution pipes
+// There are six floating point/vector execution pipes,
+def Zn4FP0 : ProcResource<1>;
+def Zn4FP1 : ProcResource<1>;
+def Zn4FP2 : ProcResource<1>;
+def Zn4FP3 : ProcResource<1>;
+def Zn4FP45 : ProcResource<2>;
+
+//
+// Execution Units
+//===----------------------------------------------------------------------===//
+// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+defvar Zn4FPFMul0 = Zn4FP0;
+defvar Zn4FPFMul1 = Zn4FP1;
+
+// (v)FADD*
+defvar Zn4FPFAdd0 = Zn4FP2;
+defvar Zn4FPFAdd1 = Zn4FP3;
+
+// All convert operations except pack/unpack
+defvar Zn4FPFCvt0 = Zn4FP2;
+defvar Zn4FPFCvt1 = Zn4FP3;
+
+// All Divide and Square Root except Reciprocal Approximation
+// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
+// FDIV unit can support 2 simultaneous operations in flight
+// even though it occupies a single pipe.
+// FIXME: BufferSize=2 ?
+defvar Zn4FPFDiv = Zn4FP1;
+
+// Moves and Logical operations on Floating Point Data Types
+defvar Zn4FPFMisc0 = Zn4FP0;
+defvar Zn4FPFMisc1 = Zn4FP1;
+defvar Zn4FPFMisc2 = Zn4FP2;
+defvar Zn4FPFMisc3 = Zn4FP3;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+defvar Zn4FPVAdd0 = Zn4FP0;
+defvar Zn4FPVAdd1 = Zn4FP1;
+defvar Zn4FPVAdd2 = Zn4FP2;
+defvar Zn4FPVAdd3 = Zn4FP3;
+
+// Integer Multiplies, SAD, Blendvb
+defvar Zn4FPVMul0 = Zn4FP0;
+defvar Zn4FPVMul1 = Zn4FP3;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+defvar Zn4FPVShuf = Zn4FP1;
+defvar Zn4FPVShufAux = Zn4FP2;
+
+// Bit Shift Left/Right operations
+defvar Zn4FPVShift0 = Zn4FP1;
+defvar Zn4FPVShift1 = Zn4FP2;
+
+// Moves and Logical operations on Packed Integer Data Types
+defvar Zn4FPVMisc0 = Zn4FP0;
+defvar Zn4FPVMisc1 = Zn4FP1;
+defvar Zn4FPVMisc2 = Zn4FP2;
+defvar Zn4FPVMisc3 = Zn4FP3;
+
+// *AES*
+defvar Zn4FPAES0 = Zn4FP0;
+defvar Zn4FPAES1 = Zn4FP1;
+
+// *CLM*
+defvar Zn4FPCLM0 = Zn4FP0;
+defvar Zn4FPCLM1 = Zn4FP1;
+
+// Execution pipeline grouping
+//===----------------------------------------------------------------------===//
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
+
+// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
+def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
+
+// (v)FADD*
+// Some complex VADD operations are not available in all pipes.
+def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
+
+// All convert operations except pack/unpack
+def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
+
+// All Divide and Square Root except Reciprocal Approximation
+// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
+
+// Moves and Logical operations on Floating Point Data Types
+def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
+
+// FIXUP and RANGE use FP01 pipelines
+def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
+def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
+// SCALE instructions use FP23 pipelines
+def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
+def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
+
+// Loads, Stores and Move to General Register (EX) Operations
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Stores and floating point to general purpose register transfer
+// have 2 dedicated pipelines (pipe 5 and 6).
+defvar Zn4FPLd01 = Zn4FP45;
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Note that FP stores are supported on two pipelines,
+// but throughput is limited to one per cycle.
+let Super = Zn4FP45 in
+def Zn4FPSt : ProcResource<1>;
+
+// Integer Adds, Subtracts, and Compares
+// Some complex VADD operations are not available in all pipes.
+def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
+
+def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
+def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
+
+// AVX512 Opmask pipelines
+def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
+def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
+
+// Integer Multiplies, SAD, Blendvb
+def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
+
+// Data Shuffles, Packs, Unpacks, Permute
+// Some complex shuffle operations are only available in pipe1.
+def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
+
+// Bit Shift Left/Right operations
+def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
+
+// Moves and Logical operations on Packed Integer Data Types
+def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
+
+// *AES*
+def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
+
+// *CLM*
+def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
+
+
+//
+// Scheduling
+//===----------------------------------------------------------------------===//
+
+// Agner, 21.8 Register renaming and out-of-order schedulers
+// The floating point register file has 192 vector registers
+// of 512b each in zen4.
+def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
+ 6, // Max moves that can be eliminated per cycle.
+ 0>; // Restrict move elimination to zero regs.
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// The floating-point scheduler has a 2*32 entry macro op capacity.
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// <...> the scheduler can issue 1 micro op per cycle for each pipe.
+// FIXME: those are two separate schedulers, not a single big one.
+def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
+ Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/ // scheduler 1
+ ]> {
+ let BufferSize = !mul(2, 32);
+}
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
+// even if floating-point scheduler is full.
+// FIXME: how to model this properly?
+
+
+//===----------------------------------------------------------------------===//
+// Load-Store Unit
+//
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// The LS unit contains three largely independent pipe-lines
+// enabling the execution of three 256-bit memory operations per cycle.
+def Zn4LSU : ProcResource<3>;
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// All three memory operations can be loads.
+let Super = Zn4LSU in
+def Zn4Load : ProcResource<3> {
+ // AMD SOG 19h, 2.12 Load-Store Unit
+ // The LS unit can process up to 72 out-of-order loads.
+ let BufferSize = 72;
+}
+
+def Zn4LoadQueue : LoadQueue<Zn4Load>;
+
+// AMD SOG 19h, 2.12 Load-Store Unit
+// A maximum of two of the memory operations can be stores.
+let Super = Zn4LSU in
+def Zn4Store : ProcResource<2> {
+ // AMD SOG 19h, 2.12 Load-Store Unit
+ // The LS unit utilizes a 64-entry store queue (STQ).
+ let BufferSize = 64;
+}
+
+def Zn4StoreQueue : StoreQueue<Zn4Store>;
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+
+multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
+ int Lat = 1, list<int> Res = [], int UOps = 1> {
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+}
+
+multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat,
+ list<int> Res, int UOps, int LoadLat, int LoadUOps,
+ ProcResourceKind AGU, int LoadRes> {
+ defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+ defm : __Zn4WriteRes<SchedRW.Folded,
+ !listconcat([AGU, Zn4Load], ExePorts),
+ !add(Lat, LoadLat),
+ !if(!and(!empty(Res), !eq(LoadRes, 1)),
+ [],
+ !listconcat([1, LoadRes],
+ !if(!empty(Res),
+ !listsplat(1, !size(ExePorts)),
+ Res))),
+ !add(UOps, LoadUOps)>;
+}
+
+// For classes without folded loads.
+multiclass Zn4WriteResInt<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn4WriteResXMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn4WriteResYMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+multiclass Zn4WriteResZMM<SchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1> {
+ defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+}
+
+// For classes with folded loads.
+multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver4Model.LoadLatency,
+ LoadUOps, Zn4AGU012, LoadRes>;
+}
+
+multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver4Model.VecLoadLatency,
+ LoadUOps, Zn4FPLd01, LoadRes>;
+}
+
+multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 1,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver4Model.VecLoadLatency,
+ LoadUOps, Zn4FPLd01, LoadRes>;
+}
+
+multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts, int Lat = 1,
+ list<int> Res = [], int UOps = 2,
+ int LoadUOps = 0, int LoadRes = 1> {
+ defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+ Znver4Model.VecLoadLatency,
+ LoadUOps, Zn4FPLd01, LoadRes>;
+}
+
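For illustration only (a sketch of what the helpers above expand to, assuming the model's LoadLatency of 4; these records are generated by the multiclasses and are not written out in the patch): a later use such as Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1> resolves to roughly the register form and the folded-load form (WriteALU.Folded, i.e. WriteALULd) shown below.

    // Register form: a 1-cycle ALU op on any of the four integer ALUs.
    def : WriteRes<WriteALU, [Zn4ALU0123]> {
      let Latency = 1;
      let ResourceCycles = [1];
      let NumMicroOps = 1;
    }
    // Folded-load form: the AGU and load pipe are prepended, the latency grows
    // by Znver4Model.LoadLatency (4), and the micro-op count stays at 1 since
    // LoadUOps defaults to 0.
    def : WriteRes<WriteALULd, [Zn4AGU012, Zn4Load, Zn4ALU0123]> {
      let Latency = 5;               // 1 + 4
      let ResourceCycles = [1, 1, 1];
      let NumMicroOps = 1;
    }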
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
+
+def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
+def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
+
+// AMD SOG 19h, 2.11 Floating-Point Unit
+// There is 1 cycle of added latency for a result to cross
+// from F to I or I to F domain.
+def : ReadAdvance<ReadInt2Fpu, -1>;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
+
+// Loads, stores, and moves, not folded with other operations.
+defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
+
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself; it only has latency, matching that of WriteLoad.
+defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
+
+def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
+ let Latency = !add(Znver4Model.LoadLatency, 1);
+ let ResourceCycles = [3, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
+
+defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
+defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
+defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
+ let Latency = Znver4Model.LoadLatency;
+ let ResourceCycles = [1, 1, 4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
+
+def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
+ let Latency = Znver4Model.StoreLatency;
+ let ResourceCycles = [4, 1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
+
+// Arithmetic.
+defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
+
+def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
+ AND8i8, AND16i16, AND32i32, AND64i32,
+ OR8i8, OR16i16, OR32i32, OR64i32,
+ SUB8i8, SUB16i16, SUB32i32, SUB64i32,
+ XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
+
+def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
+
+def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
+
+def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
+ PEXT32rr, PEXT64rr)>;
+
+defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
+
+def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 1, 7, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
+
+// This is for simple LEAs with one or two input operands.
+defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>; // LEA instructions can't fold loads.
+
+// This write is used for slow LEA instructions.
+def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
+// or an LEA with a `Scale` value different than 1.
+def Zn4SlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" different than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def Zn4WriteLEA : SchedWriteVariant<[
+ SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+
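For illustration only (a reading of the predicate above, not additional definitions in the patch): per the comment, only an LEA that uses base, index and offset together, or an LEA whose scale is not 1, takes the slow path.

    // lea (%rax,%rbx), %rcx       two inputs, scale 1   -> NoSchedPred          -> WriteLEA (1 cycle)
    // lea 8(%rax,%rbx,4), %rcx    three inputs, scale 4 -> Zn4SlowLEAPredicate  -> Zn4Write3OpsLEA (2 cycles, 2 uops)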
+def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
+
+// Integer multiplication
+defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
+defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
+defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
+defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
+defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
+defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
+defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
+defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
+defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
+defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
+defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
+defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
+defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>; // Integer multiplication, high part.
+defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>; // Integer multiplication, high part.
+
+defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
+defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
+
+defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
+
+def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 3;
+ let ResourceCycles = [12];
+ let NumMicroOps = 3;
+}
+def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
+
+defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>; // Compare and set, compare and swap.
+
+def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
+ let ResourceCycles = [1, 1, 12];
+ let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
+}
+def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
+
+def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 3; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [24];
+ let NumMicroOps = 19;
+}
+def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 4; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [59];
+ let NumMicroOps = 28;
+}
+def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
+
+def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
+
+def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = 5;
+}
+def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
+
+def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
+
+// Integer division.
+// FIXME: uops for 8-bit division measure as 2; for the others it's a guess.
+// FIXME: latency for 8-bit division measures as 10; for the others it's a guess.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+
+defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
+
+def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
+
+defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
+
+def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
+
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+
+def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
+
+defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
+defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
+defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
+defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
+defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
+
+defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>; // Bit Test
+defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
+defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;
+
+defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>; // Bit Test + Set
+defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
+defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
+
+// Integer shifts and rotates.
+defm : Zn4WriteResIntPair<WriteShift, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn4WriteResIntPair<WriteRotate, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteRotateR1], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
+ RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
+
+def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteRotateM1], (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
+ RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
+
+def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 7;
+}
+def : InstRW<[Zn4WriteRotateRightRI], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
+
+def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
+}
+def : InstRW<[Zn4WriteRotateRightMI], (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
+
+def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 9;
+}
+def : InstRW<[Zn4WriteRotateLeftRI], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
+
+def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
+}
+def : InstRW<[Zn4WriteRotateLeftMI], (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
+
+defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 7;
+}
+def : InstRW<[Zn4WriteRotateRightRCL], (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
+
+def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
+}
+def : InstRW<[Zn4WriteRotateRightMCL], (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
+
+def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 9;
+}
+def : InstRW<[Zn4WriteRotateLeftRCL], (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
+
+def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
+}
+def : InstRW<[Zn4WriteRotateLeftMCL], (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
+
+// Double shift instructions.
+defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
+defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
+defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
+defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
+
+// BMI1 BEXTR/BLS, BMI2 BZHI
+defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn4WriteResIntPair<WriteBLS, [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
+defm : Zn4WriteResIntPair<WriteBZHI, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
+
+// Floating point. This covers both scalar and vector operations.
+defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
+defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+
+def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteWriteFStoreMMX], (instrs MOVHPDmr, MOVHPSmr,
+ VMOVHPDmr, VMOVHPSmr)>;
+
+defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+
+defm : Zn4WriteResXMM<WriteFMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
+defm : Zn4WriteResXMM<WriteFMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
+defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
+defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
+
+defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub.
+
+def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 24];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteX87Arith], (instrs ADD_FI16m, ADD_FI32m,
+ SUB_FI16m, SUB_FI32m,
+ SUBR_FI16m, SUBR_FI32m,
+ MUL_FI16m, MUL_FI32m)>;
+
+def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1, 1, 62];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteX87Div], (instrs DIV_FI16m, DIV_FI32m,
+ DIVR_FI16m, DIVR_FI32m)>;
+
+defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
+defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
+defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>; // Floating point add/sub (ZMM).
+defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
+defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
+defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
+defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>; // Floating point double add/sub (ZMM).
+defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare.
+defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
+defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [2], 1>; // Floating point compare (YMM).
+defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [4], 1>; // Floating point compare (ZMM).
+defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare.
+defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
+defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [2], 1>; // Floating point double compare (YMM).
+defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [4], 1>; // Floating point double compare (ZMM).
+defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
+defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
+defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication.
+defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
+defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
+defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>; // Floating point multiplication (ZMM).
+defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication.
+defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
+defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
+defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>; // Floating point double multiplication (ZMM).
+defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>; // Floating point division.
+defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (XMM).
+defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>; // Floating point division (YMM).
+defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>; // Floating point division (ZMM).
+defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division.
+defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
+defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
+defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>; // Floating point double division (ZMM).
+defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root.
+defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
+defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
+defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>; // Floating point square root (ZMM).
+defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root.
+defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
+defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
+defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>; // Floating point double square root (ZMM).
+defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
+defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate.
+defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>; // Floating point reciprocal estimate (XMM).
+defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>; // Floating point reciprocal estimate (YMM).
+defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>; // Floating point reciprocal estimate (ZMM).
+defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate.
+defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (XMM).
+defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>; // Floating point reciprocal square root estimate (YMM).
+defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>; // Floating point reciprocal square root estimate (ZMM).
+defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add.
+defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (XMM).
+defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [2], 1>; // Fused Multiply Add (YMM).
+defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [4], 1>; // Fused Multiply Add (ZMM).
+defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
+defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
+defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
+defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
+defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding.
+defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
+defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>; // Floating point rounding (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
+defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
+defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Floating point and/or/xor logicals (ZMM).
+defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
+defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
+defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (ZMM).
+defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
+defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
+defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Floating point vector shuffles (ZMM).
+defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
+defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
+defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>; // Floating point vector variable shuffles (ZMM).
+defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends.
+defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
+defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Floating point vector blends (ZMM).
+defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends.
+defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>; // Fp vector variable blends (YMM).
+defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>; // Fp vector variable blends (ZMM).
+
+// Horizontal Add/Sub (float and integer)
+defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
+defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
+defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
+
+// Vector integer operations.
+defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+
+def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rr, VEXTRACTI128rr)>;
+
+def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mr, VEXTRACTF128mr)>;
+
+def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteVINSERTF128rmr], (instrs VINSERTF128rm)>;
+
+defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
+defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
+defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
+defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
+defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
+
+defm : Zn4WriteResXMM<WriteVecMoveToGpr, [Zn4FPLd01], 1, [2], 1>;
+defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
+
+def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteMOVMMX], (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
+
+def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteMOVMMXSlow], (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
+
+defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals.
+
+def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteEXTRQ_INSERTQ], (instrs EXTRQ, INSERTQ)>;
+
+def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteEXTRQI_INSERTQI], (instrs EXTRQI, INSERTQI)>;
+
+defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (XMM).
+
+def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecALUXSlow], (instrs PABSBrr, PABSDrr, PABSWrr,
+ PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
+ PAVGBrr, PAVGWrr,
+ PSIGNBrr, PSIGNDrr, PSIGNWrr,
+ VPABSBrr, VPABSDrr, VPABSWrr,
+ VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
+ VPAVGBrr, VPAVGWrr,
+ VPCMPEQQrr,
+ VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
+ PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr, VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
+
+def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecOpMask], (instrs KADDBrr, KADDDrr, KADDQrr, KADDWrr,
+ KANDBrr, KANDDrr, KANDQrr, KANDWrr,
+ KANDNBrr, KANDNDrr, KANDNQrr, KANDNWrr,
+ KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
+ KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
+ KNOTBrr, KNOTDrr, KNOTQrr, KNOTWrr,
+ KORBrr, KORDrr, KORQrr, KORWrr,
+ KORTESTBrr, KORTESTDrr, KORTESTQrr, KORTESTWrr,
+ KTESTBrr, KTESTDrr, KTESTQrr, KTESTWrr,
+ KUNPCKBWrr, KUNPCKDQrr, KUNPCKWDrr,
+ KXNORBrr, KXNORDrr, KXNORQrr, KXNORWrr,
+ KXORBrr, KXORDrr, KXORQrr, KXORWrr)>;
+
+def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecOpMaskMemMov], (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
+
+def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
+
+def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+  // TODO: All align instructions are expected to have a 4-cycle latency
+ let Latency = 4;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
+ VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
+ >;
+defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
+
+def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVecALUYSlow], (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
+ VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
+ VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
+ VPAVGBYrr, VPAVGWYrr,
+ VPCMPEQQYrr,
+ VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
+
+defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>; // Vector integer ALU op, no logicals (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals.
+defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (XMM).
+defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector integer and/or/xor logicals (YMM).
+defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector integer and/or/xor logicals (ZMM).
+defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
+defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
+defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (ZMM).
+defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (default).
+defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>; // Vector integer shifts (XMM).
+defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer shifts (YMM).
+defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer shifts (ZMM).
+defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (default).
+defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (XMM).
+defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>; // Vector integer immediate shifts (YMM).
+defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>; // Vector integer immediate shifts (ZMM).
+defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (default).
+defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (XMM).
+defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>; // Vector integer multiply (YMM).
+defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>; // Vector integer multiply (ZMM).
+defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD.
+defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>; // Vector PMULLD (YMM).
+defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>; // Vector PMULLD (ZMM).
+defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles.
+defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (XMM).
+defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector shuffles (YMM).
+defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector shuffles (ZMM).
+defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles.
+defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (XMM).
+defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>; // Vector variable shuffles (YMM).
+defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>; // Vector variable shuffles (ZMM).
+defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends.
+defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>; // Vector blends (YMM).
+defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>; // Vector blends (ZMM).
+defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends.
+defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>; // Vector variable blends (YMM).
+defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>; // Vector variable blends (ZMM).
+defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
+defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
+defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
+defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>; // Vector PSADBW (ZMM).
+defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
+defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>; // Vector MPSAD (YMM).
+defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>; // Vector MPSAD (ZMM).
+defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>; // Vector PHMINPOS.
+
+// Vector insert/extract operations.
+defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
+defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>; // Extract vector element to gpr.
+defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
+
+// MOVMSK operations.
+defm : Zn4WriteResXMM<WriteFMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
+defm : Zn4WriteResXMM<WriteVecMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
+defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
+defm : Zn4WriteResXMM<WriteMMXMOVMSK, [Zn4FPVMisc2], 1, [1], 1>;
+
+// Conversion between integer and float.
+defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>; // Double -> Integer.
+defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>; // Double -> Integer (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>; // Double -> Integer (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>; // Double -> Integer (ZMM).
+
+def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>; // Float -> Integer.
+
+defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
+defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).
+
+def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
+ let Latency = 2;
+ let ResourceCycles = [6];
+ let NumMicroOps = 2;
+}
+
+defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
+defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
+
+def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
+defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
+defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
+defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
+
+defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
+defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
+defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
+
+defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
+defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
+defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
+
+defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
+defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
+defm : Zn4WriteResYMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
+
+def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
+
+def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
+
+def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 1;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
+
+def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
+
+def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 2;
+ let ResourceCycles = [3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
+
+def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
+ let ResourceCycles = [1, 1, 3];
+ let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
+
+def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 3;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
+
+def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
+ let ResourceCycles = [1, 1, 8];
+ let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
+
+def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 6;
+ let ResourceCycles = [8];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;
+
+def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
+
+// Strings instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+// Packed Compare Explicit Length Strings, Return Mask
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+// Packed Compare Implicit Length Strings, Return Index
+defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
+// Packed Compare Explicit Length Strings, Return Index
+defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
+
+// AES instructions.
+defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
+defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
+defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+
+// EMMS/FEMMS
+defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+// Load/store MXCSR
+defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
+defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
+
+// Catch-all for expensive system instructions.
+defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
+
+def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 0; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
+ let Latency = 10; // FIXME: not from llvm-exegesis
+ let ResourceCycles = [24];
+ let NumMicroOps = 18;
+}
+def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
+
+// AVX2.
+defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
+defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
+defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
+
+def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
+ let Latency = 3;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rr, VPERM2F128rr)>;
+
+def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
+ let Latency = 7;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
+
+def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
+
+def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
+ let Latency = 6;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
+
+def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+}
+def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
+
+def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
+ let Latency = 5;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
+
+def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
+ let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
+ let ResourceCycles = [1, 1, 2];
+ let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+}
+def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
+
+defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
+defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
+defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
+defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
+defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
+
+// Old microcoded instructions that nobody uses.
+defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
+
+// Fence instructions.
+defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
+
+def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
+ let Latency = 1;
+ let ResourceCycles = [30];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;
+
+def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
+
+// Nop, not very useful except that it provides a model for nops!
+defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Zero Cycle Move
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn4WriteZeroLatency : SchedWriteRes<[]> {
+ let Latency = 0;
+ let ResourceCycles = [];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteZeroLatency], (instrs MOV32rr, MOV32rr_REV,
+ MOV64rr, MOV64rr_REV,
+ MOVSX32rr32)>;
+
+def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
+ let Latency = 0;
+ let ResourceCycles = [];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSwapRenameable], (instrs XCHG32rr, XCHG32ar,
+ XCHG64rr, XCHG64ar)>;
+
+defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
+
+defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteFMoveZ, [], 0, [], 1>;
+
+defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
+defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
+defm : Zn4WriteResYMM<WriteVecMoveZ, [], 0, [], 1>;
+
+def : IsOptimizableRegisterMove<[
+ InstructionEquivalenceClass<[
+ // GPR variants.
+ MOV32rr, MOV32rr_REV,
+ MOV64rr, MOV64rr_REV,
+ MOVSX32rr32,
+ XCHG32rr, XCHG32ar,
+ XCHG64rr, XCHG64ar,
+
+ // MMX variants.
+ // MMX moves are *NOT* eliminated.
+
+ // SSE variants.
+ MOVAPSrr, MOVAPSrr_REV,
+ MOVUPSrr, MOVUPSrr_REV,
+ MOVAPDrr, MOVAPDrr_REV,
+ MOVUPDrr, MOVUPDrr_REV,
+ MOVDQArr, MOVDQArr_REV,
+ MOVDQUrr, MOVDQUrr_REV,
+
+ // AVX variants.
+ VMOVAPSrr, VMOVAPSrr_REV,
+ VMOVUPSrr, VMOVUPSrr_REV,
+ VMOVAPDrr, VMOVAPDrr_REV,
+ VMOVUPDrr, VMOVUPDrr_REV,
+ VMOVDQArr, VMOVDQArr_REV,
+ VMOVDQUrr, VMOVDQUrr_REV,
+
+ // AVX YMM variants.
+ VMOVAPSYrr, VMOVAPSYrr_REV,
+ VMOVUPSYrr, VMOVUPSYrr_REV,
+ VMOVAPDYrr, VMOVAPDYrr_REV,
+ VMOVUPDYrr, VMOVUPDYrr_REV,
+ VMOVDQAYrr, VMOVDQAYrr_REV,
+ VMOVDQUYrr, VMOVDQUYrr_REV,
+ ], TruePred >
+]>;
+
+// FIXUP and RANGE Instructions
+def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
+ "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
+ "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
+ "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+ )>;
+
+// SCALE & REDUCE instructions
+def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteSCALErr], (instregex
+ "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?|_Intkz?)",
+ "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
+ )>;
+
+// BF16PS Instructions
+def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
+ let Latency = 6;
+ let ResourceCycles = [6];
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn4WriteBF16], (instregex
+ "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
+ )>;
+
+// BUSD and VPMADD Instructions
+def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
+ "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+ "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
+ )>;
+
+// SHIFT instructions
+def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHIFTrr], (instregex
+ "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
+ "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
+ "(V?)P(SLL|SRL|SRA)DQYri",
+ "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
+ "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
+ "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
+ "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
+ "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
+ "VPSHUFBITQMBZ128rr", "VFMSUB231SSZr_Intkz"
+ )>;
+
+def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteSHIFTri], (instregex
+ "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
+ )>;
+
+// ALIGN Instructions
+def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGN], (instregex
+ "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+ )>;
+
+// PACK Instructions
+def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACK], (instregex
+ "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+ )>;
+
+// MAX and MIN Instructions
+def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteFCmp64], (instregex
+ "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
+ "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
+ "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
+ "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
+ )>;
+
+// MOV Instructions
+def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVS], (instregex
+ "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z128?|Z256?)(rr|rrk|rrkz)",
+ "(V?)PMOV(SX|QD|UZ|ZX)(BD|BQ|BW?)(Y|Z128?)(rr|rrk|rrkz)",
+ "(V?)PMOV(SX|US|ZX)(DQ|WD|QW|WQ?)(Y|Z128?)(rr|rrk|rrkz)",
+ "(V?)VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)",
+ "VPMOV(DB|DW|QB|QD|QW|SDB|SDW|SQB|SQD|SQW|SWB|USDB|USDW|USQB|USQD|USWB|WB)(Z128?)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVSZ], (instregex
+ "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Z?)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4MOVSrr], (instregex
+ "(V?)PMOV(DB|QB|QW|SDB|SQB|SQW|USDB|USQB|USQW)(Z?)(rr|rrk|rrkz)"
+ )>;
+
+
+// VPTEST Instructions
+def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 3;
+ let ResourceCycles = [3];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ128], (instregex
+ "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
+ )>;
+
+def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ256], (instregex
+ "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
+ )>;
+
+def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 5;
+ let ResourceCycles = [5];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VPTESTZ], (instregex
+ "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
+ )>;
+
+// CONFLICT Instructions
+def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4CONFLICTZ128], (instregex
+ "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
+ let Latency = 6;
+ let ResourceCycles = [2,2,2];
+ let NumMicroOps = 4;
+}
+def : InstRW<[Zn4CONFLICTrr], (instregex
+ "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
+ )>;
+
+// RSQRT Instructions
+def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 5;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
+ "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
+ )>;
+
+
+// PERM Instructions
+def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMILP], (instregex
+ "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_128], (instregex
+ "VPERM(I2|T2)(PS|PD|W)128(rr|rrk|rrkz)",
+ "VPERM(I2|T2)(B|D|Q)128(rr|rrk|rrkz)"
+ )>;
+
+def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_128rr], (instregex
+ "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
+ "VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
+ )>;
+
+def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2_256], (instregex
+ "VPERM(I2|T2)(PS|PD|W)256(rr|rrk|rrkz)",
+ "VPERMP(S|D)Z256(rr|rrk|rrkz)",
+ "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
+ "VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
+ "VPERM(I2|Q|T2)(B|D|Q)(Z?)256(rr|rrk|rrkz)",
+ "VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
+ )>;
+
+def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 5;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4PERMIT2Z], (instregex
+ "VPERM(I2|T2)(PS|PD|W)(rr|rrk|rrkz)",
+ "VPERM(B|D|W)Z(rr|rrk|rrkz)",
+ "VPERM(I2|Q|T2)(B|D|Q)(Z?)(rr|rrk|rrkz)",
+ "V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
+ "VPEXPAND(B|W)Z(rr|rrk|rrkz)",
+ "VPERMP(S|D)Z(rr|rrk|rrkz)"
+ )>;
+
+// ALU SLOW Misc Instructions
+def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4VecALUZSlow], (instrs
+ VPABSBZ128rr, VPABSBZ128rrk, VPABSBZ128rrkz, VPABSDZ128rr,
+ VPABSDZ128rrk, VPABSDZ128rrkz, VPABSQZ128rr, VPABSQZ128rrk,
+ VPABSQZ128rrkz, VPABSWZ128rr, VPABSWZ128rrk, VPABSWZ128rrkz,
+ VPADDSBZ128rr, VPADDSBZ128rrk, VPADDSBZ128rrkz, VPADDSWZ128rr,
+ VPADDSWZ128rrk, VPADDSWZ128rrkz,VPADDUSBZ128rr, VPADDUSBZ128rrk,
+ VPADDUSBZ128rrkz, VPADDUSWZ128rr, VPADDUSWZ128rrk, VPADDUSWZ128rrkz,
+ VPAVGBZ128rr, VPAVGBZ128rrk, VPAVGBZ128rrkz, VPAVGWZ128rr,
+ VPAVGWZ128rrk, VPAVGWZ128rrkz, VPOPCNTBZ128rr, VPOPCNTBZ128rrk,
+ VPOPCNTBZ128rrkz, VPOPCNTDZ128rr, VPOPCNTDZ128rrk, VPOPCNTDZ128rrkz,
+ VPOPCNTQZ128rr, VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
+ VPOPCNTWZ128rrk, VPOPCNTWZ128rrkz,VPSUBSBZ128rr, VPSUBSBZ128rrk,
+ VPSUBSBZ128rrkz, VPSUBSWZ128rr, VPSUBSWZ128rrk, VPSUBSWZ128rrkz,
+ VPSUBUSBZ128rr, VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
+ VPSUBUSWZ128rrk, VPSUBUSWZ128rrkz
+ )>;
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def Zn4WriteZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn4WriteZeroIdiom], (instrs XOR32rr, XOR32rr_REV,
+ XOR64rr, XOR64rr_REV,
+ SUB32rr, SUB32rr_REV,
+ SUB64rr, SUB64rr_REV)>;
+
+def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteALU]>
+]>;
+def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs CMP8rr, CMP8rr_REV,
+ CMP16rr, CMP16rr_REV,
+ CMP32rr, CMP32rr_REV,
+ CMP64rr, CMP64rr_REV)>;
+
+def Zn4WriteFZeroIdiom : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogic]>
+]>;
+// NOTE: XORPSrr, XORPDrr are not zero-cycle!
+def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
+ VANDNPSrr, VANDNPDrr)>;
+
+def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteFLogicY]>
+]>;
+def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr)>;
+
+def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicX]>
+]>;
+// NOTE: PXORrr, PANDNrr are not zero-cycle!
+def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr, VPANDNrr)>;
+
+def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecLogicY]>
+]>;
+def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr, VPANDNYrr)>;
+
+def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUX]>
+]>;
+// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+// PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
+def : InstRW<[Zn4WriteVZeroIdiomALUX],
+ (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;
+
+def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
+ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
+ SchedVar<NoSchedPred, [WriteVecALUY]>
+]>;
+def : InstRW<[Zn4WriteVZeroIdiomALUY],
+ (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr)>;
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ XOR32rr, XOR32rr_REV,
+ XOR64rr, XOR64rr_REV,
+ SUB32rr, SUB32rr_REV,
+ SUB64rr, SUB64rr_REV ], ZeroIdiomPredicate>,
+
+ // SSE XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+ ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr,
+ PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PSUBSBrr, PSUBSWrr,
+ PSUBUSBrr, PSUBUSWrr,
+ PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSrr, VXORPDrr,
+ VANDNPSrr, VANDNPDrr,
+
+ // int variants.
+ VPXORrr,
+ VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPSUBSBrr, VPSUBSWrr,
+ VPSUBUSBrr, VPSUBUSWrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSYrr, VXORPDYrr,
+ VANDNPSYrr, VANDNPDYrr,
+
+ // int variants.
+ VPXORYrr,
+ VPANDNYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPSUBSBYrr, VPSUBSWYrr,
+ VPSUBUSBYrr, VPSUBUSWYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB32rr_REV,
+ SBB64rr, SBB64rr_REV ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP8rr, CMP8rr_REV,
+ CMP16rr, CMP16rr_REV,
+ CMP32rr, CMP32rr_REV,
+ CMP64rr, CMP64rr_REV ], CheckSameRegOperand<0, 1> >,
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM
+ DepBreakingClass<[
+ VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
+} // SchedModel
+
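The Zn4WriteZeroIdiom / IsZeroIdiomFunction entries above rest on a simple arithmetic fact: when both source operands of an XOR- or SUB-style instruction are the same register, the result no longer depends on that register's old value, so the scheduler model can break the dependency and assign the zero-latency write class. A minimal standalone sketch of that property (plain standard C++, not LLVM code; the loop bound is arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  // Same-operand XOR and SUB -- the GPR zero idioms listed in
  // IsZeroIdiomFunction (XOR32rr, SUB32rr, ...) -- always yield zero,
  // independent of the register's previous contents.
  for (uint32_t X = 0; X < 1000; ++X) {
    assert((X ^ X) == 0u); // e.g. xor eax, eax
    assert((X - X) == 0u); // e.g. sub eax, eax
  }
  return 0;
}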
diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp
index af98ecb122d6..22b985df9302 100644
--- a/llvm/lib/TargetParser/ARMTargetParser.cpp
+++ b/llvm/lib/TargetParser/ARMTargetParser.cpp
@@ -523,7 +523,7 @@ StringRef ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
default:
if (TT.isOSNetBSD())
return "apcs-gnu";
- if (TT.isOSOpenBSD())
+ if (TT.isOSFreeBSD() || TT.isOSOpenBSD())
return "aapcs-linux";
return "aapcs";
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 1480a0ff9e2f..de3095852048 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -3184,16 +3184,6 @@ Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
}
break;
}
- case Instruction::And: {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC))) {
- // If we have ((X & C) == C), turn it into ((X & C) != 0).
- if (C == *BOC && C.isPowerOf2())
- return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- BO, Constant::getNullValue(RHS->getType()));
- }
- break;
- }
case Instruction::UDiv:
if (C.isZero()) {
// (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
@@ -5653,6 +5643,12 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
}
}
}
+
+ // Op0 eq C_Pow2 -> Op0 ne 0 if Op0 is known to be C_Pow2 or zero.
+ if (Op1Known.isConstant() && Op1Known.getConstant().isPowerOf2() &&
+ (Op0Known & Op1Known) == Op0Known)
+ return new ICmpInst(CmpInst::getInversePredicate(Pred), Op0,
+ ConstantInt::getNullValue(Op1->getType()));
break;
}
case ICmpInst::ICMP_ULT: {
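Taken together, the removed Instruction::And special case and the added known-bits fold above express the same equivalence, just more generally: when Op0 is known to be either a power-of-two constant or zero (its only possibly-set bit is that constant's bit), comparing Op0 against the constant is the same as comparing it against zero with the inverted predicate, which is what getInversePredicate produces above. A small standalone check of the equivalence (plain standard C++, not the LLVM API; the mask value is an arbitrary example):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C = 0x8; // power-of-two constant
  for (uint32_t X = 0; X < 256; ++X) {
    uint32_t Op0 = X & C; // known bits: Op0 is either C or 0
    // "Op0 == C" is equivalent to "Op0 != 0" (and "Op0 != C" to "Op0 == 0"),
    // which is the rewrite the new foldICmpUsingKnownBits case performs.
    assert((Op0 == C) == (Op0 != 0));
    assert((Op0 != C) == (Op0 == 0));
  }
  return 0;
}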
diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td
index de7f883d24a8..c6627c75157b 100644
--- a/llvm/tools/llvm-objdump/ObjdumpOpts.td
+++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td
@@ -145,10 +145,10 @@ def reloc : Flag<["--"], "reloc">,
def : Flag<["-"], "r">, Alias<reloc>, HelpText<"Alias for --reloc">;
def print_imm_hex : Flag<["--"], "print-imm-hex">,
- HelpText<"Use hex format for immediate values">;
+ HelpText<"Use hex format for immediate values (default)">;
def no_print_imm_hex : Flag<["--"], "no-print-imm-hex">,
- HelpText<"Do not use hex format for immediate values (default)">;
+ HelpText<"Do not use hex format for immediate values">;
def : Flag<["--"], "print-imm-hex=false">, Alias<no_print_imm_hex>;
def private_headers : Flag<["--"], "private-headers">,