author    Dimitry Andric <dim@FreeBSD.org>  2023-12-09 13:28:42 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2023-12-09 13:28:42 +0000
commit    b1c73532ee8997fe5dfbeb7d223027bdf99758a0 (patch)
tree      7d6e51c294ab6719475d660217aa0c0ad0526292 /openmp
parent    7fa27ce4a07f19b07799a767fc29416f3b625afb (diff)
Vendor import of llvm-project main llvmorg-18-init-14265-ga17671084db1
(vendor/llvm-project/llvmorg-18-init-14265-ga17671084db1)
Diffstat (limited to 'openmp')
-rw-r--r--  openmp/runtime/src/dllexports | 4
-rw-r--r--  openmp/runtime/src/i18n/en_US.txt | 2
-rw-r--r--  openmp/runtime/src/include/omp.h.var | 8
-rw-r--r--  openmp/runtime/src/include/omp_lib.f90.var | 22
-rw-r--r--  openmp/runtime/src/include/omp_lib.h.var | 22
-rw-r--r--  openmp/runtime/src/include/ompx.h.var | 165
-rw-r--r--  openmp/runtime/src/kmp.h | 192
-rw-r--r--  openmp/runtime/src/kmp_affinity.cpp | 364
-rw-r--r--  openmp/runtime/src/kmp_affinity.h | 81
-rw-r--r--  openmp/runtime/src/kmp_barrier.cpp | 2
-rw-r--r--  openmp/runtime/src/kmp_barrier.h | 5
-rw-r--r--  openmp/runtime/src/kmp_collapse.cpp | 85
-rw-r--r--  openmp/runtime/src/kmp_config.h.cmake | 2
-rw-r--r--  openmp/runtime/src/kmp_csupport.cpp | 47
-rw-r--r--  openmp/runtime/src/kmp_dispatch.cpp | 209
-rw-r--r--  openmp/runtime/src/kmp_dispatch.h | 14
-rw-r--r--  openmp/runtime/src/kmp_environment.cpp | 4
-rw-r--r--  openmp/runtime/src/kmp_ftn_entry.h | 29
-rw-r--r--  openmp/runtime/src/kmp_ftn_os.h | 2
-rw-r--r--  openmp/runtime/src/kmp_global.cpp | 7
-rw-r--r--  openmp/runtime/src/kmp_itt.inl | 2
-rw-r--r--  openmp/runtime/src/kmp_lock.cpp | 2
-rw-r--r--  openmp/runtime/src/kmp_os.h | 11
-rw-r--r--  openmp/runtime/src/kmp_platform.h | 22
-rw-r--r--  openmp/runtime/src/kmp_runtime.cpp | 315
-rw-r--r--  openmp/runtime/src/kmp_safe_c_api.h | 4
-rw-r--r--  openmp/runtime/src/kmp_settings.cpp | 388
-rw-r--r--  openmp/runtime/src/kmp_settings.h | 1
-rw-r--r--  openmp/runtime/src/kmp_str.cpp | 15
-rw-r--r--  openmp/runtime/src/kmp_str.h | 1
-rw-r--r--  openmp/runtime/src/kmp_taskdeps.cpp | 59
-rw-r--r--  openmp/runtime/src/kmp_tasking.cpp | 26
-rw-r--r--  openmp/runtime/src/kmp_wrapper_getpid.h | 4
-rw-r--r--  openmp/runtime/src/ompt-event-specific.h | 13
-rw-r--r--  openmp/runtime/src/ompt-specific.cpp | 21
-rw-r--r--  openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h | 12
-rw-r--r--  openmp/runtime/src/z_Linux_asm.S | 352
-rw-r--r--  openmp/runtime/src/z_Linux_util.cpp | 76
-rw-r--r--  openmp/runtime/src/z_Windows_NT_util.cpp | 2
39 files changed, 2042 insertions, 550 deletions
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index f740f29346ae..0d49643709e0 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -518,6 +518,8 @@ kmp_set_warnings_off 780
omp_target_memcpy_rect 887
omp_target_associate_ptr 888
omp_target_disassociate_ptr 889
+ omp_target_memset 3000
+ omp_target_memset_async 3001
%endif
kmp_set_disp_num_buffers 890
@@ -1268,4 +1270,6 @@ kmp_set_disp_num_buffers 890
%endif
+__kmpc_set_thread_limit
+
# end of file #
diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt
index 228bcdb25a8e..08e837d3dea1 100644
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@@ -480,6 +480,8 @@ AffHWSubsetAllFiltered "KMP_HW_SUBSET ignored: all hardware resources woul
AffHWSubsetAttrsNonHybrid "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecture."
AffHWSubsetIgnoringAttr "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecture."
TargetMemNotAvailable "Target memory not available, will use default allocator."
+AffIgnoringNonHybrid "%1$s ignored: This machine is not a hybrid architecture. Using \"%2$s\" instead."
+AffIgnoringNotAvailable "%1$s ignored: %2$s is not available. Using \"%3$s\" instead."
# --------------------------------------------------------------------------------------------------
-*- HINTS -*-
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 1b2c467a2a12..a1488ae9d21c 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -15,6 +15,7 @@
#ifndef __OMP_H
# define __OMP_H
+# include <stddef.h>
# include <stdlib.h>
# include <stdint.h>
@@ -236,6 +237,11 @@
extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *,
const size_t *, const size_t *, const size_t *, const size_t *, int, int,
int, omp_depend_t *);
+
+ /* OpenMP 6.0 device memory routines */
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int);
+ extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *);
+
/*!
* The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device.
*/
@@ -497,7 +503,7 @@
extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void);
/* LLVM Extensions */
- extern void *llvm_omp_target_dynamic_shared_alloc();
+ extern void *llvm_omp_target_dynamic_shared_alloc(void);
# undef __KAI_KMPC_CONVENTION
# undef __KMP_IMP
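
The two routines above are the new OpenMP 6.0 device memory entry points also
exported earlier in dllexports. A minimal host-side sketch of a call sequence
(illustrative only; assumes a usable target device 0 and pairs the new
omp_target_memset with the pre-existing omp_target_alloc/omp_target_free):

    #include <omp.h>

    int main(void) {
      int dev = 0;                        /* assumed device number */
      size_t n = 1024;
      void *p = omp_target_alloc(n, dev); /* OpenMP 5.x device allocation */
      if (p != NULL) {
        omp_target_memset(p, 0, n, dev);  /* zero the device buffer, synchronously */
        omp_target_free(p, dev);
      }
      return 0;
    }
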
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var
index c72287422809..1ca542db3767 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.f90.var
@@ -635,6 +635,28 @@
integer (omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
use omp_lib_kinds
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 9f5e58515e75..d20aade6ef8b 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -732,6 +732,28 @@
integer(omp_depend_kind), optional :: depobj_list(*)
end function omp_target_memcpy_rect_async
+ function omp_target_memset(ptr, val, count, device_num) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ type(c_ptr) :: omp_target_memset
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ end function
+
+ function omp_target_memset_async(ptr, val, count, device_num, &
+ depobj_count, depobj_list) bind(c)
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+ use omp_lib_kinds
+ type(c_ptr) :: omp_target_memset_async
+ type(c_ptr), value :: ptr
+ integer(c_int), value :: val
+ integer(c_size_t), value :: count
+ integer(c_int), value :: device_num
+ integer(c_int), value :: depobj_count
+ integer(omp_depend_kind), optional :: depobj_list(*)
+ end function
+
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
& device_offset, device_num) bind(c)
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
new file mode 100644
index 000000000000..5dd8e8355e4c
--- /dev/null
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPX_H
+#define __OMPX_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int omp_get_ancestor_thread_num(int);
+int omp_get_team_size(int);
+
+#ifdef __cplusplus
+}
+#endif
+
+/// Target kernel language extensions
+///
+/// These extensions exist for the host to allow fallback implementations,
+/// however, they cannot be arbitrarily composed with OpenMP. If the rules of
+/// the kernel language are followed, the host fallbacks should behave as
+/// expected since the kernel is represented as 3 sequential outer loops, one
+/// for each grid dimension, and three (nested) parallel loops, one for each
+/// block dimension. This fallback is not supposed to be optimal and should be
+/// configurable by the user.
+///
+///{
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ ompx_relaxed = __ATOMIC_RELAXED,
+ ompx_aquire = __ATOMIC_ACQUIRE,
+ ompx_release = __ATOMIC_RELEASE,
+ ompx_acq_rel = __ATOMIC_ACQ_REL,
+ ompx_seq_cst = __ATOMIC_SEQ_CST,
+};
+
+enum {
+ ompx_dim_x = 0,
+ ompx_dim_y = 1,
+ ompx_dim_z = 2,
+};
+
+/// ompx_{thread,block}_{id,dim}
+///{
+#pragma omp begin declare variant match(device = {kind(cpu)})
+#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \
+ static inline int ompx_##NAME(int Dim) { return VALUE; }
+
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id,
+ omp_get_ancestor_thread_num(Dim + 1))
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_dim, omp_get_team_size(Dim + 1))
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_id, 0)
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1)
+#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C
+///}
+
+/// ompx_{sync_block}_{,divergent}
+///{
+#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY) \
+ static inline RETTY ompx_##NAME(ARGS) { BODY; }
+
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering,
+ _Pragma("omp barrier"));
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_acq_rel, void,
+ ompx_sync_block(ompx_acq_rel));
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
+ ompx_sync_block(Ordering));
+#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
+///}
+
+#pragma omp end declare variant
+
+/// ompx_{sync_block}_{,divergent}
+///{
+#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
+ RETTY ompx_##NAME(ARGS);
+
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering);
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_acq_rel, void);
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering);
+#undef _TGT_KERNEL_LANGUAGE_DECL_SYNC_C
+///}
+
+/// ompx_{thread,block}_{id,dim}_{x,y,z}
+///{
+#define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME) \
+ int ompx_##NAME(int Dim); \
+ static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \
+ static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \
+ static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
+
+_TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id)
+_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim)
+_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_id)
+_TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
+#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
+///}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+namespace ompx {
+
+enum {
+ dim_x = ompx_dim_x,
+ dim_y = ompx_dim_y,
+ dim_z = ompx_dim_z,
+};
+
+enum {
+ relaxed = ompx_relaxed ,
+ aquire = ompx_aquire,
+ release = ompx_release,
+ acc_rel = ompx_acq_rel,
+ seq_cst = ompx_seq_cst,
+};
+
+/// ompx::{thread,block}_{id,dim}_{,x,y,z}
+///{
+#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME) \
+ static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \
+ static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); } \
+ static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); } \
+ static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); }
+
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id)
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim)
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_id)
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim)
+#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX
+///}
+
+/// ompx_{sync_block}_{,divergent}
+///{
+#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS) \
+ static inline RETTY NAME(ARGS) { \
+ return ompx_##NAME(CALL_ARGS); \
+ }
+
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block, int Ordering = acc_rel,
+ Ordering);
+_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
+ int Ordering = acc_rel, Ordering);
+#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
+///}
+
+} // namespace ompx
+#endif
+
+///}
+
+#endif /* __OMPX_H */
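
This new header (installed as ompx.h) backs the target kernel language
extensions; on the host fallback described in its comment block, block_id is
always 0 and grid_dim is 1, so the CUDA-style global index collapses to the
thread number. A hedged sketch of that host-fallback behavior (compile with
-fopenmp; not part of the import):

    #include <ompx.h>
    #include <stdio.h>

    int main(void) {
    #pragma omp parallel num_threads(4)
      {
        /* Host fallback: ompx_block_id_x() == 0 and ompx_block_dim_x() is
           the team size, so i is just this thread's number in the region. */
        int i = ompx_block_id_x() * ompx_block_dim_x() + ompx_thread_id_x();
        printf("global x index: %d\n", i);
      }
      return 0;
    }
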
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 641d32357ce8..d34adf7cbf8a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -27,6 +27,9 @@
#ifndef KMP_STATIC_STEAL_ENABLED
#define KMP_STATIC_STEAL_ENABLED 1
#endif
+#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
+ (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
+ (KMP_ARCH_X86 || KMP_ARCH_X86_64))
#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
@@ -180,6 +183,7 @@ class kmp_stats_list;
#define KMP_NSEC_PER_SEC 1000000000L
#define KMP_USEC_PER_SEC 1000000L
+#define KMP_NSEC_PER_USEC 1000L
/*!
@ingroup BASIC_TYPES
@@ -690,10 +694,12 @@ extern size_t __kmp_affin_mask_size;
#define KMP_CPU_ISSET(i, mask) (mask)->is_set(i)
#define KMP_CPU_CLR(i, mask) (mask)->clear(i)
#define KMP_CPU_ZERO(mask) (mask)->zero()
+#define KMP_CPU_ISEMPTY(mask) (mask)->empty()
#define KMP_CPU_COPY(dest, src) (dest)->copy(src)
#define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src)
#define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not()
#define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src)
+#define KMP_CPU_EQUAL(dest, src) (dest)->is_equal(src)
#define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask())
#define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr)
#define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr)
@@ -730,6 +736,8 @@ public:
virtual void clear(int i) {}
// Zero out entire mask
virtual void zero() {}
+ // Check whether mask is empty
+ virtual bool empty() const { return true; }
// Copy src into this mask
virtual void copy(const Mask *src) {}
// this &= rhs
@@ -738,6 +746,8 @@ public:
virtual void bitwise_or(const Mask *rhs) {}
// this = ~this
virtual void bitwise_not() {}
+ // this == rhs
+ virtual bool is_equal(const Mask *rhs) const { return false; }
// API for iterating over an affinity mask
// for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
virtual int begin() const { return 0; }
@@ -866,19 +876,16 @@ typedef struct kmp_affinity_flags_t {
unsigned respect : 2;
unsigned reset : 1;
unsigned initialized : 1;
- unsigned reserved : 25;
+ unsigned core_types_gran : 1;
+ unsigned core_effs_gran : 1;
+ unsigned omp_places : 1;
+ unsigned reserved : 22;
} kmp_affinity_flags_t;
KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);
typedef struct kmp_affinity_ids_t {
+ int os_id;
int ids[KMP_HW_LAST];
- int operator[](size_t idx) const { return ids[idx]; }
- int &operator[](size_t idx) { return ids[idx]; }
- kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
- for (int i = 0; i < KMP_HW_LAST; ++i)
- ids[i] = rhs[i];
- return *this;
- }
} kmp_affinity_ids_t;
typedef struct kmp_affinity_attrs_t {
@@ -895,6 +902,7 @@ typedef struct kmp_affinity_t {
enum affinity_type type;
kmp_hw_t gran;
int gran_levels;
+ kmp_affinity_attrs_t core_attr_gran;
int compact;
int offset;
kmp_affinity_flags_t flags;
@@ -909,9 +917,11 @@ typedef struct kmp_affinity_t {
#define KMP_AFFINITY_INIT(env) \
{ \
- nullptr, affinity_default, KMP_HW_UNKNOWN, -1, 0, 0, \
- {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE}, 0, \
- nullptr, nullptr, nullptr, 0, nullptr, env \
+ nullptr, affinity_default, KMP_HW_UNKNOWN, -1, KMP_AFFINITY_ATTRS_UNKNOWN, \
+ 0, 0, \
+ {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE, \
+ FALSE, FALSE, FALSE}, \
+ 0, nullptr, nullptr, nullptr, 0, nullptr, env \
}
extern enum affinity_top_method __kmp_affinity_top_method;
@@ -925,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
extern kmp_affin_mask_t *__kmp_affin_origMask;
extern char *__kmp_cpuinfo_file;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_first_osid_with_ecore;
+#endif
+
#endif /* KMP_AFFINITY_SUPPORTED */
// This needs to be kept in sync with the values in omp.h !!!
@@ -1140,8 +1154,15 @@ extern void __kmp_init_target_task();
#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
#define KMP_MAX_NTH PTHREAD_THREADS_MAX
#else
+#ifdef __ve__
+// VE's pthread supports only up to 64 threads per VE process.
+// Please check p. 14 of the following documentation for more details.
+// https://sxauroratsubasa.sakura.ne.jp/documents/veos/en/VEOS_high_level_design.pdf
+#define KMP_MAX_NTH 64
+#else
#define KMP_MAX_NTH INT_MAX
#endif
+#endif
#endif /* KMP_MAX_NTH */
#ifdef PTHREAD_STACK_MIN
@@ -1157,6 +1178,10 @@ extern void __kmp_init_target_task();
#elif KMP_ARCH_X86_64
#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
+#elif KMP_ARCH_VE
+// Minimum stack size for pthread for VE is 4MB.
+// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm
+#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#else
#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
#endif
@@ -1178,13 +1203,13 @@ extern void __kmp_init_target_task();
#define KMP_MAX_STKPADDING (2 * 1024 * 1024)
#define KMP_BLOCKTIME_MULTIPLIER \
- (1000) /* number of blocktime units per second */
+ (1000000) /* number of blocktime units per second */
#define KMP_MIN_BLOCKTIME (0)
#define KMP_MAX_BLOCKTIME \
(INT_MAX) /* Must be this for "infinite" setting the work */
-/* __kmp_blocktime is in milliseconds */
-#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200))
+/* __kmp_blocktime is in microseconds */
+#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200000))
#if KMP_USE_MONITOR
#define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024))
@@ -1211,22 +1236,21 @@ extern void __kmp_init_target_task();
#if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
// HW TSC is used to reduce overhead (clock tick instead of nanosecond).
extern kmp_uint64 __kmp_ticks_per_msec;
+extern kmp_uint64 __kmp_ticks_per_usec;
#if KMP_COMPILER_ICC || KMP_COMPILER_ICX
#define KMP_NOW() ((kmp_uint64)_rdtsc())
#else
#define KMP_NOW() __kmp_hardware_timestamp()
#endif
-#define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec)
#define KMP_BLOCKTIME_INTERVAL(team, tid) \
- (KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_msec)
+ ((kmp_uint64)KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_usec)
#define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW())
#else
// System time is retrieved sporadically while blocking.
extern kmp_uint64 __kmp_now_nsec();
#define KMP_NOW() __kmp_now_nsec()
-#define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC)
#define KMP_BLOCKTIME_INTERVAL(team, tid) \
- (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC)
+ ((kmp_uint64)KMP_BLOCKTIME(team, tid) * (kmp_uint64)KMP_NSEC_PER_USEC)
#define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW())
#endif
#endif // KMP_USE_MONITOR
@@ -1304,12 +1328,16 @@ extern kmp_uint64 __kmp_now_nsec();
/* TODO: tune for KMP_OS_NETBSD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
+#elif KMP_OS_OPENBSD
+/* TODO: tune for KMP_OS_OPENBSD */
+#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
+#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_HURD
/* TODO: tune for KMP_OS_HURD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
-#elif KMP_OS_OPENBSD
-/* TODO: tune for KMP_OS_OPENBSD */
+#elif KMP_OS_SOLARIS
+/* TODO: tune for KMP_OS_SOLARIS */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#endif
@@ -1822,12 +1850,9 @@ typedef struct kmp_sched_flags {
unsigned ordered : 1;
unsigned nomerge : 1;
unsigned contains_last : 1;
-#if KMP_USE_HIER_SCHED
- unsigned use_hier : 1;
- unsigned unused : 28;
-#else
- unsigned unused : 29;
-#endif
+ unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
+ unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
+ unsigned unused : 27;
} kmp_sched_flags_t;
KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
@@ -1841,26 +1866,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 st;
kmp_int32 tc;
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint32 ordered_lower;
+ kmp_uint32 ordered_upper;
+
// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
// b) all parm1-4 are on the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are on the same cache line (not measured though).
- struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
- kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
- kmp_int32 parm2; // make no real change at least while padding is off.
+ struct KMP_ALIGN(32) {
+ kmp_int32 parm1;
+ kmp_int32 parm2;
kmp_int32 parm3;
kmp_int32 parm4;
};
- kmp_uint32 ordered_lower;
- kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint32 pchunks;
+ kmp_uint32 num_procs_with_pcore;
+ kmp_int32 first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
+#endif
+
typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 count; // current chunk number for static & static-steal scheduling
kmp_int64 ub; /* upper-bound */
@@ -1869,14 +1905,16 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint64 ordered_lower;
+ kmp_uint64 ordered_upper;
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) {
kmp_int64 parm1;
kmp_int64 parm2;
@@ -1884,12 +1922,21 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 parm4;
};
- kmp_uint64 ordered_lower;
- kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint64 pchunks;
+ kmp_uint64 num_procs_with_pcore;
+ kmp_int64 first_thread_with_ecore;
+#endif
+
#if KMP_OS_WINDOWS
kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;
+
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
+#endif
+
#else /* KMP_STATIC_STEAL_ENABLED */
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;
@@ -2099,6 +2146,7 @@ typedef struct kmp_internal_control {
int nproc; /* internal control for #threads for next parallel region (per
thread) */
int thread_limit; /* internal control for thread-limit-var */
+ int task_thread_limit; /* internal control for thread-limit-var of a task*/
int max_active_levels; /* internal control for max_active_levels */
kmp_r_sched_t
sched; /* internal control for runtime schedule {sched,chunk} pair */
@@ -2432,12 +2480,22 @@ typedef struct kmp_depend_info {
union {
kmp_uint8 flag; // flag as an unsigned char
struct { // flag as a set of 8 bits
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ /* Same fields as in the #else branch, but in reverse order */
+ unsigned all : 1;
+ unsigned unused : 3;
+ unsigned set : 1;
+ unsigned mtx : 1;
+ unsigned out : 1;
+ unsigned in : 1;
+#else
unsigned in : 1;
unsigned out : 1;
unsigned mtx : 1;
unsigned set : 1;
unsigned unused : 3;
unsigned all : 1;
+#endif
} flags;
};
} kmp_depend_info_t;
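
The big-endian branch above lists the same bits in reverse so that the raw
flag byte and the bitfield view stay consistent: GCC and Clang allocate
bitfields from the low-order end on little-endian ABIs and from the
high-order end on big-endian ABIs such as the newly supported s390x. A
standalone sketch of the idea, reduced to two fields:

    #include <stdio.h>

    union dep_flags {
      unsigned char flag; /* raw byte view, as in kmp_depend_info_t */
      struct {
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        unsigned rest : 7; /* reversed declaration order on big-endian */
        unsigned in : 1;
    #else
        unsigned in : 1;
        unsigned rest : 7;
    #endif
      } bits;
    };

    int main(void) {
      union dep_flags f = {0};
      f.bits.in = 1;
      /* Prints flag = 0x01 on both endiannesses thanks to the reversal. */
      printf("flag = 0x%02x\n", f.flag);
      return 0;
    }
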
@@ -2587,6 +2645,33 @@ typedef struct kmp_task_stack {
#endif // BUILD_TIED_TASK_STACK
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ /* Same fields as in the #else branch, but in reverse order */
+#if OMPX_TASKGRAPH
+ unsigned reserved31 : 6;
+ unsigned onced : 1;
+#else
+ unsigned reserved31 : 7;
+#endif
+ unsigned native : 1;
+ unsigned freed : 1;
+ unsigned complete : 1;
+ unsigned executing : 1;
+ unsigned started : 1;
+ unsigned team_serial : 1;
+ unsigned tasking_ser : 1;
+ unsigned task_serial : 1;
+ unsigned tasktype : 1;
+ unsigned reserved : 8;
+ unsigned hidden_helper : 1;
+ unsigned detachable : 1;
+ unsigned priority_specified : 1;
+ unsigned proxy : 1;
+ unsigned destructors_thunk : 1;
+ unsigned merged_if0 : 1;
+ unsigned final : 1;
+ unsigned tiedness : 1;
+#else
/* Compiler flags */ /* Total compiler flags must be 16 bits */
unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
unsigned final : 1; /* task is final(1) so execute immediately */
@@ -2622,7 +2707,7 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
#else
unsigned reserved31 : 7; /* reserved for library use */
#endif
-
+#endif
} kmp_tasking_flags_t;
typedef struct kmp_target_data {
@@ -3328,6 +3413,7 @@ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */
extern int __kmp_max_nth;
// maximum total number of concurrently-existing threads in a contention group
extern int __kmp_cg_max_nth;
+extern int __kmp_task_max_nth; // max threads used in a task
extern int __kmp_teams_max_nth; // max threads used in a teams construct
extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and
__kmp_root */
@@ -3339,9 +3425,22 @@ extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is
used (fixed) */
extern int __kmp_tp_cached; /* whether threadprivate cache has been created
(__kmpc_threadprivate_cached()) */
-extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before
+extern int __kmp_dflt_blocktime; /* number of microseconds to wait before
blocking (env setting) */
+extern char __kmp_blocktime_units; /* 'm' or 'u' to note units specified */
extern bool __kmp_wpolicy_passive; /* explicitly set passive wait policy */
+
+// Convert raw blocktime from ms to us if needed.
+static inline void __kmp_aux_convert_blocktime(int *bt) {
+ if (__kmp_blocktime_units == 'm') {
+ if (*bt > INT_MAX / 1000) {
+ *bt = INT_MAX / 1000;
+ KMP_INFORM(MaxValueUsing, "kmp_set_blocktime(ms)", bt);
+ }
+ *bt = *bt * 1000;
+ }
+}
+
#if KMP_USE_MONITOR
extern int
__kmp_monitor_wakeups; /* number of times monitor wakes up per second */
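
With the hunks above, blocktime bookkeeping moves from milliseconds to
microseconds (KMP_BLOCKTIME_MULTIPLIER 1000 -> 1000000, default 200 ->
200000), and __kmp_aux_convert_blocktime clamps legacy millisecond values
before scaling. A standalone restatement of that saturating conversion
(sketch only; the real routine also emits the MaxValueUsing message):

    #include <limits.h>
    #include <stdio.h>

    /* ms -> us with clamping, mirroring __kmp_aux_convert_blocktime. */
    static int blocktime_ms_to_us(int bt) {
      if (bt > INT_MAX / 1000)
        bt = INT_MAX / 1000; /* avoid signed overflow in the multiply */
      return bt * 1000;
    }

    int main(void) {
      printf("%d\n", blocktime_ms_to_us(200));     /* 200000, the old default */
      printf("%d\n", blocktime_ms_to_us(INT_MAX)); /* clamped: 2147483000 */
      return 0;
    }
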
@@ -3589,6 +3688,9 @@ extern void __kmp_warn(char const *format, ...);
extern void __kmp_set_num_threads(int new_nth, int gtid);
+extern bool __kmp_detect_shm();
+extern bool __kmp_detect_tmp();
+
// Returns current thread (pointer to kmp_info_t). Current thread *must* be
// registered.
static inline kmp_info_t *__kmp_entry_thread() {
@@ -3770,7 +3872,8 @@ extern void __kmp_affinity_initialize(kmp_affinity_t &affinity);
extern void __kmp_affinity_uninitialize(void);
extern void __kmp_affinity_set_init_mask(
int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */
-extern void __kmp_affinity_set_place(int gtid);
+void __kmp_affinity_bind_init_mask(int gtid);
+extern void __kmp_affinity_bind_place(int gtid);
extern void __kmp_affinity_determine_capable(const char *env_var);
extern int __kmp_aux_set_affinity(void **mask);
extern int __kmp_aux_get_affinity(void **mask);
@@ -3779,6 +3882,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_get_first_osid_with_ecore(void);
+#endif
#if KMP_OS_LINUX || KMP_OS_FREEBSD
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
@@ -3786,7 +3892,8 @@ static inline void __kmp_assign_root_init_mask() {
int gtid = __kmp_entry_gtid();
kmp_root_t *r = __kmp_threads[gtid]->th.th_root;
if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) {
- __kmp_affinity_set_init_mask(gtid, TRUE);
+ __kmp_affinity_set_init_mask(gtid, /*isa_root=*/TRUE);
+ __kmp_affinity_bind_init_mask(gtid);
r->r.r_affinity_assigned = TRUE;
}
}
@@ -4130,6 +4237,11 @@ KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps(
ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps,
kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
kmp_depend_info_t *noalias_dep_list);
+
+KMP_EXPORT kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task);
+
+KMP_EXPORT kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task);
+
KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid,
kmp_int32 ndeps,
kmp_depend_info_t *dep_list,
@@ -4270,6 +4382,8 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit);
/* Function for OpenMP 5.1 num_teams clause */
KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams_lb,
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index cbb80bf3a848..7009730a49ba 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -38,6 +38,43 @@ static hierarchy_info machine_hierarchy;
void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
+#if KMP_AFFINITY_SUPPORTED
+// Helper class to see if place lists further restrict the fullMask
+class kmp_full_mask_modifier_t {
+ kmp_affin_mask_t *mask;
+
+public:
+ kmp_full_mask_modifier_t() {
+ KMP_CPU_ALLOC(mask);
+ KMP_CPU_ZERO(mask);
+ }
+ ~kmp_full_mask_modifier_t() {
+ KMP_CPU_FREE(mask);
+ mask = nullptr;
+ }
+ void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
+ // If the new full mask is different from the current full mask,
+ // then switch them. Returns true if full mask was affected, false otherwise.
+ bool restrict_to_mask() {
+ // See if the new mask further restricts or changes the full mask
+ if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
+ return false;
+ return __kmp_topology->restrict_to_mask(mask);
+ }
+};
+
+static inline const char *
+__kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
+ bool for_binding = false) {
+ if (affinity.flags.omp_places) {
+ if (for_binding)
+ return "OMP_PROC_BIND";
+ return "OMP_PLACES";
+ }
+ return affinity.env_var;
+}
+#endif // KMP_AFFINITY_SUPPORTED
+
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
kmp_uint32 depth;
// The test below is true if affinity is available, but set to "none". Need to
@@ -207,6 +244,8 @@ void kmp_hw_thread_t::print() const {
if (attrs.is_core_eff_valid())
printf(" (eff=%d)", attrs.get_core_eff());
}
+ if (leader)
+ printf(" (leader)");
printf("\n");
}
@@ -797,7 +836,40 @@ void kmp_topology_t::print(const char *env_var) const {
#if KMP_AFFINITY_SUPPORTED
void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
- const char *env_var = affinity.env_var;
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
+  // If hybrid CPU attributes were requested for granularity (via either
+  // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity
+  // and have the method below select a granularity and warn the user.
+ if (!__kmp_is_hybrid_cpu()) {
+ if (affinity.core_attr_gran.valid) {
+ // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
+ // instead
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ } else if (affinity.flags.core_types_gran ||
+ affinity.flags.core_effs_gran) {
+ // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
+ if (affinity.flags.omp_places) {
+ KMP_AFF_WARNING(
+ affinity, AffIgnoringNonHybrid, env_var,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
+ } else {
+ // KMP_AFFINITY=granularity=core_type|core_eff,...
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
+ "Intel(R) Hybrid Technology core attribute",
+ __kmp_hw_get_catalog_string(KMP_HW_CORE));
+ }
+ affinity.gran = KMP_HW_CORE;
+ affinity.gran_levels = -1;
+ affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
+ affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
+ }
+ }
// Set the number of affinity granularity levels
if (affinity.gran_levels < 0) {
kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
@@ -937,6 +1009,7 @@ public:
}
};
+#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
bool plural) {
@@ -952,6 +1025,41 @@ __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
return buf;
}
+bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
+ // Apply the filter
+ bool affected;
+ int new_index = 0;
+ for (int i = 0; i < num_hw_threads; ++i) {
+ int os_id = hw_threads[i].os_id;
+ if (KMP_CPU_ISSET(os_id, mask)) {
+ if (i != new_index)
+ hw_threads[new_index] = hw_threads[i];
+ new_index++;
+ } else {
+ KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
+ __kmp_avail_proc--;
+ }
+ }
+
+ KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
+ affected = (num_hw_threads != new_index);
+ num_hw_threads = new_index;
+
+ // Post hardware subset canonicalization
+ if (affected) {
+ _gather_enumeration_information();
+ _discover_uniformity();
+ _set_globals();
+ _set_last_level_cache();
+#if KMP_OS_WINDOWS
+ // Copy filtered full mask if topology has single processor group
+ if (__kmp_num_proc_groups <= 1)
+#endif
+ __kmp_affin_origMask->copy(__kmp_affin_fullMask);
+ }
+ return affected;
+}
+
// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
@@ -1156,7 +1264,9 @@ bool kmp_topology_t::filter_hw_subset() {
// Determine which hardware threads should be filtered.
int num_filtered = 0;
- bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads);
+ kmp_affin_mask_t *filtered_mask;
+ KMP_CPU_ALLOC(filtered_mask);
+ KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
// Update type_sub_id
@@ -1218,51 +1328,35 @@ bool kmp_topology_t::filter_hw_subset() {
}
}
// Collect filtering information
- filtered[i] = should_be_filtered;
- if (should_be_filtered)
+ if (should_be_filtered) {
+ KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
num_filtered++;
+ }
}
// One last check that we shouldn't allow filtering entire machine
if (num_filtered == num_hw_threads) {
KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
- __kmp_free(filtered);
return false;
}
// Apply the filter
- int new_index = 0;
- for (int i = 0; i < num_hw_threads; ++i) {
- if (!filtered[i]) {
- if (i != new_index)
- hw_threads[new_index] = hw_threads[i];
- new_index++;
- } else {
-#if KMP_AFFINITY_SUPPORTED
- KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask);
-#endif
- __kmp_avail_proc--;
- }
- }
-
- KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
- num_hw_threads = new_index;
-
- // Post hardware subset canonicalization
- _gather_enumeration_information();
- _discover_uniformity();
- _set_globals();
- _set_last_level_cache();
- __kmp_free(filtered);
+ restrict_to_mask(filtered_mask);
return true;
}
-bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
+bool kmp_topology_t::is_close(int hwt1, int hwt2,
+ const kmp_affinity_t &stgs) const {
+ int hw_level = stgs.gran_levels;
if (hw_level >= depth)
return true;
bool retval = true;
const kmp_hw_thread_t &t1 = hw_threads[hwt1];
const kmp_hw_thread_t &t2 = hw_threads[hwt2];
+ if (stgs.flags.core_types_gran)
+ return t1.attrs.get_core_type() == t2.attrs.get_core_type();
+ if (stgs.flags.core_effs_gran)
+ return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
for (int i = 0; i < (depth - hw_level); ++i) {
if (t1.ids[i] != t2.ids[i])
return false;
@@ -1272,8 +1366,6 @@ bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
////////////////////////////////////////////////////////////////////////////////
-#if KMP_AFFINITY_SUPPORTED
-
bool KMPAffinity::picked_api = false;
void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
@@ -2718,7 +2810,7 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
// Set the array sizes for the hierarchy layers
static void __kmp_dispatch_set_hierarchy_values() {
// Set the maximum number of L1's to number of cores
- // Set the maximum number of L2's to to either number of cores / 2 for
+ // Set the maximum number of L2's to either number of cores / 2 for
// Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
// Or the number of cores for Intel(R) Xeon(R) processors
// Set the maximum number of NUMA nodes and L3's to number of packages
@@ -2898,6 +2990,9 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
unsigned num_avail = 0;
*line = 0;
+#if KMP_ARCH_S390X
+ bool reading_s390x_sys_info = true;
+#endif
while (!feof(f)) {
// Create an inner scoping level, so that all the goto targets at the end of
// the loop appear in an outer scoping level. This avoids warnings about
@@ -2943,8 +3038,21 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
if (*buf == '\n' && *line == 2)
continue;
#endif
+#if KMP_ARCH_S390X
+ // s390x /proc/cpuinfo starts with a variable number of lines containing
+ // the overall system information. Skip them.
+ if (reading_s390x_sys_info) {
+ if (*buf == '\n')
+ reading_s390x_sys_info = false;
+ continue;
+ }
+#endif
+#if KMP_ARCH_S390X
+ char s1[] = "cpu number";
+#else
char s1[] = "processor";
+#endif
if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s1) - 1, ':');
@@ -2970,6 +3078,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
threadInfo[num_avail][osIdIndex]);
__kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
+#if KMP_ARCH_S390X
+ // Disambiguate physical_package_id.
+ unsigned book_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/book_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &book_id);
+ threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
+
+ unsigned drawer_id;
+ KMP_SNPRINTF(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
+ threadInfo[num_avail][osIdIndex]);
+ __kmp_read_from_file(path, "%u", &drawer_id);
+ threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
+#endif
+
KMP_SNPRINTF(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/core_id",
threadInfo[num_avail][osIdIndex]);
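
On s390x the physical_package_id alone is ambiguous, so the hunk above folds
the book and drawer ids into spare high bits of the package id. A standalone
illustration of the packing with hypothetical values:

    #include <stdio.h>

    int main(void) {
      unsigned pkg_id = 2, book_id = 1, drawer_id = 3; /* hypothetical ids */
      unsigned packed = pkg_id | (book_id << 8) | (drawer_id << 16);
      printf("packed package id = 0x%06x\n", packed); /* 0x030102 */
      return 0;
    }
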
@@ -3224,7 +3349,7 @@ restart_radix_check:
return false;
}
- // If the thread ids were not specified and we see entries entries that
+ // If the thread ids were not specified and we see entries that
// are duplicates, start the loop over and assign the thread ids manually.
assign_thread_ids = true;
goto restart_radix_check;
@@ -3353,17 +3478,25 @@ restart_radix_check:
// Create and return a table of affinity masks, indexed by OS thread ID.
// This routine handles OR'ing together all the affinity masks of threads
// that are sufficiently close, if granularity > fine.
+template <typename FindNextFunctionType>
static void __kmp_create_os_id_masks(unsigned *numUnique,
- kmp_affinity_t &affinity) {
+ kmp_affinity_t &affinity,
+ FindNextFunctionType find_next) {
// First form a table of affinity masks in order of OS thread id.
int maxOsId;
int i;
int numAddrs = __kmp_topology->get_num_hw_threads();
int depth = __kmp_topology->get_depth();
- const char *env_var = affinity.env_var;
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
KMP_ASSERT(numAddrs);
KMP_ASSERT(depth);
+ i = find_next(-1);
+  // If no HW thread with the requested attributes could be found, return and
+  // fall back to the incrementing find_next, disregarding core attributes.
+ if (i >= numAddrs)
+ return;
+
maxOsId = 0;
for (i = numAddrs - 1;; --i) {
int osId = __kmp_topology->at(i).os_id;
@@ -3393,19 +3526,22 @@ static void __kmp_create_os_id_masks(unsigned *numUnique,
kmp_affin_mask_t *sum;
KMP_CPU_ALLOC_ON_STACK(sum);
KMP_CPU_ZERO(sum);
- KMP_CPU_SET(__kmp_topology->at(0).os_id, sum);
- for (i = 1; i < numAddrs; i++) {
+
+ i = j = leader = find_next(-1);
+ KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
+ kmp_full_mask_modifier_t full_mask;
+ for (i = find_next(i); i < numAddrs; i = find_next(i)) {
// If this thread is sufficiently close to the leader (within the
// granularity setting), then set the bit for this os thread in the
// affinity mask for this group, and go on to the next thread.
- if (__kmp_topology->is_close(leader, i, affinity.gran_levels)) {
+ if (__kmp_topology->is_close(leader, i, affinity)) {
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
continue;
}
// For every thread in this group, copy the mask to the thread's entry in
// the OS Id mask table. Mark the first address as a leader.
- for (; j < i; j++) {
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
@@ -3416,22 +3552,29 @@ static void __kmp_create_os_id_masks(unsigned *numUnique,
// Start a new mask.
leader = i;
+ full_mask.include(sum);
KMP_CPU_ZERO(sum);
KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
}
// For every thread in last group, copy the mask to the thread's
// entry in the OS Id mask table.
- for (; j < i; j++) {
+ for (; j < i; j = find_next(j)) {
int osId = __kmp_topology->at(j).os_id;
KMP_DEBUG_ASSERT(osId <= maxOsId);
kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
KMP_CPU_COPY(mask, sum);
__kmp_topology->at(j).leader = (j == leader);
}
+ full_mask.include(sum);
unique++;
KMP_CPU_FREE_FROM_STACK(sum);
+ // See if the OS Id mask table further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
+
*numUnique = unique;
}
@@ -4053,7 +4196,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
// Initialize ids and attrs thread data
for (int i = 0; i < KMP_HW_LAST; ++i)
- ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
// Iterate through each os id within the mask and determine
@@ -4062,19 +4205,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
int depth = __kmp_topology->get_depth();
KMP_CPU_SET_ITERATE(cpu, mask) {
int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+ ids.os_id = cpu;
const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
for (int level = 0; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
int id = hw_thread.sub_ids[level];
- if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
- ids[type] = id;
+ if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+ ids.ids[type] = id;
} else {
// This mask spans across multiple topology units, set it as such
// and mark every level below as such as well.
- ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
for (; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
- ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
}
}
}
@@ -4134,8 +4278,11 @@ static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
}
// Create the OS proc to hardware thread map
- for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread)
- __kmp_osid_to_hwthread_map[__kmp_topology->at(hw_thread).os_id] = hw_thread;
+ for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
+ int os_id = __kmp_topology->at(hw_thread).os_id;
+ if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
+ __kmp_osid_to_hwthread_map[os_id] = hw_thread;
+ }
for (unsigned i = 0; i < affinity.num_masks; ++i) {
kmp_affinity_ids_t &ids = affinity.ids[i];
@@ -4145,16 +4292,29 @@ static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
}
}
+// Called when __kmp_topology is ready
+static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
+ // Initialize other data structures which depend on the topology
+ if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
+ machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
+ __kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
+ }
+}
+
// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
KMP_ASSERT(affinity.type == affinity_none);
+ KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
affinity.num_masks = 1;
KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
KMP_CPU_COPY(dest, __kmp_affin_fullMask);
- __kmp_affinity_get_topology_info(affinity);
+ __kmp_aux_affinity_initialize_other_data(affinity);
}
static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
@@ -4383,13 +4543,6 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
if (verbose)
__kmp_topology->print(env_var);
bool filtered = __kmp_topology->filter_hw_subset();
- if (filtered) {
-#if KMP_OS_WINDOWS
- // Copy filtered full mask if topology has single processor group
- if (__kmp_num_proc_groups <= 1)
-#endif
- __kmp_affin_origMask->copy(__kmp_affin_fullMask);
- }
if (filtered && verbose)
__kmp_topology->print("KMP_HW_SUBSET");
return success;
@@ -4398,7 +4551,7 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
bool is_regular_affinity = (&affinity == &__kmp_affinity);
bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
- const char *env_var = affinity.env_var;
+ const char *env_var = __kmp_get_affinity_env_var(affinity);
if (affinity.flags.initialized) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
@@ -4411,8 +4564,6 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
if (is_regular_affinity && !__kmp_topology) {
bool success = __kmp_aux_affinity_initialize_topology(affinity);
if (success) {
- // Initialize other data structures which depend on the topology
- machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
} else {
affinity.type = affinity_none;
@@ -4437,7 +4588,36 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
// Create the table of masks, indexed by thread Id.
unsigned numUnique;
- __kmp_create_os_id_masks(&numUnique, affinity);
+ int numAddrs = __kmp_topology->get_num_hw_threads();
+ // If OMP_PLACES=cores:<attribute> specified, then attempt
+ // to make OS Id mask table using those attributes
+ if (affinity.core_attr_gran.valid) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
+ KMP_ASSERT(idx >= -1);
+ for (int i = idx + 1; i < numAddrs; ++i)
+ if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
+ return i;
+ return numAddrs;
+ });
+ if (!affinity.os_id_masks) {
+ const char *core_attribute;
+ if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
+ core_attribute = "core_efficiency";
+ else
+ core_attribute = "core_type";
+ KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
+ core_attribute,
+ __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
+ }
+ }
+ // If core attributes did not work, or none were specified,
+ // then make OS Id mask table using typical incremental way.
+ if (!affinity.os_id_masks) {
+ __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
+ KMP_ASSERT(idx >= -1);
+ return idx + 1;
+ });
+ }
if (affinity.gran_levels == 0) {
KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
}
@@ -4578,6 +4758,7 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
int i;
unsigned j;
int num_hw_threads = __kmp_topology->get_num_hw_threads();
+ kmp_full_mask_modifier_t full_mask;
for (i = 0, j = 0; i < num_hw_threads; i++) {
if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
continue;
@@ -4588,11 +4769,16 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
KMP_CPU_COPY(dest, src);
+ full_mask.include(src);
if (++j >= affinity.num_masks) {
break;
}
}
KMP_DEBUG_ASSERT(j == affinity.num_masks);
+ // See if the places list further restricts or changes the full mask
+ if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
+ __kmp_topology->print(env_var);
+ }
}
// Sort the topology back using ids
__kmp_topology->sort_ids();
@@ -4601,7 +4787,7 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
default:
KMP_ASSERT2(0, "Unexpected affinity setting");
}
- __kmp_affinity_get_topology_info(affinity);
+ __kmp_aux_affinity_initialize_other_data(affinity);
affinity.flags.initialized = TRUE;
}
@@ -4694,7 +4880,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
// Set the thread topology information to default of unknown
for (int id = 0; id < KMP_HW_LAST; ++id)
- th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+ th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
if (!KMP_AFFINITY_CAPABLE()) {
@@ -4715,14 +4901,12 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
kmp_affin_mask_t *mask;
int i;
const kmp_affinity_t *affinity;
- const char *env_var;
bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
if (is_hidden_helper)
affinity = &__kmp_hh_affinity;
else
affinity = &__kmp_affinity;
- env_var = affinity->env_var;
if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
if ((affinity->type == affinity_none) ||
@@ -4772,19 +4956,34 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
}
if (i == KMP_PLACE_ALL) {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
gtid));
} else {
- KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
+ KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
gtid, i));
}
KMP_CPU_COPY(th->th.th_affin_mask, mask);
+}
+void __kmp_affinity_bind_init_mask(int gtid) {
+ if (!KMP_AFFINITY_CAPABLE()) {
+ return;
+ }
+ kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
+ const kmp_affinity_t *affinity;
+ const char *env_var;
+ bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
+
+ if (is_hidden_helper)
+ affinity = &__kmp_hh_affinity;
+ else
+ affinity = &__kmp_affinity;
+ env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
/* to avoid duplicate printing (will be correctly printed on barrier) */
- if (affinity->flags.verbose &&
- (affinity->type == affinity_none ||
- (i != KMP_PLACE_ALL && affinity->type != affinity_balanced)) &&
+ if (affinity->flags.verbose && (affinity->type == affinity_none ||
+ (th->th.th_current_place != KMP_PLACE_ALL &&
+ affinity->type != affinity_balanced)) &&
!KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
@@ -4804,7 +5003,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
-void __kmp_affinity_set_place(int gtid) {
+void __kmp_affinity_bind_place(int gtid) {
// Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
return;
@@ -4812,7 +5011,7 @@ void __kmp_affinity_set_place(int gtid) {
kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
- KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
+ KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
"place = %d)\n",
gtid, th->th.th_new_place, th->th.th_current_place));
@@ -4834,9 +5033,6 @@ void __kmp_affinity_set_place(int gtid) {
KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
KMP_CPU_COPY(th->th.th_affin_mask, mask);
th->th.th_current_place = th->th.th_new_place;
- // Copy topology information associated with the place
- th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
- th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
if (__kmp_affinity.flags.verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
@@ -5081,6 +5277,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns first os proc id with ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+ int low = 0;
+ int high = __kmp_topology->get_num_hw_threads() - 1;
+ int mid = 0;
+ while (high - low > 1) {
+ mid = (high + low) / 2;
+ if (__kmp_topology->at(mid).attrs.get_core_type() ==
+ KMP_HW_CORE_TYPE_CORE) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+ return mid;
+ }
+ return -1;
+}
+#endif
+
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index f27dd9a5339e..5464259784e2 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -34,6 +34,7 @@ public:
bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
void clear(int i) override { hwloc_bitmap_clr(mask, i); }
void zero() override { hwloc_bitmap_zero(mask); }
+ bool empty() const override { return hwloc_bitmap_iszero(mask); }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
hwloc_bitmap_copy(mask, convert->mask);
@@ -47,6 +48,10 @@ public:
hwloc_bitmap_or(mask, mask, convert->mask);
}
void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ return hwloc_bitmap_isequal(mask, convert->mask);
+ }
int begin() const override { return hwloc_bitmap_first(mask); }
int end() const override { return -1; }
int next(int previous) const override {
@@ -281,6 +286,28 @@ public:
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_VE
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 203
+#elif __NR_sched_setaffinity != 203
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 204
+#elif __NR_sched_getaffinity != 204
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
+#elif KMP_ARCH_S390X
+#ifndef __NR_sched_setaffinity
+#define __NR_sched_setaffinity 239
+#elif __NR_sched_setaffinity != 239
+#error Wrong code for setaffinity system call.
+#endif /* __NR_sched_setaffinity */
+#ifndef __NR_sched_getaffinity
+#define __NR_sched_getaffinity 240
+#elif __NR_sched_getaffinity != 240
+#error Wrong code for getaffinity system call.
+#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
@@ -319,6 +346,13 @@ class KMPNativeAffinity : public KMPAffinity {
for (mask_size_type i = 0; i < e; ++i)
mask[i] = (mask_t)0;
}
+ bool empty() const override {
+ mask_size_type e = get_num_mask_types();
+ for (mask_size_type i = 0; i < e; ++i)
+ if (mask[i] != (mask_t)0)
+ return false;
+ return true;
+ }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
mask_size_type e = get_num_mask_types();
@@ -342,6 +376,14 @@ class KMPNativeAffinity : public KMPAffinity {
for (mask_size_type i = 0; i < e; ++i)
mask[i] = ~(mask[i]);
}
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ mask_size_type e = get_num_mask_types();
+ for (mask_size_type i = 0; i < e; ++i)
+ if (mask[i] != convert->mask[i])
+ return false;
+ return true;
+ }
int begin() const override {
int retval = 0;
while (retval < end() && !is_set(retval))
@@ -459,6 +501,12 @@ class KMPNativeAffinity : public KMPAffinity {
for (int i = 0; i < __kmp_num_proc_groups; ++i)
mask[i] = 0;
}
+ bool empty() const override {
+ for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+ if (mask[i])
+ return false;
+ return true;
+ }
void copy(const KMPAffinity::Mask *src) override {
const Mask *convert = static_cast<const Mask *>(src);
for (int i = 0; i < __kmp_num_proc_groups; ++i)
@@ -478,6 +526,13 @@ class KMPNativeAffinity : public KMPAffinity {
for (int i = 0; i < __kmp_num_proc_groups; ++i)
mask[i] = ~(mask[i]);
}
+ bool is_equal(const KMPAffinity::Mask *rhs) const override {
+ const Mask *convert = static_cast<const Mask *>(rhs);
+ for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
+ if (mask[i] != convert->mask[i])
+ return false;
+ return true;
+ }
int begin() const override {
int retval = 0;
while (retval < end() && !is_set(retval))
@@ -679,6 +734,21 @@ struct kmp_hw_attr_t {
}
return false;
}
+#if KMP_AFFINITY_SUPPORTED
+ bool contains(const kmp_affinity_attrs_t &attr) const {
+ if (!valid && !attr.valid)
+ return true;
+ if (valid && attr.valid) {
+ if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
+ return (is_core_type_valid() &&
+ (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
+ if (attr.core_eff != UNKNOWN_CORE_EFF)
+ return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
+ return true;
+ }
+ return false;
+ }
+#endif // KMP_AFFINITY_SUPPORTED
bool operator==(const kmp_hw_attr_t &rhs) const {
return (rhs.valid == valid && rhs.core_eff == core_eff &&
rhs.core_type == core_type);
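
Read off the new contains() member above: an invalid attribute acts as "no constraint", so the cases reduce to

    this->valid  attr.valid  result
    -----------  ----------  ------
    false        false       true   (nothing to match on either side)
    true         false       false
    false        true        false
    true         true        attr.core_type set -> core types must match
    true         true        only core_eff set  -> core efficiencies must match
    true         true        neither set        -> true
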
@@ -834,13 +904,18 @@ public:
#if KMP_AFFINITY_SUPPORTED
// Set the granularity for affinity settings
void set_granularity(kmp_affinity_t &stgs) const;
-#endif
+ bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
+ bool restrict_to_mask(const kmp_affin_mask_t *mask);
bool filter_hw_subset();
- bool is_close(int hwt1, int hwt2, int level) const;
+#endif
bool is_uniform() const { return flags.uniform; }
// Tell whether a type is a valid type in the topology
// returns KMP_HW_UNKNOWN when there is no equivalent type
- kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
+ kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
+ if (type == KMP_HW_UNKNOWN)
+ return KMP_HW_UNKNOWN;
+ return equivalent[type];
+ }
// Set type1 = type2
void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index bf56c7884970..281b8e9c2883 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -2591,7 +2591,7 @@ void __kmp_fork_barrier(int gtid, int tid) {
__kmp_gtid_from_thread(this_thr),
this_thr->th.th_current_place));
} else {
- __kmp_affinity_set_place(gtid);
+ __kmp_affinity_bind_place(gtid);
}
}
#endif // KMP_AFFINITY_SUPPORTED
diff --git a/openmp/runtime/src/kmp_barrier.h b/openmp/runtime/src/kmp_barrier.h
index ac28a13217e9..ae9b8d62f4c3 100644
--- a/openmp/runtime/src/kmp_barrier.h
+++ b/openmp/runtime/src/kmp_barrier.h
@@ -21,7 +21,10 @@
#define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment)
#define KMP_ALIGNED_FREE(ptr) _mm_free(ptr)
#elif KMP_HAVE_ALIGNED_ALLOC
-#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size)
+#define KMP_ALIGN_UP(val, alignment) \
+  (((val) + (alignment)-1) / (alignment) * (alignment))
+#define KMP_ALIGNED_ALLOCATE(size, alignment) \
+  aligned_alloc(alignment, KMP_ALIGN_UP(size, alignment))
#define KMP_ALIGNED_FREE(ptr) free(ptr)
#elif KMP_HAVE_POSIX_MEMALIGN
static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) {
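
The round-up wrapper above matters because C11's aligned_alloc requires size to be an integral multiple of alignment; passing a raw sizeof directly is not guaranteed to work when the type is not already a multiple of the cache-line size. A small self-contained illustration of the same idiom (standard C++17 calls, not the runtime's macros):

    #include <cstdio>
    #include <cstdlib>

    // Round val up to the next multiple of alignment, as the macro above does.
    static size_t align_up(size_t val, size_t alignment) {
      return (val + alignment - 1) / alignment * alignment;
    }

    int main() {
      size_t raw = 40, alignment = 64;
      size_t padded = align_up(raw, alignment); // 40 -> 64
      void *p = std::aligned_alloc(alignment, padded); // legal: 64 % 64 == 0
      std::printf("requested %zu, allocated %zu bytes\n", raw, padded);
      std::free(p);
      return 0;
    }
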
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 8d0ed0e945c0..2c410ca9b603 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -27,7 +27,7 @@
 // avoid inadvertently using a library-based abs
template <typename T> T __kmp_abs(const T val) {
- return (val < 0) ? -val: val;
+ return (val < 0) ? -val : val;
}
kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
@@ -36,7 +36,34 @@ kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
// Common functions for working with rectangular and non-rectangular loops
//----------------------------------------------------------------------------
-template <typename T> int __kmp_sign(T val) { return (T(0) < val) - (val < T(0)); }
+template <typename T> int __kmp_sign(T val) {
+ return (T(0) < val) - (val < T(0));
+}
+
+template <typename T> class CollapseAllocator {
+ typedef T *pT;
+
+private:
+ static const size_t allocaSize = 32; // size limit for stack allocations
+ // (8 bytes x 4 nested loops)
+ char stackAlloc[allocaSize];
+ static constexpr size_t maxElemCount = allocaSize / sizeof(T);
+ pT pTAlloc;
+
+public:
+ CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
+ if (n > maxElemCount) {
+ pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
+ }
+ }
+ ~CollapseAllocator() {
+ if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
+ __kmp_free(pTAlloc);
+ }
+ }
+ T &operator[](int index) { return pTAlloc[index]; }
+ operator const pT() { return pTAlloc; }
+};
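
CollapseAllocator is a small-buffer RAII helper: nests of up to four 8-byte induction variables fit in the 32-byte stack buffer, deeper nests transparently fall back to heap allocation, and the destructor frees only the heap case. This is what lets the __kmp_free calls disappear from every early-return path below. A hedged sketch of the same pattern using plain new/delete (the runtime uses __kmp_allocate/__kmp_free instead, and the char buffer assumes T needs no stricter alignment, which holds for the runtime's POD uses):

    template <typename T> class SmallBufAllocator {
      static const size_t kStackBytes = 32; // same budget as above
      char stack_[kStackBytes];
      T *p_;

    public:
      explicit SmallBufAllocator(size_t n)
          : p_(reinterpret_cast<T *>(stack_)) {
        if (n > kStackBytes / sizeof(T))
          p_ = new T[n]; // heap fallback for deep nests
      }
      ~SmallBufAllocator() {
        if (p_ != reinterpret_cast<T *>(stack_))
          delete[] p_; // the stack case needs no cleanup
      }
      T &operator[](size_t i) { return p_[i]; }
    };

    // Usage mirrors the call sites below: no explicit free on any return path.
    //   SmallBufAllocator<unsigned long long> iterations(n);
    //   iterations[0] = 0;
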
//----------Loop canonicalization---------------------------------------------
@@ -463,8 +490,7 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
/*out*/ kmp_uint64 *original_ivs,
kmp_index_t n) {
- kmp_iterations_t iterations =
- (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
// First, calc corresponding iteration in every original loop:
for (kmp_index_t ind = n; ind > 0;) {
@@ -485,7 +511,6 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
}
- __kmp_free(iterations);
}
//----------------------------------------------------------------------------
@@ -924,9 +949,7 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
/*out*/ kmp_point_t original_ivs) {
// Iterations in the original space, multiplied by step:
- kmp_iterations_t iterations =
- (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
for (kmp_index_t ind = n; ind > 0;) {
--ind;
iterations[ind] = 0;
@@ -936,7 +959,6 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
/*in/out*/ original_ivs,
/*in/out*/ iterations, 0);
- __kmp_free(iterations);
return b;
}
@@ -948,9 +970,7 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
kmp_index_t n, const kmp_point_t original_ivs,
/*out*/ kmp_point_t next_original_ivs) {
// Iterations in the original space, multiplied by step (so can be negative):
- kmp_iterations_t iterations =
- (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
// First, calc corresponding iteration in every original loop:
for (kmp_index_t ind = 0; ind < n; ++ind) {
auto bounds = &(original_bounds_nest[ind]);
@@ -969,7 +989,6 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
bool b = kmp_calc_original_ivs_from_iterations(
original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
- __kmp_free(iterations);
return b;
}
@@ -1132,9 +1151,7 @@ bool kmp_calc_original_ivs_for_chunk_end(
/*out*/ kmp_point_t original_ivs) {
// Iterations in the expanded space:
- kmp_iterations_t iterations =
- (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n);
-
+ CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
// First, calc corresponding iteration in every modified loop:
for (kmp_index_t ind = n; ind > 0;) {
--ind;
@@ -1166,7 +1183,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
// Too big (or too small for >=).
if (ind == 0) {
// Need to reduce to the end.
- __kmp_free(iterations);
return false;
} else {
// Go to next iteration on outer loop:
@@ -1197,7 +1213,6 @@ bool kmp_calc_original_ivs_for_chunk_end(
++ind;
}
- __kmp_free(iterations);
return true;
}
@@ -1291,9 +1306,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
- bounds_info_internal_t *updated_bounds_nest =
- (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) *
- n);
+ CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
for (kmp_index_t i = 0; i < n; ++i) {
updated_bounds_nest[i].b = original_bounds_nest[i];
@@ -1308,7 +1321,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
if (total == 0) {
// Loop won't execute:
- __kmp_free(updated_bounds_nest);
return FALSE;
}
@@ -1322,20 +1334,11 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
KMP_DEBUG_ASSERT(tid < nth);
- kmp_point_t original_ivs_start =
- (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
- kmp_point_t original_ivs_end =
- (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
- kmp_point_t original_ivs_next_start =
- (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n);
+ CollapseAllocator<kmp_uint64> original_ivs_start(n);
if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
/*out*/ original_ivs_start)) {
// Loop won't execute:
- __kmp_free(updated_bounds_nest);
- __kmp_free(original_ivs_start);
- __kmp_free(original_ivs_end);
- __kmp_free(original_ivs_next_start);
return FALSE;
}
@@ -1354,10 +1357,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
// if (plastiter != NULL) {
// *plastiter = TRUE;
// }
- // __kmp_free(updated_bounds_nest);
- // __kmp_free(original_ivs_start);
- // __kmp_free(original_ivs_end);
- // __kmp_free(original_ivs_next_start);
// return TRUE;
//}
@@ -1391,6 +1390,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
new_iv += curr_chunk_size - 1;
}
+ CollapseAllocator<kmp_uint64> original_ivs_end(n);
if ((nth == 1) || (new_iv >= total - 1)) {
// Do this one till the end - just in case we miscalculated
// and either too much is left to process or new_iv is a bit too big:
@@ -1421,10 +1421,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
if (last_iter && (tid != 0)) {
// We are done, this was last chunk, but no chunk for current thread was
// found:
- __kmp_free(updated_bounds_nest);
- __kmp_free(original_ivs_start);
- __kmp_free(original_ivs_end);
- __kmp_free(original_ivs_next_start);
return FALSE;
}
@@ -1432,6 +1428,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
// We found the chunk for this thread, now we need to check if it's the
// last chunk or not:
+ CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
if (last_iter ||
!kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
/*out*/ original_ivs_next_start)) {
@@ -1453,10 +1450,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
chunk_bounds_nest[i].ub1_u64 = 0;
}
- __kmp_free(updated_bounds_nest);
- __kmp_free(original_ivs_start);
- __kmp_free(original_ivs_end);
- __kmp_free(original_ivs_next_start);
return TRUE;
}
@@ -1478,9 +1471,5 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
original_ivs_start, n);
}
- __kmp_free(updated_bounds_nest);
- __kmp_free(original_ivs_start);
- __kmp_free(original_ivs_end);
- __kmp_free(original_ivs_next_start);
return FALSE;
}
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index 58bf64112b1a..5f04301c91c6 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -104,6 +104,8 @@
# define CACHE_LINE 128
#elif KMP_ARCH_AARCH64_A64FX
# define CACHE_LINE 256
+#elif KMP_ARCH_S390X
+# define CACHE_LINE 256
#else
# define CACHE_LINE 64
#endif
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 95f724f68255..9eeaeb88fb9e 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -343,7 +343,6 @@ Perform a fork only if the condition is true.
void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
kmp_int32 cond, void *args) {
int gtid = __kmp_entry_gtid();
- int zero = 0;
if (cond) {
if (args)
__kmpc_fork_call(loc, argc, microtask, args);
@@ -352,10 +351,29 @@ void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
} else {
__kmpc_serialized_parallel(loc, gtid);
+#if OMPT_SUPPORT
+ void *exit_frame_ptr;
+#endif
+
if (args)
- microtask(&gtid, &zero, args);
+ __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
+ /*npr=*/0,
+ /*argc=*/1, &args
+#if OMPT_SUPPORT
+ ,
+ &exit_frame_ptr
+#endif
+ );
else
- microtask(&gtid, &zero);
+ __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid,
+ /*npr=*/0,
+ /*argc=*/0,
+ /*args=*/nullptr
+#if OMPT_SUPPORT
+ ,
+ &exit_frame_ptr
+#endif
+ );
__kmpc_end_serialized_parallel(loc, gtid);
}
@@ -385,6 +403,24 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
+@param thread_limit limit on number of threads which can be created within the
+current task
+
+Set the thread_limit for the current task.
+This call exists to support the `thread_limit` clause on the `target` construct.
+*/
+void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit) {
+ __kmp_assert_valid_gtid(global_tid);
+ kmp_info_t *thread = __kmp_threads[global_tid];
+ if (thread_limit > 0)
+ thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
@param num_teams_lb lower bound on number of teams requested for the teams
construct
@param num_teams_ub upper bound on number of teams requested for the teams
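
__kmpc_set_thread_limit above is the entry point a compiler is expected to emit for a thread_limit clause on the target construct: it stores the limit in the current task's ICVs, and __kmp_fork_call (changed further down) clamps the deduced nthreads against it. A hedged user-level example of code whose lowering would route through this call, assuming a compiler new enough to honor the OpenMP 5.1 clause:

    #include <cstdio>
    #include <omp.h>

    int main() {
      int limit = 0;
    #pragma omp target map(from : limit) thread_limit(4)
      {
        // With the kmp_ftn_entry.h change below, the task-level limit wins
        // over the global thread-limit-var.
        limit = omp_get_thread_limit();
      }
      std::printf("thread limit inside target: %d\n", limit); // expect 4
      return 0;
    }
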
@@ -2065,14 +2101,15 @@ void kmpc_set_stacksize_s(size_t arg) {
}
void kmpc_set_blocktime(int arg) {
- int gtid, tid;
+ int gtid, tid, bt = arg;
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
- __kmp_aux_set_blocktime(arg, thread, tid);
+ __kmp_aux_convert_blocktime(&bt);
+ __kmp_aux_set_blocktime(bt, thread, tid);
}
void kmpc_set_library(int arg) {
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a6ee844e5988..ac85b2b3f2fc 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -90,6 +90,70 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
return monotonicity;
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Return floating point number rounded to two decimal points
+static inline float __kmp_round_2decimal_val(float num) {
+ return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
+}
+static inline int __kmp_get_round_val(float num) {
+ return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
+}
+#endif
+
+template <typename T>
+inline void
+__kmp_initialize_self_buffer(kmp_team_t *team, T id,
+ dispatch_private_info_template<T> *pr,
+ typename traits_t<T>::unsigned_t nchunks, T nproc,
+ typename traits_t<T>::unsigned_t &init,
+ T &small_chunk, T &extras, T &p_extra) {
+
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ if (pr->flags.use_hybrid) {
+ kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
+ kmp_hw_core_type_t type =
+ (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ T pchunks = pr->u.p.pchunks;
+ T echunks = nchunks - pchunks;
+ T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
+ T num_procs_with_ecore = nproc - num_procs_with_pcore;
+ T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
+ T big_chunk =
+ pchunks / num_procs_with_pcore; // chunks per thread with p-core
+ small_chunk =
+ echunks / num_procs_with_ecore; // chunks per thread with e-core
+
+ extras =
+ (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ return;
+ }
+#endif
+
+ small_chunk = nchunks / nproc; // chunks per thread
+ extras = nchunks % nproc;
+ p_extra = 0;
+ init = id * small_chunk + (id < extras ? id : extras);
+}
+
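
For the non-hybrid fallback at the end of __kmp_initialize_self_buffer, a quick worked example of the split:

    // nchunks = 10, nproc = 4:
    //   small_chunk = 10 / 4 = 2,  extras = 10 % 4 = 2,  p_extra = 0
    //   init(id) = id * small_chunk + min(id, extras)
    //   id = 0 -> 0,  id = 1 -> 3,  id = 2 -> 6,  id = 3 -> 8
    // so threads 0 and 1 own three chunks each ([0,3) and [3,6)) and
    // threads 2 and 3 own two each ([6,8) and [8,10)), covering all 10.
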
#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
UNUSED = 0,
@@ -366,7 +430,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
- T ntc, init;
+ T ntc, init = 0;
KD_TRACE(100,
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
@@ -376,7 +440,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
if (nproc > 1 && ntc >= nproc) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;
- T small_chunk, extras;
+ T small_chunk, extras, p_extra = 0;
kmp_uint32 old = UNUSED;
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
if (traits_t<T>::type_size > 4) {
@@ -388,13 +452,110 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
__kmp_init_lock(pr->u.p.steal_lock);
}
- small_chunk = ntc / nproc;
- extras = ntc % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ // Iterations are divided in a 60/40 skewed distribution among CORE and
+ // ATOM processors for hybrid systems
+ bool use_hybrid = false;
+ kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ T first_thread_with_ecore = 0;
+ T num_procs_with_pcore = 0;
+ T num_procs_with_ecore = 0;
+ T p_ntc = 0, e_ntc = 0;
+ if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
+ __kmp_affinity.type != affinity_explicit) {
+ use_hybrid = true;
+ core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
+ __kmp_first_osid_with_ecore > -1) {
+ for (int i = 0; i < team->t.t_nproc; ++i) {
+ kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
+ ->th.th_topology_attrs.core_type;
+ int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
+ if (id == __kmp_first_osid_with_ecore) {
+ first_thread_with_ecore =
+ team->t.t_threads[i]->th.th_info.ds.ds_tid;
+ }
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ num_procs_with_pcore++;
+ } else if (type == KMP_HW_CORE_TYPE_ATOM) {
+ num_procs_with_ecore++;
+ } else {
+ use_hybrid = false;
+ break;
+ }
+ }
+ }
+ if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
+ float multiplier = 60.0 / 40.0;
+ float p_ratio = (float)num_procs_with_pcore / nproc;
+ float e_ratio = (float)num_procs_with_ecore / nproc;
+ float e_multiplier =
+ (float)1 /
+ (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
+ float p_multiplier = multiplier * e_multiplier;
+ p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
+ if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
+ e_ntc =
+ (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
+ else
+ e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
+ KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
+
+ // Use regular static steal if not enough chunks for skewed
+ // distribution
+ use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
+ e_ntc >= num_procs_with_ecore)
+ ? true
+ : false);
+ } else {
+ use_hybrid = false;
+ }
+ }
+ pr->flags.use_hybrid = use_hybrid;
+ pr->u.p.pchunks = p_ntc;
+ pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
+ pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
+
+ if (use_hybrid) {
+ KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
+ T big_chunk = p_ntc / num_procs_with_pcore;
+ small_chunk = e_ntc / num_procs_with_ecore;
+
+ extras =
+ (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (core_type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ } else
+#endif
+ {
+ small_chunk = ntc / nproc;
+ extras = ntc % nproc;
+ init = id * small_chunk + (id < extras ? id : extras);
+ p_extra = 0;
+ }
pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// Other threads will inspect steal_flag when searching for a victim.
// READY means other threads may steal from this thread from now on.
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
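
The 60/40 hybrid split above is easier to see with concrete numbers. Take ntc = 100 chunks on a team of nproc = 8 with 4 P-cores and 4 E-cores:

    multiplier   = 60.0 / 40.0 = 1.5
    p_ratio      = 4 / 8 = 0.5          e_ratio = 4 / 8 = 0.5
    e_multiplier = 1 / ((1.5 * 4) / 8 + 0.5) = 1 / 1.25 = 0.8
    p_multiplier = 1.5 * 0.8 = 1.2
    p_ntc = round(100 * 0.5 * 1.2) = 60
    e_ntc = round(100 * 0.5 * 0.8) = 40   (p_ntc + e_ntc == ntc holds)

Each P-core thread starts with 60 / 4 = 15 chunks and each E-core thread with 40 / 4 = 10, exactly the 1.5x skew the comment promises; when the chunk counts are too small to sustain that split, use_hybrid is dropped and the plain ntc / nproc division below takes over.
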
@@ -1261,13 +1422,13 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1; // exclude one we execute immediately
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to reinitialize other thread invariants: lb, st, etc.
@@ -1275,10 +1436,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
@@ -1404,12 +1565,12 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
vnew.p.count = init + 1;
- vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// write pair (count, ub) at once atomically
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
@@ -1422,10 +1583,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index 154db174613d..cf19eb52662c 100644
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -75,14 +75,17 @@ template <typename T> struct dispatch_private_infoXX_template {
ST st; // signed
UT tc; // unsigned
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ UT ordered_lower; // unsigned
+ UT ordered_upper; // unsigned
+
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
T parm1;
T parm2;
@@ -90,8 +93,11 @@ template <typename T> struct dispatch_private_infoXX_template {
T parm4;
};
- UT ordered_lower; // unsigned
- UT ordered_upper; // unsigned
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ UT pchunks; // total number of chunks for processes with p-core
+ UT num_procs_with_pcore; // number of threads with p-core
+ T first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
T last_upper;
#endif /* KMP_OS_WINDOWS */
diff --git a/openmp/runtime/src/kmp_environment.cpp b/openmp/runtime/src/kmp_environment.cpp
index b35027b57f03..4def6ea9ac20 100644
--- a/openmp/runtime/src/kmp_environment.cpp
+++ b/openmp/runtime/src/kmp_environment.cpp
@@ -407,9 +407,11 @@ ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill.
int i;
var = bulk;
for (i = 0; i < count; ++i) {
+ KMP_ASSERT(var < bulk + size);
+ [[maybe_unused]] size_t ssize = size - (var - bulk);
// Copy variable to bulk.
len = KMP_STRLEN(env[i]);
- KMP_MEMCPY_S(var, size, env[i], len + 1);
+ KMP_MEMCPY_S(var, ssize, env[i], len + 1);
// Save found variable in vars array.
__kmp_str_split(var, '=', &name, &value);
vars[i].name = name;
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index 038bccfba3ea..ad19079cb650 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -112,17 +112,19 @@ void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) {
#ifdef KMP_STUB
__kmps_set_blocktime(KMP_DEREF arg);
#else
- int gtid, tid;
+ int gtid, tid, bt = (KMP_DEREF arg);
kmp_info_t *thread;
gtid = __kmp_entry_gtid();
tid = __kmp_tid_from_gtid(gtid);
thread = __kmp_thread_from_gtid(gtid);
- __kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid);
+ __kmp_aux_convert_blocktime(&bt);
+ __kmp_aux_set_blocktime(bt, thread, tid);
#endif
}
+// Returns the blocktime in the units specified for KMP_BLOCKTIME, ms by default
int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
#ifdef KMP_STUB
return __kmps_get_blocktime();
@@ -136,21 +138,24 @@ int FTN_STDCALL FTN_GET_BLOCKTIME(void) {
/* These must match the settings used in __kmp_wait_sleep() */
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, KMP_MAX_BLOCKTIME));
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, KMP_MAX_BLOCKTIME, __kmp_blocktime_units));
return KMP_MAX_BLOCKTIME;
}
#ifdef KMP_ADJUST_BLOCKTIME
else if (__kmp_zero_bt && !get__bt_set(team, tid)) {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, 0));
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, 0, __kmp_blocktime_units));
return 0;
}
#endif /* KMP_ADJUST_BLOCKTIME */
else {
- KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid,
- team->t.t_id, tid, get__blocktime(team, tid)));
- return get__blocktime(team, tid);
+ int bt = get__blocktime(team, tid);
+ if (__kmp_blocktime_units == 'm')
+ bt = bt / 1000;
+ KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid,
+ team->t.t_id, tid, bt, __kmp_blocktime_units));
+ return bt;
}
#endif
}
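
Since blocktime is now stored internally in microseconds, the setters convert on entry (__kmp_aux_convert_blocktime) and the getter reports back in the units the user selected via KMP_BLOCKTIME. A hedged sketch of the observable round-trip, using the kmp_* extensions declared in LLVM's omp.h:

    #include <omp.h>

    int main() {
      // With the default 'm' units this is 200 ms, stored as 200000 us.
      kmp_set_blocktime(200);
      int bt = kmp_get_blocktime(); // expected to report 200 (ms) back
      return bt == 200 ? 0 : 1;
    }

Setting KMP_BLOCKTIME=200us in the environment instead would switch the reported units to microseconds, per the kmp_settings.cpp parser at the end of this diff.
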
@@ -577,7 +582,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) {
int gtid;
#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD || KMP_OS_OPENBSD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS
gtid = __kmp_entry_gtid();
#elif KMP_OS_WINDOWS
if (!__kmp_init_parallel ||
@@ -802,6 +807,10 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
+ // If thread_limit for the target task is defined, return that instead of the
+ // regular task thread_limit
+ if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit)
+ return thread_limit;
return thread->th.th_current_task->td_icvs.thread_limit;
#endif
}
diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h
index d37c9c86028e..7d595b947f4a 100644
--- a/openmp/runtime/src/kmp_ftn_os.h
+++ b/openmp/runtime/src/kmp_ftn_os.h
@@ -116,6 +116,8 @@
#define FTN_TARGET_IS_PRESENT omp_target_is_present
#define FTN_TARGET_MEMCPY omp_target_memcpy
#define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect
+#define FTN_TARGET_MEMSET omp_target_memset
+#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async
#define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr
#define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr
#endif
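
These two defines expose the new device-memory routines under their Fortran-callable names. A hedged usage sketch for the C entry point, assuming the prototype LLVM's omp.h declares, void *omp_target_memset(void *ptr, int value, size_t count, int device_num), and an available offload device:

    #include <omp.h>
    #include <stddef.h>

    int main(void) {
      int dev = omp_get_default_device();
      size_t n = 1024;
      void *buf = omp_target_alloc(n, dev); // device-side allocation
      if (buf) {
        omp_target_memset(buf, 0, n, dev); // zero n bytes on the device
        omp_target_free(buf, dev);
      }
      return 0;
    }
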
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 4ce0691abf8d..b132f38fd3b0 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -125,6 +125,7 @@ size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
int __kmp_sys_max_nth = KMP_MAX_NTH;
int __kmp_max_nth = 0;
int __kmp_cg_max_nth = 0;
+int __kmp_task_max_nth = 0;
int __kmp_teams_max_nth = 0;
int __kmp_threads_capacity = 0;
int __kmp_dflt_team_nth = 0;
@@ -154,7 +155,8 @@ int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1];
int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1];
kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL};
#endif
-int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // in microseconds
+char __kmp_blocktime_units = 'm'; // Units specified in KMP_BLOCKTIME
bool __kmp_wpolicy_passive = false;
#if KMP_USE_MONITOR
int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS;
@@ -280,6 +282,9 @@ kmp_affinity_t __kmp_hh_affinity =
kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};
char *__kmp_cpuinfo_file = NULL;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+int __kmp_first_osid_with_ecore = -1;
+#endif
#endif /* KMP_AFFINITY_SUPPORTED */
diff --git a/openmp/runtime/src/kmp_itt.inl b/openmp/runtime/src/kmp_itt.inl
index 5e75f60124af..5236165c35b3 100644
--- a/openmp/runtime/src/kmp_itt.inl
+++ b/openmp/runtime/src/kmp_itt.inl
@@ -438,7 +438,7 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name,
KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier);
// This condition is a must (we would have zero divide otherwise).
KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier);
- // More strong condition: make sure we have room at least for for two
+  // Stronger condition: make sure we have room for at least two
// different ids (for each barrier type).
object = reinterpret_cast<void *>(
(kmp_uintptr_t)(team) +
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index 8fcddc710862..85c54f4cdc7e 100644
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -3809,7 +3809,7 @@ static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) {
sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1));
table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table;
// We cannot free the previous table now, since it may be in use by other
- // threads. So save the pointer to the previous table in in the first
+ // threads. So save the pointer to the previous table in the first
// element of the new table. All the tables will be organized into a list,
// and could be freed when library shutting down.
__kmp_user_lock_table.table = table;
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index fec589ab6018..beb8d0197ddf 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -105,8 +105,9 @@
128-bit extended precision type yet */
typedef long double _Quad;
#elif KMP_COMPILER_GCC
-/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */
-#if !KMP_OS_NETBSD
+/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad until
+   NetBSD 10.0, which ships with GCC 10.5 */
+#if (!KMP_OS_NETBSD || __GNUC__ >= 10)
typedef __float128 _Quad;
#undef KMP_HAVE_QUAD
#define KMP_HAVE_QUAD 1
@@ -178,7 +179,8 @@ typedef unsigned long long kmp_uint64;
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X
#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
#else
#error "Can't determine size_t printf format specifier."
@@ -1043,7 +1045,8 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#endif /* KMP_OS_WINDOWS */
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X
#if KMP_OS_WINDOWS
#undef KMP_MB
#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index fcfd8bc5d8d9..b7972c7248dd 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -23,6 +23,7 @@
#define KMP_OS_DARWIN 0
#define KMP_OS_WINDOWS 0
#define KMP_OS_HURD 0
+#define KMP_OS_SOLARIS 0
#define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */
#ifdef _WIN32
@@ -70,13 +71,19 @@
#define KMP_OS_HURD 1
#endif
+#if (defined __sun__ && defined __svr4__)
+#undef KMP_OS_SOLARIS
+#define KMP_OS_SOLARIS 1
+#endif
+
#if (1 != KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \
- KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD)
+ KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD + \
+ KMP_OS_SOLARIS)
#error Unknown OS
#endif
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_SOLARIS
#undef KMP_OS_UNIX
#define KMP_OS_UNIX 1
#endif
@@ -93,6 +100,8 @@
#define KMP_ARCH_MIPS64 0
#define KMP_ARCH_RISCV64 0
#define KMP_ARCH_LOONGARCH64 0
+#define KMP_ARCH_VE 0
+#define KMP_ARCH_S390X 0
#if KMP_OS_WINDOWS
#if defined(_M_AMD64) || defined(__x86_64)
@@ -142,6 +151,12 @@
#elif defined __loongarch__ && __loongarch_grlen == 64
#undef KMP_ARCH_LOONGARCH64
#define KMP_ARCH_LOONGARCH64 1
+#elif defined __ve__
+#undef KMP_ARCH_VE
+#define KMP_ARCH_VE 1
+#elif defined __s390x__
+#undef KMP_ARCH_S390X
+#define KMP_ARCH_S390X 1
#endif
#endif
@@ -206,7 +221,8 @@
// TODO: Fixme - This is clever, but really fugly
#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \
- KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64)
+ KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \
+ KMP_ARCH_S390X)
#error Unknown or unsupported architecture
#endif
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index e55798df610c..25136691bc72 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -178,7 +178,12 @@ int __kmp_get_global_thread_id() {
if (stack_diff <= stack_size) {
/* The only way we can be closer than the allocated */
/* stack size is if we are running on this thread. */
- KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
+      // __kmp_gtid_get_specific can return a negative value here because this
+      // function may be called from a thread destructor; by the time the
+      // destructor runs, the corresponding thread-specific data has already
+      // been reset to NULL, so a negative result is expected, not an error.
+ KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
+ __kmp_gtid_get_specific() == i);
return i;
}
}
@@ -196,6 +201,12 @@ int __kmp_get_global_thread_id() {
if (i < 0)
return i;
+  // other_threads[i] can be nullptr at this point because the corresponding
+  // thread may already have been destroyed. This can happen when this
+  // function is called from the library shutdown routine.
+ if (!TCR_SYNC_PTR(other_threads[i]))
+ return i;
+
/* dynamically updated stack window for uber threads to avoid get_specific
call */
if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
@@ -1872,6 +1883,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
int nthreads;
int master_active;
int master_set_numthreads;
+ int task_thread_limit = 0;
int level;
int active_level;
int teams_level;
@@ -1910,6 +1922,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
+ task_thread_limit =
+ master_th->th.th_current_task->td_icvs.task_thread_limit;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
@@ -2000,6 +2014,11 @@ int __kmp_fork_call(ident_t *loc, int gtid,
? master_set_numthreads
// TODO: get nproc directly from current task
: get__nproc_2(parent_team, master_tid);
+    // Use the thread_limit set for the current target task if it exists, else
+    // go with the deduced nthreads
+ nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
+ ? task_thread_limit
+ : nthreads;
// Check if we need to take forkjoin lock? (no need for serialized
// parallel out of teams construct).
if (nthreads > 1) {
@@ -3291,6 +3310,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) {
// next parallel region (per thread)
// (use a max ub on value if __kmp_parallel_initialize not called yet)
__kmp_cg_max_nth, // int thread_limit;
+      __kmp_task_max_nth, // int task_thread_limit; // thread_limit to set on
+      // the task; used for the target construct's thread_limit clause
__kmp_dflt_max_active_levels, // int max_active_levels; //internal control
// for max_active_levels
r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
@@ -4671,6 +4692,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
}
#endif /* KMP_ADJUST_BLOCKTIME */
+#if KMP_AFFINITY_SUPPORTED
+  // Set the affinity and topology information for the new thread
+ __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
+#endif
+
/* actually fork it and create the new worker thread */
KF_TRACE(
10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
@@ -4764,6 +4790,19 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
}
#if KMP_AFFINITY_SUPPORTED
+static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
+ int first, int last, int newp) {
+ th->th.th_first_place = first;
+ th->th.th_last_place = last;
+ th->th.th_new_place = newp;
+ if (newp != th->th.th_current_place) {
+ if (__kmp_display_affinity && team->t.t_display_affinity != 1)
+ team->t.t_display_affinity = 1;
+ // Copy topology information associated with the new place
+ th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
+ th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
+ }
+}
// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + primary thread's partition based upon the parent
@@ -4803,13 +4842,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
for (f = 1; f < n_th; f++) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = masters_place;
- if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
"partition = [%d,%d]\n",
@@ -4840,13 +4873,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
} else {
place++;
}
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, place);
KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
"partition = [%d,%d]\n",
@@ -4865,13 +4892,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = first_place;
- th->th.th_last_place = last_place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first_place, last_place, place);
s_count++;
if ((s_count == S) && rem && (gap_ct == gap)) {
@@ -4938,12 +4959,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ int fplace = place, nplace = place;
s_count = 1;
while (s_count < S) {
if (place == last_place) {
@@ -4966,7 +4982,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
rem--;
gap_ct = 0;
}
- th->th.th_last_place = place;
+ __kmp_set_thread_place(team, th, fplace, place, nplace);
gap_ct++;
if (place == last_place) {
@@ -5032,13 +5048,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
KMP_DEBUG_ASSERT(last_place >= first_place);
th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th);
- th->th.th_first_place = first;
- th->th.th_new_place = place;
- th->th.th_last_place = last;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, first, last, place);
KA_TRACE(100,
("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
"partition = [%d,%d], spacing = %.4f\n",
@@ -5064,13 +5074,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
kmp_info_t *th = team->t.t_threads[f];
KMP_DEBUG_ASSERT(th != NULL);
- th->th.th_first_place = place;
- th->th.th_last_place = place;
- th->th.th_new_place = place;
- if (__kmp_display_affinity && place != th->th.th_current_place &&
- team->t.t_display_affinity != 1) {
- team->t.t_display_affinity = 1;
- }
+ __kmp_set_thread_place(team, th, place, place, place);
s_count++;
if ((s_count == S) && rem && (gap_ct == gap)) {
@@ -6713,6 +6717,8 @@ static inline char *__kmp_reg_status_name() {
} // __kmp_reg_status_get
#if defined(KMP_USE_SHM)
+bool __kmp_shm_available = false;
+bool __kmp_tmp_available = false;
// If /dev/shm is not accessible, we will create a temporary file under /tmp.
char *temp_reg_status_file_name = nullptr;
#endif
@@ -6742,60 +6748,108 @@ void __kmp_register_library_startup(void) {
char *value = NULL; // Actual value of the environment variable.
#if defined(KMP_USE_SHM)
- char *shm_name = __kmp_str_format("/%s", name);
- int shm_preexist = 0;
- char *data1;
- int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
- if ((fd1 == -1) && (errno == EEXIST)) {
- // file didn't open because it already exists.
- // try opening existing file
- fd1 = shm_open(shm_name, O_RDWR, 0666);
- if (fd1 == -1) { // file didn't open
- // error out here
- __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
- __kmp_msg_null);
- } else {
- // able to open existing file
- shm_preexist = 1;
+ char *shm_name = nullptr;
+ char *data1 = nullptr;
+ __kmp_shm_available = __kmp_detect_shm();
+ if (__kmp_shm_available) {
+ int fd1 = -1;
+ shm_name = __kmp_str_format("/%s", name);
+ int shm_preexist = 0;
+ fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+ if ((fd1 == -1) && (errno == EEXIST)) {
+ // file didn't open because it already exists.
+ // try opening existing file
+ fd1 = shm_open(shm_name, O_RDWR, 0666);
+ if (fd1 == -1) { // file didn't open
+ KMP_WARNING(FunctionError, "Can't open SHM");
+ __kmp_shm_available = false;
+ } else { // able to open existing file
+ shm_preexist = 1;
+ }
+ }
+ if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
+      if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
+ KMP_WARNING(FunctionError, "Can't set size of SHM");
+ __kmp_shm_available = false;
+ }
+ }
+ if (__kmp_shm_available) { // SHM exists, now map it
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map shared memory
+ KMP_WARNING(FunctionError, "Can't map SHM");
+ __kmp_shm_available = false;
+ }
+ }
+ if (__kmp_shm_available) { // SHM mapped
+ if (shm_preexist == 0) { // set data to SHM, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
}
- } else if (fd1 == -1) {
- // SHM didn't open; it was due to error other than already exists. Try to
- // create a temp file under /tmp.
+ if (fd1 != -1)
+ close(fd1);
+ }
+ if (!__kmp_shm_available)
+ __kmp_tmp_available = __kmp_detect_tmp();
+ if (!__kmp_shm_available && __kmp_tmp_available) {
+ // SHM failed to work due to an error other than that the file already
+ // exists. Try to create a temp file under /tmp.
+    // If /tmp isn't accessible, fall back to using an environment variable.
// TODO: /tmp might not always be the temporary directory. For now we will
- // not consider TMPDIR. If /tmp is not accessible, we simply error out.
- char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
- fd1 = mkstemp(temp_file_name);
- if (fd1 == -1) {
- // error out here.
- __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
- __kmp_msg_null);
+ // not consider TMPDIR.
+ int fd1 = -1;
+ temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
+ int tmp_preexist = 0;
+ fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+ if ((fd1 == -1) && (errno == EEXIST)) {
+ // file didn't open because it already exists.
+ // try opening existing file
+ fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
+      if (fd1 == -1) { // file didn't open
+ KMP_WARNING(FunctionError, "Can't open TEMP");
+ __kmp_tmp_available = false;
+ } else {
+ tmp_preexist = 1;
+ }
}
- temp_reg_status_file_name = temp_file_name;
- }
- if (shm_preexist == 0) {
- // we created SHM now set size
- if (ftruncate(fd1, SHM_SIZE) == -1) {
- // error occured setting size;
- __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
- KMP_ERR(errno), __kmp_msg_null);
+ if (__kmp_tmp_available && tmp_preexist == 0) {
+      // we created the /tmp file; now set its size
+      if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
+ KMP_WARNING(FunctionError, "Can't set size of /tmp file");
+ __kmp_tmp_available = false;
+ }
}
+ if (__kmp_tmp_available) {
+ data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd1, 0);
+ if (data1 == MAP_FAILED) { // failed to map /tmp
+ KMP_WARNING(FunctionError, "Can't map /tmp");
+ __kmp_tmp_available = false;
+ }
+ }
+ if (__kmp_tmp_available) {
+ if (tmp_preexist == 0) { // set data to TMP, set value
+ KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ }
+ // Read value from either what we just wrote or existing file.
+      value = __kmp_str_format("%s", data1); // read value from /tmp file
+ munmap(data1, SHM_SIZE);
+ }
+ if (fd1 != -1)
+ close(fd1);
}
- data1 =
- (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
- if (data1 == MAP_FAILED) {
- // failed to map shared memory
- __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
- __kmp_msg_null);
- }
- if (shm_preexist == 0) { // set data to SHM, set value
- KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
+ if (!__kmp_shm_available && !__kmp_tmp_available) {
+ // no /dev/shm and no /tmp -- fall back to environment variable
+ // Set environment variable, but do not overwrite if it exists.
+ __kmp_env_set(name, __kmp_registration_str, 0);
+ // read value to see if it got set
+ value = __kmp_env_get(name);
}
- // Read value from either what we just wrote or existing file.
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
- close(fd1);
#else // Windows and unix with static library
- // Set environment variable, but do not overwrite if it is exist.
+ // Set environment variable, but do not overwrite if it exists.
__kmp_env_set(name, __kmp_registration_str, 0);
// read value to see if it got set
value = __kmp_env_get(name);
@@ -6855,8 +6909,14 @@ void __kmp_register_library_startup(void) {
case 2: { // Neighbor is dead.
#if defined(KMP_USE_SHM)
- // close shared memory.
- shm_unlink(shm_name); // this removes file in /dev/shm
+ if (__kmp_shm_available) { // close shared memory.
+ shm_unlink(shm_name); // this removes file in /dev/shm
+ } else if (__kmp_tmp_available) {
+ unlink(temp_reg_status_file_name); // this removes the temp file
+ } else {
+ // Clear the variable and try to register library again.
+ __kmp_env_unset(name);
+ }
#else
// Clear the variable and try to register library again.
__kmp_env_unset(name);
@@ -6869,7 +6929,8 @@ void __kmp_register_library_startup(void) {
}
KMP_INTERNAL_FREE((void *)value);
#if defined(KMP_USE_SHM)
- KMP_INTERNAL_FREE((void *)shm_name);
+ if (shm_name)
+ KMP_INTERNAL_FREE((void *)shm_name);
#endif
} // while
KMP_INTERNAL_FREE((void *)name);
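
The registration logic above now degrades gracefully instead of calling __kmp_fatal: /dev/shm first, then a /tmp file, then the environment variable, emitting a warning at each failed step. A condensed sketch of the create-or-attach handshake it performs against shared memory (error handling reduced to NULL returns; the probe functions __kmp_detect_shm/__kmp_detect_tmp are part of the import but not shown in this section):

    #include <cerrno>
    #include <cstring>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define SHM_SZ 1024

    // Returns a mapping holding the registration string, or NULL so the
    // caller can fall back to /tmp or an environment variable.
    static char *publish(const char *shm_name, const char *reg_str) {
      int preexist = 0;
      int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
      if (fd == -1 && errno == EEXIST) { // another runtime got here first
        fd = shm_open(shm_name, O_RDWR, 0666);
        preexist = 1;
      }
      if (fd == -1)
        return NULL;
      if (!preexist && ftruncate(fd, SHM_SZ) == -1) {
        close(fd);
        return NULL;
      }
      char *data =
          (char *)mmap(0, SHM_SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      close(fd); // the mapping stays valid after close
      if (data == MAP_FAILED)
        return NULL;
      if (!preexist) // first one in publishes its registration string
        strncpy(data, reg_str, SHM_SZ - 1);
      return data; // caller compares, then munmap(data, SHM_SZ)
    }
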
@@ -6882,25 +6943,32 @@ void __kmp_unregister_library(void) {
char *value = NULL;
#if defined(KMP_USE_SHM)
- bool use_shm = true;
- char *shm_name = __kmp_str_format("/%s", name);
- int fd1 = shm_open(shm_name, O_RDONLY, 0666);
- if (fd1 == -1) {
- // File did not open. Try the temporary file.
- use_shm = false;
- KMP_DEBUG_ASSERT(temp_reg_status_file_name);
+ char *shm_name = nullptr;
+ int fd1;
+ if (__kmp_shm_available) {
+ shm_name = __kmp_str_format("/%s", name);
+ fd1 = shm_open(shm_name, O_RDONLY, 0666);
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from SHM
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
+ }
+ } else if (__kmp_tmp_available) { // try /tmp
fd1 = open(temp_reg_status_file_name, O_RDONLY);
- if (fd1 == -1) {
- // give it up now.
- return;
+ if (fd1 != -1) { // File opened successfully
+ char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
+ if (data1 != MAP_FAILED) {
+ value = __kmp_str_format("%s", data1); // read value from /tmp
+ munmap(data1, SHM_SIZE);
+ }
+ close(fd1);
}
+  } else { // fall back to environment variable
+ value = __kmp_env_get(name);
}
- char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
- if (data1 != MAP_FAILED) {
- value = __kmp_str_format("%s", data1); // read value from SHM
- munmap(data1, SHM_SIZE);
- }
- close(fd1);
#else
value = __kmp_env_get(name);
#endif
@@ -6910,11 +6978,12 @@ void __kmp_unregister_library(void) {
if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
// Ok, this is our variable. Delete it.
#if defined(KMP_USE_SHM)
- if (use_shm) {
+ if (__kmp_shm_available) {
shm_unlink(shm_name); // this removes file in /dev/shm
- } else {
- KMP_DEBUG_ASSERT(temp_reg_status_file_name);
+ } else if (__kmp_tmp_available) {
unlink(temp_reg_status_file_name); // this removes the temp file
+ } else {
+ __kmp_env_unset(name);
}
#else
__kmp_env_unset(name);
@@ -6922,11 +6991,10 @@ void __kmp_unregister_library(void) {
}
#if defined(KMP_USE_SHM)
- KMP_INTERNAL_FREE(shm_name);
- if (!use_shm) {
- KMP_DEBUG_ASSERT(temp_reg_status_file_name);
+ if (shm_name)
+ KMP_INTERNAL_FREE(shm_name);
+ if (temp_reg_status_file_name)
KMP_INTERNAL_FREE(temp_reg_status_file_name);
- }
#endif
KMP_INTERNAL_FREE(__kmp_registration_str);
@@ -8729,9 +8797,8 @@ void __kmp_aux_display_affinity(int gtid, const char *format) {
}
/* ------------------------------------------------------------------------ */
-
void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
- int blocktime = arg; /* argument is in milliseconds */
+ int blocktime = arg; /* argument is in microseconds */
#if KMP_USE_MONITOR
int bt_intervals;
#endif
@@ -8827,10 +8894,12 @@ __kmp_determine_reduction_method(
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
- KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
+ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
+ KMP_ARCH_VE || KMP_ARCH_S390X
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
+ KMP_OS_SOLARIS
int teamsize_cutoff = 4;
@@ -8854,11 +8923,13 @@ __kmp_determine_reduction_method(
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
- // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
+ // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
+ // KMP_OS_SOLARIS
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
-#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
+#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS
// basic tuning
diff --git a/openmp/runtime/src/kmp_safe_c_api.h b/openmp/runtime/src/kmp_safe_c_api.h
index 72f26fd9897d..79f4a7f5732a 100644
--- a/openmp/runtime/src/kmp_safe_c_api.h
+++ b/openmp/runtime/src/kmp_safe_c_api.h
@@ -56,7 +56,11 @@ template <typename T> struct kmp_get_rmax_t<T, true> {
// For now, these macros use the existing API.
+#if KMP_OS_NETBSD
+#define KMP_ALLOCA __builtin_alloca
+#else
#define KMP_ALLOCA alloca
+#endif
#define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt)
#define KMP_SNPRINTF snprintf
#define KMP_SSCANF sscanf
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index b81376d1632b..e731bf45e8ee 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -149,70 +149,6 @@ static size_t __kmp_round4k(size_t size) {
} // __kmp_round4k
#endif
-/* Here, multipliers are like __kmp_convert_to_seconds, but floating-point
- values are allowed, and the return value is in milliseconds. The default
- multiplier is milliseconds. Returns INT_MAX only if the value specified
- matches "infinit*". Returns -1 if specified string is invalid. */
-int __kmp_convert_to_milliseconds(char const *data) {
- int ret, nvalues, factor;
- char mult, extra;
- double value;
-
- if (data == NULL)
- return (-1);
- if (__kmp_str_match("infinit", -1, data))
- return (INT_MAX);
- value = (double)0.0;
- mult = '\0';
-#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT
- // On Windows, each %c parameter needs additional size parameter for sscanf_s
- nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1);
-#else
- nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra);
-#endif
- if (nvalues < 1)
- return (-1);
- if (nvalues == 1)
- mult = '\0';
- if (nvalues == 3)
- return (-1);
-
- if (value < 0)
- return (-1);
-
- switch (mult) {
- case '\0':
- /* default is milliseconds */
- factor = 1;
- break;
- case 's':
- case 'S':
- factor = 1000;
- break;
- case 'm':
- case 'M':
- factor = 1000 * 60;
- break;
- case 'h':
- case 'H':
- factor = 1000 * 60 * 60;
- break;
- case 'd':
- case 'D':
- factor = 1000 * 24 * 60 * 60;
- break;
- default:
- return (-1);
- }
-
- if (value >= ((INT_MAX - 1) / factor))
- ret = INT_MAX - 1; /* Don't allow infinite value here */
- else
- ret = (int)(value * (double)factor); /* truncate to int */
-
- return ret;
-}
-
static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b,
char sentinel) {
if (a == NULL)
@@ -731,24 +667,73 @@ static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name,
static void __kmp_stg_parse_blocktime(char const *name, char const *value,
void *data) {
- __kmp_dflt_blocktime = __kmp_convert_to_milliseconds(value);
- if (__kmp_dflt_blocktime < 0) {
- __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
+ const char *buf = value;
+ const char *next;
+ const int ms_mult = 1000;
+ int multiplier = 1;
+ int num;
+
+ // Read integer blocktime value
+ SKIP_WS(buf);
+ if ((*buf >= '0') && (*buf <= '9')) {
+ next = buf;
+ SKIP_DIGITS(next);
+ num = __kmp_basic_str_to_int(buf);
+ KMP_ASSERT(num >= 0);
+ buf = next;
+ SKIP_WS(buf);
+ } else {
+ num = -1;
+ }
+
+  // Read units: note that __kmp_dflt_blocktime units are now us
+ next = buf;
+ if (*buf == '\0' || __kmp_match_str("ms", buf, &next)) {
+ // units are in ms; convert
+ __kmp_dflt_blocktime = ms_mult * num;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ } else if (__kmp_match_str("us", buf, &next)) {
+ // units are in us
+ __kmp_dflt_blocktime = num;
+ __kmp_blocktime_units = 'u';
+ } else if (__kmp_match_str("infinite", buf, &next) ||
+ __kmp_match_str("infinity", buf, &next)) {
+ // units are in ms
+ __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ } else {
+ KMP_WARNING(StgInvalidValue, name, value);
+ // default units are in ms
+ __kmp_dflt_blocktime = ms_mult * num;
+ __kmp_blocktime_units = 'm';
+ multiplier = ms_mult;
+ }
+
+ if (num < 0 && __kmp_dflt_blocktime < 0) { // num out of range
+ __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // now in us
__kmp_msg(kmp_ms_warning, KMP_MSG(InvalidValue, name, value),
__kmp_msg_null);
- KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime);
+ // Inform in appropriate units
+ KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime / multiplier);
__kmp_env_blocktime = FALSE; // Revert to default as if var not set.
+ } else if (num > 0 && __kmp_dflt_blocktime < 0) { // overflow
+ __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
+ __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), __kmp_msg_null);
+ KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier);
+ __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified.
} else {
if (__kmp_dflt_blocktime < KMP_MIN_BLOCKTIME) {
__kmp_dflt_blocktime = KMP_MIN_BLOCKTIME;
__kmp_msg(kmp_ms_warning, KMP_MSG(SmallValue, name, value),
__kmp_msg_null);
- KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime);
+ KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime / multiplier);
} else if (__kmp_dflt_blocktime > KMP_MAX_BLOCKTIME) {
__kmp_dflt_blocktime = KMP_MAX_BLOCKTIME;
__kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value),
__kmp_msg_null);
- KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime);
+ KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier);
}
__kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified.
}
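
Note: with this hunk KMP_BLOCKTIME gains optional "ms"/"us" suffixes and
__kmp_dflt_blocktime is stored in microseconds; a bare number still means
milliseconds. A minimal standalone sketch of that unit handling, under the
simplified grammar above (blocktime_to_usec is a hypothetical helper, not
runtime code):

    #include <limits.h>
    #include <stdio.h>
    #include <string.h>

    /* Convert a KMP_BLOCKTIME-style string to microseconds: digits first,
       then an optional "ms" (default) or "us" suffix, or "infinite". */
    static int blocktime_to_usec(const char *s, int max_blocktime) {
      int num = 0;
      while (*s >= '0' && *s <= '9')
        num = num * 10 + (*s++ - '0');
      if (strcmp(s, "us") == 0)
        return num;               /* already microseconds */
      if (strncmp(s, "infini", 6) == 0)
        return max_blocktime;     /* "infinite" / "infinity" */
      return num * 1000;          /* "" or "ms": milliseconds */
    }

    int main(void) {
      printf("%d\n", blocktime_to_usec("200", INT_MAX));   /* 200000 */
      printf("%d\n", blocktime_to_usec("200ms", INT_MAX)); /* 200000 */
      printf("%d\n", blocktime_to_usec("200us", INT_MAX)); /* 200 */
      return 0;
    }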
@@ -768,7 +753,17 @@ static void __kmp_stg_parse_blocktime(char const *name, char const *value,
static void __kmp_stg_print_blocktime(kmp_str_buf_t *buffer, char const *name,
void *data) {
- __kmp_stg_print_int(buffer, name, __kmp_dflt_blocktime);
+ int num = __kmp_dflt_blocktime;
+ if (__kmp_blocktime_units == 'm') {
+ num = num / 1000;
+ }
+ if (__kmp_env_format) {
+ KMP_STR_BUF_PRINT_NAME_EX(name);
+ } else {
+ __kmp_str_buf_print(buffer, " %s=", name);
+ }
+ __kmp_str_buf_print(buffer, "%d", num);
+ __kmp_str_buf_print(buffer, "%cs\n", __kmp_blocktime_units);
} // __kmp_stg_print_blocktime
// -----------------------------------------------------------------------------
@@ -1598,7 +1593,7 @@ static void __kmp_stg_parse_debug(char const *name, char const *value,
static void __kmp_stg_parse_debug_buf(char const *name, char const *value,
void *data) {
__kmp_stg_parse_bool(name, value, &__kmp_debug_buf);
- // !!! TODO: Move buffer initialization of of this file! It may works
+  // !!! TODO: Move buffer initialization out of this file! It may work
// incorrectly if KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or
// KMP_DEBUG_BUF_CHARS.
if (__kmp_debug_buf) {
@@ -2005,6 +2000,21 @@ static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer,
// -----------------------------------------------------------------------------
// KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD
+static inline const char *
+__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) {
+ switch (type) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ return "unknown";
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return "intel_atom";
+ case KMP_HW_CORE_TYPE_CORE:
+ return "intel_core";
+#endif
+ }
+ return "unknown";
+}
+
#if KMP_AFFINITY_SUPPORTED
// Parse the proc id list. Return TRUE if successful, FALSE otherwise.
static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env,
@@ -2359,14 +2369,32 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
buf = next;
- // Try any hardware topology type for granularity
- KMP_FOREACH_HW_TYPE(type) {
- const char *name = __kmp_hw_get_keyword(type);
- if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
- set_gran(type, -1);
- buf = next;
- set = true;
- break;
+ // Have to try core_type and core_efficiency matches first since "core"
+ // will register as core granularity with "extra chars"
+ if (__kmp_match_str("core_type", buf, CCAST(const char **, &next))) {
+ set_gran(KMP_HW_CORE, -1);
+ out_affinity->flags.core_types_gran = 1;
+ buf = next;
+ set = true;
+ } else if (__kmp_match_str("core_efficiency", buf,
+ CCAST(const char **, &next)) ||
+ __kmp_match_str("core_eff", buf,
+ CCAST(const char **, &next))) {
+ set_gran(KMP_HW_CORE, -1);
+ out_affinity->flags.core_effs_gran = 1;
+ buf = next;
+ set = true;
+ }
+ if (!set) {
+ // Try any hardware topology type for granularity
+ KMP_FOREACH_HW_TYPE(type) {
+ const char *name = __kmp_hw_get_keyword(type);
+ if (__kmp_match_str(name, buf, CCAST(const char **, &next))) {
+ set_gran(type, -1);
+ buf = next;
+ set = true;
+ break;
+ }
}
}
if (!set) {
@@ -2626,8 +2654,15 @@ static void __kmp_print_affinity_env(kmp_str_buf_t *buffer, char const *name,
__kmp_str_buf_print(buffer, "%s,", "noreset");
}
}
- __kmp_str_buf_print(buffer, "granularity=%s,",
- __kmp_hw_get_keyword(affinity.gran, false));
+ __kmp_str_buf_print(buffer, "granularity=");
+ if (affinity.flags.core_types_gran)
+ __kmp_str_buf_print(buffer, "core_type,");
+ else if (affinity.flags.core_effs_gran) {
+ __kmp_str_buf_print(buffer, "core_eff,");
+ } else {
+ __kmp_str_buf_print(
+ buffer, "%s,", __kmp_hw_get_keyword(affinity.gran, /*plural=*/false));
+ }
}
if (!KMP_AFFINITY_CAPABLE()) {
__kmp_str_buf_print(buffer, "%s", "disabled");
@@ -2745,11 +2780,7 @@ signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
-// Warning to issue for syntax error during parsing of OMP_PLACES
-static inline void __kmp_omp_places_syntax_warn(const char *var) {
- KMP_WARNING(SyntaxErrorUsing, var, "\"cores\"");
-}
-
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_subplace_list(const char *var, const char **scan) {
const char *next;
@@ -2761,7 +2792,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
//
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2780,7 +2810,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2788,7 +2817,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
// Read count parameter
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2807,7 +2835,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
continue;
}
if (**scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip ':'
@@ -2829,7 +2856,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
}
SKIP_WS(*scan);
if ((**scan < '0') || (**scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = *scan;
@@ -2848,13 +2874,12 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) {
(*scan)++; // skip ','
continue;
}
-
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
}
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_place(const char *var, const char **scan) {
const char *next;
@@ -2866,7 +2891,6 @@ static int __kmp_parse_place(const char *var, const char **scan) {
return FALSE;
}
if (**scan != '}') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
(*scan)++; // skip '}'
@@ -2880,12 +2904,12 @@ static int __kmp_parse_place(const char *var, const char **scan) {
KMP_ASSERT(proc >= 0);
*scan = next;
} else {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
return TRUE;
}
+// Return TRUE if successful parse, FALSE otherwise
static int __kmp_parse_place_list(const char *var, const char *env,
char **place_list) {
const char *scan = env;
@@ -2908,7 +2932,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2916,7 +2939,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
// Read count parameter
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2935,7 +2957,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
if (*scan != ':') {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
scan++; // skip ':'
@@ -2957,7 +2978,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
}
SKIP_WS(scan);
if ((*scan < '0') || (*scan > '9')) {
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
next = scan;
@@ -2977,7 +2997,6 @@ static int __kmp_parse_place_list(const char *var, const char *env,
continue;
}
- __kmp_omp_places_syntax_warn(var);
return FALSE;
}
@@ -2991,6 +3010,22 @@ static int __kmp_parse_place_list(const char *var, const char *env,
return TRUE;
}
+static inline void __kmp_places_set(enum affinity_type type, kmp_hw_t kind) {
+ __kmp_affinity.type = type;
+ __kmp_affinity.gran = kind;
+ __kmp_affinity.flags.dups = FALSE;
+ __kmp_affinity.flags.omp_places = TRUE;
+}
+
+static void __kmp_places_syntax_error_fallback(char const *name,
+ kmp_hw_t kind) {
+ const char *str = __kmp_hw_get_catalog_string(kind, /*plural=*/true);
+ KMP_WARNING(SyntaxErrorUsing, name, str);
+ __kmp_places_set(affinity_compact, kind);
+ if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default)
+ __kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
+}
+
static void __kmp_stg_parse_places(char const *name, char const *value,
void *data) {
struct kmp_place_t {
@@ -3001,7 +3036,6 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
bool set = false;
const char *scan = value;
const char *next = scan;
- const char *kind = "\"threads\"";
kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD},
{"cores", KMP_HW_CORE},
{"numa_domains", KMP_HW_NUMA},
@@ -3020,10 +3054,54 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
const kmp_place_t &place = std_places[i];
if (__kmp_match_str(place.name, scan, &next)) {
scan = next;
- __kmp_affinity.type = affinity_compact;
- __kmp_affinity.gran = place.type;
- __kmp_affinity.flags.dups = FALSE;
+ __kmp_places_set(affinity_compact, place.type);
set = true;
+ // Parse core attribute if it exists
+ if (KMP_HW_MAX_NUM_CORE_TYPES > 1) {
+ SKIP_WS(scan);
+ if (*scan == ':') {
+ if (place.type != KMP_HW_CORE) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ scan++; // skip ':'
+ SKIP_WS(scan);
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if (__kmp_match_str("intel_core", scan, &next)) {
+ __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_CORE;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ } else if (__kmp_match_str("intel_atom", scan, &next)) {
+ __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_ATOM;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ } else
+#endif
+ if (__kmp_match_str("eff", scan, &next)) {
+ int eff;
+ if (!isdigit(*next)) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ scan = next;
+ SKIP_DIGITS(next);
+ eff = __kmp_str_to_int(scan, *next);
+ if (eff < 0) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ if (eff >= KMP_HW_MAX_NUM_CORE_EFFS)
+ eff = KMP_HW_MAX_NUM_CORE_EFFS - 1;
+ __kmp_affinity.core_attr_gran.core_eff = eff;
+ __kmp_affinity.core_attr_gran.valid = 1;
+ scan = next;
+ }
+ if (!__kmp_affinity.core_attr_gran.valid) {
+ __kmp_places_syntax_error_fallback(name, place.type);
+ return;
+ }
+ }
+ }
break;
}
}
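
Note: with the attribute parsing added here, OMP_PLACES values such as
"cores:intel_core" and "cores:intel_atom" (both x86 only) or "cores:eff0"
are accepted in addition to the plain keywords; an attribute on any place
other than cores, or an unrecognized attribute, goes through
__kmp_places_syntax_error_fallback, and efficiency values are clamped to
KMP_HW_MAX_NUM_CORE_EFFS - 1.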
@@ -3035,36 +3113,56 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
continue;
if (__kmp_match_str(name, scan, &next)) {
scan = next;
- __kmp_affinity.type = affinity_compact;
- __kmp_affinity.gran = type;
- __kmp_affinity.flags.dups = FALSE;
+ __kmp_places_set(affinity_compact, type);
set = true;
break;
}
}
}
+ // Implementation choices for OMP_PLACES based on core attributes
+ if (!set) {
+ if (__kmp_match_str("core_types", scan, &next)) {
+ scan = next;
+ if (*scan != '\0') {
+ KMP_WARNING(ParseExtraCharsWarn, name, scan);
+ }
+ __kmp_places_set(affinity_compact, KMP_HW_CORE);
+ __kmp_affinity.flags.core_types_gran = 1;
+ set = true;
+ } else if (__kmp_match_str("core_effs", scan, &next) ||
+ __kmp_match_str("core_efficiencies", scan, &next)) {
+ scan = next;
+ if (*scan != '\0') {
+ KMP_WARNING(ParseExtraCharsWarn, name, scan);
+ }
+ __kmp_places_set(affinity_compact, KMP_HW_CORE);
+ __kmp_affinity.flags.core_effs_gran = 1;
+ set = true;
+ }
+ }
+ // Explicit place list
if (!set) {
if (__kmp_affinity.proclist != NULL) {
KMP_INTERNAL_FREE((void *)__kmp_affinity.proclist);
__kmp_affinity.proclist = NULL;
}
if (__kmp_parse_place_list(name, value, &__kmp_affinity.proclist)) {
- __kmp_affinity.type = affinity_explicit;
- __kmp_affinity.gran = KMP_HW_THREAD;
- __kmp_affinity.flags.dups = FALSE;
+ __kmp_places_set(affinity_explicit, KMP_HW_THREAD);
} else {
// Syntax error fallback
- __kmp_affinity.type = affinity_compact;
- __kmp_affinity.gran = KMP_HW_CORE;
- __kmp_affinity.flags.dups = FALSE;
+ __kmp_places_syntax_error_fallback(name, KMP_HW_CORE);
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
__kmp_nested_proc_bind.bind_types[0] = proc_bind_true;
}
return;
}
+
+ kmp_hw_t gran = __kmp_affinity.gran;
if (__kmp_affinity.gran != KMP_HW_UNKNOWN) {
- kind = __kmp_hw_get_keyword(__kmp_affinity.gran);
+ gran = __kmp_affinity.gran;
+ } else {
+ gran = KMP_HW_CORE;
}
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) {
@@ -3078,7 +3176,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
// Parse option count parameter in parentheses
if (*scan != '(') {
- KMP_WARNING(SyntaxErrorUsing, name, kind);
+ __kmp_places_syntax_error_fallback(name, gran);
return;
}
scan++; // skip '('
@@ -3092,7 +3190,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
SKIP_WS(scan);
if (*scan != ')') {
- KMP_WARNING(SyntaxErrorUsing, name, kind);
+ __kmp_places_syntax_error_fallback(name, gran);
return;
}
scan++; // skip ')'
@@ -3135,12 +3233,37 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name,
num = 0;
}
if (gran != KMP_HW_UNKNOWN) {
+ // If core_types or core_effs, just print and return
+ if (__kmp_affinity.flags.core_types_gran) {
+ __kmp_str_buf_print(buffer, "='%s'\n", "core_types");
+ return;
+ }
+ if (__kmp_affinity.flags.core_effs_gran) {
+ __kmp_str_buf_print(buffer, "='%s'\n", "core_effs");
+ return;
+ }
+
+ // threads, cores, sockets, cores:<attribute>, etc.
const char *name = __kmp_hw_get_keyword(gran, true);
- if (num > 0) {
- __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num);
- } else {
- __kmp_str_buf_print(buffer, "='%s'\n", name);
+ __kmp_str_buf_print(buffer, "='%s", name);
+
+  // Add core attributes if they exist
+  if (__kmp_affinity.core_attr_gran.valid) {
+    kmp_hw_core_type_t ct =
+        (kmp_hw_core_type_t)__kmp_affinity.core_attr_gran.core_type;
+    int eff = __kmp_affinity.core_attr_gran.core_eff;
+    if (ct != KMP_HW_CORE_TYPE_UNKNOWN) {
+      const char *ct_name = __kmp_hw_get_core_type_keyword(ct);
+      __kmp_str_buf_print(buffer, ":%s", ct_name);
+    } else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) {
+      __kmp_str_buf_print(buffer, ":eff%d", eff);
+ }
}
+
+ // Add the '(#)' part if it exists
+ if (num > 0)
+ __kmp_str_buf_print(buffer, "(%d)", num);
+ __kmp_str_buf_print(buffer, "'\n");
} else {
__kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined));
}
@@ -5139,21 +5262,6 @@ err:
return;
}
-static inline const char *
-__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) {
- switch (type) {
- case KMP_HW_CORE_TYPE_UNKNOWN:
- return "unknown";
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- case KMP_HW_CORE_TYPE_ATOM:
- return "intel_atom";
- case KMP_HW_CORE_TYPE_CORE:
- return "intel_core";
-#endif
- }
- return "unknown";
-}
-
static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
void *data) {
kmp_str_buf_t buf;
@@ -5984,7 +6092,13 @@ static void __kmp_aux_env_initialize(kmp_env_blk_t *block) {
/* KMP_BLOCKTIME */
value = __kmp_env_blk_var(block, "KMP_BLOCKTIME");
if (value) {
- kmpc_set_blocktime(__kmp_dflt_blocktime);
+ int gtid, tid;
+ kmp_info_t *thread;
+
+ gtid = __kmp_entry_gtid();
+ tid = __kmp_tid_from_gtid(gtid);
+ thread = __kmp_thread_from_gtid(gtid);
+ __kmp_aux_set_blocktime(__kmp_dflt_blocktime, thread, tid);
}
/* OMP_NESTED */
diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h
index f63f105940ef..92bbcff52419 100644
--- a/openmp/runtime/src/kmp_settings.h
+++ b/openmp/runtime/src/kmp_settings.h
@@ -24,7 +24,6 @@ void __kmp_env_dump();
int __kmp_initial_threads_capacity(int req_nproc);
void __kmp_init_dflt_team_nth();
-int __kmp_convert_to_milliseconds(char const *);
int __kmp_default_tp_capacity(int, int, int);
#if KMP_MIC
diff --git a/openmp/runtime/src/kmp_str.cpp b/openmp/runtime/src/kmp_str.cpp
index 4cba56964a09..6ee2df724487 100644
--- a/openmp/runtime/src/kmp_str.cpp
+++ b/openmp/runtime/src/kmp_str.cpp
@@ -619,6 +619,21 @@ char *__kmp_str_token(
return token;
} // __kmp_str_token
+int __kmp_basic_str_to_int(char const *str) {
+ int result;
+ char const *t;
+
+ result = 0;
+
+ for (t = str; *t != '\0'; ++t) {
+ if (*t < '0' || *t > '9')
+ break;
+ result = (result * 10) + (*t - '0');
+ }
+
+ return result;
+}
+
int __kmp_str_to_int(char const *str, char sentinel) {
int result, factor;
char const *t;
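
For reference, a small standalone check of __kmp_basic_str_to_int as added
above: it accumulates leading decimal digits and stops at the first
non-digit, with no sign, whitespace, or overflow handling (the helper below
copies the hunk's logic; main is illustrative only):

    #include <stdio.h>

    /* Same logic as __kmp_basic_str_to_int above. */
    static int basic_str_to_int(char const *str) {
      int result = 0;
      for (char const *t = str; *t != '\0'; ++t) {
        if (*t < '0' || *t > '9')
          break;
        result = (result * 10) + (*t - '0');
      }
      return result;
    }

    int main(void) {
      printf("%d\n", basic_str_to_int("200ms")); /* 200 */
      printf("%d\n", basic_str_to_int("42"));    /* 42 */
      printf("%d\n", basic_str_to_int("x9"));    /* 0 */
      return 0;
    }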
diff --git a/openmp/runtime/src/kmp_str.h b/openmp/runtime/src/kmp_str.h
index 855b5df55d69..11f633cd8024 100644
--- a/openmp/runtime/src/kmp_str.h
+++ b/openmp/runtime/src/kmp_str.h
@@ -112,6 +112,7 @@ int __kmp_str_match_true(char const *data);
void __kmp_str_replace(char *str, char search_for, char replace_with);
void __kmp_str_split(char *str, char delim, char **head, char **tail);
char *__kmp_str_token(char *str, char const *delim, char **buf);
+int __kmp_basic_str_to_int(char const *str);
int __kmp_str_to_int(char const *str, char sentinel);
void __kmp_str_to_size(char const *str, size_t *out, size_t dfactor,
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 3b39f5039736..f7529481393f 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -284,6 +284,16 @@ static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
}
+kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ return td->td_depnode ? &(td->td_depnode->dn) : NULL;
+}
+
+kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task) {
+ kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
+ return td->td_depnode->dn.successors;
+}
+
static inline kmp_int32
__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
kmp_task_t *task, kmp_depnode_t *node,
@@ -307,16 +317,18 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
if (dep->dn.task) {
KMP_ACQUIRE_DEPNODE(gtid, dep);
if (dep->dn.task) {
+ if (!dep->dn.successors || dep->dn.successors->node != node) {
#if OMPX_TASKGRAPH
- if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
#endif
- __kmp_track_dependence(gtid, dep, node, task);
- dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
- KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
- "%p\n",
- gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
- KMP_TASK_TO_TASKDATA(task)));
- npredecessors++;
+ __kmp_track_dependence(gtid, dep, node, task);
+ dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
+ KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+ "%p\n",
+ gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
+ KMP_TASK_TO_TASKDATA(task)));
+ npredecessors++;
+ }
}
KMP_RELEASE_DEPNODE(gtid, dep);
}
@@ -324,6 +336,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
return npredecessors;
}
+// Add the edge 'sink' -> 'source' in the task dependency graph
static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
kmp_info_t *thread,
kmp_task_t *task,
@@ -346,29 +359,31 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
  // synchronously add source to sink's list of successors
KMP_ACQUIRE_DEPNODE(gtid, sink);
if (sink->dn.task) {
+ if (!sink->dn.successors || sink->dn.successors->node != source) {
#if OMPX_TASKGRAPH
- if (!(__kmp_tdg_is_recording(tdg_status)) && task)
+ if (!(__kmp_tdg_is_recording(tdg_status)) && task)
#endif
- __kmp_track_dependence(gtid, sink, source, task);
- sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
- KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
+ __kmp_track_dependence(gtid, sink, source, task);
+ sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
+ KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
"%p\n",
gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
KMP_TASK_TO_TASKDATA(task)));
#if OMPX_TASKGRAPH
- if (__kmp_tdg_is_recording(tdg_status)) {
- kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
- if (tdd->is_taskgraph) {
- if (tdd->td_flags.onced)
- // decrement npredecessors if sink->dn.task belongs to a taskgraph
- // and
- // 1) the task is reset to its initial state (by kmp_free_task) or
- // 2) the task is complete but not yet reset
- npredecessors--;
+ if (__kmp_tdg_is_recording(tdg_status)) {
+ kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
+ if (tdd->is_taskgraph) {
+ if (tdd->td_flags.onced)
+ // decrement npredecessors if sink->dn.task belongs to a taskgraph
+ // and
+ // 1) the task is reset to its initial state (by kmp_free_task) or
+ // 2) the task is complete but not yet reset
+ npredecessors--;
+ }
}
- }
#endif
npredecessors++;
+ }
}
KMP_RELEASE_DEPNODE(gtid, sink);
}
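
Note: the guards added in both overloads skip re-linking when the head of
the sink's successor list is already the node being linked, so back-to-back
duplicate dependences add only one edge. A minimal sketch of that test, with
hypothetical types standing in for kmp_depnode_t and kmp_depnode_list_t:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for kmp_depnode_t and kmp_depnode_list_t. */
    typedef struct node { int id; } node_t;
    typedef struct list { node_t *node; struct list *next; } list_t;

    /* Mirrors the new "!successors || successors->node != node" guard:
       only link when the newest recorded successor is not this node. */
    static bool should_link(const list_t *successors, const node_t *n) {
      return successors == NULL || successors->node != n;
    }

    int main(void) {
      node_t a = {1}, b = {2};
      list_t head = {&a, NULL};
      printf("%d\n", should_link(NULL, &a));  /* 1: empty list, link */
      printf("%d\n", should_link(&head, &a)); /* 0: duplicate newest edge */
      printf("%d\n", should_link(&head, &b)); /* 1: different node, link */
      return 0;
    }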
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index fefa609927e8..6e8b948efa06 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -839,6 +839,14 @@ static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
+#ifdef __s390x__
+// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
+// In order for it to work correctly, the caller also needs to be compiled with
+// backchain. If a caller is compiled without backchain,
+// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
+// crash.
+__attribute__((target("backchain")))
+#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *task) {
#if OMPT_SUPPORT
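
Note: OMPT_GET_FRAME_ADDRESS(level) is built on __builtin_frame_address(level)
in this runtime, and on s390x walking one frame up only works if the
backchain was stored. A minimal sketch of the pattern (capture_parent_frame
is hypothetical; build callers with -mbackchain for a correct value):

    #include <stdio.h>

    #ifdef __s390x__
    /* Same attribute as in the hunk above; irrelevant on other targets. */
    __attribute__((target("backchain")))
    #endif
    __attribute__((noinline)) static void capture_parent_frame(void **out) {
      /* Like OMPT_GET_FRAME_ADDRESS(1): the caller's frame address. */
      *out = __builtin_frame_address(1);
    }

    int main(void) {
      void *frame = NULL;
      capture_parent_frame(&frame);
      printf("caller frame: %p\n", frame);
      return 0;
    }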
@@ -1554,7 +1562,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
task = KMP_TASKDATA_TO_TASK(taskdata);
// Make sure task & taskdata are aligned appropriately
-#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
+#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
@@ -1737,8 +1745,12 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
-static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
- kmp_taskdata_t *current_task) {
+#ifdef __s390x__
+__attribute__((target("backchain")))
+#endif
+static void
+__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
+ kmp_taskdata_t *current_task) {
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
kmp_info_t *thread;
int discard = 0 /* false */;
@@ -2512,7 +2524,7 @@ void *__kmp_task_reduction_init(int gtid, int num, T *data) {
KMP_ASSERT(tg != NULL);
KMP_ASSERT(data != NULL);
KMP_ASSERT(num > 0);
- if (nth == 1) {
+ if (nth == 1 && !__kmp_enable_hidden_helper) {
KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
gtid, tg));
return (void *)tg;
@@ -2699,6 +2711,7 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
return p_priv[tid];
}
}
+ KMP_ASSERT(tg->parent);
tg = tg->parent;
arr = (kmp_taskred_data_t *)(tg->reduce_data);
num = tg->reduce_num_data;
@@ -2711,7 +2724,10 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
kmp_int32 nth = th->th.th_team_nproc;
- KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
+ KMP_DEBUG_ASSERT(
+ nth > 1 ||
+ __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
+ // are using hidden helper threads
kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
kmp_int32 num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
diff --git a/openmp/runtime/src/kmp_wrapper_getpid.h b/openmp/runtime/src/kmp_wrapper_getpid.h
index 32ede3ed715b..f9d7f4804fbc 100644
--- a/openmp/runtime/src/kmp_wrapper_getpid.h
+++ b/openmp/runtime/src/kmp_wrapper_getpid.h
@@ -23,14 +23,14 @@
#if KMP_OS_DARWIN
// OS X
#define __kmp_gettid() pthread_mach_thread_np(pthread_self())
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread_np.h>
#define __kmp_gettid() pthread_getthreadid_np()
#elif KMP_OS_NETBSD
#include <lwp.h>
#define __kmp_gettid() _lwp_self()
#elif KMP_OS_OPENBSD
-#define __kmp_gettid() syscall(SYS_getthrid)
+#define __kmp_gettid() getthrid()
#elif defined(SYS_gettid)
// Hopefully other Unix systems define SYS_gettid syscall for getting os thread
// id
diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h
index 5ac7f6d1e4e6..7736ba853163 100644
--- a/openmp/runtime/src/ompt-event-specific.h
+++ b/openmp/runtime/src/ompt-event-specific.h
@@ -55,13 +55,12 @@
#define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS
-#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_emi_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_data_op_emi_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED
-#define ompt_callback_target_submit_emi_implemented ompt_event_UNIMPLEMENTED
-
+#define ompt_callback_target_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_emi_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_data_op_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_data_op_emi_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_submit_implemented ompt_event_MAY_ALWAYS
+#define ompt_callback_target_submit_emi_implemented ompt_event_MAY_ALWAYS
#define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS
#define ompt_callback_device_initialize_implemented ompt_event_MAY_ALWAYS
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index 54edd6e6af7c..9743f35d2c4f 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -463,6 +463,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
}
int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
+ *size = 0;
if (blocknum != 0)
return 0; // support only a single block
@@ -471,27 +472,13 @@ int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) {
return 0;
kmp_taskdata_t *taskdata = thr->th.th_current_task;
- kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
if (taskdata->td_flags.tasktype != TASK_EXPLICIT)
return 0; // support only explicit task
- void *ret_addr;
- int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t);
-
- // kmp_task_t->data1 is an optional member
- if (taskdata->td_flags.destructors_thunk)
- ret_addr = &task->data1 + 1;
- else
- ret_addr = &task->part_id + 1;
-
- ret_size -= (char *)(ret_addr) - (char *)(task);
- if (ret_size < 0)
- return 0;
-
- *addr = ret_addr;
- *size = (size_t)ret_size;
- return 1;
+ *addr = taskdata;
+ *size = taskdata->td_size_alloc;
+ return 0;
}
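
Note: ompt_get_task_memory now reports the whole kmp_taskdata_t allocation
(the taskdata base pointer plus td_size_alloc) instead of carving out the
payload past part_id/data1, and the final return value of 0 indicates that
no block follows block 0.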
//----------------------------------------------------------
diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
index a452b7643bdb..bd3fd9b43e57 100644
--- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
+++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h
@@ -162,6 +162,14 @@
#define ITT_ARCH_ARM64 6
#endif /* ITT_ARCH_ARM64 */
+#ifndef ITT_ARCH_VE
+#define ITT_ARCH_VE 8
+#endif /* ITT_ARCH_VE */
+
+#ifndef ITT_ARCH_S390X
+#define ITT_ARCH_S390X 8
+#endif /* ITT_ARCH_S390X */
+
#ifndef ITT_ARCH
#if defined _M_IX86 || defined __i386__
#define ITT_ARCH ITT_ARCH_IA32
@@ -175,6 +183,10 @@
#define ITT_ARCH ITT_ARCH_ARM64
#elif defined __powerpc64__
#define ITT_ARCH ITT_ARCH_PPC64
+#elif defined __ve__
+#define ITT_ARCH ITT_ARCH_VE
+#elif defined __s390x__
+#define ITT_ARCH ITT_ARCH_S390X
#endif
#endif
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 27b063f09e7a..a72705528d41 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -2060,6 +2060,351 @@ __kmp_invoke_microtask:
#endif /* KMP_ARCH_LOONGARCH64 */
+#if KMP_ARCH_VE
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+// void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+// return 1;
+// }
+//
+// Parameters:
+// s0: pkfn
+// s1: gtid
+// s2: tid
+// s3: argc
+// s4: p_argv
+// s5: exit_frame_ptr
+//
+// Locals:
+// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
+// __tid: tid param pushed on stack so can pass &tid to pkfn
+//
+// Temp. registers:
+//
+// s34: used to calculate the dynamic stack size
+// s35: used as temporary for stack placement calculation
+// s36: used as temporary for stack arguments
+// s37: used as temporary for number of remaining pkfn parms
+// s38: used to traverse p_argv array
+//
+// return: s0 (always 1/TRUE)
+//
+
+__gtid = -4
+__tid = -8
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ .globl __kmp_invoke_microtask
+  // A function requires 8-byte alignment.
+ .p2align 3
+ .type __kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+ .cfi_startproc
+
+  // First, save fp and lr. VE stores them in the caller's stack frame.
+ st %fp, 0(, %sp)
+ st %lr, 8(, %sp)
+ or %fp, 0, %sp
+ .cfi_def_cfa %fp, 0
+ .cfi_offset %lr, 8
+ .cfi_offset %fp, 0
+
+ // Compute the dynamic stack size:
+ //
+ // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them
+ // by reference
+  // - We need 8 bytes for each argument.  We have 'argc' + 2 arguments
+  //   (consider &gtid and &tid), so we need to reserve (argc + 2) * 8
+  //   bytes.
+ // - We need 176 bytes for RSA and others
+ //
+ // The total number of bytes is then (argc + 2) * 8 + 8 + 176.
+ //
+ // |------------------------------|
+ // | return address of callee | 8(%fp)
+ // |------------------------------|
+ // | frame pointer of callee | 0(%fp)
+ // |------------------------------| <------------------ %fp
+ // | __tid / __gtid | -8(%fp) / -4(%fp)
+ // |------------------------------|
+ // | argc+2 for arguments | 176(%sp)
+ // |------------------------------|
+ // | RSA |
+ // |------------------------------|
+ // | return address |
+ // |------------------------------|
+ // | frame pointer |
+ // |------------------------------| <------------------ %sp
+
+ adds.w.sx %s34, 2, %s3
+ sll %s34, %s34, 3
+ lea %s34, 184(, %s34)
+ subs.l %sp, %sp, %s34
+
+ // Align the stack to 16 bytes.
+ and %sp, -16, %sp
+
+ // Save pkfn.
+ or %s12, 0, %s0
+
+  // Call host to allocate stack if necessary.
+ brge.l %sp, %sl, .L_kmp_pass
+ ld %s61, 24(, %tp)
+ lea %s63, 0x13b
+ shm.l %s63, 0(%s61)
+ shm.l %sl, 8(%s61)
+ shm.l %sp, 16(%s61)
+ monc
+
+.L_kmp_pass:
+ lea %s35, 176(, %sp)
+ adds.w.sx %s37, 0, %s3
+ or %s38, 0, %s4
+
+#if OMPT_SUPPORT
+ // Save frame pointer into exit_frame.
+ st %fp, 0(%s5)
+#endif
+
+  // Prepare arguments for the pkfn function (first 8 in registers s0-s7;
+  // they are also stored on the stack because of varargs).
+
+ stl %s1, __gtid(%fp)
+ stl %s2, __tid(%fp)
+
+ adds.l %s0, __gtid, %fp
+ st %s0, 0(, %s35)
+ adds.l %s1, __tid, %fp
+ st %s1, 8(, %s35)
+
+ breq.l 0, %s37, .L_kmp_call
+ ld %s2, 0(, %s38)
+ st %s2, 16(, %s35)
+
+ breq.l 1, %s37, .L_kmp_call
+ ld %s3, 8(, %s38)
+ st %s3, 24(, %s35)
+
+ breq.l 2, %s37, .L_kmp_call
+ ld %s4, 16(, %s38)
+ st %s4, 32(, %s35)
+
+ breq.l 3, %s37, .L_kmp_call
+ ld %s5, 24(, %s38)
+ st %s5, 40(, %s35)
+
+ breq.l 4, %s37, .L_kmp_call
+ ld %s6, 32(, %s38)
+ st %s6, 48(, %s35)
+
+ breq.l 5, %s37, .L_kmp_call
+ ld %s7, 40(, %s38)
+ st %s7, 56(, %s35)
+
+ breq.l 6, %s37, .L_kmp_call
+
+  // Prepare any additional arguments passed on the stack.
+ adds.l %s37, -6, %s37
+ lea %s38, 48(, %s38)
+ lea %s35, 64(, %s35)
+.L_kmp_loop:
+ ld %s36, 0(, %s38)
+ st %s36, 0(, %s35)
+ adds.l %s37, -1, %s37
+ adds.l %s38, 8, %s38
+ adds.l %s35, 8, %s35
+ brne.l 0, %s37, .L_kmp_loop
+
+.L_kmp_call:
+ // Call pkfn function.
+ bsic %lr, (, %s12)
+
+ // Return value.
+ lea %s0, 1
+
+ // Restore stack and return.
+ or %sp, 0, %fp
+ ld %lr, 8(, %sp)
+ ld %fp, 0(, %sp)
+ b.l.t (, %lr)
+.Lfunc_end0:
+ .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+ .cfi_endproc
+
+// -- End __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_VE */
+
+#if KMP_ARCH_S390X
+
+//------------------------------------------------------------------------
+//
+// typedef void (*microtask_t)(int *gtid, int *tid, ...);
+//
+// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
+// void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)(&gtid, &tid, argv[0], ...);
+//
+// return 1;
+// }
+//
+// Parameters:
+// r2: pkfn
+// r3: gtid
+// r4: tid
+// r5: argc
+// r6: p_argv
+// SP+160: exit_frame_ptr
+//
+// Locals:
+// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
+// __tid: tid param pushed on stack so can pass &tid to pkfn
+//
+// Temp. registers:
+//
+// r0: used to fetch argv slots
+// r7: used as temporary for number of remaining pkfn parms
+// r8: argv
+// r9: pkfn
+// r10: stack size
+// r11: previous fp
+// r12: stack parameter area
+// r13: argv slot
+//
+// return: r2 (always 1/TRUE)
+//
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+ .text
+ .globl __kmp_invoke_microtask
+ .p2align 1
+ .type __kmp_invoke_microtask,@function
+__kmp_invoke_microtask:
+ .cfi_startproc
+
+ stmg %r6,%r14,48(%r15)
+ .cfi_offset %r6, -112
+ .cfi_offset %r7, -104
+ .cfi_offset %r8, -96
+ .cfi_offset %r9, -88
+ .cfi_offset %r10, -80
+ .cfi_offset %r11, -72
+ .cfi_offset %r12, -64
+ .cfi_offset %r13, -56
+ .cfi_offset %r14, -48
+ .cfi_offset %r15, -40
+ lgr %r11,%r15
+ .cfi_def_cfa %r11, 160
+
+ // Compute the dynamic stack size:
+ //
+ // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
+ // reference
+ // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
+ // function by register. Given that we have 5 of such registers (r[2-6])
+ // and two + 'argc' arguments (consider &gtid and &tid), we need to
+ // reserve max(0, argc - 3)*8 extra bytes
+ //
+ // The total number of bytes is then max(0, argc - 3)*8 + 8
+
+ lgr %r10,%r5
+ aghi %r10,-2
+ jnm 0f
+ lghi %r10,0
+0:
+ sllg %r10,%r10,3
+ lgr %r12,%r10
+ aghi %r10,176
+ sgr %r15,%r10
+ agr %r12,%r15
+ stg %r11,0(%r15)
+
+ lgr %r9,%r2 // pkfn
+
+#if OMPT_SUPPORT
+ // Save frame pointer into exit_frame
+ lg %r8,160(%r11)
+ stg %r11,0(%r8)
+#endif
+
+ // Prepare arguments for the pkfn function (first 5 using r2-r6 registers)
+
+ stg %r3,160(%r12)
+  la %r2,164(%r12) // gtid
+ stg %r4,168(%r12)
+ la %r3,172(%r12) // tid
+ lgr %r8,%r6 // argv
+
+ // If argc > 0
+ ltgr %r7,%r5
+ jz 1f
+
+ lg %r4,0(%r8) // argv[0]
+ aghi %r7,-1
+ jz 1f
+
+ // If argc > 1
+ lg %r5,8(%r8) // argv[1]
+ aghi %r7,-1
+ jz 1f
+
+ // If argc > 2
+ lg %r6,16(%r8) // argv[2]
+ aghi %r7,-1
+ jz 1f
+
+ lghi %r13,0 // Index [n]
+2:
+ lg %r0,24(%r13,%r8) // argv[2+n]
+ stg %r0,160(%r13,%r15) // parm[2+n]
+ aghi %r13,8 // Next
+ aghi %r7,-1
+ jnz 2b
+
+1:
+ basr %r14,%r9 // Call pkfn
+
+ // Restore stack and return
+
+ lgr %r15,%r11
+ lmg %r6,%r14,48(%r15)
+ lghi %r2,1
+ br %r14
+.Lfunc_end0:
+ .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
+ .cfi_endproc
+
+// -- End __kmp_invoke_microtask
+
+#endif /* KMP_ARCH_S390X */
+
#if KMP_ARCH_ARM || KMP_ARCH_MIPS
.data
COMMON .gomp_critical_user_, 32, 3
@@ -2073,7 +2418,9 @@ __kmp_unnamed_critical_addr:
#endif
#endif /* KMP_ARCH_ARM */
-#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
+#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \
+ KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || \
+ KMP_ARCH_S390X
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
@@ -2088,7 +2435,8 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
.size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
#endif
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
- KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 */
+ KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE ||
+ KMP_ARCH_S390X */
#if KMP_OS_LINUX
# if KMP_ARCH_ARM || KMP_ARCH_AARCH64
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 260b982af200..72da0f79865d 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -60,6 +60,8 @@
#elif KMP_OS_NETBSD || KMP_OS_OPENBSD
#include <sys/types.h>
#include <sys/sysctl.h>
+#elif KMP_OS_SOLARIS
+#include <sys/loadavg.h>
#endif
#include <ctype.h>
@@ -70,6 +72,15 @@ struct kmp_sys_timer {
struct timespec start;
};
+#if KMP_OS_SOLARIS
+// Convert timeval to timespec.
+#define TIMEVAL_TO_TIMESPEC(tv, ts) \
+ do { \
+ (ts)->tv_sec = (tv)->tv_sec; \
+ (ts)->tv_nsec = (tv)->tv_usec * 1000; \
+ } while (0)
+#endif
+
// Convert timespec to nanoseconds.
#define TS2NS(timespec) \
(((timespec).tv_sec * (long int)1e9) + (timespec).tv_nsec)
@@ -93,6 +104,7 @@ static kmp_cond_align_t __kmp_wait_cv;
static kmp_mutex_align_t __kmp_wait_mx;
kmp_uint64 __kmp_ticks_per_msec = 1000000;
+kmp_uint64 __kmp_ticks_per_usec = 1000;
#ifdef DEBUG_SUSPEND
static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) {
@@ -408,7 +420,7 @@ void __kmp_terminate_thread(int gtid) {
static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
int stack_data;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_HURD
+ KMP_OS_HURD || KMP_OS_SOLARIS
pthread_attr_t attr;
int status;
size_t size = 0;
@@ -447,7 +459,7 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
return TRUE;
}
#endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD \
- || KMP_OS_HURD */
+ || KMP_OS_HURD || KMP_OS_SOLARIS */
/* Use incremental refinement starting from initial conservative estimate */
TCW_PTR(th->th.th_info.ds.ds_stacksize, 0);
TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data);
@@ -462,7 +474,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
void *exit_val;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS
void *volatile padding = 0;
#endif
int gtid;
@@ -485,7 +497,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* USE_ITT_BUILD */
#if KMP_AFFINITY_SUPPORTED
- __kmp_affinity_set_init_mask(gtid, FALSE);
+ __kmp_affinity_bind_init_mask(gtid);
#endif
#ifdef KMP_CANCEL_THREADS
@@ -511,7 +523,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS
if (__kmp_stkoffset > 0 && gtid > 0) {
padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
(void)padding;
@@ -1242,6 +1254,7 @@ static void __kmp_atfork_child(void) {
*affinity = KMP_AFFINITY_INIT(affinity->env_var);
__kmp_affin_fullMask = nullptr;
__kmp_affin_origMask = nullptr;
+ __kmp_topology = nullptr;
#endif // KMP_AFFINITY_SUPPORTED
#if KMP_USE_MONITOR
@@ -1811,7 +1824,7 @@ static int __kmp_get_xproc(void) {
__kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r));
#elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \
- KMP_OS_HURD
+ KMP_OS_HURD || KMP_OS_SOLARIS
__kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r));
@@ -1892,6 +1905,13 @@ void __kmp_runtime_initialize(void) {
/* Query the maximum number of threads */
__kmp_type_convert(sysconf(_SC_THREAD_THREADS_MAX), &(__kmp_sys_max_nth));
+#ifdef __ve__
+ if (__kmp_sys_max_nth == -1) {
+    // VE's pthread supports only up to 64 threads per VE process,
+    // so we use KMP_MAX_NTH (predefined as 64) here.
+ __kmp_sys_max_nth = KMP_MAX_NTH;
+ }
+#else
if (__kmp_sys_max_nth == -1) {
/* Unlimited threads for NPTL */
__kmp_sys_max_nth = INT_MAX;
@@ -1899,6 +1919,7 @@ void __kmp_runtime_initialize(void) {
/* Can't tell, just use PTHREAD_THREADS_MAX */
__kmp_sys_max_nth = KMP_MAX_NTH;
}
+#endif
/* Query the minimum stack size */
__kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN);
@@ -2001,7 +2022,7 @@ kmp_uint64 __kmp_now_nsec() {
/* Measure clock ticks per millisecond */
void __kmp_initialize_system_tick() {
kmp_uint64 now, nsec2, diff;
- kmp_uint64 delay = 100000; // 50~100 usec on most machines.
+ kmp_uint64 delay = 1000000; // ~450 usec on most machines.
kmp_uint64 nsec = __kmp_now_nsec();
kmp_uint64 goal = __kmp_hardware_timestamp() + delay;
while ((now = __kmp_hardware_timestamp()) < goal)
@@ -2009,9 +2030,11 @@ void __kmp_initialize_system_tick() {
nsec2 = __kmp_now_nsec();
diff = nsec2 - nsec;
if (diff > 0) {
- kmp_uint64 tpms = ((kmp_uint64)1e6 * (delay + (now - goal)) / diff);
- if (tpms > 0)
- __kmp_ticks_per_msec = tpms;
+ double tpus = 1000.0 * (double)(delay + (now - goal)) / (double)diff;
+ if (tpus > 0.0) {
+ __kmp_ticks_per_msec = (kmp_uint64)(tpus * 1000.0);
+ __kmp_ticks_per_usec = (kmp_uint64)tpus;
+ }
}
}
#endif
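
A worked example of the calibration above, with illustrative numbers: with
delay = 1000000 timestamp ticks and a measured wall-clock diff of 450000 ns
(the "~450 usec" in the comment), and ignoring the small overshoot
now - goal, tpus = 1000.0 * 1000000 / 450000 ≈ 2222, so __kmp_ticks_per_usec
becomes 2222 and __kmp_ticks_per_msec becomes 2222222.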
@@ -2177,9 +2200,9 @@ int __kmp_is_address_mapped(void *addr) {
}
kiv.kve_start += 1;
}
-#elif KMP_OS_DRAGONFLY
+#elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS
- // FIXME(DragonFly): Implement this
+ // FIXME(DragonFly, Solaris): Implement this
found = 1;
#else
@@ -2194,7 +2217,8 @@ int __kmp_is_address_mapped(void *addr) {
#ifdef USE_LOAD_BALANCE
-#if KMP_OS_DARWIN || KMP_OS_NETBSD
+#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_OPENBSD || KMP_OS_SOLARIS
// The function returns the rounded value of the system load average
// during given time interval which depends on the value of
@@ -2452,7 +2476,7 @@ finish: // Clean up and exit.
#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
- KMP_ARCH_ARM)
+ KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X)
// we really only need the case with 1 argument, because CLANG always builds
// a struct of pointers to shared variables referenced in the outlined function
@@ -2740,4 +2764,28 @@ void __kmp_hidden_helper_threads_deinitz_release() {
}
#endif // KMP_OS_LINUX
+bool __kmp_detect_shm() {
+ DIR *dir = opendir("/dev/shm");
+ if (dir) { // /dev/shm exists
+ closedir(dir);
+ return true;
+ } else if (ENOENT == errno) { // /dev/shm does not exist
+ return false;
+ } else { // opendir() failed
+ return false;
+ }
+}
+
+bool __kmp_detect_tmp() {
+ DIR *dir = opendir("/tmp");
+ if (dir) { // /tmp exists
+ closedir(dir);
+ return true;
+ } else if (ENOENT == errno) { // /tmp does not exist
+ return false;
+ } else { // opendir() failed
+ return false;
+ }
+}
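
Note: both helpers use the same opendir() probe, and any failure, ENOENT or
otherwise, counts as "not available". A standalone sketch of the idiom:

    #include <dirent.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* A directory is "available" only if opendir() succeeds, mirroring
       __kmp_detect_shm / __kmp_detect_tmp above. */
    static bool dir_available(const char *path) {
      DIR *dir = opendir(path);
      if (dir) {
        closedir(dir);
        return true;
      }
      return false; /* ENOENT or any other opendir() failure */
    }

    int main(void) {
      printf("/dev/shm: %d\n", dir_available("/dev/shm"));
      printf("/tmp:     %d\n", dir_available("/tmp"));
      return 0;
    }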
+
// end of file //
diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp
index eb18efcac61a..9e264ab45b87 100644
--- a/openmp/runtime/src/z_Windows_NT_util.cpp
+++ b/openmp/runtime/src/z_Windows_NT_util.cpp
@@ -1006,7 +1006,7 @@ extern "C" void *__stdcall __kmp_launch_worker(void *arg) {
__kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */
- __kmp_affinity_set_init_mask(gtid, FALSE);
+ __kmp_affinity_bind_init_mask(gtid);
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Set FP control regs to be a copy of the parallel initialization thread's.