author | Dimitry Andric <dim@FreeBSD.org> | 2023-12-09 13:28:42 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2023-12-09 13:28:42 +0000 |
commit | b1c73532ee8997fe5dfbeb7d223027bdf99758a0 (patch) | |
tree | 7d6e51c294ab6719475d660217aa0c0ad0526292 /openmp | |
parent | 7fa27ce4a07f19b07799a767fc29416f3b625afb (diff) | |
download | src-b1c73532ee8997fe5dfbeb7d223027bdf99758a0.tar.gz, src-b1c73532ee8997fe5dfbeb7d223027bdf99758a0.zip | |
Vendor import of llvm-project main llvmorg-18-init-14265-ga17671084db1.
vendor/llvm-project/llvmorg-18-init-14265-ga17671084db1
Diffstat (limited to 'openmp')
39 files changed, 2042 insertions, 550 deletions
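Among the changes below, this import adds the OpenMP 6.0 device memory routines omp_target_memset and omp_target_memset_async (see the dllexports, omp.h.var, and omp_lib.*.var hunks that follow). The snippet here is a minimal usage sketch, not part of the commit; it assumes an offload-capable compiler and at least one non-host device, and uses only routines declared in the updated omp.h.

```c
/* Hedged sketch only: exercises the newly exported omp_target_memset and
 * omp_target_memset_async routines. Assumes an offload-capable compiler
 * and that the default device is a real target device. */
#include <omp.h>
#include <stdio.h>

int main(void) {
  int dev = omp_get_default_device();
  size_t bytes = 1024;

  void *dptr = omp_target_alloc(bytes, dev);   /* device-side allocation */
  if (dptr == NULL)
    return 1;

  /* Synchronous variant: (ptr, val, count, device_num), returns ptr. */
  omp_target_memset(dptr, 0, bytes, dev);

  /* Asynchronous variant adds a depend-object count and list; passing
   * 0/NULL requests no dependences, mirroring omp_target_memcpy_async. */
  omp_target_memset_async(dptr, 0xFF, bytes, dev, 0, NULL);
#pragma omp taskwait   /* wait for the asynchronous memset to finish */

  omp_target_free(dptr, dev);
  printf("buffer on device %d initialized\n", dev);
  return 0;
}
```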
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index f740f29346ae..0d49643709e0 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -518,6 +518,8 @@ kmp_set_warnings_off 780 omp_target_memcpy_rect 887 omp_target_associate_ptr 888 omp_target_disassociate_ptr 889 + omp_target_memset 3000 + omp_target_memset_async 3001 %endif kmp_set_disp_num_buffers 890 @@ -1268,4 +1270,6 @@ kmp_set_disp_num_buffers 890 %endif +__kmpc_set_thread_limit + # end of file # diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt index 228bcdb25a8e..08e837d3dea1 100644 --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -480,6 +480,8 @@ AffHWSubsetAllFiltered "KMP_HW_SUBSET ignored: all hardware resources woul AffHWSubsetAttrsNonHybrid "KMP_HW_SUBSET ignored: Too many attributes specified. This machine is not a hybrid architecutre." AffHWSubsetIgnoringAttr "KMP_HW_SUBSET: ignoring %1$s attribute. This machine is not a hybrid architecutre." TargetMemNotAvailable "Target memory not available, will use default allocator." +AffIgnoringNonHybrid "%1$s ignored: This machine is not a hybrid architecutre. Using \"%2$s\" instead." +AffIgnoringNotAvailable "%1$s ignored: %2$s is not available. Using \"%3$s\" instead." # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 1b2c467a2a12..a1488ae9d21c 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -15,6 +15,7 @@ #ifndef __OMP_H # define __OMP_H +# include <stddef.h> # include <stdlib.h> # include <stdint.h> @@ -236,6 +237,11 @@ extern int __KAI_KMPC_CONVENTION omp_target_memcpy_rect_async(void *, const void *, size_t, int, const size_t *, const size_t *, const size_t *, const size_t *, const size_t *, int, int, int, omp_depend_t *); + + /* OpenMP 6.0 device memory routines */ + extern void * __KAI_KMPC_CONVENTION omp_target_memset(void *, int, size_t, int); + extern void * __KAI_KMPC_CONVENTION omp_target_memset_async(void *, int, size_t, int, int, omp_depend_t *); + /*! * The `omp_get_mapped_ptr` routine returns the device pointer that is associated with a host pointer for a given device. 
*/ @@ -497,7 +503,7 @@ extern int __KAI_KMPC_CONVENTION omp_in_explicit_task(void); /* LLVM Extensions */ - extern void *llvm_omp_target_dynamic_shared_alloc(); + extern void *llvm_omp_target_dynamic_shared_alloc(void); # undef __KAI_KMPC_CONVENTION # undef __KMP_IMP diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.f90.var index c72287422809..1ca542db3767 100644 --- a/openmp/runtime/src/include/omp_lib.f90.var +++ b/openmp/runtime/src/include/omp_lib.f90.var @@ -635,6 +635,28 @@ integer (omp_depend_kind), optional :: depobj_list(*) end function omp_target_memcpy_rect_async + function omp_target_memset(ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) :: omp_target_memset + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + end function + + function omp_target_memset_async(ptr, val, count, device_num, & + depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + use omp_lib_kinds + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + integer(c_int), value :: depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function + function omp_target_associate_ptr(host_ptr, device_ptr, size, & device_offset, device_num) bind(c) use omp_lib_kinds diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var index 9f5e58515e75..d20aade6ef8b 100644 --- a/openmp/runtime/src/include/omp_lib.h.var +++ b/openmp/runtime/src/include/omp_lib.h.var @@ -732,6 +732,28 @@ integer(omp_depend_kind), optional :: depobj_list(*) end function omp_target_memcpy_rect_async + function omp_target_memset(ptr, val, count, device_num) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + type(c_ptr) :: omp_target_memset + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + end function + + function omp_target_memset_async(ptr, val, count, device_num, & + depobj_count, depobj_list) bind(c) + use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t + use omp_lib_kinds + type(c_ptr) :: omp_target_memset_async + type(c_ptr), value :: ptr + integer(c_int), value :: val + integer(c_size_t), value :: count + integer(c_int), value :: device_num + integer(c_int), value :: depobj_count + integer(omp_depend_kind), optional :: depobj_list(*) + end function + function omp_target_associate_ptr(host_ptr, device_ptr, size, & & device_offset, device_num) bind(c) use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var new file mode 100644 index 000000000000..5dd8e8355e4c --- /dev/null +++ b/openmp/runtime/src/include/ompx.h.var @@ -0,0 +1,165 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __OMPX_H +#define __OMPX_H + +#ifdef __cplusplus +extern "C" { +#endif + +int omp_get_ancestor_thread_num(int); +int omp_get_team_size(int); + +#ifdef __cplusplus +} +#endif + +/// Target kernel language extensions +/// +/// These extensions exist for the host to allow fallback implementations, +/// however, they cannot be arbitrarily composed with OpenMP. If the rules of +/// the kernel language are followed, the host fallbacks should behave as +/// expected since the kernel is represented as 3 sequential outer loops, one +/// for each grid dimension, and three (nested) parallel loops, one for each +/// block dimension. This fallback is not supposed to be optimal and should be +/// configurable by the user. +/// +///{ + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + ompx_relaxed = __ATOMIC_RELAXED, + ompx_aquire = __ATOMIC_ACQUIRE, + ompx_release = __ATOMIC_RELEASE, + ompx_acq_rel = __ATOMIC_ACQ_REL, + ompx_seq_cst = __ATOMIC_SEQ_CST, +}; + +enum { + ompx_dim_x = 0, + ompx_dim_y = 1, + ompx_dim_z = 2, +}; + +/// ompx_{thread,block}_{id,dim} +///{ +#pragma omp begin declare variant match(device = {kind(cpu)}) +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \ + static inline int ompx_##NAME(int Dim) { return VALUE; } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id, + omp_get_ancestor_thread_num(Dim + 1)) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_dim, omp_get_team_size(Dim + 1)) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(block_id, 0) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1) +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C +///} + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY) \ + static inline RETTY ompx_##NAME(ARGS) { BODY; } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering, + _Pragma("omp barrier")); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_acq_rel, void, + ompx_sync_block(ompx_acq_rel)); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering, + ompx_sync_block(Ordering)); +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C +///} + +#pragma omp end declare variant + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \ + RETTY ompx_##NAME(ARGS); + +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering); +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_acq_rel, void); +_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering); +#undef _TGT_KERNEL_LANGUAGE_DECL_SYNC_C +///} + +/// ompx_{thread,block}_{id,dim}_{x,y,z} +///{ +#define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME) \ + int ompx_##NAME(int Dim); \ + static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \ + static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \ + static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); } + +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_id) +_TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim) +#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C +///} + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +namespace ompx { + +enum { + dim_x = ompx_dim_x, + dim_y = ompx_dim_y, + dim_z = ompx_dim_z, +}; + +enum { + relaxed = ompx_relaxed , + aquire = ompx_aquire, + release = ompx_release, + acc_rel = ompx_acq_rel, 
+ seq_cst = ompx_seq_cst, +}; + +/// ompx::{thread,block}_{id,dim}_{,x,y,z} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME) \ + static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \ + static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); } \ + static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); } \ + static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_id) +_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim) +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX +///} + +/// ompx_{sync_block}_{,divergent} +///{ +#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS) \ + static inline RETTY NAME(ARGS) { \ + return ompx_##NAME(CALL_ARGS); \ + } + +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block, int Ordering = acc_rel, + Ordering); +_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent, + int Ordering = acc_rel, Ordering); +#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX +///} + +} // namespace ompx +#endif + +///} + +#endif /* __OMPX_H */ diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 641d32357ce8..d34adf7cbf8a 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -27,6 +27,9 @@ #ifndef KMP_STATIC_STEAL_ENABLED #define KMP_STATIC_STEAL_ENABLED 1 #endif +#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \ + (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \ + (KMP_ARCH_X86 || KMP_ARCH_X86_64)) #define TASK_CURRENT_NOT_QUEUED 0 #define TASK_CURRENT_QUEUED 1 @@ -180,6 +183,7 @@ class kmp_stats_list; #define KMP_NSEC_PER_SEC 1000000000L #define KMP_USEC_PER_SEC 1000000L +#define KMP_NSEC_PER_USEC 1000L /*! 
@ingroup BASIC_TYPES @@ -690,10 +694,12 @@ extern size_t __kmp_affin_mask_size; #define KMP_CPU_ISSET(i, mask) (mask)->is_set(i) #define KMP_CPU_CLR(i, mask) (mask)->clear(i) #define KMP_CPU_ZERO(mask) (mask)->zero() +#define KMP_CPU_ISEMPTY(mask) (mask)->empty() #define KMP_CPU_COPY(dest, src) (dest)->copy(src) #define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src) #define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not() #define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src) +#define KMP_CPU_EQUAL(dest, src) (dest)->is_equal(src) #define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask()) #define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr) #define KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr) @@ -730,6 +736,8 @@ public: virtual void clear(int i) {} // Zero out entire mask virtual void zero() {} + // Check whether mask is empty + virtual bool empty() const { return true; } // Copy src into this mask virtual void copy(const Mask *src) {} // this &= rhs @@ -738,6 +746,8 @@ public: virtual void bitwise_or(const Mask *rhs) {} // this = ~this virtual void bitwise_not() {} + // this == rhs + virtual bool is_equal(const Mask *rhs) const { return false; } // API for iterating over an affinity mask // for (int i = mask->begin(); i != mask->end(); i = mask->next(i)) virtual int begin() const { return 0; } @@ -866,19 +876,16 @@ typedef struct kmp_affinity_flags_t { unsigned respect : 2; unsigned reset : 1; unsigned initialized : 1; - unsigned reserved : 25; + unsigned core_types_gran : 1; + unsigned core_effs_gran : 1; + unsigned omp_places : 1; + unsigned reserved : 22; } kmp_affinity_flags_t; KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4); typedef struct kmp_affinity_ids_t { + int os_id; int ids[KMP_HW_LAST]; - int operator[](size_t idx) const { return ids[idx]; } - int &operator[](size_t idx) { return ids[idx]; } - kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) { - for (int i = 0; i < KMP_HW_LAST; ++i) - ids[i] = rhs[i]; - return *this; - } } kmp_affinity_ids_t; typedef struct kmp_affinity_attrs_t { @@ -895,6 +902,7 @@ typedef struct kmp_affinity_t { enum affinity_type type; kmp_hw_t gran; int gran_levels; + kmp_affinity_attrs_t core_attr_gran; int compact; int offset; kmp_affinity_flags_t flags; @@ -909,9 +917,11 @@ typedef struct kmp_affinity_t { #define KMP_AFFINITY_INIT(env) \ { \ - nullptr, affinity_default, KMP_HW_UNKNOWN, -1, 0, 0, \ - {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE}, 0, \ - nullptr, nullptr, nullptr, 0, nullptr, env \ + nullptr, affinity_default, KMP_HW_UNKNOWN, -1, KMP_AFFINITY_ATTRS_UNKNOWN, \ + 0, 0, \ + {TRUE, FALSE, TRUE, affinity_respect_mask_default, FALSE, FALSE, \ + FALSE, FALSE, FALSE}, \ + 0, nullptr, nullptr, nullptr, 0, nullptr, env \ } extern enum affinity_top_method __kmp_affinity_top_method; @@ -925,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask; extern kmp_affin_mask_t *__kmp_affin_origMask; extern char *__kmp_cpuinfo_file; +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +extern int __kmp_first_osid_with_ecore; +#endif + #endif /* KMP_AFFINITY_SUPPORTED */ // This needs to be kept in sync with the values in omp.h !!! @@ -1140,8 +1154,15 @@ extern void __kmp_init_target_task(); #if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX #define KMP_MAX_NTH PTHREAD_THREADS_MAX #else +#ifdef __ve__ +// VE's pthread supports only up to 64 threads per a VE process. +// Please check p. 14 of following documentation for more details. 
+// https://sxauroratsubasa.sakura.ne.jp/documents/veos/en/VEOS_high_level_design.pdf +#define KMP_MAX_NTH 64 +#else #define KMP_MAX_NTH INT_MAX #endif +#endif #endif /* KMP_MAX_NTH */ #ifdef PTHREAD_STACK_MIN @@ -1157,6 +1178,10 @@ extern void __kmp_init_target_task(); #elif KMP_ARCH_X86_64 #define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) #define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024)) +#elif KMP_ARCH_VE +// Minimum stack size for pthread for VE is 4MB. +// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm +#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) #else #define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024)) #endif @@ -1178,13 +1203,13 @@ extern void __kmp_init_target_task(); #define KMP_MAX_STKPADDING (2 * 1024 * 1024) #define KMP_BLOCKTIME_MULTIPLIER \ - (1000) /* number of blocktime units per second */ + (1000000) /* number of blocktime units per second */ #define KMP_MIN_BLOCKTIME (0) #define KMP_MAX_BLOCKTIME \ (INT_MAX) /* Must be this for "infinite" setting the work */ -/* __kmp_blocktime is in milliseconds */ -#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200)) +/* __kmp_blocktime is in microseconds */ +#define KMP_DEFAULT_BLOCKTIME (__kmp_is_hybrid_cpu() ? (0) : (200000)) #if KMP_USE_MONITOR #define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024)) @@ -1211,22 +1236,21 @@ extern void __kmp_init_target_task(); #if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) // HW TSC is used to reduce overhead (clock tick instead of nanosecond). extern kmp_uint64 __kmp_ticks_per_msec; +extern kmp_uint64 __kmp_ticks_per_usec; #if KMP_COMPILER_ICC || KMP_COMPILER_ICX #define KMP_NOW() ((kmp_uint64)_rdtsc()) #else #define KMP_NOW() __kmp_hardware_timestamp() #endif -#define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec) #define KMP_BLOCKTIME_INTERVAL(team, tid) \ - (KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_msec) + ((kmp_uint64)KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_usec) #define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW()) #else // System time is retrieved sporadically while blocking. 
extern kmp_uint64 __kmp_now_nsec(); #define KMP_NOW() __kmp_now_nsec() -#define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC) #define KMP_BLOCKTIME_INTERVAL(team, tid) \ - (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC) + ((kmp_uint64)KMP_BLOCKTIME(team, tid) * (kmp_uint64)KMP_NSEC_PER_USEC) #define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW()) #endif #endif // KMP_USE_MONITOR @@ -1304,12 +1328,16 @@ extern kmp_uint64 __kmp_now_nsec(); /* TODO: tune for KMP_OS_NETBSD */ #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_OPENBSD +/* TODO: tune for KMP_OS_OPENBSD */ +#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +#define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ #elif KMP_OS_HURD /* TODO: tune for KMP_OS_HURD */ #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ -#elif KMP_OS_OPENBSD -/* TODO: tune for KMP_OS_OPENBSD */ +#elif KMP_OS_SOLARIS +/* TODO: tune for KMP_OS_SOLARIS */ #define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ #define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ #endif @@ -1822,12 +1850,9 @@ typedef struct kmp_sched_flags { unsigned ordered : 1; unsigned nomerge : 1; unsigned contains_last : 1; -#if KMP_USE_HIER_SCHED - unsigned use_hier : 1; - unsigned unused : 28; -#else - unsigned unused : 29; -#endif + unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code + unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code + unsigned unused : 27; } kmp_sched_flags_t; KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4); @@ -1841,26 +1866,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 st; kmp_int32 tc; kmp_lock_t *steal_lock; // lock used for chunk stealing + + kmp_uint32 ordered_lower; + kmp_uint32 ordered_upper; + // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) // a) parm3 is properly aligned and // b) all parm1-4 are on the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are on the same cache line (not measured though). - struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template - kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should - kmp_int32 parm2; // make no real change at least while padding is off. 
+ struct KMP_ALIGN(32) { + kmp_int32 parm1; + kmp_int32 parm2; kmp_int32 parm3; kmp_int32 parm4; }; - kmp_uint32 ordered_lower; - kmp_uint32 ordered_upper; +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + kmp_uint32 pchunks; + kmp_uint32 num_procs_with_pcore; + kmp_int32 first_thread_with_ecore; +#endif #if KMP_OS_WINDOWS kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; +#if CACHE_LINE <= 128 +KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128); +#endif + typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 count; // current chunk number for static & static-steal scheduling kmp_int64 ub; /* upper-bound */ @@ -1869,14 +1905,16 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 st; /* stride */ kmp_int64 tc; /* trip count (number of iterations) */ kmp_lock_t *steal_lock; // lock used for chunk stealing + + kmp_uint64 ordered_lower; + kmp_uint64 ordered_upper; /* parm[1-4] are used in different ways by different scheduling algorithms */ - // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) + // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are in the same line (not measured though). - struct KMP_ALIGN(32) { kmp_int64 parm1; kmp_int64 parm2; @@ -1884,12 +1922,21 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 parm4; }; - kmp_uint64 ordered_lower; - kmp_uint64 ordered_upper; +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + kmp_uint64 pchunks; + kmp_uint64 num_procs_with_pcore; + kmp_int64 first_thread_with_ecore; +#endif + #if KMP_OS_WINDOWS kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; + +#if CACHE_LINE <= 128 +KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128); +#endif + #else /* KMP_STATIC_STEAL_ENABLED */ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 lb; @@ -2099,6 +2146,7 @@ typedef struct kmp_internal_control { int nproc; /* internal control for #threads for next parallel region (per thread) */ int thread_limit; /* internal control for thread-limit-var */ + int task_thread_limit; /* internal control for thread-limit-var of a task*/ int max_active_levels; /* internal control for max_active_levels */ kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ @@ -2432,12 +2480,22 @@ typedef struct kmp_depend_info { union { kmp_uint8 flag; // flag as an unsigned char struct { // flag as a set of 8 bits +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* Same fields as in the #else branch, but in reverse order */ + unsigned all : 1; + unsigned unused : 3; + unsigned set : 1; + unsigned mtx : 1; + unsigned out : 1; + unsigned in : 1; +#else unsigned in : 1; unsigned out : 1; unsigned mtx : 1; unsigned set : 1; unsigned unused : 3; unsigned all : 1; +#endif } flags; }; } kmp_depend_info_t; @@ -2587,6 +2645,33 @@ typedef struct kmp_task_stack { #endif // BUILD_TIED_TASK_STACK typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* Same fields as in the #else branch, but in reverse order */ +#if OMPX_TASKGRAPH + unsigned reserved31 : 6; + unsigned onced : 1; +#else + unsigned reserved31 : 7; +#endif + unsigned native : 1; + unsigned freed : 1; + unsigned complete : 1; + unsigned executing : 1; + unsigned started : 1; + unsigned team_serial : 1; + unsigned tasking_ser : 1; + 
unsigned task_serial : 1; + unsigned tasktype : 1; + unsigned reserved : 8; + unsigned hidden_helper : 1; + unsigned detachable : 1; + unsigned priority_specified : 1; + unsigned proxy : 1; + unsigned destructors_thunk : 1; + unsigned merged_if0 : 1; + unsigned final : 1; + unsigned tiedness : 1; +#else /* Compiler flags */ /* Total compiler flags must be 16 bits */ unsigned tiedness : 1; /* task is either tied (1) or untied (0) */ unsigned final : 1; /* task is final(1) so execute immediately */ @@ -2622,7 +2707,7 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ #else unsigned reserved31 : 7; /* reserved for library use */ #endif - +#endif } kmp_tasking_flags_t; typedef struct kmp_target_data { @@ -3328,6 +3413,7 @@ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */ extern int __kmp_max_nth; // maximum total number of concurrently-existing threads in a contention group extern int __kmp_cg_max_nth; +extern int __kmp_task_max_nth; // max threads used in a task extern int __kmp_teams_max_nth; // max threads used in a teams construct extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */ @@ -3339,9 +3425,22 @@ extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is used (fixed) */ extern int __kmp_tp_cached; /* whether threadprivate cache has been created (__kmpc_threadprivate_cached()) */ -extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before +extern int __kmp_dflt_blocktime; /* number of microseconds to wait before blocking (env setting) */ +extern char __kmp_blocktime_units; /* 'm' or 'u' to note units specified */ extern bool __kmp_wpolicy_passive; /* explicitly set passive wait policy */ + +// Convert raw blocktime from ms to us if needed. +static inline void __kmp_aux_convert_blocktime(int *bt) { + if (__kmp_blocktime_units == 'm') { + if (*bt > INT_MAX / 1000) { + *bt = INT_MAX / 1000; + KMP_INFORM(MaxValueUsing, "kmp_set_blocktime(ms)", bt); + } + *bt = *bt * 1000; + } +} + #if KMP_USE_MONITOR extern int __kmp_monitor_wakeups; /* number of times monitor wakes up per second */ @@ -3589,6 +3688,9 @@ extern void __kmp_warn(char const *format, ...); extern void __kmp_set_num_threads(int new_nth, int gtid); +extern bool __kmp_detect_shm(); +extern bool __kmp_detect_tmp(); + // Returns current thread (pointer to kmp_info_t). Current thread *must* be // registered. 
static inline kmp_info_t *__kmp_entry_thread() { @@ -3770,7 +3872,8 @@ extern void __kmp_affinity_initialize(kmp_affinity_t &affinity); extern void __kmp_affinity_uninitialize(void); extern void __kmp_affinity_set_init_mask( int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */ -extern void __kmp_affinity_set_place(int gtid); +void __kmp_affinity_bind_init_mask(int gtid); +extern void __kmp_affinity_bind_place(int gtid); extern void __kmp_affinity_determine_capable(const char *env_var); extern int __kmp_aux_set_affinity(void **mask); extern int __kmp_aux_get_affinity(void **mask); @@ -3779,6 +3882,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask); extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +extern int __kmp_get_first_osid_with_ecore(void); +#endif #if KMP_OS_LINUX || KMP_OS_FREEBSD extern int kmp_set_thread_affinity_mask_initial(void); #endif @@ -3786,7 +3892,8 @@ static inline void __kmp_assign_root_init_mask() { int gtid = __kmp_entry_gtid(); kmp_root_t *r = __kmp_threads[gtid]->th.th_root; if (r->r.r_uber_thread == __kmp_threads[gtid] && !r->r.r_affinity_assigned) { - __kmp_affinity_set_init_mask(gtid, TRUE); + __kmp_affinity_set_init_mask(gtid, /*isa_root=*/TRUE); + __kmp_affinity_bind_init_mask(gtid); r->r.r_affinity_assigned = TRUE; } } @@ -4130,6 +4237,11 @@ KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list); + +KMP_EXPORT kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task); + +KMP_EXPORT kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task); + KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, @@ -4270,6 +4382,8 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads); +KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid, + kmp_int32 thread_limit); /* Function for OpenMP 5.1 num_teams clause */ KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams_lb, diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index cbb80bf3a848..7009730a49ba 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -38,6 +38,43 @@ static hierarchy_info machine_hierarchy; void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } +#if KMP_AFFINITY_SUPPORTED +// Helper class to see if place lists further restrict the fullMask +class kmp_full_mask_modifier_t { + kmp_affin_mask_t *mask; + +public: + kmp_full_mask_modifier_t() { + KMP_CPU_ALLOC(mask); + KMP_CPU_ZERO(mask); + } + ~kmp_full_mask_modifier_t() { + KMP_CPU_FREE(mask); + mask = nullptr; + } + void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); } + // If the new full mask is different from the current full mask, + // then switch them. Returns true if full mask was affected, false otherwise. 
+ bool restrict_to_mask() { + // See if the new mask further restricts or changes the full mask + if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask)) + return false; + return __kmp_topology->restrict_to_mask(mask); + } +}; + +static inline const char * +__kmp_get_affinity_env_var(const kmp_affinity_t &affinity, + bool for_binding = false) { + if (affinity.flags.omp_places) { + if (for_binding) + return "OMP_PROC_BIND"; + return "OMP_PLACES"; + } + return affinity.env_var; +} +#endif // KMP_AFFINITY_SUPPORTED + void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { kmp_uint32 depth; // The test below is true if affinity is available, but set to "none". Need to @@ -207,6 +244,8 @@ void kmp_hw_thread_t::print() const { if (attrs.is_core_eff_valid()) printf(" (eff=%d)", attrs.get_core_eff()); } + if (leader) + printf(" (leader)"); printf("\n"); } @@ -797,7 +836,40 @@ void kmp_topology_t::print(const char *env_var) const { #if KMP_AFFINITY_SUPPORTED void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const { - const char *env_var = affinity.env_var; + const char *env_var = __kmp_get_affinity_env_var(affinity); + // If requested hybrid CPU attributes for granularity (either OMP_PLACES or + // KMP_AFFINITY), but none exist, then reset granularity and have below method + // select a granularity and warn user. + if (!__kmp_is_hybrid_cpu()) { + if (affinity.core_attr_gran.valid) { + // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores + // instead + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } else if (affinity.flags.core_types_gran || + affinity.flags.core_effs_gran) { + // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead + if (affinity.flags.omp_places) { + KMP_AFF_WARNING( + affinity, AffIgnoringNonHybrid, env_var, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)); + } else { + // KMP_AFFINITY=granularity=core_type|core_eff,... 
+ KMP_AFF_WARNING(affinity, AffGranularityBad, env_var, + "Intel(R) Hybrid Technology core attribute", + __kmp_hw_get_catalog_string(KMP_HW_CORE)); + } + affinity.gran = KMP_HW_CORE; + affinity.gran_levels = -1; + affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN; + affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0; + } + } // Set the number of affinity granularity levels if (affinity.gran_levels < 0) { kmp_hw_t gran_type = get_equivalent_type(affinity.gran); @@ -937,6 +1009,7 @@ public: } }; +#if KMP_AFFINITY_SUPPORTED static kmp_str_buf_t * __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, bool plural) { @@ -952,6 +1025,41 @@ __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf, return buf; } +bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) { + // Apply the filter + bool affected; + int new_index = 0; + for (int i = 0; i < num_hw_threads; ++i) { + int os_id = hw_threads[i].os_id; + if (KMP_CPU_ISSET(os_id, mask)) { + if (i != new_index) + hw_threads[new_index] = hw_threads[i]; + new_index++; + } else { + KMP_CPU_CLR(os_id, __kmp_affin_fullMask); + __kmp_avail_proc--; + } + } + + KMP_DEBUG_ASSERT(new_index <= num_hw_threads); + affected = (num_hw_threads != new_index); + num_hw_threads = new_index; + + // Post hardware subset canonicalization + if (affected) { + _gather_enumeration_information(); + _discover_uniformity(); + _set_globals(); + _set_last_level_cache(); +#if KMP_OS_WINDOWS + // Copy filtered full mask if topology has single processor group + if (__kmp_num_proc_groups <= 1) +#endif + __kmp_affin_origMask->copy(__kmp_affin_fullMask); + } + return affected; +} + // Apply the KMP_HW_SUBSET envirable to the topology // Returns true if KMP_HW_SUBSET filtered any processors // otherwise, returns false @@ -1156,7 +1264,9 @@ bool kmp_topology_t::filter_hw_subset() { // Determine which hardware threads should be filtered. 
int num_filtered = 0; - bool *filtered = (bool *)__kmp_allocate(sizeof(bool) * num_hw_threads); + kmp_affin_mask_t *filtered_mask; + KMP_CPU_ALLOC(filtered_mask); + KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask); for (int i = 0; i < num_hw_threads; ++i) { kmp_hw_thread_t &hw_thread = hw_threads[i]; // Update type_sub_id @@ -1218,51 +1328,35 @@ bool kmp_topology_t::filter_hw_subset() { } } // Collect filtering information - filtered[i] = should_be_filtered; - if (should_be_filtered) + if (should_be_filtered) { + KMP_CPU_CLR(hw_thread.os_id, filtered_mask); num_filtered++; + } } // One last check that we shouldn't allow filtering entire machine if (num_filtered == num_hw_threads) { KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered); - __kmp_free(filtered); return false; } // Apply the filter - int new_index = 0; - for (int i = 0; i < num_hw_threads; ++i) { - if (!filtered[i]) { - if (i != new_index) - hw_threads[new_index] = hw_threads[i]; - new_index++; - } else { -#if KMP_AFFINITY_SUPPORTED - KMP_CPU_CLR(hw_threads[i].os_id, __kmp_affin_fullMask); -#endif - __kmp_avail_proc--; - } - } - - KMP_DEBUG_ASSERT(new_index <= num_hw_threads); - num_hw_threads = new_index; - - // Post hardware subset canonicalization - _gather_enumeration_information(); - _discover_uniformity(); - _set_globals(); - _set_last_level_cache(); - __kmp_free(filtered); + restrict_to_mask(filtered_mask); return true; } -bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { +bool kmp_topology_t::is_close(int hwt1, int hwt2, + const kmp_affinity_t &stgs) const { + int hw_level = stgs.gran_levels; if (hw_level >= depth) return true; bool retval = true; const kmp_hw_thread_t &t1 = hw_threads[hwt1]; const kmp_hw_thread_t &t2 = hw_threads[hwt2]; + if (stgs.flags.core_types_gran) + return t1.attrs.get_core_type() == t2.attrs.get_core_type(); + if (stgs.flags.core_effs_gran) + return t1.attrs.get_core_eff() == t2.attrs.get_core_eff(); for (int i = 0; i < (depth - hw_level); ++i) { if (t1.ids[i] != t2.ids[i]) return false; @@ -1272,8 +1366,6 @@ bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { //////////////////////////////////////////////////////////////////////////////// -#if KMP_AFFINITY_SUPPORTED - bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } @@ -2718,7 +2810,7 @@ static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, // Set the array sizes for the hierarchy layers static void __kmp_dispatch_set_hierarchy_values() { // Set the maximum number of L1's to number of cores - // Set the maximum number of L2's to to either number of cores / 2 for + // Set the maximum number of L2's to either number of cores / 2 for // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing // Or the number of cores for Intel(R) Xeon(R) processors // Set the maximum number of NUMA nodes and L3's to number of packages @@ -2898,6 +2990,9 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, unsigned num_avail = 0; *line = 0; +#if KMP_ARCH_S390X + bool reading_s390x_sys_info = true; +#endif while (!feof(f)) { // Create an inner scoping level, so that all the goto targets at the end of // the loop appear in an outer scoping level. 
This avoids warnings about @@ -2943,8 +3038,21 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, if (*buf == '\n' && *line == 2) continue; #endif +#if KMP_ARCH_S390X + // s390x /proc/cpuinfo starts with a variable number of lines containing + // the overall system information. Skip them. + if (reading_s390x_sys_info) { + if (*buf == '\n') + reading_s390x_sys_info = false; + continue; + } +#endif +#if KMP_ARCH_S390X + char s1[] = "cpu number"; +#else char s1[] = "processor"; +#endif if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s1) - 1, ':'); @@ -2970,6 +3078,23 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line, threadInfo[num_avail][osIdIndex]); __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); +#if KMP_ARCH_S390X + // Disambiguate physical_package_id. + unsigned book_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/book_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &book_id); + threadInfo[num_avail][pkgIdIndex] |= (book_id << 8); + + unsigned drawer_id; + KMP_SNPRINTF(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/drawer_id", + threadInfo[num_avail][osIdIndex]); + __kmp_read_from_file(path, "%u", &drawer_id); + threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16); +#endif + KMP_SNPRINTF(path, sizeof(path), "/sys/devices/system/cpu/cpu%u/topology/core_id", threadInfo[num_avail][osIdIndex]); @@ -3224,7 +3349,7 @@ restart_radix_check: return false; } - // If the thread ids were not specified and we see entries entries that + // If the thread ids were not specified and we see entries that // are duplicates, start the loop over and assign the thread ids manually. assign_thread_ids = true; goto restart_radix_check; @@ -3353,17 +3478,25 @@ restart_radix_check: // Create and return a table of affinity masks, indexed by OS thread ID. // This routine handles OR'ing together all the affinity masks of threads // that are sufficiently close, if granularity > fine. +template <typename FindNextFunctionType> static void __kmp_create_os_id_masks(unsigned *numUnique, - kmp_affinity_t &affinity) { + kmp_affinity_t &affinity, + FindNextFunctionType find_next) { // First form a table of affinity masks in order of OS thread id. int maxOsId; int i; int numAddrs = __kmp_topology->get_num_hw_threads(); int depth = __kmp_topology->get_depth(); - const char *env_var = affinity.env_var; + const char *env_var = __kmp_get_affinity_env_var(affinity); KMP_ASSERT(numAddrs); KMP_ASSERT(depth); + i = find_next(-1); + // If could not find HW thread location with attributes, then return and + // fallback to increment find_next and disregard core attributes. + if (i >= numAddrs) + return; + maxOsId = 0; for (i = numAddrs - 1;; --i) { int osId = __kmp_topology->at(i).os_id; @@ -3393,19 +3526,22 @@ static void __kmp_create_os_id_masks(unsigned *numUnique, kmp_affin_mask_t *sum; KMP_CPU_ALLOC_ON_STACK(sum); KMP_CPU_ZERO(sum); - KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); - for (i = 1; i < numAddrs; i++) { + + i = j = leader = find_next(-1); + KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); + kmp_full_mask_modifier_t full_mask; + for (i = find_next(i); i < numAddrs; i = find_next(i)) { // If this thread is sufficiently close to the leader (within the // granularity setting), then set the bit for this os thread in the // affinity mask for this group, and go on to the next thread. 
- if (__kmp_topology->is_close(leader, i, affinity.gran_levels)) { + if (__kmp_topology->is_close(leader, i, affinity)) { KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); continue; } // For every thread in this group, copy the mask to the thread's entry in // the OS Id mask table. Mark the first address as a leader. - for (; j < i; j++) { + for (; j < i; j = find_next(j)) { int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); @@ -3416,22 +3552,29 @@ static void __kmp_create_os_id_masks(unsigned *numUnique, // Start a new mask. leader = i; + full_mask.include(sum); KMP_CPU_ZERO(sum); KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); } // For every thread in last group, copy the mask to the thread's // entry in the OS Id mask table. - for (; j < i; j++) { + for (; j < i; j = find_next(j)) { int osId = __kmp_topology->at(j).os_id; KMP_DEBUG_ASSERT(osId <= maxOsId); kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId); KMP_CPU_COPY(mask, sum); __kmp_topology->at(j).leader = (j == leader); } + full_mask.include(sum); unique++; KMP_CPU_FREE_FROM_STACK(sum); + // See if the OS Id mask table further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } + *numUnique = unique; } @@ -4053,7 +4196,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask, // Initiailze ids and attrs thread data for (int i = 0; i < KMP_HW_LAST; ++i) - ids[i] = kmp_hw_thread_t::UNKNOWN_ID; + ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID; attrs = KMP_AFFINITY_ATTRS_UNKNOWN; // Iterate through each os id within the mask and determine @@ -4062,19 +4205,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask, int depth = __kmp_topology->get_depth(); KMP_CPU_SET_ITERATE(cpu, mask) { int osid_idx = __kmp_osid_to_hwthread_map[cpu]; + ids.os_id = cpu; const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx); for (int level = 0; level < depth; ++level) { kmp_hw_t type = __kmp_topology->get_type(level); int id = hw_thread.sub_ids[level]; - if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) { - ids[type] = id; + if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) { + ids.ids[type] = id; } else { // This mask spans across multiple topology units, set it as such // and mark every level below as such as well. 
- ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; for (; level < depth; ++level) { kmp_hw_t type = __kmp_topology->get_type(level); - ids[type] = kmp_hw_thread_t::MULTIPLE_ID; + ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID; } } } @@ -4134,8 +4278,11 @@ static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) { } // Create the OS proc to hardware thread map - for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) - __kmp_osid_to_hwthread_map[__kmp_topology->at(hw_thread).os_id] = hw_thread; + for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) { + int os_id = __kmp_topology->at(hw_thread).os_id; + if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask)) + __kmp_osid_to_hwthread_map[os_id] = hw_thread; + } for (unsigned i = 0; i < affinity.num_masks; ++i) { kmp_affinity_ids_t &ids = affinity.ids[i]; @@ -4145,16 +4292,29 @@ static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) { } } +// Called when __kmp_topology is ready +static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) { + // Initialize other data structures which depend on the topology + if (__kmp_topology && __kmp_topology->get_num_hw_threads()) { + machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); + __kmp_affinity_get_topology_info(affinity); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore(); +#endif + } +} + // Create a one element mask array (set of places) which only contains the // initial process's affinity mask static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) { KMP_ASSERT(__kmp_affin_fullMask != NULL); KMP_ASSERT(affinity.type == affinity_none); + KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); affinity.num_masks = 1; KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks); kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0); KMP_CPU_COPY(dest, __kmp_affin_fullMask); - __kmp_affinity_get_topology_info(affinity); + __kmp_aux_affinity_initialize_other_data(affinity); } static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) { @@ -4383,13 +4543,6 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { if (verbose) __kmp_topology->print(env_var); bool filtered = __kmp_topology->filter_hw_subset(); - if (filtered) { -#if KMP_OS_WINDOWS - // Copy filtered full mask if topology has single processor group - if (__kmp_num_proc_groups <= 1) -#endif - __kmp_affin_origMask->copy(__kmp_affin_fullMask); - } if (filtered && verbose) __kmp_topology->print("KMP_HW_SUBSET"); return success; @@ -4398,7 +4551,7 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) { static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { bool is_regular_affinity = (&affinity == &__kmp_affinity); bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity); - const char *env_var = affinity.env_var; + const char *env_var = __kmp_get_affinity_env_var(affinity); if (affinity.flags.initialized) { KMP_ASSERT(__kmp_affin_fullMask != NULL); @@ -4411,8 +4564,6 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { if (is_regular_affinity && !__kmp_topology) { bool success = __kmp_aux_affinity_initialize_topology(affinity); if (success) { - // Initialize other data structures which depend on the topology - machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); } 
else { affinity.type = affinity_none; @@ -4437,7 +4588,36 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { // Create the table of masks, indexed by thread Id. unsigned numUnique; - __kmp_create_os_id_masks(&numUnique, affinity); + int numAddrs = __kmp_topology->get_num_hw_threads(); + // If OMP_PLACES=cores:<attribute> specified, then attempt + // to make OS Id mask table using those attributes + if (affinity.core_attr_gran.valid) { + __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) { + KMP_ASSERT(idx >= -1); + for (int i = idx + 1; i < numAddrs; ++i) + if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran)) + return i; + return numAddrs; + }); + if (!affinity.os_id_masks) { + const char *core_attribute; + if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF) + core_attribute = "core_efficiency"; + else + core_attribute = "core_type"; + KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var, + core_attribute, + __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true)) + } + } + // If core attributes did not work, or none were specified, + // then make OS Id mask table using typical incremental way. + if (!affinity.os_id_masks) { + __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) { + KMP_ASSERT(idx >= -1); + return idx + 1; + }); + } if (affinity.gran_levels == 0) { KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); } @@ -4578,6 +4758,7 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { int i; unsigned j; int num_hw_threads = __kmp_topology->get_num_hw_threads(); + kmp_full_mask_modifier_t full_mask; for (i = 0, j = 0; i < num_hw_threads; i++) { if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) { continue; @@ -4588,11 +4769,16 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j); KMP_ASSERT(KMP_CPU_ISSET(osId, src)); KMP_CPU_COPY(dest, src); + full_mask.include(src); if (++j >= affinity.num_masks) { break; } } KMP_DEBUG_ASSERT(j == affinity.num_masks); + // See if the places list further restricts or changes the full mask + if (full_mask.restrict_to_mask() && affinity.flags.verbose) { + __kmp_topology->print(env_var); + } } // Sort the topology back using ids __kmp_topology->sort_ids(); @@ -4601,7 +4787,7 @@ static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) { default: KMP_ASSERT2(0, "Unexpected affinity setting"); } - __kmp_affinity_get_topology_info(affinity); + __kmp_aux_affinity_initialize_other_data(affinity); affinity.flags.initialized = TRUE; } @@ -4694,7 +4880,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { // Set the thread topology information to default of unknown for (int id = 0; id < KMP_HW_LAST; ++id) - th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID; + th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID; th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN; if (!KMP_AFFINITY_CAPABLE()) { @@ -4715,14 +4901,12 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { kmp_affin_mask_t *mask; int i; const kmp_affinity_t *affinity; - const char *env_var; bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); if (is_hidden_helper) affinity = &__kmp_hh_affinity; else affinity = &__kmp_affinity; - env_var = affinity->env_var; if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) { if ((affinity->type == affinity_none) || @@ -4772,19 +4956,34 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { } if (i == KMP_PLACE_ALL) { - 
KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n", gtid)); } else { - KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", + KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n", gtid, i)); } KMP_CPU_COPY(th->th.th_affin_mask, mask); +} +void __kmp_affinity_bind_init_mask(int gtid) { + if (!KMP_AFFINITY_CAPABLE()) { + return; + } + kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); + const kmp_affinity_t *affinity; + const char *env_var; + bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid); + + if (is_hidden_helper) + affinity = &__kmp_hh_affinity; + else + affinity = &__kmp_affinity; + env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true); /* to avoid duplicate printing (will be correctly printed on barrier) */ - if (affinity->flags.verbose && - (affinity->type == affinity_none || - (i != KMP_PLACE_ALL && affinity->type != affinity_balanced)) && + if (affinity->flags.verbose && (affinity->type == affinity_none || + (th->th.th_current_place != KMP_PLACE_ALL && + affinity->type != affinity_balanced)) && !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, @@ -4804,7 +5003,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) { __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); } -void __kmp_affinity_set_place(int gtid) { +void __kmp_affinity_bind_place(int gtid) { // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) { return; @@ -4812,7 +5011,7 @@ void __kmp_affinity_set_place(int gtid) { kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); - KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " + KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current " "place = %d)\n", gtid, th->th.th_new_place, th->th.th_current_place)); @@ -4834,9 +5033,6 @@ void __kmp_affinity_set_place(int gtid) { KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place); KMP_CPU_COPY(th->th.th_affin_mask, mask); th->th.th_current_place = th->th.th_new_place; - // Copy topology information associated with the place - th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place]; - th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place]; if (__kmp_affinity.flags.verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; @@ -5081,6 +5277,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); } +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +// Returns first os proc id with ATOM core +int __kmp_get_first_osid_with_ecore(void) { + int low = 0; + int high = __kmp_topology->get_num_hw_threads() - 1; + int mid = 0; + while (high - low > 1) { + mid = (high + low) / 2; + if (__kmp_topology->at(mid).attrs.get_core_type() == + KMP_HW_CORE_TYPE_CORE) { + low = mid + 1; + } else { + high = mid; + } + } + if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) { + return mid; + } + return -1; +} +#endif + // Dynamic affinity settings - Affinity balanced void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { KMP_DEBUG_ASSERT(th); diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h index f27dd9a5339e..5464259784e2 100644 --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ 
-34,6 +34,7 @@ public: bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); } void clear(int i) override { hwloc_bitmap_clr(mask, i); } void zero() override { hwloc_bitmap_zero(mask); } + bool empty() const override { return hwloc_bitmap_iszero(mask); } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); hwloc_bitmap_copy(mask, convert->mask); @@ -47,6 +48,10 @@ public: hwloc_bitmap_or(mask, mask, convert->mask); } void bitwise_not() override { hwloc_bitmap_not(mask, mask); } + bool is_equal(const KMPAffinity::Mask *rhs) const override { + const Mask *convert = static_cast<const Mask *>(rhs); + return hwloc_bitmap_isequal(mask, convert->mask); + } int begin() const override { return hwloc_bitmap_first(mask); } int end() const override { return -1; } int next(int previous) const override { @@ -281,6 +286,28 @@ public: #elif __NR_sched_getaffinity != 123 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_VE +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#elif __NR_sched_setaffinity != 203 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 204 +#elif __NR_sched_getaffinity != 204 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_S390X +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 239 +#elif __NR_sched_setaffinity != 239 +#error Wrong code for setaffinity system call. +#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 240 +#elif __NR_sched_getaffinity != 240 +#error Wrong code for getaffinity system call. 
+#endif /* __NR_sched_getaffinity */ #else #error Unknown or unsupported architecture #endif /* KMP_ARCH_* */ @@ -319,6 +346,13 @@ class KMPNativeAffinity : public KMPAffinity { for (mask_size_type i = 0; i < e; ++i) mask[i] = (mask_t)0; } + bool empty() const override { + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) + if (mask[i] != (mask_t)0) + return false; + return true; + } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); mask_size_type e = get_num_mask_types(); @@ -342,6 +376,14 @@ class KMPNativeAffinity : public KMPAffinity { for (mask_size_type i = 0; i < e; ++i) mask[i] = ~(mask[i]); } + bool is_equal(const KMPAffinity::Mask *rhs) const override { + const Mask *convert = static_cast<const Mask *>(rhs); + mask_size_type e = get_num_mask_types(); + for (mask_size_type i = 0; i < e; ++i) + if (mask[i] != convert->mask[i]) + return false; + return true; + } int begin() const override { int retval = 0; while (retval < end() && !is_set(retval)) @@ -459,6 +501,12 @@ class KMPNativeAffinity : public KMPAffinity { for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] = 0; } + bool empty() const override { + for (size_t i = 0; i < __kmp_num_proc_groups; ++i) + if (mask[i]) + return false; + return true; + } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast<const Mask *>(src); for (int i = 0; i < __kmp_num_proc_groups; ++i) @@ -478,6 +526,13 @@ class KMPNativeAffinity : public KMPAffinity { for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] = ~(mask[i]); } + bool is_equal(const KMPAffinity::Mask *rhs) const override { + const Mask *convert = static_cast<const Mask *>(rhs); + for (size_t i = 0; i < __kmp_num_proc_groups; ++i) + if (mask[i] != convert->mask[i]) + return false; + return true; + } int begin() const override { int retval = 0; while (retval < end() && !is_set(retval)) @@ -679,6 +734,21 @@ struct kmp_hw_attr_t { } return false; } +#if KMP_AFFINITY_SUPPORTED + bool contains(const kmp_affinity_attrs_t &attr) const { + if (!valid && !attr.valid) + return true; + if (valid && attr.valid) { + if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN) + return (is_core_type_valid() && + (get_core_type() == (kmp_hw_core_type_t)attr.core_type)); + if (attr.core_eff != UNKNOWN_CORE_EFF) + return (is_core_eff_valid() && (get_core_eff() == attr.core_eff)); + return true; + } + return false; + } +#endif // KMP_AFFINITY_SUPPORTED bool operator==(const kmp_hw_attr_t &rhs) const { return (rhs.valid == valid && rhs.core_eff == core_eff && rhs.core_type == core_type); @@ -834,13 +904,18 @@ public: #if KMP_AFFINITY_SUPPORTED // Set the granularity for affinity settings void set_granularity(kmp_affinity_t &stgs) const; -#endif + bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const; + bool restrict_to_mask(const kmp_affin_mask_t *mask); bool filter_hw_subset(); - bool is_close(int hwt1, int hwt2, int level) const; +#endif bool is_uniform() const { return flags.uniform; } // Tell whether a type is a valid type in the topology // returns KMP_HW_UNKNOWN when there is no equivalent type - kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; } + kmp_hw_t get_equivalent_type(kmp_hw_t type) const { + if (type == KMP_HW_UNKNOWN) + return KMP_HW_UNKNOWN; + return equivalent[type]; + } // Set type1 = type2 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); diff --git a/openmp/runtime/src/kmp_barrier.cpp 
b/openmp/runtime/src/kmp_barrier.cpp index bf56c7884970..281b8e9c2883 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -2591,7 +2591,7 @@ void __kmp_fork_barrier(int gtid, int tid) { __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); } else { - __kmp_affinity_set_place(gtid); + __kmp_affinity_bind_place(gtid); } } #endif // KMP_AFFINITY_SUPPORTED diff --git a/openmp/runtime/src/kmp_barrier.h b/openmp/runtime/src/kmp_barrier.h index ac28a13217e9..ae9b8d62f4c3 100644 --- a/openmp/runtime/src/kmp_barrier.h +++ b/openmp/runtime/src/kmp_barrier.h @@ -21,7 +21,10 @@ #define KMP_ALIGNED_ALLOCATE(size, alignment) _mm_malloc(size, alignment) #define KMP_ALIGNED_FREE(ptr) _mm_free(ptr) #elif KMP_HAVE_ALIGNED_ALLOC -#define KMP_ALIGNED_ALLOCATE(size, alignment) aligned_alloc(alignment, size) +#define KMP_ALGIN_UP(val, alignment) \ + (((val) + (alignment)-1) / (alignment) * (alignment)) +#define KMP_ALIGNED_ALLOCATE(size, alignment) \ + aligned_alloc(alignment, KMP_ALGIN_UP(size, alignment)) #define KMP_ALIGNED_FREE(ptr) free(ptr) #elif KMP_HAVE_POSIX_MEMALIGN static inline void *KMP_ALIGNED_ALLOCATE(size_t size, size_t alignment) { diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp index 8d0ed0e945c0..2c410ca9b603 100644 --- a/openmp/runtime/src/kmp_collapse.cpp +++ b/openmp/runtime/src/kmp_collapse.cpp @@ -27,7 +27,7 @@ // avoid inadevertently using a library based abs template <typename T> T __kmp_abs(const T val) { - return (val < 0) ? -val: val; + return (val < 0) ? -val : val; } kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; } kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; } @@ -36,7 +36,34 @@ kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; } // Common functions for working with rectangular and non-rectangular loops //---------------------------------------------------------------------------- -template <typename T> int __kmp_sign(T val) { return (T(0) < val) - (val < T(0)); } +template <typename T> int __kmp_sign(T val) { + return (T(0) < val) - (val < T(0)); +} + +template <typename T> class CollapseAllocator { + typedef T *pT; + +private: + static const size_t allocaSize = 32; // size limit for stack allocations + // (8 bytes x 4 nested loops) + char stackAlloc[allocaSize]; + static constexpr size_t maxElemCount = allocaSize / sizeof(T); + pT pTAlloc; + +public: + CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) { + if (n > maxElemCount) { + pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T))); + } + } + ~CollapseAllocator() { + if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) { + __kmp_free(pTAlloc); + } + } + T &operator[](int index) { return pTAlloc[index]; } + operator const pT() { return pTAlloc; } +}; //----------Loop canonicalization--------------------------------------------- @@ -463,8 +490,7 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, /*out*/ kmp_uint64 *original_ivs, kmp_index_t n) { - kmp_iterations_t iterations = - (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n); + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // First, calc corresponding iteration in every original loop: for (kmp_index_t ind = n; ind > 0;) { @@ -485,7 +511,6 @@ __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv, kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind); } - __kmp_free(iterations); } 
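Two allocation details in the hunks above are easy to miss. The kmp_barrier.h change rounds the requested size up to a multiple of the alignment (KMP_ALGIN_UP) before calling aligned_alloc, since aligned_alloc is specified for sizes that are an integral multiple of the alignment. The kmp_collapse.cpp change introduces CollapseAllocator, which the following hunks use to replace paired __kmp_allocate/__kmp_free calls: scratch arrays of up to 32 bytes (four 8-byte loop counters) are carved out of an on-stack buffer, larger requests fall back to the heap, and the destructor releases the heap case, so the many early returns in __kmpc_for_collapsed_init no longer need explicit cleanup. A minimal standalone sketch of the same small-buffer pattern, using new[]/delete[] in place of the runtime allocator (SmallScratch is an illustrative name, not part of the patch):

```cpp
#include <cstddef>

// Sketch of the CollapseAllocator idea for trivially-constructible element types:
// requests that fit in StackBytes are served from a member array (no heap traffic
// for the common case of a few collapsed loops); larger requests go to the heap
// and are released by the destructor, so early returns need no manual cleanup.
template <typename T, std::size_t StackBytes = 32> class SmallScratch {
  alignas(T) char stack_[StackBytes];
  T *ptr_;
  static constexpr std::size_t kMaxStackElems = StackBytes / sizeof(T);

public:
  explicit SmallScratch(std::size_t n)
      : ptr_(n > kMaxStackElems ? new T[n] : reinterpret_cast<T *>(stack_)) {}
  ~SmallScratch() {
    if (ptr_ != reinterpret_cast<T *>(stack_))
      delete[] ptr_;
  }
  SmallScratch(const SmallScratch &) = delete;
  SmallScratch &operator=(const SmallScratch &) = delete;
  T &operator[](std::size_t i) { return ptr_[i]; }
  operator T *() { return ptr_; }
};

// Usage, mirroring the runtime's CollapseAllocator<kmp_loop_nest_iv_t> iterations(n):
//   SmallScratch<unsigned long long> iterations(n);
```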
//---------------------------------------------------------------------------- @@ -924,9 +949,7 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, /*out*/ kmp_point_t original_ivs) { // Iterations in the original space, multiplied by step: - kmp_iterations_t iterations = - (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n); - + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); for (kmp_index_t ind = n; ind > 0;) { --ind; iterations[ind] = 0; @@ -936,7 +959,6 @@ bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest, bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n, /*in/out*/ original_ivs, /*in/out*/ iterations, 0); - __kmp_free(iterations); return b; } @@ -948,9 +970,7 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, kmp_index_t n, const kmp_point_t original_ivs, /*out*/ kmp_point_t next_original_ivs) { // Iterations in the original space, multiplied by step (so can be negative): - kmp_iterations_t iterations = - (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n); - + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // First, calc corresponding iteration in every original loop: for (kmp_index_t ind = 0; ind < n; ++ind) { auto bounds = &(original_bounds_nest[ind]); @@ -969,7 +989,6 @@ bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest, bool b = kmp_calc_original_ivs_from_iterations( original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind); - __kmp_free(iterations); return b; } @@ -1132,9 +1151,7 @@ bool kmp_calc_original_ivs_for_chunk_end( /*out*/ kmp_point_t original_ivs) { // Iterations in the expanded space: - kmp_iterations_t iterations = - (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n); - + CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // First, calc corresponding iteration in every modified loop: for (kmp_index_t ind = n; ind > 0;) { --ind; @@ -1166,7 +1183,6 @@ bool kmp_calc_original_ivs_for_chunk_end( // Too big (or too small for >=). if (ind == 0) { // Need to reduce to the end. 
- __kmp_free(iterations); return false; } else { // Go to next iteration on outer loop: @@ -1197,7 +1213,6 @@ bool kmp_calc_original_ivs_for_chunk_end( ++ind; } - __kmp_free(iterations); return true; } @@ -1291,9 +1306,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n); - bounds_info_internal_t *updated_bounds_nest = - (bounds_info_internal_t *)__kmp_allocate(sizeof(bounds_info_internal_t) * - n); + CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n); for (kmp_index_t i = 0; i < n; ++i) { updated_bounds_nest[i].b = original_bounds_nest[i]; @@ -1308,7 +1321,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, if (total == 0) { // Loop won't execute: - __kmp_free(updated_bounds_nest); return FALSE; } @@ -1322,20 +1334,11 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, KMP_DEBUG_ASSERT(tid < nth); - kmp_point_t original_ivs_start = - (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n); - kmp_point_t original_ivs_end = - (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n); - kmp_point_t original_ivs_next_start = - (kmp_point_t)__kmp_allocate(sizeof(kmp_uint64) * n); + CollapseAllocator<kmp_uint64> original_ivs_start(n); if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n, /*out*/ original_ivs_start)) { // Loop won't execute: - __kmp_free(updated_bounds_nest); - __kmp_free(original_ivs_start); - __kmp_free(original_ivs_end); - __kmp_free(original_ivs_next_start); return FALSE; } @@ -1354,10 +1357,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, // if (plastiter != NULL) { // *plastiter = TRUE; // } - // __kmp_free(updated_bounds_nest); - // __kmp_free(original_ivs_start); - // __kmp_free(original_ivs_end); - // __kmp_free(original_ivs_next_start); // return TRUE; //} @@ -1391,6 +1390,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, new_iv += curr_chunk_size - 1; } + CollapseAllocator<kmp_uint64> original_ivs_end(n); if ((nth == 1) || (new_iv >= total - 1)) { // Do this one till the end - just in case we miscalculated // and either too much is left to process or new_iv is a bit too big: @@ -1421,10 +1421,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, if (last_iter && (tid != 0)) { // We are done, this was last chunk, but no chunk for current thread was // found: - __kmp_free(updated_bounds_nest); - __kmp_free(original_ivs_start); - __kmp_free(original_ivs_end); - __kmp_free(original_ivs_next_start); return FALSE; } @@ -1432,6 +1428,7 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, // We found the chunk for this thread, now we need to check if it's the // last chunk or not: + CollapseAllocator<kmp_uint64> original_ivs_next_start(n); if (last_iter || !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_next_start)) { @@ -1453,10 +1450,6 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, chunk_bounds_nest[i].ub1_u64 = 0; } - __kmp_free(updated_bounds_nest); - __kmp_free(original_ivs_start); - __kmp_free(original_ivs_end); - __kmp_free(original_ivs_next_start); return TRUE; } @@ -1478,9 +1471,5 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid, original_ivs_start, n); } - __kmp_free(updated_bounds_nest); - __kmp_free(original_ivs_start); - __kmp_free(original_ivs_end); - __kmp_free(original_ivs_next_start); return FALSE; } diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake index 58bf64112b1a..5f04301c91c6 100644 --- 
a/openmp/runtime/src/kmp_config.h.cmake +++ b/openmp/runtime/src/kmp_config.h.cmake @@ -104,6 +104,8 @@ # define CACHE_LINE 128 #elif KMP_ARCH_AARCH64_A64FX # define CACHE_LINE 256 +#elif KMP_ARCH_S390X +# define CACHE_LINE 256 #else # define CACHE_LINE 64 #endif diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp index 95f724f68255..9eeaeb88fb9e 100644 --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -343,7 +343,6 @@ Perform a fork only if the condition is true. void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, kmp_int32 cond, void *args) { int gtid = __kmp_entry_gtid(); - int zero = 0; if (cond) { if (args) __kmpc_fork_call(loc, argc, microtask, args); @@ -352,10 +351,29 @@ void __kmpc_fork_call_if(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, } else { __kmpc_serialized_parallel(loc, gtid); +#if OMPT_SUPPORT + void *exit_frame_ptr; +#endif + if (args) - microtask(&gtid, &zero, args); + __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid, + /*npr=*/0, + /*argc=*/1, &args +#if OMPT_SUPPORT + , + &exit_frame_ptr +#endif + ); else - microtask(&gtid, &zero); + __kmp_invoke_microtask(VOLATILE_CAST(microtask_t) microtask, gtid, + /*npr=*/0, + /*argc=*/0, + /*args=*/nullptr +#if OMPT_SUPPORT + , + &exit_frame_ptr +#endif + ); __kmpc_end_serialized_parallel(loc, gtid); } @@ -385,6 +403,24 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, @ingroup PARALLEL @param loc source location information @param global_tid global thread number +@param thread_limit limit on number of threads which can be created within the +current task + +Set the thread_limit for the current task +This call is there to support `thread_limit` clause on the `target` construct +*/ +void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid, + kmp_int32 thread_limit) { + __kmp_assert_valid_gtid(global_tid); + kmp_info_t *thread = __kmp_threads[global_tid]; + if (thread_limit > 0) + thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit; +} + +/*! +@ingroup PARALLEL +@param loc source location information +@param global_tid global thread number @param num_teams_lb lower bound on number of teams requested for the teams construct @param num_teams_ub upper bound on number of teams requested for the teams @@ -2065,14 +2101,15 @@ void kmpc_set_stacksize_s(size_t arg) { } void kmpc_set_blocktime(int arg) { - int gtid, tid; + int gtid, tid, bt = arg; kmp_info_t *thread; gtid = __kmp_entry_gtid(); tid = __kmp_tid_from_gtid(gtid); thread = __kmp_thread_from_gtid(gtid); - __kmp_aux_set_blocktime(arg, thread, tid); + __kmp_aux_convert_blocktime(&bt); + __kmp_aux_set_blocktime(bt, thread, tid); } void kmpc_set_library(int arg) { diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index a6ee844e5988..ac85b2b3f2fc 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -90,6 +90,70 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, return monotonicity; } +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +// Return floating point number rounded to two decimal points +static inline float __kmp_round_2decimal_val(float num) { + return (float)(static_cast<int>(num * 100 + 0.5)) / 100; +} +static inline int __kmp_get_round_val(float num) { + return static_cast<int>(num < 0 ?
num - 0.5 : num + 0.5); +} +#endif + +template <typename T> +inline void +__kmp_initialize_self_buffer(kmp_team_t *team, T id, + dispatch_private_info_template<T> *pr, + typename traits_t<T>::unsigned_t nchunks, T nproc, + typename traits_t<T>::unsigned_t &init, + T &small_chunk, T &extras, T &p_extra) { + +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + if (pr->flags.use_hybrid) { + kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)]; + kmp_hw_core_type_t type = + (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type; + T pchunks = pr->u.p.pchunks; + T echunks = nchunks - pchunks; + T num_procs_with_pcore = pr->u.p.num_procs_with_pcore; + T num_procs_with_ecore = nproc - num_procs_with_pcore; + T first_thread_with_ecore = pr->u.p.first_thread_with_ecore; + T big_chunk = + pchunks / num_procs_with_pcore; // chunks per thread with p-core + small_chunk = + echunks / num_procs_with_ecore; // chunks per thread with e-core + + extras = + (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore); + + p_extra = (big_chunk - small_chunk); + + if (type == KMP_HW_CORE_TYPE_CORE) { + if (id < first_thread_with_ecore) { + init = id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + (id - num_procs_with_ecore) * p_extra + + (id < extras ? id : extras); + } + } else { + if (id == first_thread_with_ecore) { + init = id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + first_thread_with_ecore * p_extra + + (id < extras ? id : extras); + } + } + p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0; + return; + } +#endif + + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + p_extra = 0; + init = id * small_chunk + (id < extras ? id : extras); +} + #if KMP_STATIC_STEAL_ENABLED enum { // values for steal_flag (possible states of private per-loop buffer) UNUSED = 0, @@ -366,7 +430,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, switch (schedule) { #if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: { - T ntc, init; + T ntc, init = 0; KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n", @@ -376,7 +440,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, if (nproc > 1 && ntc >= nproc) { KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); T id = tid; - T small_chunk, extras; + T small_chunk, extras, p_extra = 0; kmp_uint32 old = UNUSED; int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); if (traits_t<T>::type_size > 4) { @@ -388,13 +452,110 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); __kmp_init_lock(pr->u.p.steal_lock); } - small_chunk = ntc / nproc; - extras = ntc % nproc; - init = id * small_chunk + (id < extras ? 
id : extras); +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + // Iterations are divided in a 60/40 skewed distribution among CORE and + // ATOM processors for hybrid systems + bool use_hybrid = false; + kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN; + T first_thread_with_ecore = 0; + T num_procs_with_pcore = 0; + T num_procs_with_ecore = 0; + T p_ntc = 0, e_ntc = 0; + if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none && + __kmp_affinity.type != affinity_explicit) { + use_hybrid = true; + core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type; + if (core_type != KMP_HW_CORE_TYPE_UNKNOWN && + __kmp_first_osid_with_ecore > -1) { + for (int i = 0; i < team->t.t_nproc; ++i) { + kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i] + ->th.th_topology_attrs.core_type; + int id = team->t.t_threads[i]->th.th_topology_ids.os_id; + if (id == __kmp_first_osid_with_ecore) { + first_thread_with_ecore = + team->t.t_threads[i]->th.th_info.ds.ds_tid; + } + if (type == KMP_HW_CORE_TYPE_CORE) { + num_procs_with_pcore++; + } else if (type == KMP_HW_CORE_TYPE_ATOM) { + num_procs_with_ecore++; + } else { + use_hybrid = false; + break; + } + } + } + if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) { + float multiplier = 60.0 / 40.0; + float p_ratio = (float)num_procs_with_pcore / nproc; + float e_ratio = (float)num_procs_with_ecore / nproc; + float e_multiplier = + (float)1 / + (((multiplier * num_procs_with_pcore) / nproc) + e_ratio); + float p_multiplier = multiplier * e_multiplier; + p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier); + if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier)) + e_ntc = + (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier)); + else + e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier); + KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc); + + // Use regular static steal if not enough chunks for skewed + // distribution + use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore && + e_ntc >= num_procs_with_ecore) + ? true + : false); + } else { + use_hybrid = false; + } + } + pr->flags.use_hybrid = use_hybrid; + pr->u.p.pchunks = p_ntc; + pr->u.p.num_procs_with_pcore = num_procs_with_pcore; + pr->u.p.first_thread_with_ecore = first_thread_with_ecore; + + if (use_hybrid) { + KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore); + T big_chunk = p_ntc / num_procs_with_pcore; + small_chunk = e_ntc / num_procs_with_ecore; + + extras = + (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore); + + p_extra = (big_chunk - small_chunk); + + if (core_type == KMP_HW_CORE_TYPE_CORE) { + if (id < first_thread_with_ecore) { + init = + id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + (id - num_procs_with_ecore) * p_extra + + (id < extras ? id : extras); + } + } else { + if (id == first_thread_with_ecore) { + init = + id * small_chunk + id * p_extra + (id < extras ? id : extras); + } else { + init = id * small_chunk + first_thread_with_ecore * p_extra + + (id < extras ? id : extras); + } + } + p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0; + } else +#endif + { + small_chunk = ntc / nproc; + extras = ntc % nproc; + init = id * small_chunk + (id < extras ? id : extras); + p_extra = 0; + } pr->u.p.count = init; if (claimed) { // are we succeeded in claiming own buffer? - pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 
1 : 0); // Other threads will inspect steal_flag when searching for a victim. // READY means other threads may steal from this thread from now on. KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); @@ -1261,13 +1422,13 @@ int __kmp_dispatch_next_algorithm(int gtid, if (status) { // initialize self buffer with victim's whole range of chunks T id = victimId; - T small_chunk, extras; - small_chunk = nchunks / nproc; // chunks per thread - extras = nchunks % nproc; - init = id * small_chunk + (id < extras ? id : extras); + T small_chunk = 0, extras = 0, p_extra = 0; + __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc, + init, small_chunk, extras, + p_extra); __kmp_acquire_lock(lck, gtid); pr->u.p.count = init + 1; // exclude one we execute immediately - pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0); __kmp_release_lock(lck, gtid); pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid // no need to reinitialize other thread invariants: lb, st, etc. @@ -1275,10 +1436,10 @@ int __kmp_dispatch_next_algorithm(int gtid, { char *buff; // create format specifiers before the debug output - buff = __kmp_str_format( - "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " - "count:%%%s ub:%%%s\n", - traits_t<UT>::spec, traits_t<T>::spec); + buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " + "stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t<UT>::spec, traits_t<T>::spec); KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); __kmp_str_free(&buff); } @@ -1404,12 +1565,12 @@ int __kmp_dispatch_next_algorithm(int gtid, if (status) { // initialize self buffer with victim's whole range of chunks T id = victimId; - T small_chunk, extras; - small_chunk = nchunks / nproc; // chunks per thread - extras = nchunks % nproc; - init = id * small_chunk + (id < extras ? id : extras); + T small_chunk = 0, extras = 0, p_extra = 0; + __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc, + init, small_chunk, extras, + p_extra); vnew.p.count = init + 1; - vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); + vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 
1 : 0); // write pair (count, ub) at once atomically #if KMP_ARCH_X86 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); @@ -1422,10 +1583,10 @@ int __kmp_dispatch_next_algorithm(int gtid, { char *buff; // create format specifiers before the debug output - buff = __kmp_str_format( - "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " - "count:%%%s ub:%%%s\n", - traits_t<UT>::spec, traits_t<T>::spec); + buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d " + "stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t<UT>::spec, traits_t<T>::spec); KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); __kmp_str_free(&buff); } diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h index 154db174613d..cf19eb52662c 100644 --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -75,14 +75,17 @@ template <typename T> struct dispatch_private_infoXX_template { ST st; // signed UT tc; // unsigned kmp_lock_t *steal_lock; // lock used for chunk stealing + + UT ordered_lower; // unsigned + UT ordered_upper; // unsigned + /* parm[1-4] are used in different ways by different scheduling algorithms */ - // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) + // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are in the same line (not measured though). - struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4 T parm1; T parm2; @@ -90,8 +93,11 @@ template <typename T> struct dispatch_private_infoXX_template { T parm4; }; - UT ordered_lower; // unsigned - UT ordered_upper; // unsigned +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED + UT pchunks; // total number of chunks for processes with p-core + UT num_procs_with_pcore; // number of threads with p-core + T first_thread_with_ecore; +#endif #if KMP_OS_WINDOWS T last_upper; #endif /* KMP_OS_WINDOWS */ diff --git a/openmp/runtime/src/kmp_environment.cpp b/openmp/runtime/src/kmp_environment.cpp index b35027b57f03..4def6ea9ac20 100644 --- a/openmp/runtime/src/kmp_environment.cpp +++ b/openmp/runtime/src/kmp_environment.cpp @@ -407,9 +407,11 @@ ___kmp_env_blk_parse_unix(kmp_env_blk_t *block, // M: Env block to fill. int i; var = bulk; for (i = 0; i < count; ++i) { + KMP_ASSERT(var < bulk + size); + [[maybe_unused]] size_t ssize = size - (var - bulk); // Copy variable to bulk. len = KMP_STRLEN(env[i]); - KMP_MEMCPY_S(var, size, env[i], len + 1); + KMP_MEMCPY_S(var, ssize, env[i], len + 1); // Save found variable in vars array. 
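Returning to the kmp_dispatch.cpp static-steal changes above: when KMP_WEIGHTED_ITERATIONS_SUPPORTED applies (hybrid CPU, affinity neither none nor explicit, core types known, and enough chunks for every thread), the chunk pool is split so that a thread on a performance core gets 1.5x the chunks of a thread on an efficiency core, the "60/40" weighting named in the comment, while the two class shares still sum to the total; the new pchunks, num_procs_with_pcore and first_thread_with_ecore fields in dispatch_private_infoXX_template let __kmp_initialize_self_buffer recompute a victim's initial range the same way when stealing. A standalone sketch of the arithmetic, not the runtime's code (it simplifies the rounding by taking the efficiency-core share as the remainder):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

// Split total_chunks between threads on performance (P) and efficiency (E) cores
// with the same 60/40 weighting as the hybrid static-steal path: each P-core
// thread is weighted 1.5x relative to an E-core thread.
static void split_chunks(int total_chunks, int p_threads, int e_threads,
                         int *p_chunks, int *e_chunks) {
  const int nproc = p_threads + e_threads;
  const float multiplier = 60.0f / 40.0f; // per-thread weight of a P core
  const float p_ratio = (float)p_threads / nproc;
  const float e_ratio = (float)e_threads / nproc;
  const float e_mult = 1.0f / ((multiplier * p_threads) / nproc + e_ratio);
  const float p_mult = multiplier * e_mult;
  *p_chunks = (int)std::lround(total_chunks * p_ratio * p_mult);
  *e_chunks = total_chunks - *p_chunks; // keep p_chunks + e_chunks == total_chunks
}

int main() {
  int p = 0, e = 0;
  split_chunks(/*total_chunks=*/100, /*p_threads=*/4, /*e_threads=*/4, &p, &e);
  std::printf("P cores: %d chunks (%d each), E cores: %d chunks (%d each)\n",
              p, p / 4, e, e / 4); // 60 (15 each) vs 40 (10 each)
  assert(p + e == 100);
  return 0;
}
```

With 100 chunks on four P-core and four E-core threads this gives 60 and 40 chunks, i.e. 15 per P-core thread versus 10 per E-core thread, the 1.5x ratio the runtime aims for; the runtime keeps the same invariant with its __kmp_get_round_val and __kmp_round_2decimal_val helpers instead of a plain remainder.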
__kmp_str_split(var, '=', &name, &value); vars[i].name = name; diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index 038bccfba3ea..ad19079cb650 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -112,17 +112,19 @@ void FTN_STDCALL FTN_SET_BLOCKTIME(int KMP_DEREF arg) { #ifdef KMP_STUB __kmps_set_blocktime(KMP_DEREF arg); #else - int gtid, tid; + int gtid, tid, bt = (KMP_DEREF arg); kmp_info_t *thread; gtid = __kmp_entry_gtid(); tid = __kmp_tid_from_gtid(gtid); thread = __kmp_thread_from_gtid(gtid); - __kmp_aux_set_blocktime(KMP_DEREF arg, thread, tid); + __kmp_aux_convert_blocktime(&bt); + __kmp_aux_set_blocktime(bt, thread, tid); #endif } +// Gets blocktime in units used for KMP_BLOCKTIME, ms otherwise int FTN_STDCALL FTN_GET_BLOCKTIME(void) { #ifdef KMP_STUB return __kmps_get_blocktime(); @@ -136,21 +138,24 @@ int FTN_STDCALL FTN_GET_BLOCKTIME(void) { /* These must match the settings used in __kmp_wait_sleep() */ if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { - KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid, - team->t.t_id, tid, KMP_MAX_BLOCKTIME)); + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, KMP_MAX_BLOCKTIME, __kmp_blocktime_units)); return KMP_MAX_BLOCKTIME; } #ifdef KMP_ADJUST_BLOCKTIME else if (__kmp_zero_bt && !get__bt_set(team, tid)) { - KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid, - team->t.t_id, tid, 0)); + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, 0, __kmp_blocktime_units)); return 0; } #endif /* KMP_ADJUST_BLOCKTIME */ else { - KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d\n", gtid, - team->t.t_id, tid, get__blocktime(team, tid))); - return get__blocktime(team, tid); + int bt = get__blocktime(team, tid); + if (__kmp_blocktime_units == 'm') + bt = bt / 1000; + KF_TRACE(10, ("kmp_get_blocktime: T#%d(%d:%d), blocktime=%d%cs\n", gtid, + team->t.t_id, tid, bt, __kmp_blocktime_units)); + return bt; } #endif } @@ -577,7 +582,7 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_NUM)(void) { int gtid; #if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_HURD || KMP_OS_OPENBSD + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS gtid = __kmp_entry_gtid(); #elif KMP_OS_WINDOWS if (!__kmp_init_parallel || @@ -802,6 +807,10 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) { gtid = __kmp_entry_gtid(); thread = __kmp_threads[gtid]; + // If thread_limit for the target task is defined, return that instead of the + // regular task thread_limit + if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit) + return thread_limit; return thread->th.th_current_task->td_icvs.thread_limit; #endif } diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index d37c9c86028e..7d595b947f4a 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -116,6 +116,8 @@ #define FTN_TARGET_IS_PRESENT omp_target_is_present #define FTN_TARGET_MEMCPY omp_target_memcpy #define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect +#define FTN_TARGET_MEMSET omp_target_memset +#define FTN_TARGET_MEMSET_ASYNC omp_target_memset_async #define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr #define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr #endif diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index 4ce0691abf8d..b132f38fd3b0 
100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -125,6 +125,7 @@ size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE; int __kmp_sys_max_nth = KMP_MAX_NTH; int __kmp_max_nth = 0; int __kmp_cg_max_nth = 0; +int __kmp_task_max_nth = 0; int __kmp_teams_max_nth = 0; int __kmp_threads_capacity = 0; int __kmp_dflt_team_nth = 0; @@ -154,7 +155,8 @@ int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL}; #endif -int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; +int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // in microseconds +char __kmp_blocktime_units = 'm'; // Units specified in KMP_BLOCKTIME bool __kmp_wpolicy_passive = false; #if KMP_USE_MONITOR int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS; @@ -280,6 +282,9 @@ kmp_affinity_t __kmp_hh_affinity = kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity}; char *__kmp_cpuinfo_file = NULL; +#if KMP_WEIGHTED_ITERATIONS_SUPPORTED +int __kmp_first_osid_with_ecore = -1; +#endif #endif /* KMP_AFFINITY_SUPPORTED */ diff --git a/openmp/runtime/src/kmp_itt.inl b/openmp/runtime/src/kmp_itt.inl index 5e75f60124af..5236165c35b3 100644 --- a/openmp/runtime/src/kmp_itt.inl +++ b/openmp/runtime/src/kmp_itt.inl @@ -438,7 +438,7 @@ void *__kmp_itt_barrier_object(int gtid, int bt, int set_name, KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= bs_last_barrier); // This condition is a must (we would have zero divide otherwise). KMP_BUILD_ASSERT(sizeof(kmp_team_t) >= 2 * bs_last_barrier); - // More strong condition: make sure we have room at least for for two + // More strong condition: make sure we have room at least for two // different ids (for each barrier type). object = reinterpret_cast<void *>( (kmp_uintptr_t)(team) + diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 8fcddc710862..85c54f4cdc7e 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -3809,7 +3809,7 @@ static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) { sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1)); table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table; // We cannot free the previous table now, since it may be in use by other - // threads. So save the pointer to the previous table in in the first + // threads. So save the pointer to the previous table in the first // element of the new table. All the tables will be organized into a list, // and could be freed when library shutting down. 
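The kmp_global.cpp hunk above anchors the blocktime rework that runs through kmp_csupport.cpp, kmp_ftn_entry.h and kmp_settings.cpp in this import: __kmp_dflt_blocktime is now stored in microseconds and __kmp_blocktime_units records whether the user chose milliseconds or microseconds, so KMP_BLOCKTIME accepts an explicit "ms" or "us" suffix while kmp_set_blocktime()/kmp_get_blocktime() keep speaking the user-visible units (per the conversion calls and the comment added in kmp_ftn_entry.h). A usage sketch; the values are illustrative and the spellings follow the parser shown further below:

```cpp
#include <omp.h>
#include <cstdio>

// With this change the accepted spellings and their meaning are, for example:
//   KMP_BLOCKTIME=200       -> 200 ms (stored internally as 200000 us)
//   KMP_BLOCKTIME=200ms     -> 200 ms
//   KMP_BLOCKTIME=70us      -> 70 us
//   KMP_BLOCKTIME=infinite  -> KMP_MAX_BLOCKTIME
// The kmp_* entry points continue to operate in the units KMP_BLOCKTIME selected
// (milliseconds unless a "us" suffix was given); conversion happens internally.
int main() {
  kmp_set_blocktime(200);                               // 200 in the user-visible units
  std::printf("blocktime = %d\n", kmp_get_blocktime()); // reported in the same units
  return 0;
}
```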
__kmp_user_lock_table.table = table; diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index fec589ab6018..beb8d0197ddf 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -105,8 +105,9 @@ 128-bit extended precision type yet */ typedef long double _Quad; #elif KMP_COMPILER_GCC -/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */ -#if !KMP_OS_NETBSD +/* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad until + NetBSD 10.0 which ships with GCC 10.5 */ +#if (!KMP_OS_NETBSD || __GNUC__ >= 10) typedef __float128 _Quad; #undef KMP_HAVE_QUAD #define KMP_HAVE_QUAD 1 @@ -178,7 +179,8 @@ typedef unsigned long long kmp_uint64; #if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS #define KMP_SIZE_T_SPEC KMP_UINT32_SPEC #elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X #define KMP_SIZE_T_SPEC KMP_UINT64_SPEC #else #error "Can't determine size_t printf format specifier." @@ -1043,7 +1045,8 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #endif /* KMP_OS_WINDOWS */ #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X #if KMP_OS_WINDOWS #undef KMP_MB #define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst) diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index fcfd8bc5d8d9..b7972c7248dd 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -23,6 +23,7 @@ #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 #define KMP_OS_HURD 0 +#define KMP_OS_SOLARIS 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ #ifdef _WIN32 @@ -70,13 +71,19 @@ #define KMP_OS_HURD 1 #endif +#if (defined __sun__ && defined __svr4__) +#undef KMP_OS_SOLARIS +#define KMP_OS_SOLARIS 1 +#endif + #if (1 != KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \ - KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD) + KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD + \ + KMP_OS_SOLARIS) #error Unknown OS #endif #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD + KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_SOLARIS #undef KMP_OS_UNIX #define KMP_OS_UNIX 1 #endif @@ -93,6 +100,8 @@ #define KMP_ARCH_MIPS64 0 #define KMP_ARCH_RISCV64 0 #define KMP_ARCH_LOONGARCH64 0 +#define KMP_ARCH_VE 0 +#define KMP_ARCH_S390X 0 #if KMP_OS_WINDOWS #if defined(_M_AMD64) || defined(__x86_64) @@ -142,6 +151,12 @@ #elif defined __loongarch__ && __loongarch_grlen == 64 #undef KMP_ARCH_LOONGARCH64 #define KMP_ARCH_LOONGARCH64 1 +#elif defined __ve__ +#undef KMP_ARCH_VE +#define KMP_ARCH_VE 1 +#elif defined __s390x__ +#undef KMP_ARCH_S390X +#define KMP_ARCH_S390X 1 #endif #endif @@ -206,7 +221,8 @@ // TODO: Fixme - This is clever, but really fugly #if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \ KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \ - KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64) + KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \ + KMP_ARCH_S390X) #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index e55798df610c..25136691bc72 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -178,7 +178,12 @@ int __kmp_get_global_thread_id() { if (stack_diff <= stack_size) { /* The only way we can be closer than the allocated */ /* stack size is if we are running on this thread. */ - KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); + // __kmp_gtid_get_specific can return negative value because this + // function can be called by thread destructor. However, before the + // thread destructor is called, the value of the corresponding + // thread-specific data will be reset to NULL. + KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 || + __kmp_gtid_get_specific() == i); return i; } } @@ -196,6 +201,12 @@ int __kmp_get_global_thread_id() { if (i < 0) return i; + // other_threads[i] can be nullptr at this point because the corresponding + // thread could have already been destructed. It can happen when this function + // is called in end library routine. + if (!TCR_SYNC_PTR(other_threads[i])) + return i; + /* dynamically updated stack window for uber threads to avoid get_specific call */ if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { @@ -1872,6 +1883,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, int nthreads; int master_active; int master_set_numthreads; + int task_thread_limit = 0; int level; int active_level; int teams_level; @@ -1910,6 +1922,8 @@ int __kmp_fork_call(ident_t *loc, int gtid, root = master_th->th.th_root; master_active = root->r.r_active; master_set_numthreads = master_th->th.th_set_nproc; + task_thread_limit = + master_th->th.th_current_task->td_icvs.task_thread_limit; #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; @@ -2000,6 +2014,11 @@ int __kmp_fork_call(ident_t *loc, int gtid, ? 
master_set_numthreads // TODO: get nproc directly from current task : get__nproc_2(parent_team, master_tid); + // Use the thread_limit set for the current target task if exists, else go + // with the deduced nthreads + nthreads = task_thread_limit > 0 && task_thread_limit < nthreads + ? task_thread_limit + : nthreads; // Check if we need to take forkjoin lock? (no need for serialized // parallel out of teams construct). if (nthreads > 1) { @@ -3291,6 +3310,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) { // next parallel region (per thread) // (use a max ub on value if __kmp_parallel_initialize not called yet) __kmp_cg_max_nth, // int thread_limit; + __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit + // on task. This is used in the case of target thread_limit __kmp_dflt_max_active_levels, // int max_active_levels; //internal control // for max_active_levels r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule @@ -4671,6 +4692,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } #endif /* KMP_ADJUST_BLOCKTIME */ +#if KMP_AFFINITY_SUPPORTED + // Set the affinity and topology information for new thread + __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE); +#endif + /* actually fork it and create the new worker thread */ KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); @@ -4764,6 +4790,19 @@ static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, } #if KMP_AFFINITY_SUPPORTED +static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th, + int first, int last, int newp) { + th->th.th_first_place = first; + th->th.th_last_place = last; + th->th.th_new_place = newp; + if (newp != th->th.th_current_place) { + if (__kmp_display_affinity && team->t.t_display_affinity != 1) + team->t.t_display_affinity = 1; + // Copy topology information associated with the new place + th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place]; + th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place]; + } +} // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
// It calculates the worker + primary thread's partition based upon the parent @@ -4803,13 +4842,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { for (f = 1; f < n_th; f++) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = masters_place; - if (__kmp_display_affinity && masters_place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, masters_place); KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4840,13 +4873,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { } else { place++; } - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, place); KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " "partition = [%d,%d]\n", @@ -4865,13 +4892,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = first_place; - th->th.th_last_place = last_place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first_place, last_place, place); s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -4938,12 +4959,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + int fplace = place, nplace = place; s_count = 1; while (s_count < S) { if (place == last_place) { @@ -4966,7 +4982,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { rem--; gap_ct = 0; } - th->th.th_last_place = place; + __kmp_set_thread_place(team, th, fplace, place, nplace); gap_ct++; if (place == last_place) { @@ -5032,13 +5048,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { KMP_DEBUG_ASSERT(last_place >= first_place); th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th); - th->th.th_first_place = first; - th->th.th_new_place = place; - th->th.th_last_place = last; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + __kmp_set_thread_place(team, th, first, last, place); KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " "partition = [%d,%d], spacing = %.4f\n", @@ -5064,13 +5074,7 @@ static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { kmp_info_t *th = team->t.t_threads[f]; KMP_DEBUG_ASSERT(th != NULL); - th->th.th_first_place = place; - th->th.th_last_place = place; - th->th.th_new_place = place; - if (__kmp_display_affinity && place != th->th.th_current_place && - team->t.t_display_affinity != 1) { - team->t.t_display_affinity = 1; - } + 
__kmp_set_thread_place(team, th, place, place, place); s_count++; if ((s_count == S) && rem && (gap_ct == gap)) { @@ -6713,6 +6717,8 @@ static inline char *__kmp_reg_status_name() { } // __kmp_reg_status_get #if defined(KMP_USE_SHM) +bool __kmp_shm_available = false; +bool __kmp_tmp_available = false; // If /dev/shm is not accessible, we will create a temporary file under /tmp. char *temp_reg_status_file_name = nullptr; #endif @@ -6742,60 +6748,108 @@ void __kmp_register_library_startup(void) { char *value = NULL; // Actual value of the environment variable. #if defined(KMP_USE_SHM) - char *shm_name = __kmp_str_format("/%s", name); - int shm_preexist = 0; - char *data1; - int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); - if ((fd1 == -1) && (errno == EEXIST)) { - // file didn't open because it already exists. - // try opening existing file - fd1 = shm_open(shm_name, O_RDWR, 0666); - if (fd1 == -1) { // file didn't open - // error out here - __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), - __kmp_msg_null); - } else { - // able to open existing file - shm_preexist = 1; + char *shm_name = nullptr; + char *data1 = nullptr; + __kmp_shm_available = __kmp_detect_shm(); + if (__kmp_shm_available) { + int fd1 = -1; + shm_name = __kmp_str_format("/%s", name); + int shm_preexist = 0; + fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = shm_open(shm_name, O_RDWR, 0666); + if (fd1 == -1) { // file didn't open + KMP_WARNING(FunctionError, "Can't open SHM"); + __kmp_shm_available = false; + } else { // able to open existing file + shm_preexist = 1; + } + } + if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size + if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; + KMP_WARNING(FunctionError, "Can't set size of SHM"); + __kmp_shm_available = false; + } + } + if (__kmp_shm_available) { // SHM exists, now map it + data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + fd1, 0); + if (data1 == MAP_FAILED) { // failed to map shared memory + KMP_WARNING(FunctionError, "Can't map SHM"); + __kmp_shm_available = false; + } + } + if (__kmp_shm_available) { // SHM mapped + if (shm_preexist == 0) { // set data to SHM, set value + KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + } + // Read value from either what we just wrote or existing file. + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); } - } else if (fd1 == -1) { - // SHM didn't open; it was due to error other than already exists. Try to - // create a temp file under /tmp. + if (fd1 != -1) + close(fd1); + } + if (!__kmp_shm_available) + __kmp_tmp_available = __kmp_detect_tmp(); + if (!__kmp_shm_available && __kmp_tmp_available) { + // SHM failed to work due to an error other than that the file already + // exists. Try to create a temp file under /tmp. + // If /tmp isn't accessible, fall back to using environment variable. // TODO: /tmp might not always be the temporary directory. For now we will - // not consider TMPDIR. If /tmp is not accessible, we simply error out. - char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name); - fd1 = mkstemp(temp_file_name); - if (fd1 == -1) { - // error out here. - __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno), - __kmp_msg_null); + // not consider TMPDIR. 
+ int fd1 = -1; + temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name); + int tmp_preexist = 0; + fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666); + if ((fd1 == -1) && (errno == EEXIST)) { + // file didn't open because it already exists. + // try opening existing file + fd1 = open(temp_reg_status_file_name, O_RDWR, 0666); + if (fd1 == -1) { // file didn't open if (fd1 == -1) { + KMP_WARNING(FunctionError, "Can't open TEMP"); + __kmp_tmp_available = false; + } else { + tmp_preexist = 1; + } } - temp_reg_status_file_name = temp_file_name; - } - if (shm_preexist == 0) { - // we created SHM now set size - if (ftruncate(fd1, SHM_SIZE) == -1) { - // error occured setting size; - __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), - KMP_ERR(errno), __kmp_msg_null); + if (__kmp_tmp_available && tmp_preexist == 0) { + // we created /tmp file now set size + if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size; + KMP_WARNING(FunctionError, "Can't set size of /tmp file"); + __kmp_tmp_available = false; + } } + if (__kmp_tmp_available) { + data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + fd1, 0); + if (data1 == MAP_FAILED) { // failed to map /tmp + KMP_WARNING(FunctionError, "Can't map /tmp"); + __kmp_tmp_available = false; + } + } + if (__kmp_tmp_available) { + if (tmp_preexist == 0) { // set data to TMP, set value + KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + } + // Read value from either what we just wrote or existing file. + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + if (fd1 != -1) + close(fd1); } - data1 = - (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); - if (data1 == MAP_FAILED) { - // failed to map shared memory - __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), - __kmp_msg_null); - } - if (shm_preexist == 0) { // set data to SHM, set value - KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); + if (!__kmp_shm_available && !__kmp_tmp_available) { + // no /dev/shm and no /tmp -- fall back to environment variable + // Set environment variable, but do not overwrite if it exists. + __kmp_env_set(name, __kmp_registration_str, 0); + // read value to see if it got set + value = __kmp_env_get(name); } - // Read value from either what we just wrote or existing file. - value = __kmp_str_format("%s", data1); // read value from SHM - munmap(data1, SHM_SIZE); - close(fd1); #else // Windows and unix with static library - // Set environment variable, but do not overwrite if it is exist. + // Set environment variable, but do not overwrite if it exists. __kmp_env_set(name, __kmp_registration_str, 0); // read value to see if it got set value = __kmp_env_get(name); @@ -6855,8 +6909,14 @@ void __kmp_register_library_startup(void) { case 2: { // Neighbor is dead. #if defined(KMP_USE_SHM) - // close shared memory. - shm_unlink(shm_name); // this removes file in /dev/shm + if (__kmp_shm_available) { // close shared memory. + shm_unlink(shm_name); // this removes file in /dev/shm + } else if (__kmp_tmp_available) { + unlink(temp_reg_status_file_name); // this removes the temp file + } else { + // Clear the variable and try to register library again. + __kmp_env_unset(name); + } #else // Clear the variable and try to register library again. 
__kmp_env_unset(name); @@ -6869,7 +6929,8 @@ void __kmp_register_library_startup(void) { } KMP_INTERNAL_FREE((void *)value); #if defined(KMP_USE_SHM) - KMP_INTERNAL_FREE((void *)shm_name); + if (shm_name) + KMP_INTERNAL_FREE((void *)shm_name); #endif } // while KMP_INTERNAL_FREE((void *)name); @@ -6882,25 +6943,32 @@ void __kmp_unregister_library(void) { char *value = NULL; #if defined(KMP_USE_SHM) - bool use_shm = true; - char *shm_name = __kmp_str_format("/%s", name); - int fd1 = shm_open(shm_name, O_RDONLY, 0666); - if (fd1 == -1) { - // File did not open. Try the temporary file. - use_shm = false; - KMP_DEBUG_ASSERT(temp_reg_status_file_name); + char *shm_name = nullptr; + int fd1; + if (__kmp_shm_available) { + shm_name = __kmp_str_format("/%s", name); + fd1 = shm_open(shm_name, O_RDONLY, 0666); + if (fd1 != -1) { // File opened successfully + char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); + if (data1 != MAP_FAILED) { + value = __kmp_str_format("%s", data1); // read value from SHM + munmap(data1, SHM_SIZE); + } + close(fd1); + } + } else if (__kmp_tmp_available) { // try /tmp fd1 = open(temp_reg_status_file_name, O_RDONLY); - if (fd1 == -1) { - // give it up now. - return; + if (fd1 != -1) { // File opened successfully + char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); + if (data1 != MAP_FAILED) { + value = __kmp_str_format("%s", data1); // read value from /tmp + munmap(data1, SHM_SIZE); + } + close(fd1); } + } else { // fall back to envirable + value = __kmp_env_get(name); } - char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); - if (data1 != MAP_FAILED) { - value = __kmp_str_format("%s", data1); // read value from SHM - munmap(data1, SHM_SIZE); - } - close(fd1); #else value = __kmp_env_get(name); #endif @@ -6910,11 +6978,12 @@ void __kmp_unregister_library(void) { if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { // Ok, this is our variable. Delete it. 
#if defined(KMP_USE_SHM) - if (use_shm) { + if (__kmp_shm_available) { shm_unlink(shm_name); // this removes file in /dev/shm - } else { - KMP_DEBUG_ASSERT(temp_reg_status_file_name); + } else if (__kmp_tmp_available) { unlink(temp_reg_status_file_name); // this removes the temp file + } else { + __kmp_env_unset(name); } #else __kmp_env_unset(name); @@ -6922,11 +6991,10 @@ void __kmp_unregister_library(void) { } #if defined(KMP_USE_SHM) - KMP_INTERNAL_FREE(shm_name); - if (!use_shm) { - KMP_DEBUG_ASSERT(temp_reg_status_file_name); + if (shm_name) + KMP_INTERNAL_FREE(shm_name); + if (temp_reg_status_file_name) KMP_INTERNAL_FREE(temp_reg_status_file_name); - } #endif KMP_INTERNAL_FREE(__kmp_registration_str); @@ -8729,9 +8797,8 @@ void __kmp_aux_display_affinity(int gtid, const char *format) { } /* ------------------------------------------------------------------------ */ - void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { - int blocktime = arg; /* argument is in milliseconds */ + int blocktime = arg; /* argument is in microseconds */ #if KMP_USE_MONITOR int bt_intervals; #endif @@ -8827,10 +8894,12 @@ __kmp_determine_reduction_method( int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ + KMP_ARCH_VE || KMP_ARCH_S390X #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD + KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \ + KMP_OS_SOLARIS int teamsize_cutoff = 4; @@ -8854,11 +8923,13 @@ __kmp_determine_reduction_method( #else #error "Unknown or unsupported OS" #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || - // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD + // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || + // KMP_OS_SOLARIS #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS -#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD +#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS // basic tuning diff --git a/openmp/runtime/src/kmp_safe_c_api.h b/openmp/runtime/src/kmp_safe_c_api.h index 72f26fd9897d..79f4a7f5732a 100644 --- a/openmp/runtime/src/kmp_safe_c_api.h +++ b/openmp/runtime/src/kmp_safe_c_api.h @@ -56,7 +56,11 @@ template <typename T> struct kmp_get_rmax_t<T, true> { // For now, these macros use the existing API. +#if KMP_OS_NETBSD +#define KMP_ALLOCA __builtin_alloca +#else #define KMP_ALLOCA alloca +#endif #define KMP_MEMCPY_S(dst, bsz, src, cnt) memcpy(dst, src, cnt) #define KMP_SNPRINTF snprintf #define KMP_SSCANF sscanf diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index b81376d1632b..e731bf45e8ee 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -149,70 +149,6 @@ static size_t __kmp_round4k(size_t size) { } // __kmp_round4k #endif -/* Here, multipliers are like __kmp_convert_to_seconds, but floating-point - values are allowed, and the return value is in milliseconds. The default - multiplier is milliseconds. Returns INT_MAX only if the value specified - matches "infinit*". Returns -1 if specified string is invalid. 
*/ -int __kmp_convert_to_milliseconds(char const *data) { - int ret, nvalues, factor; - char mult, extra; - double value; - - if (data == NULL) - return (-1); - if (__kmp_str_match("infinit", -1, data)) - return (INT_MAX); - value = (double)0.0; - mult = '\0'; -#if KMP_OS_WINDOWS && KMP_MSVC_COMPAT - // On Windows, each %c parameter needs additional size parameter for sscanf_s - nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, 1, &extra, 1); -#else - nvalues = KMP_SSCANF(data, "%lf%c%c", &value, &mult, &extra); -#endif - if (nvalues < 1) - return (-1); - if (nvalues == 1) - mult = '\0'; - if (nvalues == 3) - return (-1); - - if (value < 0) - return (-1); - - switch (mult) { - case '\0': - /* default is milliseconds */ - factor = 1; - break; - case 's': - case 'S': - factor = 1000; - break; - case 'm': - case 'M': - factor = 1000 * 60; - break; - case 'h': - case 'H': - factor = 1000 * 60 * 60; - break; - case 'd': - case 'D': - factor = 1000 * 24 * 60 * 60; - break; - default: - return (-1); - } - - if (value >= ((INT_MAX - 1) / factor)) - ret = INT_MAX - 1; /* Don't allow infinite value here */ - else - ret = (int)(value * (double)factor); /* truncate to int */ - - return ret; -} - static int __kmp_strcasecmp_with_sentinel(char const *a, char const *b, char sentinel) { if (a == NULL) @@ -731,24 +667,73 @@ static void __kmp_stg_print_use_yield(kmp_str_buf_t *buffer, char const *name, static void __kmp_stg_parse_blocktime(char const *name, char const *value, void *data) { - __kmp_dflt_blocktime = __kmp_convert_to_milliseconds(value); - if (__kmp_dflt_blocktime < 0) { - __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; + const char *buf = value; + const char *next; + const int ms_mult = 1000; + int multiplier = 1; + int num; + + // Read integer blocktime value + SKIP_WS(buf); + if ((*buf >= '0') && (*buf <= '9')) { + next = buf; + SKIP_DIGITS(next); + num = __kmp_basic_str_to_int(buf); + KMP_ASSERT(num >= 0); + buf = next; + SKIP_WS(buf); + } else { + num = -1; + } + + // Read units: note that __kmp_dflt_blocktime units is now us + next = buf; + if (*buf == '\0' || __kmp_match_str("ms", buf, &next)) { + // units are in ms; convert + __kmp_dflt_blocktime = ms_mult * num; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } else if (__kmp_match_str("us", buf, &next)) { + // units are in us + __kmp_dflt_blocktime = num; + __kmp_blocktime_units = 'u'; + } else if (__kmp_match_str("infinite", buf, &next) || + __kmp_match_str("infinity", buf, &next)) { + // units are in ms + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } else { + KMP_WARNING(StgInvalidValue, name, value); + // default units are in ms + __kmp_dflt_blocktime = ms_mult * num; + __kmp_blocktime_units = 'm'; + multiplier = ms_mult; + } + + if (num < 0 && __kmp_dflt_blocktime < 0) { // num out of range + __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; // now in us __kmp_msg(kmp_ms_warning, KMP_MSG(InvalidValue, name, value), __kmp_msg_null); - KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime); + // Inform in appropriate units + KMP_INFORM(Using_int_Value, name, __kmp_dflt_blocktime / multiplier); __kmp_env_blocktime = FALSE; // Revert to default as if var not set. 
+ } else if (num > 0 && __kmp_dflt_blocktime < 0) { // overflow + __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; + __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), __kmp_msg_null); + KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier); + __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified. } else { if (__kmp_dflt_blocktime < KMP_MIN_BLOCKTIME) { __kmp_dflt_blocktime = KMP_MIN_BLOCKTIME; __kmp_msg(kmp_ms_warning, KMP_MSG(SmallValue, name, value), __kmp_msg_null); - KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime); + KMP_INFORM(MinValueUsing, name, __kmp_dflt_blocktime / multiplier); } else if (__kmp_dflt_blocktime > KMP_MAX_BLOCKTIME) { __kmp_dflt_blocktime = KMP_MAX_BLOCKTIME; __kmp_msg(kmp_ms_warning, KMP_MSG(LargeValue, name, value), __kmp_msg_null); - KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime); + KMP_INFORM(MaxValueUsing, name, __kmp_dflt_blocktime / multiplier); } __kmp_env_blocktime = TRUE; // KMP_BLOCKTIME was specified. } @@ -768,7 +753,17 @@ static void __kmp_stg_parse_blocktime(char const *name, char const *value, static void __kmp_stg_print_blocktime(kmp_str_buf_t *buffer, char const *name, void *data) { - __kmp_stg_print_int(buffer, name, __kmp_dflt_blocktime); + int num = __kmp_dflt_blocktime; + if (__kmp_blocktime_units == 'm') { + num = num / 1000; + } + if (__kmp_env_format) { + KMP_STR_BUF_PRINT_NAME_EX(name); + } else { + __kmp_str_buf_print(buffer, " %s=", name); + } + __kmp_str_buf_print(buffer, "%d", num); + __kmp_str_buf_print(buffer, "%cs\n", __kmp_blocktime_units); } // __kmp_stg_print_blocktime // ----------------------------------------------------------------------------- @@ -1598,7 +1593,7 @@ static void __kmp_stg_parse_debug(char const *name, char const *value, static void __kmp_stg_parse_debug_buf(char const *name, char const *value, void *data) { __kmp_stg_parse_bool(name, value, &__kmp_debug_buf); - // !!! TODO: Move buffer initialization of of this file! It may works + // !!! TODO: Move buffer initialization of this file! It may works // incorrectly if KMP_DEBUG_BUF is parsed before KMP_DEBUG_BUF_LINES or // KMP_DEBUG_BUF_CHARS. if (__kmp_debug_buf) { @@ -2005,6 +2000,21 @@ static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer, // ----------------------------------------------------------------------------- // KMP_AFFINITY, GOMP_CPU_AFFINITY, KMP_TOPOLOGY_METHOD +static inline const char * +__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { + switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + case KMP_HW_CORE_TYPE_ATOM: + return "intel_atom"; + case KMP_HW_CORE_TYPE_CORE: + return "intel_core"; +#endif + } + return "unknown"; +} + #if KMP_AFFINITY_SUPPORTED // Parse the proc id list. Return TRUE if successful, FALSE otherwise. 
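Further below, the KMP_AFFINITY granularity parser also learns the hybrid-core attributes: "core_type" and "core_efficiency"/"core_eff" are matched before the generic topology keywords (so plain "core" still parses), both map to core granularity with a flag recording which attribute to group by, and the print path echoes the same spelling back. An illustrative way to exercise it; setting the variable from the program only works if it happens before the OpenMP runtime reads its configuration, i.e. before the first OpenMP construct or API call:

```cpp
#include <stdlib.h>
#include <omp.h>

int main() {
  // Group threads by core type on a hybrid CPU; "core_eff"/"core_efficiency"
  // selects grouping by efficiency level instead (illustrative values).
  setenv("KMP_AFFINITY", "granularity=core_type,compact", /*overwrite=*/1);
  #pragma omp parallel
  {
    // The affinity granularity now follows the core attribute rather than a
    // fixed topology layer.
  }
  return 0;
}
```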
static int __kmp_parse_affinity_proc_id_list(const char *var, const char *env, @@ -2359,14 +2369,32 @@ static void __kmp_parse_affinity_env(char const *name, char const *value, buf = next; - // Try any hardware topology type for granularity - KMP_FOREACH_HW_TYPE(type) { - const char *name = __kmp_hw_get_keyword(type); - if (__kmp_match_str(name, buf, CCAST(const char **, &next))) { - set_gran(type, -1); - buf = next; - set = true; - break; + // Have to try core_type and core_efficiency matches first since "core" + // will register as core granularity with "extra chars" + if (__kmp_match_str("core_type", buf, CCAST(const char **, &next))) { + set_gran(KMP_HW_CORE, -1); + out_affinity->flags.core_types_gran = 1; + buf = next; + set = true; + } else if (__kmp_match_str("core_efficiency", buf, + CCAST(const char **, &next)) || + __kmp_match_str("core_eff", buf, + CCAST(const char **, &next))) { + set_gran(KMP_HW_CORE, -1); + out_affinity->flags.core_effs_gran = 1; + buf = next; + set = true; + } + if (!set) { + // Try any hardware topology type for granularity + KMP_FOREACH_HW_TYPE(type) { + const char *name = __kmp_hw_get_keyword(type); + if (__kmp_match_str(name, buf, CCAST(const char **, &next))) { + set_gran(type, -1); + buf = next; + set = true; + break; + } } } if (!set) { @@ -2626,8 +2654,15 @@ static void __kmp_print_affinity_env(kmp_str_buf_t *buffer, char const *name, __kmp_str_buf_print(buffer, "%s,", "noreset"); } } - __kmp_str_buf_print(buffer, "granularity=%s,", - __kmp_hw_get_keyword(affinity.gran, false)); + __kmp_str_buf_print(buffer, "granularity="); + if (affinity.flags.core_types_gran) + __kmp_str_buf_print(buffer, "core_type,"); + else if (affinity.flags.core_effs_gran) { + __kmp_str_buf_print(buffer, "core_eff,"); + } else { + __kmp_str_buf_print( + buffer, "%s,", __kmp_hw_get_keyword(affinity.gran, /*plural=*/false)); + } } if (!KMP_AFFINITY_CAPABLE()) { __kmp_str_buf_print(buffer, "%s", "disabled"); @@ -2745,11 +2780,7 @@ signed := + signed signed := - signed -----------------------------------------------------------------------------*/ -// Warning to issue for syntax error during parsing of OMP_PLACES -static inline void __kmp_omp_places_syntax_warn(const char *var) { - KMP_WARNING(SyntaxErrorUsing, var, "\"cores\""); -} - +// Return TRUE if successful parse, FALSE otherwise static int __kmp_parse_subplace_list(const char *var, const char **scan) { const char *next; @@ -2761,7 +2792,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { // SKIP_WS(*scan); if ((**scan < '0') || (**scan > '9')) { - __kmp_omp_places_syntax_warn(var); return FALSE; } next = *scan; @@ -2780,7 +2810,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { continue; } if (**scan != ':') { - __kmp_omp_places_syntax_warn(var); return FALSE; } (*scan)++; // skip ':' @@ -2788,7 +2817,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { // Read count parameter SKIP_WS(*scan); if ((**scan < '0') || (**scan > '9')) { - __kmp_omp_places_syntax_warn(var); return FALSE; } next = *scan; @@ -2807,7 +2835,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { continue; } if (**scan != ':') { - __kmp_omp_places_syntax_warn(var); return FALSE; } (*scan)++; // skip ':' @@ -2829,7 +2856,6 @@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { } SKIP_WS(*scan); if ((**scan < '0') || (**scan > '9')) { - __kmp_omp_places_syntax_warn(var); return FALSE; } next = *scan; @@ -2848,13 +2874,12 
@@ static int __kmp_parse_subplace_list(const char *var, const char **scan) { (*scan)++; // skip ',' continue; } - - __kmp_omp_places_syntax_warn(var); return FALSE; } return TRUE; } +// Return TRUE if successful parse, FALSE otherwise static int __kmp_parse_place(const char *var, const char **scan) { const char *next; @@ -2866,7 +2891,6 @@ static int __kmp_parse_place(const char *var, const char **scan) { return FALSE; } if (**scan != '}') { - __kmp_omp_places_syntax_warn(var); return FALSE; } (*scan)++; // skip '}' @@ -2880,12 +2904,12 @@ static int __kmp_parse_place(const char *var, const char **scan) { KMP_ASSERT(proc >= 0); *scan = next; } else { - __kmp_omp_places_syntax_warn(var); return FALSE; } return TRUE; } +// Return TRUE if successful parse, FALSE otherwise static int __kmp_parse_place_list(const char *var, const char *env, char **place_list) { const char *scan = env; @@ -2908,7 +2932,6 @@ static int __kmp_parse_place_list(const char *var, const char *env, continue; } if (*scan != ':') { - __kmp_omp_places_syntax_warn(var); return FALSE; } scan++; // skip ':' @@ -2916,7 +2939,6 @@ static int __kmp_parse_place_list(const char *var, const char *env, // Read count parameter SKIP_WS(scan); if ((*scan < '0') || (*scan > '9')) { - __kmp_omp_places_syntax_warn(var); return FALSE; } next = scan; @@ -2935,7 +2957,6 @@ static int __kmp_parse_place_list(const char *var, const char *env, continue; } if (*scan != ':') { - __kmp_omp_places_syntax_warn(var); return FALSE; } scan++; // skip ':' @@ -2957,7 +2978,6 @@ static int __kmp_parse_place_list(const char *var, const char *env, } SKIP_WS(scan); if ((*scan < '0') || (*scan > '9')) { - __kmp_omp_places_syntax_warn(var); return FALSE; } next = scan; @@ -2977,7 +2997,6 @@ static int __kmp_parse_place_list(const char *var, const char *env, continue; } - __kmp_omp_places_syntax_warn(var); return FALSE; } @@ -2991,6 +3010,22 @@ static int __kmp_parse_place_list(const char *var, const char *env, return TRUE; } +static inline void __kmp_places_set(enum affinity_type type, kmp_hw_t kind) { + __kmp_affinity.type = type; + __kmp_affinity.gran = kind; + __kmp_affinity.flags.dups = FALSE; + __kmp_affinity.flags.omp_places = TRUE; +} + +static void __kmp_places_syntax_error_fallback(char const *name, + kmp_hw_t kind) { + const char *str = __kmp_hw_get_catalog_string(kind, /*plural=*/true); + KMP_WARNING(SyntaxErrorUsing, name, str); + __kmp_places_set(affinity_compact, kind); + if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; +} + static void __kmp_stg_parse_places(char const *name, char const *value, void *data) { struct kmp_place_t { @@ -3001,7 +3036,6 @@ static void __kmp_stg_parse_places(char const *name, char const *value, bool set = false; const char *scan = value; const char *next = scan; - const char *kind = "\"threads\""; kmp_place_t std_places[] = {{"threads", KMP_HW_THREAD}, {"cores", KMP_HW_CORE}, {"numa_domains", KMP_HW_NUMA}, @@ -3020,10 +3054,54 @@ static void __kmp_stg_parse_places(char const *name, char const *value, const kmp_place_t &place = std_places[i]; if (__kmp_match_str(place.name, scan, &next)) { scan = next; - __kmp_affinity.type = affinity_compact; - __kmp_affinity.gran = place.type; - __kmp_affinity.flags.dups = FALSE; + __kmp_places_set(affinity_compact, place.type); set = true; + // Parse core attribute if it exists + if (KMP_HW_MAX_NUM_CORE_TYPES > 1) { + SKIP_WS(scan); + if (*scan == ':') { + if (place.type != KMP_HW_CORE) { + 
__kmp_places_syntax_error_fallback(name, place.type); + return; + } + scan++; // skip ':' + SKIP_WS(scan); +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 + if (__kmp_match_str("intel_core", scan, &next)) { + __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_CORE; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } else if (__kmp_match_str("intel_atom", scan, &next)) { + __kmp_affinity.core_attr_gran.core_type = KMP_HW_CORE_TYPE_ATOM; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } else +#endif + if (__kmp_match_str("eff", scan, &next)) { + int eff; + if (!isdigit(*next)) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + scan = next; + SKIP_DIGITS(next); + eff = __kmp_str_to_int(scan, *next); + if (eff < 0) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + if (eff >= KMP_HW_MAX_NUM_CORE_EFFS) + eff = KMP_HW_MAX_NUM_CORE_EFFS - 1; + __kmp_affinity.core_attr_gran.core_eff = eff; + __kmp_affinity.core_attr_gran.valid = 1; + scan = next; + } + if (!__kmp_affinity.core_attr_gran.valid) { + __kmp_places_syntax_error_fallback(name, place.type); + return; + } + } + } break; } } @@ -3035,36 +3113,56 @@ static void __kmp_stg_parse_places(char const *name, char const *value, continue; if (__kmp_match_str(name, scan, &next)) { scan = next; - __kmp_affinity.type = affinity_compact; - __kmp_affinity.gran = type; - __kmp_affinity.flags.dups = FALSE; + __kmp_places_set(affinity_compact, type); set = true; break; } } } + // Implementation choices for OMP_PLACES based on core attributes + if (!set) { + if (__kmp_match_str("core_types", scan, &next)) { + scan = next; + if (*scan != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, scan); + } + __kmp_places_set(affinity_compact, KMP_HW_CORE); + __kmp_affinity.flags.core_types_gran = 1; + set = true; + } else if (__kmp_match_str("core_effs", scan, &next) || + __kmp_match_str("core_efficiencies", scan, &next)) { + scan = next; + if (*scan != '\0') { + KMP_WARNING(ParseExtraCharsWarn, name, scan); + } + __kmp_places_set(affinity_compact, KMP_HW_CORE); + __kmp_affinity.flags.core_effs_gran = 1; + set = true; + } + } + // Explicit place list if (!set) { if (__kmp_affinity.proclist != NULL) { KMP_INTERNAL_FREE((void *)__kmp_affinity.proclist); __kmp_affinity.proclist = NULL; } if (__kmp_parse_place_list(name, value, &__kmp_affinity.proclist)) { - __kmp_affinity.type = affinity_explicit; - __kmp_affinity.gran = KMP_HW_THREAD; - __kmp_affinity.flags.dups = FALSE; + __kmp_places_set(affinity_explicit, KMP_HW_THREAD); } else { // Syntax error fallback - __kmp_affinity.type = affinity_compact; - __kmp_affinity.gran = KMP_HW_CORE; - __kmp_affinity.flags.dups = FALSE; + __kmp_places_syntax_error_fallback(name, KMP_HW_CORE); } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; } return; } + + kmp_hw_t gran = __kmp_affinity.gran; if (__kmp_affinity.gran != KMP_HW_UNKNOWN) { - kind = __kmp_hw_get_keyword(__kmp_affinity.gran); + gran = __kmp_affinity.gran; + } else { + gran = KMP_HW_CORE; } if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_default) { @@ -3078,7 +3176,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value, // Parse option count parameter in parentheses if (*scan != '(') { - KMP_WARNING(SyntaxErrorUsing, name, kind); + __kmp_places_syntax_error_fallback(name, gran); return; } scan++; // skip '(' @@ -3092,7 +3190,7 @@ static void __kmp_stg_parse_places(char const *name, char const *value, 
SKIP_WS(scan); if (*scan != ')') { - KMP_WARNING(SyntaxErrorUsing, name, kind); + __kmp_places_syntax_error_fallback(name, gran); return; } scan++; // skip ')' @@ -3135,12 +3233,37 @@ static void __kmp_stg_print_places(kmp_str_buf_t *buffer, char const *name, num = 0; } if (gran != KMP_HW_UNKNOWN) { + // If core_types or core_effs, just print and return + if (__kmp_affinity.flags.core_types_gran) { + __kmp_str_buf_print(buffer, "='%s'\n", "core_types"); + return; + } + if (__kmp_affinity.flags.core_effs_gran) { + __kmp_str_buf_print(buffer, "='%s'\n", "core_effs"); + return; + } + + // threads, cores, sockets, cores:<attribute>, etc. const char *name = __kmp_hw_get_keyword(gran, true); - if (num > 0) { - __kmp_str_buf_print(buffer, "='%s(%d)'\n", name, num); - } else { - __kmp_str_buf_print(buffer, "='%s'\n", name); + __kmp_str_buf_print(buffer, "='%s", name); + + // Add core attributes if it exists + if (__kmp_affinity.core_attr_gran.valid) { + kmp_hw_core_type_t ct = + (kmp_hw_core_type_t)__kmp_affinity.core_attr_gran.core_type; + int eff = __kmp_affinity.core_attr_gran.core_eff; + if (ct != KMP_HW_CORE_TYPE_UNKNOWN) { + const char *ct_name = __kmp_hw_get_core_type_keyword(ct); + __kmp_str_buf_print(buffer, ":%s", name, ct_name); + } else if (eff >= 0 && eff < KMP_HW_MAX_NUM_CORE_EFFS) { + __kmp_str_buf_print(buffer, ":eff%d", name, eff); + } } + + // Add the '(#)' part if it exists + if (num > 0) + __kmp_str_buf_print(buffer, "(%d)", num); + __kmp_str_buf_print(buffer, "'\n"); } else { __kmp_str_buf_print(buffer, ": %s\n", KMP_I18N_STR(NotDefined)); } @@ -5139,21 +5262,6 @@ err: return; } -static inline const char * -__kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { - switch (type) { - case KMP_HW_CORE_TYPE_UNKNOWN: - return "unknown"; -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - case KMP_HW_CORE_TYPE_ATOM: - return "intel_atom"; - case KMP_HW_CORE_TYPE_CORE: - return "intel_core"; -#endif - } - return "unknown"; -} - static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name, void *data) { kmp_str_buf_t buf; @@ -5984,7 +6092,13 @@ static void __kmp_aux_env_initialize(kmp_env_blk_t *block) { /* KMP_BLOCKTIME */ value = __kmp_env_blk_var(block, "KMP_BLOCKTIME"); if (value) { - kmpc_set_blocktime(__kmp_dflt_blocktime); + int gtid, tid; + kmp_info_t *thread; + + gtid = __kmp_entry_gtid(); + tid = __kmp_tid_from_gtid(gtid); + thread = __kmp_thread_from_gtid(gtid); + __kmp_aux_set_blocktime(__kmp_dflt_blocktime, thread, tid); } /* OMP_NESTED */ diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h index f63f105940ef..92bbcff52419 100644 --- a/openmp/runtime/src/kmp_settings.h +++ b/openmp/runtime/src/kmp_settings.h @@ -24,7 +24,6 @@ void __kmp_env_dump(); int __kmp_initial_threads_capacity(int req_nproc); void __kmp_init_dflt_team_nth(); -int __kmp_convert_to_milliseconds(char const *); int __kmp_default_tp_capacity(int, int, int); #if KMP_MIC diff --git a/openmp/runtime/src/kmp_str.cpp b/openmp/runtime/src/kmp_str.cpp index 4cba56964a09..6ee2df724487 100644 --- a/openmp/runtime/src/kmp_str.cpp +++ b/openmp/runtime/src/kmp_str.cpp @@ -619,6 +619,21 @@ char *__kmp_str_token( return token; } // __kmp_str_token +int __kmp_basic_str_to_int(char const *str) { + int result; + char const *t; + + result = 0; + + for (t = str; *t != '\0'; ++t) { + if (*t < '0' || *t > '9') + break; + result = (result * 10) + (*t - '0'); + } + + return result; +} + int __kmp_str_to_int(char const *str, char sentinel) { int result, factor; char const *t; diff --git 
a/openmp/runtime/src/kmp_str.h b/openmp/runtime/src/kmp_str.h index 855b5df55d69..11f633cd8024 100644 --- a/openmp/runtime/src/kmp_str.h +++ b/openmp/runtime/src/kmp_str.h @@ -112,6 +112,7 @@ int __kmp_str_match_true(char const *data); void __kmp_str_replace(char *str, char search_for, char replace_with); void __kmp_str_split(char *str, char delim, char **head, char **tail); char *__kmp_str_token(char *str, char const *delim, char **buf); +int __kmp_basic_str_to_int(char const *str); int __kmp_str_to_int(char const *str, char sentinel); void __kmp_str_to_size(char const *str, size_t *out, size_t dfactor, diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index 3b39f5039736..f7529481393f 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -284,6 +284,16 @@ static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source, #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } +kmp_base_depnode_t *__kmpc_task_get_depnode(kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + return td->td_depnode ? &(td->td_depnode->dn) : NULL; +} + +kmp_depnode_list_t *__kmpc_task_get_successors(kmp_task_t *task) { + kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); + return td->td_depnode->dn.successors; +} + static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, kmp_task_t *task, kmp_depnode_t *node, @@ -307,16 +317,18 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { + if (!dep->dn.successors || dep->dn.successors->node != node) { #if OMPX_TASKGRAPH - if (!(__kmp_tdg_is_recording(tdg_status)) && task) + if (!(__kmp_tdg_is_recording(tdg_status)) && task) #endif - __kmp_track_dependence(gtid, dep, node, task); - dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); - KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " - "%p\n", - gtid, KMP_TASK_TO_TASKDATA(dep->dn.task), - KMP_TASK_TO_TASKDATA(task))); - npredecessors++; + __kmp_track_dependence(gtid, dep, node, task); + dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + "%p\n", + gtid, KMP_TASK_TO_TASKDATA(dep->dn.task), + KMP_TASK_TO_TASKDATA(task))); + npredecessors++; + } } KMP_RELEASE_DEPNODE(gtid, dep); } @@ -324,6 +336,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, return npredecessors; } +// Add the edge 'sink' -> 'source' in the task dependency graph static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, kmp_task_t *task, @@ -346,29 +359,31 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { + if (!sink->dn.successors || sink->dn.successors->node != source) { #if OMPX_TASKGRAPH - if (!(__kmp_tdg_is_recording(tdg_status)) && task) + if (!(__kmp_tdg_is_recording(tdg_status)) && task) #endif - __kmp_track_dependence(gtid, sink, source, task); - sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); - KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " + __kmp_track_dependence(gtid, sink, source, task); + sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); + KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", gtid, 
KMP_TASK_TO_TASKDATA(sink->dn.task), KMP_TASK_TO_TASKDATA(task))); #if OMPX_TASKGRAPH - if (__kmp_tdg_is_recording(tdg_status)) { - kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task); - if (tdd->is_taskgraph) { - if (tdd->td_flags.onced) - // decrement npredecessors if sink->dn.task belongs to a taskgraph - // and - // 1) the task is reset to its initial state (by kmp_free_task) or - // 2) the task is complete but not yet reset - npredecessors--; + if (__kmp_tdg_is_recording(tdg_status)) { + kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task); + if (tdd->is_taskgraph) { + if (tdd->td_flags.onced) + // decrement npredecessors if sink->dn.task belongs to a taskgraph + // and + // 1) the task is reset to its initial state (by kmp_free_task) or + // 2) the task is complete but not yet reset + npredecessors--; + } } - } #endif npredecessors++; + } } KMP_RELEASE_DEPNODE(gtid, sink); } diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index fefa609927e8..6e8b948efa06 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -839,6 +839,14 @@ static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, // loc_ref: source location information; points to beginning of task block. // gtid: global thread number. // task: task thunk for the started task. +#ifdef __s390x__ +// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x. +// In order for it to work correctly, the caller also needs to be compiled with +// backchain. If a caller is compiled without backchain, +// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not +// crash. +__attribute__((target("backchain"))) +#endif void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) { #if OMPT_SUPPORT @@ -1554,7 +1562,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, task = KMP_TASKDATA_TO_TASK(taskdata); // Make sure task & taskdata are aligned appropriately -#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD +#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); #else @@ -1737,8 +1745,12 @@ __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, // gtid: global thread ID of caller // task: the task to invoke // current_task: the task to resume after task invocation -static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, - kmp_taskdata_t *current_task) { +#ifdef __s390x__ +__attribute__((target("backchain"))) +#endif +static void +__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, + kmp_taskdata_t *current_task) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); kmp_info_t *thread; int discard = 0 /* false */; @@ -2512,7 +2524,7 @@ void *__kmp_task_reduction_init(int gtid, int num, T *data) { KMP_ASSERT(tg != NULL); KMP_ASSERT(data != NULL); KMP_ASSERT(num > 0); - if (nth == 1) { + if (nth == 1 && !__kmp_enable_hidden_helper) { KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", gtid, tg)); return (void *)tg; @@ -2699,6 +2711,7 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { return p_priv[tid]; } } + KMP_ASSERT(tg->parent); tg = tg->parent; arr = (kmp_taskred_data_t *)(tg->reduce_data); num = tg->reduce_num_data; @@ -2711,7 +2724,10 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { // Called from 
__kmpc_end_taskgroup() static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { kmp_int32 nth = th->th.th_team_nproc; - KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 + KMP_DEBUG_ASSERT( + nth > 1 || + __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we + // are using hidden helper threads kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; kmp_int32 num = tg->reduce_num_data; for (int i = 0; i < num; ++i) { diff --git a/openmp/runtime/src/kmp_wrapper_getpid.h b/openmp/runtime/src/kmp_wrapper_getpid.h index 32ede3ed715b..f9d7f4804fbc 100644 --- a/openmp/runtime/src/kmp_wrapper_getpid.h +++ b/openmp/runtime/src/kmp_wrapper_getpid.h @@ -23,14 +23,14 @@ #if KMP_OS_DARWIN // OS X #define __kmp_gettid() pthread_mach_thread_np(pthread_self()) -#elif KMP_OS_FREEBSD +#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY #include <pthread_np.h> #define __kmp_gettid() pthread_getthreadid_np() #elif KMP_OS_NETBSD #include <lwp.h> #define __kmp_gettid() _lwp_self() #elif KMP_OS_OPENBSD -#define __kmp_gettid() syscall(SYS_getthrid) +#define __kmp_gettid() getthrid() #elif defined(SYS_gettid) // Hopefully other Unix systems define SYS_gettid syscall for getting os thread // id diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h index 5ac7f6d1e4e6..7736ba853163 100644 --- a/openmp/runtime/src/ompt-event-specific.h +++ b/openmp/runtime/src/ompt-event-specific.h @@ -55,13 +55,12 @@ #define ompt_callback_implicit_task_implemented ompt_event_MAY_ALWAYS -#define ompt_callback_target_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_emi_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_data_op_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_data_op_emi_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_submit_implemented ompt_event_UNIMPLEMENTED -#define ompt_callback_target_submit_emi_implemented ompt_event_UNIMPLEMENTED - +#define ompt_callback_target_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_data_op_emi_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_implemented ompt_event_MAY_ALWAYS +#define ompt_callback_target_submit_emi_implemented ompt_event_MAY_ALWAYS #define ompt_callback_control_tool_implemented ompt_event_MAY_ALWAYS #define ompt_callback_device_initialize_implemented ompt_event_MAY_ALWAYS diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp index 54edd6e6af7c..9743f35d2c4f 100644 --- a/openmp/runtime/src/ompt-specific.cpp +++ b/openmp/runtime/src/ompt-specific.cpp @@ -463,6 +463,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type, } int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) { + *size = 0; if (blocknum != 0) return 0; // support only a single block @@ -471,27 +472,13 @@ int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) { return 0; kmp_taskdata_t *taskdata = thr->th.th_current_task; - kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata); if (taskdata->td_flags.tasktype != TASK_EXPLICIT) return 0; // support only explicit task - void *ret_addr; - int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t); - - // kmp_task_t->data1 is an optional member - if (taskdata->td_flags.destructors_thunk) - ret_addr = &task->data1 + 1; - else 
- ret_addr = &task->part_id + 1; - - ret_size -= (char *)(ret_addr) - (char *)(task); - if (ret_size < 0) - return 0; - - *addr = ret_addr; - *size = (size_t)ret_size; - return 1; + *addr = taskdata; + *size = taskdata->td_size_alloc; + return 0; } //---------------------------------------------------------- diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h index a452b7643bdb..bd3fd9b43e57 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h @@ -162,6 +162,14 @@ #define ITT_ARCH_ARM64 6 #endif /* ITT_ARCH_ARM64 */ +#ifndef ITT_ARCH_VE +#define ITT_ARCH_VE 8 +#endif /* ITT_ARCH_VE */ + +#ifndef ITT_ARCH_S390X +#define ITT_ARCH_S390X 8 +#endif /* ITT_ARCH_S390X */ + #ifndef ITT_ARCH #if defined _M_IX86 || defined __i386__ #define ITT_ARCH ITT_ARCH_IA32 @@ -175,6 +183,10 @@ #define ITT_ARCH ITT_ARCH_ARM64 #elif defined __powerpc64__ #define ITT_ARCH ITT_ARCH_PPC64 +#elif defined __ve__ +#define ITT_ARCH ITT_ARCH_VE +#elif defined __s390x__ +#define ITT_ARCH ITT_ARCH_S390X #endif #endif diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index 27b063f09e7a..a72705528d41 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -2060,6 +2060,351 @@ __kmp_invoke_microtask: #endif /* KMP_ARCH_LOONGARCH64 */ +#if KMP_ARCH_VE + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// s0: pkfn +// s1: gtid +// s2: tid +// s3: argc +// s4: p_argv +// s5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. registers: +// +// s34: used to calculate the dynamic stack size +// s35: used as temporary for stack placement calculation +// s36: used as temporary for stack arguments +// s37: used as temporary for number of remaining pkfn parms +// s38: used to traverse p_argv array +// +// return: s0 (always 1/TRUE) +// + +__gtid = -4 +__tid = -8 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + // A function requires 8 bytes align. + .p2align 3 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save fp and lr. VE stores them at caller stack frame. + st %fp, 0(, %sp) + st %lr, 8(, %sp) + or %fp, 0, %sp + .cfi_def_cfa %fp, 0 + .cfi_offset %lr, 8 + .cfi_offset %fp, 0 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them + // by reference + // - We need 8 bytes for whole arguments. We have two + 'argc' + // arguments (condider >id and &tid). We need to reserve + // (argc + 2) * 8 bytes. + // - We need 176 bytes for RSA and others + // + // The total number of bytes is then (argc + 2) * 8 + 8 + 176. 
+ // + // |------------------------------| + // | return address of callee | 8(%fp) + // |------------------------------| + // | frame pointer of callee | 0(%fp) + // |------------------------------| <------------------ %fp + // | __tid / __gtid | -8(%fp) / -4(%fp) + // |------------------------------| + // | argc+2 for arguments | 176(%sp) + // |------------------------------| + // | RSA | + // |------------------------------| + // | return address | + // |------------------------------| + // | frame pointer | + // |------------------------------| <------------------ %sp + + adds.w.sx %s34, 2, %s3 + sll %s34, %s34, 3 + lea %s34, 184(, %s34) + subs.l %sp, %sp, %s34 + + // Align the stack to 16 bytes. + and %sp, -16, %sp + + // Save pkfn. + or %s12, 0, %s0 + + // Call host to allocate stack if it is necessary. + brge.l %sp, %sl, .L_kmp_pass + ld %s61, 24(, %tp) + lea %s63, 0x13b + shm.l %s63, 0(%s61) + shm.l %sl, 8(%s61) + shm.l %sp, 16(%s61) + monc + +.L_kmp_pass: + lea %s35, 176(, %sp) + adds.w.sx %s37, 0, %s3 + or %s38, 0, %s4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame. + st %fp, 0(%s5) +#endif + + // Prepare arguments for the pkfn function (first 8 using s0-s7 + // registers, but need to store stack also because of varargs). + + stl %s1, __gtid(%fp) + stl %s2, __tid(%fp) + + adds.l %s0, __gtid, %fp + st %s0, 0(, %s35) + adds.l %s1, __tid, %fp + st %s1, 8(, %s35) + + breq.l 0, %s37, .L_kmp_call + ld %s2, 0(, %s38) + st %s2, 16(, %s35) + + breq.l 1, %s37, .L_kmp_call + ld %s3, 8(, %s38) + st %s3, 24(, %s35) + + breq.l 2, %s37, .L_kmp_call + ld %s4, 16(, %s38) + st %s4, 32(, %s35) + + breq.l 3, %s37, .L_kmp_call + ld %s5, 24(, %s38) + st %s5, 40(, %s35) + + breq.l 4, %s37, .L_kmp_call + ld %s6, 32(, %s38) + st %s6, 48(, %s35) + + breq.l 5, %s37, .L_kmp_call + ld %s7, 40(, %s38) + st %s7, 56(, %s35) + + breq.l 6, %s37, .L_kmp_call + + // Prepare any additional argument passed through the stack. + adds.l %s37, -6, %s37 + lea %s38, 48(, %s38) + lea %s35, 64(, %s35) +.L_kmp_loop: + ld %s36, 0(, %s38) + st %s36, 0(, %s35) + adds.l %s37, -1, %s37 + adds.l %s38, 8, %s38 + adds.l %s35, 8, %s35 + brne.l 0, %s37, .L_kmp_loop + +.L_kmp_call: + // Call pkfn function. + bsic %lr, (, %s12) + + // Return value. + lea %s0, 1 + + // Restore stack and return. + or %sp, 0, %fp + ld %lr, 8(, %sp) + ld %fp, 0(, %sp) + b.l.t (, %lr) +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_VE */ + +#if KMP_ARCH_S390X + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// r2: pkfn +// r3: gtid +// r4: tid +// r5: argc +// r6: p_argv +// SP+160: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. 
registers: +// +// r0: used to fetch argv slots +// r7: used as temporary for number of remaining pkfn parms +// r8: argv +// r9: pkfn +// r10: stack size +// r11: previous fp +// r12: stack parameter area +// r13: argv slot +// +// return: r2 (always 1/TRUE) +// + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + .p2align 1 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + stmg %r6,%r14,48(%r15) + .cfi_offset %r6, -112 + .cfi_offset %r7, -104 + .cfi_offset %r8, -96 + .cfi_offset %r9, -88 + .cfi_offset %r10, -80 + .cfi_offset %r11, -72 + .cfi_offset %r12, -64 + .cfi_offset %r13, -56 + .cfi_offset %r14, -48 + .cfi_offset %r15, -40 + lgr %r11,%r15 + .cfi_def_cfa %r11, 160 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by + // reference + // - We need 8 bytes for each argument that cannot be passed to the 'pkfn' + // function by register. Given that we have 5 of such registers (r[2-6]) + // and two + 'argc' arguments (consider >id and &tid), we need to + // reserve max(0, argc - 3)*8 extra bytes + // + // The total number of bytes is then max(0, argc - 3)*8 + 8 + + lgr %r10,%r5 + aghi %r10,-2 + jnm 0f + lghi %r10,0 +0: + sllg %r10,%r10,3 + lgr %r12,%r10 + aghi %r10,176 + sgr %r15,%r10 + agr %r12,%r15 + stg %r11,0(%r15) + + lgr %r9,%r2 // pkfn + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame + lg %r8,160(%r11) + stg %r11,0(%r8) +#endif + + // Prepare arguments for the pkfn function (first 5 using r2-r6 registers) + + stg %r3,160(%r12) + la %r2,164(%r12) // gid + stg %r4,168(%r12) + la %r3,172(%r12) // tid + lgr %r8,%r6 // argv + + // If argc > 0 + ltgr %r7,%r5 + jz 1f + + lg %r4,0(%r8) // argv[0] + aghi %r7,-1 + jz 1f + + // If argc > 1 + lg %r5,8(%r8) // argv[1] + aghi %r7,-1 + jz 1f + + // If argc > 2 + lg %r6,16(%r8) // argv[2] + aghi %r7,-1 + jz 1f + + lghi %r13,0 // Index [n] +2: + lg %r0,24(%r13,%r8) // argv[2+n] + stg %r0,160(%r13,%r15) // parm[2+n] + aghi %r13,8 // Next + aghi %r7,-1 + jnz 2b + +1: + basr %r14,%r9 // Call pkfn + + // Restore stack and return + + lgr %r15,%r11 + lmg %r6,%r14,48(%r15) + lghi %r2,1 + br %r14 +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_S390X */ + #if KMP_ARCH_ARM || KMP_ARCH_MIPS .data COMMON .gomp_critical_user_, 32, 3 @@ -2073,7 +2418,9 @@ __kmp_unnamed_critical_addr: #endif #endif /* KMP_ARCH_ARM */ -#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 +#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \ + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || \ + KMP_ARCH_S390X #ifndef KMP_PREFIX_UNDERSCORE # define KMP_PREFIX_UNDERSCORE(x) x #endif @@ -2088,7 +2435,8 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr): .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8 #endif #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || - KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 */ + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || + KMP_ARCH_S390X */ #if KMP_OS_LINUX # if KMP_ARCH_ARM || KMP_ARCH_AARCH64 diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 260b982af200..72da0f79865d 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -60,6 +60,8 @@ #elif KMP_OS_NETBSD || KMP_OS_OPENBSD #include <sys/types.h> 
#include <sys/sysctl.h> +#elif KMP_OS_SOLARIS +#include <sys/loadavg.h> #endif #include <ctype.h> @@ -70,6 +72,15 @@ struct kmp_sys_timer { struct timespec start; }; +#if KMP_OS_SOLARIS +// Convert timeval to timespec. +#define TIMEVAL_TO_TIMESPEC(tv, ts) \ + do { \ + (ts)->tv_sec = (tv)->tv_sec; \ + (ts)->tv_nsec = (tv)->tv_usec * 1000; \ + } while (0) +#endif + // Convert timespec to nanoseconds. #define TS2NS(timespec) \ (((timespec).tv_sec * (long int)1e9) + (timespec).tv_nsec) @@ -93,6 +104,7 @@ static kmp_cond_align_t __kmp_wait_cv; static kmp_mutex_align_t __kmp_wait_mx; kmp_uint64 __kmp_ticks_per_msec = 1000000; +kmp_uint64 __kmp_ticks_per_usec = 1000; #ifdef DEBUG_SUSPEND static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { @@ -408,7 +420,7 @@ void __kmp_terminate_thread(int gtid) { static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) { int stack_data; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_HURD + KMP_OS_HURD || KMP_OS_SOLARIS pthread_attr_t attr; int status; size_t size = 0; @@ -447,7 +459,7 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) { return TRUE; } #endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD \ - || KMP_OS_HURD */ + || KMP_OS_HURD || KMP_OS_SOLARIS */ /* Use incremental refinement starting from initial conservative estimate */ TCW_PTR(th->th.th_info.ds.ds_stacksize, 0); TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data); @@ -462,7 +474,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ void *exit_val; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD || KMP_OS_HURD + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS void *volatile padding = 0; #endif int gtid; @@ -485,7 +497,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* USE_ITT_BUILD */ #if KMP_AFFINITY_SUPPORTED - __kmp_affinity_set_init_mask(gtid, FALSE); + __kmp_affinity_bind_init_mask(gtid); #endif #ifdef KMP_CANCEL_THREADS @@ -511,7 +523,7 @@ static void *__kmp_launch_worker(void *thr) { #endif /* KMP_BLOCK_SIGNALS */ #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ - KMP_OS_OPENBSD + KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS if (__kmp_stkoffset > 0 && gtid > 0) { padding = KMP_ALLOCA(gtid * __kmp_stkoffset); (void)padding; @@ -1242,6 +1254,7 @@ static void __kmp_atfork_child(void) { *affinity = KMP_AFFINITY_INIT(affinity->env_var); __kmp_affin_fullMask = nullptr; __kmp_affin_origMask = nullptr; + __kmp_topology = nullptr; #endif // KMP_AFFINITY_SUPPORTED #if KMP_USE_MONITOR @@ -1811,7 +1824,7 @@ static int __kmp_get_xproc(void) { __kmp_type_convert(sysconf(_SC_NPROCESSORS_CONF), &(r)); #elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_OPENBSD || \ - KMP_OS_HURD + KMP_OS_HURD || KMP_OS_SOLARIS __kmp_type_convert(sysconf(_SC_NPROCESSORS_ONLN), &(r)); @@ -1892,6 +1905,13 @@ void __kmp_runtime_initialize(void) { /* Query the maximum number of threads */ __kmp_type_convert(sysconf(_SC_THREAD_THREADS_MAX), &(__kmp_sys_max_nth)); +#ifdef __ve__ + if (__kmp_sys_max_nth == -1) { + // VE's pthread supports only up to 64 threads per a VE process. + // So we use that KMP_MAX_NTH (predefined as 64) here. 
+ __kmp_sys_max_nth = KMP_MAX_NTH; + } +#else if (__kmp_sys_max_nth == -1) { /* Unlimited threads for NPTL */ __kmp_sys_max_nth = INT_MAX; @@ -1899,6 +1919,7 @@ void __kmp_runtime_initialize(void) { /* Can't tell, just use PTHREAD_THREADS_MAX */ __kmp_sys_max_nth = KMP_MAX_NTH; } +#endif /* Query the minimum stack size */ __kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN); @@ -2001,7 +2022,7 @@ kmp_uint64 __kmp_now_nsec() { /* Measure clock ticks per millisecond */ void __kmp_initialize_system_tick() { kmp_uint64 now, nsec2, diff; - kmp_uint64 delay = 100000; // 50~100 usec on most machines. + kmp_uint64 delay = 1000000; // ~450 usec on most machines. kmp_uint64 nsec = __kmp_now_nsec(); kmp_uint64 goal = __kmp_hardware_timestamp() + delay; while ((now = __kmp_hardware_timestamp()) < goal) @@ -2009,9 +2030,11 @@ void __kmp_initialize_system_tick() { nsec2 = __kmp_now_nsec(); diff = nsec2 - nsec; if (diff > 0) { - kmp_uint64 tpms = ((kmp_uint64)1e6 * (delay + (now - goal)) / diff); - if (tpms > 0) - __kmp_ticks_per_msec = tpms; + double tpus = 1000.0 * (double)(delay + (now - goal)) / (double)diff; + if (tpus > 0.0) { + __kmp_ticks_per_msec = (kmp_uint64)(tpus * 1000.0); + __kmp_ticks_per_usec = (kmp_uint64)tpus; + } } } #endif @@ -2177,9 +2200,9 @@ int __kmp_is_address_mapped(void *addr) { } kiv.kve_start += 1; } -#elif KMP_OS_DRAGONFLY +#elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS - // FIXME(DragonFly): Implement this + // FIXME(DragonFly, Solaris): Implement this found = 1; #else @@ -2194,7 +2217,8 @@ int __kmp_is_address_mapped(void *addr) { #ifdef USE_LOAD_BALANCE -#if KMP_OS_DARWIN || KMP_OS_NETBSD +#if KMP_OS_DARWIN || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ + KMP_OS_OPENBSD || KMP_OS_SOLARIS // The function returns the rounded value of the system load average // during given time interval which depends on the value of @@ -2452,7 +2476,7 @@ finish: // Clean up and exit. #if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ - KMP_ARCH_ARM) + KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X) // we really only need the case with 1 argument, because CLANG always build // a struct of pointers to shared variables referenced in the outlined function @@ -2740,4 +2764,28 @@ void __kmp_hidden_helper_threads_deinitz_release() { } #endif // KMP_OS_LINUX +bool __kmp_detect_shm() { + DIR *dir = opendir("/dev/shm"); + if (dir) { // /dev/shm exists + closedir(dir); + return true; + } else if (ENOENT == errno) { // /dev/shm does not exist + return false; + } else { // opendir() failed + return false; + } +} + +bool __kmp_detect_tmp() { + DIR *dir = opendir("/tmp"); + if (dir) { // /tmp exists + closedir(dir); + return true; + } else if (ENOENT == errno) { // /tmp does not exist + return false; + } else { // opendir() failed + return false; + } +} + // end of file // diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index eb18efcac61a..9e264ab45b87 100644 --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -1006,7 +1006,7 @@ extern "C" void *__stdcall __kmp_launch_worker(void *arg) { __kmp_itt_thread_name(gtid); #endif /* USE_ITT_BUILD */ - __kmp_affinity_set_init_mask(gtid, FALSE); + __kmp_affinity_bind_init_mask(gtid); #if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Set FP control regs to be a copy of the parallel initialization thread's. |
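For reference, the OMP_PLACES additions earlier in this diff accept core-attribute spellings such as "cores:intel_core", "cores:intel_atom", "cores:eff0", plus the list kinds "core_types" and "core_effs", warning and falling back on non-hybrid machines. The resulting place list can be inspected with standard OpenMP 4.5 routines; the small query program below is illustrative only (the environment values named come from the parser above, the array bound and output format are arbitrary), and it should be built with an OpenMP-enabled compiler, e.g. clang -fopenmp.

    /* Illustrative only: print the places the runtime built, e.g. after
     * exporting OMP_PLACES="cores:intel_atom", OMP_PLACES="cores:eff0",
     * OMP_PLACES="core_types" or OMP_PLACES="core_effs". */
    #include <omp.h>
    #include <stdio.h>

    int main(void) {
      int nplaces = omp_get_num_places();
      printf("num places: %d\n", nplaces);
      for (int p = 0; p < nplaces; ++p) {
        int n = omp_get_place_num_procs(p);
        printf("place %d has %d procs:", p, n);
        if (n > 0 && n <= 256) {          /* only read ids that fit the buffer */
          int ids[256];
          omp_get_place_proc_ids(p, ids);
          for (int i = 0; i < n; ++i)
            printf(" %d", ids[i]);
        }
        printf("\n");
      }
      return 0;
    }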