author     Dimitry Andric <dim@FreeBSD.org>    2024-07-26 22:04:10 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2024-07-26 22:04:10 +0000
commit     ac9a064cb179f3425b310fa2847f8764ac970a4d (patch)
tree       6f945cdaa68c2b4c688dcf9fec4f922d35f4d1a4 /openmp
parent     4df029cc74e5ec124f14a5682e44999ce4f086df (diff)
Vendor import of llvm-project main llvmorg-19-init-18630-gf2ccf80136a0,
the last commit before the upstream release/19.x branch was created.
Diffstat (limited to 'openmp')
-rw-r--r-- openmp/runtime/src/dllexports | 6
-rw-r--r-- openmp/runtime/src/include/omp-tools.h.var | 18
-rw-r--r-- openmp/runtime/src/include/omp.h.var | 2
-rw-r--r-- openmp/runtime/src/include/omp_lib.F90.var (renamed from openmp/runtime/src/include/omp_lib.f90.var) | 468
-rw-r--r-- openmp/runtime/src/include/omp_lib.h.var | 22
-rw-r--r-- openmp/runtime/src/include/ompx.h.var | 85
-rw-r--r-- openmp/runtime/src/kmp.h | 110
-rw-r--r-- openmp/runtime/src/kmp_affinity.cpp | 316
-rw-r--r-- openmp/runtime/src/kmp_affinity.h | 130
-rw-r--r-- openmp/runtime/src/kmp_barrier.cpp | 62
-rw-r--r-- openmp/runtime/src/kmp_collapse.cpp | 306
-rw-r--r-- openmp/runtime/src/kmp_collapse.h | 11
-rw-r--r-- openmp/runtime/src/kmp_csupport.cpp | 69
-rw-r--r-- openmp/runtime/src/kmp_dispatch.cpp | 16
-rw-r--r-- openmp/runtime/src/kmp_gsupport.cpp | 4
-rw-r--r-- openmp/runtime/src/kmp_lock.cpp | 6
-rw-r--r-- openmp/runtime/src/kmp_lock.h | 18
-rw-r--r-- openmp/runtime/src/kmp_os.h | 10
-rw-r--r-- openmp/runtime/src/kmp_platform.h | 9
-rw-r--r-- openmp/runtime/src/kmp_runtime.cpp | 373
-rw-r--r-- openmp/runtime/src/kmp_sched.cpp | 27
-rw-r--r-- openmp/runtime/src/kmp_settings.cpp | 17
-rw-r--r-- openmp/runtime/src/kmp_taskdeps.cpp | 8
-rw-r--r-- openmp/runtime/src/kmp_tasking.cpp | 238
-rw-r--r-- openmp/runtime/src/kmp_threadprivate.cpp | 12
-rw-r--r-- openmp/runtime/src/kmp_utility.cpp | 68
-rw-r--r-- openmp/runtime/src/kmp_wait_release.h | 16
-rw-r--r-- openmp/runtime/src/ompt-general.cpp | 26
-rw-r--r-- openmp/runtime/src/ompt-internal.h | 4
-rw-r--r-- openmp/runtime/src/ompt-specific.cpp | 19
-rw-r--r-- openmp/runtime/src/ompt-specific.h | 19
-rw-r--r-- openmp/runtime/src/z_AIX_asm.S | 410
-rw-r--r-- openmp/runtime/src/z_Linux_asm.S | 27
-rw-r--r-- openmp/runtime/src/z_Linux_util.cpp | 429
34 files changed, 2551 insertions, 810 deletions
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 0d49643709e0..0667d53c35a1 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -1268,8 +1268,14 @@ kmp_set_disp_num_buffers 890
__kmpc_atomic_val_8_cas_cpt 2158
%endif
+ # No longer need to put ordinal numbers
+ __kmpc_push_num_threads_list
+ __kmpc_push_num_threads_strict
+ __kmpc_push_num_threads_list_strict
+
%endif
__kmpc_set_thread_limit
+__kmpc_dispatch_deinit
# end of file #
diff --git a/openmp/runtime/src/include/omp-tools.h.var b/openmp/runtime/src/include/omp-tools.h.var
index a3ec0309db18..471f46a9073e 100644
--- a/openmp/runtime/src/include/omp-tools.h.var
+++ b/openmp/runtime/src/include/omp-tools.h.var
@@ -78,6 +78,8 @@
/* implicit barrier at the end of worksharing */ \
macro (ompt_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \
macro (ompt_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \
+ macro (ompt_state_wait_barrier_implementation, 0x015) /* implementation barrier */ \
+ macro (ompt_state_wait_barrier_teams, 0x016) /* teams barrier */ \
\
/* task wait states (32..63) */ \
macro (ompt_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \
@@ -211,6 +213,10 @@ typedef enum kmp_mutex_impl_t {
* definitions generated from spec
*****************************************************************************/
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
typedef enum ompt_callbacks_t {
ompt_callback_thread_begin = 1,
ompt_callback_thread_end = 2,
@@ -1404,6 +1410,14 @@ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
ompt_buffer_cursor_t current
);
+#ifdef _WIN32
+__declspec(dllexport)
+#else
+__attribute__((visibility("default")))
+#endif
+ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
+ const char *runtime_version);
+
#define ompt_id_none 0
#define ompt_data_none {0}
#define ompt_time_none 0
@@ -1414,4 +1428,8 @@ typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
#define ompd_segment_none 0
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
#endif /* __OMPT__ */
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index a1488ae9d21c..eb3ab7778606 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -156,6 +156,8 @@
/* OpenMP 5.1 interop */
typedef intptr_t omp_intptr_t;
+ extern void __KAI_KMPC_CONVENTION ompx_dump_mapping_tables(void);
+
/* 0..omp_get_num_interop_properties()-1 are reserved for implementation-defined properties */
typedef enum omp_interop_property {
omp_ipr_fr_id = -1,
diff --git a/openmp/runtime/src/include/omp_lib.f90.var b/openmp/runtime/src/include/omp_lib.F90.var
index 1ca542db3767..63a3c93b8d92 100644
--- a/openmp/runtime/src/include/omp_lib.f90.var
+++ b/openmp/runtime/src/include/omp_lib.F90.var
@@ -14,37 +14,42 @@
use, intrinsic :: iso_c_binding
- integer, parameter :: omp_integer_kind = c_int
- integer, parameter :: omp_logical_kind = 4
- integer, parameter :: omp_real_kind = c_float
- integer, parameter :: kmp_double_kind = c_double
- integer, parameter :: omp_lock_kind = c_intptr_t
- integer, parameter :: omp_nest_lock_kind = c_intptr_t
- integer, parameter :: omp_sched_kind = omp_integer_kind
- integer, parameter :: omp_proc_bind_kind = omp_integer_kind
- integer, parameter :: kmp_pointer_kind = c_intptr_t
- integer, parameter :: kmp_size_t_kind = c_size_t
- integer, parameter :: kmp_affinity_mask_kind = c_intptr_t
- integer, parameter :: kmp_cancel_kind = omp_integer_kind
- integer, parameter :: omp_sync_hint_kind = omp_integer_kind
- integer, parameter :: omp_lock_hint_kind = omp_sync_hint_kind
- integer, parameter :: omp_control_tool_kind = omp_integer_kind
- integer, parameter :: omp_control_tool_result_kind = omp_integer_kind
- integer, parameter :: omp_allocator_handle_kind = c_intptr_t
- integer, parameter :: omp_memspace_handle_kind = c_intptr_t
- integer, parameter :: omp_alloctrait_key_kind = omp_integer_kind
- integer, parameter :: omp_alloctrait_val_kind = c_intptr_t
- integer, parameter :: omp_interop_kind = c_intptr_t
- integer, parameter :: omp_interop_fr_kind = omp_integer_kind
+ ! Set PRIVATE by default to explicitly only export what is meant
+ ! to be exported by this MODULE.
+ private
+
+ integer, parameter, public :: omp_integer_kind = c_int
+ integer, parameter, public :: omp_logical_kind = 4
+ integer, parameter, public :: omp_real_kind = c_float
+ integer, parameter, public :: kmp_double_kind = c_double
+ integer, parameter, public :: omp_lock_kind = c_intptr_t
+ integer, parameter, public :: omp_nest_lock_kind = c_intptr_t
+ integer, parameter, public :: omp_sched_kind = omp_integer_kind
+ integer, parameter, public :: omp_proc_bind_kind = omp_integer_kind
+ integer, parameter, public :: kmp_pointer_kind = c_intptr_t
+ integer, parameter, public :: kmp_size_t_kind = c_size_t
+ integer, parameter, public :: kmp_affinity_mask_kind = c_intptr_t
+ integer, parameter, public :: kmp_cancel_kind = omp_integer_kind
+ integer, parameter, public :: omp_sync_hint_kind = omp_integer_kind
+ integer, parameter, public :: omp_lock_hint_kind = omp_sync_hint_kind
+ integer, parameter, public :: omp_control_tool_kind = omp_integer_kind
+ integer, parameter, public :: omp_control_tool_result_kind = omp_integer_kind
+ integer, parameter, public :: omp_allocator_handle_kind = c_intptr_t
+ integer, parameter, public :: omp_memspace_handle_kind = c_intptr_t
+ integer, parameter, public :: omp_alloctrait_key_kind = omp_integer_kind
+ integer, parameter, public :: omp_alloctrait_val_kind = c_intptr_t
+ integer, parameter, public :: omp_interop_kind = c_intptr_t
+ integer, parameter, public :: omp_interop_fr_kind = omp_integer_kind
type omp_alloctrait
integer(kind=omp_alloctrait_key_kind) key
integer(kind=omp_alloctrait_val_kind) value
end type omp_alloctrait
+ public :: omp_alloctrait
- integer, parameter :: omp_pause_resource_kind = omp_integer_kind
- integer, parameter :: omp_depend_kind = c_intptr_t
- integer, parameter :: omp_event_handle_kind = c_intptr_t
+ integer, parameter, public :: omp_pause_resource_kind = omp_integer_kind
+ integer, parameter, public :: omp_depend_kind = c_intptr_t
+ integer, parameter, public :: omp_event_handle_kind = c_intptr_t
end module omp_lib_kinds
@@ -52,119 +57,151 @@
use omp_lib_kinds
- integer (kind=omp_integer_kind), parameter :: openmp_version = @LIBOMP_OMP_YEAR_MONTH@
- integer (kind=omp_integer_kind), parameter :: kmp_version_major = @LIBOMP_VERSION_MAJOR@
- integer (kind=omp_integer_kind), parameter :: kmp_version_minor = @LIBOMP_VERSION_MINOR@
- integer (kind=omp_integer_kind), parameter :: kmp_version_build = @LIBOMP_VERSION_BUILD@
+ ! Set PRIVATE by default to explicitly only export what is meant
+ ! to be exported by this MODULE.
+ private
+
+ ! Re-export definitions in omp_lib_kinds
+ public :: omp_integer_kind
+ public :: omp_logical_kind
+ public :: omp_real_kind
+ public :: kmp_double_kind
+ public :: omp_lock_kind
+ public :: omp_nest_lock_kind
+ public :: omp_sched_kind
+ public :: omp_proc_bind_kind
+ public :: kmp_pointer_kind
+ public :: kmp_size_t_kind
+ public :: kmp_affinity_mask_kind
+ public :: kmp_cancel_kind
+ public :: omp_sync_hint_kind
+ public :: omp_lock_hint_kind
+ public :: omp_control_tool_kind
+ public :: omp_control_tool_result_kind
+ public :: omp_allocator_handle_kind
+ public :: omp_memspace_handle_kind
+ public :: omp_alloctrait_key_kind
+ public :: omp_alloctrait_val_kind
+ public :: omp_interop_kind
+ public :: omp_interop_fr_kind
+ public :: omp_alloctrait
+ public :: omp_pause_resource_kind
+ public :: omp_depend_kind
+ public :: omp_event_handle_kind
+
+ integer (kind=omp_integer_kind), parameter, public :: openmp_version = @LIBOMP_OMP_YEAR_MONTH@
+ integer (kind=omp_integer_kind), parameter, public :: kmp_version_major = @LIBOMP_VERSION_MAJOR@
+ integer (kind=omp_integer_kind), parameter, public :: kmp_version_minor = @LIBOMP_VERSION_MINOR@
+ integer (kind=omp_integer_kind), parameter, public :: kmp_version_build = @LIBOMP_VERSION_BUILD@
character(*) kmp_build_date
parameter( kmp_build_date = '@LIBOMP_BUILD_DATE@' )
- integer(kind=omp_sched_kind), parameter :: omp_sched_static = 1
- integer(kind=omp_sched_kind), parameter :: omp_sched_dynamic = 2
- integer(kind=omp_sched_kind), parameter :: omp_sched_guided = 3
- integer(kind=omp_sched_kind), parameter :: omp_sched_auto = 4
- integer(kind=omp_sched_kind), parameter :: omp_sched_monotonic = int(Z'80000000', kind=omp_sched_kind)
-
- integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
- integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
- integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
- integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
- integer (kind=omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
-
- integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_parallel = 1
- integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_loop = 2
- integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_sections = 3
- integer (kind=kmp_cancel_kind), parameter :: kmp_cancel_taskgroup = 4
-
- integer (kind=omp_sync_hint_kind), parameter :: omp_sync_hint_none = 0
- integer (kind=omp_sync_hint_kind), parameter :: omp_sync_hint_uncontended = 1
- integer (kind=omp_sync_hint_kind), parameter :: omp_sync_hint_contended = 2
- integer (kind=omp_sync_hint_kind), parameter :: omp_sync_hint_nonspeculative = 4
- integer (kind=omp_sync_hint_kind), parameter :: omp_sync_hint_speculative = 8
- integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_none = omp_sync_hint_none
- integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_uncontended = omp_sync_hint_uncontended
- integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_contended = omp_sync_hint_contended
- integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative
- integer (kind=omp_lock_hint_kind), parameter :: omp_lock_hint_speculative = omp_sync_hint_speculative
- integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_hle = 65536
- integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_rtm = 131072
- integer (kind=omp_lock_hint_kind), parameter :: kmp_lock_hint_adaptive = 262144
-
- integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_start = 1
- integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_pause = 2
- integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_flush = 3
- integer (kind=omp_control_tool_kind), parameter :: omp_control_tool_end = 4
-
- integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_notool = -2
- integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_nocallback = -1
- integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_success = 0
- integer (kind=omp_control_tool_result_kind), parameter :: omp_control_tool_ignored = 1
-
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_sync_hint = 1
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_alignment = 2
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_access = 3
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pool_size = 4
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fallback = 5
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_fb_data = 6
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_pinned = 7
- integer (kind=omp_alloctrait_key_kind), parameter :: omp_atk_partition = 8
-
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default = -1
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_false = 0
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_true = 1
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_contended = 3
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_uncontended = 4
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_serialized = 5
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_sequential = omp_atv_serialized
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_private = 6
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_all = 7
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_thread = 8
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_pteam = 9
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_cgroup = 10
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_default_mem_fb = 11
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_null_fb = 12
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_abort_fb = 13
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_allocator_fb = 14
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_environment = 15
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_nearest = 16
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_blocked = 17
- integer (kind=omp_alloctrait_val_kind), parameter :: omp_atv_interleaved = 18
-
- integer (kind=omp_allocator_handle_kind), parameter :: omp_null_allocator = 0
- integer (kind=omp_allocator_handle_kind), parameter :: omp_default_mem_alloc = 1
- integer (kind=omp_allocator_handle_kind), parameter :: omp_large_cap_mem_alloc = 2
- integer (kind=omp_allocator_handle_kind), parameter :: omp_const_mem_alloc = 3
- integer (kind=omp_allocator_handle_kind), parameter :: omp_high_bw_mem_alloc = 4
- integer (kind=omp_allocator_handle_kind), parameter :: omp_low_lat_mem_alloc = 5
- integer (kind=omp_allocator_handle_kind), parameter :: omp_cgroup_mem_alloc = 6
- integer (kind=omp_allocator_handle_kind), parameter :: omp_pteam_mem_alloc = 7
- integer (kind=omp_allocator_handle_kind), parameter :: omp_thread_mem_alloc = 8
- integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_host_mem_alloc = 100
- integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_shared_mem_alloc = 101
- integer (kind=omp_allocator_handle_kind), parameter :: llvm_omp_target_device_mem_alloc = 102
-
- integer (kind=omp_memspace_handle_kind), parameter :: omp_default_mem_space = 0
- integer (kind=omp_memspace_handle_kind), parameter :: omp_large_cap_mem_space = 1
- integer (kind=omp_memspace_handle_kind), parameter :: omp_const_mem_space = 2
- integer (kind=omp_memspace_handle_kind), parameter :: omp_high_bw_mem_space = 3
- integer (kind=omp_memspace_handle_kind), parameter :: omp_low_lat_mem_space = 4
- integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_host_mem_space = 100
- integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_shared_mem_space = 101
- integer (kind=omp_memspace_handle_kind), parameter :: llvm_omp_target_device_mem_space = 102
-
- integer (kind=omp_pause_resource_kind), parameter :: omp_pause_resume = 0
- integer (kind=omp_pause_resource_kind), parameter :: omp_pause_soft = 1
- integer (kind=omp_pause_resource_kind), parameter :: omp_pause_hard = 2
-
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_cuda = 1
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_cuda_driver = 2
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_opencl = 3
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_sycl = 4
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_hip = 5
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_level_zero = 6
- integer (kind=omp_interop_fr_kind), parameter :: omp_ifr_last = 7
-
- integer (kind=omp_interop_kind), parameter :: omp_interop_none = 0
+ integer(kind=omp_sched_kind), parameter, public :: omp_sched_static = 1
+ integer(kind=omp_sched_kind), parameter, public :: omp_sched_dynamic = 2
+ integer(kind=omp_sched_kind), parameter, public :: omp_sched_guided = 3
+ integer(kind=omp_sched_kind), parameter, public :: omp_sched_auto = 4
+ integer(kind=omp_sched_kind), parameter, public :: omp_sched_monotonic = int(Z'80000000', kind=omp_sched_kind)
+
+ integer (kind=omp_proc_bind_kind), parameter, public :: omp_proc_bind_false = 0
+ integer (kind=omp_proc_bind_kind), parameter, public :: omp_proc_bind_true = 1
+ integer (kind=omp_proc_bind_kind), parameter, public :: omp_proc_bind_master = 2
+ integer (kind=omp_proc_bind_kind), parameter, public :: omp_proc_bind_close = 3
+ integer (kind=omp_proc_bind_kind), parameter, public :: omp_proc_bind_spread = 4
+
+ integer (kind=kmp_cancel_kind), parameter, public :: kmp_cancel_parallel = 1
+ integer (kind=kmp_cancel_kind), parameter, public :: kmp_cancel_loop = 2
+ integer (kind=kmp_cancel_kind), parameter, public :: kmp_cancel_sections = 3
+ integer (kind=kmp_cancel_kind), parameter, public :: kmp_cancel_taskgroup = 4
+
+ integer (kind=omp_sync_hint_kind), parameter, public :: omp_sync_hint_none = 0
+ integer (kind=omp_sync_hint_kind), parameter, public :: omp_sync_hint_uncontended = 1
+ integer (kind=omp_sync_hint_kind), parameter, public :: omp_sync_hint_contended = 2
+ integer (kind=omp_sync_hint_kind), parameter, public :: omp_sync_hint_nonspeculative = 4
+ integer (kind=omp_sync_hint_kind), parameter, public :: omp_sync_hint_speculative = 8
+ integer (kind=omp_lock_hint_kind), parameter, public :: omp_lock_hint_none = omp_sync_hint_none
+ integer (kind=omp_lock_hint_kind), parameter, public :: omp_lock_hint_uncontended = omp_sync_hint_uncontended
+ integer (kind=omp_lock_hint_kind), parameter, public :: omp_lock_hint_contended = omp_sync_hint_contended
+ integer (kind=omp_lock_hint_kind), parameter, public :: omp_lock_hint_nonspeculative = omp_sync_hint_nonspeculative
+ integer (kind=omp_lock_hint_kind), parameter, public :: omp_lock_hint_speculative = omp_sync_hint_speculative
+ integer (kind=omp_lock_hint_kind), parameter, public :: kmp_lock_hint_hle = 65536
+ integer (kind=omp_lock_hint_kind), parameter, public :: kmp_lock_hint_rtm = 131072
+ integer (kind=omp_lock_hint_kind), parameter, public :: kmp_lock_hint_adaptive = 262144
+
+ integer (kind=omp_control_tool_kind), parameter, public :: omp_control_tool_start = 1
+ integer (kind=omp_control_tool_kind), parameter, public :: omp_control_tool_pause = 2
+ integer (kind=omp_control_tool_kind), parameter, public :: omp_control_tool_flush = 3
+ integer (kind=omp_control_tool_kind), parameter, public :: omp_control_tool_end = 4
+
+ integer (kind=omp_control_tool_result_kind), parameter, public :: omp_control_tool_notool = -2
+ integer (kind=omp_control_tool_result_kind), parameter, public :: omp_control_tool_nocallback = -1
+ integer (kind=omp_control_tool_result_kind), parameter, public :: omp_control_tool_success = 0
+ integer (kind=omp_control_tool_result_kind), parameter, public :: omp_control_tool_ignored = 1
+
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_sync_hint = 1
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_alignment = 2
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_access = 3
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_pool_size = 4
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_fallback = 5
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_fb_data = 6
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_pinned = 7
+ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_partition = 8
+
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_default = -1
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_false = 0
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_true = 1
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_contended = 3
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_uncontended = 4
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_serialized = 5
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_sequential = omp_atv_serialized
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_private = 6
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_all = 7
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_thread = 8
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_pteam = 9
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_cgroup = 10
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_default_mem_fb = 11
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_null_fb = 12
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_abort_fb = 13
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_allocator_fb = 14
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_environment = 15
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_nearest = 16
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_blocked = 17
+ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_interleaved = 18
+
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_null_allocator = 0
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_default_mem_alloc = 1
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_large_cap_mem_alloc = 2
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_const_mem_alloc = 3
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_high_bw_mem_alloc = 4
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_low_lat_mem_alloc = 5
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_cgroup_mem_alloc = 6
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_pteam_mem_alloc = 7
+ integer (kind=omp_allocator_handle_kind), parameter, public :: omp_thread_mem_alloc = 8
+ integer (kind=omp_allocator_handle_kind), parameter, public :: llvm_omp_target_host_mem_alloc = 100
+ integer (kind=omp_allocator_handle_kind), parameter, public :: llvm_omp_target_shared_mem_alloc = 101
+ integer (kind=omp_allocator_handle_kind), parameter, public :: llvm_omp_target_device_mem_alloc = 102
+
+ integer (kind=omp_memspace_handle_kind), parameter, public :: omp_default_mem_space = 0
+ integer (kind=omp_memspace_handle_kind), parameter, public :: omp_large_cap_mem_space = 1
+ integer (kind=omp_memspace_handle_kind), parameter, public :: omp_const_mem_space = 2
+ integer (kind=omp_memspace_handle_kind), parameter, public :: omp_high_bw_mem_space = 3
+ integer (kind=omp_memspace_handle_kind), parameter, public :: omp_low_lat_mem_space = 4
+ integer (kind=omp_memspace_handle_kind), parameter, public :: llvm_omp_target_host_mem_space = 100
+ integer (kind=omp_memspace_handle_kind), parameter, public :: llvm_omp_target_shared_mem_space = 101
+ integer (kind=omp_memspace_handle_kind), parameter, public :: llvm_omp_target_device_mem_space = 102
+
+ integer (kind=omp_pause_resource_kind), parameter, public :: omp_pause_resume = 0
+ integer (kind=omp_pause_resource_kind), parameter, public :: omp_pause_soft = 1
+ integer (kind=omp_pause_resource_kind), parameter, public :: omp_pause_hard = 2
+
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_cuda = 1
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_cuda_driver = 2
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_opencl = 3
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_sycl = 4
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_hip = 5
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_level_zero = 6
+ integer (kind=omp_interop_fr_kind), parameter, public :: omp_ifr_last = 7
+
+ integer (kind=omp_interop_kind), parameter, public :: omp_interop_none = 0
interface
@@ -392,82 +429,102 @@
end subroutine omp_fulfill_event
subroutine omp_init_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_init_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_lock_kind) svar
end subroutine omp_init_lock
subroutine omp_destroy_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_destroy_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_lock_kind) svar
end subroutine omp_destroy_lock
subroutine omp_set_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_set_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_lock_kind) svar
end subroutine omp_set_lock
subroutine omp_unset_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_unset_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_lock_kind) svar
end subroutine omp_unset_lock
function omp_test_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_test_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
logical (kind=omp_logical_kind) omp_test_lock
integer (kind=omp_lock_kind) svar
end function omp_test_lock
subroutine omp_init_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_init_nest_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_init_nest_lock
subroutine omp_destroy_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_destroy_nest_lock
subroutine omp_set_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_set_nest_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_set_nest_lock
subroutine omp_unset_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_unset_nest_lock
function omp_test_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_test_nest_lock
!DIR$ ENDIF
+#endif
use omp_lib_kinds
integer (kind=omp_integer_kind) omp_test_nest_lock
integer (kind=omp_nest_lock_kind) nvar
@@ -568,20 +625,20 @@
end subroutine omp_display_env
function omp_target_alloc(size, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
type(c_ptr) omp_target_alloc
integer(c_size_t), value :: size
integer(c_int), value :: device_num
end function omp_target_alloc
subroutine omp_target_free(device_ptr, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_int
type(c_ptr), value :: device_ptr
integer(c_int), value :: device_num
end subroutine omp_target_free
function omp_target_is_present(ptr, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_int
integer(c_int) omp_target_is_present
type(c_ptr), value :: ptr
integer(c_int), value :: device_num
@@ -589,7 +646,7 @@
function omp_target_memcpy(dst, src, length, dst_offset, src_offset, &
dst_device_num, src_device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
integer(c_int) omp_target_memcpy
type(c_ptr), value :: dst, src
integer(c_size_t), value :: length, dst_offset, src_offset
@@ -599,7 +656,7 @@
function omp_target_memcpy_rect(dst, src, element_size, num_dims, &
volume, dst_offsets, src_offsets, dst_dimensions, &
src_dimensions, dst_device_num, src_device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
integer(c_int) omp_target_memcpy_rect
type(c_ptr), value :: dst, src
integer(c_size_t), value :: element_size
@@ -612,6 +669,7 @@
src_offset, dst_device_num, src_device_num, depobj_count, &
depobj_list) bind(c)
use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
integer(c_int) omp_target_memcpy_async
type(c_ptr), value :: dst, src
integer(c_size_t), value :: length, dst_offset, src_offset
@@ -625,6 +683,7 @@
src_dimensions, dst_device_num, src_device_num, depobj_count, &
depobj_list) bind(c)
use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
integer(c_int) omp_target_memcpy_rect_async
type(c_ptr), value :: dst, src
integer(c_size_t), value :: element_size
@@ -646,8 +705,8 @@
function omp_target_memset_async(ptr, val, count, device_num, &
depobj_count, depobj_list) bind(c)
- use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
type(c_ptr) :: omp_target_memset_async
type(c_ptr), value :: ptr
integer(c_int), value :: val
@@ -659,7 +718,7 @@
function omp_target_associate_ptr(host_ptr, device_ptr, size, &
device_offset, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int
integer(c_int) omp_target_associate_ptr
type(c_ptr), value :: host_ptr, device_ptr
integer(c_size_t), value :: size, device_offset
@@ -667,21 +726,20 @@
end function omp_target_associate_ptr
function omp_get_mapped_ptr(ptr, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_int
type(c_ptr) omp_get_mapped_ptr
type(c_ptr), value :: ptr
integer(c_int), value :: device_num
end function omp_get_mapped_ptr
function omp_target_disassociate_ptr(ptr, device_num) bind(c)
- use omp_lib_kinds
+ use, intrinsic :: iso_c_binding, only: c_ptr, c_int
integer(c_int) omp_target_disassociate_ptr
type(c_ptr), value :: ptr
integer(c_int), value :: device_num
end function omp_target_disassociate_ptr
function omp_target_is_accessible(ptr, size, device_num) bind(c)
- use omp_lib_kinds
use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
integer(c_int) omp_target_is_accessible
type(c_ptr), value :: ptr
@@ -900,4 +958,124 @@
end interface
+ ! make the above routine definitions public
+ public :: omp_set_num_threads
+ public :: omp_set_dynamic
+ public :: omp_set_nested
+ public :: omp_get_num_threads
+ public :: omp_get_max_threads
+ public :: omp_get_thread_num
+ public :: omp_get_num_procs
+ public :: omp_in_parallel
+ public :: omp_in_final
+ public :: omp_get_dynamic
+ public :: omp_get_nested
+ public :: omp_get_thread_limit
+ public :: omp_set_max_active_levels
+ public :: omp_get_max_active_levels
+ public :: omp_get_level
+ public :: omp_get_active_level
+ public :: omp_get_ancestor_thread_num
+ public :: omp_get_team_size
+ public :: omp_set_schedule
+ public :: omp_get_schedule
+ public :: omp_get_proc_bind
+ public :: omp_get_num_places
+ public :: omp_get_place_num_procs
+ public :: omp_get_place_proc_ids
+ public :: omp_get_place_num
+ public :: omp_get_partition_num_places
+ public :: omp_get_partition_place_nums
+ public :: omp_get_wtime
+ public :: omp_get_wtick
+ public :: omp_get_default_device
+ public :: omp_set_default_device
+ public :: omp_get_num_devices
+ public :: omp_get_num_teams
+ public :: omp_get_team_num
+ public :: omp_get_cancellation
+ public :: omp_is_initial_device
+ public :: omp_get_initial_device
+ public :: omp_get_device_num
+ public :: omp_pause_resource
+ public :: omp_pause_resource_all
+ public :: omp_get_supported_active_levels
+ public :: omp_fulfill_event
+ public :: omp_init_lock
+ public :: omp_destroy_lock
+ public :: omp_set_lock
+ public :: omp_unset_lock
+ public :: omp_test_lock
+ public :: omp_init_nest_lock
+ public :: omp_destroy_nest_lock
+ public :: omp_set_nest_lock
+ public :: omp_unset_nest_lock
+ public :: omp_test_nest_lock
+ public :: omp_get_max_task_priority
+ public :: omp_init_lock_with_hint
+ public :: omp_init_nest_lock_with_hint
+ public :: omp_control_tool
+ public :: omp_init_allocator
+ public :: omp_destroy_allocator
+ public :: omp_set_default_allocator
+ public :: omp_get_default_allocator
+ public :: omp_set_affinity_format
+ public :: omp_get_affinity_format
+ public :: omp_display_affinity
+ public :: omp_capture_affinity
+ public :: omp_set_num_teams
+ public :: omp_get_max_teams
+ public :: omp_set_teams_thread_limit
+ public :: omp_get_teams_thread_limit
+ public :: omp_display_env
+ public :: omp_target_alloc
+ public :: omp_target_free
+ public :: omp_target_is_present
+ public :: omp_target_memcpy
+ public :: omp_target_memcpy_rect
+ public :: omp_target_memcpy_async
+ public :: omp_target_memcpy_rect_async
+ public :: omp_target_memset
+ public :: omp_target_memset_async
+ public :: omp_target_associate_ptr
+ public :: omp_get_mapped_ptr
+ public :: omp_target_disassociate_ptr
+ public :: omp_target_is_accessible
+ public :: omp_alloc
+ public :: omp_aligned_alloc
+ public :: omp_calloc
+ public :: omp_aligned_calloc
+ public :: omp_realloc
+ public :: omp_free
+ public :: omp_in_explicit_task
+ public :: kmp_set_stacksize
+ public :: kmp_set_stacksize_s
+ public :: kmp_set_blocktime
+ public :: kmp_set_library_serial
+ public :: kmp_set_library_turnaround
+ public :: kmp_set_library_throughput
+ public :: kmp_set_library
+ public :: kmp_set_defaults
+ public :: kmp_get_stacksize
+ public :: kmp_get_stacksize_s
+ public :: kmp_get_blocktime
+ public :: kmp_get_library
+ public :: kmp_set_disp_num_buffers
+ public :: kmp_set_affinity
+ public :: kmp_get_affinity
+ public :: kmp_get_affinity_max_proc
+ public :: kmp_create_affinity_mask
+ public :: kmp_destroy_affinity_mask
+ public :: kmp_set_affinity_mask_proc
+ public :: kmp_unset_affinity_mask_proc
+ public :: kmp_get_affinity_mask_proc
+ public :: kmp_malloc
+ public :: kmp_aligned_malloc
+ public :: kmp_calloc
+ public :: kmp_realloc
+ public :: kmp_free
+ public :: kmp_set_warnings_on
+ public :: kmp_set_warnings_off
+ public :: kmp_get_cancellation_status
+
end module omp_lib
diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var
index 617f2321676d..a709a2f298f8 100644
--- a/openmp/runtime/src/include/omp_lib.h.var
+++ b/openmp/runtime/src/include/omp_lib.h.var
@@ -486,82 +486,102 @@
end subroutine omp_fulfill_event
subroutine omp_init_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_init_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_lock_kind) svar
end subroutine omp_init_lock
subroutine omp_destroy_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_destroy_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_lock_kind) svar
end subroutine omp_destroy_lock
subroutine omp_set_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_set_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_lock_kind) svar
end subroutine omp_set_lock
subroutine omp_unset_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_unset_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_lock_kind) svar
end subroutine omp_unset_lock
function omp_test_lock(svar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_test_lock
!DIR$ ENDIF
+#endif
import
logical (kind=omp_logical_kind) omp_test_lock
integer (kind=omp_lock_kind) svar
end function omp_test_lock
subroutine omp_init_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_init_nest_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_init_nest_lock
subroutine omp_destroy_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_destroy_nest_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_destroy_nest_lock
subroutine omp_set_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_set_nest_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_set_nest_lock
subroutine omp_unset_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_unset_nest_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_nest_lock_kind) nvar
end subroutine omp_unset_nest_lock
function omp_test_nest_lock(nvar) bind(c)
+#ifdef __INTEL_COMPILER
!DIR$ IF(__INTEL_COMPILER.GE.1400)
!DIR$ attributes known_intrinsic :: omp_test_nest_lock
!DIR$ ENDIF
+#endif
import
integer (kind=omp_integer_kind) omp_test_nest_lock
integer (kind=omp_nest_lock_kind) nvar
@@ -990,6 +1010,7 @@
end subroutine kmp_set_warnings_off
end interface
+#ifdef __INTEL_COMPILER
!DIR$ IF DEFINED (__INTEL_OFFLOAD)
!DIR$ IF(__INTEL_COMPILER.LT.1900)
@@ -1158,3 +1179,4 @@
!$omp declare target(omp_init_nest_lock_with_hint )
!DIR$ ENDIF
!DIR$ ENDIF
+#endif
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 5dd8e8355e4c..623f0b9c315b 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,14 @@
#ifndef __OMPX_H
#define __OMPX_H
+#ifdef __AMDGCN_WAVEFRONT_SIZE
+#define __WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
+#else
+#define __WARP_SIZE 32
+#endif
+
+typedef unsigned long uint64_t;
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -50,9 +58,12 @@ enum {
ompx_dim_z = 2,
};
+// TODO: The following implementation is for host fallback. We need to disable
+// generation of host fallback in kernel language mode.
+#pragma omp begin declare variant match(device = {kind(cpu)})
+
/// ompx_{thread,block}_{id,dim}
///{
-#pragma omp begin declare variant match(device = {kind(cpu)})
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \
static inline int ompx_##NAME(int Dim) { return VALUE; }
@@ -70,14 +81,34 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1)
static inline RETTY ompx_##NAME(ARGS) { BODY; }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering,
- _Pragma("omp barrier"));
+ _Pragma("omp barrier"))
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_acq_rel, void,
- ompx_sync_block(ompx_acq_rel));
+ ompx_sync_block(ompx_acq_rel))
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
- ompx_sync_block(Ordering));
+ ompx_sync_block(Ordering))
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
///}
+static inline uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+ __builtin_trap();
+}
+
+/// ompx_shfl_down_sync_{i,f,l,d}
+///{
+#define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL(TYPE, TY) \
+ static inline TYPE ompx_shfl_down_sync_##TY(uint64_t mask, TYPE var, \
+ unsigned delta, int width) { \
+ __builtin_trap(); \
+ }
+
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC_HOST_IMPL
+///}
+
#pragma omp end declare variant
/// ompx_{sync_block}_{,divergent}
@@ -85,9 +116,9 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
RETTY ompx_##NAME(ARGS);
-_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering);
-_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_acq_rel, void);
-_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering);
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering)
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_acq_rel, void)
+_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering)
#undef _TGT_KERNEL_LANGUAGE_DECL_SYNC_C
///}
@@ -106,6 +137,22 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
///}
+uint64_t ompx_ballot_sync(uint64_t mask, int pred);
+
+/// ompx_shfl_down_sync_{i,f,l,d}
+///{
+#define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY) \
+ TYPE ompx_shfl_down_sync_##TY(uint64_t mask, TYPE var, unsigned delta, \
+ int width);
+
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC
+///}
+
#ifdef __cplusplus
}
#endif
@@ -151,12 +198,32 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim)
}
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block, int Ordering = acc_rel,
- Ordering);
+ Ordering)
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
- int Ordering = acc_rel, Ordering);
+ int Ordering = acc_rel, Ordering)
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
///}
+static inline uint64_t ballot_sync(uint64_t mask, int pred) {
+ return ompx_ballot_sync(mask, pred);
+}
+
+/// shfl_down_sync
+///{
+#define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY) \
+ static inline TYPE shfl_down_sync(uint64_t mask, TYPE var, unsigned delta, \
+ int width = __WARP_SIZE) { \
+ return ompx_shfl_down_sync_##TY(mask, var, delta, width); \
+ }
+
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(int, i)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(float, f)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(long, l)
+_TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(double, d)
+
+#undef _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC
+///}
+
} // namespace ompx
#endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index c287a31e0b1b..916c1dc25700 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -103,7 +103,8 @@ class kmp_stats_list;
#define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED
#endif
-#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED
+// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc
+#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC)
#include "hwloc.h"
#ifndef HWLOC_OBJ_NUMANODE
#define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
@@ -531,6 +532,15 @@ enum clock_function_type {
enum mic_type { non_mic, mic1, mic2, mic3, dummy };
#endif
+// OpenMP 3.1 - Nested num threads array
+typedef struct kmp_nested_nthreads_t {
+ int *nth;
+ int size;
+ int used;
+} kmp_nested_nthreads_t;
+
+extern kmp_nested_nthreads_t __kmp_nested_nth;
+
/* -- fast reduction stuff ------------------------------------------------ */
#undef KMP_FAST_REDUCTION_BARRIER
@@ -689,7 +699,7 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *,
extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity;
#endif /* KMP_OS_WINDOWS */
-#if KMP_USE_HWLOC
+#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC)
extern hwloc_topology_t __kmp_hwloc_topology;
extern int __kmp_hwloc_error;
#endif
@@ -818,13 +828,14 @@ private:
typedef KMPAffinity::Mask kmp_affin_mask_t;
extern KMPAffinity *__kmp_affinity_dispatch;
+#ifndef KMP_OS_AIX
class kmp_affinity_raii_t {
kmp_affin_mask_t *mask;
bool restored;
public:
kmp_affinity_raii_t(const kmp_affin_mask_t *new_mask = nullptr)
- : restored(false) {
+ : mask(nullptr), restored(false) {
if (KMP_AFFINITY_CAPABLE()) {
KMP_CPU_ALLOC(mask);
KMP_ASSERT(mask != NULL);
@@ -834,7 +845,7 @@ public:
}
}
void restore() {
- if (!restored && KMP_AFFINITY_CAPABLE()) {
+ if (mask && KMP_AFFINITY_CAPABLE() && !restored) {
__kmp_set_system_affinity(mask, /*abort_on_error=*/true);
KMP_CPU_FREE(mask);
}
@@ -842,6 +853,7 @@ public:
}
~kmp_affinity_raii_t() { restore(); }
};
+#endif // !KMP_OS_AIX
// Declare local char buffers with this size for printing debug and info
// messages, using __kmp_affinity_print_mask().
@@ -1181,7 +1193,11 @@ extern void __kmp_init_target_task();
#define KMP_MIN_STKSIZE ((size_t)(32 * 1024))
#endif
+#if KMP_OS_AIX && KMP_ARCH_PPC
+#define KMP_MAX_STKSIZE 0x10000000 /* 256Mb max size on 32-bit AIX */
+#else
#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))
+#endif
#if KMP_ARCH_X86
#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024))
@@ -1387,8 +1403,6 @@ typedef struct kmp_cpuinfo {
int stepping; // CPUID(1).EAX[3:0] ( Stepping )
kmp_cpuinfo_flags_t flags;
int apic_id;
- int physical_id;
- int logical_id;
kmp_uint64 frequency; // Nominal CPU frequency in Hz.
char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004)
} kmp_cpuinfo_t;
@@ -2494,14 +2508,15 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t;
#define KMP_DEP_MTX 0x4
#define KMP_DEP_SET 0x8
#define KMP_DEP_ALL 0x80
-// Compiler sends us this info:
+// Compiler sends us this info. Note: some test cases contain an explicit copy
+// of this struct and should be in sync with any changes here.
typedef struct kmp_depend_info {
kmp_intptr_t base_addr;
size_t len;
union {
kmp_uint8 flag; // flag as an unsigned char
struct { // flag as a set of 8 bits
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Same fields as in the #else branch, but in reverse order */
unsigned all : 1;
unsigned unused : 3;
@@ -2666,14 +2681,15 @@ typedef struct kmp_task_stack {
#endif // BUILD_TIED_TASK_STACK
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Same fields as in the #else branch, but in reverse order */
#if OMPX_TASKGRAPH
- unsigned reserved31 : 6;
+ unsigned reserved31 : 5;
unsigned onced : 1;
#else
- unsigned reserved31 : 7;
+ unsigned reserved31 : 6;
#endif
+ unsigned target : 1;
unsigned native : 1;
unsigned freed : 1;
unsigned complete : 1;
@@ -2722,11 +2738,12 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
unsigned complete : 1; /* 1==complete, 0==not complete */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+ unsigned target : 1;
#if OMPX_TASKGRAPH
unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
- unsigned reserved31 : 6; /* reserved for library use */
+ unsigned reserved31 : 5; /* reserved for library use */
#else
- unsigned reserved31 : 7; /* reserved for library use */
+ unsigned reserved31 : 6; /* reserved for library use */
#endif
#endif
} kmp_tasking_flags_t;
@@ -2865,6 +2882,11 @@ union KMP_ALIGN_CACHE kmp_task_team {
char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
};
+typedef struct kmp_task_team_list_t {
+ kmp_task_team_t *task_team;
+ kmp_task_team_list_t *next;
+} kmp_task_team_list_t;
+
#if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
// Free lists keep same-size free memory slots for fast memory allocation
// routines
@@ -2952,6 +2974,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* The data set by the primary thread at reinit, then R/W by the worker */
KMP_ALIGN_CACHE int
th_set_nproc; /* if > 0, then only use this request for the next fork */
+ int *th_set_nested_nth;
+ bool th_nt_strict; // num_threads clause has strict modifier
+ ident_t *th_nt_loc; // loc for strict modifier
+ int th_nt_sev; // error severity for strict modifier
+ const char *th_nt_msg; // error message for strict modifier
+ int th_set_nested_nth_sz;
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
#endif
@@ -3002,10 +3030,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
kmp_task_team_t *th_task_team; // Task team struct
kmp_taskdata_t *th_current_task; // Innermost Task being executed
kmp_uint8 th_task_state; // alternating 0/1 for task team identification
- kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
- // at nested levels
- kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
- kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
kmp_uint32 th_reap_state; // Non-zero indicates thread is not
// tasking, thus safe to reap
@@ -3127,6 +3151,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
kmp_disp_t *t_dispatch; // thread's dispatch data
kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
kmp_proc_bind_t t_proc_bind; // bind type for par region
+ int t_primary_task_state; // primary thread's task state saved
#if USE_ITT_BUILD
kmp_uint64 t_region_time; // region begin timestamp
#endif /* USE_ITT_BUILD */
@@ -3196,8 +3221,15 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
void *t_stack_id; // team specific stack stitching id (for ittnotify)
#endif /* USE_ITT_BUILD */
distributedBarrier *b; // Distributed barrier data associated with team
+ kmp_nested_nthreads_t *t_nested_nth;
} kmp_base_team_t;
+// Assert that the list structure fits and aligns within
+// the double task team pointer
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) ==
+ alignof(kmp_task_team_list_t));
+
union KMP_ALIGN_CACHE kmp_team {
kmp_base_team_t t;
double t_align; /* use worst case alignment */
@@ -3526,15 +3558,6 @@ extern enum mic_type __kmp_mic_type;
extern double __kmp_load_balance_interval; // load balance algorithm interval
#endif /* USE_LOAD_BALANCE */
-// OpenMP 3.1 - Nested num threads array
-typedef struct kmp_nested_nthreads_t {
- int *nth;
- int size;
- int used;
-} kmp_nested_nthreads_t;
-
-extern kmp_nested_nthreads_t __kmp_nested_nth;
-
#if KMP_USE_ADAPTIVE_LOCKS
// Parameters for the speculative lock backoff system.
@@ -3769,6 +3792,11 @@ extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+extern void __kmp_push_num_threads_list(ident_t *loc, int gtid,
+ kmp_uint32 list_length,
+ int *num_threads_list);
+extern void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+ const char *msg);
extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
kmp_proc_bind_t proc_bind);
@@ -3812,6 +3840,8 @@ extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid);
extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid);
extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid);
+extern void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid);
+
#ifdef KMP_GOMP_COMPAT
extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
@@ -3906,7 +3936,8 @@ extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
extern int __kmp_get_first_osid_with_ecore(void);
#endif
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
static inline void __kmp_assign_root_init_mask() {
@@ -4107,9 +4138,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event);
extern void __kmp_free_task_team(kmp_info_t *thread,
kmp_task_team_t *task_team);
extern void __kmp_reap_task_teams(void);
+extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team);
+extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team);
extern void __kmp_wait_to_unref_task_teams(void);
-extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
- int always);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
#if USE_ITT_BUILD
@@ -4120,6 +4152,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
int wait = 1);
extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
int gtid);
+#if KMP_DEBUG
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) \
+ KMP_DEBUG_ASSERT( \
+ __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 || \
+ thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state])
+#else
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */
+#endif
extern int __kmp_is_address_mapped(void *addr);
extern kmp_uint64 __kmp_hardware_timestamp(void);
@@ -4397,6 +4437,18 @@ KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_push_num_threads_strict(ident_t *loc,
+ kmp_int32 global_tid,
+ kmp_int32 num_threads,
+ int severity,
+ const char *message);
+
+KMP_EXPORT void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list);
+KMP_EXPORT void __kmpc_push_num_threads_list_strict(
+ ident_t *loc, kmp_int32 global_tid, kmp_uint32 list_length,
+ kmp_int32 *num_threads_list, int severity, const char *message);
KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
int proc_bind);
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 6a41d34b0237..f34e55555545 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -327,6 +327,9 @@ void kmp_topology_t::_insert_windows_proc_groups() {
KMP_CPU_FREE(mask);
_insert_layer(KMP_HW_PROC_GROUP, ids);
__kmp_free(ids);
+
+ // sort topology after adding proc groups
+ __kmp_topology->sort_ids();
}
#endif
@@ -984,41 +987,6 @@ void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
_discover_uniformity();
}
-// Represents running sub IDs for a single core attribute where
-// attribute values have SIZE possibilities.
-template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
- int last_level; // last level in topology to consider for sub_ids
- int sub_id[SIZE]; // The sub ID for a given attribute value
- int prev_sub_id[KMP_HW_LAST];
- IndexFunc indexer;
-
-public:
- kmp_sub_ids_t(int last_level) : last_level(last_level) {
- KMP_ASSERT(last_level < KMP_HW_LAST);
- for (size_t i = 0; i < SIZE; ++i)
- sub_id[i] = -1;
- for (size_t i = 0; i < KMP_HW_LAST; ++i)
- prev_sub_id[i] = -1;
- }
- void update(const kmp_hw_thread_t &hw_thread) {
- int idx = indexer(hw_thread);
- KMP_ASSERT(idx < (int)SIZE);
- for (int level = 0; level <= last_level; ++level) {
- if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
- if (level < last_level)
- sub_id[idx] = -1;
- sub_id[idx]++;
- break;
- }
- }
- for (int level = 0; level <= last_level; ++level)
- prev_sub_id[level] = hw_thread.sub_ids[level];
- }
- int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
- return sub_id[indexer(hw_thread)];
- }
-};
-
#if KMP_AFFINITY_SUPPORTED
static kmp_str_buf_t *
__kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
@@ -1081,9 +1049,12 @@ bool kmp_topology_t::filter_hw_subset() {
// First, sort the KMP_HW_SUBSET items by the machine topology
__kmp_hw_subset->sort();
+ __kmp_hw_subset->canonicalize(__kmp_topology);
+
// Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
bool using_core_types = false;
bool using_core_effs = false;
+ bool is_absolute = __kmp_hw_subset->is_absolute();
int hw_subset_depth = __kmp_hw_subset->get_depth();
kmp_hw_t specified[KMP_HW_LAST];
int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
@@ -1121,12 +1092,14 @@ bool kmp_topology_t::filter_hw_subset() {
// Check to see if each layer's num & offset parameters are valid
max_count = get_ratio(level);
- if (max_count < 0 ||
- (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
- bool plural = (num > 1);
- KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
- __kmp_hw_get_catalog_string(type, plural));
- return false;
+ if (!is_absolute) {
+ if (max_count < 0 ||
+ (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
+ bool plural = (num > 1);
+ KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
+ __kmp_hw_get_catalog_string(type, plural));
+ return false;
+ }
}
// Check to see if core attributes are consistent
@@ -1189,7 +1162,7 @@ bool kmp_topology_t::filter_hw_subset() {
}
// Check that the number of requested cores with attributes is valid
- if (using_core_types || using_core_effs) {
+ if ((using_core_types || using_core_effs) && !is_absolute) {
for (int j = 0; j < item.num_attrs; ++j) {
int num = item.num[j];
int offset = item.offset[j];
@@ -1245,46 +1218,92 @@ bool kmp_topology_t::filter_hw_subset() {
}
}
- struct core_type_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- switch (t.attrs.get_core_type()) {
- case KMP_HW_CORE_TYPE_UNKNOWN:
- case KMP_HW_MAX_NUM_CORE_TYPES:
- return 0;
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- case KMP_HW_CORE_TYPE_ATOM:
- return 1;
- case KMP_HW_CORE_TYPE_CORE:
- return 2;
-#endif
- }
- KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
- KMP_BUILTIN_UNREACHABLE;
+ // For keeping track of sub_ids for an absolute KMP_HW_SUBSET
+ // or core attributes (core type or efficiency)
+ int prev_sub_ids[KMP_HW_LAST];
+ int abs_sub_ids[KMP_HW_LAST];
+ int core_eff_sub_ids[KMP_HW_MAX_NUM_CORE_EFFS];
+ int core_type_sub_ids[KMP_HW_MAX_NUM_CORE_TYPES];
+ for (size_t i = 0; i < KMP_HW_LAST; ++i) {
+ abs_sub_ids[i] = -1;
+ prev_sub_ids[i] = -1;
+ }
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_EFFS; ++i)
+ core_eff_sub_ids[i] = -1;
+ for (size_t i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
+ core_type_sub_ids[i] = -1;
+
+ // Determine which hardware threads should be filtered.
+
+ // Helpful to determine if a topology layer is targeted by an absolute subset
+ auto is_targeted = [&](int level) {
+ if (is_absolute) {
+ for (int i = 0; i < hw_subset_depth; ++i)
+ if (topology_levels[i] == level)
+ return true;
+ return false;
}
+ // If not absolute KMP_HW_SUBSET, then every layer is seen as targeted
+ return true;
};
- struct core_eff_indexer {
- int operator()(const kmp_hw_thread_t &t) const {
- return t.attrs.get_core_eff();
+
+ // Helpful to index into core type sub Ids array
+ auto get_core_type_index = [](const kmp_hw_thread_t &t) {
+ switch (t.attrs.get_core_type()) {
+ case KMP_HW_CORE_TYPE_UNKNOWN:
+ case KMP_HW_MAX_NUM_CORE_TYPES:
+ return 0;
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ case KMP_HW_CORE_TYPE_ATOM:
+ return 1;
+ case KMP_HW_CORE_TYPE_CORE:
+ return 2;
+#endif
}
+ KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
+ KMP_BUILTIN_UNREACHABLE;
};
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
- core_level);
- kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
- core_level);
+ // Helpful to index into core efficiencies sub Ids array
+ auto get_core_eff_index = [](const kmp_hw_thread_t &t) {
+ return t.attrs.get_core_eff();
+ };
- // Determine which hardware threads should be filtered.
int num_filtered = 0;
kmp_affin_mask_t *filtered_mask;
KMP_CPU_ALLOC(filtered_mask);
KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
for (int i = 0; i < num_hw_threads; ++i) {
kmp_hw_thread_t &hw_thread = hw_threads[i];
- // Update type_sub_id
- if (using_core_types)
- core_type_sub_ids.update(hw_thread);
- if (using_core_effs)
- core_eff_sub_ids.update(hw_thread);
+
+ // Figure out the absolute sub ids and core eff/type sub ids
+ if (is_absolute || using_core_effs || using_core_types) {
+ for (int level = 0; level < get_depth(); ++level) {
+ if (hw_thread.sub_ids[level] != prev_sub_ids[level]) {
+ bool found_targeted = false;
+ for (int j = level; j < get_depth(); ++j) {
+ bool targeted = is_targeted(j);
+ if (!found_targeted && targeted) {
+ found_targeted = true;
+ abs_sub_ids[j]++;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)]++;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)]++;
+ } else if (targeted) {
+ abs_sub_ids[j] = 0;
+ if (j == core_level && using_core_effs)
+ core_eff_sub_ids[get_core_eff_index(hw_thread)] = 0;
+ if (j == core_level && using_core_types)
+ core_type_sub_ids[get_core_type_index(hw_thread)] = 0;
+ }
+ }
+ break;
+ }
+ }
+ for (int level = 0; level < get_depth(); ++level)
+ prev_sub_ids[level] = hw_thread.sub_ids[level];
+ }
// Check to see if this hardware thread should be filtered
bool should_be_filtered = false;
@@ -1319,20 +1338,24 @@ bool kmp_topology_t::filter_hw_subset() {
int num = hw_subset_item.num[attr_idx];
int offset = hw_subset_item.offset[attr_idx];
if (using_core_types)
- sub_id = core_type_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_type_sub_ids[get_core_type_index(hw_thread)];
else
- sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
+ sub_id = core_eff_sub_ids[get_core_eff_index(hw_thread)];
if (sub_id < offset ||
(num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
} else {
+ int sub_id;
int num = hw_subset_item.num[0];
int offset = hw_subset_item.offset[0];
- if (hw_thread.sub_ids[level] < offset ||
- (num != kmp_hw_subset_t::USE_ALL &&
- hw_thread.sub_ids[level] >= offset + num)) {
+ if (is_absolute)
+ sub_id = abs_sub_ids[level];
+ else
+ sub_id = hw_thread.sub_ids[level];
+ if (sub_id < offset ||
+ (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
should_be_filtered = true;
break;
}
@@ -1777,6 +1800,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
__kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
else
__kmp_nThreadsPerCore = 1; // no CORE found
+ if (__kmp_nThreadsPerCore == 0)
+ __kmp_nThreadsPerCore = 1;
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
if (nCoresPerPkg == 0)
nCoresPerPkg = 1; // to prevent possible division by 0
@@ -1829,14 +1854,8 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
// Figure out the depth and types in the topology
depth = 0;
- pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
- KMP_ASSERT(pu);
- obj = pu;
- types[depth] = KMP_HW_THREAD;
- hwloc_types[depth] = obj->type;
- depth++;
- while (obj != root && obj != NULL) {
- obj = obj->parent;
+ obj = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
+ while (obj && obj != root) {
#if HWLOC_API_VERSION >= 0x00020000
if (obj->memory_arity) {
hwloc_obj_t memory;
@@ -1858,6 +1877,7 @@ static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
hwloc_types[depth] = obj->type;
depth++;
}
+ obj = obj->parent;
}
KMP_ASSERT(depth > 0);
@@ -2828,7 +2848,9 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
@@ -2843,7 +2865,9 @@ static void __kmp_dispatch_set_hierarchy_values() {
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
__kmp_nThreadsPerCore;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
+#if KMP_ARCH_X86_64 && \
+ (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_WINDOWS) && \
KMP_MIC_SUPPORTED
if (__kmp_mic_type >= mic3)
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
@@ -2906,12 +2930,17 @@ static inline const char *__kmp_cpuinfo_get_envvar() {
}
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
-// affinity map.
+// affinity map. On AIX, the map is obtained through system SRAD (Scheduler
+// Resource Allocation Domain).
static bool __kmp_affinity_create_cpuinfo_map(int *line,
kmp_i18n_id_t *const msg_id) {
+ *msg_id = kmp_i18n_null;
+
+#if KMP_OS_AIX
+ unsigned num_records = __kmp_xproc;
+#else
const char *filename = __kmp_cpuinfo_get_filename();
const char *envvar = __kmp_cpuinfo_get_envvar();
- *msg_id = kmp_i18n_null;
if (__kmp_affinity.flags.verbose) {
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
@@ -2970,6 +2999,7 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
*msg_id = kmp_i18n_str_CantRewindCpuinfo;
return false;
}
+#endif // KMP_OS_AIX
// Allocate the array of records to store the proc info in. The dummy
// element at the end makes the logic in filling them out easier to code.
@@ -2999,6 +3029,91 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
INIT_PROC_INFO(threadInfo[i]);
}
+#if KMP_OS_AIX
+ int smt_threads;
+ lpar_info_format1_t cpuinfo;
+ unsigned num_avail = __kmp_xproc;
+
+ if (__kmp_affinity.flags.verbose)
+ KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
+
+ // Get the number of SMT threads per core.
+ smt_threads = syssmt(GET_NUMBER_SMT_SETS, 0, 0, NULL);
+
+  // Allocate a resource set containing available system resources.
+ rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
+ if (sys_rset == NULL) {
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Allocate a resource set for the SRAD info.
+ rsethandle_t srad = rs_alloc(RS_EMPTY);
+ if (srad == NULL) {
+ rs_free(sys_rset);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the SRAD system detail level.
+ int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
+ if (sradsdl < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+ // Get the number of RADs at that SRAD SDL.
+ int num_rads = rs_numrads(sys_rset, sradsdl, 0);
+ if (num_rads < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ // Get the maximum number of procs that may be contained in a resource set.
+ int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
+ if (max_procs < 0) {
+ rs_free(sys_rset);
+ rs_free(srad);
+ CLEANUP_THREAD_INFO;
+ *msg_id = kmp_i18n_str_UnknownTopology;
+ return false;
+ }
+
+ int cur_rad = 0;
+ int num_set = 0;
+ for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
+ ++srad_idx) {
+ // Check if the SRAD is available in the RSET.
+ if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
+ continue;
+
+ for (int cpu = 0; cpu < max_procs; cpu++) {
+ // Set the info for the cpu if it is in the SRAD.
+ if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
+ threadInfo[cpu][osIdIndex] = cpu;
+ threadInfo[cpu][pkgIdIndex] = cur_rad;
+ threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
+ ++num_set;
+ if (num_set >= num_avail) {
+ // Done if all available CPUs have been set.
+ break;
+ }
+ }
+ }
+ ++cur_rad;
+ }
+ rs_free(sys_rset);
+ rs_free(srad);
+
+ // The topology is already sorted.
+
+#else // !KMP_OS_AIX
unsigned num_avail = 0;
*line = 0;
#if KMP_ARCH_S390X
@@ -3246,6 +3361,8 @@ static bool __kmp_affinity_create_cpuinfo_map(int *line,
qsort(threadInfo, num_avail, sizeof(*threadInfo),
__kmp_affinity_cmp_ProcCpuInfo_phys_id);
+#endif // KMP_OS_AIX
+
// The table is now sorted by pkgId / coreId / threadId, but we really don't
// know the radix of any of the fields. pkgId's may be sparsely assigned among
// the chips on a system. Although coreId's are usually assigned
@@ -4441,7 +4558,7 @@ static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
-#if KMP_OS_LINUX
+#if KMP_OS_LINUX || KMP_OS_AIX
if (!success) {
int line = 0;
success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
@@ -4837,7 +4954,12 @@ void __kmp_affinity_uninitialize(void) {
}
if (__kmp_affin_origMask != NULL) {
if (KMP_AFFINITY_CAPABLE()) {
+#if KMP_OS_AIX
+ // Uninitialize by unbinding the thread.
+ bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
__kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
+#endif
}
KMP_CPU_FREE(__kmp_affin_origMask);
__kmp_affin_origMask = NULL;
@@ -5011,7 +5133,10 @@ void __kmp_affinity_bind_init_mask(int gtid) {
__kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
} else
#endif
+#ifndef KMP_OS_AIX
+ // Do not set the full mask as the init mask on AIX.
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
+#endif
}
void __kmp_affinity_bind_place(int gtid) {
@@ -5124,7 +5249,7 @@ int __kmp_aux_set_affinity(void **mask) {
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
kmp_info_t *th;
#endif
if (!KMP_AFFINITY_CAPABLE()) {
@@ -5132,7 +5257,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
gtid = __kmp_entry_gtid();
-#if KMP_OS_WINDOWS || KMP_DEBUG
+#if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
th = __kmp_threads[gtid];
#else
(void)gtid; // unused variable
@@ -5155,7 +5280,7 @@ int __kmp_aux_get_affinity(void **mask) {
}
}
-#if !KMP_OS_WINDOWS
+#if !KMP_OS_WINDOWS && !KMP_OS_AIX
retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
KA_TRACE(
@@ -5175,7 +5300,7 @@ int __kmp_aux_get_affinity(void **mask) {
KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
return 0;
-#endif /* KMP_OS_WINDOWS */
+#endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
}
int __kmp_aux_get_affinity_max_proc() {
@@ -5557,7 +5682,8 @@ void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
}
}
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
@@ -5592,7 +5718,11 @@ extern "C"
"set full mask for thread %d\n",
gtid));
KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
+#if KMP_OS_AIX
+ return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
+#else
return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
+#endif
}
#endif
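The filtering loop above replaces the removed kmp_sub_ids_t helper with plain arrays (prev_sub_ids, abs_sub_ids, core_*_sub_ids) that advance a running sub-ID whenever a shallower topology level changes and restart the deeper ones. A minimal standalone sketch of that bookkeeping, assuming a hypothetical 2-socket, 2-core, 2-thread machine with every level treated as targeted (these are not the runtime's real data structures):

#include <array>
#include <cstdio>
#include <vector>

int main() {
  // Sorted (socket, core, thread) ids, as a sorted topology would present them.
  std::vector<std::array<int, 3>> hw = {{0, 0, 0}, {0, 0, 1}, {0, 1, 0},
                                        {0, 1, 1}, {1, 0, 0}, {1, 0, 1},
                                        {1, 1, 0}, {1, 1, 1}};
  int prev[3] = {-1, -1, -1};   // previous per-level ids
  int abs_id[3] = {-1, -1, -1}; // running sub-IDs per level
  for (const auto &t : hw) {
    for (int level = 0; level < 3; ++level) {
      if (t[level] != prev[level]) {
        abs_id[level]++; // the first level that changed advances
        for (int j = level + 1; j < 3; ++j)
          abs_id[j] = 0; // deeper levels restart under the new parent
        break;
      }
    }
    for (int level = 0; level < 3; ++level)
      prev[level] = t[level];
    std::printf("s%d c%d t%d -> abs {%d,%d,%d}\n", t[0], t[1], t[2], abs_id[0],
                abs_id[1], abs_id[2]);
  }
}

In the real code the increment goes to the first targeted level at or below the changed one, so layers not named by an absolute KMP_HW_SUBSET keep counting across their siblings instead of restarting.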
diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h
index 5464259784e2..3dc2c84d53f7 100644
--- a/openmp/runtime/src/kmp_affinity.h
+++ b/openmp/runtime/src/kmp_affinity.h
@@ -191,7 +191,8 @@ public:
};
#endif /* KMP_USE_HWLOC */
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
@@ -311,9 +312,18 @@ public:
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
+#elif KMP_OS_NETBSD
+#include <pthread.h>
+#include <sched.h>
+#elif KMP_OS_AIX
+#include <sys/dr.h>
+#include <sys/rset.h>
+#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
+#define GET_NUMBER_SMT_SETS 0x0004
+extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
class Mask : public KMPAffinity::Mask {
@@ -401,13 +411,77 @@ class KMPNativeAffinity : public KMPAffinity {
++retval;
return retval;
}
+#if KMP_OS_AIX
+  // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
+ // This routine is only used to get the full mask.
+ int get_system_affinity(bool abort_on_error) override {
+ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+ "Illegal get affinity operation when not capable");
+
+ (void)abort_on_error;
+
+ // Set the mask with all CPUs that are available.
+ for (int i = 0; i < __kmp_xproc; ++i)
+ KMP_CPU_SET(i, this);
+ return 0;
+ }
+ int set_system_affinity(bool abort_on_error) const override {
+ KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
+ "Illegal set affinity operation when not capable");
+
+ int location;
+ int gtid = __kmp_entry_gtid();
+ int tid = thread_self();
+
+    // Unbind the thread if it was bound to any processors before so that
+    // we can bind it to the CPUs specified by the mask and not to others.
+ int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
+
+ // On AIX, we can only bind to one instead of a set of CPUs with the
+ // bindprocessor() system call.
+ KMP_CPU_SET_ITERATE(location, this) {
+ if (KMP_CPU_ISSET(location, this)) {
+ retval = bindprocessor(BINDTHREAD, tid, location);
+ if (retval == -1 && errno == 1) {
+ rsid_t rsid;
+ rsethandle_t rsh;
+ // Put something in rsh to prevent compiler warning
+        // about uninitialized use
+ rsh = rs_alloc(RS_EMPTY);
+ rsid.at_pid = getpid();
+ if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
+ retval = ra_detachrset(R_PROCESS, rsid, 0);
+ retval = bindprocessor(BINDTHREAD, tid, location);
+ }
+ }
+ if (retval == 0) {
+ KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
+ "T#%d to cpu=%d.\n",
+ gtid, location));
+ continue;
+ }
+ int error = errno;
+ if (abort_on_error) {
+ __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
+ KMP_ERR(error), __kmp_msg_null);
+ KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
+ "T#%d to cpu=%d, errno=%d.\n",
+ gtid, location, error));
+ return error;
+ }
+ }
+ }
+ return 0;
+ }
+#else // !KMP_OS_AIX
int get_system_affinity(bool abort_on_error) override {
KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
long retval =
syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
@@ -428,7 +502,7 @@ class KMPNativeAffinity : public KMPAffinity {
#if KMP_OS_LINUX
long retval =
syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
reinterpret_cast<cpuset_t *>(mask));
int retval = (r == 0 ? 0 : -1);
@@ -443,6 +517,7 @@ class KMPNativeAffinity : public KMPAffinity {
}
return error;
}
+#endif // KMP_OS_AIX
};
void determine_capable(const char *env_var) override {
__kmp_affinity_determine_capable(env_var);
@@ -471,7 +546,8 @@ class KMPNativeAffinity : public KMPAffinity {
}
api_type get_api_type() const override { return NATIVE_OS; }
};
-#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
+#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
+ || KMP_OS_AIX */
#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
@@ -1098,6 +1174,50 @@ public:
qsort(items, depth, sizeof(item_t), hw_subset_compare);
}
bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
+
+ // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
+ // This means putting each of {sockets, cores, threads} in the topology if
+ // they are not specified:
+ // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
+ // e.g., 3module => *s,3module,*c,*t
+ // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
+ // are expecting the traditional sockets/cores/threads topology. For newer
+ // hardware, there can be intervening layers like dies/tiles/modules
+ // (usually corresponding to a cache level). So when a user asks for
+ // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
+ // should get 12 hardware threads across 6 cores and effectively ignore the
+ // module layer.
+ void canonicalize(const kmp_topology_t *top) {
+ // Layers to target for KMP_HW_SUBSET canonicalization
+ kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
+
+ // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
+ if (is_absolute())
+ return;
+
+ // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
+ // topology doesn't have these layers
+ for (kmp_hw_t type : targeted)
+ if (top->get_level(type) == KMP_HW_UNKNOWN)
+ return;
+
+ // Put targeted layers in topology if they do not exist
+ for (kmp_hw_t type : targeted) {
+ bool found = false;
+ for (int i = 0; i < get_depth(); ++i) {
+ if (top->get_equivalent_type(items[i].type) == type) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
+ }
+ }
+ sort();
+ // Set as an absolute topology that only targets the targeted layers
+ set_absolute();
+ }
void dump() const {
printf("**********************\n");
printf("*** kmp_hw_subset: ***\n");
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e9ab15f1723b..658cee594e48 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -1805,7 +1805,25 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
// It is OK to report the barrier state after the barrier begin callback.
// According to the OMPT specification, a compliant implementation may
// even delay reporting this state until the barrier begins to wait.
- this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
+ auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
+ switch (barrier_kind) {
+ case ompt_sync_region_barrier_explicit:
+ ompt_thr_info->state = ompt_state_wait_barrier_explicit;
+ break;
+ case ompt_sync_region_barrier_implicit_workshare:
+ ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
+ break;
+ case ompt_sync_region_barrier_implicit_parallel:
+ ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
+ break;
+ case ompt_sync_region_barrier_teams:
+ ompt_thr_info->state = ompt_state_wait_barrier_teams;
+ break;
+ case ompt_sync_region_barrier_implementation:
+ [[fallthrough]];
+ default:
+ ompt_thr_info->state = ompt_state_wait_barrier_implementation;
+ }
}
#endif
@@ -1858,8 +1876,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
}
if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
- // use 0 to only setup the current team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
if (cancellable) {
cancelled = __kmp_linear_barrier_gather_cancellable(
@@ -2042,7 +2059,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
TRUE);
__kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
- __kmp_task_team_setup(this_thr, team, 0);
+ __kmp_task_team_setup(this_thr, team);
#if USE_ITT_BUILD
if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
@@ -2214,20 +2231,24 @@ void __kmp_join_barrier(int gtid) {
codeptr = team->t.ompt_team_info.master_return_address;
my_task_data = OMPT_CUR_TASK_DATA(this_thr);
my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
+ sync_kind = ompt_sync_region_barrier_teams;
+ ompt_state = ompt_state_wait_barrier_teams;
+ }
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
- my_task_data, codeptr);
+ sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
- my_task_data, codeptr);
+ sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
}
if (!KMP_MASTER_TID(ds_tid))
this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
- this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
+ this_thr->th.ompt_thread_info.state = ompt_state;
}
#endif
@@ -2243,9 +2264,7 @@ void __kmp_join_barrier(int gtid) {
__kmp_gtid_from_thread(this_thr), team_id,
team->t.t_task_team[this_thr->th.th_task_state],
this_thr->th.th_task_team));
- if (this_thr->th.th_task_team)
- KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
- team->t.t_task_team[this_thr->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
}
#endif /* KMP_DEBUG */
@@ -2440,10 +2459,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // 0 indicates setup current task team if nthreads > 1
- __kmp_task_team_setup(this_thr, team, 0);
- }
+ if (__kmp_tasking_mode != tskm_immediate_exec)
+ __kmp_task_team_setup(this_thr, team);
/* The primary thread may have changed its blocktime between join barrier
and fork barrier. Copy the blocktime info to the thread, where
@@ -2493,8 +2510,10 @@ void __kmp_fork_barrier(int gtid, int tid) {
}
#if OMPT_SUPPORT
+ ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
if (ompt_enabled.enabled &&
- this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+ (ompt_state == ompt_state_wait_barrier_teams ||
+ ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
ompt_data_t *task_data = (team)
? OMPT_CUR_TASK_DATA(this_thr)
@@ -2506,15 +2525,16 @@ void __kmp_fork_barrier(int gtid, int tid) {
(ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
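With the join and fork barriers now reporting the specific barrier kind (implicit-parallel, teams, workshare, explicit) instead of the generic implicit value, a tool can tell them apart in its sync-region callback. A minimal tool-side sketch; the handler name my_sync_region_cb and the per-case actions are placeholders, and the callback would be registered with ompt_set_callback(ompt_callback_sync_region, ...) during tool initialization:

#include <omp-tools.h>

static void my_sync_region_cb(ompt_sync_region_t kind,
                              ompt_scope_endpoint_t endpoint,
                              ompt_data_t *parallel_data,
                              ompt_data_t *task_data, const void *codeptr_ra) {
  (void)parallel_data;
  (void)task_data;
  (void)codeptr_ra;
  if (endpoint != ompt_scope_begin)
    return;
  switch (kind) {
  case ompt_sync_region_barrier_implicit_parallel:
    /* join barrier at the end of a parallel region */
    break;
  case ompt_sync_region_barrier_teams:
    /* barrier joining the initial threads of a league of teams */
    break;
  case ompt_sync_region_barrier_implicit_workshare:
    /* implicit barrier closing a worksharing construct */
    break;
  case ompt_sync_region_barrier_explicit:
    /* user-level #pragma omp barrier */
    break;
  default:
    break;
  }
}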
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 2c410ca9b603..f1bf04901dc7 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -1272,6 +1272,299 @@ void kmp_calc_original_ivs_for_end(
}
}
+/**************************************************************************
+ * Identify nested loop structure - loops come in the canonical form
+ * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
+ * Upper Triangle matrix
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+nested_loop_type_t
+kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
+ /*in*/ kmp_index_t n) {
+ // only 2-level nested loops are supported
+ if (n != 2) {
+ return nested_loop_type_unkown;
+ }
+ // loops must be canonical
+ KMP_ASSERT(
+ (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+ (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+ // check outer loop bounds: for triangular need to be {0,0}:{N,0}
+ kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb1_u64);
+ kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub1_u64);
+ if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
+ return nested_loop_type_unkown;
+ }
+ // check inner bounds to determine triangle type
+ kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].lb0_u64);
+ kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub0_u64);
+ kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].lb1_u64);
+ kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub1_u64);
+ // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
+ if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
+ (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+ return nested_loop_type_lower_triangular_matrix;
+ }
+ // upper triangle loop inner bounds need to be {0,1}:{N,0}
+ if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
+ inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+ return nested_loop_type_upper_triangular_matrix;
+ }
+ return nested_loop_type_unkown;
+}
+
+/**************************************************************************
+ * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
+ * Start point is x so the result is always > sqrt(x)
+ * The method has uniform convergence; level_of_precision is set to 0.1
+ * ************************************************************************/
+#define level_of_precision 0.1
+double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
+ double sqrt_old = 0.;
+ double sqrt_new = (double)x;
+ do {
+ sqrt_old = sqrt_new;
+ sqrt_new = (sqrt_old + x / sqrt_old) / 2;
+ } while ((sqrt_old - sqrt_new) > level_of_precision);
+ return sqrt_new;
+}
+
+/**************************************************************************
+ * Handle lower triangle matrix in the canonical form
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+ * ************************************************************************/
+void kmp_handle_lower_triangle_matrix(
+ /*in*/ kmp_uint32 nth,
+ /*in*/ kmp_uint32 tid,
+ /*in */ kmp_index_t n,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+ // transfer loop types from the original loop to the chunks
+ for (kmp_index_t i = 0; i < n; ++i) {
+ chunk_bounds_nest[i] = original_bounds_nest[i];
+ }
+ // cleanup iv variables
+ kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+ original_bounds_nest[1].ub0_u64);
+ // calculate the chunk's lower and upper bounds
+ // the total number of iterations in the loop is the sum of the arithmetic
+ // progression from the outer lower to outer upper bound (inclusive since the
+  // loop is canonical); note that less_than inner loops (inner_ub0 == -1)
+  // effectively make the progression 1-based, turning N = (outer_ub0 -
+  // outer_lb0 + 1) into N - 1
+ kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
+ kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+ // the current thread's number of iterations:
+  // each thread gets an equal share: the total number of iterations divided
+  // by the number of threads; if there is a remainder, the first
+  // (iter_total % nth) threads each get one additional iteration to cover it
+ kmp_uint64 iter_current =
+ iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+ // cumulative number of iterations executed by all the previous threads:
+ // threads with the tid below the remainder will have (iter_total/nth+1)
+ // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by all the previous will be the current thread's number
+ // of iterations multiplied by the number of previous threads which is equal
+ // to the current thread's tid; threads with the number equal or above the
+ // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multiplied by the
+ // number of previous threads which is again equal to the current thread's tid
+ // PLUS all the remainder iterations that will have been executed by the
+ // previous threads
+ kmp_uint64 iter_before_current =
+ tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+ // cumulative number of iterations executed with the current thread is
+ // the cumulative number executed before it plus its own
+ kmp_uint64 iter_with_current = iter_before_current + iter_current;
+ // calculate the outer loop lower bound (lbo) which is the max outer iv value
+ // that gives the number of iterations that is equal or just below the total
+  // number of iterations executed by the previous threads; for less_than
+  // (1-based) inner loops (inner_ub0 == -1) the condition is
+  // lbo*(lbo-1)/2 <= iter_before_current => lbo^2-lbo-2*iter_before_current <= 0,
+  // while for less_than_equal (0-based) inner loops (inner_ub0 == 0) it is
+  // lbo*(lbo+1)/2 <= iter_before_current => lbo^2+lbo-2*iter_before_current <= 0;
+  // both cases can be handled similarly using a parameter to control the
+  // equation sign
+ kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+ kmp_uint64 lower_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+ 8 * iter_before_current) +
+ inner_adjustment) /
+ 2 -
+ inner_adjustment;
+ // calculate the inner loop lower bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed by the
+ // previous threads giving the starting point of this thread
+ kmp_uint64 lower_bound_inner =
+ iter_before_current -
+ ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+ // calculate the outer loop upper bound using the same approach as for the
+ // inner bound except using the total number of iterations executed with the
+ // current thread
+ kmp_uint64 upper_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+ 8 * iter_with_current) +
+ inner_adjustment) /
+ 2 -
+ inner_adjustment;
+ // calculate the inner loop upper bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed after
+ // the current thread giving the starting point of the next thread
+ kmp_uint64 upper_bound_inner =
+ iter_with_current -
+ ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element so that they point at the last
+  // iteration of the current thread rather than the first iteration of the
+  // next thread
+ if (upper_bound_inner == 0) {
+ // {n,0} => {n-1,n-1}
+ upper_bound_outer -= 1;
+ upper_bound_inner = upper_bound_outer;
+ } else {
+ // {n,m} => {n,m-1} (m!=0)
+ upper_bound_inner -= 1;
+ }
+
+ // assign the values, zeroing out lb1 and ub1 values since the iteration space
+ // is now one-dimensional
+ chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+ chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+ chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+ chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+ chunk_bounds_nest[0].lb1_u64 = 0;
+ chunk_bounds_nest[0].ub1_u64 = 0;
+ chunk_bounds_nest[1].lb1_u64 = 0;
+ chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+ printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+ tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+ chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
+
+/**************************************************************************
+ * Handle upper triangle matrix in the canonical form
+ * i = 0; i <= N; i++ {0,0}:{N,0}
+ * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+ /*in*/ kmp_uint32 nth,
+ /*in*/ kmp_uint32 tid,
+ /*in */ kmp_index_t n,
+ /*in/out*/ bounds_info_t *original_bounds_nest,
+ /*out*/ bounds_info_t *chunk_bounds_nest) {
+
+ // transfer loop types from the original loop to the chunks
+ for (kmp_index_t i = 0; i < n; ++i) {
+ chunk_bounds_nest[i] = original_bounds_nest[i];
+ }
+ // cleanup iv variables
+ kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].ub0_u64);
+ kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+ original_bounds_nest[0].lb0_u64);
+ [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
+ original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
+ // calculate the chunk's lower and upper bounds
+ // the total number of iterations in the loop is the sum of the arithmetic
+ // progression from the outer lower to outer upper bound (inclusive since the
+ // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
+ // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
+ // + 1) -> N - 1
+ kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+ kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+ // the current thread's number of iterations:
+  // each thread gets an equal share: the total number of iterations divided
+  // by the number of threads; if there is a remainder, the first
+  // (iter_total % nth) threads each get one additional iteration to cover it
+ kmp_uint64 iter_current =
+ iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+ // cumulative number of iterations executed by all the previous threads:
+ // threads with the tid below the remainder will have (iter_total/nth+1)
+ // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by all the previous will be the current thread's number
+ // of iterations multiplied by the number of previous threads which is equal
+ // to the current thread's tid; threads with the number equal or above the
+ // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multiplied by the
+ // number of previous threads which is again equal to the current thread's tid
+ // PLUS all the remainder iterations that will have been executed by the
+ // previous threads
+ kmp_uint64 iter_before_current =
+ tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+ // cumulative number of iterations executed with the current thread is
+ // the cumulative number executed before it plus its own
+ kmp_uint64 iter_with_current = iter_before_current + iter_current;
+ // calculate the outer loop lower bound (lbo) which is the max outer iv value
+ // that gives the number of iterations that is equal or just below the total
+ // number of iterations executed by the previous threads:
+ // lbo*(lbo+1)/2<=iter_before_current =>
+ // lbo^2+lbo-2*iter_before_current<=0
+ kmp_uint64 lower_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+ // calculate the inner loop lower bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed by the
+ // previous threads giving the starting point of this thread
+ kmp_uint64 lower_bound_inner =
+ iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+ // calculate the outer loop upper bound using the same approach as for the
+ // inner bound except using the total number of iterations executed with the
+ // current thread
+ kmp_uint64 upper_bound_outer =
+ (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+ // calculate the inner loop upper bound which is the remaining number of
+ // iterations required to hit the total number of iterations executed after
+ // the current thread giving the starting point of the next thread
+ kmp_uint64 upper_bound_inner =
+ iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element so that they point at the last
+  // iteration of the current thread rather than the first iteration of the
+  // next thread
+ if (upper_bound_inner == 0) {
+ // {n,0} => {n-1,n-1}
+ upper_bound_outer -= 1;
+ upper_bound_inner = upper_bound_outer;
+ } else {
+ // {n,m} => {n,m-1} (m!=0)
+ upper_bound_inner -= 1;
+ }
+
+ // assign the values, zeroing out lb1 and ub1 values since the iteration space
+ // is now one-dimensional
+ chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+ chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+ chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+ chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+ chunk_bounds_nest[0].lb1_u64 = 0;
+ chunk_bounds_nest[0].ub1_u64 = 0;
+ chunk_bounds_nest[1].lb1_u64 = 0;
+ chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+ printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+ tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+ chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+}
//----------Init API for non-rectangular loops--------------------------------
// Init API for collapsed loops (static, no chunks defined).
@@ -1334,6 +1627,19 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
KMP_DEBUG_ASSERT(tid < nth);
+ // Handle special cases
+ nested_loop_type_t loop_type =
+ kmp_identify_nested_loop_structure(original_bounds_nest, n);
+ if (loop_type == nested_loop_type_lower_triangular_matrix) {
+ kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+ chunk_bounds_nest);
+ return TRUE;
+ } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+ kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+ chunk_bounds_nest);
+ return TRUE;
+ }
+
CollapseAllocator<kmp_uint64> original_ivs_start(n);
if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
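Taken together, the special-case handlers above let a statically scheduled collapse of a triangular nest bypass the generic non-rectangular machinery. A minimal user-level example of the lower-triangular shape they recognize is shown below, with N and work() as placeholders; by the arithmetic above, N = 3 with two threads splits the 10 iterations as (0,0)..(2,1) for thread 0 and (2,2)..(3,3) for thread 1, five each:

#pragma omp parallel for collapse(2) schedule(static)
for (int i = 0; i <= N; ++i)
  for (int j = 0; j <= i; ++j) // inner bound depends on the outer IV
    work(i, j);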
diff --git a/openmp/runtime/src/kmp_collapse.h b/openmp/runtime/src/kmp_collapse.h
index e4870185645d..1044478554a0 100644
--- a/openmp/runtime/src/kmp_collapse.h
+++ b/openmp/runtime/src/kmp_collapse.h
@@ -45,6 +45,13 @@ enum loop_type_t : kmp_int32 {
loop_type_int64 = 7
};
+// Defining loop types to handle special cases
+enum nested_loop_type_t : kmp_int32 {
+ nested_loop_type_unkown = 0,
+ nested_loop_type_lower_triangular_matrix = 1,
+ nested_loop_type_upper_triangular_matrix = 2
+};
+
/*!
@ingroup WORK_SHARING
* Describes the structure for rectangular nested loops.
@@ -124,14 +131,14 @@ struct bounds_info_t {
// It's represented in kmp_uint64, but each dimention is calculated in
// that loop IV type. Also dimentions have to be converted to those types
// when used in generated code.
-typedef kmp_uint64* kmp_point_t;
+typedef kmp_uint64 *kmp_point_t;
// Array: Number of loop iterations on each nesting level to achieve some point,
// in expanded space or in original space.
// OMPTODO: move from using iterations to using offsets (iterations multiplied
// by steps). For those we need to be careful with the types, as step can be
// negative, but it'll remove multiplications and divisions in several places.
-typedef kmp_loop_nest_iv_t* kmp_iterations_t;
+typedef kmp_loop_nest_iv_t *kmp_iterations_t;
// Internal struct with additional info:
template <typename T> struct bounds_info_internalXX_template {
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 9eeaeb88fb9e..b33c16fa79a6 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -18,6 +18,7 @@
#include "kmp_itt.h"
#include "kmp_lock.h"
#include "kmp_stats.h"
+#include "kmp_utils.h"
#include "ompt-specific.h"
#define MAX_MESSAGE 512
@@ -236,6 +237,50 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
__kmp_push_num_threads(loc, global_tid, num_threads);
}
+void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_threads, int severity,
+ const char *message) {
+ __kmp_push_num_threads(loc, global_tid, num_threads);
+ __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param list_length number of entries in the num_threads_list array
+@param num_threads_list array of numbers of threads requested for this parallel
+construct and subsequent nested parallel constructs
+
+Set the number of threads to be used by the next fork spawned by this thread,
+and some nested forks as well.
+This call is only required if the parallel construct has a `num_threads` clause
+that has a list of integers as the argument.
+*/
+void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list) {
+ KA_TRACE(20, ("__kmpc_push_num_threads_list: enter T#%d num_threads_list=",
+ global_tid));
+ KA_TRACE(20, ("%d", num_threads_list[0]));
+#ifdef KMP_DEBUG
+ for (kmp_uint32 i = 1; i < list_length; ++i)
+ KA_TRACE(20, (", %d", num_threads_list[i]));
+#endif
+ KA_TRACE(20, ("/n"));
+
+ __kmp_assert_valid_gtid(global_tid);
+ __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+}
+
+void __kmpc_push_num_threads_list_strict(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list,
+ int severity, const char *message) {
+ __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+ __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
/* the num_threads are automatically popped */
@@ -653,6 +698,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_dispatch->th_disp_buffer->next;
__kmp_free(disp_buffer);
}
+
+ /* pop the task team stack */
+ if (serial_team->t.t_serialized > 1) {
+ __kmp_pop_task_team_node(this_thr, serial_team);
+ }
+
this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
--serial_team->t.t_serialized;
@@ -691,6 +742,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_current_task->td_flags.executing = 1;
if (__kmp_tasking_mode != tskm_immediate_exec) {
+ // Restore task state from serial team structure
+ KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
+ serial_team->t.t_primary_task_state == 1);
+ this_thr->th.th_task_state =
+ (kmp_uint8)serial_team->t.t_primary_task_state;
// Copy the task team from the new child / old parent team to the thread.
this_thr->th.th_task_team =
this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
@@ -1533,8 +1589,9 @@ void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
kmp_dyna_lockseq_t lockseq = __kmp_map_hint_to_lock(hint);
if (*lk == 0) {
if (KMP_IS_D_LOCK(lockseq)) {
- KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
- KMP_GET_D_TAG(lockseq));
+ KMP_COMPARE_AND_STORE_ACQ32(
+ (volatile kmp_int32 *)&((kmp_base_tas_lock_t *)crit)->poll, 0,
+ KMP_GET_D_TAG(lockseq));
} else {
__kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lockseq));
}
@@ -1949,13 +2006,13 @@ void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
if (ompt_enabled.ompt_callback_work) {
- ompt_work_t ompt_work_type = ompt_work_loop;
+ ompt_work_t ompt_work_type = ompt_work_loop_static;
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
// Determine workshare type
if (loc != NULL) {
if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
- ompt_work_type = ompt_work_loop;
+ ompt_work_type = ompt_work_loop_static;
} else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
ompt_work_type = ompt_work_sections;
} else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
@@ -4232,7 +4289,7 @@ void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
up = pr_buf->th_doacross_info[3];
st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
- ompt_dependence_t deps[num_dims];
+ SimpleVLA<ompt_dependence_t> deps(num_dims);
#endif
if (st == 1) { // most common case
if (vec[0] < lo || vec[0] > up) {
@@ -4344,7 +4401,7 @@ void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
lo = pr_buf->th_doacross_info[2];
st = pr_buf->th_doacross_info[4];
#if OMPT_SUPPORT && OMPT_OPTIONAL
- ompt_dependence_t deps[num_dims];
+ SimpleVLA<ompt_dependence_t> deps(num_dims);
#endif
if (st == 1) { // most common case
iter_number = vec[0] - lo;
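The push entry points added earlier in this file extend __kmpc_push_num_threads for the list form of the num_threads clause and for a strict request that must be diagnosed when it cannot be honored. The following is a schematic pseudo-lowering, not compiler output and not compilable as-is: the clause spelling, the severity encoding (1 taken here to mean warning), and the names loc, gtid, nths and outlined_fn all stand for compiler-generated values and are assumptions for illustration:

/* e.g. #pragma omp parallel num_threads(4, 2)
   -- 4 threads for this region, 2 for the next nested level -- */
kmp_int32 nths[2] = {4, 2};
__kmpc_push_num_threads_list(&loc, gtid, 2, nths);
__kmpc_fork_call(&loc, 0, outlined_fn);

/* e.g. a strict request that should be reported if it cannot be satisfied */
__kmpc_push_num_threads_strict(&loc, gtid, 4, 1 /* assumed: warning */,
                               "could not create 4 threads");
__kmpc_fork_call(&loc, 0, outlined_fn);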
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index ac85b2b3f2fc..3b4a1f34df04 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -1164,8 +1164,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
ompt_callbacks.ompt_callback(ompt_callback_work)(
- ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
+ ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
+ &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
+ OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
@@ -2121,8 +2122,8 @@ int __kmp_dispatch_next_algorithm(int gtid,
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
ompt_callbacks.ompt_callback(ompt_callback_work)( \
- ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
- &(task_info->task_data), 0, codeptr); \
+ ompt_get_work_schedule(pr->schedule), ompt_scope_end, \
+ &(team_info->parallel_data), &(task_info->task_data), 0, codeptr); \
} \
}
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) \
@@ -2397,6 +2398,8 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
sh->u.s.ordered_iteration = 0;
}
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
sh->buffer_index += __kmp_dispatch_num_buffers;
KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
gtid, sh->buffer_index));
@@ -3007,6 +3010,11 @@ See @ref __kmpc_dispatch_fini_4
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
__kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
+
+/*!
+See @ref __kmpc_dispatch_deinit
+*/
+void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}
/*! @} */
//-----------------------------------------------------------------------------
diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp
index 88189659a234..86cf16470e14 100644
--- a/openmp/runtime/src/kmp_gsupport.cpp
+++ b/openmp/runtime/src/kmp_gsupport.cpp
@@ -144,7 +144,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) {
// Mutual exclusion
-// The symbol that icc/ifort generates for unnamed for unnamed critical sections
+// The symbol that icc/ifort generates for unnamed critical sections
// - .gomp_critical_user_ - is defined using .comm in any objects reference it.
// We can't reference it directly here in C code, as the symbol contains a ".".
//
@@ -358,7 +358,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) {
// (IA-32 architecture) or 64-bit signed (Intel(R) 64).
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
- KMP_ARCH_PPC
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4
#define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4
#define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4
diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp
index 85c54f4cdc7e..0ad14f862bcb 100644
--- a/openmp/runtime/src/kmp_lock.cpp
+++ b/openmp/runtime/src/kmp_lock.cpp
@@ -2689,7 +2689,7 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) {
// lock word.
static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
kmp_dyna_lockseq_t seq) {
- TCW_4(*lck, KMP_GET_D_TAG(seq));
+ TCW_4(((kmp_base_tas_lock_t *)lck)->poll, KMP_GET_D_TAG(seq));
KA_TRACE(
20,
("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
@@ -3180,8 +3180,8 @@ kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
lck->type = tag;
if (OMP_LOCK_T_SIZE < sizeof(void *)) {
- *((kmp_lock_index_t *)user_lock) = idx
- << 1; // indirect lock word must be even
+ *(kmp_lock_index_t *)&(((kmp_base_tas_lock_t *)user_lock)->poll) =
+ idx << 1; // indirect lock word must be even
} else {
*((kmp_indirect_lock_t **)user_lock) = lck;
}
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index f21179b4eb68..6202f3d617cc 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -50,7 +50,7 @@ typedef struct ident ident_t;
// recent versions), but we are bounded by the pointer-sized chunks that
// the Intel compiler allocates.
-#if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT)
+#if (KMP_OS_LINUX || KMP_OS_AIX) && defined(KMP_GOMP_COMPAT)
#define OMP_LOCK_T_SIZE sizeof(int)
#define OMP_NEST_LOCK_T_SIZE sizeof(void *)
#else
@@ -120,8 +120,16 @@ extern void __kmp_validate_locks(void);
struct kmp_base_tas_lock {
// KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
+ __LP64__
+ // Flip the ordering of the high and low 32-bit member to be consistent
+ // with the memory layout of the address in 64-bit big-endian.
+ kmp_int32 depth_locked; // depth locked, for nested locks only
+ std::atomic<kmp_int32> poll;
+#else
std::atomic<kmp_int32> poll;
kmp_int32 depth_locked; // depth locked, for nested locks only
+#endif
};
typedef struct kmp_base_tas_lock kmp_base_tas_lock_t;
@@ -1138,11 +1146,13 @@ extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32);
// Extracts direct lock tag from a user lock pointer
#define KMP_EXTRACT_D_TAG(l) \
- (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \
- -(*((kmp_dyna_lock_t *)(l)) & 1))
+ ((kmp_dyna_lock_t)((kmp_base_tas_lock_t *)(l))->poll & \
+ ((1 << KMP_LOCK_SHIFT) - 1) & \
+ -((kmp_dyna_lock_t)((kmp_tas_lock_t *)(l))->lk.poll & 1))
// Extracts indirect lock index from a user lock pointer
-#define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1)
+#define KMP_EXTRACT_I_INDEX(l) \
+ ((kmp_lock_index_t)((kmp_base_tas_lock_t *)(l))->poll >> 1)
// Returns function pointer to the direct lock function with l (kmp_dyna_lock_t
// *) and op (operation type).
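The big-endian member swap in kmp_base_tas_lock above matters because, with GOMP compatibility, a lock can be accessed both through the 32-bit poll field and as a single pointer-sized word, and the two views must agree on which bytes hold the tag or index. A standalone sketch of the underlying effect, assuming nothing about the runtime's real types (two_words is illustrative only):

#include <cstdint>
#include <cstdio>
#include <cstring>

struct two_words {
  int32_t first;  // member at the lower address
  int32_t second; // member at the higher address
};

int main() {
  uint64_t word = 5; // e.g. a small tag stored through the full-width view
  two_words lk;
  std::memcpy(&lk, &word, sizeof(lk));
  // On little-endian targets the value lands in 'first'; on big-endian it
  // lands in 'second'. Hence poll must be the second member on 64-bit
  // big-endian so a full-word store of a small tag remains visible when the
  // lock is read through the poll field.
  std::printf("first=%d second=%d\n", lk.first, lk.second);
}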
diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h
index a0552dd930a6..24b40ed32d98 100644
--- a/openmp/runtime/src/kmp_os.h
+++ b/openmp/runtime/src/kmp_os.h
@@ -75,7 +75,9 @@
#error Unknown compiler
#endif
-#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_WASI
+#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_DRAGONFLY || KMP_OS_AIX) && \
+ !KMP_OS_WASI
#define KMP_AFFINITY_SUPPORTED 1
#if KMP_OS_WINDOWS && KMP_ARCH_X86_64
#define KMP_GROUP_AFFINITY 1
@@ -177,7 +179,7 @@ typedef unsigned long long kmp_uint64;
#endif /* KMP_OS_UNIX */
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
- KMP_ARCH_PPC
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
@@ -467,7 +469,7 @@ enum kmp_mem_fence_type {
#pragma intrinsic(InterlockedExchangeAdd)
#pragma intrinsic(InterlockedCompareExchange)
#pragma intrinsic(InterlockedExchange)
-#if !(KMP_COMPILER_ICX && KMP_32_BIT_ARCH)
+#if !KMP_32_BIT_ARCH
#pragma intrinsic(InterlockedExchange64)
#endif
#endif
@@ -1049,7 +1051,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
- KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC
+ KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#if KMP_OS_WINDOWS
#undef KMP_MB
#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)
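A minimal sketch (not part of the patch) of the full barrier that KMP_MB() expands to in the branch above; the function name is invented.

#include <atomic>

inline void full_memory_barrier() {
  // Orders all earlier loads/stores before all later loads/stores.
  std::atomic_thread_fence(std::memory_order_seq_cst);
}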
diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h
index c06f46db2d49..200fdf697dd0 100644
--- a/openmp/runtime/src/kmp_platform.h
+++ b/openmp/runtime/src/kmp_platform.h
@@ -105,6 +105,7 @@
#define KMP_ARCH_X86 0
#define KMP_ARCH_X86_64 0
#define KMP_ARCH_AARCH64 0
+#define KMP_ARCH_AARCH64_32 0
#define KMP_ARCH_PPC64_ELFv1 0
#define KMP_ARCH_PPC64_ELFv2 0
#define KMP_ARCH_PPC64_XCOFF 0
@@ -157,6 +158,9 @@
#define KMP_ARCH_PPC_XCOFF 1
#undef KMP_ARCH_PPC
#define KMP_ARCH_PPC 1
+#elif defined __ARM64_ARCH_8_32__
+#undef KMP_ARCH_AARCH64_32
+#define KMP_ARCH_AARCH64_32 1
#elif defined __aarch64__
#undef KMP_ARCH_AARCH64
#define KMP_ARCH_AARCH64 1
@@ -244,7 +248,7 @@
/* Specify 32 bit architectures here */
#define KMP_32_BIT_ARCH \
(KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \
- KMP_ARCH_PPC)
+ KMP_ARCH_PPC || KMP_ARCH_AARCH64_32)
// Platforms which support Intel(R) Many Integrated Core Architecture
#define KMP_MIC_SUPPORTED \
@@ -254,7 +258,8 @@
#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \
KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \
- KMP_ARCH_S390X + KMP_ARCH_WASM + KMP_ARCH_PPC)
+ KMP_ARCH_S390X + KMP_ARCH_WASM + KMP_ARCH_PPC + \
+ KMP_ARCH_AARCH64_32)
#error Unknown or unsupported architecture
#endif
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index fc5e8405a415..5b4391aa125d 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -113,6 +113,21 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
+ int level) {
+ kmp_nested_nthreads_t *new_nested_nth =
+ (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
+ sizeof(kmp_nested_nthreads_t));
+ int new_size = level + thr->th.th_set_nested_nth_sz;
+ new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
+ for (int i = 0; i < level + 1; ++i)
+ new_nested_nth->nth[i] = 0;
+ for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
+ new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
+ new_nested_nth->size = new_nested_nth->used = new_size;
+ return new_nested_nth;
+}
+
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
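A minimal sketch (not part of the patch) reproducing the merge that __kmp_override_nested_nth performs, using std::vector so the resulting per-level table is easy to inspect: levels at or below the current one get 0 (no override), and the pushed list, minus its first entry, which has already been consumed as the immediate region's thread count, fills the deeper levels. The helper name is invented.

#include <vector>

std::vector<int> merge_nested_nth(const std::vector<int> &pushed, int level) {
  // level + pushed.size() entries, zero-initialized (no override).
  std::vector<int> out(level + pushed.size(), 0);
  for (size_t j = 1; j < pushed.size(); ++j)
    out[level + j] = pushed[j]; // deeper levels take the pushed values
  return out;
}
// merge_nested_nth({4, 3, 2}, /*level=*/1) yields {0, 0, 3, 2}.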
@@ -930,6 +945,11 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
__kmp_get_gtid(), new_nthreads, set_nthreads));
}
#endif // KMP_DEBUG
+
+ if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
+ __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
+ this_thr->th.th_nt_msg);
+ }
return new_nthreads;
}
@@ -1042,6 +1062,41 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
}
}
+ // Take care of primary thread's task state
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ if (use_hot_team) {
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
+ KA_TRACE(
+ 20,
+ ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
+ "%p, new task_team %p / team %p\n",
+ __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+ team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
+ team));
+
+ // Store primary thread's current task state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+
+ // Restore primary thread's task state to hot team's state
+ // by using thread 1's task state
+ if (team->t.t_nproc > 1) {
+ KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
+ team->t.t_threads[1]->th.th_task_state == 1);
+ KMP_CHECK_UPDATE(master_th->th.th_task_state,
+ team->t.t_threads[1]->th.th_task_state);
+ } else {
+ master_th->th.th_task_state = 0;
+ }
+ } else {
+ // Store primary thread's current task_state on new team
+ KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+ master_th->th.th_task_state);
+ // Not using a hot team, so set task state to 0.
+ master_th->th.th_task_state = 0;
+ }
+ }
+
if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
for (i = 0; i < team->t.t_nproc; i++) {
kmp_info_t *thr = team->t.t_threads[i];
@@ -1145,18 +1200,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
KMP_DEBUG_ASSERT(serial_team);
KMP_MB();
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(
- this_thr->th.th_task_team ==
- this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
- KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
- NULL);
- KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
- "team %p, new task_team = NULL\n",
- global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
- this_thr->th.th_task_team = NULL;
- }
-
kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
proc_bind = proc_bind_false;
@@ -1242,6 +1285,12 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ else
+ serial_team->t.t_nested_nth = &__kmp_nested_nth;
+ // Save previous team's task state on serial team structure
+ serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1261,9 +1310,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
if (__kmp_nested_proc_bind.used &&
@@ -1281,6 +1332,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
this_thr->th.th_team_nproc = 1;
this_thr->th.th_team_master = this_thr;
this_thr->th.th_team_serialized = 1;
+ this_thr->th.th_task_team = NULL;
+ this_thr->th.th_task_state = 0;
serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
@@ -1312,10 +1365,14 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
int level = this_thr->th.th_team->t.t_level;
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (serial_team->t.t_nested_nth)
+ nested_nth = serial_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
+
serial_team->t.t_level++;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
"of serial team %p to %d\n",
@@ -1332,6 +1389,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
}
this_thr->th.th_dispatch = serial_team->t.t_dispatch;
+ /* allocate/push task team stack */
+ __kmp_push_task_team_node(this_thr, serial_team);
+
KMP_MB();
}
KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
@@ -1743,14 +1803,8 @@ __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
__kmp_alloc_argv_entries(argc, team, TRUE);
team->t.t_argc = argc;
argv = (void **)team->t.t_argv;
- if (ap) {
- for (i = argc - 1; i >= 0; --i)
- *argv++ = va_arg(kmp_va_deref(ap), void *);
- } else {
- for (i = 0; i < argc; ++i)
- // Get args from parent team for teams construct
- argv[i] = parent_team->t.t_argv[i];
- }
+ for (i = argc - 1; i >= 0; --i)
+ *argv++ = va_arg(kmp_va_deref(ap), void *);
// AC: revert change made in __kmpc_serialized_parallel()
// because initial code in teams should have level=0
team->t.t_level--;
@@ -1991,17 +2045,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ap);
} // End parallel closely nested in teams construct
-#if KMP_DEBUG
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- }
-#endif
-
// Need this to happen before we determine the number of threads, not while
// we are allocating the team
//__kmp_push_current_task_to_thread(master_th, parent_team, 0);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
+
// Determine the number of threads
int enter_teams =
__kmp_is_entering_teams(active_level, level, teams_level, ap);
@@ -2074,9 +2123,18 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
- if ((level + 1 < __kmp_nested_nth.used) &&
- (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
- nthreads_icv = __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = NULL;
+ if (!master_th->th.th_set_nested_nth &&
+ (level + 1 < parent_team->t.t_nested_nth->used) &&
+ (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
+ nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
+ } else if (master_th->th.th_set_nested_nth) {
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ if ((level + 1 < nested_nth->used) &&
+ (nested_nth->nth[level + 1] != nthreads_icv))
+ nthreads_icv = nested_nth->nth[level + 1];
+ else
+ nthreads_icv = 0; // don't update
} else {
nthreads_icv = 0; // don't update
}
@@ -2185,6 +2243,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
+ // Check if hot team has potentially outdated list, and if so, free it
+ if (team->t.t_nested_nth &&
+ team->t.t_nested_nth != parent_team->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ team->t.t_nested_nth = NULL;
+ }
+ team->t.t_nested_nth = parent_team->t.t_nested_nth;
+ if (master_th->th.th_set_nested_nth) {
+ if (!nested_nth)
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ team->t.t_nested_nth = nested_nth;
+ KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
+ master_th->th.th_set_nested_nth = NULL;
+ master_th->th.th_set_nested_nth_sz = 0;
+ master_th->th.th_nt_strict = false;
+ }
+
// Update the floating point rounding in the team if required.
propagateFPControl(team);
#if OMPD_SUPPORT
@@ -2192,64 +2268,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
ompd_bp_parallel_begin();
#endif
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // Set primary thread's task team to team's task team. Unless this is hot
- // team, it should be NULL.
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- parent_team->t.t_task_team[master_th->th.th_task_state]);
- KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
- "%p, new task_team %p / team %p\n",
- __kmp_gtid_from_thread(master_th),
- master_th->th.th_task_team, parent_team,
- team->t.t_task_team[master_th->th.th_task_state], team));
-
- if (active_level || master_th->th.th_task_team) {
- // Take a memo of primary thread's task_state
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- if (master_th->th.th_task_state_top >=
- master_th->th.th_task_state_stack_sz) { // increase size
- kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
- kmp_uint8 *old_stack, *new_stack;
- kmp_uint32 i;
- new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
- for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
- new_stack[i] = master_th->th.th_task_state_memo_stack[i];
- }
- for (i = master_th->th.th_task_state_stack_sz; i < new_size;
- ++i) { // zero-init rest of stack
- new_stack[i] = 0;
- }
- old_stack = master_th->th.th_task_state_memo_stack;
- master_th->th.th_task_state_memo_stack = new_stack;
- master_th->th.th_task_state_stack_sz = new_size;
- __kmp_free(old_stack);
- }
- // Store primary thread's task_state on stack
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- master_th->th.th_task_state_top++;
-#if KMP_NESTED_HOT_TEAMS
- if (master_th->th.th_hot_teams &&
- active_level < __kmp_hot_teams_max_level &&
- team == master_th->th.th_hot_teams[active_level].hot_team) {
- // Restore primary thread's nested state if nested hot team
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- } else {
-#endif
- master_th->th.th_task_state = 0;
-#if KMP_NESTED_HOT_TEAMS
- }
-#endif
- }
-#if !KMP_NESTED_HOT_TEAMS
- KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
- (team == root->r.r_hot_team));
-#endif
- }
-
KA_TRACE(
20,
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
@@ -2457,8 +2475,7 @@ void __kmp_join_call(ident_t *loc, int gtid
__kmp_gtid_from_thread(master_th), team,
team->t.t_task_team[master_th->th.th_task_state],
master_th->th.th_task_team));
- KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
- team->t.t_task_team[master_th->th.th_task_state]);
+ KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
}
#endif
@@ -2696,24 +2713,11 @@ void __kmp_join_call(ident_t *loc, int gtid
}
if (__kmp_tasking_mode != tskm_immediate_exec) {
- if (master_th->th.th_task_state_top >
- 0) { // Restore task state from memo stack
- KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
- // Remember primary thread's state if we re-use this nested hot team
- master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
- master_th->th.th_task_state;
- --master_th->th.th_task_state_top; // pop
- // Now restore state at this level
- master_th->th.th_task_state =
- master_th->th
- .th_task_state_memo_stack[master_th->th.th_task_state_top];
- } else if (team != root->r.r_hot_team) {
- // Reset the task state of primary thread if we are not hot team because
- // in this case all the worker threads will be free, and their task state
- // will be reset. If not reset the primary's, the task state will be
- // inconsistent.
- master_th->th.th_task_state = 0;
- }
+ // Restore primary thread's task state from team structure
+ KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
+ team->t.t_primary_task_state == 1);
+ master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
+
// Copy the task team from the parent team to the primary thread
master_th->th.th_task_team =
parent_team->t.t_task_team[master_th->th.th_task_state];
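A minimal sketch (not part of the patch) of the bookkeeping that replaces the old per-thread task-state memo stack: the primary thread's task-state bit is parked on the team at fork and read back at join, so no growable stack is needed. Types and function names are invented for illustration.

struct team_sketch {
  int primary_task_state; // 0 or 1, parity of the task-team pair in use
};
struct thread_sketch {
  int task_state;
};

void fork_save(team_sketch &team, thread_sketch &primary) {
  team.primary_task_state = primary.task_state;
  primary.task_state = 0; // fresh teams start at parity 0 (hot teams adopt
                          // the workers' current parity instead)
}

void join_restore(team_sketch &team, thread_sketch &primary) {
  primary.task_state = team.primary_task_state;
}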
@@ -3390,6 +3394,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
root_team->t.t_serialized = 1;
// TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
root_team->t.t_sched.sched = r_sched.sched;
+ root_team->t.t_nested_nth = &__kmp_nested_nth;
KA_TRACE(
20,
("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
@@ -3427,6 +3432,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
// TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
hot_team->t.t_sched.sched = r_sched.sched;
hot_team->t.t_size_changed = 0;
+ hot_team->t.t_nested_nth = &__kmp_nested_nth;
}
#ifdef KMP_DEBUG
@@ -4025,7 +4031,7 @@ int __kmp_register_root(int initial_thread) {
__kmp_root_counter++;
#if OMPT_SUPPORT
- if (!initial_thread && ompt_enabled.enabled) {
+ if (ompt_enabled.enabled) {
kmp_info_t *root_thread = ompt_get_thread();
@@ -4293,6 +4299,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
else // no tasking --> always safe to reap
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
this_thr->th.th_set_proc_bind = proc_bind_default;
+
#if KMP_AFFINITY_SUPPORTED
this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
@@ -4402,17 +4409,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_next_pool = NULL;
- if (!this_thr->th.th_task_state_memo_stack) {
- size_t i;
- this_thr->th.th_task_state_memo_stack =
- (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
- this_thr->th.th_task_state_top = 0;
- this_thr->th.th_task_state_stack_sz = 4;
- for (i = 0; i < this_thr->th.th_task_state_stack_sz;
- ++i) // zero init the stack
- this_thr->th.th_task_state_memo_stack[i] = 0;
- }
-
KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
@@ -4437,8 +4433,10 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
#endif
KMP_MB();
- /* first, try to get one from the thread pool */
- if (__kmp_thread_pool) {
+ /* first, try to get one from the thread pool unless the allocating thread
+ * is the main hidden helper thread. The hidden helper team should always
+ * allocate new OS threads. */
+ if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
__kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
if (new_thr == __kmp_thread_pool_insert_pt) {
@@ -4467,8 +4465,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
TCW_4(__kmp_nth, __kmp_nth + 1);
new_thr->th.th_task_state = 0;
- new_thr->th.th_task_state_top = 0;
- new_thr->th.th_task_state_stack_sz = 4;
if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
// Make sure pool thread has transitioned to waiting on own thread struct
@@ -4503,7 +4499,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
}
/* no, well fork a new one */
- KMP_ASSERT(__kmp_nth == __kmp_all_nth);
+ KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
#if KMP_USE_MONITOR
@@ -4556,6 +4552,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
/* allocate space for it. */
new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
+ new_thr->th.th_nt_strict = false;
+ new_thr->th.th_nt_loc = NULL;
+ new_thr->th.th_nt_sev = severity_fatal;
+ new_thr->th.th_nt_msg = NULL;
+
TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
@@ -4666,6 +4667,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
new_thr->th.th_active_in_pool = FALSE;
TCW_4(new_thr->th.th_active, TRUE);
+ new_thr->th.th_set_nested_nth = NULL;
+ new_thr->th.th_set_nested_nth_sz = 0;
+
/* adjust the global counters */
__kmp_all_nth++;
__kmp_nth++;
@@ -5266,6 +5270,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
// Activate team threads via th_used_in_team
__kmp_add_threads_to_team(team, new_nproc);
}
+ // When decreasing team size, threads no longer in the team should
+ // unref task team.
+ if (__kmp_tasking_mode != tskm_immediate_exec) {
+ for (f = new_nproc; f < team->t.t_nproc; f++) {
+ kmp_info_t *th = team->t.t_threads[f];
+ KMP_DEBUG_ASSERT(th);
+ th->th.th_task_team = NULL;
+ }
+ }
#if KMP_NESTED_HOT_TEAMS
if (__kmp_hot_teams_mode == 0) {
// AC: saved number of threads should correspond to team's value in this
@@ -5276,11 +5289,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
/* release the extra threads we don't need any more */
for (f = new_nproc; f < team->t.t_nproc; f++) {
KMP_DEBUG_ASSERT(team->t.t_threads[f]);
- if (__kmp_tasking_mode != tskm_immediate_exec) {
- // When decreasing team size, threads no longer in the team should
- // unref task team.
- team->t.t_threads[f]->th.th_task_team = NULL;
- }
__kmp_free_thread(team->t.t_threads[f]);
team->t.t_threads[f] = NULL;
}
@@ -5376,7 +5384,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__kmp_reinitialize_team(team, new_icvs, NULL);
}
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
+#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
+ KMP_AFFINITY_SUPPORTED
/* Temporarily set full mask for primary thread before creation of
workers. The reason is that workers inherit the affinity from the
primary thread, so if a lot of workers are created on the single
@@ -5412,7 +5421,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
}
-#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
+#if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
+ KMP_AFFINITY_SUPPORTED
/* Restore initial primary thread's affinity mask */
new_temp_affinity.restore();
#endif
@@ -5456,7 +5466,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
} // Check changes in number of threads
- kmp_info_t *master = team->t.t_threads[0];
if (master->th.th_teams_microtask) {
for (f = 1; f < new_nproc; ++f) {
// propagate teams construct specific info to workers
@@ -5562,6 +5571,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__ompt_team_assign_id(team, ompt_parallel_data);
#endif
+ team->t.t_nested_nth = NULL;
+
KMP_MB();
return team;
@@ -5633,6 +5644,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KMP_MB();
+ team->t.t_nested_nth = NULL;
+
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
team->t.t_id));
@@ -5708,9 +5721,8 @@ void __kmp_free_team(kmp_root_t *root,
}
#endif
// first check if thread is sleeping
- kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
- if (fl.is_sleeping())
- fl.resume(__kmp_gtid_from_thread(th));
+ if (th->th.th_sleep_loc)
+ __kmp_null_resume_wrapper(th);
KMP_CPU_PAUSE();
}
}
@@ -5736,6 +5748,14 @@ void __kmp_free_team(kmp_root_t *root,
}
}
+ // Before clearing parent pointer, check if nested_nth list should be freed
+ if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
+ team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ }
+ team->t.t_nested_nth = NULL;
+
// Reset pointer to parent team only for non-hot teams.
team->t.t_parent = NULL;
team->t.t_level = 0;
@@ -6251,11 +6271,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
thread->th.th_pri_common = NULL;
}
- if (thread->th.th_task_state_memo_stack != NULL) {
- __kmp_free(thread->th.th_task_state_memo_stack);
- thread->th.th_task_state_memo_stack = NULL;
- }
-
#if KMP_USE_BGET
if (thread->th.th_local.bget_data != NULL) {
__kmp_finalize_bget(thread);
@@ -6757,11 +6772,11 @@ void __kmp_register_library_startup(void) {
int fd1 = -1;
shm_name = __kmp_str_format("/%s", name);
int shm_preexist = 0;
- fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+ fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
if ((fd1 == -1) && (errno == EEXIST)) {
// file didn't open because it already exists.
// try opening existing file
- fd1 = shm_open(shm_name, O_RDWR, 0666);
+ fd1 = shm_open(shm_name, O_RDWR, 0600);
if (fd1 == -1) { // file didn't open
KMP_WARNING(FunctionError, "Can't open SHM");
__kmp_shm_available = false;
@@ -6805,11 +6820,11 @@ void __kmp_register_library_startup(void) {
int fd1 = -1;
temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
int tmp_preexist = 0;
- fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
+ fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
if ((fd1 == -1) && (errno == EEXIST)) {
// file didn't open because it already exists.
// try opening existing file
- fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
+ fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
if (fd1 == -1) { // file didn't open
KMP_WARNING(FunctionError, "Can't open TEMP");
__kmp_tmp_available = false;
@@ -6949,7 +6964,7 @@ void __kmp_unregister_library(void) {
int fd1;
if (__kmp_shm_available) {
shm_name = __kmp_str_format("/%s", name);
- fd1 = shm_open(shm_name, O_RDONLY, 0666);
+ fd1 = shm_open(shm_name, O_RDONLY, 0600);
if (fd1 != -1) { // File opened successfully
char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
if (data1 != MAP_FAILED) {
@@ -7730,7 +7745,7 @@ int __kmp_invoke_task_func(int gtid) {
);
#if OMPT_SUPPORT
*exit_frame_p = NULL;
- this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
+ this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
#endif
#if KMP_STATS_ENABLED
@@ -7828,7 +7843,7 @@ int __kmp_invoke_teams_master(int gtid) {
#endif
__kmp_teams_master(gtid);
#if OMPT_SUPPORT
- this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
+ this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
#endif
__kmp_run_after_invoked_task(gtid, 0, this_thr, team);
return 1;
@@ -7838,7 +7853,6 @@ int __kmp_invoke_teams_master(int gtid) {
encountered by this team. since this should be enclosed in the forkjoin
critical section it should avoid race conditions with asymmetrical nested
parallelism */
-
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
kmp_info_t *thr = __kmp_threads[gtid];
@@ -7846,6 +7860,39 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
thr->th.th_set_nproc = num_threads;
}
+void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
+ int *num_threads_list) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+
+ KMP_DEBUG_ASSERT(list_length > 1);
+
+ if (num_threads_list[0] > 0)
+ thr->th.th_set_nproc = num_threads_list[0];
+ thr->th.th_set_nested_nth =
+ (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
+ for (kmp_uint32 i = 0; i < list_length; ++i)
+ thr->th.th_set_nested_nth[i] = num_threads_list[i];
+ thr->th.th_set_nested_nth_sz = list_length;
+}
+
+void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+ const char *msg) {
+ kmp_info_t *thr = __kmp_threads[gtid];
+ thr->th.th_nt_strict = true;
+ thr->th.th_nt_loc = loc;
+ // if sev is unset make fatal
+ if (sev == severity_warning)
+ thr->th.th_nt_sev = sev;
+ else
+ thr->th.th_nt_sev = severity_fatal;
+ // if msg is unset, use an appropriate message
+ if (msg)
+ thr->th.th_nt_msg = msg;
+ else
+ thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
+ "strict num_threads clause.";
+}
+
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
int num_threads) {
KMP_DEBUG_ASSERT(thr);
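A minimal sketch (not part of the patch) of the user-visible behaviour behind __kmp_push_num_threads_list: the first list entry sizes the immediately forthcoming region and the remaining entries apply to deeper nesting, much like a comma-separated OMP_NUM_THREADS value such as "8,4". The example below approximates that with explicit per-level num_threads clauses.

#include <cstdio>
#include <omp.h>

int main() {
  omp_set_max_active_levels(2); // allow one level of nested parallelism
  #pragma omp parallel num_threads(8)   // outer level: request 8 threads
  {
    #pragma omp parallel num_threads(4) // nested level: request 4 threads each
    {
      #pragma omp single
      std::printf("inner team size: %d\n", omp_get_num_threads());
    }
  }
  return 0;
}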
@@ -8079,8 +8126,10 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
__kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
+ ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
if (ompt_enabled.enabled &&
- this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
+ (ompt_state == ompt_state_wait_barrier_teams ||
+ ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
@@ -8091,15 +8140,16 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, task_data, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
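A minimal sketch (not part of the patch) of an OMPT tool callback that observes the distinction introduced above, where the league-level join barrier is reported as ompt_sync_region_barrier_teams rather than the generic implicit-parallel barrier. Registration boilerplate is omitted and the handler name is invented.

#include <omp-tools.h>
#include <cstdio>

static void on_sync_region(ompt_sync_region_t kind, ompt_scope_endpoint_t ep,
                           ompt_data_t *parallel_data, ompt_data_t *task_data,
                           const void *codeptr_ra) {
  (void)parallel_data;
  (void)task_data;
  (void)codeptr_ra;
  if (ep != ompt_scope_end)
    return;
  if (kind == ompt_sync_region_barrier_teams)
    std::printf("teams join barrier finished\n");
  else if (kind == ompt_sync_region_barrier_implicit_parallel)
    std::printf("parallel join barrier finished\n");
}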
@@ -8302,6 +8352,7 @@ void __kmp_cleanup(void) {
__kmp_nested_nth.nth = NULL;
__kmp_nested_nth.size = 0;
__kmp_nested_nth.used = 0;
+
KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
__kmp_nested_proc_bind.bind_types = NULL;
__kmp_nested_proc_bind.size = 0;
@@ -8929,7 +8980,7 @@ __kmp_determine_reduction_method(
// KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
- KMP_ARCH_WASM || KMP_ARCH_PPC
+ KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
index 53182bef5873..2e0dfac6eeb3 100644
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -52,6 +52,7 @@ char const *traits_t<long>::spec = "ld";
} else if (i > 0) { \
t = (u - l) / i + 1; \
} else { \
+ KMP_DEBUG_ASSERT(i != 0); \
t = (l - u) / (-i) + 1; \
} \
KMP_COUNT_VALUE(stat, t); \
@@ -102,7 +103,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
#if OMPT_SUPPORT && OMPT_OPTIONAL
ompt_team_info_t *team_info = NULL;
ompt_task_info_t *task_info = NULL;
- ompt_work_t ompt_work_type = ompt_work_loop;
+ ompt_work_t ompt_work_type = ompt_work_loop_static;
static kmp_int8 warn = 0;
@@ -113,7 +114,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// Determine workshare type
if (loc != NULL) {
if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
- ompt_work_type = ompt_work_loop;
+ ompt_work_type = ompt_work_loop_static;
} else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
ompt_work_type = ompt_work_sections;
} else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
@@ -284,6 +285,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
@@ -318,6 +320,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
if (plastiter != NULL)
*plastiter = (tid == trip_count - 1);
} else {
+ KMP_DEBUG_ASSERT(nth != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT small_chunk = trip_count / nth;
UT extras = trip_count % nth;
@@ -358,6 +361,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
case kmp_sch_static_chunked: {
ST span;
UT nchunks;
+ KMP_DEBUG_ASSERT(chunk != 0);
if (chunk < 1)
chunk = 1;
else if ((UT)chunk > trip_count)
@@ -383,6 +387,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
}
case kmp_sch_static_balanced_chunked: {
T old_upper = *pupper;
+ KMP_DEBUG_ASSERT(nth != 0);
// round up to make sure the chunk is enough to cover all iterations
UT span = (trip_count + nth - 1) / nth;
@@ -398,8 +403,10 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
} else if (*pupper < old_upper)
*pupper = old_upper;
- if (plastiter != NULL)
+ if (plastiter != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
*plastiter = (tid == ((trip_count - 1) / (UT)chunk));
+ }
break;
}
default:
@@ -417,6 +424,7 @@ static void __kmp_for_static_init(ident_t *loc, kmp_int32 global_tid,
// Calculate chunk in case it was not specified; it is specified for
// kmp_sch_static_chunked
if (schedtype == kmp_sch_static) {
+ KMP_DEBUG_ASSERT(nth != 0);
cur_chunk = trip_count / nth + ((trip_count % nth) ? 1 : 0);
}
// 0 - "static" schedule
@@ -547,6 +555,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupper - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
}
@@ -568,6 +577,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
*plastiter = (tid == 0 && team_id == trip_count - 1);
} else {
// Get the team's chunk first (each team gets at most one chunk)
+ KMP_DEBUG_ASSERT(nteams != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT chunkD = trip_count / nteams;
UT extras = trip_count % nteams;
@@ -619,6 +629,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(*pupperDist - *plower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(*plower - *pupperDist) / (-incr) + 1;
}
KMP_DEBUG_ASSERT(trip_count);
@@ -637,6 +648,7 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
if (*plastiter != 0 && !(tid == trip_count - 1))
*plastiter = 0;
} else {
+ KMP_DEBUG_ASSERT(nth != 0);
if (__kmp_static == kmp_sch_static_balanced) {
UT chunkL = trip_count / nth;
UT extras = trip_count % nth;
@@ -684,9 +696,11 @@ static void __kmp_dist_for_static_init(ident_t *loc, kmp_int32 gtid,
*pstride = span * nth;
*plower = *plower + (span * tid);
*pupper = *plower + span - incr;
- if (plastiter != NULL)
+ if (plastiter != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
if (*plastiter != 0 && !(tid == ((trip_count - 1) / (UT)chunk) % nth))
*plastiter = 0;
+ }
break;
}
default:
@@ -809,6 +823,7 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
// upper-lower can exceed the limit of signed type
trip_count = (UT)(upper - lower) / incr + 1;
} else {
+ KMP_DEBUG_ASSERT(incr != 0);
trip_count = (UT)(lower - upper) / (-incr) + 1;
}
if (chunk < 1)
@@ -817,8 +832,10 @@ static void __kmp_team_static_init(ident_t *loc, kmp_int32 gtid,
*p_st = span * nteams;
*p_lb = lower + (span * team_id);
*p_ub = *p_lb + span - incr;
- if (p_last != NULL)
+ if (p_last != NULL) {
+ KMP_DEBUG_ASSERT(chunk != 0);
*p_last = (team_id == ((trip_count - 1) / (UT)chunk) % nteams);
+ }
// Correct upper bound if needed
if (incr > 0) {
if (*p_ub < *p_lb) // overflow?
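A minimal sketch (not part of the patch) of the balanced static split whose divisions the new assertions guard: once incr and nth are known to be non-zero, each thread's share is small_chunk plus one extra iteration for the first trip_count % nth threads. The helper below mirrors that arithmetic for a unit-stride loop; its name is invented.

#include <cassert>
#include <cstdint>

// Returns the half-open iteration range [begin, end) owned by thread `tid`.
void balanced_static_split(std::int64_t trip_count, int nth, int tid,
                           std::int64_t *begin, std::int64_t *end) {
  assert(nth != 0 && tid >= 0 && tid < nth);
  std::int64_t small_chunk = trip_count / nth;
  std::int64_t extras = trip_count % nth;
  *begin = tid * small_chunk + (tid < extras ? tid : extras);
  *end = *begin + small_chunk + (tid < extras ? 1 : 0);
}
// With trip_count = 10 and nth = 4 the ranges are [0,3), [3,6), [6,8), [8,10).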
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index d2157b10b781..8b6092cb1085 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -255,8 +255,13 @@ static void __kmp_stg_parse_bool(char const *name, char const *value,
// placed here in order to use __kmp_round4k static function
void __kmp_check_stksize(size_t *val) {
// if system stack size is too big then limit the size for worker threads
+#if KMP_OS_AIX
+ if (*val > KMP_DEFAULT_STKSIZE * 2) // Use 2 times, 16 is too large for AIX.
+ *val = KMP_DEFAULT_STKSIZE * 2;
+#else
if (*val > KMP_DEFAULT_STKSIZE * 16) // just a heuristics...
*val = KMP_DEFAULT_STKSIZE * 16;
+#endif
if (*val < __kmp_sys_min_stksize)
*val = __kmp_sys_min_stksize;
if (*val > KMP_MAX_STKSIZE)
@@ -4368,8 +4373,8 @@ static void __kmp_stg_parse_omp_schedule(char const *name, char const *value,
void *data) {
size_t length;
const char *ptr = value;
- SKIP_WS(ptr);
- if (value) {
+ if (ptr) {
+ SKIP_WS(ptr);
length = KMP_STRLEN(value);
if (length) {
if (value[length - 1] == '"' || value[length - 1] == '\'')
@@ -4884,9 +4889,6 @@ static void __kmp_stg_parse_spin_backoff_params(const char *name,
if (num <= 0) { // The number of retries should be > 0
msg = KMP_I18N_STR(ValueTooSmall);
num = 1;
- } else if (num > KMP_INT_MAX) {
- msg = KMP_I18N_STR(ValueTooLarge);
- num = KMP_INT_MAX;
}
if (msg != NULL) {
// Message is not empty. Print warning.
@@ -4983,9 +4985,6 @@ static void __kmp_stg_parse_adaptive_lock_props(const char *name,
if (num < 0) { // The number of retries should be >= 0
msg = KMP_I18N_STR(ValueTooSmall);
num = 1;
- } else if (num > KMP_INT_MAX) {
- msg = KMP_I18N_STR(ValueTooLarge);
- num = KMP_INT_MAX;
}
if (msg != NULL) {
// Message is not empty. Print warning.
@@ -6421,6 +6420,8 @@ void __kmp_env_initialize(char const *string) {
}
if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
(__kmp_nested_proc_bind.bind_types[0] != proc_bind_default)) {
+ if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)
+ __kmp_affinity.type = affinity_none;
if (__kmp_affinity.type == affinity_default) {
__kmp_affinity.type = affinity_compact;
__kmp_affinity.flags.dups = FALSE;
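A minimal sketch (not part of the patch) of the clamp __kmp_check_stksize now applies, with the multiplier reduced from 16 to 2 on AIX; the free-standing form and parameter names are invented for illustration.

#include <cstddef>

std::size_t clamp_stack_size(std::size_t val, std::size_t def_size,
                             std::size_t min_size, std::size_t max_size,
                             bool is_aix) {
  const std::size_t cap = def_size * (is_aix ? 2 : 16); // smaller cap on AIX
  if (val > cap)
    val = cap;
  if (val < min_size)
    val = min_size;
  if (val > max_size)
    val = max_size;
  return val;
}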
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index f7529481393f..39cf3496c5a1 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -739,7 +739,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
&(current_task->ompt_task_info.task_data),
&(current_task->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
}
@@ -1030,6 +1030,12 @@ void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
__kmp_task_stealing_constraint);
}
+ // Wait until the last __kmp_release_deps is finished before we free the
+ // current stack frame holding the "node" variable; once its nrefs count
+ // reaches 1, we're sure nobody else can try to reference it again.
+ while (node.dn.nrefs > 1)
+ KMP_YIELD(TRUE);
+
#if OMPT_SUPPORT
__ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
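A minimal sketch (not part of the patch) of the lifetime rule the new loop enforces: a stack-allocated dependence node must not leave scope while other threads still hold references, so the owner yields until its own reference is the last one. Types and names are simplified stand-ins.

#include <atomic>
#include <thread>

struct dep_node_sketch {
  std::atomic<int> nrefs{1}; // the owner's own reference
};

void wait_until_sole_owner(dep_node_sketch &node) {
  while (node.nrefs.load(std::memory_order_acquire) > 1)
    std::this_thread::yield(); // analogous to KMP_YIELD(TRUE)
}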
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6e8b948efa06..936afb91ac2f 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -811,8 +811,7 @@ static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent_info->task_data), &(parent_info->frame),
&(taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
- return_address);
+ TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
}
__ompt_task_start(task, current_task, gtid);
}
@@ -1156,6 +1155,11 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
// Note: no need to translate gtid to its shadow. If the current thread is a
// hidden helper one, then the gtid is already correct. Otherwise, hidden
// helper threads are disabled, and gtid refers to a OpenMP thread.
+#if OMPT_SUPPORT
+ if (ompt) {
+ __ompt_task_finish(task, resumed_task, ompt_task_switch);
+ }
+#endif
__kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
if (KMP_HIDDEN_HELPER_THREAD(gtid))
__kmp_hidden_helper_worker_thread_signal();
@@ -1511,8 +1515,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
KA_TRACE(30,
("T#%d creating task team in __kmp_task_alloc for proxy task\n",
gtid));
- // 1 indicates setup the current team regardless of nthreads
- __kmp_task_team_setup(thread, team, 1);
+ __kmp_task_team_setup(thread, team);
thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
}
kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -1712,6 +1715,7 @@ kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
// target task is untied defined in the specification
input_flags.tiedness = TASK_UNTIED;
+ input_flags.target = 1;
if (__kmp_enable_hidden_helper)
input_flags.hidden_helper = TRUE;
@@ -1943,6 +1947,11 @@ __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
#endif
__kmp_task_finish<false>(gtid, task, current_task);
}
+#if OMPT_SUPPORT
+ else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
+ __ompt_task_finish(task, current_task, ompt_task_switch);
+ }
+#endif
KA_TRACE(
30,
@@ -1975,7 +1984,8 @@ kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
if (ompt_enabled.ompt_callback_task_create) {
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
- &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
+ &(new_taskdata->ompt_task_info.task_data),
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_GET_RETURN_ADDRESS(0));
}
}
@@ -2133,7 +2143,7 @@ kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
&(parent->ompt_task_info.task_data),
&(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
OMPT_LOAD_RETURN_ADDRESS(gtid));
}
} else {
@@ -2194,8 +2204,7 @@ kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
ompt_callbacks.ompt_callback(ompt_callback_task_create)(
&(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
&(new_taskdata->ompt_task_info.task_data),
- ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
- codeptr_ra);
+ TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
}
}
#endif
@@ -2662,8 +2671,8 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
if (tg == NULL)
tg = thread->th.th_current_task->td_taskgroup;
KMP_ASSERT(tg != NULL);
- kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
- kmp_int32 num = tg->reduce_num_data;
+ kmp_taskred_data_t *arr;
+ kmp_int32 num;
kmp_int32 tid = thread->th.th_info.ds.ds_tid;
#if OMPX_TASKGRAPH
@@ -2680,6 +2689,8 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
KMP_ASSERT(data != NULL);
while (tg != NULL) {
+ arr = (kmp_taskred_data_t *)(tg->reduce_data);
+ num = tg->reduce_num_data;
for (int i = 0; i < num; ++i) {
if (!arr[i].flags.lazy_priv) {
if (data == arr[i].reduce_shar ||
@@ -2713,8 +2724,6 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
}
KMP_ASSERT(tg->parent);
tg = tg->parent;
- arr = (kmp_taskred_data_t *)(tg->reduce_data);
- num = tg->reduce_num_data;
}
KMP_ASSERT2(0, "Unknown task reduction item");
return NULL; // ERROR, this line never executed
@@ -3219,7 +3228,7 @@ static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
-static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
+static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
kmp_task_team_t *task_team,
std::atomic<kmp_int32> *unfinished_threads,
int *thread_finished,
@@ -3229,15 +3238,18 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
kmp_taskdata_t *current;
kmp_thread_data_t *victim_td, *threads_data;
kmp_int32 target;
- kmp_int32 victim_tid;
+ kmp_info_t *victim_thr;
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
threads_data = task_team->tt.tt_threads_data;
KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
+ KMP_DEBUG_ASSERT(victim_tid >= 0);
+ KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_nproc);
- victim_tid = victim_thr->th.th_info.ds.ds_tid;
victim_td = &threads_data[victim_tid];
+ victim_thr = victim_td->td.td_thr;
+ (void)victim_thr; // Use in TRACE messages which aren't always enabled.
KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
"task_team=%p ntasks=%d head=%u tail=%u\n",
@@ -3387,8 +3399,6 @@ static inline int __kmp_execute_tasks_template(
nthreads = task_team->tt.tt_nproc;
unfinished_threads = &(task_team->tt.tt_unfinished_threads);
- KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
- task_team->tt.tt_hidden_helper_task_encountered);
KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -3452,9 +3462,9 @@ static inline int __kmp_execute_tasks_template(
if (!asleep) {
// We have a victim to try to steal from
- task = __kmp_steal_task(other_thread, gtid, task_team,
- unfinished_threads, thread_finished,
- is_constrained);
+ task =
+ __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
+ thread_finished, is_constrained);
}
if (task != NULL) { // set last stolen to victim
if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
@@ -3940,6 +3950,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
__kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
}
+static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
+ kmp_team_t *team) {
+ int team_nth = team->t.t_nproc;
+ // Only need to init if the task team isn't active or the team size changed
+ if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
+ TCW_4(task_team->tt.tt_found_tasks, FALSE);
+ TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+ TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
+ TCW_4(task_team->tt.tt_nproc, team_nth);
+ KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
+ TCW_4(task_team->tt.tt_active, TRUE);
+ }
+}
+
// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
@@ -3947,7 +3971,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
kmp_team_t *team) {
kmp_task_team_t *task_team = NULL;
- int nthreads;
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
(thread ? __kmp_gtid_from_thread(thread) : -1), team));
@@ -3989,14 +4012,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
// task_team->tt.tt_next = NULL;
}
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
-
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- TCW_4(task_team->tt.tt_active, TRUE);
+ __kmp_task_team_init(task_team, team);
KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
"unfinished_threads init'd to %d\n",
@@ -4050,6 +4066,40 @@ void __kmp_reap_task_teams(void) {
}
}
+// View the array of two task team pointers as a pair of pointers:
+// 1) a single task_team pointer
+// 2) next pointer for stack
+// Serial teams can create a stack of task teams for nested serial teams.
+void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ kmp_task_team_list_t *node =
+ (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
+ node->task_team = current->task_team;
+ node->next = current->next;
+ thread->th.th_task_team = current->task_team = NULL;
+ current->next = node;
+}
+
+// Serial team pops a task team off the stack
+void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ kmp_task_team_list_t *current =
+ (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+ if (current->task_team) {
+ __kmp_free_task_team(thread, current->task_team);
+ }
+ kmp_task_team_list_t *next = current->next;
+ if (next) {
+ current->task_team = next->task_team;
+ current->next = next->next;
+ KMP_DEBUG_ASSERT(next != current);
+ __kmp_free(next);
+ thread->th.th_task_team = current->task_team;
+ }
+}
+
// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
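A minimal sketch (not part of the patch) of the intrusive stack formed by __kmp_push_task_team_node and __kmp_pop_task_team_node: the serial team's two task-team slots are reinterpreted as a head pair {task_team, next}; pushing parks the current pair in a heap node and clears the head, popping releases the departing level's task team and restores the previous pair. Types below are simplified stand-ins.

struct task_team_node_sketch {
  void *task_team;             // payload: the serial level's task team
  task_team_node_sketch *next; // older serial levels
};

void push_level(task_team_node_sketch *head) {
  task_team_node_sketch *node =
      new task_team_node_sketch{head->task_team, head->next};
  head->task_team = nullptr; // the new serial level starts with no task team
  head->next = node;
}

void pop_level(task_team_node_sketch *head) {
  // (the runtime also releases the departing level's task team here)
  if (task_team_node_sketch *node = head->next) {
    head->task_team = node->task_team;
    head->next = node->next;
    delete node;
  }
}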
@@ -4114,55 +4164,34 @@ void __kmp_wait_to_unref_task_teams(void) {
}
}
-void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
- // Shift values from th_task_state_top+1 to task_state_stack_sz
- if (this_thr->th.th_task_state_top + 1 >=
- this_thr->th.th_task_state_stack_sz) { // increase size
- kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
- kmp_uint8 *old_stack, *new_stack;
- kmp_uint32 i;
- new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
- for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
- new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
- }
- // If we need to reallocate do the shift at the same time.
- for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
- new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
- }
- for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
- ++i) { // zero-init rest of stack
- new_stack[i] = 0;
- }
- old_stack = this_thr->th.th_task_state_memo_stack;
- this_thr->th.th_task_state_memo_stack = new_stack;
- this_thr->th.th_task_state_stack_sz = new_size;
- __kmp_free(old_stack);
- } else {
- kmp_uint8 *end;
- kmp_uint32 i;
-
- end = &this_thr->th
- .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
-
- for (i = this_thr->th.th_task_state_stack_sz - 1;
- i > this_thr->th.th_task_state_top; i--, end--)
- end[0] = end[-1];
- }
- this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
- value;
-}
-
// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
-void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ // For the serial and root teams, set up the first task team pointer to point
+ // to the task team. The other pointer is a stack of task teams from previous
+ // serial levels.
+ if (team == this_thr->th.th_serial_team ||
+ team == this_thr->th.th_root->r.r_root_team) {
+ KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+ if (team->t.t_task_team[0] == NULL) {
+ team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(
+ 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+ " for serial/root team %p\n",
+ __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+
+ } else
+ __kmp_task_team_init(team->t.t_task_team[0], team);
+ return;
+ }
+
// If this task_team hasn't been created yet, allocate it. It will be used in
// the region after the next.
// If it exists, it is the current task team and shouldn't be touched yet as
// it may still be in use.
- if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
- (always || team->t.t_nproc > 1)) {
+ if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
team->t.t_task_team[this_thr->th.th_task_state] =
__kmp_allocate_task_team(this_thr, team);
KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -4171,52 +4200,31 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
this_thr->th.th_task_state));
}
- if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
- // fix task state stack to adjust for proxy and helper tasks
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
- " for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr), team->t.t_id,
- this_thr->th.th_task_state));
- __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
- }
// After threads exit the release, they will call sync, and then point to this
// other task_team; make sure it is allocated and properly initialized. As
// threads spin in the barrier release phase, they will continue to use the
// previous task_team struct(above), until they receive the signal to stop
// checking for tasks (they can't safely reference the kmp_team_t struct,
- // which could be reallocated by the primary thread). No task teams are formed
- // for serialized teams.
- if (team->t.t_nproc > 1) {
- int other_team = 1 - this_thr->th.th_task_state;
- KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
- if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
- team->t.t_task_team[other_team] =
- __kmp_allocate_task_team(this_thr, team);
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
- "task_team %p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- } else { // Leave the old task team struct in place for the upcoming region;
- // adjust as needed
- kmp_task_team_t *task_team = team->t.t_task_team[other_team];
- if (!task_team->tt.tt_active ||
- team->t.t_nproc != task_team->tt.tt_nproc) {
- TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
- TCW_4(task_team->tt.tt_found_tasks, FALSE);
- TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
- TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
- KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
- team->t.t_nproc);
- TCW_4(task_team->tt.tt_active, TRUE);
- }
- // if team size has changed, the first thread to enable tasking will
- // realloc threads_data if necessary
- KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
- "%p for team %d at parity=%d\n",
- __kmp_gtid_from_thread(this_thr),
- team->t.t_task_team[other_team], team->t.t_id, other_team));
- }
+ // which could be reallocated by the primary thread).
+ int other_team = 1 - this_thr->th.th_task_state;
+ KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
+ if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+ team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
+ "task_team %p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
+ } else { // Leave the old task team struct in place for the upcoming region;
+ // adjust as needed
+ kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+ __kmp_task_team_init(task_team, team);
+ // if team size has changed, the first thread to enable tasking will
+ // realloc threads_data if necessary
+ KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
+ "%p for team %d at parity=%d\n",
+ __kmp_gtid_from_thread(this_thr),
+ team->t.t_task_team[other_team], team->t.t_id, other_team));
}
// For regular thread, task enabling should be called when the task is going
@@ -4242,9 +4250,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
-// called by any thread, but only for teams with # threads > 1.
+// called by any thread. This is not called for serial or root teams.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
+ KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
// Toggle the th_task_state field, to switch which task_team this thread
// refers to
@@ -4262,8 +4272,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
}
// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
-// barrier gather phase. Only called by primary thread if #threads in team > 1
-// or if proxy tasks were created.
+// barrier gather phase. Only called by the primary thread.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, primary
@@ -4297,9 +4306,6 @@ void __kmp_task_team_wait(
("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
"setting active to false, setting local and team's pointer to NULL\n",
__kmp_gtid_from_thread(this_thr), task_team));
- KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
- task_team->tt.tt_found_proxy_tasks == TRUE ||
- task_team->tt.tt_hidden_helper_task_encountered == TRUE);
TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
diff --git a/openmp/runtime/src/kmp_threadprivate.cpp b/openmp/runtime/src/kmp_threadprivate.cpp
index b79ac7d6d2b2..c4a1ec6e1023 100644
--- a/openmp/runtime/src/kmp_threadprivate.cpp
+++ b/openmp/runtime/src/kmp_threadprivate.cpp
@@ -248,16 +248,16 @@ void __kmp_common_destroy_gtid(int gtid) {
if (d_tn->is_vec) {
if (d_tn->dt.dtorv != 0) {
(void)(*d_tn->dt.dtorv)(tn->par_addr, d_tn->vec_len);
- }
- if (d_tn->obj_init != 0) {
- (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+ if (d_tn->obj_init != 0) {
+ (void)(*d_tn->dt.dtorv)(d_tn->obj_init, d_tn->vec_len);
+ }
}
} else {
if (d_tn->dt.dtor != 0) {
(void)(*d_tn->dt.dtor)(tn->par_addr);
- }
- if (d_tn->obj_init != 0) {
- (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+ if (d_tn->obj_init != 0) {
+ (void)(*d_tn->dt.dtor)(d_tn->obj_init);
+ }
}
}
}
diff --git a/openmp/runtime/src/kmp_utility.cpp b/openmp/runtime/src/kmp_utility.cpp
index f901eaca92f4..bfa450c9ced2 100644
--- a/openmp/runtime/src/kmp_utility.cpp
+++ b/openmp/runtime/src/kmp_utility.cpp
@@ -28,68 +28,6 @@ static const char *unknown = "unknown";
static int trace_level = 5;
#endif
-/* LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
- * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID
- * PHY_ID = APIC_ID >> LOG_ID_BITS
- */
-int __kmp_get_physical_id(int log_per_phy, int apic_id) {
- int index_lsb, index_msb, temp;
-
- if (log_per_phy > 1) {
- index_lsb = 0;
- index_msb = 31;
-
- temp = log_per_phy;
- while ((temp & 1) == 0) {
- temp >>= 1;
- index_lsb++;
- }
-
- temp = log_per_phy;
- while ((temp & 0x80000000) == 0) {
- temp <<= 1;
- index_msb--;
- }
-
- /* If >1 bits were set in log_per_phy, choose next higher power of 2 */
- if (index_lsb != index_msb)
- index_msb++;
-
- return ((int)(apic_id >> index_msb));
- }
-
- return apic_id;
-}
-
-/*
- * LOG_ID_BITS = ( 1 + floor( log_2( max( log_per_phy - 1, 1 ))))
- * APIC_ID = (PHY_ID << LOG_ID_BITS) | LOG_ID
- * LOG_ID = APIC_ID & (( 1 << LOG_ID_BITS ) - 1 )
- */
-int __kmp_get_logical_id(int log_per_phy, int apic_id) {
- unsigned current_bit;
- int bits_seen;
-
- if (log_per_phy <= 1)
- return (0);
-
- bits_seen = 0;
-
- for (current_bit = 1; log_per_phy != 0; current_bit <<= 1) {
- if (log_per_phy & current_bit) {
- log_per_phy &= ~current_bit;
- bits_seen++;
- }
- }
-
- /* If exactly 1 bit was set in log_per_phy, choose next lower power of 2 */
- if (bits_seen == 1) {
- current_bit >>= 1;
- }
-
- return ((int)((current_bit - 1) & apic_id));
-}
-
static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
char const *frequency // I: Float number and unit: MHz, GHz, or THz.
) {
@@ -122,7 +60,6 @@ static kmp_uint64 __kmp_parse_frequency( // R: Frequency in Hz.
void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
struct kmp_cpuid buf;
int max_arg;
- int log_per_phy;
#ifdef KMP_DEBUG
int cflush_size;
#endif
@@ -227,11 +164,8 @@ void __kmp_query_cpuid(kmp_cpuinfo_t *p) {
if ((buf.edx >> 28) & 1) {
/* Bits 23-16: Logical Processors per Physical Processor (1 for P4) */
- log_per_phy = data[2];
p->apic_id = data[3]; /* Bits 31-24: Processor Initial APIC ID (X) */
- KA_TRACE(trace_level, (" HT(%d TPUs)", log_per_phy));
- p->physical_id = __kmp_get_physical_id(log_per_phy, p->apic_id);
- p->logical_id = __kmp_get_logical_id(log_per_phy, p->apic_id);
+ KA_TRACE(trace_level, (" HT(%d TPUs)", data[2]));
}
#ifdef KMP_DEBUG
if ((buf.edx >> 29) & 1) {
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index 12d5d0677a90..97db68943da7 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -323,19 +323,21 @@ static void __ompt_implicit_task_end(kmp_info_t *this_thr,
ompt_state_t ompt_state,
ompt_data_t *tId) {
int ds_tid = this_thr->th.th_info.ds.ds_tid;
- if (ompt_state == ompt_state_wait_barrier_implicit) {
+ if (ompt_state == ompt_state_wait_barrier_implicit_parallel ||
+ ompt_state == ompt_state_wait_barrier_teams) {
this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
void *codeptr = NULL;
+ ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
+ if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ sync_kind = ompt_sync_region_barrier_teams;
if (ompt_enabled.ompt_callback_sync_region_wait) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, tId, codeptr);
}
if (ompt_enabled.ompt_callback_sync_region) {
ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
- ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId,
- codeptr);
+ sync_kind, ompt_scope_end, NULL, tId, codeptr);
}
#endif
if (!KMP_MASTER_TID(ds_tid)) {
@@ -455,7 +457,9 @@ final_spin=FALSE)
ompt_data_t *tId;
if (ompt_enabled.enabled) {
ompt_entry_state = this_thr->th.ompt_thread_info.state;
- if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit ||
+ if (!final_spin ||
+ (ompt_entry_state != ompt_state_wait_barrier_implicit_parallel &&
+ ompt_entry_state != ompt_state_wait_barrier_teams) ||
KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) {
ompt_lw_taskteam_t *team = NULL;
if (this_thr->th.th_team)
diff --git a/openmp/runtime/src/ompt-general.cpp b/openmp/runtime/src/ompt-general.cpp
index 95aab6cd79e5..e07c5ff4fcc4 100644
--- a/openmp/runtime/src/ompt-general.cpp
+++ b/openmp/runtime/src/ompt-general.cpp
@@ -915,22 +915,16 @@ _OMP_EXTERN void ompt_libomp_connect(ompt_start_tool_result_t *result) {
// Ensure libomp callbacks have been added if not already
__ompt_force_initialization();
- if (ompt_enabled.enabled &&
- // Callbacks are initiated only if the device initialize callback
- // has been registered by the tool
- ompt_callbacks.ompt_callback(ompt_callback_device_initialize)) {
- if (result) {
- OMPT_VERBOSE_INIT_PRINT(
- "libomp --> OMPT: Connecting with libomptarget\n");
- // Pass in the libomp lookup function so that the already registered
- // functions can be extracted and assigned to the callbacks in
- // libomptarget
- result->initialize(ompt_libomp_target_fn_lookup,
- /* initial_device_num */ 0, /* tool_data */ nullptr);
- // Track the object provided by libomptarget so that the finalizer can be
- // called during OMPT finalization
- libomptarget_ompt_result = result;
- }
+ if (ompt_enabled.enabled && result) {
+ OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Connecting with libomptarget\n");
+ // Pass in the libomp lookup function so that the already registered
+ // functions can be extracted and assigned to the callbacks in
+ // libomptarget
+ result->initialize(ompt_libomp_target_fn_lookup,
+ /* initial_device_num */ 0, /* tool_data */ nullptr);
+ // Track the object provided by libomptarget so that the finalizer can be
+ // called during OMPT finalization
+ libomptarget_ompt_result = result;
}
OMPT_VERBOSE_INIT_PRINT("libomp --> OMPT: Exit ompt_libomp_connect\n");
}
diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h
index 0d77413d5490..580a7c2ac791 100644
--- a/openmp/runtime/src/ompt-internal.h
+++ b/openmp/runtime/src/ompt-internal.h
@@ -50,6 +50,10 @@ typedef struct ompt_callbacks_active_s {
: 0x0) | \
((!(info->td_flags.tiedness)) ? ompt_task_untied : 0x0) | \
(info->td_flags.final ? ompt_task_final : 0x0) | \
+ (info->td_flags.target \
+ ? ompt_task_target \
+ : (info->td_flags.tasktype ? ompt_task_explicit \
+ : ompt_task_implicit)) | \
(info->td_flags.merged_if0 ? ompt_task_mergeable : 0x0)
typedef struct {
diff --git a/openmp/runtime/src/ompt-specific.cpp b/openmp/runtime/src/ompt-specific.cpp
index 9743f35d2c4f..0737c0cdfb16 100644
--- a/openmp/runtime/src/ompt-specific.cpp
+++ b/openmp/runtime/src/ompt-specific.cpp
@@ -421,9 +421,7 @@ int __ompt_get_task_info_internal(int ancestor_level, int *type,
team_info = &team->t.ompt_team_info;
if (type) {
if (taskdata->td_parent) {
- *type = (taskdata->td_flags.tasktype ? ompt_task_explicit
- : ompt_task_implicit) |
- TASK_TYPE_DETAILS_FORMAT(taskdata);
+ *type = TASK_TYPE_DETAILS_FORMAT(taskdata);
} else {
*type = ompt_task_initial;
}
@@ -505,22 +503,23 @@ static uint64_t __ompt_get_unique_id_internal() {
ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt,
kmp_info_t *thr) {
- if (bt == bs_forkjoin_barrier)
- return ompt_sync_region_barrier_implicit;
+ if (bt == bs_forkjoin_barrier) {
+ if (thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
+ return ompt_sync_region_barrier_teams;
+ else
+ return ompt_sync_region_barrier_implicit_parallel;
+ }
- if (bt != bs_plain_barrier)
+ if (bt != bs_plain_barrier || !thr->th.th_ident)
return ompt_sync_region_barrier_implementation;
- if (!thr->th.th_ident)
- return ompt_sync_region_barrier;
-
kmp_int32 flags = thr->th.th_ident->flags;
if ((flags & KMP_IDENT_BARRIER_EXPL) != 0)
return ompt_sync_region_barrier_explicit;
if ((flags & KMP_IDENT_BARRIER_IMPL) != 0)
- return ompt_sync_region_barrier_implicit;
+ return ompt_sync_region_barrier_implicit_workshare;
return ompt_sync_region_barrier_implementation;
}
diff --git a/openmp/runtime/src/ompt-specific.h b/openmp/runtime/src/ompt-specific.h
index 63c59c3fb398..7864ed6126c7 100644
--- a/openmp/runtime/src/ompt-specific.h
+++ b/openmp/runtime/src/ompt-specific.h
@@ -130,6 +130,25 @@ inline const char *ompt_get_runtime_version() {
return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN];
}
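+// Map the internal kmp schedule kind onto the corresponding OMPT loop
+// schedule kind (static, dynamic, guided, or other).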
+inline ompt_work_t ompt_get_work_schedule(enum sched_type schedule) {
+ switch (SCHEDULE_WITHOUT_MODIFIERS(schedule)) {
+ case kmp_sch_static_chunked:
+ case kmp_sch_static_balanced:
+ case kmp_sch_static_greedy:
+ return ompt_work_loop_static;
+ case kmp_sch_dynamic_chunked:
+ case kmp_sch_static_steal:
+ return ompt_work_loop_dynamic;
+ case kmp_sch_guided_iterative_chunked:
+ case kmp_sch_guided_analytical_chunked:
+ case kmp_sch_guided_chunked:
+ case kmp_sch_guided_simd:
+ return ompt_work_loop_guided;
+ default:
+ return ompt_work_loop_other;
+ }
+}
+
class OmptReturnAddressGuard {
private:
bool SetAddress{false};
diff --git a/openmp/runtime/src/z_AIX_asm.S b/openmp/runtime/src/z_AIX_asm.S
new file mode 100644
index 000000000000..d711fcb7a785
--- /dev/null
+++ b/openmp/runtime/src/z_AIX_asm.S
@@ -0,0 +1,410 @@
+// z_AIX_asm.S: - microtasking routines specifically
+// written for Power platforms running AIX OS
+
+//
+////===----------------------------------------------------------------------===//
+////
+//// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+//// See https://llvm.org/LICENSE.txt for license information.
+//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+////
+////===----------------------------------------------------------------------===//
+//
+
+// -----------------------------------------------------------------------
+// macros
+// -----------------------------------------------------------------------
+
+#include "kmp_config.h"
+
+#if KMP_OS_AIX
+//------------------------------------------------------------------------
+// int
+// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
+// int gtid, int tid,
+// int argc, void *p_argv[]
+// #if OMPT_SUPPORT
+// ,
+// void **exit_frame_ptr
+// #endif
+// ) {
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
+// #endif
+//
+// (*pkfn)( & gtid, & tid, p_argv[0], ... );
+//
+// // FIXME: This is done at call-site and can be removed here.
+// #if OMPT_SUPPORT
+// *exit_frame_ptr = 0;
+// #endif
+//
+// return 1;
+// }
+//
+// parameters:
+// r3: pkfn
+// r4: gtid
+// r5: tid
+// r6: argc
+// r7: p_argv
+// r8: &exit_frame
+//
+// return: r3 (always 1/TRUE)
+//
+
+#if KMP_ARCH_PPC64_XCOFF
+
+ .globl __kmp_invoke_microtask[DS]
+ .globl .__kmp_invoke_microtask
+ .align 4
+ .csect __kmp_invoke_microtask[DS],3
+ .vbyte 8, .__kmp_invoke_microtask
+ .vbyte 8, TOC[TC0]
+ .vbyte 8, 0
+ .csect .text[PR],2
+ .machine "pwr7"
+.__kmp_invoke_microtask:
+
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+
+// We need to allocate a stack frame large enough to hold all of the parameters
+// on the stack for the microtask plus what this function needs. That's 48
+// bytes under the XCOFF64 ABI, plus max(64, 8*(2 + argc)) for
+// the parameters to the microtask (gtid, tid, argc elements of p_argv),
+// plus 8 bytes to store the values of r4 and r5, and 8 bytes to store r31.
+// With OMP-T support, we need an additional 8 bytes to save r30 to hold
+// a copy of r8.
+// Stack offsets relative to stack pointer:
+// r31: -8, r30: -16, gtid: -20, tid: -24
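+// Worked example (illustrative figures only): with argc == 10 the parameter
+// area computed below is 8 * max(10, 6) = 80 bytes, giving a raw frame size
+// of 88 + 80 = 168 bytes, which the 16-byte alignment step rounds up to 176;
+// with argc <= 6 it is 88 + 48 = 136 bytes, rounded up to 144.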
+
+ mflr 0
+ std 31, -8(1) # Save r31 to the stack
+ std 0, 16(1) # Save LR to the linkage area
+
+// This is unusual because normally we'd set r31 equal to r1 after the stack
+// frame is established. In this case, however, we need to dynamically compute
+// the stack frame size, and so we keep a direct copy of r1 to access our
+// register save areas and restore the r1 value before returning.
+ mr 31, 1
+
+// Compute the size of the "argc" portion of the parameter save area.
+// The parameter save area is always at least 64 bytes long (i.e. 8 regs)
+// The microtask has (2 + argc) parameters, so if argc <= 6, we need to
+// allocate 8*6 bytes, not 8*argc.
+ li 0, 6
+ cmpwi 0, 6, 6
+ iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6
+ sldi 0, 0, 3 # r0 = 8 * max(argc, 6)
+
+// Compute the size necessary for the local stack frame.
+// 88 = 48 + 4 (for r4) + 4 (for r5) + 8 (for r31) + 8 (for OMP-T r30) +
+// 8 (parameter gtid) + 8 (parameter tid)
+ li 12, 88
+ add 12, 0, 12
+ neg 12, 12
+
+// We need to make sure that the stack frame stays aligned (to 16 bytes).
+ li 0, -16
+ and 12, 0, 12
+
+// Establish the local stack frame.
+ stdux 1, 1, 12
+
+#if OMPT_SUPPORT
+ std 30, -16(31) # Save r30 to the stack
+ std 1, 0(8)
+ mr 30, 8
+#endif
+
+// Store gtid and tid to the stack because they're passed by reference to the microtask.
+ stw 4, -20(31) # Save gtid to the stack
+ stw 5, -24(31) # Save tid to the stack
+
+ mr 12, 6 # r12 = argc
+ mr 4, 7 # r4 = p_argv
+
+ cmpwi 0, 12, 1
+ blt 0, .Lcall # if (argc < 1) goto .Lcall
+
+ ld 5, 0(4) # r5 = p_argv[0]
+
+ cmpwi 0, 12, 2
+ blt 0, .Lcall # if (argc < 2) goto .Lcall
+
+ ld 6, 8(4) # r6 = p_argv[1]
+
+ cmpwi 0, 12, 3
+ blt 0, .Lcall # if (argc < 3) goto .Lcall
+
+ ld 7, 16(4) # r7 = p_argv[2]
+
+ cmpwi 0, 12, 4
+ blt 0, .Lcall # if (argc < 4) goto .Lcall
+
+ ld 8, 24(4) # r8 = p_argv[3]
+
+ cmpwi 0, 12, 5
+ blt 0, .Lcall # if (argc < 5) goto .Lcall
+
+ ld 9, 32(4) # r9 = p_argv[4]
+
+ cmpwi 0, 12, 6
+ blt 0, .Lcall # if (argc < 6) goto .Lcall
+
+ ld 10, 40(4) # r10 = p_argv[5]
+
+ cmpwi 0, 12, 7
+ blt 0, .Lcall # if (argc < 7) goto .Lcall
+
+// There are more than 6 microtask parameters, so we need to store the
+// remainder to the stack.
+ addi 12, 12, -6 # argc -= 6
+ mtctr 12
+
+// These are set to 8 bytes before the first desired store address (we're using
+// pre-increment loads and stores in the loop below). The parameter save area
+// for the microtask begins 48 + 8*8 == 112 bytes above r1 for XCOFF64.
+ addi 4, 4, 40 # p_argv = p_argv + 5
+ # (i.e. skip the 5 elements we already processed)
+ addi 12, 1, 104 # r12 = stack offset (112 - 8)
+
+.Lnext:
+ ldu 0, 8(4)
+ stdu 0, 8(12)
+ bdnz .Lnext
+
+.Lcall:
+ std 2, 40(1) # Save the TOC pointer to the linkage area
+// Load the actual function address from the function descriptor.
+ ld 12, 0(3) # Function address
+ ld 2, 8(3) # TOC pointer
+ ld 11, 16(3) # Environment pointer
+
+ addi 3, 31, -20 # r3 = &gtid
+ addi 4, 31, -24 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ ld 2, 40(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ std 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ ld 30, -16(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ ld 31, -8(1) # Restore r31 from the saved value on the stack
+ ld 0, 16(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#else // KMP_ARCH_PPC_XCOFF
+
+ .globl __kmp_invoke_microtask[DS]
+ .globl .__kmp_invoke_microtask
+ .align 4
+ .csect __kmp_invoke_microtask[DS],2
+ .vbyte 4, .__kmp_invoke_microtask
+ .vbyte 4, TOC[TC0]
+ .vbyte 4, 0
+ .csect .text[PR],2
+ .machine "pwr7"
+.__kmp_invoke_microtask:
+
+
+// -- Begin __kmp_invoke_microtask
+// mark_begin;
+
+// We need to allocate a stack frame large enough to hold all of the parameters
+// on the stack for the microtask plus what this function needs. That's 24
+// bytes under the XCOFF ABI, plus max(32, 4*(2 + argc)) for
+// the parameters to the microtask (gtid, tid, argc elements of p_argv),
+// plus 8 bytes to store the values of r4 and r5, and 4 bytes to store r31.
+// With OMP-T support, we need an additional 4 bytes to save r30 to hold
+// a copy of r8.
+// Stack offsets relative to stack pointer:
+// r31: -4, r30: -8, gtid: -12, tid: -16
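+// Worked example (illustrative figures only): with argc == 10 the parameter
+// area computed below is 4 * max(10, 6) = 40 bytes, giving a raw frame size
+// of 56 + 40 = 96 bytes (already 16-byte aligned); with argc <= 6 it is
+// 56 + 24 = 80 bytes.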
+
+ mflr 0
+ stw 31, -4(1) # Save r31 to the stack
+ stw 0, 8(1) # Save LR to the linkage area
+
+// This is unusual because normally we'd set r31 equal to r1 after the stack
+// frame is established. In this case, however, we need to dynamically compute
+// the stack frame size, and so we keep a direct copy of r1 to access our
+// register save areas and restore the r1 value before returning.
+ mr 31, 1
+
+// Compute the size of the "argc" portion of the parameter save area.
+// The parameter save area is always at least 32 bytes long (i.e. 8 regs)
+// The microtask has (2 + argc) parameters, so if argc <= 6, we need to
+// allocate 4*6 bytes, not 4*argc.
+ li 0, 6
+ cmpwi 0, 6, 6
+ iselgt 0, 6, 0 # r0 = (argc > 6)? argc : 6
+ slwi 0, 0, 2 # r0 = 4 * max(argc, 6)
+
+// Compute the size necessary for the local stack frame.
+// 56 = 32 + 4 (for r4) + 4 (for r5) + 4 (for r31) + 4 (for OMP-T r30) +
+// 4 (parameter gtid) + 4 (parameter tid)
+ li 12, 56
+ add 12, 0, 12
+ neg 12, 12
+
+// We need to make sure that the stack frame stays aligned (to 16 bytes).
+ li 0, -16
+ and 12, 0, 12
+
+// Establish the local stack frame.
+ stwux 1, 1, 12
+
+#if OMPT_SUPPORT
+ stw 30, -8(31) # Save r30 to the stack
+ stw 1, 0(8)
+ mr 30, 8
+#endif
+
+// Store gtid and tid to the stack because they're passed by reference to the microtask.
+ stw 4, -12(31) # Save gtid to the stack
+ stw 5, -16(31) # Save tid to the stack
+
+ mr 12, 6 # r12 = argc
+ mr 4, 7 # r4 = p_argv
+
+ cmpwi 0, 12, 1
+ blt 0, .Lcall # if (argc < 1) goto .Lcall
+
+ lwz 5, 0(4) # r5 = p_argv[0]
+
+ cmpwi 0, 12, 2
+ blt 0, .Lcall # if (argc < 2) goto .Lcall
+
+ lwz 6, 4(4) # r6 = p_argv[1]
+
+ cmpwi 0, 12, 3
+ blt 0, .Lcall # if (argc < 3) goto .Lcall
+
+ lwz 7, 8(4) # r7 = p_argv[2]
+
+ cmpwi 0, 12, 4
+ blt 0, .Lcall # if (argc < 4) goto .Lcall
+
+ lwz 8, 12(4) # r8 = p_argv[3]
+
+ cmpwi 0, 12, 5
+ blt 0, .Lcall # if (argc < 5) goto .Lcall
+
+ lwz 9, 16(4) # r9 = p_argv[4]
+
+ cmpwi 0, 12, 6
+ blt 0, .Lcall # if (argc < 6) goto .Lcall
+
+ lwz 10, 20(4) # r10 = p_argv[5]
+
+ cmpwi 0, 12, 7
+ blt 0, .Lcall # if (argc < 7) goto .Lcall
+
+// There are more than 6 microtask parameters, so we need to store the
+// remainder to the stack.
+ addi 12, 12, -6 # argc -= 6
+ mtctr 12
+
+// These are set to 4 bytes before the first desired store address (we're using
+// pre-increment loads and stores in the loop below). The parameter save area
+// for the microtask begins 24 + 4*8 == 56 bytes above r1 for XCOFF.
+ addi 4, 4, 20 # p_argv = p_argv + 5
+ # (i.e. skip the 5 elements we already processed)
+ addi 12, 1, 52 # r12 = stack offset (56 - 4)
+
+.Lnext:
+ lwzu 0, 4(4)
+ stwu 0, 4(12)
+ bdnz .Lnext
+
+.Lcall:
+ stw 2, 20(1) # Save the TOC pointer to the linkage area
+// Load the actual function address from the function descriptor.
+ lwz 12, 0(3) # Function address
+ lwz 2, 4(3) # TOC pointer
+ lwz 11, 8(3) # Environment pointer
+
+ addi 3, 31, -12 # r3 = &gtid
+ addi 4, 31, -16 # r4 = &tid
+
+ mtctr 12 # CTR = function address
+ bctrl # Branch to CTR
+ lwz 2, 20(1) # Restore TOC pointer from linkage area
+
+#if OMPT_SUPPORT
+ li 3, 0
+ stw 3, 0(30)
+#endif
+
+ li 3, 1
+
+#if OMPT_SUPPORT
+ lwz 30, -8(31) # Restore r30 from the saved value on the stack
+#endif
+
+ mr 1, 31
+ lwz 31, -4(1) # Restore r31 from the saved value on the stack
+ lwz 0, 8(1)
+ mtlr 0 # Restore LR from the linkage area
+ blr # Branch to LR
+
+#endif // KMP_ARCH_PPC64_XCOFF
+
+.Lfunc_end0:
+ .vbyte 4, 0x00000000 # Traceback table begin
+ .byte 0x00 # Version = 0
+ .byte 0x09 # Language = CPlusPlus
+ .byte 0x20 # -IsGlobaLinkage, -IsOutOfLineEpilogOrPrologue
+ # +HasTraceBackTableOffset, -IsInternalProcedure
+ # -HasControlledStorage, -IsTOCless
+ # -IsFloatingPointPresent
+ # -IsFloatingPointOperationLogOrAbortEnabled
+ .byte 0x61 # -IsInterruptHandler, +IsFunctionNamePresent, +IsAllocaUsed
+ # OnConditionDirective = 0, -IsCRSaved, +IsLRSaved
+ .byte 0x80 # +IsBackChainStored, -IsFixup, NumOfFPRsSaved = 0
+#if OMPT_SUPPORT
+ .byte 0x02 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 2
+ .byte 0x06 # NumberOfFixedParms = 6
+#else
+ .byte 0x01 # -HasExtensionTable, -HasVectorInfo, NumOfGPRsSaved = 1
+ .byte 0x05 # NumberOfFixedParms = 5
+#endif
+ .byte 0x01 # NumberOfFPParms = 0, +HasParmsOnStack
+ .vbyte 4, 0x00000000 # Parameter type = i, i, i, i, i
+ .vbyte 4, .Lfunc_end0-.__kmp_invoke_microtask # Function size
+ .vbyte 2, 0x0016 # Function name len = 22
+ .byte "__kmp_invoke_microtask" # Function Name
+ .byte 0x1f # AllocaRegister = 31
+ # -- End function
+
+// -- End __kmp_invoke_microtask
+
+// Support for unnamed common blocks.
+
+ .comm .gomp_critical_user_, 32, 3
+#if KMP_ARCH_PPC64_XCOFF
+ .csect __kmp_unnamed_critical_addr[RW],3
+#else
+ .csect __kmp_unnamed_critical_addr[RW],2
+#endif
+ .globl __kmp_unnamed_critical_addr[RW]
+ .ptr .gomp_critical_user_
+
+// -- End unnamed common block
+
+ .toc
+
+#endif // KMP_OS_AIX
diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S
index 14987c298fa5..5b614e26a833 100644
--- a/openmp/runtime/src/z_Linux_asm.S
+++ b/openmp/runtime/src/z_Linux_asm.S
@@ -108,7 +108,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
# endif // KMP_OS_DARWIN
#endif // KMP_ARCH_X86 || KMP_ARCH_x86_64
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
# if KMP_OS_DARWIN
# define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
@@ -176,7 +176,7 @@ KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // KMP_OS_DARWIN
-#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
+#endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32 || KMP_ARCH_ARM)
.macro COMMON name, size, align_power
#if KMP_OS_DARWIN
@@ -1150,6 +1150,9 @@ KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers.
movq %rdi, %rbx // pkfn -> %rbx
leaq __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn)
+ // Check if argc is 0
+ cmpq $0, %rax
+ je KMP_LABEL(kmp_no_args) // Jump ahead
movq %r8, %r11 // p_argv -> %r11
@@ -1195,6 +1198,7 @@ KMP_LABEL(kmp_1_exit):
cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // KMP_MIC
+KMP_LABEL(kmp_no_args):
call *%rbx // call (*pkfn)();
movq $1, %rax // move 1 into return register;
@@ -1236,7 +1240,7 @@ KMP_LABEL(kmp_1_exit):
#endif /* KMP_ARCH_X86_64 */
// '
-#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
+#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32)
//------------------------------------------------------------------------
// int
@@ -1360,7 +1364,7 @@ KMP_LABEL(kmp_1):
DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask
-#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_AARCH64_32) */
#if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
@@ -1505,7 +1509,7 @@ KMP_LABEL(kmp_1):
DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask
-#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
+#endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
#if KMP_ARCH_PPC64
@@ -2405,18 +2409,21 @@ __kmp_invoke_microtask:
#endif /* KMP_ARCH_S390X */
-#if KMP_ARCH_ARM || KMP_ARCH_MIPS
+#if KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32
+#ifndef KMP_PREFIX_UNDERSCORE
+# define KMP_PREFIX_UNDERSCORE(x) x
+#endif
.data
COMMON .gomp_critical_user_, 32, 3
.data
.align 4
- .global __kmp_unnamed_critical_addr
-__kmp_unnamed_critical_addr:
+ .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
+KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
.4byte .gomp_critical_user_
#ifdef __ELF__
- .size __kmp_unnamed_critical_addr,4
+ .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),4
#endif
-#endif /* KMP_ARCH_ARM */
+#endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_AARCH64_32 */
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \
KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE || \
diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 513ec6517d00..7c90740ae5bd 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -29,7 +29,10 @@
#include <semaphore.h>
#endif // KMP_OS_LINUX
#include <sys/resource.h>
-#if !KMP_OS_AIX
+#if KMP_OS_AIX
+#include <sys/ldr.h>
+#include <libperfstat.h>
+#else
#include <sys/syscall.h>
#endif
#include <sys/time.h>
@@ -59,10 +62,19 @@
#include <sys/sysctl.h>
#include <sys/user.h>
#include <pthread_np.h>
+#if KMP_OS_DRAGONFLY
+#include <kvm.h>
+#endif
#elif KMP_OS_NETBSD || KMP_OS_OPENBSD
#include <sys/types.h>
#include <sys/sysctl.h>
+#if KMP_OS_NETBSD
+#include <sched.h>
+#endif
#elif KMP_OS_SOLARIS
+#include <libproc.h>
+#include <procfs.h>
+#include <thread.h>
#include <sys/loadavg.h>
#endif
@@ -116,7 +128,9 @@ static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) {
}
#endif
-#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED)
+#if ((KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX) && \
+ KMP_AFFINITY_SUPPORTED)
/* Affinity support */
@@ -132,6 +146,29 @@ void __kmp_affinity_bind_thread(int which) {
KMP_CPU_FREE_FROM_STACK(mask);
}
+#if KMP_OS_AIX
+void __kmp_affinity_determine_capable(const char *env_var) {
+ // All versions of AIX support bindprocessor().
+
+ size_t mask_size = __kmp_xproc / CHAR_BIT;
+ // Round up to byte boundary.
+ if (__kmp_xproc % CHAR_BIT)
+ ++mask_size;
+
+ // Round up to the mask_size_type boundary.
+ if (mask_size % sizeof(__kmp_affin_mask_size))
+ mask_size += sizeof(__kmp_affin_mask_size) -
+ mask_size % sizeof(__kmp_affin_mask_size);
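+ // Worked example (assuming an 8-byte __kmp_affin_mask_size word): with 100
+ // logical CPUs, 100 / CHAR_BIT = 12 rounds up to 13 bytes, which the word
+ // rounding above then brings to 16 bytes.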
+ KMP_AFFINITY_ENABLE(mask_size);
+ KA_TRACE(10,
+ ("__kmp_affinity_determine_capable: "
+ "AIX OS affinity interface bindprocessor functional (mask size = "
+ "%" KMP_SIZE_T_SPEC ").\n",
+ __kmp_affin_mask_size));
+}
+
+#else // !KMP_OS_AIX
+
/* Determine if we can access affinity functionality on this version of
* Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set
* __kmp_affin_mask_size to the appropriate value (0 means not capable). */
@@ -141,8 +178,10 @@ void __kmp_affinity_determine_capable(const char *env_var) {
#if KMP_OS_LINUX
#define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024)
#define KMP_CPU_SET_TRY_SIZE CACHE_LINE
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#define KMP_CPU_SET_SIZE_LIMIT (sizeof(cpuset_t))
+#elif KMP_OS_NETBSD
+#define KMP_CPU_SET_SIZE_LIMIT (256)
#endif
int verbose = __kmp_affinity.flags.verbose;
@@ -230,7 +269,7 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_INTERNAL_FREE(buf);
return;
}
-#elif KMP_OS_FREEBSD
+#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
long gCode;
unsigned char *buf;
buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT);
@@ -259,8 +298,9 @@ void __kmp_affinity_determine_capable(const char *env_var) {
KMP_WARNING(AffCantGetMaskSize, env_var);
}
}
-
-#endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+#endif // KMP_OS_AIX
+#endif // (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
+ KMP_OS_DRAGONFLY || KMP_OS_AIX) && KMP_AFFINITY_SUPPORTED
#if KMP_USE_FUTEX
@@ -415,7 +455,7 @@ void __kmp_terminate_thread(int gtid) {
KMP_YIELD(TRUE);
} //
-/* Set thread stack info according to values returned by pthread_getattr_np().
+/* Set thread stack info.
If values are unreasonable, assume call failed and use incremental stack
refinement method instead. Returns TRUE if the stack parameters could be
determined exactly, FALSE if incremental refinement is necessary. */
@@ -423,7 +463,6 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
int stack_data;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
- pthread_attr_t attr;
int status;
size_t size = 0;
void *addr = 0;
@@ -433,6 +472,19 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
pthread_attr_getstack may cause thread gtid aliasing */
if (!KMP_UBER_GTID(gtid)) {
+#if KMP_OS_SOLARIS
+ stack_t s;
+ if ((status = thr_stksegment(&s)) < 0) {
+ KMP_CHECK_SYSFAIL("thr_stksegment", status);
+ }
+
+ addr = s.ss_sp;
+ size = s.ss_size;
+ KA_TRACE(60, ("__kmp_set_stack_info: T#%d thr_stksegment returned size:"
+ " %lu, low addr: %p\n",
+ gtid, size, addr));
+#else
+ pthread_attr_t attr;
/* Fetch the real thread attributes */
status = pthread_attr_init(&attr);
KMP_CHECK_SYSFAIL("pthread_attr_init", status);
@@ -451,6 +503,7 @@ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) {
gtid, size, addr));
status = pthread_attr_destroy(&attr);
KMP_CHECK_SYSFAIL("pthread_attr_destroy", status);
+#endif
}
if (size != 0 && addr != 0) { // was stack parameter determination successful?
@@ -476,7 +529,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
void *exit_val;
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
void *volatile padding = 0;
#endif
int gtid;
@@ -525,7 +578,7 @@ static void *__kmp_launch_worker(void *thr) {
#endif /* KMP_BLOCK_SIGNALS */
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
- KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS
+ KMP_OS_OPENBSD || KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_AIX
if (__kmp_stkoffset > 0 && gtid > 0) {
padding = KMP_ALLOCA(gtid * __kmp_stkoffset);
(void)padding;
@@ -1042,9 +1095,7 @@ extern "C" void __kmp_reap_monitor(kmp_info_t *th) {
#else
// Empty symbol to export (see exports_so.txt) when
// monitor thread feature is disabled
-extern "C" void __kmp_reap_monitor(kmp_info_t *th) {
- (void)th;
-}
+extern "C" void __kmp_reap_monitor(kmp_info_t *th) { (void)th; }
#endif // KMP_USE_MONITOR
void __kmp_reap_worker(kmp_info_t *th) {
@@ -1245,7 +1296,8 @@ static void __kmp_atfork_child(void) {
++__kmp_fork_count;
#if KMP_AFFINITY_SUPPORTED
-#if KMP_OS_LINUX || KMP_OS_FREEBSD
+#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
+ KMP_OS_AIX
// reset the affinity in the child to the initial thread
// affinity in the parent
kmp_set_thread_affinity_mask_initial();
@@ -1840,21 +1892,8 @@ static int __kmp_get_xproc(void) {
#elif KMP_OS_DARWIN
- // Bug C77011 High "OpenMP Threads and number of active cores".
-
- // Find the number of available CPUs.
- kern_return_t rc;
- host_basic_info_data_t info;
- mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT;
- rc = host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num);
- if (rc == 0 && num == HOST_BASIC_INFO_COUNT) {
- // Cannot use KA_TRACE() here because this code works before trace support
- // is initialized.
- r = info.avail_cpus;
- } else {
- KMP_WARNING(CantGetNumAvailCPU);
- KMP_INFORM(AssumedNumCPU);
- }
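+ // Query the number of logical CPUs directly from sysctl.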
+ size_t len = sizeof(r);
+ sysctlbyname("hw.logicalcpu", &r, &len, NULL, 0);
#else
@@ -2105,10 +2144,10 @@ int __kmp_is_address_mapped(void *addr) {
// We pass from number of vm entry's semantic
// to size of whole entry map list.
lstsz = lstsz * 4 / 3;
- buf = reinterpret_cast<char *>(kmpc_malloc(lstsz));
+ buf = reinterpret_cast<char *>(KMP_INTERNAL_MALLOC(lstsz));
rc = sysctl(mib, 4, buf, &lstsz, NULL, 0);
if (rc < 0) {
- kmpc_free(buf);
+ KMP_INTERNAL_FREE(buf);
return 0;
}
@@ -2132,8 +2171,96 @@ int __kmp_is_address_mapped(void *addr) {
}
lw += cursz;
}
- kmpc_free(buf);
+ KMP_INTERNAL_FREE(buf);
+#elif KMP_OS_DRAGONFLY
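+ // Walk this process's vm_map entries via libkvm and report the address as
+ // mapped when it falls inside a readable and writable entry.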
+ char err[_POSIX2_LINE_MAX];
+ kinfo_proc *proc;
+ vmspace sp;
+ vm_map *cur;
+ vm_map_entry entry, *c;
+ struct proc p;
+ kvm_t *fd;
+ uintptr_t uaddr;
+ int num;
+
+ fd = kvm_openfiles(nullptr, nullptr, nullptr, O_RDONLY, err);
+ if (!fd) {
+ return 0;
+ }
+ proc = kvm_getprocs(fd, KERN_PROC_PID, getpid(), &num);
+
+ if (kvm_read(fd, static_cast<uintptr_t>(proc->kp_paddr), &p, sizeof(p)) !=
+ sizeof(p) ||
+ kvm_read(fd, reinterpret_cast<uintptr_t>(p.p_vmspace), &sp, sizeof(sp)) !=
+ sizeof(sp)) {
+ kvm_close(fd);
+ return 0;
+ }
+
+ (void)rc;
+ cur = &sp.vm_map;
+ uaddr = reinterpret_cast<uintptr_t>(addr);
+ for (c = kvm_vm_map_entry_first(fd, cur, &entry); c;
+ c = kvm_vm_map_entry_next(fd, c, &entry)) {
+ if ((uaddr >= entry.ba.start) && (uaddr <= entry.ba.end)) {
+ if ((entry.protection & VM_PROT_READ) != 0 &&
+ (entry.protection & VM_PROT_WRITE) != 0) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ kvm_close(fd);
+#elif KMP_OS_SOLARIS
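+ // Read /proc/<pid>/map and report the address as mapped when it falls
+ // inside a readable and writable prmap_t entry.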
+ prmap_t *cur, *map;
+ void *buf;
+ uintptr_t uaddr;
+ ssize_t rd;
+ int err;
+ int file;
+
+ pid_t pid = getpid();
+ struct ps_prochandle *fd = Pgrab(pid, PGRAB_RDONLY, &err);
+
+ if (!fd) {
+ return 0;
+ }
+
+ char *name = __kmp_str_format("/proc/%d/map", pid);
+ size_t sz = (1 << 20);
+ file = open(name, O_RDONLY);
+ if (file == -1) {
+ KMP_INTERNAL_FREE(name);
+ return 0;
+ }
+
+ buf = KMP_INTERNAL_MALLOC(sz);
+
+ while (sz > 0 && (rd = pread(file, buf, sz, 0)) == sz) {
+ void *newbuf;
+ sz <<= 1;
+ newbuf = KMP_INTERNAL_REALLOC(buf, sz);
+ buf = newbuf;
+ }
+
+ map = reinterpret_cast<prmap_t *>(buf);
+ uaddr = reinterpret_cast<uintptr_t>(addr);
+
+ for (cur = map; rd > 0; cur++, rd -= sizeof(*map)) {
+ if ((uaddr >= cur->pr_vaddr) && (uaddr < cur->pr_vaddr + cur->pr_size)) {
+ if ((cur->pr_mflags & MA_READ) != 0 && (cur->pr_mflags & MA_WRITE) != 0) {
+ found = 1;
+ break;
+ }
+ }
+ }
+
+ KMP_INTERNAL_FREE(map);
+ close(file);
+ KMP_INTERNAL_FREE(name);
#elif KMP_OS_DARWIN
/* On OS X*, /proc pseudo filesystem is not available. Try to read memory
@@ -2212,10 +2339,50 @@ int __kmp_is_address_mapped(void *addr) {
}
#elif KMP_OS_WASI
found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE);
-#elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS || KMP_OS_AIX
+#elif KMP_OS_AIX
- // FIXME(DragonFly, Solaris, AIX): Implement this
- found = 1;
+ uint32_t loadQueryBufSize = 4096u; // Default loadquery buffer size.
+ char *loadQueryBuf;
+
+ for (;;) {
+ loadQueryBuf = (char *)KMP_INTERNAL_MALLOC(loadQueryBufSize);
+ if (loadQueryBuf == NULL) {
+ return 0;
+ }
+
+ rc = loadquery(L_GETXINFO | L_IGNOREUNLOAD, loadQueryBuf, loadQueryBufSize);
+ if (rc < 0) {
+ KMP_INTERNAL_FREE(loadQueryBuf);
+ if (errno != ENOMEM) {
+ return 0;
+ }
+ // errno == ENOMEM; double the size.
+ loadQueryBufSize <<= 1;
+ continue;
+ }
+ // Obtained the load info successfully.
+ break;
+ }
+
+ struct ld_xinfo *curLdInfo = (struct ld_xinfo *)loadQueryBuf;
+
+ // Loop through the load info to find if there is a match.
+ for (;;) {
+ uintptr_t curDataStart = (uintptr_t)curLdInfo->ldinfo_dataorg;
+ uintptr_t curDataEnd = curDataStart + curLdInfo->ldinfo_datasize;
+
+ // The data segment is readable and writable.
+ if (curDataStart <= (uintptr_t)addr && (uintptr_t)addr < curDataEnd) {
+ found = 1;
+ break;
+ }
+ if (curLdInfo->ldinfo_next == 0u) {
+ // Reached the end of load info.
+ break;
+ }
+ curLdInfo = (struct ld_xinfo *)((char *)curLdInfo + curLdInfo->ldinfo_next);
+ }
+ KMP_INTERNAL_FREE(loadQueryBuf);
#else
@@ -2261,6 +2428,79 @@ int __kmp_get_load_balance(int max) {
return ret_avg;
}
+#elif KMP_OS_AIX
+
+// The function returns number of running (not sleeping) threads, or -1 in case
+// of error.
+int __kmp_get_load_balance(int max) {
+
+ static int glb_running_threads = 0; // Saved count of the running threads for
+ // the thread balance algorithm.
+ static double glb_call_time = 0; // Thread balance algorithm call time.
+ int running_threads = 0; // Number of running threads in the system.
+
+ double call_time = 0.0;
+
+ __kmp_elapsed(&call_time);
+
+ if (glb_call_time &&
+ (call_time - glb_call_time < __kmp_load_balance_interval))
+ return glb_running_threads;
+
+ glb_call_time = call_time;
+
+ if (max <= 0) {
+ max = INT_MAX;
+ }
+
+ // Check how many perfstat_cpu_t structures are available.
+ int logical_cpus = perfstat_cpu(NULL, NULL, sizeof(perfstat_cpu_t), 0);
+ if (logical_cpus <= 0) {
+ glb_call_time = -1;
+ return -1;
+ }
+
+ perfstat_cpu_t *cpu_stat = (perfstat_cpu_t *)KMP_INTERNAL_MALLOC(
+ logical_cpus * sizeof(perfstat_cpu_t));
+ if (cpu_stat == NULL) {
+ glb_call_time = -1;
+ return -1;
+ }
+
+ // Set first CPU as the name of the first logical CPU for which the info is
+ // desired.
+ perfstat_id_t first_cpu_name;
+ strcpy(first_cpu_name.name, FIRST_CPU);
+
+ // Get the stat info of logical CPUs.
+ int rc = perfstat_cpu(&first_cpu_name, cpu_stat, sizeof(perfstat_cpu_t),
+ logical_cpus);
+ KMP_DEBUG_ASSERT(rc == logical_cpus);
+ if (rc <= 0) {
+ KMP_INTERNAL_FREE(cpu_stat);
+ glb_call_time = -1;
+ return -1;
+ }
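+ // Sum the per-CPU run queue lengths reported by perfstat to estimate the
+ // number of running threads.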
+ for (int i = 0; i < logical_cpus; ++i) {
+ running_threads += cpu_stat[i].runque;
+ if (running_threads >= max)
+ break;
+ }
+
+ // There _might_ be a timing hole where the thread executing this
+ // code gets skipped in the load balance, and running_threads is 0.
+ // Assert in the debug builds only!!!
+ KMP_DEBUG_ASSERT(running_threads > 0);
+ if (running_threads <= 0)
+ running_threads = 1;
+
+ KMP_INTERNAL_FREE(cpu_stat);
+
+ glb_running_threads = running_threads;
+
+ return running_threads;
+}
+
#else // Linux* OS
// The function returns number of running (not sleeping) threads, or -1 in case
@@ -2332,14 +2572,9 @@ int __kmp_get_load_balance(int max) {
proc_entry = readdir(proc_dir);
while (proc_entry != NULL) {
-#if KMP_OS_AIX
- // Proc entry name starts with a digit. Assume it is a process' directory.
- if (isdigit(proc_entry->d_name[0])) {
-#else
// Proc entry is a directory and name starts with a digit. Assume it is a
// process' directory.
if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) {
-#endif
#ifdef KMP_DEBUG
++total_processes;
@@ -2383,11 +2618,7 @@ int __kmp_get_load_balance(int max) {
task_entry = readdir(task_dir);
while (task_entry != NULL) {
// It is a directory and name starts with a digit.
-#if KMP_OS_AIX
- if (isdigit(task_entry->d_name[0])) {
-#else
if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) {
-#endif
// Construct complete stat file path. Easiest way would be:
// __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str,
@@ -2497,7 +2728,45 @@ finish: // Clean up and exit.
#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
- KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF)
+ KMP_ARCH_ARM || KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC_XCOFF || \
+ KMP_ARCH_AARCH64_32)
+
+// Because WebAssembly will use `call_indirect` to invoke the microtask and
+// WebAssembly indirect calls check that the called signature is a precise
+// match, we need to cast each microtask function pointer back from `void *` to
+// its original type.
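+// For example, an outlined body with two captured pointers is invoked below
+// as (*(microtask_t2)pkfn)(&gtid, &tid, p_argv[0], p_argv[1]), so the
+// signature checked by call_indirect matches the callee exactly.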
+typedef void (*microtask_t0)(int *, int *);
+typedef void (*microtask_t1)(int *, int *, void *);
+typedef void (*microtask_t2)(int *, int *, void *, void *);
+typedef void (*microtask_t3)(int *, int *, void *, void *, void *);
+typedef void (*microtask_t4)(int *, int *, void *, void *, void *, void *);
+typedef void (*microtask_t5)(int *, int *, void *, void *, void *, void *,
+ void *);
+typedef void (*microtask_t6)(int *, int *, void *, void *, void *, void *,
+ void *, void *);
+typedef void (*microtask_t7)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *);
+typedef void (*microtask_t8)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *);
+typedef void (*microtask_t9)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *);
+typedef void (*microtask_t10)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *);
+typedef void (*microtask_t11)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *);
+typedef void (*microtask_t12)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *);
+typedef void (*microtask_t13)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *);
+typedef void (*microtask_t14)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *, void *);
+typedef void (*microtask_t15)(int *, int *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *, void *,
+ void *, void *, void *, void *, void *);
// we really only need the case with 1 argument, because CLANG always builds
// a struct of pointers to shared variables referenced in the outlined function
@@ -2518,66 +2787,76 @@ int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
fflush(stderr);
exit(-1);
case 0:
- (*pkfn)(&gtid, &tid);
+ (*(microtask_t0)pkfn)(&gtid, &tid);
break;
case 1:
- (*pkfn)(&gtid, &tid, p_argv[0]);
+ (*(microtask_t1)pkfn)(&gtid, &tid, p_argv[0]);
break;
case 2:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
+ (*(microtask_t2)pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
break;
case 3:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
+ (*(microtask_t3)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]);
break;
case 4:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]);
+ (*(microtask_t4)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3]);
break;
case 5:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]);
+ (*(microtask_t5)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4]);
break;
case 6:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5]);
+ (*(microtask_t6)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5]);
break;
case 7:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6]);
+ (*(microtask_t7)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6]);
break;
case 8:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7]);
+ (*(microtask_t8)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7]);
break;
case 9:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8]);
+ (*(microtask_t9)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7],
+ p_argv[8]);
break;
case 10:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]);
+ (*(microtask_t10)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9]);
break;
case 11:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
+ (*(microtask_t11)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10]);
break;
case 12:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11]);
+ (*(microtask_t12)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11]);
break;
case 13:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12]);
+ (*(microtask_t13)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12]);
break;
case 14:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13]);
+ (*(microtask_t14)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13]);
break;
case 15:
- (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4],
- p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10],
- p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
+ (*(microtask_t15)pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2],
+ p_argv[3], p_argv[4], p_argv[5], p_argv[6],
+ p_argv[7], p_argv[8], p_argv[9], p_argv[10],
+ p_argv[11], p_argv[12], p_argv[13], p_argv[14]);
break;
}