| author | Mark Johnston <markj@FreeBSD.org> | 2022-07-14 14:24:25 +0000 |
| --- | --- | --- |
| committer | Mark Johnston <markj@FreeBSD.org> | 2022-08-01 14:12:49 +0000 |
| commit | 84a0d34d10aca36b1e7f9d00d0c4883f3355883b | |
| tree | d39402aa9ff3c7661946a6497518e4aaef2d1649 /sys/x86 | |
| parent | 20abb8075086dd31fae4ad39e84a44ce429a357e | |
x86: Add a required store-load barrier in cpu_idle()
ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread.
In particular, it tries to detect whether the idle thread is running.
There are two mechanisms for this:
- tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle(). If
tdq_cpu_idle == 0, then no IPI is needed;
- idle_state, an x86-specific state flag which is updated after
cpu_idleclock() is called.
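
To make the two checks concrete, the following is a minimal userspace model of the notify-side decision, written with C11 atomics rather than the kernel's atomic(9) and scheduler APIs; `struct cpu_model` and `notify_needs_ipi()` are illustrative names invented for this sketch, not FreeBSD identifiers.

```c
#include <stdatomic.h>
#include <stdbool.h>

/* Idle-state values, mirroring the kernel's STATE_* constants. */
enum { STATE_RUNNING, STATE_MWAIT, STATE_SLEEPING };

/* Toy per-CPU state covering the fields the two mechanisms consult. */
struct cpu_model {
	atomic_int tdq_cpu_idle;	/* MI flag, set before cpu_idle() runs */
	atomic_int idle_state;		/* MD flag, the x86 monitorbuf word */
	atomic_int tdq_load;		/* count of runnable threads queued */
};

/*
 * Decide whether waking the remote CPU needs an IPI.  The IPI is elided
 * when the target has not entered cpu_idle() at all, or still reports
 * STATE_RUNNING and so is expected to recheck its queue before sleeping.
 */
static bool
notify_needs_ipi(struct cpu_model *cpu)
{
	if (atomic_load(&cpu->tdq_cpu_idle) == 0)
		return (false);
	if (atomic_load(&cpu->idle_state) == STATE_RUNNING)
		return (false);
	return (true);
}
```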
The implementation of the second mechanism is racy; the race can cause a
CPU to go to sleep with pending work. Specifically, cpu_idle_*() set
idle_state = STATE_SLEEPING, then check for pending work by loading the
tdq_load field of the CPU's runqueue. These operations can be reordered
so that the idle thread observes tdq_load == 0, and tdq_notify()
observes idle_state == STATE_RUNNING.
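
Continuing the same toy model, the racy idle-side sequence boils down to a relaxed store followed by a relaxed load, which x86 is free to reorder through the store buffer; this is a sketch of the pattern, not the kernel code, and `idle_enter_racy()` is a made-up name.

```c
#include <stdatomic.h>
#include <stdbool.h>

enum { STATE_RUNNING, STATE_MWAIT, STATE_SLEEPING };

/*
 * Racy interleaving permitted by store-load reordering:
 *
 *   idle CPU                           notifying CPU
 *   --------                           -------------
 *   idle_state = STATE_SLEEPING        tdq_load++            (enqueue work)
 *   load tdq_load    -> observes 0     load idle_state -> observes RUNNING
 *   go to sleep                        elide the IPI
 *
 * Each side concludes that no further action is needed, and the queued
 * work waits until something else happens to wake the sleeping CPU.
 */
static bool
idle_enter_racy(atomic_int *idle_state, atomic_int *tdq_load)
{
	/* Relaxed store, an ordinary MOV on x86. */
	atomic_store_explicit(idle_state, STATE_SLEEPING, memory_order_relaxed);
	/* The load may be satisfied while the store still sits in the store buffer. */
	return (atomic_load_explicit(tdq_load, memory_order_relaxed) == 0);
}
```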
Some counters indicate that the idle_state check in tdq_notify()
frequently elides an IPI. So, fix the problem by inserting a fence
after the store to idle_state, immediately before idling the CPU.
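
In the same C11 terms, the repair is a sequentially consistent fence between the state store and the load of the queue length, the analogue of the kernel's atomic_thread_fence_seq_cst(); `idle_enter_fixed()` and `notify_fixed()` below are again illustrative names, and the notifier side is shown with the matching barrier the commit message refers to.

```c
#include <stdatomic.h>
#include <stdbool.h>

enum { STATE_RUNNING, STATE_MWAIT, STATE_SLEEPING };

/* Idle side: publish the state, fence, then check for queued work. */
static bool
idle_enter_fixed(atomic_int *idle_state, atomic_int *tdq_load)
{
	atomic_store_explicit(idle_state, STATE_SLEEPING, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* store-load barrier */
	return (atomic_load_explicit(tdq_load, memory_order_relaxed) == 0);
}

/* Notify side: add work to the queue, fence, then decide whether to IPI. */
static bool
notify_fixed(atomic_int *idle_state, atomic_int *tdq_load)
{
	atomic_fetch_add_explicit(tdq_load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return (atomic_load_explicit(idle_state, memory_order_relaxed) !=
	    STATE_RUNNING);
}
```

With a seq_cst fence on each side between its store and its load, at least one of the two loads is guaranteed to observe the other side's store, so either an IPI is sent or the CPU declines to sleep; on amd64 such a fence is typically emitted as a locked read-modify-write or MFENCE.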
PR: 264867
Reviewed by: mav, kib, jhb
Sponsored by: The FreeBSD Foundation
(cherry picked from commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69)
Diffstat (limited to 'sys/x86')
-rw-r--r-- | sys/x86/x86/cpu_machdep.c | 103 |
1 file changed, 62 insertions, 41 deletions
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index 53b32672132a..1ec3fb669d01 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
+#include "opt_sched.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
@@ -528,32 +529,25 @@ static int idle_mwait = 1;	/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
-static void
-cpu_idle_acpi(sbintime_t sbt)
+static bool
+cpu_idle_enter(int *statep, int newstate)
 {
-	int *state;
+	KASSERT(atomic_load_int(statep) == STATE_RUNNING,
+	    ("%s: state %d", __func__, atomic_load_int(statep)));
 
-	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_SLEEPING);
-
-	/* See comments in cpu_idle_hlt(). */
-	disable_intr();
-	if (sched_runnable())
-		enable_intr();
-	else if (cpu_idle_hook)
-		cpu_idle_hook(sbt);
-	else
-		acpi_cpu_c1();
-	atomic_store_int(state, STATE_RUNNING);
-}
-
-static void
-cpu_idle_hlt(sbintime_t sbt)
-{
-	int *state;
-
-	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_SLEEPING);
+	/*
+	 * A fence is needed to prevent reordering of the load in
+	 * sched_runnable() with this store to the idle state word.  Without it,
+	 * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
+	 * added load to the queue, and elide an IPI.  Then, sched_runnable() can
+	 * observe tdq_load == 0, so the CPU ends up idling with pending work.
+	 * tdq_notify() similarly ensures that a prior update to tdq_load is
+	 * visible before calling cpu_idle_wakeup().
+	 */
+	atomic_store_int(statep, newstate);
+#if defined(SCHED_ULE) && defined(SMP)
+	atomic_thread_fence_seq_cst();
+#endif
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
@@ -572,35 +566,62 @@ cpu_idle_hlt(sbintime_t sbt)
 	 * interrupt.
 	 */
 	disable_intr();
-	if (sched_runnable())
+	if (sched_runnable()) {
 		enable_intr();
-	else
-		acpi_cpu_c1();
-	atomic_store_int(state, STATE_RUNNING);
+		atomic_store_int(statep, STATE_RUNNING);
+		return (false);
+	} else {
+		return (true);
+	}
 }
 
 static void
-cpu_idle_mwait(sbintime_t sbt)
+cpu_idle_exit(int *statep)
+{
+	atomic_store_int(statep, STATE_RUNNING);
+}
+
+static void
+cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
-	atomic_store_int(state, STATE_MWAIT);
+	if (cpu_idle_enter(state, STATE_SLEEPING)) {
+		if (cpu_idle_hook)
+			cpu_idle_hook(sbt);
+		else
+			acpi_cpu_c1();
+		cpu_idle_exit(state);
+	}
+}
 
-	/* See comments in cpu_idle_hlt(). */
-	disable_intr();
-	if (sched_runnable()) {
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+	int *state;
+
+	state = &PCPU_PTR(monitorbuf)->idle_state;
+	if (cpu_idle_enter(state, STATE_SLEEPING)) {
+		acpi_cpu_c1();
 		atomic_store_int(state, STATE_RUNNING);
-		enable_intr();
-		return;
 	}
+}
 
-	cpu_monitor(state, 0, 0);
-	if (atomic_load_int(state) == STATE_MWAIT)
-		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
-	else
-		enable_intr();
-	atomic_store_int(state, STATE_RUNNING);
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+	int *state;
+
+	state = &PCPU_PTR(monitorbuf)->idle_state;
+	if (cpu_idle_enter(state, STATE_MWAIT)) {
+		cpu_monitor(state, 0, 0);
+		if (atomic_load_int(state) == STATE_MWAIT)
+			__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+		else
+			enable_intr();
+		cpu_idle_exit(state);
+	}
 }
 
 static void