author     Mark Johnston <markj@FreeBSD.org>   2022-07-14 14:24:25 +0000
committer  Mark Johnston <markj@FreeBSD.org>   2022-08-01 14:12:49 +0000
commit     84a0d34d10aca36b1e7f9d00d0c4883f3355883b (patch)
tree       d39402aa9ff3c7661946a6497518e4aaef2d1649 /sys/x86
parent     20abb8075086dd31fae4ad39e84a44ce429a357e (diff)
x86: Add a required store-load barrier in cpu_idle()
ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread. In
particular, it tries to detect whether the idle thread is running. There are
two mechanisms for this:

- tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle(). If
  tdq_cpu_idle == 0, then no IPI is needed;
- idle_state, an x86-specific state flag which is updated after
  cpu_idleclock() is called.

The implementation of the second mechanism is racy; the race can cause a CPU
to go to sleep with pending work. Specifically, cpu_idle_*() set idle_state =
STATE_SLEEPING, then check for pending work by loading the tdq_load field of
the CPU's runqueue. These operations can be reordered so that the idle thread
observes tdq_load == 0, and tdq_notify() observes idle_state == STATE_RUNNING.

Some counters indicate that the idle_state check in tdq_notify() frequently
elides an IPI. So, fix the problem by inserting a fence after the store to
idle_state, immediately before idling the CPU.

PR:		264867
Reviewed by:	mav, kib, jhb
Sponsored by:	The FreeBSD Foundation

(cherry picked from commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69)
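For background (not part of the commit): the bug being fixed is a classic
Dekker-style store-load pairing between the idling CPU and the waking CPU.
The sketch below is a minimal standalone model, not FreeBSD code; C11 atomics
stand in for the kernel's atomic(9) primitives, and the idle_state/tdq_load
globals and the idle_side()/notify_side() helpers are simplified stand-ins
invented for illustration.

/*
 * Minimal model of the store-load race fixed by this commit.
 * Not FreeBSD code: C11 atomics replace atomic(9), and the two
 * functions below approximate cpu_idle_enter() and tdq_notify().
 */
#include <stdatomic.h>
#include <stdbool.h>

#define	STATE_RUNNING	0
#define	STATE_SLEEPING	1

static _Atomic int idle_state = STATE_RUNNING;	/* per-CPU in reality */
static _Atomic int tdq_load = 0;		/* runqueue load, simplified */

/* Idle thread: publish "sleeping", then check for pending work. */
static bool
idle_side(void)
{
	atomic_store_explicit(&idle_state, STATE_SLEEPING,
	    memory_order_relaxed);
	/*
	 * Without this store-load fence, the load of tdq_load below may be
	 * satisfied before the store above becomes visible, and the CPU can
	 * go to sleep with work queued.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&tdq_load, memory_order_relaxed) != 0) {
		atomic_store_explicit(&idle_state, STATE_RUNNING,
		    memory_order_relaxed);
		return (false);		/* Work is pending; do not sleep. */
	}
	return (true);			/* Safe to idle. */
}

/* Waking CPU: publish the new load, then decide whether an IPI is needed. */
static bool
notify_side(void)
{
	atomic_fetch_add_explicit(&tdq_load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	/* Elide the IPI only if the remote idle thread is still running. */
	return (atomic_load_explicit(&idle_state, memory_order_relaxed) ==
	    STATE_SLEEPING);
}

With seq_cst fences on both sides, at least one of the two threads is
guaranteed to observe the other's store, so the CPU either refuses to sleep
or receives the IPI. The #if defined(SCHED_ULE) && defined(SMP) guard in the
patch reflects that the fence only matters when ULE's tdq_notify() is the
other party to this protocol.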
Diffstat (limited to 'sys/x86')
-rw-r--r--   sys/x86/x86/cpu_machdep.c   103
1 file changed, 62 insertions, 41 deletions
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index 53b32672132a..1ec3fb669d01 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_platform.h"
+#include "opt_sched.h"
#ifdef __i386__
#include "opt_apic.h"
#endif
@@ -528,32 +529,25 @@ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
-static void
-cpu_idle_acpi(sbintime_t sbt)
+static bool
+cpu_idle_enter(int *statep, int newstate)
{
- int *state;
+ KASSERT(atomic_load_int(statep) == STATE_RUNNING,
+ ("%s: state %d", __func__, atomic_load_int(statep)));
- state = &PCPU_PTR(monitorbuf)->idle_state;
- atomic_store_int(state, STATE_SLEEPING);
-
- /* See comments in cpu_idle_hlt(). */
- disable_intr();
- if (sched_runnable())
- enable_intr();
- else if (cpu_idle_hook)
- cpu_idle_hook(sbt);
- else
- acpi_cpu_c1();
- atomic_store_int(state, STATE_RUNNING);
-}
-
-static void
-cpu_idle_hlt(sbintime_t sbt)
-{
- int *state;
-
- state = &PCPU_PTR(monitorbuf)->idle_state;
- atomic_store_int(state, STATE_SLEEPING);
+ /*
+ * A fence is needed to prevent reordering of the load in
+ * sched_runnable() with this store to the idle state word. Without it,
+ * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
+ * added load to the queue, and elide an IPI. Then, sched_runnable()
+ * can observe tdq_load == 0, so the CPU ends up idling with pending
+ * work. tdq_notify() similarly ensures that a prior update to tdq_load
+ * is visible before calling cpu_idle_wakeup().
+ */
+ atomic_store_int(statep, newstate);
+#if defined(SCHED_ULE) && defined(SMP)
+ atomic_thread_fence_seq_cst();
+#endif
/*
* Since we may be in a critical section from cpu_idle(), if
@@ -572,35 +566,62 @@ cpu_idle_hlt(sbintime_t sbt)
* interrupt.
*/
disable_intr();
- if (sched_runnable())
+ if (sched_runnable()) {
enable_intr();
- else
- acpi_cpu_c1();
- atomic_store_int(state, STATE_RUNNING);
+ atomic_store_int(statep, STATE_RUNNING);
+ return (false);
+ } else {
+ return (true);
+ }
}
static void
-cpu_idle_mwait(sbintime_t sbt)
+cpu_idle_exit(int *statep)
+{
+ atomic_store_int(statep, STATE_RUNNING);
+}
+
+static void
+cpu_idle_acpi(sbintime_t sbt)
{
int *state;
state = &PCPU_PTR(monitorbuf)->idle_state;
- atomic_store_int(state, STATE_MWAIT);
+ if (cpu_idle_enter(state, STATE_SLEEPING)) {
+ if (cpu_idle_hook)
+ cpu_idle_hook(sbt);
+ else
+ acpi_cpu_c1();
+ cpu_idle_exit(state);
+ }
+}
- /* See comments in cpu_idle_hlt(). */
- disable_intr();
- if (sched_runnable()) {
+static void
+cpu_idle_hlt(sbintime_t sbt)
+{
+ int *state;
+
+ state = &PCPU_PTR(monitorbuf)->idle_state;
+ if (cpu_idle_enter(state, STATE_SLEEPING)) {
+ acpi_cpu_c1();
atomic_store_int(state, STATE_RUNNING);
- enable_intr();
- return;
}
+}
- cpu_monitor(state, 0, 0);
- if (atomic_load_int(state) == STATE_MWAIT)
- __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
- else
- enable_intr();
- atomic_store_int(state, STATE_RUNNING);
+static void
+cpu_idle_mwait(sbintime_t sbt)
+{
+ int *state;
+
+ state = &PCPU_PTR(monitorbuf)->idle_state;
+ if (cpu_idle_enter(state, STATE_MWAIT)) {
+ cpu_monitor(state, 0, 0);
+ if (atomic_load_int(state) == STATE_MWAIT)
+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
+ else
+ enable_intr();
+ cpu_idle_exit(state);
+ }
}
static void