author	Alexander Motin <mav@FreeBSD.org>	2021-07-29 01:18:50 +0000
committer	Alexander Motin <mav@FreeBSD.org>	2021-07-29 02:00:29 +0000
commit	aefe0a8c32d370f2fdd0d0771eb59f8845beda17 (patch)
tree	27afafabe2d1c41df94233241336df76c938dfa0	/sys/kern/sched_ule.c
parent	7cbf1de38e06663c76f4f075db31ea25f429f1b3 (diff)
Refactor/optimize cpu_search_*().
Remove cpu_search_both(), unused for many years. Without it there is
little point in the trick of compiling the common cpu_search() into
separate cpu_search_lowest() and cpu_search_highest(), so split them
completely, making the code more readable.

While there, split the iteration over child groups from the iteration
over CPUs; combining them complicated the code for very little
deduplication.

Stop passing cpuset_t arguments by value and avoid some manipulations.
Since the MAXCPU bump from 64 to 256, what used to fit in a single
register has become a 32-byte memory array, requiring memory allocation
and accesses.

Splitting struct cpu_search into parameter and result parts reduces
stack usage even further, since the former can be passed through
unchanged on recursion.

Remove CPU_FFS() from the hot paths by precalculating the first and
last CPU of each CPU group during initialization. Again, this was not a
problem with 64 CPUs, but with 256 CPUs FFS needs much more code.

With these changes, on an 80-thread system doing ~260K uncached ZFS
reads per second, I observe a ~30% reduction of the time spent in
cpu_search_*().

MFC after:	1 month
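To make the parameter/result split easier to follow before reading the diff,
here is a rough, self-contained sketch of the same shape in plain C. The types
and loads are made up for illustration (toy_group, toy_search and toy_res stand
in for cpu_group, cpu_search and cpu_search_res), and the priority, preferred
CPU, transferability and randomization details of the real code are omitted.
The point is only that the read-only parameters go down the tree unchanged
while a small result struct bubbles back up, so each recursion level adds very
little to the stack.

/*
 * Toy model of the split search (illustration only; not the kernel code).
 * Read-only parameters go down the tree unchanged, a small result struct
 * comes back up, and each level adds only one struct toy_res to the stack.
 */
#include <limits.h>
#include <stdio.h>

#define	TOY_NCPU	8

struct toy_group {			/* stands in for struct cpu_group */
	const struct toy_group *child;	/* children array, NULL for a leaf */
	int nchildren;
	int first, last;		/* precomputed CPU id range */
};

struct toy_search {			/* stands in for struct cpu_search */
	int limit;			/* max load accepted */
};

struct toy_res {			/* stands in for struct cpu_search_res */
	int cpu;
	int load;
};

static int toy_load[TOY_NCPU] = { 3, 1, 4, 1, 5, 9, 2, 6 };

static int
toy_search_lowest(const struct toy_group *g, const struct toy_search *s,
    struct toy_res *r)
{
	struct toy_res lr;
	int c, best, load, total;

	total = 0;
	best = INT_MAX;
	r->cpu = -1;
	r->load = INT_MAX;

	/* Recurse into child groups, keeping the least loaded one. */
	if (g->nchildren > 0) {
		for (c = 0; c < g->nchildren; c++) {
			load = toy_search_lowest(&g->child[c], s, &lr);
			total += load;
			if (lr.cpu >= 0 && (load < best ||
			    (load == best && lr.load < r->load))) {
				best = load;
				r->cpu = lr.cpu;
				r->load = lr.load;
			}
		}
		return (total);
	}

	/* Leaf group: scan its CPU range directly. */
	for (c = g->first; c <= g->last; c++) {
		load = toy_load[c];
		total += load;
		if (load > s->limit)
			continue;
		if (load < best) {
			best = load;
			r->cpu = c;
			r->load = load;
		}
	}
	return (total);
}

int
main(void)
{
	struct toy_group leaves[2] = {
		{ NULL, 0, 0, 3 },	/* CPUs 0-3 */
		{ NULL, 0, 4, 7 },	/* CPUs 4-7 */
	};
	struct toy_group root = { leaves, 2, 0, 7 };
	struct toy_search s = { INT_MAX };
	struct toy_res r;

	toy_search_lowest(&root, &s, &r);
	printf("least loaded CPU: %d (load %d)\n", r.cpu, r.load);
	return (0);
}

Running this prints the least loaded CPU of the least loaded child group,
which mirrors the "least loaded cpu on the least loaded path" behaviour
described in the comment added by the diff below.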
Diffstat (limited to 'sys/kern/sched_ule.c')
-rw-r--r--	sys/kern/sched_ule.c	290
1 file changed, 120 insertions(+), 170 deletions(-)
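The CPU_FFS() removal mentioned in the commit message relies on each CPU group
knowing its first and last CPU up front. As a rough sketch only, assuming a
plain uint64_t mask and a made-up mask_range helper rather than the kernel's
cpuset_t and cpu_group machinery, the precalculation is a single pass at
initialization time:

/*
 * Hypothetical helper (not the kernel code): compute the first and last
 * set bit of a CPU mask once, at topology-build time, so hot loops can
 * run a plain "for (c = first; c <= last; c++)" range scan plus a bit
 * test, instead of an FFS-style scan on every iteration.
 */
#include <stdint.h>

struct mask_range {
	int first;	/* lowest CPU id set in the mask, -1 if empty */
	int last;	/* highest CPU id set in the mask, -1 if empty */
};

static struct mask_range
mask_range_init(uint64_t mask)
{
	struct mask_range r = { -1, -1 };
	int c;

	for (c = 0; c < 64; c++) {
		if ((mask & ((uint64_t)1 << c)) == 0)
			continue;
		if (r.first == -1)
			r.first = c;
		r.last = c;
	}
	return (r);
}

In the diff this corresponds to the cg_first/cg_last range consulted by the
new loops in cpu_search_lowest(), cpu_search_highest() and the topology
sysctl handler.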
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 094a3fffef8c..3bb73d64a70c 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -631,170 +631,120 @@ sched_random(void)
}
struct cpu_search {
- cpuset_t cs_mask;
+ cpuset_t *cs_mask;
u_int cs_prefer;
int cs_pri; /* Min priority for low. */
int cs_limit; /* Max load for low, min load for high. */
+};
+
+struct cpu_search_res {
int cs_cpu;
int cs_load;
};
-#define CPU_SEARCH_LOWEST 0x1
-#define CPU_SEARCH_HIGHEST 0x2
-#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
-
-static __always_inline int cpu_search(const struct cpu_group *cg,
- struct cpu_search *low, struct cpu_search *high, const int match);
-int __noinline cpu_search_lowest(const struct cpu_group *cg,
- struct cpu_search *low);
-int __noinline cpu_search_highest(const struct cpu_group *cg,
- struct cpu_search *high);
-int __noinline cpu_search_both(const struct cpu_group *cg,
- struct cpu_search *low, struct cpu_search *high);
-
-/*
- * Search the tree of cpu_groups for the lowest or highest loaded cpu
- * according to the match argument. This routine actually compares the
- * load on all paths through the tree and finds the least loaded cpu on
- * the least loaded path, which may differ from the least loaded cpu in
- * the system. This balances work among caches and buses.
- *
- * This inline is instantiated in three forms below using constants for the
- * match argument. It is reduced to the minimum set for each case. It is
- * also recursive to the depth of the tree.
- */
-static __always_inline int
-cpu_search(const struct cpu_group *cg, struct cpu_search *low,
- struct cpu_search *high, const int match)
-{
- struct cpu_search lgroup;
- struct cpu_search hgroup;
- cpuset_t cpumask;
- struct cpu_group *child;
+/*
+ * Search the tree of cpu_groups for the lowest or highest loaded CPU.
+ * These routines actually compare the load on all paths through the tree
+ * and find the least loaded cpu on the least loaded path, which may differ
+ * from the least loaded cpu in the system. This balances work among caches
+ * and buses.
+ */
+static int
+cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
+ struct cpu_search_res *r)
+{
+ struct cpu_search_res lr;
struct tdq *tdq;
- int cpu, i, hload, lload, load, total, rnd;
+ int c, bload, l, load, total;
total = 0;
- cpumask = cg->cg_mask;
- if (match & CPU_SEARCH_LOWEST) {
- lload = INT_MAX;
- lgroup = *low;
- }
- if (match & CPU_SEARCH_HIGHEST) {
- hload = INT_MIN;
- hgroup = *high;
- }
+ bload = INT_MAX;
+ r->cs_cpu = -1;
- /* Iterate through the child CPU groups and then remaining CPUs. */
- for (i = cg->cg_children, cpu = mp_maxid; ; ) {
- if (i == 0) {
-#ifdef HAVE_INLINE_FFSL
- cpu = CPU_FFS(&cpumask) - 1;
-#else
- while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
- cpu--;
-#endif
- if (cpu < 0)
- break;
- child = NULL;
- } else
- child = &cg->cg_child[i - 1];
-
- if (match & CPU_SEARCH_LOWEST)
- lgroup.cs_cpu = -1;
- if (match & CPU_SEARCH_HIGHEST)
- hgroup.cs_cpu = -1;
- if (child) { /* Handle child CPU group. */
- CPU_ANDNOT(&cpumask, &child->cg_mask);
- switch (match) {
- case CPU_SEARCH_LOWEST:
- load = cpu_search_lowest(child, &lgroup);
- break;
- case CPU_SEARCH_HIGHEST:
- load = cpu_search_highest(child, &hgroup);
- break;
- case CPU_SEARCH_BOTH:
- load = cpu_search_both(child, &lgroup, &hgroup);
- break;
- }
- } else { /* Handle child CPU. */
- CPU_CLR(cpu, &cpumask);
- tdq = TDQ_CPU(cpu);
- load = tdq->tdq_load * 256;
- rnd = sched_random() % 32;
- if (match & CPU_SEARCH_LOWEST) {
- if (cpu == low->cs_prefer)
- load -= 64;
- /* If that CPU is allowed and get data. */
- if (tdq->tdq_lowpri > lgroup.cs_pri &&
- tdq->tdq_load <= lgroup.cs_limit &&
- CPU_ISSET(cpu, &lgroup.cs_mask)) {
- lgroup.cs_cpu = cpu;
- lgroup.cs_load = load - rnd;
- }
+ /* Loop through children CPU groups if there are any. */
+ if (cg->cg_children > 0) {
+ for (c = cg->cg_children - 1; c >= 0; c--) {
+ load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
+ total += load;
+ if (lr.cs_cpu >= 0 && (load < bload ||
+ (load == bload && lr.cs_load < r->cs_load))) {
+ bload = load;
+ r->cs_cpu = lr.cs_cpu;
+ r->cs_load = lr.cs_load;
}
- if (match & CPU_SEARCH_HIGHEST)
- if (tdq->tdq_load >= hgroup.cs_limit &&
- tdq->tdq_transferable &&
- CPU_ISSET(cpu, &hgroup.cs_mask)) {
- hgroup.cs_cpu = cpu;
- hgroup.cs_load = load - rnd;
- }
}
- total += load;
+ return (total);
+ }
- /* We have info about child item. Compare it. */
- if (match & CPU_SEARCH_LOWEST) {
- if (lgroup.cs_cpu >= 0 &&
- (load < lload ||
- (load == lload && lgroup.cs_load < low->cs_load))) {
- lload = load;
- low->cs_cpu = lgroup.cs_cpu;
- low->cs_load = lgroup.cs_load;
- }
- }
- if (match & CPU_SEARCH_HIGHEST)
- if (hgroup.cs_cpu >= 0 &&
- (load > hload ||
- (load == hload && hgroup.cs_load > high->cs_load))) {
- hload = load;
- high->cs_cpu = hgroup.cs_cpu;
- high->cs_load = hgroup.cs_load;
- }
- if (child) {
- i--;
- if (i == 0 && CPU_EMPTY(&cpumask))
- break;
+ /* Loop through children CPUs otherwise. */
+ for (c = cg->cg_last; c >= cg->cg_first; c--) {
+ if (!CPU_ISSET(c, &cg->cg_mask))
+ continue;
+ tdq = TDQ_CPU(c);
+ l = tdq->tdq_load;
+ load = l * 256;
+ if (c == s->cs_prefer)
+ load -= 128;
+ total += load;
+ if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri ||
+ !CPU_ISSET(c, s->cs_mask))
+ continue;
+ load -= sched_random() % 128;
+ if (load < bload) {
+ bload = load;
+ r->cs_cpu = c;
}
-#ifndef HAVE_INLINE_FFSL
- else
- cpu--;
-#endif
}
+ r->cs_load = bload;
return (total);
}
-/*
- * cpu_search instantiations must pass constants to maintain the inline
- * optimization.
- */
-int
-cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
+static int
+cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
+ struct cpu_search_res *r)
{
- return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
-}
+ struct cpu_search_res lr;
+ struct tdq *tdq;
+ int c, bload, l, load, total;
-int
-cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
-{
- return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
-}
+ total = 0;
+ bload = INT_MIN;
+ r->cs_cpu = -1;
-int
-cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
- struct cpu_search *high)
-{
- return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+ /* Loop through children CPU groups if there are any. */
+ if (cg->cg_children > 0) {
+ for (c = cg->cg_children - 1; c >= 0; c--) {
+ load = cpu_search_highest(&cg->cg_child[c], s, &lr);
+ total += load;
+ if (lr.cs_cpu >= 0 && (load > bload ||
+ (load == bload && lr.cs_load > r->cs_load))) {
+ bload = load;
+ r->cs_cpu = lr.cs_cpu;
+ r->cs_load = lr.cs_load;
+ }
+ }
+ return (total);
+ }
+
+ /* Loop through children CPUs otherwise. */
+ for (c = cg->cg_last; c >= cg->cg_first; c--) {
+ if (!CPU_ISSET(c, &cg->cg_mask))
+ continue;
+ tdq = TDQ_CPU(c);
+ l = tdq->tdq_load;
+ load = l * 256;
+ total += load;
+ if (l < s->cs_limit || !tdq->tdq_transferable ||
+ !CPU_ISSET(c, s->cs_mask))
+ continue;
+ load -= sched_random() % 128;
+ if (load > bload) {
+ bload = load;
+ r->cs_cpu = c;
+ }
+ }
+ r->cs_load = bload;
+ return (total);
}
/*
@@ -803,33 +753,33 @@ cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
* acceptable.
*/
static inline int
-sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
int prefer)
{
- struct cpu_search low;
+ struct cpu_search s;
+ struct cpu_search_res r;
- low.cs_cpu = -1;
- low.cs_prefer = prefer;
- low.cs_mask = mask;
- low.cs_pri = pri;
- low.cs_limit = maxload;
- cpu_search_lowest(cg, &low);
- return low.cs_cpu;
+ s.cs_prefer = prefer;
+ s.cs_mask = mask;
+ s.cs_pri = pri;
+ s.cs_limit = maxload;
+ cpu_search_lowest(cg, &s, &r);
+ return (r.cs_cpu);
}
/*
* Find the cpu with the highest load via the highest loaded path.
*/
static inline int
-sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload)
{
- struct cpu_search high;
+ struct cpu_search s;
+ struct cpu_search_res r;
- high.cs_cpu = -1;
- high.cs_mask = mask;
- high.cs_limit = minload;
- cpu_search_highest(cg, &high);
- return high.cs_cpu;
+ s.cs_mask = mask;
+ s.cs_limit = minload;
+ cpu_search_highest(cg, &s, &r);
+ return (r.cs_cpu);
}
static void
@@ -841,7 +791,7 @@ sched_balance_group(struct cpu_group *cg)
CPU_FILL(&hmask);
for (;;) {
- high = sched_highest(cg, hmask, 2);
+ high = sched_highest(cg, &hmask, 2);
/* Stop if there is no more CPU with transferrable threads. */
if (high == -1)
break;
@@ -853,7 +803,7 @@ sched_balance_group(struct cpu_group *cg)
anylow = 1;
tdq = TDQ_CPU(high);
nextlow:
- low = sched_lowest(cg, lmask, -1, tdq->tdq_load - 1, high);
+ low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high);
/* Stop if we looked well and found no less loaded CPU. */
if (anylow && low == -1)
break;
@@ -995,7 +945,7 @@ tdq_idled(struct tdq *tdq)
restart:
switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
for (cg = tdq->tdq_cg; ; ) {
- cpu = sched_highest(cg, mask, steal_thresh);
+ cpu = sched_highest(cg, &mask, steal_thresh);
/*
* We were assigned a thread but not preempted. Returning
* 0 here will cause our caller to switch to it.
@@ -1244,7 +1194,7 @@ sched_pickcpu(struct thread *td, int flags)
struct cpu_group *cg, *ccg;
struct td_sched *ts;
struct tdq *tdq;
- cpuset_t mask;
+ cpuset_t *mask;
int cpu, pri, self, intr;
self = PCPU_GET(cpuid);
@@ -1287,14 +1237,14 @@ sched_pickcpu(struct thread *td, int flags)
SCHED_AFFINITY(ts, CG_SHARE_L2)) {
if (cg->cg_flags & CG_FLAG_THREAD) {
/* Check all SMT threads for being idle. */
- for (cpu = CPU_FFS(&cg->cg_mask) - 1; ; cpu++) {
+ for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) {
if (CPU_ISSET(cpu, &cg->cg_mask) &&
TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
break;
- if (cpu >= mp_maxid) {
- SCHED_STAT_INC(pickcpu_idle_affinity);
- return (ts->ts_cpu);
- }
+ }
+ if (cpu > cg->cg_last) {
+ SCHED_STAT_INC(pickcpu_idle_affinity);
+ return (ts->ts_cpu);
}
} else {
SCHED_STAT_INC(pickcpu_idle_affinity);
@@ -1321,7 +1271,7 @@ llc:
if (ccg == cpu_top)
ccg = NULL;
cpu = -1;
- mask = td->td_cpuset->cs_mask;
+ mask = &td->td_cpuset->cs_mask;
pri = td->td_priority;
/*
* Try hard to keep interrupts within found LLC. Search the LLC for
@@ -1931,7 +1881,7 @@ tdq_trysteal(struct tdq *tdq)
spinlock_enter();
TDQ_UNLOCK(tdq);
for (i = 1, cg = tdq->tdq_cg; ; ) {
- cpu = sched_highest(cg, mask, steal_thresh);
+ cpu = sched_highest(cg, &mask, steal_thresh);
/*
* If a thread was added while interrupts were disabled don't
* steal one here.
@@ -3002,7 +2952,7 @@ sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
first = TRUE;
- for (i = 0; i < MAXCPU; i++) {
+ for (i = cg->cg_first; i <= cg->cg_last; i++) {
if (CPU_ISSET(i, &cg->cg_mask)) {
if (!first)
sbuf_printf(sb, ", ");