Diffstat (limited to 'sys/contrib/openzfs/module/zfs/spa.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c | 191
 1 file changed, 140 insertions(+), 51 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 96daf51b696a..ec2b674fb7ee 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80; /* base duty cycle */
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif
-static uint_t zio_taskq_wr_iss_ncpus = 0;
+static uint_t zio_taskq_write_tpq = 16;
/*
* Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
case ZTI_MODE_SYNC:
/*
- * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
- * not to exceed the number of spa allocators.
+ * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+ * not to exceed the number of spa allocators, and align to it.
*/
- if (zio_taskq_wr_iss_ncpus == 0) {
- count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
- } else {
- count = MAX(1,
- boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
- }
+ cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+ count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
count = MIN(count, spa->spa_alloc_count);
+ while (spa->spa_alloc_count % count != 0 &&
+ spa->spa_alloc_count < count * 2)
+ count--;
/*
* zio_taskq_batch_pct is unbounded and may exceed 100%, but no
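The sizing rule in the ZTI_MODE_SYNC hunk above can be read in isolation. The following hypothetical standalone sketch (plain C, not ZFS code; the function name wr_iss_taskq_count and the sample values are invented) mirrors it: one write-issue taskq per zio_taskq_write_tpq CPUs, scaled by zio_taskq_batch_pct, clamped to the allocator count, then stepped down until the allocator count divides evenly.

#include <stdio.h>

/*
 * Hypothetical sketch of the ZTI_MODE_SYNC sizing above (not ZFS code).
 * One write-issue taskq per 'tpq' CPUs, never more than the number of
 * allocators, stepped down so the allocator count divides evenly.
 */
static unsigned
wr_iss_taskq_count(unsigned ncpus, unsigned batch_pct, unsigned tpq,
    unsigned alloc_count)
{
	unsigned cpus = ncpus * batch_pct / 100;
	if (cpus < 1)
		cpus = 1;
	unsigned count = cpus / (tpq > 0 ? tpq : 1);
	if (count < (batch_pct + 99) / 100)
		count = (batch_pct + 99) / 100;
	if (count < 1)
		count = 1;
	if (count > alloc_count)
		count = alloc_count;
	while (alloc_count % count != 0 && alloc_count < count * 2)
		count--;
	return (count);
}

int
main(void)
{
	/* 48 CPUs, 100% duty, 16 CPUs/taskq, 4 allocators: 3, aligned to 2 */
	printf("%u\n", wr_iss_taskq_count(48, 100, 16, 4));
	return (0);
}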
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);
- if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
- (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
- /* dispatch to assigned write issue taskq */
- tq = zio->io_wr_iss_tq;
- return (tq);
- }
-
if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
+ } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+ (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+ tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
} else {
tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
}
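A small hypothetical demo (not part of the patch) of the new dispatch rule above: once the taskq count has been aligned to the allocator count, io_allocator % stqs_count pins each allocator to a fixed write-issue taskq and spreads allocators evenly across them.

#include <stdio.h>

/*
 * Hypothetical demo (not ZFS code): with 4 allocators and 2 aligned
 * write-issue taskqs, "allocator % count" maps allocators 0,2 to taskq 0
 * and 1,3 to taskq 1, so each allocator always lands on the same queue.
 */
int
main(void)
{
	unsigned alloc_count = 4, tq_count = 2;

	for (unsigned a = 0; a < alloc_count; a++)
		printf("allocator %u -> write issue taskq %u\n",
		    a, a % tq_count);
	return (0);
}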
@@ -3594,11 +3589,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ * 0 - no activity detected
+ * EREMOTEIO - remote activity detected
+ * EINTR - user canceled the operation
*/
static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+ boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
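As a caller-side illustration of the three result codes documented in the comment above, here is a hypothetical standalone sketch (plain C, not from the patch; activity_result is an invented helper, and the EREMOTEIO fallback define is only for platforms whose errno.h lacks it).

#include <errno.h>
#include <stdio.h>

#ifndef EREMOTEIO
#define	EREMOTEIO	121	/* fallback where errno.h lacks it */
#endif

/* Map the documented activity-check results to an action (illustrative). */
static const char *
activity_result(int error)
{
	switch (error) {
	case 0:
		return ("no remote activity: proceed");
	case EREMOTEIO:
		return ("remote activity detected: fail");
	case EINTR:
		return ("user canceled: abort");
	default:
		return ("unexpected error");
	}
}

int
main(void)
{
	printf("%s\n", activity_result(EREMOTEIO));
	return (0);
}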
@@ -3643,19 +3643,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
- spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
- "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ if (importing) {
+ spa_import_progress_set_notes(spa, "Checking MMP activity, "
+ "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+ }
- int interations = 0;
+ int iterations = 0;
while ((now = gethrtime()) < import_expire) {
- if (interations++ % 30 == 0) {
+ if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
- (void) spa_import_progress_set_mmp_check(spa_guid(spa),
- NSEC2SEC(import_expire - gethrtime()));
+ if (importing) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+ }
vdev_uberblock_load(rvd, ub, &mmp_label);
@@ -3737,6 +3741,61 @@ out:
return (error);
}
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+ ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+ nvlist_t *best_label;
+ uberblock_t best_ub;
+
+ /*
+ * Locate the best uberblock on disk
+ */
+ vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+ if (best_label) {
+ /*
+ * confirm that the best hostid matches our hostid
+ */
+ if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+ spa_get_hostid(spa) !=
+ fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+ nvlist_free(best_label);
+ return (B_TRUE);
+ }
+ nvlist_free(best_label);
+ } else {
+ return (B_TRUE);
+ }
+
+ if (!MMP_VALID(&best_ub) ||
+ !MMP_FAIL_INT_VALID(&best_ub) ||
+ MMP_FAIL_INT(&best_ub) == 0) {
+ return (B_TRUE);
+ }
+
+ if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+ best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+ zfs_dbgmsg("txg mismatch detected during pool clear "
+ "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+ (u_longlong_t)spa->spa_uberblock.ub_txg,
+ (u_longlong_t)best_ub.ub_txg,
+ (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+ (u_longlong_t)best_ub.ub_timestamp);
+ return (B_TRUE);
+ }
+
+ /*
+ * Perform an activity check looking for any remote writer
+ */
+ return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+ B_FALSE) != 0);
+}
+
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@@ -4063,7 +4122,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
- int error = spa_activity_check(spa, ub, spa->spa_config);
+ int error =
+ spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@@ -8771,15 +8831,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
- if (vd->vdev_probe_wanted) {
- vd->vdev_probe_wanted = B_FALSE;
- vdev_reopen(vd); /* vdev_open() does the actual probe */
+ if (vd->vdev_fault_wanted) {
+ vd->vdev_fault_wanted = B_FALSE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
- spa_async_probe(spa, vd->vdev_child[c]);
+ spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@@ -8867,11 +8928,11 @@ spa_async_thread(void *arg)
}
/*
- * See if any devices need to be probed.
+ * See if any devices need to be marked faulted.
*/
- if (tasks & SPA_ASYNC_PROBE) {
+ if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_probe(spa, spa->spa_root_vdev);
+ spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}
@@ -10167,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
VERIFY(spa->spa_sync_tq != NULL);
VERIFY(kthreads != NULL);
- spa_taskqs_t *tqs =
- &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
spa_syncthread_info_t *ti = spa->spa_syncthreads;
- for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+ for (int i = 0; i < nthreads; i++, ti++) {
ti->sti_thread = kthreads[i];
- if (w == tqs->stqs_count) {
- w = 0;
- }
- ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+ ti->sti_allocator = i;
}
kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10195,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
spa->spa_sync_tq = NULL;
}
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+ int i;
+
+ if (spa->spa_alloc_count == 1)
+ return (0);
+
+ mutex_enter(&spa->spa_allocs_use->sau_lock);
+ uint_t r = spa->spa_allocs_use->sau_rotor;
+ do {
+ if (++r == spa->spa_alloc_count)
+ r = 0;
+ } while (spa->spa_allocs_use->sau_inuse[r]);
+ spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+ spa->spa_allocs_use->sau_rotor = r;
+ mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+ spa_syncthread_info_t *ti = spa->spa_syncthreads;
+ for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+ if (ti->sti_thread == curthread) {
+ ti->sti_allocator = r;
+ break;
+ }
+ }
+ ASSERT3S(i, <, spa->spa_alloc_count);
+ return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+ if (spa->spa_alloc_count > 1)
+ spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
void
spa_select_allocator(zio_t *zio)
{
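The spa_acq_allocator()/spa_rel_allocator() pair added above rotates sync threads across allocator slots. The following hypothetical user-space sketch (pthreads, invented names acq_slot/rel_slot; not ZFS code) shows the same rotor-plus-inuse-array idea in isolation; as in the kernel code, the caller must guarantee there are never more concurrent users than slots, or the scan would spin.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical standalone sketch (not ZFS code) of the rotor scheme used
 * by spa_acq_allocator() above: scan forward from the last slot handed
 * out, skip slots still in use, and claim the first free one under a lock.
 */
#define	NSLOTS	4

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned rotor;
static bool inuse[NSLOTS];

static unsigned
acq_slot(void)
{
	pthread_mutex_lock(&slot_lock);
	unsigned r = rotor;
	do {
		if (++r == NSLOTS)
			r = 0;
	} while (inuse[r]);
	inuse[r] = true;
	rotor = r;
	pthread_mutex_unlock(&slot_lock);
	return (r);
}

static void
rel_slot(unsigned r)
{
	/* Mirrors the patch: release is a plain store by the slot's owner. */
	inuse[r] = false;
}

int
main(void)
{
	unsigned a = acq_slot();	/* 1: rotor starts at 0, advances first */
	unsigned b = acq_slot();	/* 2 */
	printf("acquired %u then %u\n", a, b);
	rel_slot(a);
	rel_slot(b);
	return (0);
}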
@@ -10222,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
spa_syncthread_info_t *ti = spa->spa_syncthreads;
for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
if (ti->sti_thread == curthread) {
- zio->io_allocator = i;
- zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+ zio->io_allocator = ti->sti_allocator;
return;
}
}
@@ -10240,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
bm->zb_blkid >> 20);
zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
- zio->io_wr_iss_tq = NULL;
}
/*
@@ -10811,10 +10900,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
"Print vdev tree to zfs_dbgmsg during pool import");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run an IO worker thread");
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
"Number of threads per IO worker taskqueue");
/* BEGIN CSTYLED */
@@ -10845,13 +10934,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
- spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+ spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
- spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+ spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
"Configure IO queues for write IO");
#endif
/* END CSTYLED */
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
- "Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+ "Number of CPUs per write issue taskq");