Diffstat (limited to 'sys/contrib/openzfs/module/zfs/spa.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/spa.c | 191
1 file changed, 140 insertions(+), 51 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 96daf51b696a..ec2b674fb7ee 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -208,7 +208,7 @@ static const uint_t zio_taskq_basedc = 80;	/* base duty cycle */
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
-static uint_t	zio_taskq_wr_iss_ncpus = 0;
+static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
@@ -1067,17 +1067,16 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 	case ZTI_MODE_SYNC:
 
 		/*
-		 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus',
-		 * not to exceed the number of spa allocators.
+		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
+		 * not to exceed the number of spa allocators, and align to it.
 		 */
-		if (zio_taskq_wr_iss_ncpus == 0) {
-			count = MAX(boot_ncpus / spa->spa_alloc_count, 1);
-		} else {
-			count = MAX(1,
-			    boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus));
-		}
+		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
+		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
+		while (spa->spa_alloc_count % count != 0 &&
+		    spa->spa_alloc_count < count * 2)
+			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
@@ -1495,15 +1494,11 @@ spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
-	if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
-	    (zio != NULL) && (zio->io_wr_iss_tq != NULL)) {
-		/* dispatch to assigned write issue taskq */
-		tq = zio->io_wr_iss_tq;
-		return (tq);
-	}
-
 	if (tqs->stqs_count == 1) {
 		tq = tqs->stqs_taskq[0];
+	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
+	    (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) {
+		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
 	} else {
 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
 	}
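The rewritten ZTI_MODE_SYNC sizing above budgets one write issue taskq per zio_taskq_write_tpq CPUs (after zio_taskq_batch_pct scaling), caps the count at spa_alloc_count, and then shrinks it until the allocators spread evenly across the taskqs. A minimal userland sketch of that arithmetic follows; sync_taskq_count and its parameters are hypothetical stand-ins for the kernel variables, not the actual spa_taskqs_init() code:

	#include <stdio.h>

	#define	MAX(a, b)	((a) > (b) ? (a) : (b))
	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	static unsigned
	sync_taskq_count(unsigned boot_ncpus, unsigned batch_pct,
	    unsigned write_tpq, unsigned alloc_count)
	{
		/* One taskq per write_tpq CPUs, on the batch_pct-scaled CPU count. */
		unsigned cpus = MAX(1, boot_ncpus * batch_pct / 100);
		unsigned count = MAX(1, cpus / MAX(1, write_tpq));

		count = MAX(count, (batch_pct + 99) / 100);
		count = MIN(count, alloc_count);
		/* Walk down until alloc_count divides evenly across the taskqs. */
		while (alloc_count % count != 0 && alloc_count < count * 2)
			count--;
		return (count);
	}

	int
	main(void)
	{
		/* 64 CPUs at 80%: 51 usable CPUs, 3 taskqs, aligned down to 2. */
		printf("%u taskqs\n", sync_taskq_count(64, 80, 16, 4));
		return (0);
	}

With four allocators and two taskqs, the new else-if branch in spa_taskq_dispatch_select() then maps each allocator to a fixed taskq via io_allocator % stqs_count, which is why an evenly dividing count matters.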
@@ -3594,11 +3589,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 }
 
 /*
- * Perform the import activity check. If the user canceled the import or
- * we detected activity then fail.
+ * Remote host activity check.
+ *
+ * error results:
+ *          0 - no activity detected
+ *  EREMOTEIO - remote activity detected
+ *      EINTR - user canceled the operation
  */
 static int
-spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
+    boolean_t importing)
 {
 	uint64_t txg = ub->ub_txg;
 	uint64_t timestamp = ub->ub_timestamp;
@@ -3643,19 +3643,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
 
 	import_expire = gethrtime() + import_delay;
 
-	spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
-	    "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+	if (importing) {
+		spa_import_progress_set_notes(spa, "Checking MMP activity, "
+		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
+	}
 
-	int interations = 0;
+	int iterations = 0;
 	while ((now = gethrtime()) < import_expire) {
-		if (interations++ % 30 == 0) {
+		if (importing && iterations++ % 30 == 0) {
 			spa_import_progress_set_notes(spa, "Checking MMP "
 			    "activity, %llu ms remaining",
 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
 		}
 
-		(void) spa_import_progress_set_mmp_check(spa_guid(spa),
-		    NSEC2SEC(import_expire - gethrtime()));
+		if (importing) {
+			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
+			    NSEC2SEC(import_expire - gethrtime()));
+		}
 
 		vdev_uberblock_load(rvd, ub, &mmp_label);
@@ -3737,6 +3741,61 @@ out:
 	return (error);
 }
 
+/*
+ * Called from zfs_ioc_clear for a pool that was suspended
+ * after failing mmp write checks.
+ */
+boolean_t
+spa_mmp_remote_host_activity(spa_t *spa)
+{
+	ASSERT(spa_multihost(spa) && spa_suspended(spa));
+
+	nvlist_t *best_label;
+	uberblock_t best_ub;
+
+	/*
+	 * Locate the best uberblock on disk
+	 */
+	vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
+	if (best_label) {
+		/*
+		 * confirm that the best hostid matches our hostid
+		 */
+		if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
+		    spa_get_hostid(spa) !=
+		    fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
+			nvlist_free(best_label);
+			return (B_TRUE);
+		}
+		nvlist_free(best_label);
+	} else {
+		return (B_TRUE);
+	}
+
+	if (!MMP_VALID(&best_ub) ||
+	    !MMP_FAIL_INT_VALID(&best_ub) ||
+	    MMP_FAIL_INT(&best_ub) == 0) {
+		return (B_TRUE);
+	}
+
+	if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
+	    best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
+		zfs_dbgmsg("txg mismatch detected during pool clear "
+		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
+		    (u_longlong_t)spa->spa_uberblock.ub_txg,
+		    (u_longlong_t)best_ub.ub_txg,
+		    (u_longlong_t)spa->spa_uberblock.ub_timestamp,
+		    (u_longlong_t)best_ub.ub_timestamp);
+		return (B_TRUE);
+	}
+
+	/*
+	 * Perform an activity check looking for any remote writer
+	 */
+	return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
+	    B_FALSE) != 0);
+}
+
 static int
 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 {
@@ -4063,7 +4122,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE,
 			    EREMOTEIO));
 		}
 
-		int error = spa_activity_check(spa, ub, spa->spa_config);
+		int error =
+		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
 		if (error) {
 			nvlist_free(label);
 			return (error);
 		}
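spa_activity_check() now serves two callers: pool import (importing == B_TRUE, with progress notes) and the new suspended-pool clear path via spa_mmp_remote_host_activity() (importing == B_FALSE). A small sketch of how a caller distinguishes the three documented results; check_activity() is a hypothetical stub, and the EREMOTEIO fallback definition is only for platforms whose userland <errno.h> lacks it:

	#include <errno.h>
	#include <stdio.h>

	#ifndef EREMOTEIO
	#define	EREMOTEIO	121	/* fallback; matches the Linux value */
	#endif

	/* Hypothetical stand-in returning one of the documented results. */
	static int
	check_activity(void)
	{
		return (EREMOTEIO);
	}

	int
	main(void)
	{
		switch (check_activity()) {
		case 0:
			printf("no activity: safe to import/clear\n");
			break;
		case EREMOTEIO:
			printf("remote activity: keep the pool suspended\n");
			break;
		case EINTR:
			printf("user canceled the operation\n");
			break;
		}
		return (0);
	}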
@@ -8771,15 +8831,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 }
 
 static void
-spa_async_probe(spa_t *spa, vdev_t *vd)
+spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
 {
-	if (vd->vdev_probe_wanted) {
-		vd->vdev_probe_wanted = B_FALSE;
-		vdev_reopen(vd);	/* vdev_open() does the actual probe */
+	if (vd->vdev_fault_wanted) {
+		vd->vdev_fault_wanted = B_FALSE;
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_ERR_EXCEEDED);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
-		spa_async_probe(spa, vd->vdev_child[c]);
+		spa_async_fault_vdev(spa, vd->vdev_child[c]);
 }
 
 static void
@@ -8867,11 +8928,11 @@ spa_async_thread(void *arg)
 	}
 
 	/*
-	 * See if any devices need to be probed.
+	 * See if any devices need to be marked faulted.
 	 */
-	if (tasks & SPA_ASYNC_PROBE) {
+	if (tasks & SPA_ASYNC_FAULT_VDEV) {
 		spa_vdev_state_enter(spa, SCL_NONE);
-		spa_async_probe(spa, spa->spa_root_vdev);
+		spa_async_fault_vdev(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
@@ -10167,16 +10228,10 @@ spa_sync_tq_create(spa_t *spa, const char *name)
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
-	spa_taskqs_t *tqs =
-	    &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE];
-
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
-	for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) {
+	for (int i = 0; i < nthreads; i++, ti++) {
 		ti->sti_thread = kthreads[i];
-		if (w == tqs->stqs_count) {
-			w = 0;
-		}
-		ti->sti_wr_iss_tq = tqs->stqs_taskq[w];
+		ti->sti_allocator = i;
 	}
 
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
@@ -10195,6 +10250,42 @@ spa_sync_tq_destroy(spa_t *spa)
 	spa->spa_sync_tq = NULL;
 }
 
+uint_t
+spa_acq_allocator(spa_t *spa)
+{
+	int i;
+
+	if (spa->spa_alloc_count == 1)
+		return (0);
+
+	mutex_enter(&spa->spa_allocs_use->sau_lock);
+	uint_t r = spa->spa_allocs_use->sau_rotor;
+	do {
+		if (++r == spa->spa_alloc_count)
+			r = 0;
+	} while (spa->spa_allocs_use->sau_inuse[r]);
+	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
+	spa->spa_allocs_use->sau_rotor = r;
+	mutex_exit(&spa->spa_allocs_use->sau_lock);
+
+	spa_syncthread_info_t *ti = spa->spa_syncthreads;
+	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
+		if (ti->sti_thread == curthread) {
+			ti->sti_allocator = r;
+			break;
+		}
+	}
+	ASSERT3S(i, <, spa->spa_alloc_count);
+	return (r);
+}
+
+void
+spa_rel_allocator(spa_t *spa, uint_t allocator)
+{
+	if (spa->spa_alloc_count > 1)
+		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
+}
+
 void
 spa_select_allocator(zio_t *zio)
 {
@@ -10222,8 +10313,7 @@ spa_select_allocator(zio_t *zio)
 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
 	for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
 		if (ti->sti_thread == curthread) {
-			zio->io_allocator = i;
-			zio->io_wr_iss_tq = ti->sti_wr_iss_tq;
+			zio->io_allocator = ti->sti_allocator;
 			return;
 		}
 	}
@@ -10240,7 +10330,6 @@ spa_select_allocator(zio_t *zio)
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
-	zio->io_wr_iss_tq = NULL;
 }
 
 /*
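spa_acq_allocator() above introduces a rotor: each syncthread scans forward from the last handed-out slot for a free allocator, marks it in use, and records where it stopped so the next caller starts past it. A userland sketch of that pattern, using pthreads and hypothetical names (acq_allocator/rel_allocator) in place of the kernel mutexes and spa state:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define	ALLOC_COUNT	4

	static pthread_mutex_t sau_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned sau_rotor;
	static bool sau_inuse[ALLOC_COUNT];

	static unsigned
	acq_allocator(void)
	{
		pthread_mutex_lock(&sau_lock);
		unsigned r = sau_rotor;
		do {
			if (++r == ALLOC_COUNT)
				r = 0;
		} while (sau_inuse[r]);	/* terminates while a slot is free */
		sau_inuse[r] = true;
		sau_rotor = r;
		pthread_mutex_unlock(&sau_lock);
		return (r);
	}

	static void
	rel_allocator(unsigned r)
	{
		pthread_mutex_lock(&sau_lock);
		sau_inuse[r] = false;
		pthread_mutex_unlock(&sau_lock);
	}

	int
	main(void)
	{
		unsigned a = acq_allocator();
		unsigned b = acq_allocator();
		printf("acquired %u then %u\n", a, b);	/* prints 1 then 2 */
		rel_allocator(a);
		rel_allocator(b);
		return (0);
	}

The scan cannot spin forever in the kernel because at most spa_alloc_count syncthreads hold allocators at once, and the sketch relies on the same invariant. The sketch locks on release where the kernel's spa_rel_allocator() does not, presumably because a racing scanner that still sees B_TRUE merely skips that slot.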
@@ -10811,10 +10900,10 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run an IO worker thread");
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
 	"Number of threads per IO worker taskqueue");
 
 /* BEGIN CSTYLED */
@@ -10845,13 +10934,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
-	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD,
+	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
-	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD,
+	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
 	"Configure IO queues for write IO");
 #endif
 /* END CSTYLED */
 
-ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW,
-	"Number of CPUs to run write issue taskqs");
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
+	"Number of CPUs per write issue taskq");
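With these parameters relaxed from ZMOD_RD to ZMOD_RW they become tunable at runtime rather than only at module load. Going by the usual ZFS_MODULE_PARAM naming convention (an inference, not shown in this diff), zio_taskq_write_tpq should surface as the vfs.zfs.zio.taskq_write_tpq sysctl on FreeBSD and as /sys/module/zfs/parameters/zio_taskq_write_tpq on Linux.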