author     Martin Matuska <mm@FreeBSD.org>    2023-11-09 10:42:33 +0000
committer  Martin Matuska <mm@FreeBSD.org>    2023-11-09 12:19:17 +0000
commit     e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree       3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/module/zfs
parent     f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent     887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
#15022 5caeef02f RAID-Z expansion feature
#15457 887a3c533 Increase L2ARC write rate and headroom
#15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384
Obtained from: OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
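
For orientation before the diffstat: the heart of the RAID-Z expansion work merged here (#15022) is a reflow that re-spreads already-allocated sectors across the widened child set using modulo arithmetic on the top-level vdev's sector index. The standalone sketch below is illustrative only — the helper name, struct, and demo layout are invented here — but the mapping it prints (child = sector % width, offset = (sector / width) << ashift) mirrors the per-row calculation visible later in the vdev_raidz.c hunk.

/*
 * Illustrative sketch only (not part of the patch): sector_to_child() and
 * the 4-wide -> 5-wide demo are invented here to show the modulo mapping
 * the reflow applies per row: child = sector % width,
 * offset = (sector / width) << ashift.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct loc {
	uint64_t child;		/* index of the child (leaf) vdev */
	uint64_t offset;	/* byte offset on that child */
};

static struct loc
sector_to_child(uint64_t sector, uint64_t width, uint64_t ashift)
{
	struct loc l;

	l.child = sector % width;
	l.offset = (sector / width) << ashift;
	return (l);
}

int
main(void)
{
	const uint64_t ashift = 12;	/* 4 KiB sectors */

	/* Old (4-wide) vs. new (5-wide) location of the first few sectors. */
	for (uint64_t s = 0; s < 10; s++) {
		struct loc o = sector_to_child(s, 4, ashift);
		struct loc n = sector_to_child(s, 5, ashift);

		printf("sector %2" PRIu64 ": old child %" PRIu64 "@%-6" PRIu64
		    " new child %" PRIu64 "@%" PRIu64 "\n",
		    s, o.child, o.offset, n.child, n.offset);
	}
	return (0);
}
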
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c              |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c         |    1
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c         |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c              |  240
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_checkpoint.c   |    3
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c             |  114
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c       |   28
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c  |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c       |   51
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c       | 2556
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c        |   17
11 files changed, 2817 insertions, 229 deletions
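
The first hunk below (arc.c) quadruples both L2ARC feed constants. As a rough back-of-the-envelope check — assuming the conventional relationship in which the headroom multiplier scales the per-pass write size to bound how far ahead of the L2ARC hand the ARC lists are scanned, which this diff itself does not spell out — the scan window grows from 16 MiB to 256 MiB per pass:

/*
 * Rough arithmetic only; the "write size x headroom" scan-window relation
 * is an assumption about how these two constants are conventionally
 * combined, not something this diff states.
 */
#include <stdio.h>

int
main(void)
{
	const unsigned long long old_write = 8ULL << 20;	/* 8 MiB */
	const unsigned long long old_headroom = 2;
	const unsigned long long new_write = 32ULL << 20;	/* 32 MiB */
	const unsigned long long new_headroom = 8;

	printf("old scan window per pass: %llu MiB\n",
	    (old_write * old_headroom) >> 20);		/* 16 MiB */
	printf("new scan window per pass: %llu MiB\n",
	    (new_write * new_headroom) >> 20);		/* 256 MiB */
	return (0);
}
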
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index dfea15b74394..2d08cc5e7240 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -776,8 +776,8 @@ uint64_t zfs_crc64_table[256]; * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost @@ -4518,7 +4518,7 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) static void arc_evict_cb(void *arg, zthr_t *zthr) { - (void) arg, (void) zthr; + (void) arg; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); @@ -4542,9 +4542,13 @@ arc_evict_cb(void *arg, zthr_t *zthr) * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. + * + * Note we cancel using zthr instead of arc_evict_zthr + * because the latter may not yet be initializd when the + * callback is first invoked. */ mutex_enter(&arc_evict_lock); - arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) && + arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 34012db82dee..e16128fdff87 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -3066,7 +3066,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index e0d4a6a63508..0983ba143a1d 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -4342,7 +4342,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - metaslab_class_get_alloc(spa_normal_class(spa)); - if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || + vd->vdev_rz_expanding) { defer_allowed = B_FALSE; } @@ -4650,6 +4651,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) ASSERT(MUTEX_HELD(&msp->ms_lock)); VERIFY(!msp->ms_condensing); VERIFY0(msp->ms_disabled); + VERIFY0(msp->ms_new); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -4721,10 +4723,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing or disabled, - * skip it. + * If the selected metaslab is condensing or disabled, or + * hasn't gone through a metaslab_sync_done(), then skip it. 
*/ - if (msp->ms_condensing || msp->ms_disabled > 0) + if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) continue; *was_active = msp->ms_allocator != -1; @@ -5270,7 +5272,7 @@ top: ASSERT(mg->mg_class == mc); - uint64_t asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); /* diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 68f367c1c744..20225640f8c5 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -63,6 +63,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -1709,6 +1710,10 @@ spa_destroy_aux_threads(spa_t *spa) zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } + if (spa->spa_raidz_expand_zthr != NULL) { + zthr_destroy(spa->spa_raidz_expand_zthr); + spa->spa_raidz_expand_zthr = NULL; + } } /* @@ -1861,6 +1866,8 @@ spa_unload(spa_t *spa) spa->spa_compatibility = NULL; } + spa->spa_raidz_expand = NULL; + spa_config_exit(spa, SCL_ALL, spa); } @@ -2999,6 +3006,7 @@ spa_spawn_aux_threads(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + spa_start_raidz_expansion_thread(spa); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); @@ -3753,6 +3761,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != 0) { + spa_load_note(spa, "uberblock raidz_reflow_info: " + "state=%u offset=%llu", + (int)RRSS_GET_STATE(ub), + (u_longlong_t)RRSS_GET_OFFSET(ub)); + } /* @@ -5092,6 +5106,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* + * Before we do any zio_write's, complete the raidz expansion + * scratch space copying, if necessary. + */ + if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) + vdev_raidz_reflow_copy_scratch(spa); + + /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ @@ -6905,9 +6926,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * Attach a device to a mirror. The arguments are the path to any device - * in the mirror, and the nvroot for the new device. If the path specifies - * a device that is not mirrored, we automatically insert the mirror vdev. + * Attach a device to a vdev specified by its guid. The vdev type can be + * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a + * single device). When the vdev is a single device, a mirror vdev will be + * automatically inserted. 
* * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own @@ -6930,7 +6952,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; - int newvd_isspare; + int newvd_isspare = B_FALSE; int error; ASSERT(spa_writeable(spa)); @@ -6961,16 +6983,35 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ZFS_ERR_REBUILD_IN_PROGRESS)); } - if (spa->spa_vdev_removal != NULL) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + if (spa->spa_vdev_removal != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_DEVRM_IN_PROGRESS)); + } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (!oldvd->vdev_ops->vdev_op_leaf) + boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; + + if (raidz) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * Can't expand a raidz while prior expand is in progress. + */ + if (spa->spa_raidz_expand != NULL) { + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + } + } else if (!oldvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + } - pvd = oldvd->vdev_parent; + if (raidz) + pvd = oldvd; + else + pvd = oldvd->vdev_parent; if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH) != 0) @@ -7026,6 +7067,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_raidz_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -7065,7 +7107,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; + if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -7076,31 +7119,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* + * RAIDZ-expansion-specific checks. + */ + if (raidz) { + if (vdev_raidz_attach_check(newvd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * Fail early if a child is not healthy or being replaced + */ + for (int i = 0; i < oldvd->vdev_children; i++) { + if (vdev_is_dead(oldvd->vdev_child[i]) || + !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { + return (spa_vdev_exit(spa, newrootvd, txg, + ENXIO)); + } + /* Also fail if reserved boot area is in-use */ + if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) + != 0) { + return (spa_vdev_exit(spa, newrootvd, txg, + EADDRINUSE)); + } + } + } + + if (raidz) { + /* + * Note: oldvdpath is freed by spa_strfree(), but + * kmem_asprintf() is freed by kmem_strfree(), so we have to + * move it to a spa_strdup-ed string. + */ + char *tmp = kmem_asprintf("raidz%u-%u", + (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); + oldvdpath = spa_strdup(tmp); + kmem_strfree(tmp); + } else { + oldvdpath = spa_strdup(oldvd->vdev_path); + } + newvdpath = spa_strdup(newvd->vdev_path); + + /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. 
*/ - if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + if (strcmp(oldvdpath, newvdpath) == 0) { spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, KM_SLEEP); - (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, - "%s/%s", newvd->vdev_path, "old"); + (void) sprintf(oldvd->vdev_path, "%s/old", + newvdpath); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } + spa_strfree(oldvdpath); + oldvdpath = spa_strdup(oldvd->vdev_path); } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ - if (pvd->vdev_ops != pvops) + if (!raidz && pvd->vdev_ops != pvops) { pvd = vdev_add_parent(oldvd, pvops); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + } ASSERT(pvd->vdev_top->vdev_parent == rvd); - ASSERT(pvd->vdev_ops == pvops); - ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. @@ -7128,41 +7214,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + if (raidz) { + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); - if (newvd->vdev_isspare) { - spa_spare_activate(newvd); - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); - } + vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(tvd); - oldvdpath = spa_strdup(oldvd->vdev_path); - newvdpath = spa_strdup(newvd->vdev_path); - newvd_isspare = newvd->vdev_isspare; + dtl_max_txg = spa_vdev_config_enter(spa); - /* - * Mark newvd's DTL dirty in this txg. - */ - vdev_dirty(tvd, VDD_DTL, newvd, txg); + tvd->vdev_rz_expanding = B_TRUE; - /* - * Schedule the resilver or rebuild to restart in the future. We do - * this to ensure that dmu_sync-ed blocks have been stitched into the - * respective datasets. - */ - if (rebuild) { - newvd->vdev_rebuild_txg = txg; + vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); + vdev_config_dirty(tvd); - vdev_rebuild(tvd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + dtl_max_txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, + newvd, tx); + dmu_tx_commit(tx); } else { - newvd->vdev_resilver_txg = txg; + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); - if (dsl_scan_resilvering(spa_get_dsl(spa)) && - spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - vdev_defer_resilver(newvd); + /* + * Schedule the resilver or rebuild to restart in the future. + * We do this to ensure that dmu_sync-ed blocks have been + * stitched into the respective datasets. 
+ */ + if (rebuild) { + newvd->vdev_rebuild_txg = txg; + + vdev_rebuild(tvd); } else { - dsl_scan_restart_resilver(spa->spa_dsl_pool, - dtl_max_txg); + newvd->vdev_resilver_txg = txg; + + if (dsl_scan_resilvering(spa_get_dsl(spa)) && + spa_feature_is_enabled(spa, + SPA_FEATURE_RESILVER_DEFER)) { + vdev_defer_resilver(newvd); + } else { + dsl_scan_restart_resilver(spa->spa_dsl_pool, + dtl_max_txg); + } } } @@ -7487,7 +7598,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || - vd->vdev_top->vdev_removing)) { + vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && @@ -7609,7 +7720,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && - (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { + (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && @@ -8512,6 +8624,10 @@ spa_async_suspend(spa_t *spa) if (condense_thread != NULL) zthr_cancel(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_cancel(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); @@ -8538,6 +8654,10 @@ spa_async_resume(spa_t *spa) if (condense_thread != NULL) zthr_resume(condense_thread); + zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; + if (raidz_expand_thread != NULL) + zthr_resume(raidz_expand_thread); + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); @@ -9343,6 +9463,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) != NULL) vdev_sync(vd, txg); + if (pass == 1) { + /* + * dsl_pool_sync() -> dp_sync_tasks may have dirtied + * the config. If that happens, this txg should not + * be a no-op. So we must sync the config to the MOS + * before checking for no-op. + * + * Note that when the config is dirty, it will + * be written to the MOS (i.e. the MOS will be + * dirtied) every time we call spa_sync_config_object() + * in this txg. Therefore we can't call this after + * dsl_pool_sync() every pass, because it would + * prevent us from converging, since we'd dirty + * the MOS every pass. + * + * Sync tasks can only be processed in pass 1, so + * there's no need to do this in later passes. 
+ */ + spa_sync_config_object(spa, tx); + } + /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock @@ -10100,7 +10241,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: - if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + *in_progress = vdev_rebuild_active(spa->spa_root_vdev); + if (*in_progress) break; zfs_fallthrough; case ZPOOL_WAIT_SCRUB: @@ -10115,6 +10257,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } + case ZPOOL_WAIT_RAIDZ_EXPAND: + { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); + break; + } default: panic("unrecognized value for activity %d", activity); } diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index b588f7041e5c..1efff47f87a0 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx) if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + if (spa->spa_raidz_expand != NULL) + return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); + if (spa->spa_checkpoint_txg != 0) return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index afb01c0ef7fd..c10c78ebf6db 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -58,6 +58,7 @@ #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> #include "zfs_prop.h" @@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. 
Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). 
+ */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index 307e2353d020..ec961255fd64 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc, * i.e. vdev_draid_psize_to_asize(). 
*/ static uint64_t -vdev_draid_asize(vdev_t *vd, uint64_t psize) +vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { + (void) txg; vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_ashift; @@ -960,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, vdev_draid_config_t *vdc = vd->vdev_tsd; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t io_size = abd_size; - uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t io_asize = vdev_draid_asize(vd, io_size, 0); uint64_t group = vdev_draid_offset_to_group(vd, io_offset); uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); @@ -1025,15 +1026,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, ASSERT3U(vdc->vdc_nparity, >, 0); - raidz_row_t *rr; - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); - rr->rr_cols = groupwidth; - rr->rr_scols = groupwidth; + raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth); rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = vdc->vdc_nparity; - rr->rr_abd_empty = NULL; #ifdef ZFS_DEBUG rr->rr_offset = io_offset; rr->rr_size = io_size; @@ -1053,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); rc->rc_offset = physical_offset; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (q == 0 && i >= bc) rc->rc_size = 0; @@ -1129,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio) if (size < abd_size) { vdev_t *vd = zio->io_vd; - io_offset += vdev_draid_asize(vd, size); + io_offset += vdev_draid_asize(vd, size, 0); abd_offset += size; abd_size -= size; nrows++; @@ -1151,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio) rm->rm_row[0] = rr[0]; if (nrows == 2) rm->rm_row[1] = rr[1]; - return (rm); } @@ -1783,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { uint64_t offset = DVA_GET_OFFSET(dva); - uint64_t asize = vdev_draid_asize(vd, psize); + uint64_t asize = vdev_draid_asize(vd, psize, 0); if (phys_birth == TXG_UNKNOWN) { /* @@ -1840,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_draid_asize(vd, rr->rr_size); + vdev_draid_asize(vd, rr->rr_size, 0); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index ffdcef1972c3..5aaef1a69986 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -48,7 +48,8 @@ static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } static void @@ -67,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + 
!vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; @@ -631,6 +633,7 @@ vdev_initialize(vdev_t *vd) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, @@ -791,13 +794,14 @@ vdev_initialize_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; - if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index a2e5524a8391..e8f562a1a6a2 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_raidz.h> #include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> @@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, sizeof (pcs) / sizeof (uint64_t)); } + + pool_raidz_expand_stat_t pres; + if (spa_raidz_expand_get_stats(spa, &pres) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres, + sizeof (pres) / sizeof (uint64_t)); + } } static void @@ -1504,7 +1512,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) } struct ubl_cbdata { - uberblock_t *ubl_ubbest; /* Best uberblock */ + uberblock_t ubl_latest; /* Most recent uberblock */ + uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */ vdev_t *ubl_vd; /* vdev associated with the above */ }; @@ -1521,6 +1530,9 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); + if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) { + cbp->ubl_latest = *ub; + } if (ub->ub_txg <= spa->spa_load_max_txg && vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { /* @@ -1578,10 +1590,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) ASSERT(config); memset(ub, 0, sizeof (uberblock_t)); + memset(&cb, 0, sizeof (cb)); *config = NULL; cb.ubl_ubbest = ub; - cb.ubl_vd = NULL; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); zio = zio_root(spa, NULL, &cb, flags); @@ -1598,6 +1610,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. 
" "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + if (ub->ub_raidz_reflow_info != + cb.ubl_latest.ub_raidz_reflow_info) { + vdev_dbgmsg(cb.ubl_vd, + "spa=%s best uberblock (txg=%llu info=0x%llx) " + "has different raidz_reflow_info than latest " + "uberblock (txg=%llu info=0x%llx)", + spa->spa_name, + (u_longlong_t)ub->ub_txg, + (u_longlong_t)ub->ub_raidz_reflow_info, + (u_longlong_t)cb.ubl_latest.ub_txg, + (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info); + memset(ub, 0, sizeof (uberblock_t)); + spa_config_exit(spa, SCL_ALL, FTAG); + return; + } + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); if (*config == NULL && spa->spa_extreme_rewind) { vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " @@ -1719,8 +1747,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, vd->vdev_copy_uberblocks = B_FALSE; } + /* + * We chose a slot based on the txg. If this uberblock has a special + * RAIDZ expansion state, then it is essentially an update of the + * current uberblock (it has the same txg). However, the current + * state is committed, so we want to write it to a different slot. If + * we overwrote the same slot, and we lose power during the uberblock + * write, and the disk does not do single-sector overwrites + * atomically (even though it is required to - i.e. we should see + * either the old or the new uberblock), then we could lose this + * txg's uberblock. Rewinding to the previous txg's uberblock may not + * be possible because RAIDZ expansion may have already overwritten + * some of the data, so we need the progress indicator in the + * uberblock. + */ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0; - int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) % + (VDEV_UBERBLOCK_COUNT(vd) - m); /* Copy the uberblock_t into the ABD */ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1737,7 +1780,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, } /* Sync the uberblocks to all vdevs in svd[] */ -static int +int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { spa_t *spa = svd[0]->vdev_spa; diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 3445fa9d35d5..9d0b8763f16f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -27,15 +27,22 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zap.h> #include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/dmu_tx.h> #include <sys/abd.h> +#include <sys/zfs_rlock.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> #include <sys/vdev_draid.h> +#include <sys/uberblock_impl.h> +#include <sys/dsl_scan.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -135,6 +142,237 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } + +/* + * Big Theory Statement for how a RAIDZ VDEV is expanded + * + * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion + * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs + * that have been previously expanded can be expanded again. + * + * The RAIDZ VDEV must be healthy (must be able to write to all the drives in + * the VDEV) when an expansion starts. 
And the expansion will pause if any + * disk in the VDEV fails, and resume once the VDEV is healthy again. All other + * operations on the pool can continue while an expansion is in progress (e.g. + * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, + * and zpool initialize which can't be run during an expansion. Following a + * reboot or export/import, the expansion resumes where it left off. + * + * == Reflowing the Data == + * + * The expansion involves reflowing (copying) the data from the current set + * of disks to spread it across the new set which now has one more disk. This + * reflow operation is similar to reflowing text when the column width of a + * text editor window is expanded. The text doesn’t change but the location of + * the text changes to accommodate the new width. An example reflow result for + * a 4-wide RAIDZ1 to a 5-wide is shown below. + * + * Reflow End State + * Each letter indicates a parity group (logical stripe) + * + * Before expansion After Expansion + * D1 D2 D3 D4 D1 D2 D3 D4 D5 + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | A | A | A | A | | A | A | A | A | B | + * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | B | B | C | C | | B | C | C | C | C | + * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | C | C | D | D | | D | D | E | E | E | + * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | E | E | E | E | --> | E | F | F | G | G | + * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | F | F | G | G | | G | G | H | H | H | + * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | G | G | H | H | | H | I | I | J | J | + * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| + * +------+------+------+------+ +------+------+------+------+------+ + * | | | | | | | | | | | + * | H | H | I | I | | J | J | | | K | + * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| + * +------+------+------+------+ +------+------+------+------+------+ + * + * This reflow approach has several advantages. There is no need to read or + * modify the block pointers or recompute any block checksums. The reflow + * doesn’t need to know where the parity sectors reside. We can read and write + * data sequentially and the copy can occur in a background thread in open + * context. The design also allows for fast discovery of what data to copy. + * + * The VDEV metaslabs are processed, one at a time, to copy the block data to + * have it flow across all the disks. The metaslab is disabled for allocations + * during the copy. As an optimization, we only copy the allocated data which + * can be determined by looking at the metaslab range tree. During the copy we + * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still + * need to be able to survive losing parity count disks). This means we + * cannot overwrite data during the reflow that would be needed if a disk is + * lost. + * + * After the reflow completes, all newly-written blocks will have the new + * layout, i.e., they will have the parity to data ratio implied by the new + * number of disks in the RAIDZ group. 
Even though the reflow copies all of + * the allocated space (data and parity), it is only rearranged, not changed. + * + * This act of reflowing the data has a few implications about blocks + * that were written before the reflow completes: + * + * - Old blocks will still use the same amount of space (i.e., they will have + * the parity to data ratio implied by the old number of disks in the RAIDZ + * group). + * - Reading old blocks will be slightly slower than before the reflow, for + * two reasons. First, we will have to read from all disks in the RAIDZ + * VDEV, rather than being able to skip the children that contain only + * parity of this block (because the data of a single block is now spread + * out across all the disks). Second, in most cases there will be an extra + * bcopy, needed to rearrange the data back to its original layout in memory. + * + * == Scratch Area == + * + * As we copy the block data, we can only progress to the point that writes + * will not overlap with blocks whose progress has not yet been recorded on + * disk. Since partially-copied rows are always read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent any + * row-wise overlap. For example, in the diagram above, when we reflow sector + * B6 it will overwite the original location for B5. + * + * To get around this, a scratch space is used so that we can start copying + * without risking data loss by overlapping the row. As an added benefit, it + * improves performance at the beginning of the reflow, but that small perf + * boost wouldn't be worth the complexity on its own. + * + * Ideally we want to copy at least 2 * (new_width)^2 so that we have a + * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max + * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice + * the widths will likely be single digits so we can get a substantial chuck + * size using only a few MB of scratch per disk. + * + * The scratch area is persisted to disk which holds a large amount of reflowed + * state. We can always read the partially written stripes when a disk fails or + * the copy is interrupted (crash) during the initial copying phase and also + * get past a small chunk size restriction. At a minimum, the scratch space + * must be large enough to get us to the point that one row does not overlap + * itself when moved (i.e new_width^2). But going larger is even better. We + * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels + * as our scratch space to handle overwriting the initial part of the VDEV. + * + * 0 256K 512K 4M + * +------+------+-----------------------+----------------------------- + * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... + * | L0 | L1 | Reserved | (Metaslabs) + * +------+------+-----------------------+------------------------------- + * Scratch Area + * + * == Reflow Progress Updates == + * After the initial scratch-based reflow, the expansion process works + * similarly to device removal. We create a new open context thread which + * reflows the data, and periodically kicks off sync tasks to update logical + * state. In this case, state is the committed progress (offset of next data + * to copy). We need to persist the completed offset on disk, so that if we + * crash we know which format each VDEV offset is in. + * + * == Time Dependent Geometry == + * + * In non-expanded RAIDZ, blocks are read from disk in a column by column + * fashion. 
For a multi-row block, the second sector is in the first column + * not in the second column. This allows us to issue full reads for each + * column directly into the request buffer. The block data is thus laid out + * sequentially in a column-by-column fashion. + * + * For example, in the before expansion diagram above, one logical block might + * be sectors G19-H26. The parity is in G19,H23; and the data is in + * G20,H24,G21,H25,G22,H26. + * + * After a block is reflowed, the sectors that were all in the original column + * data can now reside in different columns. When reading from an expanded + * VDEV, we need to know the logical stripe width for each block so we can + * reconstitute the block’s data after the reads are completed. Likewise, + * when we perform the combinatorial reconstruction we need to know the + * original width so we can retry combinations from the past layouts. + * + * Time dependent geometry is what we call having blocks with different layouts + * (stripe widths) in the same VDEV. This time-dependent geometry uses the + * block’s birth time (+ the time expansion ended) to establish the correct + * width for a given block. After an expansion completes, we record the time + * for blocks written with a particular width (geometry). + * + * == On Disk Format Changes == + * + * New pool feature flag, 'raidz_expansion' whose reference count is the number + * of RAIDZ VDEVs that have been expanded. + * + * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. + * + * Since the uberblock can point to arbitrary blocks, which might be on the + * expanding RAIDZ, and might or might not have been expanded. We need to know + * which way a block is laid out before reading it. This info is the next + * offset that needs to be reflowed and we persist that in the uberblock, in + * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. + * After the expansion is complete, we then use the raidz_expand_txgs array + * (see below) to determine how to read a block and the ub_raidz_reflow_info + * field no longer required. + * + * The uberblock's ub_raidz_reflow_info field also holds the scratch space + * state (i.e., active or not) which is also required before reading a block + * during the initial phase of reflowing the data. + * + * The top-level RAIDZ VDEV has two new entries in the nvlist: + * + * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here + * and used after the expansion is complete to + * determine how to read a raidz block + * 'raidz_expanding' boolean: present during reflow and removed after completion + * used during a spa import to resume an unfinished + * expansion + * + * And finally the VDEVs top zap adds the following informational entries: + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE + * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME + * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED + */ + +/* + * For testing only: pause the raidz expansion after reflowing this amount. + * (accessed by ZTS and ztest) + */ +#ifdef _KERNEL +static +#endif /* _KERNEL */ +unsigned long raidz_expand_max_reflow_bytes = 0; + +/* + * For testing only: pause the raidz expansion at a certain point. + */ +uint_t raidz_expand_pause_point = 0; + +/* + * Maximum amount of copy io's outstanding at once. + */ +static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; + +/* + * Apply raidz map abds aggregation if the number of rows in the map is equal + * or greater than the value below. 
+ */ +static unsigned long raidz_io_aggregate_rows = 4; + +/* + * Automatically start a pool scrub when a RAIDZ expansion completes in + * order to verify the checksums of all blocks which have been copied + * during the expansion. Automatic scrubbing is enabled by default and + * is strongly recommended. + */ +static int zfs_scrub_after_expand = 1; + static void vdev_raidz_row_free(raidz_row_t *rr) { @@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm) for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); + if (rm->rm_nphys_cols) { + for (int i = 0; i < rm->rm_nphys_cols; i++) { + if (rm->rm_phys_col[i].rc_abd != NULL) + abd_free(rm->rm_phys_col[i].rc_abd); + } + + kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * + rm->rm_nphys_cols); + } + + ASSERT3P(rm->rm_lr, ==, NULL); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } @@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio) vdev_raidz_map_free(rm); } +static int +vdev_raidz_reflow_compare(const void *x1, const void *x2) +{ + const reflow_node_t *l = x1; + const reflow_node_t *r = x2; + + return (TREE_CMP(l->re_txg, r->re_txg)); +} + const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, }; +raidz_row_t * +vdev_raidz_row_alloc(int cols) +{ + raidz_row_t *rr = + kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); + + rr->rr_cols = cols; + rr->rr_scols = cols; + + for (int c = 0; c < cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_shadow_devidx = INT_MAX; + rc->rc_shadow_offset = UINT64_MAX; + rc->rc_allow_repair = 1; + } + return (rr); +} + static void vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) { @@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; - uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t acols, scols; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ - q = s / (dcols - nparity); + uint64_t q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ - r = s - q * (dcols - nparity); + uint64_t r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ - bc = (r == 0 ? 0 : r + nparity); + uint64_t bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ - tot = s + nparity * (q + (r == 0 ? 0 : 1)); + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. 
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } ASSERT3U(acols, <=, scols); - - rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rr = vdev_raidz_row_alloc(scols); rm->rm_row[0] = rr; - rr->rr_cols = acols; - rr->rr_scols = scols; rr->rr_bigcols = bc; - rr->rr_missingdata = 0; - rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; - rr->rr_abd_empty = NULL; - rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif - asize = 0; + uint64_t asize = 0; - for (c = 0; c < scols; c++) { + for (uint64_t c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - col = f + c; - coff = o; + uint64_t col = f + c; + uint64_t coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; - rc->rc_abd = NULL; - rc->rc_orig_data = NULL; - rc->rc_error = 0; - rc->rc_tried = 0; - rc->rc_skipped = 0; - rc->rc_force_repair = 0; - rc->rc_allow_repair = 1; - rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; @@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rr->rr_col[0].rc_devidx; + uint64_t devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; - if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } @@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, } else { vdev_raidz_map_alloc_read(zio, rm); } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +/* + * Everything before reflow_offset_synced should have been moved to the new + * location (read and write completed). However, this may not yet be reflected + * in the on-disk format (e.g. raidz_reflow_sync() has been called but the + * uberblock has not yet been written). If reflow is not in progress, + * reflow_offset_synced should be UINT64_MAX. For each row, if the row is + * entirely before reflow_offset_synced, it will come from the new location. + * Otherwise this row will come from the old location. Therefore, rows that + * straddle the reflow_offset_synced will come from the old location. + * + * For writes, reflow_offset_next is the next offset to copy. If a sector has + * been copied, but not yet reflected in the on-disk progress + * (reflow_offset_synced), it will also be written to the new (already copied) + * offset. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc_expanded(zio_t *zio, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset_synced, + uint64_t reflow_offset_next, boolean_t use_scratch) +{ + abd_t *abd = zio->io_abd; + uint64_t offset = zio->io_offset; + uint64_t size = zio->io_size; + + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + uint64_t q = s / (logical_cols - nparity); + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. 
+ */ + uint64_t r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + rm->rm_skipstart = bc; + uint64_t asize = 0; + + for (uint64_t row = 0; row < rows; row++) { + boolean_t row_use_scratch = B_FALSE; + raidz_row_t *rr = vdev_raidz_row_alloc(cols); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and the copying has + * not yet completed for any part of this row, then use the + * old location of this row. Note that reflow_offset_synced + * reflects the i/o that's been completed, because it's + * updated by a synctask, after zio_wait(spa_txg_zio[]). + * This is sufficient for our check, even if that progress + * has not yet been recorded to disk (reflected in + * spa_ubsync). Also note that we consider the last row to + * be "full width" (`cols`-wide rather than `bc`-wide) for + * this calculation. This causes a tiny bit of unnecessary + * double-writes but is safe and simpler to calculate. + */ + int row_phys_cols = physical_cols; + if (b + cols > reflow_offset_synced >> ashift) + row_phys_cols--; + else if (use_scratch) + row_use_scratch = B_TRUE; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * Note, rr_cols is the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". + */ + rr->rr_firstdatacol = nparity; +#ifdef ZFS_DEBUG + /* + * note: rr_size is PSIZE, not ASIZE + */ + rr->rr_offset = b << ashift; + rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift; +#endif + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_devidx = child_id; + rc->rc_offset = child_offset; + + /* + * Get this from the scratch space if appropriate. + * This only happens if we crashed in the middle of + * raidz_reflow_scratch_sync() (while it's running, + * the rangelock prevents us from doing concurrent + * io), and even then only during zpool import or + * when the pool is imported readonly. 
+ */ + if (row_use_scratch) + rc->rc_offset -= VDEV_BOOT_SIZE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rc->rc_size = 1ULL << ashift; + + /* + * Parity sectors' rc_abd's are set below + * after determining if this is an aggregation. + */ + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end of the block (even including + * skip sectors). This sector is part of the + * map so that we have full rows for p/q parity + * generation. + */ + rc->rc_size = 0; + rc->rc_abd = NULL; + } else { + /* "data column" (col excluding parity) */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rc->rc_size = 1ULL << ashift; + rc->rc_abd = abd_get_offset_struct( + &rc->rc_abdstruct, abd, off << ashift, + rc->rc_size); + } + + if (rc->rc_size == 0) + continue; + + /* + * If any part of this row is in both old and new + * locations, the primary location is the old + * location. If this sector was already copied to the + * new location, we need to also write to the new, + * "shadow" location. + * + * Note, `row_phys_cols != physical_cols` indicates + * that the primary location is the old location. + * `b+c < reflow_offset_next` indicates that the copy + * to the new location has been initiated. We know + * that the copy has completed because we have the + * rangelock, which is held exclusively while the + * copy is in progress. + */ + if (row_use_scratch || + (row_phys_cols != physical_cols && + b + c < reflow_offset_next >> ashift)) { + rc->rc_shadow_devidx = (b + c) % physical_cols; + rc->rc_shadow_offset = + ((b + c) / physical_cols) << ashift; + if (row_use_scratch) + rc->rc_shadow_offset -= VDEV_BOOT_SIZE; + } + + asize += rc->rc_size; + } + + /* + * See comment in vdev_raidz_map_alloc() + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + + int devidx0 = rr->rr_col[0].rc_devidx; + uint64_t offset0 = rr->rr_col[0].rc_offset; + int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx; + uint64_t shadow_offset0 = + rr->rr_col[0].rc_shadow_offset; + + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[0].rc_shadow_devidx = + rr->rr_col[1].rc_shadow_devidx; + rr->rr_col[0].rc_shadow_offset = + rr->rr_col[1].rc_shadow_offset; + + rr->rr_col[1].rc_devidx = devidx0; + rr->rr_col[1].rc_offset = offset0; + rr->rr_col[1].rc_shadow_devidx = shadow_devidx0; + rr->rr_col[1].rc_shadow_offset = shadow_offset0; + } + } + ASSERT3U(asize, ==, tot << ashift); + + /* + * Determine if the block is contiguous, in which case we can use + * an aggregation. + */ + if (rows >= raidz_io_aggregate_rows) { + rm->rm_nphys_cols = physical_cols; + rm->rm_phys_col = + kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols, + KM_SLEEP); + + /* + * Determine the aggregate io's offset and size, and check + * that the io is contiguous. 
+ */ + for (int i = 0; + i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + + if (rc->rc_size == 0) + continue; + + if (prc->rc_size == 0) { + ASSERT0(prc->rc_offset); + prc->rc_offset = rc->rc_offset; + } else if (prc->rc_offset + prc->rc_size != + rc->rc_offset) { + /* + * This block is not contiguous and + * therefore can't be aggregated. + * This is expected to be rare, so + * the cost of allocating and then + * freeing rm_phys_col is not + * significant. + */ + kmem_free(rm->rm_phys_col, + sizeof (raidz_col_t) * + rm->rm_nphys_cols); + rm->rm_phys_col = NULL; + rm->rm_nphys_cols = 0; + break; + } + prc->rc_size += rc->rc_size; + } + } + } + if (rm->rm_phys_col != NULL) { + /* + * Allocate aggregate ABD's. + */ + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + + prc->rc_devidx = i; + + if (prc->rc_size == 0) + continue; + + prc->rc_abd = + abd_alloc_linear(rm->rm_phys_col[i].rc_size, + B_FALSE); + } + + /* + * Point the parity abd's into the aggregate abd's. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_abd = + abd_get_offset_struct(&rc->rc_abdstruct, + prc->rc_abd, + rc->rc_offset - prc->rc_offset, + rc->rc_size); + } + } + } else { + /* + * Allocate new abd's for the parity sectors. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = + abd_alloc_linear(rc->rc_size, + B_TRUE); + } + } + } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++) + for (int i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); @@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; - int i, cnt = size / sizeof (src[0]); + int cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); - for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; @@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { - ASSERT3U(rr->rr_cols, !=, 0); + if (rr->rr_cols == 0) { + /* + * We are handling this block one row at a time (because + * this block has a different logical vs physical width, + * 
due to RAIDZ expansion), and this is a pad-only row, + * which has no parity. + */ + return; + } /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) @@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) int x = tgts[0]; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); + ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); @@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) int c, exp; abd_t *dst, *src; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); + ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); @@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) int y = tgts[1]; abd_t *xd, *yd; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); @@ -1295,11 +1903,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) abd_t **bufs = NULL; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. */ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + ASSERT(rr->rr_col[i].rc_abd != NULL); if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); @@ -1427,10 +2038,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", + rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, + (int)rr->rr_missingparity); + } + nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " + "offset=%llx error=%u)", + rr, c, (int)rr->rr_col[c].rc_devidx, + (long long)rr->rr_col[c].rc_offset, + (int)rr->rr_col[c].rc_error); + } if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; @@ -1537,8 +2161,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *physical_ashift, cvd->vdev_physical_ashift); } - *asize *= vd->vdev_children; - *max_asize *= vd->vdev_children; + if (vd->vdev_rz_expanding) { + *asize *= vd->vdev_children - 1; + *max_asize *= vd->vdev_children - 1; + + vd->vdev_min_asize = *asize; + } else { + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + } if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ -1557,19 +2188,71 @@ vdev_raidz_close(vdev_t *vd) } } +/* + * Return the logical width to use, given the txg in which the allocation + * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * BP was allocated. Remapped BP's (that were relocated due to device + * removal, see remap_blkptr_cb()), will have a more recent + * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can + * ignore these because they can't be on RAIDZ (device removal doesn't + * support RAIDZ). 
+ */ +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) + width = re->re_logical_width; + else + width = vdrz->vd_original_width; + } + mutex_exit(&vdrz->vd_expand_lock); + return (width); +} + +/* + * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated + * more space due to the lower data-to-parity ratio. In this case it's + * important to pass in the correct txg. Note that vdev_gang_header_asize() + * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, + * regardless of txg. This is assured because for a single data sector, we + * allocate P+1 sectors regardless of width ("cols", which is at least P+1). + */ static uint64_t -vdev_raidz_asize(vdev_t *vd, uint64_t psize) +vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_logical_width; + uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; + cols = vdev_raidz_get_logical_width(vdrz, txg); + asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; +#ifdef ZFS_DEBUG + uint64_t asize_new = ((psize - 1) >> ashift) + 1; + uint64_t ncols_new = vdrz->vd_physical_width; + asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / + (ncols_new - nparity)); + asize_new = roundup(asize_new, nparity + 1) << ashift; + VERIFY3U(asize_new, <=, asize); +#endif + return (asize); } @@ -1596,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +vdev_raidz_shadow_child_done(zio_t *zio) { -#ifdef ZFS_DEBUG - vdev_t *tvd = vd->vdev_top; + raidz_col_t *rc = zio->io_private; + + rc->rc_shadow_error = zio->io_error; +} +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) +{ + (void) rm; +#ifdef ZFS_DEBUG range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(vd, rr->rr_size); + vdev_raidz_asize(zio->io_vd, rr->rr_size, + BP_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; - vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); + if (vdev_xlate_is_empty(&physical_rs)) { + /* + * If we are in the middle of expansion, the + * physical->logical mapping is changing so vdev_xlate() + * can't give us a reliable answer. 
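As a rough numeric companion to the allocation-size math in vdev_raidz_asize() above (an older, narrower logical width allocates more space for the same psize), here is a minimal sketch with roundup() expanded inline; the function name, sample sizes, and main() are ours.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: mirrors the arithmetic in vdev_raidz_asize() above,
 * with roundup() spelled out.
 */
static uint64_t
example_raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols,
    uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;	/* data sectors */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = ((asize + nparity) / (nparity + 1)) * (nparity + 1);
	return (asize << ashift);
}

int
main(void)
{
	/* 40 KiB block, 4 KiB sectors, RAIDZ1: width 4 before vs. 6 after */
	printf("width 4: %llu bytes\n",
	    (unsigned long long)example_raidz_asize(40 << 10, 12, 4, 1));
	printf("width 6: %llu bytes\n",
	    (unsigned long long)example_raidz_asize(40 << 10, 12, 6, 1));
	return (0);
}

The 10 data sectors cost 14 allocated sectors (57344 bytes) at width 4 but only 12 (49152 bytes) at width 6, which is why the txg-dependent logical width matters here.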
+ */ + return; + } ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1621,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) */ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + - rc->rc_size + (1 << tvd->vdev_ashift)); + rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } @@ -1629,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) } static void -vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; @@ -1641,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; /* Verify physical to logical translation */ - vdev_raidz_io_verify(vd, rr, c); + vdev_raidz_io_verify(zio, rm, rr, c); - if (rc->rc_size > 0) { - ASSERT3P(rc->rc_abd, !=, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, - abd_get_size(rc->rc_abd), zio->io_type, - zio->io_priority, 0, vdev_raidz_child_done, rc)); - } else { - /* - * Generate optional write for skip sector to improve - * aggregation contiguity. - */ - ASSERT3P(rc->rc_abd, ==, NULL); - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, NULL, 1ULL << ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, - NULL)); + if (rc->rc_size == 0) + continue; + + ASSERT3U(rc->rc_offset + rc->rc_size, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + ASSERT3P(rc->rc_abd, !=, NULL); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), zio->io_type, + zio->io_priority, 0, vdev_raidz_child_done, rc)); + + if (rc->rc_shadow_devidx != INT_MAX) { + vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; + + ASSERT3U( + rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, + cvd2->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, + rc->rc_shadow_offset, rc->rc_abd, + abd_get_size(rc->rc_abd), + zio->io_type, zio->io_priority, 0, + vdev_raidz_shadow_child_done, rc)); } } } +/* + * Generate optional I/Os for skip sectors to improve aggregation contiguity. + * This only works for vdev_raidz_map_alloc() (not _expanded()). 
+ */ +static void +raidz_start_skip_writes(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + raidz_map_t *rm = zio->io_vsd; + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + for (int c = 0; c < rr->rr_scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_size != 0) + continue; + ASSERT3P(rc->rc_abd, ==, NULL); + + ASSERT3U(rc->rc_offset, <, + cvd->vdev_psize - VDEV_LABEL_END_SIZE); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, + NULL, 1ULL << ashift, zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + static void -vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) { vdev_t *vd = zio->io_vd; @@ -1697,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_skipped = 1; continue; } - if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + if (forceparity || + c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1707,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) } } +static void +vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) +{ + vdev_t *vd = zio->io_vd; + + for (int i = 0; i < rm->rm_nphys_cols; i++) { + raidz_col_t *prc = &rm->rm_phys_col[i]; + if (prc->rc_size == 0) + continue; + + ASSERT3U(prc->rc_devidx, ==, i); + vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { + prc->rc_error = SET_ERROR(ENXIO); + prc->rc_tried = 1; /* don't even try */ + prc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + prc->rc_error = SET_ERROR(ESTALE); + prc->rc_skipped = 1; + continue; + } + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + prc->rc_offset, prc->rc_abd, prc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, prc)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) +{ + /* + * If there are multiple rows, we will be hitting + * all disks, so go ahead and read the parity so + * that we are reading in decent size chunks. + */ + boolean_t forceparity = rm->rm_nrows > 1; + + if (rm->rm_phys_col) { + vdev_raidz_io_start_read_phys_cols(zio, rm); + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_start_read_row(zio, rr, forceparity); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1730,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, + BP_PHYSICAL_BIRTH(zio->io_bp)); + if (logical_width != vdrz->vd_physical_width) { + zfs_locked_range_t *lr = NULL; + uint64_t synced_offset = UINT64_MAX; + uint64_t next_offset = UINT64_MAX; + boolean_t use_scratch = B_FALSE; + /* + * Note: when the expansion is completing, we set + * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) + * in a later txg than when we last update spa_ubsync's state + * (see the end of spa_raidz_expand_thread()). Therefore we + * may see vre_state!=SCANNING before + * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected + * on disk, but the copying progress has been synced to disk + * (and reflected in spa_ubsync). 
In this case it's fine to + * treat the expansion as completed, since if we crash there's + * no additional copying to do. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, + &vdrz->vn_vre); + lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, + zio->io_offset, zio->io_size, RL_READER); + use_scratch = + (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == + RRSS_SCRATCH_VALID); + synced_offset = + RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); + next_offset = vdrz->vn_vre.vre_offset; + /* + * If we haven't resumed expanding since importing the + * pool, vre_offset won't have been set yet. In + * this case the next offset to be copied is the same + * as what was synced. + */ + if (next_offset == UINT64_MAX) { + next_offset = synced_offset; + } + } + if (use_scratch) { + zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" + "%lld next_offset=%lld use_scratch=%u", + zio, + zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", + (long long)zio->io_offset, + (long long)synced_offset, + (long long)next_offset, + use_scratch); + } + + rm = vdev_raidz_map_alloc_expanded(zio, + tvd->vdev_ashift, vdrz->vd_physical_width, + logical_width, vdrz->vd_nparity, + synced_offset, next_offset, use_scratch); + rm->rm_lr = lr; + } else { + rm = vdev_raidz_map_alloc(zio, + tvd->vdev_ashift, logical_width, vdrz->vd_nparity); + } + rm->rm_original_width = vdrz->vd_original_width; - raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, - vdrz->vd_logical_width, vdrz->vd_nparity); zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; - - /* - * Until raidz expansion is implemented all maps for a raidz vdev - * contain a single row. - */ - ASSERT3U(rm->rm_nrows, ==, 1); - raidz_row_t *rr = rm->rm_row[0]; - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_start_write(zio, rm->rm_row[i]); + } + + if (logical_width == vdrz->vd_physical_width) { + raidz_start_skip_writes(zio); + } } else { ASSERT(zio->io_type == ZIO_TYPE_READ); - vdev_raidz_io_start_read(zio, rr); + vdev_raidz_io_start_read(zio, rm); } zio_execute(zio); @@ -1847,6 +2691,8 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", + c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1862,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rr->rr_cols; c++) + for (int c = 0; c < rr->rr_cols; c++) { error = zio_worst_error(error, rr->rr_col[c].rc_error); + error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); + } return (error); } @@ -1929,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) continue; } + zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " + "offset=%llx", + zio, c, rc->rc_devidx, (long long)rc->rc_offset); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, @@ -1938,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } + + /* + * Scrub or resilver i/o's: overwrite any shadow locations with the + * good data. This ensures that if we've already copied this sector, + * it will be corrected if it was damaged. 
This writes more than is + * necessary, but since expansion is paused during scrub/resilver, at + * most a single row will have a shadow location. + */ + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + + if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; + + /* + * Note: We don't want to update the repair stats + * because that would incorrectly indicate that there + * was bad data to repair, which we aren't sure about. + * By clearing the SCAN_THREAD flag, we prevent this + * from happening, despite having the REPAIR flag set. + * We need to set SELF_HEAL so that this i/o can't be + * bypassed by zio_vdev_io_start(). + */ + zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, + rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL); + cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; + zio_nowait(cio); + } + } } static void @@ -1957,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm) } /* + * During raidz_reconstruct() for expanded VDEV, we need special consideration + * failure simulations. See note in raidz_reconstruct() on simulating failure + * of a pre-expansion device. + * + * Treating logical child i as failed, return TRUE if the given column should + * be treated as failed. The idea of logical children allows us to imagine + * that a disk silently failed before a RAIDZ expansion (reads from this disk + * succeed but return the wrong data). Since the expansion doesn't verify + * checksums, the incorrect data will be moved to new locations spread among + * the children (going diagonally across them). + * + * Higher "logical child failures" (values of `i`) indicate these + * "pre-expansion failures". The first physical_width values imagine that a + * current child failed; the next physical_width-1 values imagine that a + * child failed before the most recent expansion; the next physical_width-2 + * values imagine a child failed in the expansion before that, etc. + */ +static boolean_t +raidz_simulate_failure(int physical_width, int original_width, int ashift, + int i, raidz_col_t *rc) +{ + uint64_t sector_id = + physical_width * (rc->rc_offset >> ashift) + + rc->rc_devidx; + + for (int w = physical_width; w >= original_width; w--) { + if (i < w) { + return (sector_id % w == i); + } else { + i -= w; + } + } + ASSERT(!"invalid logical child id"); + return (B_FALSE); +} + +/* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction @@ -1965,6 +2890,15 @@ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? 
+ rm->rm_original_width : physical_width; + int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; + + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " + "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); + } /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { @@ -1974,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) int dead = 0; int dead_data = 0; + if (dbgmsg) + zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); @@ -1986,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { - if (rc->rc_devidx == ltgts[lt]) { + if (raidz_simulate_failure(physical_width, + original_width, + zio->io_vd->vdev_top->vdev_ashift, + ltgts[lt], rc)) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = abd_alloc_linear( @@ -1999,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) dead++; if (c >= nparity) dead_data++; - my_tgts[t++] = c; + /* + * Note: simulating failure of a + * pre-expansion device can hit more + * than one column, in which case we + * might try to simulate more failures + * than can be reconstructed, which is + * also more than the size of my_tgts. + * This check prevents accessing past + * the end of my_tgts. The "dead > + * nparity" check below will fail this + * reconstruction attempt. + */ + if (t < VDEV_RAIDZ_MAXPARITY) { + my_tgts[t++] = c; + if (dbgmsg) { + zfs_dbgmsg("simulating " + "failure of col %u " + "devidx %u", c, + (int)rc->rc_devidx); + } + } break; } } } if (dead > nparity) { /* reconstruction not possible */ + if (dbgmsg) { + zfs_dbgmsg("reconstruction not possible; " + "too many failures"); + } raidz_restore_orig_data(rm); return (EINVAL); } @@ -2049,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zio_checksum_verified(zio); + if (dbgmsg) { + zfs_dbgmsg("reconstruction successful " + "(checksum verified)"); + } return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); + if (dbgmsg) { + zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " + "failed", zio); + } return (ECKSUM); } @@ -2068,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) - * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - Try to increment this slot (tgts[i] += 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * @@ -2099,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) * * This strategy works for dRAID but is less efficient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, - * since the raidz_map_t rows likely do not overlap reconstruction would be + * since the raidz_map_t rows likely do not overlap, reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. + * + * Returns 0 on success, ECKSUM on failure. 
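To make the slot-increment walk described above concrete, here is a small standalone sketch (ours; array sizing and printing are simplified) that enumerates combinations of logical child IDs in the same carry-like order, using the same sentinel trick as the tstore[] array in vdev_raidz_combrec().

#include <stdio.h>

#define	EX_MAXFAILURES	3

/*
 * Illustrative only: prints every combination of num_failures distinct
 * logical child IDs out of n, advancing slots the same way the
 * reconstruction code does (increment a slot; on collision with the
 * next slot, reset to the minimum and carry).
 */
static void
example_combinations(int n, int num_failures)
{
	int tstore[EX_MAXFAILURES + 2];
	int *ltgts = &tstore[1];

	tstore[0] = -1;			/* sentinel below slot 0 */
	for (int i = 0; i < num_failures; i++)
		ltgts[i] = i;
	ltgts[num_failures] = n;	/* sentinel above the last slot */

	for (;;) {
		for (int i = 0; i < num_failures; i++)
			printf("%d%c", ltgts[i],
			    i == num_failures - 1 ? '\n' : ' ');

		for (int t = 0; ; t++) {
			ltgts[t]++;
			if (ltgts[t] == n)	/* last slot ran off the end */
				break;
			if (ltgts[t] != ltgts[t + 1])
				break;		/* found next combination */
			/* collision: reset to minimum and carry */
			ltgts[t] = ltgts[t - 1] + 1;
		}
		if (ltgts[num_failures - 1] == n)
			break;			/* tried all combinations */
	}
}

int
main(void)
{
	example_combinations(5, 2);
	return (0);
}

For n = 5 and two simulated failures this prints the ten pairs from "0 1" through "3 4", i.e. all C(5,2) combinations, none repeated.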
*/ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; + int physical_width = zio->io_vd->vdev_children; + int original_width = (rm->rm_original_width != 0) ? + rm->rm_original_width : physical_width; - /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; @@ -2128,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio) int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ - /* Determine number of logical children, n */ - int n = zio->io_vd->vdev_children; + + /* + * Determine number of logical children, n. See comment + * above raidz_simulate_failure(). + */ + int n = 0; + for (int w = physical_width; + w >= original_width; w--) { + n += w; + } ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); @@ -2160,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio) if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); + if (zfs_flags & + ZFS_DEBUG_RAIDZ_RECONSTRUCT) { + zfs_dbgmsg("reconstruction " + "failed for num_failures=" + "%u; tried all " + "combinations", + num_failures); + } break; } @@ -2171,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio) * Try the next combination. */ if (ltgts[t] != ltgts[t + 1]) - break; + break; // found next combination /* * Otherwise, reset this tgt to the minimum, @@ -2186,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio) break; } } - + if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) + zfs_dbgmsg("reconstruction failed for all num_failures"); return (ECKSUM); } @@ -2211,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { - int total_errors = 0; + int normal_errors = 0; + int shadow_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); @@ -2220,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - if (rc->rc_error) { + if (rc->rc_error != 0) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - total_errors++; + normal_errors++; + } + if (rc->rc_shadow_error != 0) { + ASSERT(rc->rc_shadow_error != ECKSUM); + shadow_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough - * columns to reconstruct the data, the I/O failed. Otherwise, - * good enough. + * columns to reconstruct the data, the I/O failed. Otherwise, good + * enough. Note that in the case of a shadow write (during raidz + * expansion), depending on if we crash, either the normal (old) or + * shadow (new) location may become the "real" version of the block, + * so both locations must have sufficient redundancy. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. 
*/ - if (total_errors > rr->rr_firstdatacol) { + if (normal_errors > rr->rr_firstdatacol || + shadow_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } @@ -2254,7 +3255,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2337,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ - if (rr->rr_nempty && rr->rr_abd_empty == NULL) + if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { @@ -2395,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; + ASSERT(zio->io_bp != NULL); if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { + if (rm->rm_phys_col) { + /* + * This is an aggregated read. Copy the data and status + * from the aggregate abd's to the individual rows. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + raidz_col_t *prc = + &rm->rm_phys_col[rc->rc_devidx]; + rc->rc_error = prc->rc_error; + rc->rc_tried = prc->rc_tried; + rc->rc_skipped = prc->rc_skipped; + if (c >= rr->rr_firstdatacol) { + /* + * Note: this is slightly faster + * than using abd_copy_off(). + */ + char *physbuf = abd_to_buf( + prc->rc_abd); + void *physloc = physbuf + + rc->rc_offset - + prc->rc_offset; + + abd_copy_from_buf(rc->rc_abd, + physloc, rc->rc_size); + } + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, @@ -2446,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio) zio_vdev_io_redone(zio); return; } - + /* + * It would be too expensive to try every possible + * combination of failed sectors in every row, so + * instead we try every combination of failed current or + * past physical disk. This means that if the incorrect + * sectors were all on Nparity disks at any point in the + * past, we will find the correct data. The only known + * case where this is less durable than a non-expanded + * RAIDZ, is if we have a silent failure during + * expansion. In that case, one block could be + * partially in the old format and partially in the + * new format, so we'd lost some sectors from the old + * format and some from the new format. + * + * e.g. logical_width=4 physical_width=6 + * the 15 (6+5+4) possible failed disks are: + * width=6 child=0 + * width=6 child=1 + * width=6 child=2 + * width=6 child=3 + * width=6 child=4 + * width=6 child=5 + * width=5 child=0 + * width=5 child=1 + * width=5 child=2 + * width=5 child=3 + * width=5 child=4 + * width=4 child=0 + * width=4 child=1 + * width=4 child=2 + * width=4 child=3 + * And we will try every combination of Nparity of these + * failing. + * + * As a first pass, we can generate every combo, + * and try reconstructing, ignoring any known + * failures. 
If any row has too many known + simulated + * failures, then we bail on reconstructing with this + * number of simulated failures. As an improvement, + * we could detect the number of whole known failures + * (i.e. we have known failures on these disks for + * every row; the disks never succeeded), and + * subtract that from the max # failures to simulate. + * We could go even further like the current + * combrec code, but that doesn't seem like it + * gains us very much. If we simulate a failure + * that is also a known failure, that's fine. + */ zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2454,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio) } } } + if (rm->rm_lr != NULL) { + zfs_rangelock_exit(rm->rm_lr); + rm->rm_lr = NULL; + } } static void @@ -2480,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * If we're in the middle of a RAIDZ expansion, this block may be in + * the old and/or new location. For simplicity, always resilver it. + */ + if (vdrz->vn_vre.vre_state == DSS_SCANNING) + return (B_TRUE); + uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; @@ -2524,7 +3620,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); - uint64_t width = raidvd->vdev_children; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + /* + * We're in the middle of expansion, in which case the + * translation is in flux. Any answer we give may be wrong + * by the time we return, so it isn't safe for the caller to + * act on it. Therefore we say that this range isn't present + * on any children. The only consumers of this are "zpool + * initialize" and trimming, both of which are "best effort" + * anyway. + */ + physical_rs->rs_start = physical_rs->rs_end = 0; + remain_rs->rs_start = remain_rs->rs_end = 0; + return; + } + + uint64_t width = vdrz->vd_physical_width; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; @@ -2550,15 +3663,1155 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, logical_rs->rs_end - logical_rs->rs_start); } +static void +raidz_reflow_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + /* + * Ensure there are no i/os to the range that is being committed. + */ + uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); + + mutex_enter(&vre->vre_lock); + uint64_t new_offset = + MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); + /* + * We should not have committed anything that failed. + */ + VERIFY3U(vre->vre_failed_offset, >=, old_offset); + mutex_exit(&vre->vre_lock); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + old_offset, new_offset - old_offset, + RL_WRITER); + + /* + * Update the uberblock that will be written when this txg completes. 
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, + RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); + vre->vre_offset_pertxg[txgoff] = 0; + zfs_rangelock_exit(lr); + + mutex_enter(&vre->vre_lock); + vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; + vre->vre_bytes_copied_pertxg[txgoff] = 0; + mutex_exit(&vre->vre_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); +} + +static void +raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + for (int i = 0; i < TXG_SIZE; i++) + VERIFY0(vre->vre_offset_pertxg[i]); + + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_logical_width = vdrz->vd_physical_width; + mutex_enter(&vdrz->vd_expand_lock); + avl_add(&vdrz->vd_expand_txgs, re); + mutex_exit(&vdrz->vd_expand_lock); + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + + /* + * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS + * will get written (based on vd_expand_txgs). + */ + vdev_config_dirty(vd); + + /* + * Before we change vre_state, the on-disk state must reflect that we + * have completed all copying, so that vdev_raidz_io_start() can use + * vre_state to determine if the reflow is in progress. See also the + * end of spa_raidz_expand_thread(). + */ + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, + raidvd->vdev_ms_count << raidvd->vdev_ms_shift); + + vre->vre_end_time = gethrestime_sec(); + vre->vre_state = DSS_FINISHED; + + uint64_t state = vre->vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t end_time = vre->vre_end_time; + VERIFY0(zap_update(spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time, tx)); + + spa->spa_uberblock.ub_raidz_reflow_info = 0; + + spa_history_log_internal(spa, "raidz vdev expansion completed", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)vd->vdev_id, + (unsigned long long)vd->vdev_children); + + spa->spa_raidz_expand = NULL; + raidvd->vdev_rz_expanding = B_FALSE; + + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + spa_notify_waiters(spa); + + /* + * While we're in syncing context take the opportunity to + * setup a scrub. All the data has been sucessfully copied + * but we have not validated any checksums. + */ + pool_scan_func_t func = POOL_SCAN_SCRUB; + if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) + dsl_scan_setup_sync(&func, tx); +} + +/* + * Struct for one copy zio. + */ +typedef struct raidz_reflow_arg { + vdev_raidz_expand_t *rra_vre; + zfs_locked_range_t *rra_lr; + uint64_t rra_txg; +} raidz_reflow_arg_t; + +/* + * The write of the new location is done. 
+ */ +static void +raidz_reflow_write_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + abd_free(zio->io_abd); + + mutex_enter(&vre->vre_lock); + if (zio->io_error != 0) { + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + } + ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); + vre->vre_outstanding_bytes -= zio->io_size; + if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < + vre->vre_failed_offset) { + vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += + zio->io_size; + } + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + zfs_rangelock_exit(rra->rra_lr); + + kmem_free(rra, sizeof (*rra)); + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +raidz_reflow_read_done(zio_t *zio) +{ + raidz_reflow_arg_t *rra = zio->io_private; + vdev_raidz_expand_t *vre = rra->rra_vre; + + /* + * If the read failed, or if it was done on a vdev that is not fully + * healthy (e.g. a child that has a resilver in progress), we may not + * have the correct data. Note that it's OK if the write proceeds. + * It may write garbage but the location is otherwise unused and we + * will retry later due to vre_failed_offset. + */ + if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { + zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " + "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_lr->lr_length, + (long long)rra->rra_txg, + zio->io_error, + vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), + vdev_dtl_empty(zio->io_vd, DTL_MISSING)); + mutex_enter(&vre->vre_lock); + /* Force a reflow pause on errors */ + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + mutex_exit(&vre->vre_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +static void +raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, + dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (offset == 0) + return; + + mutex_enter(&vre->vre_lock); + ASSERT3U(vre->vre_offset, <=, offset); + vre->vre_offset = offset; + mutex_exit(&vre->vre_lock); + + if (vre->vre_offset_pertxg[txgoff] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, + spa, tx); + } + vre->vre_offset_pertxg[txgoff] = offset; +} + +static boolean_t +vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) +{ + for (int i = 0; i < raidz_vd->vdev_children; i++) { + /* Quick check if a child is being replaced */ + if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, + dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + int ashift = vd->vdev_top->vdev_ashift; + uint64_t offset, size; + + if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, + &offset, &size)) { + return (B_FALSE); + } + ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); + ASSERT3U(size, >=, 1 << ashift); + uint64_t length = 1 << ashift; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + uint64_t blkid = offset >> ashift; + + int old_children = vd->vdev_children - 1; + + /* + * We can only progress to the point that writes will not overlap + * with blocks whose progress has not yet been recorded on disk. 
+ * Since partially-copied rows are still read from the old location, + * we need to stop one row before the sector-wise overlap, to prevent + * row-wise overlap. + * + * Note that even if we are skipping over a large unallocated region, + * we can't move the on-disk progress to `offset`, because concurrent + * writes/allocations could still use the currently-unallocated + * region. + */ + uint64_t ubsync_blkid = + RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; + uint64_t next_overwrite_blkid = ubsync_blkid + + ubsync_blkid / old_children - old_children; + VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); + + if (blkid >= next_overwrite_blkid) { + raidz_reflow_record_progress(vre, + next_overwrite_blkid << ashift, tx); + return (B_TRUE); + } + + range_tree_remove(rt, offset, length); + + raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); + rra->rra_vre = vre; + rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, + offset, length, RL_WRITER); + rra->rra_txg = dmu_tx_get_txg(tx); + + raidz_reflow_record_progress(vre, offset + length, tx); + + mutex_enter(&vre->vre_lock); + vre->vre_outstanding_bytes += length; + mutex_exit(&vre->vre_lock); + + /* + * SCL_STATE will be released when the read and write are done, + * by raidz_reflow_write_done(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* check if a replacing vdev was added, if so treat it as an error */ + if (vdev_raidz_expand_child_replacing(vd)) { + zfs_dbgmsg("replacing vdev encountered, reflow paused at " + "offset=%llu txg=%llu", + (long long)rra->rra_lr->lr_offset, + (long long)rra->rra_txg); + + mutex_enter(&vre->vre_lock); + vre->vre_failed_offset = + MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); + cv_signal(&vre->vre_cv); + mutex_exit(&vre->vre_lock); + + /* drop everything we acquired */ + zfs_rangelock_exit(rra->rra_lr); + kmem_free(rra, sizeof (*rra)); + spa_config_exit(spa, SCL_STATE, spa); + return (B_TRUE); + } + + zio_t *pio = spa->spa_txg_zio[txgoff]; + abd_t *abd = abd_alloc_for_io(length, B_FALSE); + zio_t *write_zio = zio_vdev_child_io(pio, NULL, + vd->vdev_child[blkid % vd->vdev_children], + (blkid / vd->vdev_children) << ashift, + abd, length, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_write_done, rra); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + vd->vdev_child[blkid % old_children], + (blkid / old_children) << ashift, + abd, length, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + raidz_reflow_read_done, rra)); + + return (B_FALSE); +} + +/* + * For testing (ztest specific) + */ +static void +raidz_expand_pause(uint_t pause_point) +{ + while (raidz_expand_pause_point != 0 && + raidz_expand_pause_point <= pause_point) + delay(hz); +} + +static void +raidz_scratch_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); +} + +/* + * Reflow the beginning portion of the vdev into an intermediate scratch area + * in memory and on disk. This operation must be persisted on disk before we + * proceed to overwrite the beginning portion with the reflowed data. + * + * This multi-step task can fail to complete if disk errors are encountered + * and we can return here after a pause (waiting for disk to become healthy). 
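A minimal standalone sketch of the old-to-new sector mapping used by raidz_reflow_impl() above; the sample ashift, widths, and synced offset are ours.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: a logical sector blkid lives at
 * (blkid % width, blkid / width) both before and after expansion; only
 * the width changes. The last lines reproduce the next_overwrite_blkid
 * bound for a sample synced offset.
 */
int
main(void)
{
	int ashift = 12;		/* 4 KiB sectors (example value) */
	uint64_t old_children = 4;	/* width before attaching one disk */
	uint64_t new_children = old_children + 1;

	for (uint64_t blkid = 0; blkid < 10; blkid++) {
		printf("blkid %llu: old (disk %llu, off 0x%llx) -> "
		    "new (disk %llu, off 0x%llx)\n",
		    (unsigned long long)blkid,
		    (unsigned long long)(blkid % old_children),
		    (unsigned long long)((blkid / old_children) << ashift),
		    (unsigned long long)(blkid % new_children),
		    (unsigned long long)((blkid / new_children) << ashift));
	}

	uint64_t ubsync_blkid = 1000;	/* progress already synced to disk */
	uint64_t next_overwrite_blkid = ubsync_blkid +
	    ubsync_blkid / old_children - old_children;
	printf("copy may proceed up to (not including) blkid %llu\n",
	    (unsigned long long)next_overwrite_blkid);
	return (0);
}

With progress synced through blkid 1000 on a 4-wide (pre-attach) vdev, copying may continue up to blkid 1246 before it would touch rows whose copies have not yet been persisted.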
+ */ +static void +raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) +{ + vdev_raidz_expand_t *vre = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zio_t *pio; + int error; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + int ashift = raidvd->vdev_ashift; + uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); + uint64_t logical_size = write_size * raidvd->vdev_children; + uint64_t read_size = + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + 1 << ashift); + + /* + * The scratch space must be large enough to get us to the point + * that one row does not overlap itself when moved. This is checked + * by vdev_raidz_attach_check(). + */ + VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); + VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); + VERIFY3U(write_size, <=, read_size); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, + 0, logical_size, RL_WRITER); + + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(read_size, B_FALSE); + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); + + /* + * If we have already written the scratch area then we must read from + * there, since new writes were redirected there while we were paused + * or the original location may have been partially overwritten with + * reflowed data. + */ + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { + VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); + /* + * Read from scratch space. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE + * to the offset to calculate the physical offset to + * write to. Passing in a negative offset makes us + * access the scratch area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, + raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading scratch location", + error); + goto io_error_exit; + } + goto overwrite; + } + + /* + * Read from original location. + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children - 1; i++) { + ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], read_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d reading original location", error); +io_error_exit: + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + return; + } + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); + + /* + * Reflow in memory. 
+ */ + uint64_t logical_sectors = logical_size >> ashift; + for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { + int oldchild = i % (raidvd->vdev_children - 1); + uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + + int newchild = i % raidvd->vdev_children; + uint64_t newoff = (i / raidvd->vdev_children) << ashift; + + /* a single sector should not be copying over itself */ + ASSERT(!(newchild == oldchild && newoff == oldoff)); + + abd_copy_off(abds[newchild], abds[oldchild], + newoff, oldoff, 1 << ashift); + } + + /* + * Verify that we filled in everything we intended to (write_size on + * each child). + */ + VERIFY0(logical_sectors % raidvd->vdev_children); + VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, + write_size); + + /* + * Write to scratch location (boot area). + */ + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + zfs_dbgmsg("reflow: error %d writing scratch location", error); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", + (long long)logical_size); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); + + /* + * Update uberblock to indicate that scratch space is valid. This is + * needed because after this point, the real location may be + * overwritten. If we crash, we need to get the data from the + * scratch space, rather than the real location. + * + * Note: ub_timestamp is bumped so that vdev_uberblock_compare() + * will prefer this uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); + + /* + * Overwrite with reflow'ed data. + */ +overwrite: + pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, + raidz_scratch_child_done, pio)); + } + error = zio_wait(pio); + if (error != 0) { + /* + * When we exit early here and drop the range lock, new + * writes will go into the scratch area so we'll need to + * read from there when we return after pausing. + */ + zfs_dbgmsg("reflow: error %d writing real location", error); + /* + * Update the uberblock that is written when this txg completes. 
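The in-memory reflow loop above can be illustrated with a small standalone sketch (the sample ashift, width, and sector count are ours) that prints each copy it would perform.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: prints the per-sector copies done by the in-memory
 * reflow for a vdev that grew from 3 to 4 children, over 12 logical
 * sectors of scratch data.
 */
int
main(void)
{
	int ashift = 9;			/* 512-byte sectors, example only */
	int children = 4;		/* width after attaching one disk */
	uint64_t logical_sectors = 12;

	for (uint64_t i = children - 1; i < logical_sectors; i++) {
		int oldchild = i % (children - 1);
		uint64_t oldoff = (i / (children - 1)) << ashift;
		int newchild = i % children;
		uint64_t newoff = (i / children) << ashift;

		printf("sector %llu: disk %d off 0x%llx -> "
		    "disk %d off 0x%llx\n",
		    (unsigned long long)i, oldchild,
		    (unsigned long long)oldoff, newchild,
		    (unsigned long long)newoff);
	}
	return (0);
}

Sectors 0 through children-2 already sit at offset 0 of their final child, which is why the loop starts at i = children - 1; the assertion in the code above checks that no sector is ever copied onto itself.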
+ */ + RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, + logical_size); + goto io_error_exit; + } + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", + (long long)logical_size); + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); + + /* + * Update uberblock to indicate that the initial part has been + * reflow'ed. This is needed because after this point (when we exit + * the rangelock), we allow regular writes to this region, which will + * be written to the new location only (because reflow_offset_next == + * reflow_offset_synced). If we crashed and re-copied from the + * scratch space, we would lose the regular writes. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, + logical_size); + spa->spa_ubsync.ub_timestamp++; + ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); + + /* + * Update progress. + */ + vre->vre_offset = logical_size; + zfs_rangelock_exit(lr); + spa_config_exit(spa, SCL_STATE, FTAG); + + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note - raidz_reflow_sync() will update the uberblock state to + * RRSS_SCRATCH_INVALID_SYNCED_REFLOW + */ + raidz_reflow_sync(spa, tx); + + raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); +} + +/* + * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work + * here. No other i/o can be in progress, so we don't need the vre_rangelock. + */ +void +vdev_raidz_reflow_copy_scratch(spa_t *spa) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); + ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + ASSERT0(logical_size % raidvd->vdev_children); + uint64_t write_size = logical_size / raidvd->vdev_children; + + zio_t *pio; + + /* + * Read from scratch space. + */ + abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), + KM_SLEEP); + for (int i = 0; i < raidvd->vdev_children; i++) { + abds[i] = abd_alloc_linear(write_size, B_FALSE); + } + + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + /* + * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to + * the offset to calculate the physical offset to write to. + * Passing in a negative offset lets us access the boot area. + */ + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], + write_size, ZIO_TYPE_READ, + ZIO_PRIORITY_ASYNC_READ, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + + /* + * Overwrite real location with reflow'ed data. 
+ */ + pio = zio_root(spa, NULL, NULL, 0); + for (int i = 0; i < raidvd->vdev_children; i++) { + zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], + 0, abds[i], write_size, ZIO_TYPE_WRITE, + ZIO_PRIORITY_ASYNC_WRITE, 0, + raidz_scratch_child_done, pio)); + } + zio_wait(pio); + pio = zio_root(spa, NULL, NULL, 0); + zio_flush(pio, raidvd); + zio_wait(pio); + + zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " + "to real location", (long long)logical_size); + + for (int i = 0; i < raidvd->vdev_children; i++) + abd_free(abds[i]); + kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); + + /* + * Update uberblock. + */ + RAIDZ_REFLOW_SET(&spa->spa_ubsync, + RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); + spa->spa_ubsync.ub_timestamp++; + VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, + &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); + if (spa_multihost(spa)) + mmp_update_uberblock(spa, &spa->spa_ubsync); + + zfs_dbgmsg("reflow recovery: uberblock updated " + "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", + (long long)spa->spa_ubsync.ub_txg, + (long long)logical_size, + (long long)spa->spa_ubsync.ub_timestamp); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, + spa_first_txg(spa)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + vre->vre_offset = logical_size; + vre->vre_offset_pertxg[txgoff] = vre->vre_offset; + vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; + /* + * Note that raidz_reflow_sync() will update the uberblock once more + */ + raidz_reflow_sync(spa, tx); + + dmu_tx_commit(tx); + + spa_config_exit(spa, SCL_STATE, FTAG); +} + +static boolean_t +spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) +{ + (void) zthr; + spa_t *spa = arg; + + return (spa->spa_raidz_expand != NULL && + !spa->spa_raidz_expand->vre_waiting_for_resilver); +} + +/* + * RAIDZ expansion background thread + * + * Can be called multiple times if the reflow is paused + */ +static void +spa_raidz_expand_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) + vre->vre_offset = 0; + else + vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); + + /* Reflow the begining portion using the scratch area */ + if (vre->vre_offset == 0) { + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, raidz_reflow_scratch_sync, + vre, 0, ZFS_SPACE_CHECK_NONE)); + + /* if we encountered errors then pause */ + if (vre->vre_offset == 0) { + mutex_enter(&vre->vre_lock); + vre->vre_waiting_for_resilver = B_TRUE; + mutex_exit(&vre->vre_lock); + return; + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + uint64_t guid = raidvd->vdev_guid; + + /* Iterate over all the remaining metaslabs */ + for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; + i < raidvd->vdev_ms_count && + !zthr_iscancelled(zthr) && + vre->vre_failed_offset == UINT64_MAX; i++) { + metaslab_t *msp = raidvd->vdev_ms[i]; + + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + + /* + * The metaslab may be newly created (for the expanded + * space), in which case its trees won't exist yet, + * so we need to bail out early. + */ + if (msp->ms_new) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + continue; + } + + VERIFY0(metaslab_load(msp)); + + /* + * We want to copy everything except the free (allocatable) + * space. Note that there may be a little bit more free + * space (e.g. 
in ms_defer), and it's fine to copy that too. + */ + range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, + NULL, 0, 0); + range_tree_add(rt, msp->ms_start, msp->ms_size); + range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); + mutex_exit(&msp->ms_lock); + + /* + * Force the last sector of each metaslab to be copied. This + * ensures that we advance the on-disk progress to the end of + * this metaslab while the metaslab is disabled. Otherwise, we + * could move past this metaslab without advancing the on-disk + * progress, and then an allocation to this metaslab would not + * be copied. + */ + int sectorsz = 1 << raidvd->vdev_ashift; + uint64_t ms_last_offset = msp->ms_start + + msp->ms_size - sectorsz; + if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { + range_tree_add(rt, ms_last_offset, sectorsz); + } + + /* + * When we are resuming from a paused expansion (i.e. + * when importing a pool with an expansion in progress), + * discard any state that we have already processed. + */ + range_tree_clear(rt, 0, vre->vre_offset); + + while (!zthr_iscancelled(zthr) && + !range_tree_is_empty(rt) && + vre->vre_failed_offset == UINT64_MAX) { + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * If requested, pause the reflow when the amount + * specified by raidz_expand_max_reflow_bytes is reached + * + * This pause is only used during testing or debugging. + */ + while (raidz_expand_max_reflow_bytes != 0 && + raidz_expand_max_reflow_bytes <= + vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { + delay(hz); + } + + mutex_enter(&vre->vre_lock); + while (vre->vre_outstanding_bytes > + raidz_expand_max_copy_bytes) { + cv_wait(&vre->vre_cv, &vre->vre_lock); + } + mutex_exit(&vre->vre_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. Theoretically, the + * vdev_t that we're expanding may have changed. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + + boolean_t needsync = + raidz_reflow_impl(raidvd, vre, rt, tx); + + dmu_tx_commit(tx); + + if (needsync) { + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(spa->spa_dsl_pool, txg); + spa_config_enter(spa, SCL_CONFIG, FTAG, + RW_READER); + } + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + metaslab_enable(msp, B_FALSE, B_FALSE); + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * The txg_wait_synced() here ensures that all reflow zio's have + * completed, and vre_failed_offset has been set if necessary. It + * also ensures that the progress of the last raidz_reflow_sync() is + * written to disk before raidz_reflow_complete_sync() changes the + * in-memory vre_state. vdev_raidz_io_start() uses vre_state to + * determine if a reflow is in progress, in which case we may need to + * write to both old and new locations. 
Therefore we can only change + * vre_state once this is not necessary, which is once the on-disk + * progress (in spa_ubsync) has been set past any possible writes (to + * the end of the last metaslab). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + + if (!zthr_iscancelled(zthr) && + vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { + /* + * We are not being canceled or paused, so the reflow must be + * complete. In that case also mark it as completed on disk. + */ + ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + raidz_reflow_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); + (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); + } else { + /* + * Wait for all copy zio's to complete and for all the + * raidz_reflow_sync() synctasks to be run. + */ + spa_history_log_internal(spa, "reflow pause", + NULL, "offset=%llu failed_offset=%lld", + (long long)vre->vre_offset, + (long long)vre->vre_failed_offset); + mutex_enter(&vre->vre_lock); + if (vre->vre_failed_offset != UINT64_MAX) { + /* + * Reset progress so that we will retry everything + * after the point that something failed. + */ + vre->vre_offset = vre->vre_failed_offset; + vre->vre_failed_offset = UINT64_MAX; + vre->vre_waiting_for_resilver = B_TRUE; + } + mutex_exit(&vre->vre_lock); + } +} + +void +spa_start_raidz_expansion_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); + spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", + spa_raidz_expand_thread_check, spa_raidz_expand_thread, + spa, defclsyspri); +} + +void +raidz_dtl_reassessed(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + if (spa->spa_raidz_expand != NULL) { + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + /* + * we get called often from vdev_dtl_reassess() so make + * sure it's our vdev and any replacing is complete + */ + if (vd->vdev_top->vdev_id == vre->vre_vdev_id && + !vdev_raidz_expand_child_replacing(vd->vdev_top)) { + mutex_enter(&vre->vre_lock); + if (vre->vre_waiting_for_resilver) { + vdev_dbgmsg(vd, "DTL reassessed, " + "continuing raidz expansion"); + vre->vre_waiting_for_resilver = B_FALSE; + zthr_wakeup(spa->spa_raidz_expand_zthr); + } + mutex_exit(&vre->vre_lock); + } + } +} + +int +vdev_raidz_attach_check(vdev_t *new_child) +{ + vdev_t *raidvd = new_child->vdev_parent; + uint64_t new_children = raidvd->vdev_children; + + /* + * We use the "boot" space as scratch space to handle overwriting the + * initial part of the vdev. If it is too small, then this expansion + * is not allowed. This would be very unusual (e.g. ashift > 13 and + * >200 children). 
+ */ + if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { + return (EINVAL); + } + return (0); +} + +void +vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *new_child = arg; + spa_t *spa = new_child->vdev_spa; + vdev_t *raidvd = new_child->vdev_parent; + vdev_raidz_t *vdrz = raidvd->vdev_tsd; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); + ASSERT3P(raidvd->vdev_top, ==, raidvd); + ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, + new_child); + + spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); + + vdrz->vd_physical_width++; + + VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; + vdrz->vn_vre.vre_offset = 0; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + spa->spa_raidz_expand = &vdrz->vn_vre; + zthr_wakeup(spa->spa_raidz_expand_zthr); + + /* + * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get + * written to the config. + */ + vdev_config_dirty(raidvd); + + vdrz->vn_vre.vre_start_time = gethrestime_sec(); + vdrz->vn_vre.vre_end_time = 0; + vdrz->vn_vre.vre_state = DSS_SCANNING; + vdrz->vn_vre.vre_bytes_copied = 0; + + uint64_t state = vdrz->vn_vre.vre_state; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state, tx)); + + uint64_t start_time = vdrz->vn_vre.vre_start_time; + VERIFY0(zap_update(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time, tx)); + + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); + (void) zap_remove(spa->spa_meta_objset, + raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); + + spa_history_log_internal(spa, "raidz vdev expansion started", tx, + "%s vdev %llu new width %llu", spa_name(spa), + (unsigned long long)raidvd->vdev_id, + (unsigned long long)raidvd->vdev_children); +} + +int +vdev_raidz_load(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + int err; + + uint64_t state = DSS_NONE; + uint64_t start_time = 0; + uint64_t end_time = 0; + uint64_t bytes_copied = 0; + + if (vd->vdev_top_zap != 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, + sizeof (state), 1, &state); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, + sizeof (start_time), 1, &start_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, + sizeof (end_time), 1, &end_time); + if (err != 0 && err != ENOENT) + return (err); + + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, + sizeof (bytes_copied), 1, &bytes_copied); + if (err != 0 && err != ENOENT) + return (err); + } + + /* + * If we are in the middle of expansion, vre_state should have + * already been set by vdev_raidz_init(). 
+ */ + EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); + vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; + vdrz->vn_vre.vre_start_time = start_time; + vdrz->vn_vre.vre_end_time = end_time; + vdrz->vn_vre.vre_bytes_copied = bytes_copied; + + return (0); +} + +int +spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) +{ + vdev_raidz_expand_t *vre = spa->spa_raidz_expand; + + if (vre == NULL) { + /* no removal in progress; find most recent completed */ + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_raidz_t *vdrz = vd->vdev_tsd; + + if (vdrz->vn_vre.vre_end_time != 0 && + (vre == NULL || + vdrz->vn_vre.vre_end_time > + vre->vre_end_time)) { + vre = &vdrz->vn_vre; + } + } + } + } + + if (vre == NULL) { + return (SET_ERROR(ENOENT)); + } + + pres->pres_state = vre->vre_state; + pres->pres_expanding_vdev = vre->vre_vdev_id; + + vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); + pres->pres_to_reflow = vd->vdev_stat.vs_alloc; + + mutex_enter(&vre->vre_lock); + pres->pres_reflowed = vre->vre_bytes_copied; + for (int i = 0; i < TXG_SIZE; i++) + pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; + mutex_exit(&vre->vre_lock); + + pres->pres_start_time = vre->vre_start_time; + pres->pres_end_time = vre->vre_end_time; + pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; + + return (0); +} + /* * Initialize private RAIDZ specific fields from the nvlist. */ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { - vdev_raidz_t *vdrz; - uint64_t nparity; - uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, @@ -2566,6 +4819,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (error != 0) return (SET_ERROR(EINVAL)); + uint64_t nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); @@ -2592,10 +4846,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) nparity = 1; } - vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); - vdrz->vd_logical_width = children; + vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vn_vre.vre_vdev_id = -1; + vdrz->vn_vre.vre_offset = UINT64_MAX; + vdrz->vn_vre.vre_failed_offset = UINT64_MAX; + mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); + zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); + mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, + sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); + + vdrz->vd_physical_width = children; vdrz->vd_nparity = nparity; + /* note, the ID does not exist when creating a pool */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &vdrz->vn_vre.vre_vdev_id); + + boolean_t reflow_in_progress = + nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + if (reflow_in_progress) { + spa->spa_raidz_expand = &vdrz->vn_vre; + vdrz->vn_vre.vre_state = DSS_SCANNING; + } + + vdrz->vd_original_width = children; + uint64_t *txgs; + unsigned int txgs_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + &txgs, &txgs_size); + if (error == 0) { + for (int i = 0; i < txgs_size; i++) { + reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); + re->re_txg = txgs[txgs_size - i - 1]; + re->re_logical_width = vdrz->vd_physical_width - i; 
+ + if (reflow_in_progress) + re->re_logical_width--; + + avl_add(&vdrz->vd_expand_txgs, re); + } + + vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + } + if (reflow_in_progress) { + vdrz->vd_original_width--; + zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", + children, txgs_size); + } + *tsd = vdrz; return (0); @@ -2604,7 +4904,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) static void vdev_raidz_fini(vdev_t *vd) { - kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) + vd->vdev_spa->spa_raidz_expand = NULL; + reflow_node_t *re; + void *cookie = NULL; + avl_tree_t *tree = &vdrz->vd_expand_txgs; + while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) + kmem_free(re, sizeof (*re)); + avl_destroy(&vdrz->vd_expand_txgs); + mutex_destroy(&vdrz->vd_expand_lock); + mutex_destroy(&vdrz->vn_vre.vre_lock); + cv_destroy(&vdrz->vn_vre.vre_cv); + zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); + kmem_free(vdrz, sizeof (*vdrz)); } /* @@ -2632,6 +4945,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); + + if (vdrz->vn_vre.vre_state == DSS_SCANNING) { + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + } + + mutex_enter(&vdrz->vd_expand_lock); + if (!avl_is_empty(&vdrz->vd_expand_txgs)) { + uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); + uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + uint64_t i = 0; + + for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { + txgs[i++] = re->re_txg; + } + + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, + txgs, count); + + kmem_free(txgs, sizeof (uint64_t) * count); + } + mutex_exit(&vdrz->vd_expand_lock); } static uint64_t @@ -2671,3 +5007,15 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, + "For testing, pause RAIDZ expansion after reflowing this many bytes"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, + "Max amount of concurrent i/o for RAIDZ expansion"); +ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, + "For expanded RAIDZ, aggregate reads that have more rows than this"); +ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, + "For expanded RAIDZ, automatically start a pool scrub when expansion " + "completes"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 03e17db024ea..1c54eae40355 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -169,7 +169,8 @@ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || - vd->vdev_detached || vd->vdev_top->vdev_removing); + vd->vdev_detached || vd->vdev_top->vdev_removing || + vd->vdev_top->vdev_rz_expanding); } /* @@ -180,6 +181,7 @@ vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || + tvd->vdev_rz_expanding || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } @@ -222,7 +224,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = 
spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); - if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + if (vd == NULL || vd->vdev_top->vdev_removing || + !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; @@ -1005,6 +1008,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_rz_expanding); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, @@ -1162,12 +1166,13 @@ vdev_trim_restart(vdev_t *vd) ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; - if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || - vd->vdev_offline) { + if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || + vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && + !vd->vdev_top->vdev_rz_expanding && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, @@ -1492,7 +1497,8 @@ vdev_autotrim(spa_t *spa) mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && - tvd->vdev_autotrim_thread == NULL) { + tvd->vdev_autotrim_thread == NULL && + !tvd->vdev_rz_expanding) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, @@ -1717,6 +1723,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); + ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
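
As a reader's aid, the following minimal, standalone C sketch (not taken from the commit) models how the reflow state persisted in the uberblock drives the resume logic added above: spa_raidz_expand_thread() restarts from offset 0 only when the state is RRSS_SCRATCH_VALID, in which case vdev_raidz_reflow_copy_scratch() first rewrites the initial region from the scratch area; every other state simply resumes the metaslab-by-metaslab copy from the saved offset. The EX_* identifiers and decide_resume() are hypothetical stand-ins for the RRSS_* macros and states referenced in the diff.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the RRSS_SCRATCH_* states named in the diff. */
typedef enum {
	EX_SCRATCH_NOT_IN_USE,			/* no reflow data lives in the scratch area */
	EX_SCRATCH_VALID,			/* scratch holds the only copy of the initial region */
	EX_SCRATCH_INVALID_SYNCED,		/* initial region already rewritten in place */
	EX_SCRATCH_INVALID_SYNCED_ON_IMPORT,	/* rewritten in place during import recovery */
	EX_SCRATCH_INVALID_SYNCED_REFLOW	/* recorded by the regular reflow sync path */
} ex_reflow_state_t;

typedef enum {
	EX_RESUME_COPY_SCRATCH,	/* copy scratch back to the real location, then reflow from 0 */
	EX_RESUME_FROM_OFFSET	/* continue the metaslab-by-metaslab copy */
} ex_resume_action_t;

/*
 * Only a still-valid scratch copy forces the initial region to be written
 * back before the copy continues; otherwise resume from the saved offset.
 */
static ex_resume_action_t
decide_resume(ex_reflow_state_t state, uint64_t saved_offset,
    uint64_t *resume_offset)
{
	if (state == EX_SCRATCH_VALID) {
		*resume_offset = 0;
		return (EX_RESUME_COPY_SCRATCH);
	}
	*resume_offset = saved_offset;
	return (EX_RESUME_FROM_OFFSET);
}

int
main(void)
{
	uint64_t off;

	(void) decide_resume(EX_SCRATCH_VALID, 123ULL << 20, &off);
	printf("SCRATCH_VALID: rewrite from scratch, resume at %llu\n",
	    (unsigned long long)off);
	(void) decide_resume(EX_SCRATCH_INVALID_SYNCED, 123ULL << 20, &off);
	printf("SCRATCH_INVALID_SYNCED: resume at %llu\n",
	    (unsigned long long)off);
	return (0);
}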
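
A second standalone sketch (again illustrative, not from the commit) captures the sizing rule behind vdev_raidz_attach_check() and the per-child scratch math asserted in vdev_raidz_reflow_copy_scratch(): one sector per child of the widened vdev must fit in the reserved boot region used as scratch space, and the reflowed initial region is split evenly across the children. EX_BOOT_SIZE assumes that region is 3.5 MiB, which is consistent with the "(e.g. ashift > 13 and >200 children)" comment above but should be checked against VDEV_BOOT_SIZE in the on-disk format headers; all EX_* names are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define	EX_BOOT_SIZE	(7ULL << 19)	/* assumed 3.5 MiB reserved boot region */

/* Mirrors the check in vdev_raidz_attach_check(): refuse if it cannot fit. */
static int
ex_attach_allowed(uint64_t new_children, uint64_t ashift)
{
	return ((new_children << ashift) <= EX_BOOT_SIZE);
}

/*
 * The initial region reflowed through scratch is divided evenly across the
 * children; the real code asserts logical_size % vdev_children == 0.
 */
static uint64_t
ex_per_child_scratch(uint64_t logical_size, uint64_t children)
{
	return (logical_size / children);
}

int
main(void)
{
	printf("12 children, ashift 12: %s\n",
	    ex_attach_allowed(12, 12) ? "allowed" : "refused");
	printf("250 children, ashift 14: %s\n",
	    ex_attach_allowed(250, 14) ? "allowed" : "refused");
	printf("per-child scratch for a 3 MiB initial region, 12 children: "
	    "%llu bytes\n",
	    (unsigned long long)ex_per_child_scratch(3ULL << 20, 12));
	return (0);
}

Under the 3.5 MiB assumption, 16 KiB sectors (ashift 14) start being refused at 225 children (3.5 MiB / 16 KiB = 224), which is why the in-tree comment describes a refusal as very unusual.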