path: root/sys/contrib/openzfs/module/zfs
author     Martin Matuska <mm@FreeBSD.org>  2023-11-09 10:42:33 +0000
committer  Martin Matuska <mm@FreeBSD.org>  2023-11-09 12:19:17 +0000
commit     e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree       3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/module/zfs
parent     f5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent     887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
  #15022 5caeef02f RAID-Z expansion feature
  #15457 887a3c533 Increase L2ARC write rate and headroom
  #15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384

Obtained from:  OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c             |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c        |    1
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c        |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c             |  240
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_checkpoint.c  |    3
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c            |  114
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c      |   28
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c |   12
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c      |   51
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c      | 2556
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c       |   17
11 files changed, 2817 insertions, 229 deletions
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index dfea15b74394..2d08cc5e7240 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -776,8 +776,8 @@ uint64_t zfs_crc64_table[256];
* Level 2 ARC
*/
-#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
-#define L2ARC_HEADROOM 2 /* num of writes */
+#define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 8 /* num of writes */
/*
* If we discover during ARC scan any buffers to be compressed, we boost
@@ -4518,7 +4518,7 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
static void
arc_evict_cb(void *arg, zthr_t *zthr)
{
- (void) arg, (void) zthr;
+ (void) arg;
uint64_t evicted = 0;
fstrans_cookie_t cookie = spl_fstrans_mark();
@@ -4542,9 +4542,13 @@ arc_evict_cb(void *arg, zthr_t *zthr)
* infinite loop. Additionally, zthr_iscancelled() is
* checked here so that if the arc is shutting down, the
* broadcast will wake any remaining arc evict waiters.
+ *
+ * Note we cancel using zthr instead of arc_evict_zthr
+ * because the latter may not yet be initialized when the
+ * callback is first invoked.
*/
mutex_enter(&arc_evict_lock);
- arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
+ arc_evict_needed = !zthr_iscancelled(zthr) &&
evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
if (!arc_evict_needed) {
/*
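For context on the L2ARC defaults changed above: L2ARC_WRITE_SIZE seeds l2arc_write_max, the per-feed-cycle write limit for each cache device, and L2ARC_HEADROOM ("num of writes") is the multiple of that size scanned ahead on the eligible ARC lists. With the new defaults a feed cycle may write up to 32 MiB per device and scan roughly 8 * 32 MiB = 256 MiB ahead, versus 8 MiB and 2 * 8 MiB = 16 MiB with the old values.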
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index 34012db82dee..e16128fdff87 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -3066,7 +3066,6 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
dsl_scan_visit_rootbp(scn, NULL,
&dp->dp_meta_rootbp, tx);
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
if (scn->scn_suspending)
return;
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index e0d4a6a63508..0983ba143a1d 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -4342,7 +4342,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
metaslab_class_get_alloc(spa_normal_class(spa));
- if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
+ vd->vdev_rz_expanding) {
defer_allowed = B_FALSE;
}
@@ -4650,6 +4651,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
ASSERT(MUTEX_HELD(&msp->ms_lock));
VERIFY(!msp->ms_condensing);
VERIFY0(msp->ms_disabled);
+ VERIFY0(msp->ms_new);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -4721,10 +4723,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
}
/*
- * If the selected metaslab is condensing or disabled,
- * skip it.
+ * If the selected metaslab is condensing or disabled, or
+ * hasn't gone through a metaslab_sync_done(), then skip it.
*/
- if (msp->ms_condensing || msp->ms_disabled > 0)
+ if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
continue;
*was_active = msp->ms_allocator != -1;
@@ -5270,7 +5272,7 @@ top:
ASSERT(mg->mg_class == mc);
- uint64_t asize = vdev_psize_to_asize(vd, psize);
+ uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
/*
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 68f367c1c744..20225640f8c5 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -63,6 +63,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -1709,6 +1710,10 @@ spa_destroy_aux_threads(spa_t *spa)
zthr_destroy(spa->spa_livelist_condense_zthr);
spa->spa_livelist_condense_zthr = NULL;
}
+ if (spa->spa_raidz_expand_zthr != NULL) {
+ zthr_destroy(spa->spa_raidz_expand_zthr);
+ spa->spa_raidz_expand_zthr = NULL;
+ }
}
/*
@@ -1861,6 +1866,8 @@ spa_unload(spa_t *spa)
spa->spa_compatibility = NULL;
}
+ spa->spa_raidz_expand = NULL;
+
spa_config_exit(spa, SCL_ALL, spa);
}
@@ -2999,6 +3006,7 @@ spa_spawn_aux_threads(spa_t *spa)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ spa_start_raidz_expansion_thread(spa);
spa_start_indirect_condensing_thread(spa);
spa_start_livelist_destroy_thread(spa);
spa_start_livelist_condensing_thread(spa);
@@ -3753,6 +3761,12 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
}
spa_load_note(spa, "using uberblock with txg=%llu",
(u_longlong_t)ub->ub_txg);
+ if (ub->ub_raidz_reflow_info != 0) {
+ spa_load_note(spa, "uberblock raidz_reflow_info: "
+ "state=%u offset=%llu",
+ (int)RRSS_GET_STATE(ub),
+ (u_longlong_t)RRSS_GET_OFFSET(ub));
+ }
/*
@@ -5092,6 +5106,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
/*
+ * Before we do any zio_write's, complete the raidz expansion
+ * scratch space copying, if necessary.
+ */
+ if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
+ vdev_raidz_reflow_copy_scratch(spa);
+
+ /*
* In case of a checkpoint rewind, log the original txg
* of the checkpointed uberblock.
*/
@@ -6905,9 +6926,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
/*
- * Attach a device to a mirror. The arguments are the path to any device
- * in the mirror, and the nvroot for the new device. If the path specifies
- * a device that is not mirrored, we automatically insert the mirror vdev.
+ * Attach a device to a vdev specified by its guid. The vdev type can be
+ * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
+ * single device). When the vdev is a single device, a mirror vdev will be
+ * automatically inserted.
*
* If 'replacing' is specified, the new device is intended to replace the
* existing device; in this case the two devices are made into their own
@@ -6930,7 +6952,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
- int newvd_isspare;
+ int newvd_isspare = B_FALSE;
int error;
ASSERT(spa_writeable(spa));
@@ -6961,16 +6983,35 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
ZFS_ERR_REBUILD_IN_PROGRESS));
}
- if (spa->spa_vdev_removal != NULL)
- return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+ if (spa->spa_vdev_removal != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_DEVRM_IN_PROGRESS));
+ }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (!oldvd->vdev_ops->vdev_op_leaf)
+ boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
+
+ if (raidz) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * Can't expand a raidz while prior expand is in progress.
+ */
+ if (spa->spa_raidz_expand != NULL) {
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+ }
+ } else if (!oldvd->vdev_ops->vdev_op_leaf) {
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+ }
- pvd = oldvd->vdev_parent;
+ if (raidz)
+ pvd = oldvd;
+ else
+ pvd = oldvd->vdev_parent;
if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
VDEV_ALLOC_ATTACH) != 0)
@@ -7026,6 +7067,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
* vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_raidz_ops &&
pvd->vdev_ops != &vdev_root_ops)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
@@ -7065,7 +7107,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
/*
* Make sure the new device is big enough.
*/
- if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
+ if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
/*
@@ -7076,31 +7119,74 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
+ * RAIDZ-expansion-specific checks.
+ */
+ if (raidz) {
+ if (vdev_raidz_attach_check(newvd) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * Fail early if a child is not healthy or being replaced
+ */
+ for (int i = 0; i < oldvd->vdev_children; i++) {
+ if (vdev_is_dead(oldvd->vdev_child[i]) ||
+ !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ ENXIO));
+ }
+ /* Also fail if reserved boot area is in-use */
+ if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
+ != 0) {
+ return (spa_vdev_exit(spa, newrootvd, txg,
+ EADDRINUSE));
+ }
+ }
+ }
+
+ if (raidz) {
+ /*
+ * Note: oldvdpath is freed by spa_strfree(), but
+ * kmem_asprintf() is freed by kmem_strfree(), so we have to
+ * move it to a spa_strdup-ed string.
+ */
+ char *tmp = kmem_asprintf("raidz%u-%u",
+ (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
+ oldvdpath = spa_strdup(tmp);
+ kmem_strfree(tmp);
+ } else {
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ }
+ newvdpath = spa_strdup(newvd->vdev_path);
+
+ /*
* If this is an in-place replacement, update oldvd's path and devid
* to make it distinguishable from newvd, and unopenable from now on.
*/
- if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ if (strcmp(oldvdpath, newvdpath) == 0) {
spa_strfree(oldvd->vdev_path);
- oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
KM_SLEEP);
- (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
- "%s/%s", newvd->vdev_path, "old");
+ (void) sprintf(oldvd->vdev_path, "%s/old",
+ newvdpath);
if (oldvd->vdev_devid != NULL) {
spa_strfree(oldvd->vdev_devid);
oldvd->vdev_devid = NULL;
}
+ spa_strfree(oldvdpath);
+ oldvdpath = spa_strdup(oldvd->vdev_path);
}
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
*/
- if (pvd->vdev_ops != pvops)
+ if (!raidz && pvd->vdev_ops != pvops) {
pvd = vdev_add_parent(oldvd, pvops);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+ }
ASSERT(pvd->vdev_top->vdev_parent == rvd);
- ASSERT(pvd->vdev_ops == pvops);
- ASSERT(oldvd->vdev_parent == pvd);
/*
* Extract the new device from its root and add it to pvd.
@@ -7128,41 +7214,66 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
- vdev_dtl_dirty(newvd, DTL_MISSING,
- TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+ if (raidz) {
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
- if (newvd->vdev_isspare) {
- spa_spare_activate(newvd);
- spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
- }
+ vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_wait(tvd);
- oldvdpath = spa_strdup(oldvd->vdev_path);
- newvdpath = spa_strdup(newvd->vdev_path);
- newvd_isspare = newvd->vdev_isspare;
+ dtl_max_txg = spa_vdev_config_enter(spa);
- /*
- * Mark newvd's DTL dirty in this txg.
- */
- vdev_dirty(tvd, VDD_DTL, newvd, txg);
+ tvd->vdev_rz_expanding = B_TRUE;
- /*
- * Schedule the resilver or rebuild to restart in the future. We do
- * this to ensure that dmu_sync-ed blocks have been stitched into the
- * respective datasets.
- */
- if (rebuild) {
- newvd->vdev_rebuild_txg = txg;
+ vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
+ vdev_config_dirty(tvd);
- vdev_rebuild(tvd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+ dtl_max_txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
+ newvd, tx);
+ dmu_tx_commit(tx);
} else {
- newvd->vdev_resilver_txg = txg;
+ vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+ dtl_max_txg - TXG_INITIAL);
+
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ newvd_isspare = newvd->vdev_isspare;
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
- if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
- vdev_defer_resilver(newvd);
+ /*
+ * Schedule the resilver or rebuild to restart in the future.
+ * We do this to ensure that dmu_sync-ed blocks have been
+ * stitched into the respective datasets.
+ */
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
} else {
- dsl_scan_restart_resilver(spa->spa_dsl_pool,
- dtl_max_txg);
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa,
+ SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
}
}
@@ -7487,7 +7598,7 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
*/
if (cmd_type == POOL_INITIALIZE_START &&
(vd->vdev_initialize_thread != NULL ||
- vd->vdev_top->vdev_removing)) {
+ vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_initialize_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
@@ -7609,7 +7720,8 @@ spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
* which has completed but the thread is not exited.
*/
if (cmd_type == POOL_TRIM_START &&
- (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+ (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding)) {
mutex_exit(&vd->vdev_trim_lock);
return (SET_ERROR(EBUSY));
} else if (cmd_type == POOL_TRIM_CANCEL &&
@@ -8512,6 +8624,10 @@ spa_async_suspend(spa_t *spa)
if (condense_thread != NULL)
zthr_cancel(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_cancel(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_cancel(discard_thread);
@@ -8538,6 +8654,10 @@ spa_async_resume(spa_t *spa)
if (condense_thread != NULL)
zthr_resume(condense_thread);
+ zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
+ if (raidz_expand_thread != NULL)
+ zthr_resume(raidz_expand_thread);
+
zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
if (discard_thread != NULL)
zthr_resume(discard_thread);
@@ -9343,6 +9463,27 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
!= NULL)
vdev_sync(vd, txg);
+ if (pass == 1) {
+ /*
+ * dsl_pool_sync() -> dp_sync_tasks may have dirtied
+ * the config. If that happens, this txg should not
+ * be a no-op. So we must sync the config to the MOS
+ * before checking for no-op.
+ *
+ * Note that when the config is dirty, it will
+ * be written to the MOS (i.e. the MOS will be
+ * dirtied) every time we call spa_sync_config_object()
+ * in this txg. Therefore we can't call this after
+ * dsl_pool_sync() every pass, because it would
+ * prevent us from converging, since we'd dirty
+ * the MOS every pass.
+ *
+ * Sync tasks can only be processed in pass 1, so
+ * there's no need to do this in later passes.
+ */
+ spa_sync_config_object(spa, tx);
+ }
+
/*
* Note: We need to check if the MOS is dirty because we could
* have marked the MOS dirty without updating the uberblock
@@ -10100,7 +10241,8 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
- if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ *in_progress = vdev_rebuild_active(spa->spa_root_vdev);
+ if (*in_progress)
break;
zfs_fallthrough;
case ZPOOL_WAIT_SCRUB:
@@ -10115,6 +10257,12 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
is_scrub == (activity == ZPOOL_WAIT_SCRUB));
break;
}
+ case ZPOOL_WAIT_RAIDZ_EXPAND:
+ {
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
+ break;
+ }
default:
panic("unrecognized value for activity %d", activity);
}
diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
index b588f7041e5c..1efff47f87a0 100644
--- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
+++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
@@ -465,6 +465,9 @@ spa_checkpoint_check(void *arg, dmu_tx_t *tx)
if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+ if (spa->spa_raidz_expand != NULL)
+ return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
+
if (spa->spa_checkpoint_txg != 0)
return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index afb01c0ef7fd..c10c78ebf6db 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -58,6 +58,7 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
#include "zfs_prop.h"
@@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias)
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
for (int c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
asize = MAX(asize, csize);
}
@@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_removing);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
&vd->vdev_top_zap);
+ vd->vdev_rz_expanding = nvlist_exists(nv,
+ ZPOOL_CONFIG_RAIDZ_EXPANDING);
} else {
ASSERT0(vd->vdev_top_zap);
}
@@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio)
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
+ vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+ vd->vdev_cant_read, vd->vdev_cant_write);
if (vdev_readable(vd) &&
(vdev_writeable(vd) || !spa_writeable(spa))) {
@@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
}
/*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm can not change, otherwise it would inconsistently
+ * account for existing bp's. We also hard-code txg 0 for the same reason
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txg's.
*/
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+ SPA_MINBLOCKSHIFT);
}
}
@@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
+ } else {
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+ if (t == DTL_SCRUB) {
+ /* leaf vdevs only */
+ continue;
+ }
+ if (t == DTL_PARTIAL) {
+ /* i.e. non-zero */
+ minref = 1;
+ } else if (vdev_get_nparity(vd) != 0) {
+ /* RAIDZ, DRAID */
+ minref = vdev_get_nparity(vd) + 1;
+ } else {
+ /* any kind of mirror */
+ minref = vd->vdev_children;
+ }
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree,
+ cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
}
- mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
- /* account for child's outage in parent's missing map */
- int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
- if (t == DTL_SCRUB)
- continue; /* leaf vdevs only */
- if (t == DTL_PARTIAL)
- minref = 1; /* i.e. non-zero */
- else if (vdev_get_nparity(vd) != 0)
- minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
- else
- minref = vd->vdev_children; /* any kind of mirror */
- space_reftree_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- mutex_enter(&cvd->vdev_dtl_lock);
- space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
- mutex_exit(&cvd->vdev_dtl_lock);
- }
- space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
- space_reftree_destroy(&reftree);
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+ raidz_dtl_reassessed(vd);
}
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
@@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ error = vdev_raidz_load(vd);
+ if (error != 0)
+ return (error);
+ }
+
/*
* On spa_load path, grab the allocation bias from our zap
*/
@@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BP's may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
+ return (vdev_psize_to_asize_txg(vd, psize, 0));
}
/*
@@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
@@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+ vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+ (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
@@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
vdev_prop_add_list(outnvl, propname, NULL,
vd->vdev_removing, ZPROP_SRC_NONE);
continue;
+ case VDEV_PROP_RAIDZ_EXPANDING:
+ /* Only expose this for raidz */
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vd->vdev_rz_expanding,
+ ZPROP_SRC_NONE);
+ }
+ continue;
/* Numeric Properties */
case VDEV_PROP_ALLOCATING:
/* Leaf vdevs cannot have this property */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
index 307e2353d020..ec961255fd64 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_draid.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -577,8 +577,9 @@ vdev_draid_permute_id(vdev_draid_config_t *vdc,
* i.e. vdev_draid_psize_to_asize().
*/
static uint64_t
-vdev_draid_asize(vdev_t *vd, uint64_t psize)
+vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
+ (void) txg;
vdev_draid_config_t *vdc = vd->vdev_tsd;
uint64_t ashift = vd->vdev_ashift;
@@ -960,7 +961,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
vdev_draid_config_t *vdc = vd->vdev_tsd;
uint64_t ashift = vd->vdev_top->vdev_ashift;
uint64_t io_size = abd_size;
- uint64_t io_asize = vdev_draid_asize(vd, io_size);
+ uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
@@ -1025,15 +1026,9 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
ASSERT3U(vdc->vdc_nparity, >, 0);
- raidz_row_t *rr;
- rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
- rr->rr_cols = groupwidth;
- rr->rr_scols = groupwidth;
+ raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth);
rr->rr_bigcols = bc;
- rr->rr_missingdata = 0;
- rr->rr_missingparity = 0;
rr->rr_firstdatacol = vdc->vdc_nparity;
- rr->rr_abd_empty = NULL;
#ifdef ZFS_DEBUG
rr->rr_offset = io_offset;
rr->rr_size = io_size;
@@ -1053,14 +1048,6 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
rc->rc_offset = physical_offset;
- rc->rc_abd = NULL;
- rc->rc_orig_data = NULL;
- rc->rc_error = 0;
- rc->rc_tried = 0;
- rc->rc_skipped = 0;
- rc->rc_force_repair = 0;
- rc->rc_allow_repair = 1;
- rc->rc_need_orig_restore = B_FALSE;
if (q == 0 && i >= bc)
rc->rc_size = 0;
@@ -1129,7 +1116,7 @@ vdev_draid_map_alloc(zio_t *zio)
if (size < abd_size) {
vdev_t *vd = zio->io_vd;
- io_offset += vdev_draid_asize(vd, size);
+ io_offset += vdev_draid_asize(vd, size, 0);
abd_offset += size;
abd_size -= size;
nrows++;
@@ -1151,7 +1138,6 @@ vdev_draid_map_alloc(zio_t *zio)
rm->rm_row[0] = rr[0];
if (nrows == 2)
rm->rm_row[1] = rr[1];
-
return (rm);
}
@@ -1783,7 +1769,7 @@ vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
uint64_t offset = DVA_GET_OFFSET(dva);
- uint64_t asize = vdev_draid_asize(vd, psize);
+ uint64_t asize = vdev_draid_asize(vd, psize, 0);
if (phys_birth == TXG_UNKNOWN) {
/*
@@ -1840,7 +1826,7 @@ vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
- vdev_draid_asize(vd, rr->rr_size);
+ vdev_draid_asize(vd, rr->rr_size, 0);
raidz_col_t *rc = &rr->rr_col[col];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
index ffdcef1972c3..5aaef1a69986 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -48,7 +48,8 @@ static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
- vd->vdev_detached || vd->vdev_top->vdev_removing);
+ vd->vdev_detached || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding);
}
static void
@@ -67,7 +68,8 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
kmem_free(arg, sizeof (uint64_t));
vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ if (vd == NULL || vd->vdev_top->vdev_removing ||
+ !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
return;
uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
@@ -631,6 +633,7 @@ vdev_initialize(vdev_t *vd)
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_initialize_exit_wanted);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_top->vdev_rz_expanding);
vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
vd->vdev_initialize_thread = thread_create(NULL, 0,
@@ -791,13 +794,14 @@ vdev_initialize_restart(vdev_t *vd)
ASSERT(err == 0 || err == ENOENT);
vd->vdev_initialize_action_time = timestamp;
- if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
- vd->vdev_offline) {
+ if ((vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
/* load progress for reporting, but don't resume */
VERIFY0(vdev_initialize_load(vd));
} else if (vd->vdev_initialize_state ==
VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
!vd->vdev_top->vdev_removing &&
+ !vd->vdev_top->vdev_rz_expanding &&
vd->vdev_initialize_thread == NULL) {
vdev_initialize(vd);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index a2e5524a8391..e8f562a1a6a2 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -142,6 +142,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
@@ -423,6 +424,13 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
sizeof (pcs) / sizeof (uint64_t));
}
+
+ pool_raidz_expand_stat_t pres;
+ if (spa_raidz_expand_get_stats(spa, &pres) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t *)&pres,
+ sizeof (pres) / sizeof (uint64_t));
+ }
}
static void
@@ -1504,7 +1512,8 @@ vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
}
struct ubl_cbdata {
- uberblock_t *ubl_ubbest; /* Best uberblock */
+ uberblock_t ubl_latest; /* Most recent uberblock */
+ uberblock_t *ubl_ubbest; /* Best uberblock (w/r/t max_txg) */
vdev_t *ubl_vd; /* vdev associated with the above */
};
@@ -1521,6 +1530,9 @@ vdev_uberblock_load_done(zio_t *zio)
if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&rio->io_lock);
+ if (vdev_uberblock_compare(ub, &cbp->ubl_latest) > 0) {
+ cbp->ubl_latest = *ub;
+ }
if (ub->ub_txg <= spa->spa_load_max_txg &&
vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
/*
@@ -1578,10 +1590,10 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
ASSERT(config);
memset(ub, 0, sizeof (uberblock_t));
+ memset(&cb, 0, sizeof (cb));
*config = NULL;
cb.ubl_ubbest = ub;
- cb.ubl_vd = NULL;
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
zio = zio_root(spa, NULL, &cb, flags);
@@ -1598,6 +1610,22 @@ vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
"txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
+ if (ub->ub_raidz_reflow_info !=
+ cb.ubl_latest.ub_raidz_reflow_info) {
+ vdev_dbgmsg(cb.ubl_vd,
+ "spa=%s best uberblock (txg=%llu info=0x%llx) "
+ "has different raidz_reflow_info than latest "
+ "uberblock (txg=%llu info=0x%llx)",
+ spa->spa_name,
+ (u_longlong_t)ub->ub_txg,
+ (u_longlong_t)ub->ub_raidz_reflow_info,
+ (u_longlong_t)cb.ubl_latest.ub_txg,
+ (u_longlong_t)cb.ubl_latest.ub_raidz_reflow_info);
+ memset(ub, 0, sizeof (uberblock_t));
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ return;
+ }
+
*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
if (*config == NULL && spa->spa_extreme_rewind) {
vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
@@ -1719,8 +1747,23 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
vd->vdev_copy_uberblocks = B_FALSE;
}
+ /*
+ * We chose a slot based on the txg. If this uberblock has a special
+ * RAIDZ expansion state, then it is essentially an update of the
+ * current uberblock (it has the same txg). However, the current
+ * state is committed, so we want to write it to a different slot. If
+ * we overwrote the same slot, and we lose power during the uberblock
+ * write, and the disk does not do single-sector overwrites
+ * atomically (even though it is required to - i.e. we should see
+ * either the old or the new uberblock), then we could lose this
+ * txg's uberblock. Rewinding to the previous txg's uberblock may not
+ * be possible because RAIDZ expansion may have already overwritten
+ * some of the data, so we need the progress indicator in the
+ * uberblock.
+ */
int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
- int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
+ int n = (ub->ub_txg - (RRSS_GET_STATE(ub) == RRSS_SCRATCH_VALID)) %
+ (VDEV_UBERBLOCK_COUNT(vd) - m);
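As a worked example of the slot arithmetic above (with an assumed slot count): with multihost off (m = 0) and, say, 32 uberblock slots, the txg-100 uberblock normally lands in slot 100 % 32 = 4; if the same txg is rewritten with the scratch state set to RRSS_SCRATCH_VALID, the subtraction shifts it to slot 99 % 32 = 3, so the already-committed copy of txg 100 is left intact and either version survives a crash during the write.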
/* Copy the uberblock_t into the ABD */
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
@@ -1737,7 +1780,7 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
}
/* Sync the uberblocks to all vdevs in svd[] */
-static int
+int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
spa_t *spa = svd[0]->vdev_spa;
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index 3445fa9d35d5..9d0b8763f16f 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -27,15 +27,22 @@
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/dmu_tx.h>
#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/dsl_scan.h>
#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
@@ -135,6 +142,237 @@
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+
+/*
+ * Big Theory Statement for how a RAIDZ VDEV is expanded
+ *
+ * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
+ * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
+ * that have been previously expanded can be expanded again.
+ *
+ * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
+ * the VDEV) when an expansion starts. And the expansion will pause if any
+ * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
+ * operations on the pool can continue while an expansion is in progress (e.g.
+ * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
+ * and zpool initialize which can't be run during an expansion. Following a
+ * reboot or export/import, the expansion resumes where it left off.
+ *
+ * == Reflowing the Data ==
+ *
+ * The expansion involves reflowing (copying) the data from the current set
+ * of disks to spread it across the new set which now has one more disk. This
+ * reflow operation is similar to reflowing text when the column width of a
+ * text editor window is expanded. The text doesn’t change but the location of
+ * the text changes to accommodate the new width. An example reflow result for
+ * a 4-wide RAIDZ1 to a 5-wide is shown below.
+ *
+ * Reflow End State
+ * Each letter indicates a parity group (logical stripe)
+ *
+ * Before expansion After Expansion
+ * D1 D2 D3 D4 D1 D2 D3 D4 D5
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | A | A | A | A | | A | A | A | A | B |
+ * | 1| 2| 3| 4| | 1| 2| 3| 4| 5|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | B | B | C | C | | B | C | C | C | C |
+ * | 5| 6| 7| 8| | 6| 7| 8| 9| 10|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | C | C | D | D | | D | D | E | E | E |
+ * | 9| 10| 11| 12| | 11| 12| 13| 14| 15|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | E | E | E | E | --> | E | F | F | G | G |
+ * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | F | F | G | G | | G | G | H | H | H |
+ * | 17| 18| 19| 20| | 21| 22| 23| 24| 25|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | G | G | H | H | | H | I | I | J | J |
+ * | 21| 22| 23| 24| | 26| 27| 28| 29| 30|
+ * +------+------+------+------+ +------+------+------+------+------+
+ * | | | | | | | | | | |
+ * | H | H | I | I | | J | J | | | K |
+ * | 25| 26| 27| 28| | 31| 32| 33| 34| 35|
+ * +------+------+------+------+ +------+------+------+------+------+
+ *
+ * This reflow approach has several advantages. There is no need to read or
+ * modify the block pointers or recompute any block checksums. The reflow
+ * doesn’t need to know where the parity sectors reside. We can read and write
+ * data sequentially and the copy can occur in a background thread in open
+ * context. The design also allows for fast discovery of what data to copy.
+ *
+ * The VDEV metaslabs are processed, one at a time, to copy the block data to
+ * have it flow across all the disks. The metaslab is disabled for allocations
+ * during the copy. As an optimization, we only copy the allocated data which
+ * can be determined by looking at the metaslab range tree. During the copy we
+ * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
+ * need to be able to survive losing parity count disks). This means we
+ * cannot overwrite data during the reflow that would be needed if a disk is
+ * lost.
+ *
+ * After the reflow completes, all newly-written blocks will have the new
+ * layout, i.e., they will have the parity to data ratio implied by the new
+ * number of disks in the RAIDZ group. Even though the reflow copies all of
+ * the allocated space (data and parity), it is only rearranged, not changed.
+ *
+ * This act of reflowing the data has a few implications about blocks
+ * that were written before the reflow completes:
+ *
+ * - Old blocks will still use the same amount of space (i.e., they will have
+ * the parity to data ratio implied by the old number of disks in the RAIDZ
+ * group).
+ * - Reading old blocks will be slightly slower than before the reflow, for
+ * two reasons. First, we will have to read from all disks in the RAIDZ
+ * VDEV, rather than being able to skip the children that contain only
+ * parity of this block (because the data of a single block is now spread
+ * out across all the disks). Second, in most cases there will be an extra
+ * bcopy, needed to rearrange the data back to its original layout in memory.
+ *
+ * == Scratch Area ==
+ *
+ * As we copy the block data, we can only progress to the point that writes
+ * will not overlap with blocks whose progress has not yet been recorded on
+ * disk. Since partially-copied rows are always read from the old location,
+ * we need to stop one row before the sector-wise overlap, to prevent any
+ * row-wise overlap. For example, in the diagram above, when we reflow sector
+ * B6 it will overwrite the original location for B5.
+ *
+ * To get around this, a scratch space is used so that we can start copying
+ * without risking data loss by overlapping the row. As an added benefit, it
+ * improves performance at the beginning of the reflow, but that small perf
+ * boost wouldn't be worth the complexity on its own.
+ *
+ * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
+ * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
+ * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
+ * the widths will likely be single digits so we can get a substantial chunk
+ * size using only a few MB of scratch per disk.
+ *
+ * The scratch area is persisted to disk which holds a large amount of reflowed
+ * state. We can always read the partially written stripes when a disk fails or
+ * the copy is interrupted (crash) during the initial copying phase and also
+ * get past a small chunk size restriction. At a minimum, the scratch space
+ * must be large enough to get us to the point that one row does not overlap
+ * itself when moved (i.e new_width^2). But going larger is even better. We
+ * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
+ * as our scratch space to handle overwriting the initial part of the VDEV.
+ *
+ * 0 256K 512K 4M
+ * +------+------+-----------------------+-----------------------------
+ * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ...
+ * | L0 | L1 | Reserved | (Metaslabs)
+ * +------+------+-----------------------+-------------------------------
+ * Scratch Area
+ *
+ * == Reflow Progress Updates ==
+ * After the initial scratch-based reflow, the expansion process works
+ * similarly to device removal. We create a new open context thread which
+ * reflows the data, and periodically kicks off sync tasks to update logical
+ * state. In this case, state is the committed progress (offset of next data
+ * to copy). We need to persist the completed offset on disk, so that if we
+ * crash we know which format each VDEV offset is in.
+ *
+ * == Time Dependent Geometry ==
+ *
+ * In non-expanded RAIDZ, blocks are read from disk in a column by column
+ * fashion. For a multi-row block, the second sector is in the first column
+ * not in the second column. This allows us to issue full reads for each
+ * column directly into the request buffer. The block data is thus laid out
+ * sequentially in a column-by-column fashion.
+ *
+ * For example, in the before expansion diagram above, one logical block might
+ * be sectors G19-H26. The parity is in G19,H23; and the data is in
+ * G20,H24,G21,H25,G22,H26.
+ *
+ * After a block is reflowed, the sectors that were all in the original column
+ * data can now reside in different columns. When reading from an expanded
+ * VDEV, we need to know the logical stripe width for each block so we can
+ * reconstitute the block’s data after the reads are completed. Likewise,
+ * when we perform the combinatorial reconstruction we need to know the
+ * original width so we can retry combinations from the past layouts.
+ *
+ * Time dependent geometry is what we call having blocks with different layouts
+ * (stripe widths) in the same VDEV. This time-dependent geometry uses the
+ * block’s birth time (+ the time expansion ended) to establish the correct
+ * width for a given block. After an expansion completes, we record the time
+ * for blocks written with a particular width (geometry).
+ *
+ * == On Disk Format Changes ==
+ *
+ * New pool feature flag, 'raidz_expansion' whose reference count is the number
+ * of RAIDZ VDEVs that have been expanded.
+ *
+ * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
+ *
+ * Since the uberblock can point to arbitrary blocks, which might be on the
+ * expanding RAIDZ and might or might not have been expanded yet, we need to
+ * know which way a block is laid out before reading it. This info is the next
+ * offset that needs to be reflowed, and we persist it in the uberblock, in
+ * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
+ * After the expansion is complete, we instead use the raidz_expand_txgs array
+ * (see below) to determine how to read a block, and the ub_raidz_reflow_info
+ * field is no longer required.
+ *
+ * The uberblock's ub_raidz_reflow_info field also holds the scratch space
+ * state (i.e., active or not) which is also required before reading a block
+ * during the initial phase of reflowing the data.
+ *
+ * The top-level RAIDZ VDEV has two new entries in the nvlist:
+ *
+ * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
+ * and used after the expansion is complete to
+ * determine how to read a raidz block
+ * 'raidz_expanding' boolean: present during reflow and removed after completion
+ * used during a spa import to resume an unfinished
+ * expansion
+ *
+ * And finally the VDEVs top zap adds the following informational entries:
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
+ */
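To make the reflow arithmetic above concrete, here is a minimal sketch (not part of this patch) of how a flat allocatable-sector index maps to a child vdev and offset for a given width; it mirrors the b % cols and b / cols computation used by vdev_raidz_map_alloc_expanded() further down.

static void
raidz_sector_location(uint64_t sector, uint64_t ncols, uint64_t ashift,
    uint64_t *child, uint64_t *offset)
{
	/* The reflow re-deals the same flat sector sequence across ncols. */
	*child = sector % ncols;		/* which child vdev */
	*offset = (sector / ncols) << ashift;	/* byte offset on that child */
}

For example, with 4 children, sector index 5 sits on child 1 of row 1; after expanding to 5 children the same index maps to child 0 of row 1, whose old contents (sector index 4) must already have been copied, which is exactly the overlap that the scratch area and the one-row separation guard against.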
+
+/*
+ * For testing only: pause the raidz expansion after reflowing this amount.
+ * (accessed by ZTS and ztest)
+ */
+#ifdef _KERNEL
+static
+#endif /* _KERNEL */
+unsigned long raidz_expand_max_reflow_bytes = 0;
+
+/*
+ * For testing only: pause the raidz expansion at a certain point.
+ */
+uint_t raidz_expand_pause_point = 0;
+
+/*
+ * Maximum amount of copy io's outstanding at once.
+ */
+static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
+
+/*
+ * Apply raidz map abds aggregation if the number of rows in the map is equal
+ * or greater than the value below.
+ */
+static unsigned long raidz_io_aggregate_rows = 4;
+
+/*
+ * Automatically start a pool scrub when a RAIDZ expansion completes in
+ * order to verify the checksums of all blocks which have been copied
+ * during the expansion. Automatic scrubbing is enabled by default and
+ * is strongly recommended.
+ */
+static int zfs_scrub_after_expand = 1;
+
static void
vdev_raidz_row_free(raidz_row_t *rr)
{
@@ -159,6 +397,17 @@ vdev_raidz_map_free(raidz_map_t *rm)
for (int i = 0; i < rm->rm_nrows; i++)
vdev_raidz_row_free(rm->rm_row[i]);
+ if (rm->rm_nphys_cols) {
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ if (rm->rm_phys_col[i].rc_abd != NULL)
+ abd_free(rm->rm_phys_col[i].rc_abd);
+ }
+
+ kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
+ rm->rm_nphys_cols);
+ }
+
+ ASSERT3P(rm->rm_lr, ==, NULL);
kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
@@ -170,10 +419,37 @@ vdev_raidz_map_free_vsd(zio_t *zio)
vdev_raidz_map_free(rm);
}
+static int
+vdev_raidz_reflow_compare(const void *x1, const void *x2)
+{
+ const reflow_node_t *l = x1;
+ const reflow_node_t *r = x2;
+
+ return (TREE_CMP(l->re_txg, r->re_txg));
+}
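The comparator above orders reflow_node_t entries by txg. As a sketch of how the "Time Dependent Geometry" lookup described in the Big Theory Statement can use such a tree (illustrative only; the re_logical_width field, the tree argument, and the original_width fallback are assumed names, not taken from this diff):

static uint64_t
raidz_width_for_birth_txg(avl_tree_t *expand_txgs, uint64_t txg,
    uint64_t original_width)
{
	/*
	 * Find the most recent recorded expansion whose completion txg is
	 * at or before the block's birth txg; its width applies.  Blocks
	 * older than every recorded expansion use the original width.
	 */
	reflow_node_t lookup = { .re_txg = txg };
	avl_index_t where;
	reflow_node_t *re = avl_find(expand_txgs, &lookup, &where);

	if (re == NULL)
		re = avl_nearest(expand_txgs, where, AVL_BEFORE);

	return (re != NULL ? re->re_logical_width : original_width);
}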
+
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
.vsd_free = vdev_raidz_map_free_vsd,
};
+raidz_row_t *
+vdev_raidz_row_alloc(int cols)
+{
+ raidz_row_t *rr =
+ kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
+
+ rr->rr_cols = cols;
+ rr->rr_scols = cols;
+
+ for (int c = 0; c < cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_shadow_devidx = INT_MAX;
+ rc->rc_shadow_offset = UINT64_MAX;
+ rc->rc_allow_repair = 1;
+ }
+ return (rr);
+}
+
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
@@ -302,7 +578,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << ashift;
- uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t acols, scols;
raidz_map_t *rm =
kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
@@ -312,22 +588,22 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
*/
- q = s / (dcols - nparity);
+ uint64_t q = s / (dcols - nparity);
/*
* "Remainder": The number of partial stripe data sectors in this I/O.
* This will add a sector to some, but not all, child vdevs.
*/
- r = s - q * (dcols - nparity);
+ uint64_t r = s - q * (dcols - nparity);
/* The number of "big columns" - those which contain remainder data. */
- bc = (r == 0 ? 0 : r + nparity);
+ uint64_t bc = (r == 0 ? 0 : r + nparity);
/*
* The total number of data and parity sectors associated with
* this I/O.
*/
- tot = s + nparity * (q + (r == 0 ? 0 : 1));
+ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
/*
* acols: The columns that will be accessed.
@@ -343,43 +619,28 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
}
ASSERT3U(acols, <=, scols);
-
- rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
+ rr = vdev_raidz_row_alloc(scols);
rm->rm_row[0] = rr;
-
rr->rr_cols = acols;
- rr->rr_scols = scols;
rr->rr_bigcols = bc;
- rr->rr_missingdata = 0;
- rr->rr_missingparity = 0;
rr->rr_firstdatacol = nparity;
- rr->rr_abd_empty = NULL;
- rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
rr->rr_offset = zio->io_offset;
rr->rr_size = zio->io_size;
#endif
- asize = 0;
+ uint64_t asize = 0;
- for (c = 0; c < scols; c++) {
+ for (uint64_t c = 0; c < scols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- col = f + c;
- coff = o;
+ uint64_t col = f + c;
+ uint64_t coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << ashift;
}
rc->rc_devidx = col;
rc->rc_offset = coff;
- rc->rc_abd = NULL;
- rc->rc_orig_data = NULL;
- rc->rc_error = 0;
- rc->rc_tried = 0;
- rc->rc_skipped = 0;
- rc->rc_force_repair = 0;
- rc->rc_allow_repair = 1;
- rc->rc_need_orig_restore = B_FALSE;
if (c >= acols)
rc->rc_size = 0;
@@ -419,13 +680,12 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
- devidx = rr->rr_col[0].rc_devidx;
+ uint64_t devidx = rr->rr_col[0].rc_devidx;
o = rr->rr_col[0].rc_offset;
rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
rr->rr_col[1].rc_devidx = devidx;
rr->rr_col[1].rc_offset = o;
-
if (rm->rm_skipstart == 0)
rm->rm_skipstart = 1;
}
@@ -435,7 +695,338 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
} else {
vdev_raidz_map_alloc_read(zio, rm);
}
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
+ return (rm);
+}
+
+/*
+ * Everything before reflow_offset_synced should have been moved to the new
+ * location (read and write completed). However, this may not yet be reflected
+ * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
+ * uberblock has not yet been written). If reflow is not in progress,
+ * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
+ * entirely before reflow_offset_synced, it will come from the new location.
+ * Otherwise this row will come from the old location. Therefore, rows that
+ * straddle the reflow_offset_synced will come from the old location.
+ *
+ * For writes, reflow_offset_next is the next offset to copy. If a sector has
+ * been copied, but not yet reflected in the on-disk progress
+ * (reflow_offset_synced), it will also be written to the new (already copied)
+ * offset.
+ */
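As a worked example of the rule above (illustrative numbers): if reflow_offset_synced corresponds to flat sector 1000 and a row spans flat sectors 995 through 1003, the row straddles the boundary and is read from the old location; a row covering sectors 990 through 998 lies entirely before it and comes from the new location. A sector of an old-location row whose flat index is already below reflow_offset_next is additionally written to its new ("shadow") location.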
+noinline raidz_map_t *
+vdev_raidz_map_alloc_expanded(zio_t *zio,
+ uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
+ uint64_t nparity, uint64_t reflow_offset_synced,
+ uint64_t reflow_offset_next, boolean_t use_scratch)
+{
+ abd_t *abd = zio->io_abd;
+ uint64_t offset = zio->io_offset;
+ uint64_t size = zio->io_size;
+
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> ashift;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ * AKA "full rows"
+ */
+ uint64_t q = s / (logical_cols - nparity);
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ uint64_t r = s - q * (logical_cols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ uint64_t bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* How many rows contain data (not skip) */
+ uint64_t rows = howmany(tot, logical_cols);
+ int cols = MIN(tot, logical_cols);
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
+ KM_SLEEP);
+ rm->rm_nrows = rows;
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ rm->rm_skipstart = bc;
+ uint64_t asize = 0;
+
+ for (uint64_t row = 0; row < rows; row++) {
+ boolean_t row_use_scratch = B_FALSE;
+ raidz_row_t *rr = vdev_raidz_row_alloc(cols);
+ rm->rm_row[row] = rr;
+
+ /* The starting RAIDZ (parent) vdev sector of the row. */
+ uint64_t b = (offset >> ashift) + row * logical_cols;
+
+ /*
+ * If we are in the middle of a reflow, and the copying has
+ * not yet completed for any part of this row, then use the
+ * old location of this row. Note that reflow_offset_synced
+ * reflects the i/o that's been completed, because it's
+ * updated by a synctask, after zio_wait(spa_txg_zio[]).
+ * This is sufficient for our check, even if that progress
+ * has not yet been recorded to disk (reflected in
+ * spa_ubsync). Also note that we consider the last row to
+ * be "full width" (`cols`-wide rather than `bc`-wide) for
+ * this calculation. This causes a tiny bit of unnecessary
+ * double-writes but is safe and simpler to calculate.
+ */
+ int row_phys_cols = physical_cols;
+ if (b + cols > reflow_offset_synced >> ashift)
+ row_phys_cols--;
+ else if (use_scratch)
+ row_use_scratch = B_TRUE;
+
+ /* starting child of this row */
+ uint64_t child_id = b % row_phys_cols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t child_offset = (b / row_phys_cols) << ashift;
+
+ /*
+ * Note, rr_cols is the entire width of the block, even
+ * if this row is shorter. This is needed because parity
+ * generation (for Q and R) needs to know the entire width,
+ * because it treats the short row as though it was
+ * full-width (and the "phantom" sectors were zero-filled).
+ *
+ * Another approach to this would be to set cols shorter
+ * (to just the number of columns that we might do i/o to)
+ * and have another mechanism to tell the parity generation
+ * about the "entire width". Reconstruction (at least
+ * vdev_raidz_reconstruct_general()) would also need to
+ * know about the "entire width".
+ */
+ rr->rr_firstdatacol = nparity;
+#ifdef ZFS_DEBUG
+ /*
+ * note: rr_size is PSIZE, not ASIZE
+ */
+ rr->rr_offset = b << ashift;
+ rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
+#endif
+
+ for (int c = 0; c < rr->rr_cols; c++, child_id++) {
+ if (child_id >= row_phys_cols) {
+ child_id -= row_phys_cols;
+ child_offset += 1ULL << ashift;
+ }
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_devidx = child_id;
+ rc->rc_offset = child_offset;
+
+ /*
+ * Get this from the scratch space if appropriate.
+ * This only happens if we crashed in the middle of
+ * raidz_reflow_scratch_sync() (while it's running,
+ * the rangelock prevents us from doing concurrent
+ * io), and even then only during zpool import or
+ * when the pool is imported readonly.
+ */
+ if (row_use_scratch)
+ rc->rc_offset -= VDEV_BOOT_SIZE;
+
+ uint64_t dc = c - rr->rr_firstdatacol;
+ if (c < rr->rr_firstdatacol) {
+ rc->rc_size = 1ULL << ashift;
+
+ /*
+ * Parity sectors' rc_abd's are set below
+ * after determining if this is an aggregation.
+ */
+ } else if (row == rows - 1 && bc != 0 && c >= bc) {
+ /*
+ * Past the end of the block (even including
+ * skip sectors). This sector is part of the
+ * map so that we have full rows for p/q parity
+ * generation.
+ */
+ rc->rc_size = 0;
+ rc->rc_abd = NULL;
+ } else {
+ /* "data column" (col excluding parity) */
+ uint64_t off;
+
+ if (c < bc || r == 0) {
+ off = dc * rows + row;
+ } else {
+ off = r * rows +
+ (dc - r) * (rows - 1) + row;
+ }
+ rc->rc_size = 1ULL << ashift;
+ rc->rc_abd = abd_get_offset_struct(
+ &rc->rc_abdstruct, abd, off << ashift,
+ rc->rc_size);
+ }
+
+ if (rc->rc_size == 0)
+ continue;
+
+ /*
+ * If any part of this row is in both old and new
+ * locations, the primary location is the old
+ * location. If this sector was already copied to the
+ * new location, we need to also write to the new,
+ * "shadow" location.
+ *
+ * Note, `row_phys_cols != physical_cols` indicates
+ * that the primary location is the old location.
+ * `b+c < reflow_offset_next` indicates that the copy
+ * to the new location has been initiated. We know
+ * that the copy has completed because we have the
+ * rangelock, which is held exclusively while the
+ * copy is in progress.
+ */
+ if (row_use_scratch ||
+ (row_phys_cols != physical_cols &&
+ b + c < reflow_offset_next >> ashift)) {
+ rc->rc_shadow_devidx = (b + c) % physical_cols;
+ rc->rc_shadow_offset =
+ ((b + c) / physical_cols) << ashift;
+ if (row_use_scratch)
+ rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
+ }
+
+ asize += rc->rc_size;
+ }
+
+ /*
+ * See comment in vdev_raidz_map_alloc()
+ */
+ if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
+ (offset & (1ULL << 20))) {
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ int devidx0 = rr->rr_col[0].rc_devidx;
+ uint64_t offset0 = rr->rr_col[0].rc_offset;
+ int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
+ uint64_t shadow_offset0 =
+ rr->rr_col[0].rc_shadow_offset;
+
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[0].rc_shadow_devidx =
+ rr->rr_col[1].rc_shadow_devidx;
+ rr->rr_col[0].rc_shadow_offset =
+ rr->rr_col[1].rc_shadow_offset;
+
+ rr->rr_col[1].rc_devidx = devidx0;
+ rr->rr_col[1].rc_offset = offset0;
+ rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
+ rr->rr_col[1].rc_shadow_offset = shadow_offset0;
+ }
+ }
+ ASSERT3U(asize, ==, tot << ashift);
+
+ /*
+ * Determine if the block is contiguous, in which case we can use
+ * an aggregation.
+ */
+ if (rows >= raidz_io_aggregate_rows) {
+ rm->rm_nphys_cols = physical_cols;
+ rm->rm_phys_col =
+ kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
+ KM_SLEEP);
+
+ /*
+ * Determine the aggregate io's offset and size, and check
+ * that the io is contiguous.
+ */
+ for (int i = 0;
+ i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+
+ if (rc->rc_size == 0)
+ continue;
+
+ if (prc->rc_size == 0) {
+ ASSERT0(prc->rc_offset);
+ prc->rc_offset = rc->rc_offset;
+ } else if (prc->rc_offset + prc->rc_size !=
+ rc->rc_offset) {
+ /*
+ * This block is not contiguous and
+ * therefore can't be aggregated.
+ * This is expected to be rare, so
+ * the cost of allocating and then
+ * freeing rm_phys_col is not
+ * significant.
+ */
+ kmem_free(rm->rm_phys_col,
+ sizeof (raidz_col_t) *
+ rm->rm_nphys_cols);
+ rm->rm_phys_col = NULL;
+ rm->rm_nphys_cols = 0;
+ break;
+ }
+ prc->rc_size += rc->rc_size;
+ }
+ }
+ }
+ if (rm->rm_phys_col != NULL) {
+ /*
+ * Allocate aggregate ABD's.
+ */
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ raidz_col_t *prc = &rm->rm_phys_col[i];
+
+ prc->rc_devidx = i;
+
+ if (prc->rc_size == 0)
+ continue;
+
+ prc->rc_abd =
+ abd_alloc_linear(rm->rm_phys_col[i].rc_size,
+ B_FALSE);
+ }
+
+ /*
+ * Point the parity abd's into the aggregate abd's.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+ rc->rc_abd =
+ abd_get_offset_struct(&rc->rc_abdstruct,
+ prc->rc_abd,
+ rc->rc_offset - prc->rc_offset,
+ rc->rc_size);
+ }
+ }
+ } else {
+ /*
+ * Allocate new abd's for the parity sectors.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd =
+ abd_alloc_linear(rc->rc_size,
+ B_TRUE);
+ }
+ }
+ }
/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();
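
An editorial aside on the rm_nskip computation earlier in this hunk: the skip-sector count simply pads the map's total sector count (tot) up to a multiple of nparity + 1, so allocations stay a whole number of parity groups. A standalone C sketch of that rule, using hypothetical values not taken from the patch:

#include <stdio.h>
#include <stdint.h>

/* Mirrors rm_nskip = roundup(tot, nparity + 1) - tot from the hunk above. */
static uint64_t
skip_sectors(uint64_t tot, uint64_t nparity)
{
	uint64_t r = nparity + 1;

	return (((tot + r - 1) / r) * r - tot);
}

int
main(void)
{
	/* RAIDZ1 (nparity = 1): 7 total sectors are padded to 8. */
	printf("%llu\n", (unsigned long long)skip_sectors(7, 1));	/* 1 */
	/* RAIDZ2 (nparity = 2): 10 total sectors are padded to 12. */
	printf("%llu\n", (unsigned long long)skip_sectors(10, 2));	/* 2 */
	return (0);
}
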
@@ -453,11 +1044,11 @@ vdev_raidz_p_func(void *buf, size_t size, void *private)
{
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && !pqr->q && !pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++)
+ for (int i = 0; i < cnt; i++, src++, pqr->p++)
*pqr->p ^= *src;
return (0);
@@ -469,11 +1060,11 @@ vdev_raidz_pq_func(void *buf, size_t size, void *private)
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && !pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
@@ -488,11 +1079,11 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private)
struct pqr_struct *pqr = private;
const uint64_t *src = buf;
uint64_t mask;
- int i, cnt = size / sizeof (src[0]);
+ int cnt = size / sizeof (src[0]);
ASSERT(pqr->p && pqr->q && pqr->r);
- for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
*pqr->p ^= *src;
VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
*pqr->q ^= *src;
@@ -618,7 +1209,15 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
- ASSERT3U(rr->rr_cols, !=, 0);
+ if (rr->rr_cols == 0) {
+ /*
+ * We are handling this block one row at a time (because
+ * this block has a different logical vs physical width,
+ * due to RAIDZ expansion), and this is a pad-only row,
+ * which has no parity.
+ */
+ return;
+ }
/* Generate using the new math implementation */
if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
@@ -770,6 +1369,9 @@ vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
int x = tgts[0];
abd_t *dst, *src;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
+
ASSERT3U(ntgts, ==, 1);
ASSERT3U(x, >=, rr->rr_firstdatacol);
ASSERT3U(x, <, rr->rr_cols);
@@ -802,6 +1404,9 @@ vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
int c, exp;
abd_t *dst, *src;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
+
ASSERT(ntgts == 1);
ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
@@ -848,6 +1453,9 @@ vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
int y = tgts[1];
abd_t *xd, *yd;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
+
ASSERT(ntgts == 2);
ASSERT(x < y);
ASSERT(x >= rr->rr_firstdatacol);
@@ -1295,11 +1903,14 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
abd_t **bufs = NULL;
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
* temporary linear ABDs if any non-linear ABDs are found.
*/
for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
+ ASSERT(rr->rr_col[i].rc_abd != NULL);
if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
KM_PUSHPAGE);
@@ -1427,10 +2038,23 @@ vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
+ rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
+ (int)rr->rr_missingparity);
+ }
+
nbadparity = rr->rr_firstdatacol;
nbaddata = rr->rr_cols - nbadparity;
ntgts = 0;
for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
+ "offset=%llx error=%u)",
+ rr, c, (int)rr->rr_col[c].rc_devidx,
+ (long long)rr->rr_col[c].rc_offset,
+ (int)rr->rr_col[c].rc_error);
+ }
if (c < rr->rr_firstdatacol)
parity_valid[c] = B_FALSE;
@@ -1537,8 +2161,15 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
*physical_ashift, cvd->vdev_physical_ashift);
}
- *asize *= vd->vdev_children;
- *max_asize *= vd->vdev_children;
+ if (vd->vdev_rz_expanding) {
+ *asize *= vd->vdev_children - 1;
+ *max_asize *= vd->vdev_children - 1;
+
+ vd->vdev_min_asize = *asize;
+ } else {
+ *asize *= vd->vdev_children;
+ *max_asize *= vd->vdev_children;
+ }
if (numerrors > nparity) {
vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
@@ -1557,19 +2188,71 @@ vdev_raidz_close(vdev_t *vd)
}
}
+/*
+ * Return the logical width to use, given the txg in which the allocation
+ * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the
+ * BP was allocated. Remapped BP's (that were relocated due to device
+ * removal, see remap_blkptr_cb()), will have a more recent
+ * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can
+ * ignore these because they can't be on RAIDZ (device removal doesn't
+ * support RAIDZ).
+ */
+static uint64_t
+vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
+{
+ reflow_node_t lookup = {
+ .re_txg = txg,
+ };
+ avl_index_t where;
+
+ uint64_t width;
+ mutex_enter(&vdrz->vd_expand_lock);
+ reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
+ if (re != NULL) {
+ width = re->re_logical_width;
+ } else {
+ re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
+ if (re != NULL)
+ width = re->re_logical_width;
+ else
+ width = vdrz->vd_original_width;
+ }
+ mutex_exit(&vdrz->vd_expand_lock);
+ return (width);
+}
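
To make the lookup rule above concrete, here is a standalone sketch that stands in for the vd_expand_txgs AVL tree with a plain sorted array; the struct name and expansion history are hypothetical, for illustration only:

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t re_txg;		/* first txg allocated at the new width */
	uint64_t re_logical_width;
} reflow_entry_t;

/* Exact match, else the nearest earlier entry, else the original width. */
static uint64_t
logical_width_for_txg(const reflow_entry_t *tab, int n,
    uint64_t original_width, uint64_t txg)
{
	uint64_t width = original_width;

	for (int i = 0; i < n; i++) {
		if (tab[i].re_txg <= txg)
			width = tab[i].re_logical_width;
	}
	return (width);
}

int
main(void)
{
	/* Hypothetical: a 4-wide raidz expanded at txg 100 and again at 200. */
	reflow_entry_t tab[] = { { 100, 5 }, { 200, 6 } };

	printf("%llu\n", (unsigned long long)
	    logical_width_for_txg(tab, 2, 4, 50));	/* 4: pre-expansion */
	printf("%llu\n", (unsigned long long)
	    logical_width_for_txg(tab, 2, 4, 150));	/* 5: after 1st expansion */
	printf("%llu\n", (unsigned long long)
	    logical_width_for_txg(tab, 2, 4, 200));	/* 6: exact match */
	return (0);
}
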
+
+/*
+ * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
+ * more space due to the lower data-to-parity ratio. In this case it's
+ * important to pass in the correct txg. Note that vdev_gang_header_asize()
+ * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
+ * regardless of txg. This is assured because for a single data sector, we
+ * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
+ */
static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vdrz->vd_logical_width;
+ uint64_t cols = vdrz->vd_original_width;
uint64_t nparity = vdrz->vd_nparity;
+ cols = vdev_raidz_get_logical_width(vdrz, txg);
+
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
asize = roundup(asize, nparity + 1) << ashift;
+#ifdef ZFS_DEBUG
+ uint64_t asize_new = ((psize - 1) >> ashift) + 1;
+ uint64_t ncols_new = vdrz->vd_physical_width;
+ asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
+ (ncols_new - nparity));
+ asize_new = roundup(asize_new, nparity + 1) << ashift;
+ VERIFY3U(asize_new, <=, asize);
+#endif
+
return (asize);
}
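
A worked example of the asize formula above: the standalone sketch below uses hypothetical inputs (ashift = 12, so 4 KiB sectors; RAIDZ1; psize = 32 KiB) and also shows why the ZFS_DEBUG block can verify that recomputing with the wider post-expansion geometry never yields more space than was charged at allocation time:

#include <stdio.h>
#include <stdint.h>

static uint64_t
raidz_asize(uint64_t psize, uint64_t ashift, uint64_t cols, uint64_t nparity)
{
	uint64_t asize = ((psize - 1) >> ashift) + 1;		/* data sectors */
	asize += nparity * ((asize + cols - nparity - 1) /	/* parity sectors */
	    (cols - nparity));
	asize = ((asize + nparity) / (nparity + 1)) *		/* skip padding */
	    (nparity + 1);
	return (asize << ashift);
}

int
main(void)
{
	/* Width 4 at allocation: 8 data + 3 parity + 1 skip = 49152 (48 KiB). */
	printf("%llu\n", (unsigned long long)raidz_asize(32768, 12, 4, 1));
	/* Post-expansion width 5 would need only 40960 (40 KiB). */
	printf("%llu\n", (unsigned long long)raidz_asize(32768, 12, 5, 1));
	return (0);
}
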
@@ -1596,21 +2279,37 @@ vdev_raidz_child_done(zio_t *zio)
}
static void
-vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+vdev_raidz_shadow_child_done(zio_t *zio)
{
-#ifdef ZFS_DEBUG
- vdev_t *tvd = vd->vdev_top;
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_shadow_error = zio->io_error;
+}
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
+{
+ (void) rm;
+#ifdef ZFS_DEBUG
range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
- vdev_raidz_asize(vd, rr->rr_size);
+ vdev_raidz_asize(zio->io_vd, rr->rr_size,
+ BP_PHYSICAL_BIRTH(zio->io_bp));
raidz_col_t *rc = &rr->rr_col[col];
- vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
ASSERT(vdev_xlate_is_empty(&remain_rs));
+ if (vdev_xlate_is_empty(&physical_rs)) {
+ /*
+ * If we are in the middle of expansion, the
+ * physical->logical mapping is changing so vdev_xlate()
+ * can't give us a reliable answer.
+ */
+ return;
+ }
ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
/*
@@ -1621,7 +2320,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
*/
if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
- rc->rc_size + (1 << tvd->vdev_ashift));
+ rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
} else {
ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
}
@@ -1629,7 +2328,7 @@ vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
}
static void
-vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
{
vdev_t *vd = zio->io_vd;
raidz_map_t *rm = zio->io_vsd;
@@ -1641,31 +2340,66 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
/* Verify physical to logical translation */
- vdev_raidz_io_verify(vd, rr, c);
+ vdev_raidz_io_verify(zio, rm, rr, c);
- if (rc->rc_size > 0) {
- ASSERT3P(rc->rc_abd, !=, NULL);
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd,
- abd_get_size(rc->rc_abd), zio->io_type,
- zio->io_priority, 0, vdev_raidz_child_done, rc));
- } else {
- /*
- * Generate optional write for skip sector to improve
- * aggregation contiguity.
- */
- ASSERT3P(rc->rc_abd, ==, NULL);
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, NULL, 1ULL << ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
- NULL));
+ if (rc->rc_size == 0)
+ continue;
+
+ ASSERT3U(rc->rc_offset + rc->rc_size, <,
+ cvd->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd,
+ abd_get_size(rc->rc_abd), zio->io_type,
+ zio->io_priority, 0, vdev_raidz_child_done, rc));
+
+ if (rc->rc_shadow_devidx != INT_MAX) {
+ vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
+
+ ASSERT3U(
+ rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
+ cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
+ rc->rc_shadow_offset, rc->rc_abd,
+ abd_get_size(rc->rc_abd),
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_shadow_child_done, rc));
}
}
}
+/*
+ * Generate optional I/Os for skip sectors to improve aggregation contiguity.
+ * This only works for vdev_raidz_map_alloc() (not _expanded()).
+ */
+static void
+raidz_start_skip_writes(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ raidz_map_t *rm = zio->io_vsd;
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+ for (int c = 0; c < rr->rr_scols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ if (rc->rc_size != 0)
+ continue;
+ ASSERT3P(rc->rc_abd, ==, NULL);
+
+ ASSERT3U(rc->rc_offset, <,
+ cvd->vdev_psize - VDEV_LABEL_END_SIZE);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
+ NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+}
+
static void
-vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
+vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
vdev_t *vd = zio->io_vd;
@@ -1697,7 +2431,8 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_skipped = 1;
continue;
}
- if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ if (forceparity ||
+ c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
@@ -1707,6 +2442,56 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
}
}
+static void
+vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
+{
+ vdev_t *vd = zio->io_vd;
+
+ for (int i = 0; i < rm->rm_nphys_cols; i++) {
+ raidz_col_t *prc = &rm->rm_phys_col[i];
+ if (prc->rc_size == 0)
+ continue;
+
+ ASSERT3U(prc->rc_devidx, ==, i);
+ vdev_t *cvd = vd->vdev_child[i];
+ if (!vdev_readable(cvd)) {
+ prc->rc_error = SET_ERROR(ENXIO);
+ prc->rc_tried = 1; /* don't even try */
+ prc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ prc->rc_error = SET_ERROR(ESTALE);
+ prc->rc_skipped = 1;
+ continue;
+ }
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ prc->rc_offset, prc->rc_abd, prc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, prc));
+ }
+}
+
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
+{
+ /*
+ * If there are multiple rows, we will be hitting
+ * all disks, so go ahead and read the parity so
+ * that we are reading in decent size chunks.
+ */
+ boolean_t forceparity = rm->rm_nrows > 1;
+
+ if (rm->rm_phys_col) {
+ vdev_raidz_io_start_read_phys_cols(zio, rm);
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_io_start_read_row(zio, rr, forceparity);
+ }
+ }
+}
+
/*
* Start an IO operation on a RAIDZ VDev
*
@@ -1730,24 +2515,83 @@ vdev_raidz_io_start(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
+ BP_PHYSICAL_BIRTH(zio->io_bp));
+ if (logical_width != vdrz->vd_physical_width) {
+ zfs_locked_range_t *lr = NULL;
+ uint64_t synced_offset = UINT64_MAX;
+ uint64_t next_offset = UINT64_MAX;
+ boolean_t use_scratch = B_FALSE;
+ /*
+ * Note: when the expansion is completing, we set
+ * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
+ * in a later txg than when we last update spa_ubsync's state
+ * (see the end of spa_raidz_expand_thread()). Therefore we
+ * may see vre_state!=SCANNING before
+ * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
+ * on disk, but the copying progress has been synced to disk
+ * (and reflected in spa_ubsync). In this case it's fine to
+ * treat the expansion as completed, since if we crash there's
+ * no additional copying to do.
+ */
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
+ &vdrz->vn_vre);
+ lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
+ zio->io_offset, zio->io_size, RL_READER);
+ use_scratch =
+ (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
+ RRSS_SCRATCH_VALID);
+ synced_offset =
+ RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
+ next_offset = vdrz->vn_vre.vre_offset;
+ /*
+ * If we haven't resumed expanding since importing the
+ * pool, vre_offset won't have been set yet. In
+ * this case the next offset to be copied is the same
+ * as what was synced.
+ */
+ if (next_offset == UINT64_MAX) {
+ next_offset = synced_offset;
+ }
+ }
+ if (use_scratch) {
+ zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
+ "%lld next_offset=%lld use_scratch=%u",
+ zio,
+ zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
+ (long long)zio->io_offset,
+ (long long)synced_offset,
+ (long long)next_offset,
+ use_scratch);
+ }
+
+ rm = vdev_raidz_map_alloc_expanded(zio,
+ tvd->vdev_ashift, vdrz->vd_physical_width,
+ logical_width, vdrz->vd_nparity,
+ synced_offset, next_offset, use_scratch);
+ rm->rm_lr = lr;
+ } else {
+ rm = vdev_raidz_map_alloc(zio,
+ tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
+ }
+ rm->rm_original_width = vdrz->vd_original_width;
- raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
- vdrz->vd_logical_width, vdrz->vd_nparity);
zio->io_vsd = rm;
zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
- /*
- * Until raidz expansion is implemented all maps for a raidz vdev
- * contain a single row.
- */
- ASSERT3U(rm->rm_nrows, ==, 1);
- raidz_row_t *rr = rm->rm_row[0];
-
if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_start_write(zio, rm->rm_row[i]);
+ }
+
+ if (logical_width == vdrz->vd_physical_width) {
+ raidz_start_skip_writes(zio);
+ }
} else {
ASSERT(zio->io_type == ZIO_TYPE_READ);
- vdev_raidz_io_start_read(zio, rr);
+ vdev_raidz_io_start_read(zio, rm);
}
zio_execute(zio);
@@ -1847,6 +2691,8 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
continue;
if (abd_cmp(orig[c], rc->rc_abd) != 0) {
+ zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
+ c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
vdev_raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -1862,8 +2708,10 @@ vdev_raidz_worst_error(raidz_row_t *rr)
{
int error = 0;
- for (int c = 0; c < rr->rr_cols; c++)
+ for (int c = 0; c < rr->rr_cols; c++) {
error = zio_worst_error(error, rr->rr_col[c].rc_error);
+ error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
+ }
return (error);
}
@@ -1929,6 +2777,10 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
continue;
}
+ zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
+ "offset=%llx",
+ zio, c, rc->rc_devidx, (long long)rc->rc_offset);
+
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE,
@@ -1938,6 +2790,42 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
}
+
+ /*
+ * Scrub or resilver i/o's: overwrite any shadow locations with the
+ * good data. This ensures that if we've already copied this sector,
+ * it will be corrected if it was damaged. This writes more than is
+ * necessary, but since expansion is paused during scrub/resilver, at
+ * most a single row will have a shadow location.
+ */
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+
+ if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
+
+ /*
+ * Note: We don't want to update the repair stats
+ * because that would incorrectly indicate that there
+ * was bad data to repair, which we aren't sure about.
+ * By clearing the SCAN_THREAD flag, we prevent this
+ * from happening, despite having the REPAIR flag set.
+ * We need to set SELF_HEAL so that this i/o can't be
+ * bypassed by zio_vdev_io_start().
+ */
+ zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL);
+ cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
+ zio_nowait(cio);
+ }
+ }
}
static void
@@ -1957,6 +2845,43 @@ raidz_restore_orig_data(raidz_map_t *rm)
}
/*
+ * During raidz_reconstruct() for expanded VDEV, we need special
+ * consideration for failure simulations. See the note in
+ * raidz_reconstruct() on simulating failure of a pre-expansion device.
+ *
+ * Treating logical child i as failed, return TRUE if the given column should
+ * be treated as failed. The idea of logical children allows us to imagine
+ * that a disk silently failed before a RAIDZ expansion (reads from this disk
+ * succeed but return the wrong data). Since the expansion doesn't verify
+ * checksums, the incorrect data will be moved to new locations spread among
+ * the children (going diagonally across them).
+ *
+ * Higher "logical child failures" (values of `i`) indicate these
+ * "pre-expansion failures". The first physical_width values imagine that a
+ * current child failed; the next physical_width-1 values imagine that a
+ * child failed before the most recent expansion; the next physical_width-2
+ * values imagine a child failed in the expansion before that, etc.
+ */
+static boolean_t
+raidz_simulate_failure(int physical_width, int original_width, int ashift,
+ int i, raidz_col_t *rc)
+{
+ uint64_t sector_id =
+ physical_width * (rc->rc_offset >> ashift) +
+ rc->rc_devidx;
+
+ for (int w = physical_width; w >= original_width; w--) {
+ if (i < w) {
+ return (sector_id % w == i);
+ } else {
+ i -= w;
+ }
+ }
+ ASSERT(!"invalid logical child id");
+ return (B_FALSE);
+}
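
The logical-child numbering described above can be made concrete with a short standalone sketch. The widths are hypothetical (original 4, physical 6) and match the worked example that appears later in vdev_raidz_io_done():

#include <stdio.h>

int
main(void)
{
	int physical_width = 6, original_width = 4;
	int i = 0;

	for (int w = physical_width; w >= original_width; w--) {
		for (int child = 0; child < w; child++, i++)
			printf("logical child %2d -> width=%d child=%d\n",
			    i, w, child);
	}
	/* i is now 15 (6 + 5 + 4), the same n used in vdev_raidz_combrec(). */
	return (0);
}
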
+
+/*
* returns EINVAL if reconstruction of the block will not be possible
* returns ECKSUM if this specific reconstruction failed
* returns 0 on successful reconstruction
@@ -1965,6 +2890,15 @@ static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
raidz_map_t *rm = zio->io_vsd;
+ int physical_width = zio->io_vd->vdev_children;
+ int original_width = (rm->rm_original_width != 0) ?
+ rm->rm_original_width : physical_width;
+ int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
+
+ if (dbgmsg) {
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
+ "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
+ }
/* Reconstruct each row */
for (int r = 0; r < rm->rm_nrows; r++) {
@@ -1974,6 +2908,9 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
int dead = 0;
int dead_data = 0;
+ if (dbgmsg)
+ zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
+
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
ASSERT0(rc->rc_need_orig_restore);
@@ -1986,7 +2923,10 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
if (rc->rc_size == 0)
continue;
for (int lt = 0; lt < ntgts; lt++) {
- if (rc->rc_devidx == ltgts[lt]) {
+ if (raidz_simulate_failure(physical_width,
+ original_width,
+ zio->io_vd->vdev_top->vdev_ashift,
+ ltgts[lt], rc)) {
if (rc->rc_orig_data == NULL) {
rc->rc_orig_data =
abd_alloc_linear(
@@ -1999,13 +2939,37 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
dead++;
if (c >= nparity)
dead_data++;
- my_tgts[t++] = c;
+ /*
+ * Note: simulating failure of a
+ * pre-expansion device can hit more
+ * than one column, in which case we
+ * might try to simulate more failures
+ * than can be reconstructed, which is
+ * also more than the size of my_tgts.
+ * This check prevents accessing past
+ * the end of my_tgts. The "dead >
+ * nparity" check below will fail this
+ * reconstruction attempt.
+ */
+ if (t < VDEV_RAIDZ_MAXPARITY) {
+ my_tgts[t++] = c;
+ if (dbgmsg) {
+ zfs_dbgmsg("simulating "
+ "failure of col %u "
+ "devidx %u", c,
+ (int)rc->rc_devidx);
+ }
+ }
break;
}
}
}
if (dead > nparity) {
/* reconstruction not possible */
+ if (dbgmsg) {
+ zfs_dbgmsg("reconstruction not possible; "
+ "too many failures");
+ }
raidz_restore_orig_data(rm);
return (EINVAL);
}
@@ -2049,11 +3013,19 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
zio_checksum_verified(zio);
+ if (dbgmsg) {
+ zfs_dbgmsg("reconstruction successful "
+ "(checksum verified)");
+ }
return (0);
}
/* Reconstruction failed - restore original data */
raidz_restore_orig_data(rm);
+ if (dbgmsg) {
+ zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
+ "failed", zio);
+ }
return (ECKSUM);
}
@@ -2068,7 +3040,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
* The order that we find the various possible combinations of failed
* disks is dictated by these rules:
* - Examine each "slot" (the "i" in tgts[i])
- * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - Try to increment this slot (tgts[i] += 1)
* - if we can't increment because it runs into the next slot,
* reset our slot to the minimum, and examine the next slot
*
@@ -2099,18 +3071,22 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
*
* This strategy works for dRAID but is less efficient when there are a large
* number of child vdevs and therefore permutations to check. Furthermore,
- * since the raidz_map_t rows likely do not overlap reconstruction would be
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
* possible as long as there are no more than nparity data errors per row.
* These additional permutations are not currently checked but could be as
* a future improvement.
+ *
+ * Returns 0 on success, ECKSUM on failure.
*/
static int
vdev_raidz_combrec(zio_t *zio)
{
int nparity = vdev_get_nparity(zio->io_vd);
raidz_map_t *rm = zio->io_vsd;
+ int physical_width = zio->io_vd->vdev_children;
+ int original_width = (rm->rm_original_width != 0) ?
+ rm->rm_original_width : physical_width;
- /* Check if there's enough data to attempt reconstrution. */
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
int total_errors = 0;
@@ -2128,8 +3104,16 @@ vdev_raidz_combrec(zio_t *zio)
int tstore[VDEV_RAIDZ_MAXPARITY + 2];
int *ltgts = &tstore[1]; /* value is logical child ID */
- /* Determine number of logical children, n */
- int n = zio->io_vd->vdev_children;
+
+ /*
+ * Determine number of logical children, n. See comment
+ * above raidz_simulate_failure().
+ */
+ int n = 0;
+ for (int w = physical_width;
+ w >= original_width; w--) {
+ n += w;
+ }
ASSERT3U(num_failures, <=, nparity);
ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
@@ -2160,6 +3144,14 @@ vdev_raidz_combrec(zio_t *zio)
if (ltgts[t] == n) {
/* try more failures */
ASSERT3U(t, ==, num_failures - 1);
+ if (zfs_flags &
+ ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
+ zfs_dbgmsg("reconstruction "
+ "failed for num_failures="
+ "%u; tried all "
+ "combinations",
+ num_failures);
+ }
break;
}
@@ -2171,7 +3163,7 @@ vdev_raidz_combrec(zio_t *zio)
* Try the next combination.
*/
if (ltgts[t] != ltgts[t + 1])
- break;
+ break; // found next combination
/*
* Otherwise, reset this tgt to the minimum,
@@ -2186,7 +3178,8 @@ vdev_raidz_combrec(zio_t *zio)
break;
}
}
-
+ if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
+ zfs_dbgmsg("reconstruction failed for all num_failures");
return (ECKSUM);
}
@@ -2211,7 +3204,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
- int total_errors = 0;
+ int normal_errors = 0;
+ int shadow_errors = 0;
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
@@ -2220,24 +3214,31 @@ vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
- if (rc->rc_error) {
+ if (rc->rc_error != 0) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
-
- total_errors++;
+ normal_errors++;
+ }
+ if (rc->rc_shadow_error != 0) {
+ ASSERT(rc->rc_shadow_error != ECKSUM);
+ shadow_errors++;
}
}
/*
* Treat partial writes as a success. If we couldn't write enough
- * columns to reconstruct the data, the I/O failed. Otherwise,
- * good enough.
+ * columns to reconstruct the data, the I/O failed. Otherwise, good
+ * enough. Note that in the case of a shadow write (during raidz
+ * expansion), depending on if we crash, either the normal (old) or
+ * shadow (new) location may become the "real" version of the block,
+ * so both locations must have sufficient redundancy.
*
* Now that we support write reallocation, it would be better
* to treat partial failure as real failure unless there are
* no non-degraded top-level vdevs left, and not update DTLs
* if we intend to reallocate.
*/
- if (total_errors > rr->rr_firstdatacol) {
+ if (normal_errors > rr->rr_firstdatacol ||
+ shadow_errors > rr->rr_firstdatacol) {
zio->io_error = zio_worst_error(zio->io_error,
vdev_raidz_worst_error(rr));
}
@@ -2254,7 +3255,6 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
- ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
for (int c = 0; c < rr->rr_cols; c++) {
raidz_col_t *rc = &rr->rr_col[c];
@@ -2337,7 +3337,7 @@ vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
* for a normal read then allocate an ABD for them now so they
* may be read, verified, and any needed repairs performed.
*/
- if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+ if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
vdev_draid_map_alloc_empty(zio, rr);
for (int c = 0; c < rr->rr_cols; c++) {
@@ -2395,11 +3395,48 @@ vdev_raidz_io_done(zio_t *zio)
{
raidz_map_t *rm = zio->io_vsd;
+ ASSERT(zio->io_bp != NULL);
if (zio->io_type == ZIO_TYPE_WRITE) {
for (int i = 0; i < rm->rm_nrows; i++) {
vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
}
} else {
+ if (rm->rm_phys_col) {
+ /*
+ * This is an aggregated read. Copy the data and status
+ * from the aggregate abd's to the individual rows.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
+ continue;
+
+ raidz_col_t *prc =
+ &rm->rm_phys_col[rc->rc_devidx];
+ rc->rc_error = prc->rc_error;
+ rc->rc_tried = prc->rc_tried;
+ rc->rc_skipped = prc->rc_skipped;
+ if (c >= rr->rr_firstdatacol) {
+ /*
+ * Note: this is slightly faster
+ * than using abd_copy_off().
+ */
+ char *physbuf = abd_to_buf(
+ prc->rc_abd);
+ void *physloc = physbuf +
+ rc->rc_offset -
+ prc->rc_offset;
+
+ abd_copy_from_buf(rc->rc_abd,
+ physloc, rc->rc_size);
+ }
+ }
+ }
+ }
+
for (int i = 0; i < rm->rm_nrows; i++) {
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_reconstruct_known_missing(zio,
@@ -2446,7 +3483,54 @@ vdev_raidz_io_done(zio_t *zio)
zio_vdev_io_redone(zio);
return;
}
-
+ /*
+ * It would be too expensive to try every possible
+ * combination of failed sectors in every row, so
+ * instead we try every combination of failed current or
+ * past physical disk. This means that if the incorrect
+ * sectors were all on Nparity disks at any point in the
+ * past, we will find the correct data. The only known
+ * case where this is less durable than a non-expanded
+ * RAIDZ is if we have a silent failure during
+ * expansion. In that case, one block could be
+ * partially in the old format and partially in the
+ * new format, so we'd lose some sectors from the old
+ * format and some from the new format.
+ *
+ * e.g. logical_width=4 physical_width=6
+ * the 15 (6+5+4) possible failed disks are:
+ * width=6 child=0
+ * width=6 child=1
+ * width=6 child=2
+ * width=6 child=3
+ * width=6 child=4
+ * width=6 child=5
+ * width=5 child=0
+ * width=5 child=1
+ * width=5 child=2
+ * width=5 child=3
+ * width=5 child=4
+ * width=4 child=0
+ * width=4 child=1
+ * width=4 child=2
+ * width=4 child=3
+ * And we will try every combination of Nparity of these
+ * failing.
+ *
+ * As a first pass, we can generate every combo,
+ * and try reconstructing, ignoring any known
+ * failures. If any row has too many known + simulated
+ * failures, then we bail on reconstructing with this
+ * number of simulated failures. As an improvement,
+ * we could detect the number of whole known failures
+ * (i.e. we have known failures on these disks for
+ * every row; the disks never succeeded), and
+ * subtract that from the max # failures to simulate.
+ * We could go even further like the current
+ * combrec code, but that doesn't seem like it
+ * gains us very much. If we simulate a failure
+ * that is also a known failure, that's fine.
+ */
zio->io_error = vdev_raidz_combrec(zio);
if (zio->io_error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2454,6 +3538,10 @@ vdev_raidz_io_done(zio_t *zio)
}
}
}
+ if (rm->rm_lr != NULL) {
+ zfs_rangelock_exit(rm->rm_lr);
+ rm->rm_lr = NULL;
+ }
}
static void
@@ -2480,6 +3568,14 @@ vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
uint64_t phys_birth)
{
vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * If we're in the middle of a RAIDZ expansion, this block may be in
+ * the old and/or new location. For simplicity, always resilver it.
+ */
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING)
+ return (B_TRUE);
+
uint64_t dcols = vd->vdev_children;
uint64_t nparity = vdrz->vd_nparity;
uint64_t ashift = vd->vdev_top->vdev_ashift;
@@ -2524,7 +3620,24 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
vdev_t *raidvd = cvd->vdev_parent;
ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
- uint64_t width = raidvd->vdev_children;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ /*
+ * We're in the middle of expansion, in which case the
+ * translation is in flux. Any answer we give may be wrong
+ * by the time we return, so it isn't safe for the caller to
+ * act on it. Therefore we say that this range isn't present
+ * on any children. The only consumers of this are "zpool
+ * initialize" and trimming, both of which are "best effort"
+ * anyway.
+ */
+ physical_rs->rs_start = physical_rs->rs_end = 0;
+ remain_rs->rs_start = remain_rs->rs_end = 0;
+ return;
+ }
+
+ uint64_t width = vdrz->vd_physical_width;
uint64_t tgt_col = cvd->vdev_id;
uint64_t ashift = raidvd->vdev_top->vdev_ashift;
@@ -2550,15 +3663,1155 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
logical_rs->rs_end - logical_rs->rs_start);
}
+static void
+raidz_reflow_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
+ /*
+ * Ensure there are no i/os to the range that is being committed.
+ */
+ uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
+ ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
+
+ mutex_enter(&vre->vre_lock);
+ uint64_t new_offset =
+ MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
+ /*
+ * We should not have committed anything that failed.
+ */
+ VERIFY3U(vre->vre_failed_offset, >=, old_offset);
+ mutex_exit(&vre->vre_lock);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ old_offset, new_offset - old_offset,
+ RL_WRITER);
+
+ /*
+ * Update the uberblock that will be written when this txg completes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_uberblock,
+ RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
+ vre->vre_offset_pertxg[txgoff] = 0;
+ zfs_rangelock_exit(lr);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
+ vre->vre_bytes_copied_pertxg[txgoff] = 0;
+ mutex_exit(&vre->vre_lock);
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
+ sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
+}
+
+static void
+raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+
+ for (int i = 0; i < TXG_SIZE; i++)
+ VERIFY0(vre->vre_offset_pertxg[i]);
+
+ reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
+ re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
+ re->re_logical_width = vdrz->vd_physical_width;
+ mutex_enter(&vdrz->vd_expand_lock);
+ avl_add(&vdrz->vd_expand_txgs, re);
+ mutex_exit(&vdrz->vd_expand_lock);
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ /*
+ * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
+ * will get written (based on vd_expand_txgs).
+ */
+ vdev_config_dirty(vd);
+
+ /*
+ * Before we change vre_state, the on-disk state must reflect that we
+ * have completed all copying, so that vdev_raidz_io_start() can use
+ * vre_state to determine if the reflow is in progress. See also the
+ * end of spa_raidz_expand_thread().
+ */
+ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
+ raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
+
+ vre->vre_end_time = gethrestime_sec();
+ vre->vre_state = DSS_FINISHED;
+
+ uint64_t state = vre->vre_state;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state, tx));
+
+ uint64_t end_time = vre->vre_end_time;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
+ sizeof (end_time), 1, &end_time, tx));
+
+ spa->spa_uberblock.ub_raidz_reflow_info = 0;
+
+ spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
+ "%s vdev %llu new width %llu", spa_name(spa),
+ (unsigned long long)vd->vdev_id,
+ (unsigned long long)vd->vdev_children);
+
+ spa->spa_raidz_expand = NULL;
+ raidvd->vdev_rz_expanding = B_FALSE;
+
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+
+ spa_notify_waiters(spa);
+
+ /*
+ * While we're in syncing context take the opportunity to
+ * set up a scrub. All the data has been successfully copied
+ * but we have not validated any checksums.
+ */
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
+ dsl_scan_setup_sync(&func, tx);
+}
+
+/*
+ * Struct for one copy zio.
+ */
+typedef struct raidz_reflow_arg {
+ vdev_raidz_expand_t *rra_vre;
+ zfs_locked_range_t *rra_lr;
+ uint64_t rra_txg;
+} raidz_reflow_arg_t;
+
+/*
+ * The write of the new location is done.
+ */
+static void
+raidz_reflow_write_done(zio_t *zio)
+{
+ raidz_reflow_arg_t *rra = zio->io_private;
+ vdev_raidz_expand_t *vre = rra->rra_vre;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vre->vre_lock);
+ if (zio->io_error != 0) {
+ /* Force a reflow pause on errors */
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ }
+ ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
+ vre->vre_outstanding_bytes -= zio->io_size;
+ if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
+ vre->vre_failed_offset) {
+ vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
+ zio->io_size;
+ }
+ cv_signal(&vre->vre_cv);
+ mutex_exit(&vre->vre_lock);
+
+ zfs_rangelock_exit(rra->rra_lr);
+
+ kmem_free(rra, sizeof (*rra));
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
+static void
+raidz_reflow_read_done(zio_t *zio)
+{
+ raidz_reflow_arg_t *rra = zio->io_private;
+ vdev_raidz_expand_t *vre = rra->rra_vre;
+
+ /*
+ * If the read failed, or if it was done on a vdev that is not fully
+ * healthy (e.g. a child that has a resilver in progress), we may not
+ * have the correct data. Note that it's OK if the write proceeds.
+ * It may write garbage but the location is otherwise unused and we
+ * will retry later due to vre_failed_offset.
+ */
+ if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
+ zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
+ "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
+ (long long)rra->rra_lr->lr_offset,
+ (long long)rra->rra_lr->lr_length,
+ (long long)rra->rra_txg,
+ zio->io_error,
+ vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
+ vdev_dtl_empty(zio->io_vd, DTL_MISSING));
+ mutex_enter(&vre->vre_lock);
+ /* Force a reflow pause on errors */
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ mutex_exit(&vre->vre_lock);
+ }
+
+ zio_nowait(zio_unique_parent(zio));
+}
+
+static void
+raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
+ dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (offset == 0)
+ return;
+
+ mutex_enter(&vre->vre_lock);
+ ASSERT3U(vre->vre_offset, <=, offset);
+ vre->vre_offset = offset;
+ mutex_exit(&vre->vre_lock);
+
+ if (vre->vre_offset_pertxg[txgoff] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
+ spa, tx);
+ }
+ vre->vre_offset_pertxg[txgoff] = offset;
+}
+
+static boolean_t
+vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
+{
+ for (int i = 0; i < raidz_vd->vdev_children; i++) {
+ /* Quick check if a child is being replaced */
+ if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ int ashift = vd->vdev_top->vdev_ashift;
+ uint64_t offset, size;
+
+ if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
+ &offset, &size)) {
+ return (B_FALSE);
+ }
+ ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
+ ASSERT3U(size, >=, 1 << ashift);
+ uint64_t length = 1 << ashift;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ uint64_t blkid = offset >> ashift;
+
+ int old_children = vd->vdev_children - 1;
+
+ /*
+ * We can only progress to the point that writes will not overlap
+ * with blocks whose progress has not yet been recorded on disk.
+ * Since partially-copied rows are still read from the old location,
+ * we need to stop one row before the sector-wise overlap, to prevent
+ * row-wise overlap.
+ *
+ * Note that even if we are skipping over a large unallocated region,
+ * we can't move the on-disk progress to `offset`, because concurrent
+ * writes/allocations could still use the currently-unallocated
+ * region.
+ */
+ uint64_t ubsync_blkid =
+ RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
+ uint64_t next_overwrite_blkid = ubsync_blkid +
+ ubsync_blkid / old_children - old_children;
+ VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
+
+ if (blkid >= next_overwrite_blkid) {
+ raidz_reflow_record_progress(vre,
+ next_overwrite_blkid << ashift, tx);
+ return (B_TRUE);
+ }
+
+ range_tree_remove(rt, offset, length);
+
+ raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
+ rra->rra_vre = vre;
+ rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ offset, length, RL_WRITER);
+ rra->rra_txg = dmu_tx_get_txg(tx);
+
+ raidz_reflow_record_progress(vre, offset + length, tx);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_outstanding_bytes += length;
+ mutex_exit(&vre->vre_lock);
+
+ /*
+ * SCL_STATE will be released when the read and write are done,
+ * by raidz_reflow_write_done().
+ */
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ /* check if a replacing vdev was added, if so treat it as an error */
+ if (vdev_raidz_expand_child_replacing(vd)) {
+ zfs_dbgmsg("replacing vdev encountered, reflow paused at "
+ "offset=%llu txg=%llu",
+ (long long)rra->rra_lr->lr_offset,
+ (long long)rra->rra_txg);
+
+ mutex_enter(&vre->vre_lock);
+ vre->vre_failed_offset =
+ MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
+ cv_signal(&vre->vre_cv);
+ mutex_exit(&vre->vre_lock);
+
+ /* drop everything we acquired */
+ zfs_rangelock_exit(rra->rra_lr);
+ kmem_free(rra, sizeof (*rra));
+ spa_config_exit(spa, SCL_STATE, spa);
+ return (B_TRUE);
+ }
+
+ zio_t *pio = spa->spa_txg_zio[txgoff];
+ abd_t *abd = abd_alloc_for_io(length, B_FALSE);
+ zio_t *write_zio = zio_vdev_child_io(pio, NULL,
+ vd->vdev_child[blkid % vd->vdev_children],
+ (blkid / vd->vdev_children) << ashift,
+ abd, length,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ raidz_reflow_write_done, rra);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ vd->vdev_child[blkid % old_children],
+ (blkid / old_children) << ashift,
+ abd, length,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ raidz_reflow_read_done, rra));
+
+ return (B_FALSE);
+}
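
The overlap limit computed above is easier to see with numbers. A standalone sketch with hypothetical values (a 4-disk raidz being expanded to 5, so old_children = 4, and 1000 sectors of copy progress already synced):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t old_children = 4;
	uint64_t ubsync_blkid = 1000;	/* synced reflow progress, in sectors */

	uint64_t next_overwrite_blkid = ubsync_blkid +
	    ubsync_blkid / old_children - old_children;

	/* Copying may run ahead to sector 1246 before waiting for a sync. */
	printf("%llu\n", (unsigned long long)next_overwrite_blkid);
	return (0);
}
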
+
+/*
+ * For testing (ztest specific)
+ */
+static void
+raidz_expand_pause(uint_t pause_point)
+{
+ while (raidz_expand_pause_point != 0 &&
+ raidz_expand_pause_point <= pause_point)
+ delay(hz);
+}
+
+static void
+raidz_scratch_child_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Reflow the beginning portion of the vdev into an intermediate scratch area
+ * in memory and on disk. This operation must be persisted on disk before we
+ * proceed to overwrite the beginning portion with the reflowed data.
+ *
+ * This multi-step task can fail to complete if disk errors are
+ * encountered, in which case we return here after a pause (waiting for
+ * the disks to become healthy again).
+ */
+static void
+raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_raidz_expand_t *vre = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ zio_t *pio;
+ int error;
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ int ashift = raidvd->vdev_ashift;
+ uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
+ uint64_t logical_size = write_size * raidvd->vdev_children;
+ uint64_t read_size =
+ P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
+ 1 << ashift);
+
+ /*
+ * The scratch space must be large enough to get us to the point
+ * that one row does not overlap itself when moved. This is checked
+ * by vdev_raidz_attach_check().
+ */
+ VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
+ VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
+ VERIFY3U(write_size, <=, read_size);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
+ 0, logical_size, RL_WRITER);
+
+ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
+ KM_SLEEP);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ abds[i] = abd_alloc_linear(read_size, B_FALSE);
+ }
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
+
+ /*
+ * If we have already written the scratch area then we must read from
+ * there, since new writes were redirected there while we were paused
+ * or the original location may have been partially overwritten with
+ * reflowed data.
+ */
+ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
+ VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
+ /*
+ * Read from scratch space.
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
+ * to the offset to calculate the physical offset to
+ * write to. Passing in a negative offset makes us
+ * access the scratch area.
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL,
+ raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d reading scratch location",
+ error);
+ goto io_error_exit;
+ }
+ goto overwrite;
+ }
+
+ /*
+ * Read from original location.
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children - 1; i++) {
+ ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], read_size, ZIO_TYPE_READ,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+ raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d reading original location", error);
+io_error_exit:
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+ zfs_rangelock_exit(lr);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ return;
+ }
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
+
+ /*
+ * Reflow in memory.
+ */
+ uint64_t logical_sectors = logical_size >> ashift;
+ for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
+ int oldchild = i % (raidvd->vdev_children - 1);
+ uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
+
+ int newchild = i % raidvd->vdev_children;
+ uint64_t newoff = (i / raidvd->vdev_children) << ashift;
+
+ /* a single sector should not be copying over itself */
+ ASSERT(!(newchild == oldchild && newoff == oldoff));
+
+ abd_copy_off(abds[newchild], abds[oldchild],
+ newoff, oldoff, 1 << ashift);
+ }
+
+ /*
+ * Verify that we filled in everything we intended to (write_size on
+ * each child).
+ */
+ VERIFY0(logical_sectors % raidvd->vdev_children);
+ VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
+ write_size);
+
+ /*
+ * Write to scratch location (boot area).
+ */
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
+ * the offset to calculate the physical offset to write to.
+ * Passing in a negative offset lets us access the boot area.
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ zfs_dbgmsg("reflow: error %d writing scratch location", error);
+ goto io_error_exit;
+ }
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
+ (long long)logical_size);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
+
+ /*
+ * Update uberblock to indicate that scratch space is valid. This is
+ * needed because after this point, the real location may be
+ * overwritten. If we crash, we need to get the data from the
+ * scratch space, rather than the real location.
+ *
+ * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
+ * will prefer this uberblock.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow: uberblock updated "
+ "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
+
+ /*
+ * Overwrite with reflow'ed data.
+ */
+overwrite:
+ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], write_size, ZIO_TYPE_WRITE,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
+ raidz_scratch_child_done, pio));
+ }
+ error = zio_wait(pio);
+ if (error != 0) {
+ /*
+ * When we exit early here and drop the range lock, new
+ * writes will go into the scratch area so we'll need to
+ * read from there when we return after pausing.
+ */
+ zfs_dbgmsg("reflow: error %d writing real location", error);
+ /*
+ * Update the uberblock that is written when this txg completes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
+ logical_size);
+ goto io_error_exit;
+ }
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
+ (long long)logical_size);
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
+
+ /*
+ * Update uberblock to indicate that the initial part has been
+ * reflow'ed. This is needed because after this point (when we exit
+ * the rangelock), we allow regular writes to this region, which will
+ * be written to the new location only (because reflow_offset_next ==
+ * reflow_offset_synced). If we crashed and re-copied from the
+ * scratch space, we would lose the regular writes.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
+ logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow: uberblock updated "
+ "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
+
+ /*
+ * Update progress.
+ */
+ vre->vre_offset = logical_size;
+ zfs_rangelock_exit(lr);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
+ vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
+ /*
+ * Note - raidz_reflow_sync() will update the uberblock state to
+ * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
+ */
+ raidz_reflow_sync(spa, tx);
+
+ raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
+}
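
The in-memory reflow loop in this function moves logical sector i from (child i % (children - 1), row i / (children - 1)) in the old layout to (child i % children, row i / children) in the new one; the first children - 1 sectors already sit in their final position, which is why the loop starts at i = vdev_children - 1. A standalone sketch for a hypothetical 4-to-5 disk expansion:

#include <stdio.h>

int
main(void)
{
	int children = 5;		/* new (physical) width */
	int logical_sectors = 20;	/* hypothetical, a multiple of children */

	for (int i = children - 1; i < logical_sectors; i++) {
		int oldchild = i % (children - 1);
		int oldrow = i / (children - 1);
		int newchild = i % children;
		int newrow = i / children;

		printf("sector %2d: old (child %d, row %d) -> "
		    "new (child %d, row %d)\n",
		    i, oldchild, oldrow, newchild, newrow);
	}
	return (0);
}
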
+
+/*
+ * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
+ * here. No other i/o can be in progress, so we don't need the vre_rangelock.
+ */
+void
+vdev_raidz_reflow_copy_scratch(spa_t *spa)
+{
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
+ ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ ASSERT0(logical_size % raidvd->vdev_children);
+ uint64_t write_size = logical_size / raidvd->vdev_children;
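+	/*
+	 * For illustration (hypothetical numbers): if 6 MiB of logical
+	 * data was reflowed into the scratch area of a 6-child raidz,
+	 * each child contributes 1 MiB, read back below.
+	 */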
+
+ zio_t *pio;
+
+ /*
+ * Read from scratch space.
+ */
+ abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
+ KM_SLEEP);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ abds[i] = abd_alloc_linear(write_size, B_FALSE);
+ }
+
+ pio = zio_root(spa, NULL, NULL, 0);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ /*
+ * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
+		 * the offset to calculate the physical offset to read from.
+ * Passing in a negative offset lets us access the boot area.
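+		 * E.g., a child offset of (VDEV_BOOT_OFFSET -
+		 * VDEV_LABEL_START_SIZE) ends up at physical offset
+		 * VDEV_BOOT_OFFSET, the start of the boot area that
+		 * holds the scratch copy.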
+ */
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
+ write_size, ZIO_TYPE_READ,
+ ZIO_PRIORITY_ASYNC_READ, 0,
+ raidz_scratch_child_done, pio));
+ }
+ zio_wait(pio);
+
+ /*
+ * Overwrite real location with reflow'ed data.
+ */
+ pio = zio_root(spa, NULL, NULL, 0);
+ for (int i = 0; i < raidvd->vdev_children; i++) {
+ zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
+ 0, abds[i], write_size, ZIO_TYPE_WRITE,
+ ZIO_PRIORITY_ASYNC_WRITE, 0,
+ raidz_scratch_child_done, pio));
+ }
+ zio_wait(pio);
+ pio = zio_root(spa, NULL, NULL, 0);
+ zio_flush(pio, raidvd);
+ zio_wait(pio);
+
+ zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
+ "to real location", (long long)logical_size);
+
+ for (int i = 0; i < raidvd->vdev_children; i++)
+ abd_free(abds[i]);
+ kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
+
+ /*
+ * Update uberblock.
+ */
+ RAIDZ_REFLOW_SET(&spa->spa_ubsync,
+ RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
+ spa->spa_ubsync.ub_timestamp++;
+ VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
+ &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, &spa->spa_ubsync);
+
+ zfs_dbgmsg("reflow recovery: uberblock updated "
+ "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
+ (long long)spa->spa_ubsync.ub_txg,
+ (long long)logical_size,
+ (long long)spa->spa_ubsync.ub_timestamp);
+
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
+ spa_first_txg(spa));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+ vre->vre_offset = logical_size;
+ vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
+ vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
+ /*
+ * Note that raidz_reflow_sync() will update the uberblock once more
+ */
+ raidz_reflow_sync(spa, tx);
+
+ dmu_tx_commit(tx);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+}
+
+static boolean_t
+spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
+{
+ (void) zthr;
+ spa_t *spa = arg;
+
+ return (spa->spa_raidz_expand != NULL &&
+ !spa->spa_raidz_expand->vre_waiting_for_resilver);
+}
+
+/*
+ * RAIDZ expansion background thread
+ *
+ * Can be called multiple times if the reflow is paused
+ */
+static void
+spa_raidz_expand_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
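+	/*
+	 * Decide where to resume from the last synced uberblock: if its
+	 * state is still SCRATCH_VALID, the reflowed initial portion is
+	 * only guaranteed to be valid in the scratch area, so restart
+	 * from offset 0 (and redo the scratch sync below); otherwise
+	 * continue from the recorded reflow offset.
+	 */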
+ if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
+ vre->vre_offset = 0;
+ else
+ vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
+
+	/* Reflow the beginning portion using the scratch area */
+ if (vre->vre_offset == 0) {
+ VERIFY0(dsl_sync_task(spa_name(spa),
+ NULL, raidz_reflow_scratch_sync,
+ vre, 0, ZFS_SPACE_CHECK_NONE));
+
+ /* if we encountered errors then pause */
+ if (vre->vre_offset == 0) {
+ mutex_enter(&vre->vre_lock);
+ vre->vre_waiting_for_resilver = B_TRUE;
+ mutex_exit(&vre->vre_lock);
+ return;
+ }
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ uint64_t guid = raidvd->vdev_guid;
+
+ /* Iterate over all the remaining metaslabs */
+ for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
+ i < raidvd->vdev_ms_count &&
+ !zthr_iscancelled(zthr) &&
+ vre->vre_failed_offset == UINT64_MAX; i++) {
+ metaslab_t *msp = raidvd->vdev_ms[i];
+
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * The metaslab may be newly created (for the expanded
+ * space), in which case its trees won't exist yet,
+ * so we need to bail out early.
+ */
+ if (msp->ms_new) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * We want to copy everything except the free (allocatable)
+ * space. Note that there may be a little bit more free
+ * space (e.g. in ms_defer), and it's fine to copy that too.
+ */
+ range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
+ NULL, 0, 0);
+ range_tree_add(rt, msp->ms_start, msp->ms_size);
+ range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
+ mutex_exit(&msp->ms_lock);
+
+ /*
+ * Force the last sector of each metaslab to be copied. This
+ * ensures that we advance the on-disk progress to the end of
+ * this metaslab while the metaslab is disabled. Otherwise, we
+ * could move past this metaslab without advancing the on-disk
+ * progress, and then an allocation to this metaslab would not
+ * be copied.
+ */
+ int sectorsz = 1 << raidvd->vdev_ashift;
+ uint64_t ms_last_offset = msp->ms_start +
+ msp->ms_size - sectorsz;
+ if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
+ range_tree_add(rt, ms_last_offset, sectorsz);
+ }
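+		/*
+		 * (E.g., with a vdev_ashift of 12 this force-adds the
+		 * final 1 << 12 = 4096 bytes of the metaslab.)
+		 */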
+
+ /*
+ * When we are resuming from a paused expansion (i.e.
+		 * when importing a pool with an expansion in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(rt, 0, vre->vre_offset);
+
+ while (!zthr_iscancelled(zthr) &&
+ !range_tree_is_empty(rt) &&
+ vre->vre_failed_offset == UINT64_MAX) {
+
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * If requested, pause the reflow when the amount
+			 * specified by raidz_expand_max_reflow_bytes is reached.
+ *
+ * This pause is only used during testing or debugging.
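+			 * (The tunable is registered as a module parameter
+			 * at the bottom of this file; a value of zero
+			 * disables the pause.)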
+ */
+ while (raidz_expand_max_reflow_bytes != 0 &&
+ raidz_expand_max_reflow_bytes <=
+ vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
+ delay(hz);
+ }
+
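+			/*
+			 * Throttle copy i/o: if more than
+			 * raidz_expand_max_copy_bytes of reflow i/o is
+			 * already outstanding, wait on vre_cv until enough
+			 * of it completes.
+			 */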
+ mutex_enter(&vre->vre_lock);
+ while (vre->vre_outstanding_bytes >
+ raidz_expand_max_copy_bytes) {
+ cv_wait(&vre->vre_cv, &vre->vre_lock);
+ }
+ mutex_exit(&vre->vre_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ /*
+ * Reacquire the vdev_config lock. Theoretically, the
+ * vdev_t that we're expanding may have changed.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+
+ boolean_t needsync =
+ raidz_reflow_impl(raidvd, vre, rt, tx);
+
+ dmu_tx_commit(tx);
+
+ if (needsync) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+ spa_config_enter(spa, SCL_CONFIG, FTAG,
+ RW_READER);
+ }
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ range_tree_vacate(rt, NULL, NULL);
+ range_tree_destroy(rt);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * The txg_wait_synced() here ensures that all reflow zio's have
+ * completed, and vre_failed_offset has been set if necessary. It
+ * also ensures that the progress of the last raidz_reflow_sync() is
+ * written to disk before raidz_reflow_complete_sync() changes the
+ * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
+ * determine if a reflow is in progress, in which case we may need to
+ * write to both old and new locations. Therefore we can only change
+	 * vre_state once that is no longer necessary, which is once the on-disk
+ * progress (in spa_ubsync) has been set past any possible writes (to
+ * the end of the last metaslab).
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ if (!zthr_iscancelled(zthr) &&
+ vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
+ /*
+ * We are not being canceled or paused, so the reflow must be
+ * complete. In that case also mark it as completed on disk.
+ */
+ ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ raidz_reflow_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+ (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
+ } else {
+ /*
+ * Wait for all copy zio's to complete and for all the
+ * raidz_reflow_sync() synctasks to be run.
+ */
+ spa_history_log_internal(spa, "reflow pause",
+ NULL, "offset=%llu failed_offset=%lld",
+ (long long)vre->vre_offset,
+ (long long)vre->vre_failed_offset);
+ mutex_enter(&vre->vre_lock);
+ if (vre->vre_failed_offset != UINT64_MAX) {
+ /*
+ * Reset progress so that we will retry everything
+ * after the point that something failed.
+ */
+ vre->vre_offset = vre->vre_failed_offset;
+ vre->vre_failed_offset = UINT64_MAX;
+ vre->vre_waiting_for_resilver = B_TRUE;
+ }
+ mutex_exit(&vre->vre_lock);
+ }
+}
+
+void
+spa_start_raidz_expansion_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
+ spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
+ spa_raidz_expand_thread_check, spa_raidz_expand_thread,
+ spa, defclsyspri);
+}
+
+void
+raidz_dtl_reassessed(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ if (spa->spa_raidz_expand != NULL) {
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+ /*
+		 * We get called often from vdev_dtl_reassess(), so make
+		 * sure it's our vdev and that no child is still being
+		 * replaced.
+ */
+ if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
+ !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
+ mutex_enter(&vre->vre_lock);
+ if (vre->vre_waiting_for_resilver) {
+ vdev_dbgmsg(vd, "DTL reassessed, "
+ "continuing raidz expansion");
+ vre->vre_waiting_for_resilver = B_FALSE;
+ zthr_wakeup(spa->spa_raidz_expand_zthr);
+ }
+ mutex_exit(&vre->vre_lock);
+ }
+ }
+}
+
+int
+vdev_raidz_attach_check(vdev_t *new_child)
+{
+ vdev_t *raidvd = new_child->vdev_parent;
+ uint64_t new_children = raidvd->vdev_children;
+
+ /*
+ * We use the "boot" space as scratch space to handle overwriting the
+ * initial part of the vdev. If it is too small, then this expansion
+ * is not allowed. This would be very unusual (e.g. ashift > 13 and
+ * >200 children).
+ */
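+	/*
+	 * For illustration (hypothetical numbers): with ashift 12
+	 * (4 KiB sectors), a 32-wide raidz needs 32 << 12 = 128 KiB of
+	 * boot-area scratch, well within the limit checked here.
+	 */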
+ if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
+ return (EINVAL);
+ }
+ return (0);
+}
+
+void
+vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *new_child = arg;
+ spa_t *spa = new_child->vdev_spa;
+ vdev_t *raidvd = new_child->vdev_parent;
+ vdev_raidz_t *vdrz = raidvd->vdev_tsd;
+ ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
+ ASSERT3P(raidvd->vdev_top, ==, raidvd);
+ ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
+ ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
+ ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
+ new_child);
+
+ spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
+
+ vdrz->vd_physical_width++;
+
+ VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
+ vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
+ vdrz->vn_vre.vre_offset = 0;
+ vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
+ spa->spa_raidz_expand = &vdrz->vn_vre;
+ zthr_wakeup(spa->spa_raidz_expand_zthr);
+
+ /*
+ * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
+ * written to the config.
+ */
+ vdev_config_dirty(raidvd);
+
+ vdrz->vn_vre.vre_start_time = gethrestime_sec();
+ vdrz->vn_vre.vre_end_time = 0;
+ vdrz->vn_vre.vre_state = DSS_SCANNING;
+ vdrz->vn_vre.vre_bytes_copied = 0;
+
+ uint64_t state = vdrz->vn_vre.vre_state;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state, tx));
+
+ uint64_t start_time = vdrz->vn_vre.vre_start_time;
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
+ sizeof (start_time), 1, &start_time, tx));
+
+ (void) zap_remove(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
+ (void) zap_remove(spa->spa_meta_objset,
+ raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
+
+ spa_history_log_internal(spa, "raidz vdev expansion started", tx,
+ "%s vdev %llu new width %llu", spa_name(spa),
+ (unsigned long long)raidvd->vdev_id,
+ (unsigned long long)raidvd->vdev_children);
+}
+
+int
+vdev_raidz_load(vdev_t *vd)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ int err;
+
+ uint64_t state = DSS_NONE;
+ uint64_t start_time = 0;
+ uint64_t end_time = 0;
+ uint64_t bytes_copied = 0;
+
+ if (vd->vdev_top_zap != 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
+ sizeof (state), 1, &state);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
+ sizeof (start_time), 1, &start_time);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
+ sizeof (end_time), 1, &end_time);
+ if (err != 0 && err != ENOENT)
+ return (err);
+
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
+ sizeof (bytes_copied), 1, &bytes_copied);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ /*
+ * If we are in the middle of expansion, vre_state should have
+ * already been set by vdev_raidz_init().
+ */
+ EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
+ vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
+ vdrz->vn_vre.vre_start_time = start_time;
+ vdrz->vn_vre.vre_end_time = end_time;
+ vdrz->vn_vre.vre_bytes_copied = bytes_copied;
+
+ return (0);
+}
+
+int
+spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
+{
+ vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
+
+ if (vre == NULL) {
+		/* no expansion in progress; find most recent completed */
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ if (vdrz->vn_vre.vre_end_time != 0 &&
+ (vre == NULL ||
+ vdrz->vn_vre.vre_end_time >
+ vre->vre_end_time)) {
+ vre = &vdrz->vn_vre;
+ }
+ }
+ }
+ }
+
+ if (vre == NULL) {
+ return (SET_ERROR(ENOENT));
+ }
+
+ pres->pres_state = vre->vre_state;
+ pres->pres_expanding_vdev = vre->vre_vdev_id;
+
+ vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
+ pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
+
+ mutex_enter(&vre->vre_lock);
+ pres->pres_reflowed = vre->vre_bytes_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
+ mutex_exit(&vre->vre_lock);
+
+ pres->pres_start_time = vre->vre_start_time;
+ pres->pres_end_time = vre->vre_end_time;
+ pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
+
+ return (0);
+}
+
/*
* Initialize private RAIDZ specific fields from the nvlist.
*/
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
- vdev_raidz_t *vdrz;
- uint64_t nparity;
-
uint_t children;
nvlist_t **child;
int error = nvlist_lookup_nvlist_array(nv,
@@ -2566,6 +4819,7 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
if (error != 0)
return (SET_ERROR(EINVAL));
+ uint64_t nparity;
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
return (SET_ERROR(EINVAL));
@@ -2592,10 +4846,56 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
nparity = 1;
}
- vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
- vdrz->vd_logical_width = children;
+ vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+ vdrz->vn_vre.vre_vdev_id = -1;
+ vdrz->vn_vre.vre_offset = UINT64_MAX;
+ vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
+ mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
+ zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
+ mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
+ sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
+
+ vdrz->vd_physical_width = children;
vdrz->vd_nparity = nparity;
+ /* note, the ID does not exist when creating a pool */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &vdrz->vn_vre.vre_vdev_id);
+
+ boolean_t reflow_in_progress =
+ nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
+ if (reflow_in_progress) {
+ spa->spa_raidz_expand = &vdrz->vn_vre;
+ vdrz->vn_vre.vre_state = DSS_SCANNING;
+ }
+
+ vdrz->vd_original_width = children;
+ uint64_t *txgs;
+ unsigned int txgs_size = 0;
+ error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
+ &txgs, &txgs_size);
+ if (error == 0) {
+ for (int i = 0; i < txgs_size; i++) {
+ reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
+ re->re_txg = txgs[txgs_size - i - 1];
+ re->re_logical_width = vdrz->vd_physical_width - i;
+
+ if (reflow_in_progress)
+ re->re_logical_width--;
+
+ avl_add(&vdrz->vd_expand_txgs, re);
+ }
+
+ vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
+ }
+ if (reflow_in_progress) {
+ vdrz->vd_original_width--;
+ zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
+ children, txgs_size);
+ }
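+	/*
+	 * For illustration (hypothetical numbers): a raidz expanded at
+	 * txgs 100 and 200 that is now 7 wide gets expand-txg nodes
+	 * {txg 200 -> width 7, txg 100 -> width 6} and an original
+	 * width of 7 - 2 = 5; each of these widths is one less while a
+	 * reflow is still in progress, since the physical width already
+	 * counts the newly attached child.
+	 */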
+
*tsd = vdrz;
return (0);
@@ -2604,7 +4904,20 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
static void
vdev_raidz_fini(vdev_t *vd)
{
- kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
+ vd->vdev_spa->spa_raidz_expand = NULL;
+ reflow_node_t *re;
+ void *cookie = NULL;
+ avl_tree_t *tree = &vdrz->vd_expand_txgs;
+ while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
+ kmem_free(re, sizeof (*re));
+ avl_destroy(&vdrz->vd_expand_txgs);
+ mutex_destroy(&vdrz->vd_expand_lock);
+ mutex_destroy(&vdrz->vn_vre.vre_lock);
+ cv_destroy(&vdrz->vn_vre.vre_cv);
+ zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
+ kmem_free(vdrz, sizeof (*vdrz));
}
/*
@@ -2632,6 +4945,29 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
* it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+
+ if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
+ }
+
+ mutex_enter(&vdrz->vd_expand_lock);
+ if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
+ uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
+ uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
+ KM_SLEEP);
+ uint64_t i = 0;
+
+ for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
+ re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
+ txgs[i++] = re->re_txg;
+ }
+
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
+ txgs, count);
+
+ kmem_free(txgs, sizeof (uint64_t) * count);
+ }
+ mutex_exit(&vdrz->vd_expand_lock);
}
static uint64_t
@@ -2671,3 +5007,15 @@ vdev_ops_t vdev_raidz_ops = {
.vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
+ "For testing, pause RAIDZ expansion after reflowing this many bytes");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
+ "Max amount of concurrent i/o for RAIDZ expansion");
+ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
+ "For expanded RAIDZ, aggregate reads that have more rows than this");
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
+ "For expanded RAIDZ, automatically start a pool scrub when expansion "
+ "completes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
index 03e17db024ea..1c54eae40355 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_trim.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -169,7 +169,8 @@ static boolean_t
vdev_trim_should_stop(vdev_t *vd)
{
return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
- vd->vdev_detached || vd->vdev_top->vdev_removing);
+ vd->vdev_detached || vd->vdev_top->vdev_removing ||
+ vd->vdev_top->vdev_rz_expanding);
}
/*
@@ -180,6 +181,7 @@ vdev_autotrim_should_stop(vdev_t *tvd)
{
return (tvd->vdev_autotrim_exit_wanted ||
!vdev_writeable(tvd) || tvd->vdev_removing ||
+ tvd->vdev_rz_expanding ||
spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
}
@@ -222,7 +224,8 @@ vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
kmem_free(arg, sizeof (uint64_t));
vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
- if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ if (vd == NULL || vd->vdev_top->vdev_removing ||
+ !vdev_is_concrete(vd) || vd->vdev_top->vdev_rz_expanding)
return;
uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
@@ -1005,6 +1008,7 @@ vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_trim_exit_wanted);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_rz_expanding);
vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
vd->vdev_trim_thread = thread_create(NULL, 0,
@@ -1162,12 +1166,13 @@ vdev_trim_restart(vdev_t *vd)
ASSERT(err == 0 || err == ENOENT);
vd->vdev_trim_action_time = timestamp;
- if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
- vd->vdev_offline) {
+ if ((vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+ vd->vdev_offline) && !vd->vdev_top->vdev_rz_expanding) {
/* load progress for reporting, but don't resume */
VERIFY0(vdev_trim_load(vd));
} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+ !vd->vdev_top->vdev_rz_expanding &&
vd->vdev_trim_thread == NULL) {
VERIFY0(vdev_trim_load(vd));
vdev_trim(vd, vd->vdev_trim_rate,
@@ -1492,7 +1497,8 @@ vdev_autotrim(spa_t *spa)
mutex_enter(&tvd->vdev_autotrim_lock);
if (vdev_writeable(tvd) && !tvd->vdev_removing &&
- tvd->vdev_autotrim_thread == NULL) {
+ tvd->vdev_autotrim_thread == NULL &&
+ !tvd->vdev_rz_expanding) {
ASSERT3P(tvd->vdev_top, ==, tvd);
tvd->vdev_autotrim_thread = thread_create(NULL, 0,
@@ -1717,6 +1723,7 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
ASSERT(vd->vdev_ops->vdev_op_leaf);
ASSERT(!vd->vdev_detached);
ASSERT(!vd->vdev_top->vdev_removing);
+ ASSERT(!vd->vdev_top->vdev_rz_expanding);
ta.trim_vdev = vd;
ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);