aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/zfs/vdev.c
diff options
context:
space:
mode:
authorMartin Matuska <mm@FreeBSD.org>2023-11-09 10:42:33 +0000
committerMartin Matuska <mm@FreeBSD.org>2023-11-09 12:19:17 +0000
commite716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch)
tree3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/module/zfs/vdev.c
parentf5b3e686292b6502878c64c3c154908024e06eb6 (diff)
parent887a3c533b94a4b70075e310f15c45b9dee19410 (diff)
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges: #15022 5caeef02f RAID-Z expansion feature #15457 887a3c533 Increase L2ARC write rate and headroom #15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384 Obtained from: OpenZFS OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/vdev.c')
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c114
1 files changed, 79 insertions, 35 deletions
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index afb01c0ef7fd..c10c78ebf6db 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -58,6 +58,7 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>
#include "zfs_prop.h"
@@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias)
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
-vdev_default_asize(vdev_t *vd, uint64_t psize)
+vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;
for (int c = 0; c < vd->vdev_children; c++) {
- csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
asize = MAX(asize, csize);
}
@@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_removing);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
&vd->vdev_top_zap);
+ vd->vdev_rz_expanding = nvlist_exists(nv,
+ ZPOOL_CONFIG_RAIDZ_EXPANDING);
} else {
ASSERT0(vd->vdev_top_zap);
}
@@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio)
vd->vdev_cant_read |= !vps->vps_readable;
vd->vdev_cant_write |= !vps->vps_writeable;
+ vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
+ vd->vdev_cant_read, vd->vdev_cant_write);
if (vdev_readable(vd) &&
(vdev_writeable(vd) || !spa_writeable(spa))) {
@@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
}
/*
- * Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the "typical" blocksize.
- * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
- * otherwise it would inconsistently account for existing bp's.
+ * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
+ * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE
+ * changed, this algorithm can not change, otherwise it would inconsistently
+ * account for existing bp's. We also hard-code txg 0 for the same reason
+ * since expanded RAIDZ vdevs can use a different asize for different birth
+ * txg's.
*/
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
vd->vdev_deflate_ratio = (1 << 17) /
- (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
+ SPA_MINBLOCKSHIFT);
}
}
@@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
if (txg != 0)
vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
- return;
+ } else {
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
+ if (t == DTL_SCRUB) {
+ /* leaf vdevs only */
+ continue;
+ }
+ if (t == DTL_PARTIAL) {
+ /* i.e. non-zero */
+ minref = 1;
+ } else if (vdev_get_nparity(vd) != 0) {
+ /* RAIDZ, DRAID */
+ minref = vdev_get_nparity(vd) + 1;
+ } else {
+ /* any kind of mirror */
+ minref = vd->vdev_children;
+ }
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree,
+ cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
}
- mutex_enter(&vd->vdev_dtl_lock);
- for (int t = 0; t < DTL_TYPES; t++) {
- /* account for child's outage in parent's missing map */
- int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
- if (t == DTL_SCRUB)
- continue; /* leaf vdevs only */
- if (t == DTL_PARTIAL)
- minref = 1; /* i.e. non-zero */
- else if (vdev_get_nparity(vd) != 0)
- minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
- else
- minref = vd->vdev_children; /* any kind of mirror */
- space_reftree_create(&reftree);
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_t *cvd = vd->vdev_child[c];
- mutex_enter(&cvd->vdev_dtl_lock);
- space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
- mutex_exit(&cvd->vdev_dtl_lock);
- }
- space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
- space_reftree_destroy(&reftree);
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
+ raidz_dtl_reassessed(vd);
}
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
@@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ error = vdev_raidz_load(vd);
+ if (error != 0)
+ return (error);
+ }
+
/*
* On spa_load path, grab the allocation bias from our zap
*/
@@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}
+/*
+ * Return the amount of space that should be (or was) allocated for the given
+ * psize (compressed block size) in the given TXG. Note that for expanded
+ * RAIDZ vdevs, the size allocated for older BP's may be larger. See
+ * vdev_raidz_asize().
+ */
+uint64_t
+vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+}
+
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
- return (vd->vdev_ops->vdev_op_asize(vd, psize));
+ return (vdev_psize_to_asize_txg(vd, psize, 0));
}
/*
@@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
- if (!vd->vdev_ops->vdev_op_leaf)
- return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
-
wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
oldstate = vd->vdev_state;
@@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ if ((vd->vdev_spa->spa_raidz_expand == NULL ||
+ vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
+ (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
vdev_is_concrete(vd)) {
vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
@@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
vdev_prop_add_list(outnvl, propname, NULL,
vd->vdev_removing, ZPROP_SRC_NONE);
continue;
+ case VDEV_PROP_RAIDZ_EXPANDING:
+ /* Only expose this for raidz */
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vd->vdev_rz_expanding,
+ ZPROP_SRC_NONE);
+ }
+ continue;
/* Numeric Properites */
case VDEV_PROP_ALLOCATING:
/* Leaf vdevs cannot have this property */