diff options
author | Martin Matuska <mm@FreeBSD.org> | 2023-11-09 10:42:33 +0000 |
---|---|---|
committer | Martin Matuska <mm@FreeBSD.org> | 2023-11-09 12:19:17 +0000 |
commit | e716630d4cf89e69ec3f675ebfceee09f1a85e05 (patch) | |
tree | 3ee825a5671f470e1481d24312b58895a12d01ac /sys/contrib/openzfs/module/zfs/vdev.c | |
parent | f5b3e686292b6502878c64c3c154908024e06eb6 (diff) | |
parent | 887a3c533b94a4b70075e310f15c45b9dee19410 (diff) |
zfs: merge openzfs/zfs@887a3c533
Notable upstream pull request merges:
#15022 5caeef02f RAID-Z expansion feature
#15457 887a3c533 Increase L2ARC write rate and headroom
#15504 1c1be60fa Unbreak FreeBSD world build after 3bd4df384
Obtained from: OpenZFS
OpenZFS commit: 887a3c533b94a4b70075e310f15c45b9dee19410
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/vdev.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/vdev.c | 114 |
1 files changed, 79 insertions, 35 deletions
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index afb01c0ef7fd..c10c78ebf6db 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -58,6 +58,7 @@ #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> #include "zfs_prop.h" @@ -305,13 +306,13 @@ vdev_derive_alloc_bias(const char *bias) * all children. This is what's used by anything other than RAID-Z. */ uint64_t -vdev_default_asize(vdev_t *vd, uint64_t psize) +vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { - csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } @@ -930,6 +931,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); + vd->vdev_rz_expanding = nvlist_exists(nv, + ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } @@ -1692,6 +1695,8 @@ vdev_probe_done(zio_t *zio) vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; + vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", + vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { @@ -1913,17 +1918,20 @@ vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) } /* - * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the "typical" blocksize. - * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, - * otherwise it would inconsistently account for existing bp's. + * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) + * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE + * changed, this algorithm can not change, otherwise it would inconsistently + * account for existing bp's. We also hard-code txg 0 for the same reason + * since expanded RAIDZ vdevs can use a different asize for different birth + * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / - (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> + SPA_MINBLOCKSHIFT); } } @@ -3228,32 +3236,43 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - return; + } else { + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) { + /* leaf vdevs only */ + continue; + } + if (t == DTL_PARTIAL) { + /* i.e. non-zero */ + minref = 1; + } else if (vdev_get_nparity(vd) != 0) { + /* RAIDZ, DRAID */ + minref = vdev_get_nparity(vd) + 1; + } else { + /* any kind of mirror */ + minref = vd->vdev_children; + } + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, + cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, + vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); } - mutex_enter(&vd->vdev_dtl_lock); - for (int t = 0; t < DTL_TYPES; t++) { - /* account for child's outage in parent's missing map */ - int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; - if (t == DTL_SCRUB) - continue; /* leaf vdevs only */ - if (t == DTL_PARTIAL) - minref = 1; /* i.e. non-zero */ - else if (vdev_get_nparity(vd) != 0) - minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ - else - minref = vd->vdev_children; /* any kind of mirror */ - space_reftree_create(&reftree); - for (int c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - mutex_enter(&cvd->vdev_dtl_lock); - space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); - mutex_exit(&cvd->vdev_dtl_lock); - } - space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); - space_reftree_destroy(&reftree); + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { + raidz_dtl_reassessed(vd); } - mutex_exit(&vd->vdev_dtl_lock); } /* @@ -3628,6 +3647,12 @@ vdev_load(vdev_t *vd) vdev_set_deflate_ratio(vd); + if (vd->vdev_ops == &vdev_raidz_ops) { + error = vdev_raidz_load(vd); + if (error != 0) + return (error); + } + /* * On spa_load path, grab the allocation bias from our zap */ @@ -4005,10 +4030,22 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Return the amount of space that should be (or was) allocated for the given + * psize (compressed block size) in the given TXG. Note that for expanded + * RAIDZ vdevs, the size allocated for older BP's may be larger. See + * vdev_raidz_asize(). + */ +uint64_t +vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); +} + uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { - return (vd->vdev_ops->vdev_op_asize(vd, psize)); + return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* @@ -4174,9 +4211,6 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); - if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); - wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; @@ -5457,7 +5491,9 @@ vdev_expand(vdev_t *vd, uint64_t txg) vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + if ((vd->vdev_spa->spa_raidz_expand == NULL || + vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && + (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); @@ -6209,6 +6245,14 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; + case VDEV_PROP_RAIDZ_EXPANDING: + /* Only expose this for raidz */ + if (vd->vdev_ops == &vdev_raidz_ops) { + vdev_prop_add_list(outnvl, propname, + NULL, vd->vdev_rz_expanding, + ZPROP_SRC_NONE); + } + continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ |