path: root/module/zfs/vdev_removal.c
author    Matthew Ahrens <mahrens@delphix.com>    2019-06-13 20:12:39 +0000
committer Brian Behlendorf <behlendorf1@llnl.gov>    2019-06-13 20:12:39 +0000
commit    53dce5acc652800fcfca1b83e22a00c5e4fc9e87 (patch)
tree      209821821bba809af812e0ad209ce3a600fc1067 /module/zfs/vdev_removal.c
parent    be89734a29fda5a0f5780d953789fb7e91b2a529 (diff)
panic in removal_remap test on 4K devices
If the zfs_remove_max_segment tunable is changed to be not a multiple of the
sector size, then the device removal code will malfunction and try to create
mappings that are smaller than one sector, leading to a panic.

On debug bits this assertion will fail in spa_vdev_copy_segment():
    ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);

On nondebug, the system panics with a stack like:
    metaslab_free_concrete()
    metaslab_free_impl()
    metaslab_free_impl_cb()
    vdev_indirect_remap()
    free_from_removing_vdev()
    metaslab_free_impl()
    metaslab_free_dva()
    metaslab_free()

Fortunately, the default for zfs_remove_max_segment is 1MB, so this can't
occur by default. We hit it during this test because removal_remap.ksh
changes zfs_remove_max_segment to 1KB. When testing on 4KB-sector disks,
we hit the bug.

This change makes the zfs_remove_max_segment tunable more robust,
automatically rounding it up to a multiple of the sector size. We also turn
some key assertions into VERIFY's so that similar bugs would be caught
before they are encoded on disk (and thus avoid a panic-reboot-loop).

Reviewed-by: Sean Eric Fagan <sef@ixsystems.com>
Reviewed-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
External-issue: DLPX-61342
Closes #8893
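As a minimal standalone sketch of the rounding the fix relies on (not part of
the patch itself): P2ROUNDUP rounds a value up to the next multiple of a
power-of-two alignment, so the 1KB value set by removal_remap.ksh becomes 4KB
on a pool with 4KB sectors (ashift=12). The round_up_pow2() helper and the
values below are illustrative only, mirroring the macro's arithmetic.

    #include <stdint.h>
    #include <stdio.h>

    /* Round x up to the next multiple of align, where align is a power of
     * two. This mirrors what the kernel's P2ROUNDUP macro does. */
    static uint64_t
    round_up_pow2(uint64_t x, uint64_t align)
    {
            return ((x + align - 1) & ~(align - 1));
    }

    int
    main(void)
    {
            uint64_t tunable = 1024;        /* zfs_remove_max_segment = 1KB */
            uint64_t ashift = 12;           /* 4KB-sector device */
            uint64_t sector = 1ULL << ashift;

            /* 1024 is not a multiple of 4096, so without rounding the
             * removal code would try to map less than one sector.
             * Rounding up yields 4096. */
            printf("max segment: %llu -> %llu\n",
                (unsigned long long)tunable,
                (unsigned long long)round_up_pow2(tunable, sector));
            return (0);
    }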
Diffstat (limited to 'module/zfs/vdev_removal.c')
-rw-r--r--  module/zfs/vdev_removal.c | 33
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 536a982eca2b..6f64edd8c473 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -100,6 +100,8 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
* removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
* there is a performance problem with attempting to allocate large blocks,
* consider decreasing this.
+ *
+ * See also the accessor function spa_remove_max_segment().
*/
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
@@ -951,8 +953,10 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
vdev_indirect_mapping_entry_t *entry;
dva_t dst = {{ 0 }};
uint64_t start = range_tree_min(segs);
+ ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+ ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
uint64_t size = range_tree_span(segs);
if (range_tree_span(segs) > maxalloc) {
@@ -983,6 +987,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
}
}
ASSERT3U(size, <=, maxalloc);
+ ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
/*
* An allocation class might not have any remaining vdevs or space
@@ -1026,11 +1031,11 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
/*
* We can't have any padding of the allocated size, otherwise we will
- * misunderstand what's allocated, and the size of the mapping.
- * The caller ensures this will be true by passing in a size that is
- * aligned to the worst (highest) ashift in the pool.
+ * misunderstand what's allocated, and the size of the mapping. We
+ * prevent padding by ensuring that all devices in the pool have the
+ * same ashift, and the allocation size is a multiple of the ashift.
*/
- ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
+ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
@@ -1364,6 +1369,20 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
}
/*
+ * The size of each removal mapping is limited by the tunable
+ * zfs_remove_max_segment, but we must adjust this to be a multiple of the
+ * pool's ashift, so that we don't try to split individual sectors regardless
+ * of the tunable value. (Note that device removal requires that all devices
+ * have the same ashift, so there's no difference between spa_min_ashift and
+ * spa_max_ashift.) The raw tunable should not be used elsewhere.
+ */
+uint64_t
+spa_remove_max_segment(spa_t *spa)
+{
+ return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
+}
+
+/*
* The removal thread operates in open context. It iterates over all
* allocated space in the vdev, by loading each metaslab's spacemap.
* For each contiguous segment of allocated space (capping the segment
@@ -1385,7 +1404,7 @@ spa_vdev_remove_thread(void *arg)
spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
- uint64_t max_alloc = zfs_remove_max_segment;
+ uint64_t max_alloc = spa_remove_max_segment(spa);
uint64_t last_txg = 0;
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1511,7 +1530,7 @@ spa_vdev_remove_thread(void *arg)
vd = vdev_lookup_top(spa, svr->svr_vdev_id);
if (txg != last_txg)
- max_alloc = zfs_remove_max_segment;
+ max_alloc = spa_remove_max_segment(spa);
last_txg = txg;
spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);