path: root/cddl/contrib
author     Alexander Motin <mav@FreeBSD.org>    2018-03-28 22:01:27 +0000
committer  Alexander Motin <mav@FreeBSD.org>    2018-03-28 22:01:27 +0000
commit     0b0c76bc58f1ea92571ef1c9ec7596459426ec31 (patch)
tree       63570fbbc435e29786a767d591876db77fdf4ee1 /cddl/contrib
parent     4bada7a0a206974294fa26efb81e9eba1d0bc895 (diff)
parent     bce155015b3989c8a43519fc187ac469cf518336 (diff)
MFV r331695, 331700: 9166 zfs storage pool checkpoint
illumos/illumos-gate@8671400134a11c848244896ca51a7db4d0f69da4

The Storage Pool Checkpoint (aka "zpool checkpoint") can be thought of as a "pool-wide snapshot" (or a variation of extreme rewind that doesn't corrupt your data). It remembers the entire state of the pool at the point that it was taken, and the user can later revert back to it or discard it. Its generic use case is an administrator who is about to perform a set of destructive actions on ZFS as part of a critical procedure. She takes a checkpoint of the pool before performing the actions, then rewinds back to it if one of them fails or puts the pool into an unexpected state. Otherwise, she discards it. With the assumption that no one else is making modifications to ZFS, she basically wraps all these actions into a "high-level transaction".

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
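A minimal sketch of that workflow, using the zpool subcommands this change adds (see the zpool.8 hunk below); the pool name "mypool" and the destructive step are illustrative placeholders, not part of the commit:

    # zpool checkpoint mypool
        ... perform the critical/destructive procedure ...
    # zpool export mypool
    # zpool import --rewind-to-checkpoint mypool     (rewind if it went wrong)
      -- or --
    # zpool checkpoint -d mypool                      (discard it on success)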
Notes: svn path=/head/; revision=331701
Diffstat (limited to 'cddl/contrib')
-rw-r--r--  cddl/contrib/opensolaris/cmd/zdb/zdb.8                          |    5
-rw-r--r--  cddl/contrib/opensolaris/cmd/zdb/zdb.c                          |  847
-rw-r--r--  cddl/contrib/opensolaris/cmd/zdb/zdb_il.c                       |   12
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool-features.7             |   19
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool.8                      |   93
-rw-r--r--  cddl/contrib/opensolaris/cmd/zpool/zpool_main.c                 |  215
-rw-r--r--  cddl/contrib/opensolaris/cmd/ztest/ztest.c                      |  120
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h             |    7
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c        |   43
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c        |   29
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c   |   68
-rw-r--r--  cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h   |    3
12 files changed, 1316 insertions(+), 145 deletions(-)
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.8 b/cddl/contrib/opensolaris/cmd/zdb/zdb.8
index 2d294ea3641a..8561b972ac45 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.8
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.8
@@ -21,7 +21,7 @@
.Nd display zpool debugging and consistency information
.Sh SYNOPSIS
.Nm
-.Op Fl AbcdDFGhiLMPsvX
+.Op Fl AbcdDFGhikLMPsvX
.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
.Op Fl I Ar inflight I/Os
.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
@@ -170,6 +170,9 @@ Display information about intent log
.Pq ZIL
entries relating to each dataset.
If specified multiple times, display counts of each intent log transaction type.
+.It Fl k
+Examine the checkpointed state of the pool.
+Note, the on disk format of the pool is not reverted to the checkpointed state.
.It Fl l Ar device
Read the vdev labels from the specified device.
.Nm Fl l
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index 92b5f34fd046..223ebd86831d 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -131,7 +131,7 @@ static void
usage(void)
{
(void) fprintf(stderr,
- "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-V] [-p <path> ...]] "
+ "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] "
"[-I <inflight I/Os>]\n"
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[<poolname> [<object> ...]]\n"
@@ -168,6 +168,8 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -i intent logs\n");
(void) fprintf(stderr, " -l read label contents\n");
+ (void) fprintf(stderr, " -k examine the checkpointed state "
+ "of the pool\n");
(void) fprintf(stderr, " -L disable leak tracking (do not "
"load spacemaps)\n");
(void) fprintf(stderr, " -m metaslabs\n");
@@ -729,6 +731,22 @@ get_prev_obsolete_spacemap_refcount(spa_t *spa)
}
static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+ zap_contains(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+ refcount++;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
+static int
verify_spacemap_refcounts(spa_t *spa)
{
uint64_t expected_refcount = 0;
@@ -741,6 +759,7 @@ verify_spacemap_refcounts(spa_t *spa)
actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+ actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
if (expected_refcount != actual_refcount) {
(void) printf("space map refcount mismatch: expected %lld != "
@@ -814,8 +833,8 @@ static void
dump_metaslab_stats(metaslab_t *msp)
{
char maxbuf[32];
- range_tree_t *rt = msp->ms_tree;
- avl_tree_t *t = &msp->ms_size_tree;
+ range_tree_t *rt = msp->ms_allocatable;
+ avl_tree_t *t = &msp->ms_allocatable_by_size;
int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
/* max sure nicenum has enough space */
@@ -851,7 +870,7 @@ dump_metaslab(metaslab_t *msp)
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
VERIFY0(metaslab_load(msp));
- range_tree_stat_verify(msp->ms_tree);
+ range_tree_stat_verify(msp->ms_allocatable);
}
dump_metaslab_stats(msp);
metaslab_unload(msp);
@@ -2289,6 +2308,8 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
(void) printf("\trootbp = %s\n", blkbuf);
}
+ (void) printf("\tcheckpoint_txg = %llu\n",
+ (u_longlong_t)ub->ub_checkpoint_txg);
(void) printf("%s", footer ? footer : "");
}
@@ -2649,6 +2670,7 @@ static const char *zdb_ot_extname[] = {
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
+ uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
@@ -2748,7 +2770,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
}
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
- refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
+ refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
@@ -2950,7 +2972,7 @@ claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
ASSERT(vdev_is_concrete(vd));
VERIFY0(metaslab_claim_impl(vd, offset, size,
- spa_first_txg(vd->vdev_spa)));
+ spa_min_claim_txg(vd->vdev_spa)));
}
static void
@@ -3011,70 +3033,6 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
-/*
- * vm_idxp is an in-out parameter which (for indirect vdevs) is the
- * index in vim_entries that has the first entry in this metaslab. On
- * return, it will be set to the first entry after this metaslab.
- */
-static void
-zdb_leak_init_ms(metaslab_t *msp, uint64_t *vim_idxp)
-{
- metaslab_group_t *mg = msp->ms_group;
- vdev_t *vd = mg->mg_vd;
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
-
- mutex_enter(&msp->ms_lock);
- metaslab_unload(msp);
-
- /*
- * We don't want to spend the CPU manipulating the size-ordered
- * tree, so clear the range_tree ops.
- */
- msp->ms_tree->rt_ops = NULL;
-
- (void) fprintf(stderr,
- "\rloading vdev %llu of %llu, metaslab %llu of %llu ...",
- (longlong_t)vd->vdev_id,
- (longlong_t)rvd->vdev_children,
- (longlong_t)msp->ms_id,
- (longlong_t)vd->vdev_ms_count);
-
- /*
- * For leak detection, we overload the metaslab ms_tree to
- * contain allocated segments instead of free segments. As a
- * result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
- for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
- (*vim_idxp)++) {
- vdev_indirect_mapping_entry_phys_t *vimep =
- &vim->vim_entries[*vim_idxp];
- uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
- uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
- ASSERT3U(ent_offset, >=, msp->ms_start);
- if (ent_offset >= msp->ms_start + msp->ms_size)
- break;
-
- /*
- * Mappings do not cross metaslab boundaries,
- * because we create them by walking the metaslabs.
- */
- ASSERT3U(ent_offset + ent_len, <=,
- msp->ms_start + msp->ms_size);
- range_tree_add(msp->ms_tree, ent_offset, ent_len);
- }
- } else if (msp->ms_sm != NULL) {
- VERIFY0(space_map_load(msp->ms_sm, msp->ms_tree, SM_ALLOC));
- }
-
- if (!msp->ms_loaded) {
- msp->ms_loaded = B_TRUE;
- }
- mutex_exit(&msp->ms_lock);
-}
-
/* ARGSUSED */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
@@ -3131,6 +3089,242 @@ zdb_load_obsolete_counts(vdev_t *vd)
return (counts);
}
+typedef struct checkpoint_sm_exclude_entry_arg {
+ vdev_t *cseea_vd;
+ uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+static int
+checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+ vdev_t *vd = cseea->cseea_vd;
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ uint64_t end = offset + size;
+
+ ASSERT(type == SM_FREE);
+
+ /*
+ * Since the vdev_checkpoint_sm exists in the vdev level
+ * and the ms_sm space maps exist in the metaslab level,
+ * an entry in the checkpoint space map could theoretically
+ * cross the boundaries of the metaslab that it belongs to.
+ *
+ * In reality, because of the way that we populate and
+ * manipulate the checkpoint's space maps currently,
+ * there shouldn't be any entries that cross metaslabs.
+ * Hence the assertion below.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * By removing the entry from the allocated segments we
+ * also verify that the entry is there to begin with.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_remove(ms->ms_allocatable, offset, size);
+ mutex_exit(&ms->ms_lock);
+
+ cseea->cseea_checkpoint_size += size;
+ return (0);
+}
+
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ /*
+ * If there is no vdev_top_zap, we are in a pool whose
+ * version predates the pool checkpoint feature.
+ */
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ /*
+ * If there is no reference of the vdev_checkpoint_sm in
+ * the vdev_top_zap, then one of the following scenarios
+ * is true:
+ *
+ * 1] There is no checkpoint
+ * 2] There is a checkpoint, but no checkpointed blocks
+ * have been freed yet
+ * 3] The current vdev is indirect
+ *
+ * In these cases we return immediately.
+ */
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ return;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+ &checkpoint_sm_obj));
+
+ checkpoint_sm_exclude_entry_arg_t cseea;
+ cseea.cseea_vd = vd;
+ cseea.cseea_checkpoint_size = 0;
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ checkpoint_sm_exclude_entry_cb, &cseea));
+ space_map_close(checkpoint_sm);
+
+ zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+ zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+ }
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ ASSERT3U(i, ==, vd->vdev_id);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ continue;
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rloading concrete vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)msp->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_unload(msp);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ msp->ms_allocatable, maptype));
+ }
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+}
+
+/*
+ * vm_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+ uint64_t *vim_idxp)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_unload(msp);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+ (*vim_idxp)++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[*vim_idxp];
+ uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+ ASSERT3U(ent_offset, >=, msp->ms_start);
+ if (ent_offset >= msp->ms_start + msp->ms_size)
+ break;
+
+ /*
+ * Mappings do not cross metaslab boundaries,
+ * because we create them by walking the metaslabs.
+ */
+ ASSERT3U(ent_offset + ent_len, <=,
+ msp->ms_start + msp->ms_size);
+ range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+ }
+
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ ASSERT3U(c, ==, vd->vdev_id);
+
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ continue;
+
+ /*
+ * Note: we don't check for mapping leaks on
+ * removing vdevs because their ms_allocatable's
+ * are used to look for leaks in allocated space.
+ */
+ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+ /*
+ * Normally, indirect vdevs don't have any
+ * metaslabs. We want to set them up for
+ * zio_claim().
+ */
+ VERIFY0(vdev_metaslab_init(vd, 0));
+
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t vim_idx = 0;
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+ (void) fprintf(stderr,
+ "\rloading indirect vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vd->vdev_ms[m]->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+ &vim_idx);
+ }
+ ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+ }
+}
+
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
@@ -3142,7 +3336,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
/*
* We are going to be changing the meaning of the metaslab's
- * ms_tree. Ensure that the allocator doesn't try to
+ * ms_allocatable. Ensure that the allocator doesn't try to
* use the tree.
*/
spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
@@ -3152,39 +3346,37 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
UMEM_NOFAIL);
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
- for (uint64_t c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- uint64_t vim_idx = 0;
-
- ASSERT3U(c, ==, vd->vdev_id);
-
- /*
- * Note: we don't check for mapping leaks on
- * removing vdevs because their ms_tree's are
- * used to look for leaks in allocated space.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- zcb->zcb_vd_obsolete_counts[c] =
- zdb_load_obsolete_counts(vd);
-
- /*
- * Normally, indirect vdevs don't have any
- * metaslabs. We want to set them up for
- * zio_claim().
- */
- VERIFY0(vdev_metaslab_init(vd, 0));
- }
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- zdb_leak_init_ms(vd->vdev_ms[m], &vim_idx);
- }
- if (vd->vdev_ops == &vdev_indirect_ops) {
- ASSERT3U(vim_idx, ==,
- vdev_indirect_mapping_num_entries(
- vd->vdev_indirect_mapping));
- }
- }
+ /* for cleaner progress output */
(void) fprintf(stderr, "\n");
if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
@@ -3193,12 +3385,16 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
increment_indirect_mapping_cb, zcb, NULL);
}
+ } else {
+ /*
+ * If leak tracing is disabled, we still need to consider
+ * any checkpointed space in our space verification.
+ */
+ zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
-
zdb_ddt_leak_init(spa, zcb);
-
spa_config_exit(spa, SCL_CONFIG, FTAG);
}
@@ -3225,7 +3421,7 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
for (uint64_t inner_offset = 0;
inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
inner_offset += 1 << vd->vdev_ashift) {
- if (range_tree_contains(msp->ms_tree,
+ if (range_tree_contains(msp->ms_allocatable,
offset + inner_offset, 1 << vd->vdev_ashift)) {
obsolete_bytes += 1 << vd->vdev_ashift;
}
@@ -3291,23 +3487,23 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
ASSERT3P(mg, ==, msp->ms_group);
/*
- * The ms_tree has been overloaded to
- * contain allocated segments. Now that we
- * finished traversing all blocks, any
- * block that remains in the ms_tree
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
* represents an allocated block that we
* did not claim during the traversal.
* Claimed blocks would have been removed
- * from the ms_tree. For indirect vdevs,
- * space remaining in the tree represents
- * parts of the mapping that are not
- * referenced, which is not a bug.
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
*/
if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_tree,
+ range_tree_vacate(msp->ms_allocatable,
NULL, NULL);
} else {
- range_tree_vacate(msp->ms_tree,
+ range_tree_vacate(msp->ms_allocatable,
zdb_leak, vd);
}
@@ -3430,7 +3626,7 @@ dump_block_stats(spa_t *spa)
total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
- zcb.zcb_removing_size;
+ zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
if (total_found == total_alloc) {
if (!dump_opt['L'])
@@ -3839,6 +4035,384 @@ verify_device_removal_feature_counts(spa_t *spa)
return (ret);
}
+#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appened to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+ int error = 0;
+ char *poolname, *bogus_name;
+
+ /* If the target is not a pool, then extract the pool name */
+ char *path_start = strchr(target, '/');
+ if (path_start != NULL) {
+ size_t poolname_len = path_start - target;
+ poolname = strndup(target, poolname_len);
+ } else {
+ poolname = target;
+ }
+
+ if (cfg == NULL) {
+ error = spa_get_stats(poolname, &cfg, NULL, 0);
+ if (error != 0) {
+ fatal("Tried to read config of pool \"%s\" but "
+ "spa_get_stats() failed with error %d\n",
+ poolname, error);
+ }
+ }
+
+ (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX);
+ fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+ error = spa_import(bogus_name, cfg, NULL,
+ ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT);
+ if (error != 0) {
+ fatal("Tried to import pool \"%s\" but spa_import() failed "
+ "with error %d\n", bogus_name, error);
+ }
+
+ if (new_path != NULL && path_start != NULL)
+ (void) asprintf(new_path, "%s%s", bogus_name, path_start);
+
+ if (target != poolname)
+ free(poolname);
+
+ return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+ vdev_t *vcsec_vd;
+
+ /* the following fields are only used for printing progress */
+ uint64_t vcsec_entryid;
+ uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+ vdev_t *vd = vcsec->vcsec_vd;
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ uint64_t end = offset + size;
+
+ ASSERT(type == SM_FREE);
+
+ if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu, space map entry %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vcsec->vcsec_entryid,
+ (longlong_t)vcsec->vcsec_num_entries);
+ }
+ vcsec->vcsec_entryid++;
+
+ /*
+ * See comment in checkpoint_sm_exclude_entry_cb()
+ */
+ VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * The entries in the vdev_checkpoint_sm should be marked as
+ * allocated in the checkpointed state of the pool, therefore
+ * their respective ms_allocatable trees should not contain them.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_verify(ms->ms_allocatable, offset, size);
+ mutex_exit(&ms->ms_lock);
+
+ return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+ for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * Since we don't allow device removal in a pool
+ * that has a checkpoint, we expect that all removed
+ * vdevs were removed from the pool before the
+ * checkpoint.
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ /*
+ * If the checkpoint space map doesn't exist, then nothing
+ * here is checkpointed so there's nothing to verify.
+ */
+ if (current_vd->vdev_top_zap == 0 ||
+ zap_contains(spa_meta_objset(current),
+ current_vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(current),
+ current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+ checkpoint_sm_obj, 0, current_vd->vdev_asize,
+ current_vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+
+ verify_checkpoint_sm_entry_cb_arg_t vcsec;
+ vcsec.vcsec_vd = ckpoint_vd;
+ vcsec.vcsec_entryid = 0;
+ vcsec.vcsec_num_entries =
+ space_map_length(checkpoint_sm) / sizeof (uint64_t);
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ verify_checkpoint_sm_entry_cb, &vcsec));
+ dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+
+ /*
+ * If we've added vdevs since we took the checkpoint, ensure
+ * that their checkpoint space maps are empty.
+ */
+ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+ for (uint64_t c = ckpoint_rvd->vdev_children;
+ c < current_rvd->vdev_children; c++) {
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+ ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+ load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+ for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+ vdev_t *current_vd = current_rvd->vdev_child[i];
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * See comment in verify_checkpoint_vdev_spacemaps()
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+ metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+ metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu of %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)current_vd->vdev_id,
+ (longlong_t)current_rvd->vdev_children,
+ (longlong_t)current_vd->vdev_ms[m]->ms_id,
+ (longlong_t)current_vd->vdev_ms_count);
+
+ /*
+ * We walk through the ms_allocatable trees that
+ * are loaded with the allocated blocks from the
+ * ms_sm spacemaps of the checkpoint. For each
+ * one of these ranges we ensure that none of them
+ * exists in the ms_allocatable trees of the
+ * current state which are loaded with the ranges
+ * that are currently free.
+ *
+ * This way we ensure that none of the blocks that
+ * are part of the checkpoint were freed by mistake.
+ */
+ range_tree_walk(ckpoint_msp->ms_allocatable,
+ (range_tree_func_t *)range_tree_verify,
+ current_msp->ms_allocatable);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+ spa_t *checkpoint_spa;
+ char *checkpoint_pool;
+ nvlist_t *config = NULL;
+ int error = 0;
+
+ /*
+ * We import the checkpointed state of the pool (under a different
+ * name) so we can do verification on it against the current state
+ * of the pool.
+ */
+ checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+ NULL);
+ ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+ error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but spa_open() failed with "
+ "error %d\n", checkpoint_pool, error);
+ }
+
+ /*
+ * Ensure that ranges in the checkpoint space maps of each vdev
+ * are allocated according to the checkpointed state's metaslab
+ * space maps.
+ */
+ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Ensure that allocated ranges in the checkpoint's metaslab
+ * space maps remain allocated in the metaslab space maps of
+ * the current state.
+ */
+ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Once we are done, we get rid of the checkpointed state.
+ */
+ spa_close(checkpoint_spa, FTAG);
+ free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (vd->vdev_top_zap == 0)
+ continue;
+
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ space_map_update(checkpoint_sm);
+ dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (0);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT) {
+ /*
+ * If the feature is active but the uberblock is missing
+ * then we must be in the middle of discarding the
+ * checkpoint.
+ */
+ (void) printf("\nPartially discarded checkpoint "
+ "state found:\n");
+ dump_leftover_checkpoint_blocks(spa);
+ return (0);
+ } else if (error != 0) {
+ (void) printf("lookup error %d when looking for "
+ "checkpointed uberblock in MOS\n", error);
+ return (error);
+ }
+ dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+ if (checkpoint.ub_checkpoint_txg == 0) {
+ (void) printf("\nub_checkpoint_txg not set in checkpointed "
+ "uberblock\n");
+ error = 3;
+ }
+
+ if (error == 0)
+ verify_checkpoint_blocks(spa);
+
+ return (error);
+}
+
static void
dump_zpool(spa_t *spa)
{
@@ -3938,6 +4512,9 @@ dump_zpool(spa_t *spa)
if (dump_opt['h'])
dump_history(spa);
+ if (rc == 0 && !dump_opt['L'])
+ rc = verify_checkpoint(spa);
+
if (rc != 0) {
dump_debug_buffer();
exit(rc);
@@ -4451,6 +5028,7 @@ main(int argc, char **argv)
int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env;
boolean_t target_is_spa = B_TRUE;
+ nvlist_t *cfg = NULL;
(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
@@ -4467,7 +5045,7 @@ main(int argc, char **argv)
spa_config_path = spa_config_path_env;
while ((c = getopt(argc, argv,
- "AbcCdDeEFGhiI:lLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -4492,6 +5070,7 @@ main(int argc, char **argv)
case 'A':
case 'e':
case 'F':
+ case 'k':
case 'L':
case 'P':
case 'q':
@@ -4598,7 +5177,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
- if (dump_all && strchr("AeEFlLOPRSX", c) == NULL)
+ if (dump_all && strchr("AeEFklLOPRSX", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@@ -4651,7 +5230,6 @@ main(int argc, char **argv)
target = argv[0];
if (dump_opt['e']) {
- nvlist_t *cfg = NULL;
char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
error = ENOENT;
@@ -4660,6 +5238,7 @@ main(int argc, char **argv)
(void) printf("\nConfiguration for import:\n");
dump_nvlist(cfg, 8);
}
+
if (nvlist_add_nvlist(cfg,
ZPOOL_REWIND_POLICY, policy) != 0) {
fatal("can't open '%s': %s",
@@ -4669,6 +5248,17 @@ main(int argc, char **argv)
}
}
+ char *checkpoint_pool = NULL;
+ char *checkpoint_target = NULL;
+ if (dump_opt['k']) {
+ checkpoint_pool = import_checkpointed_state(target, cfg,
+ &checkpoint_target);
+
+ if (checkpoint_target != NULL)
+ target = checkpoint_target;
+
+ }
+
if (strpbrk(target, "/@") != NULL) {
size_t targetlen;
@@ -4685,7 +5275,18 @@ main(int argc, char **argv)
}
if (error == 0) {
- if (target_is_spa || dump_opt['R']) {
+ if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+ ASSERT(checkpoint_pool != NULL);
+ ASSERT(checkpoint_target == NULL);
+
+ error = spa_open(checkpoint_pool, &spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but "
+ "spa_open() failed with error %d\n",
+ checkpoint_pool, error);
+ }
+
+ } else if (target_is_spa || dump_opt['R']) {
error = spa_open_rewind(target, &spa, FTAG, policy,
NULL);
if (error) {
@@ -4751,6 +5352,12 @@ main(int argc, char **argv)
zdb_read_block(argv[i], spa);
}
+ if (dump_opt['k']) {
+ free(checkpoint_pool);
+ if (!target_is_spa)
+ free(checkpoint_target);
+ }
+
if (os != NULL)
close_objset(os, FTAG);
else
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
index 680179fc3b7b..a2ebe5857e4d 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
/*
@@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
+#include <sys/spa_impl.h>
#include <sys/abd.h>
#include "zdb.h"
@@ -162,7 +163,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
- bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
+ bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
@@ -352,7 +353,7 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
if (claim_txg != 0)
claim = "already claimed";
- else if (bp->blk_birth >= spa_first_txg(zilog->zl_spa))
+ else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
claim = "will claim";
else
claim = "won't claim";
@@ -407,6 +408,11 @@ dump_intent_log(zilog_t *zilog)
for (i = 0; i < TX_MAX_TYPE; i++)
zil_rec_info[i].zri_count = 0;
+ /* see comment in zil_claim() or zil_check_log_chain() */
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return;
+
if (verbose >= 2) {
(void) printf("\n");
(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7 b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
index 9766a18a8aab..18e242129314 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
@@ -17,7 +17,7 @@
.\" fields enclosed by brackets "[]" replaced with your own identifying
.\" information: Portions Copyright [yyyy] [name of copyright owner]
.\"
-.\" Copyright (c) 2012 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
.\"
@@ -431,6 +431,23 @@ This feature becomes
as soon as it is enabled and will
never return to being
.Sy enabled .
+.It Sy zpool_checkpoint
+.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:zpool_checkpoint"
+.It GUID Ta com.delphix:zpool_checkpoint
+.It READ\-ONLY COMPATIBLE Ta yes
+.It DEPENDENCIES Ta none
+.El
+.Pp
+This feature enables the "zpool checkpoint" subcommand that can
+checkpoint the state of the pool at the time it was issued and later
+rewind back to it or discard it.
+.Pp
+This feature becomes
+.Sy active
+when the "zpool checkpoint" command is used to checkpoint the pool.
+The feature will only return back to being
+.Sy enabled
+when the pool is rewound or the checkpoint has been discarded.
.It Sy large_blocks
.Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block"
.It GUID Ta org.open-zfs:large_block
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool.8 b/cddl/contrib/opensolaris/cmd/zpool/zpool.8
index d021e25b425f..3e9cd13064ce 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool.8
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool.8
@@ -45,6 +45,10 @@
.Op Fl f
.Ar pool device new_device
.Nm
+.Cm checkpoint
+.Op Fl d, -discard
+.Ar pool
+.Nm
.Cm clear
.Op Fl F Op Fl n
.Ar pool
@@ -90,6 +94,7 @@
.Op Fl o Ar mntopts
.Op Fl o Ar property Ns = Ns Ar value
.Ar ...
+.Op Fl -rewind-to-checkpoint
.Op Fl d Ar dir | Fl c Ar cachefile
.Op Fl D
.Op Fl f
@@ -103,6 +108,7 @@
.Op Fl o Ar mntopts
.Op Fl o Ar property Ns = Ns Ar value
.Ar ...
+.Op Fl -rewind-to-checkpoint
.Op Fl d Ar dir | Fl c Ar cachefile
.Op Fl D
.Op Fl f
@@ -532,6 +538,50 @@ configuration.
.Pp
The content of the cache devices is considered volatile, as is the case with
other system caches.
+.Ss Pool checkpoint
+Before starting critical procedures that include destructive actions (e.g
+.Nm zfs Cm destroy
+), an administrator can checkpoint the pool's state and in the case of a
+mistake or failure, rewind the entire pool back to the checkpoint.
+Otherwise, the checkpoint can be discarded when the procedure has completed
+successfully.
+.Pp
+A pool checkpoint can be thought of as a pool-wide snapshot and should be used
+with care as it contains every part of the pool's state, from properties to vdev
+configuration.
+Thus, while a pool has a checkpoint certain operations are not allowed.
+Specifically, vdev removal/attach/detach, mirror splitting, and
+changing the pool's guid.
+Adding a new vdev is supported but in the case of a rewind it will have to be
+added again.
+Finally, users of this feature should keep in mind that scrubs in a pool that
+has a checkpoint do not repair checkpointed data.
+.Pp
+To create a checkpoint for a pool:
+.Bd -literal
+# zpool checkpoint pool
+.Ed
+.Pp
+To later rewind to its checkpointed state, you need to first export it and
+then rewind it during import:
+.Bd -literal
+# zpool export pool
+# zpool import --rewind-to-checkpoint pool
+.Ed
+.Pp
+To discard the checkpoint from a pool:
+.Bd -literal
+# zpool checkpoint -d pool
+.Ed
+.Pp
+Dataset reservations (controlled by the
+.Nm reservation
+or
+.Nm refreservation
+zfs properties) may be unenforceable while a checkpoint exists, because the
+checkpoint is allowed to consume the dataset's reservation.
+Finally, data that is part of the checkpoint but has been freed in the
+current state of the pool won't be scanned during a scrub.
.Ss Properties
Each pool has several properties associated with it. Some properties are
read-only statistics while others are configurable and change the behavior of
@@ -847,6 +897,39 @@ manner.
.El
.It Xo
.Nm
+.Cm checkpoint
+.Op Fl d, -discard
+.Ar pool
+.Xc
+Checkpoints the current state of
+.Ar pool
+, which can be later restored by
+.Nm zpool Cm import --rewind-to-checkpoint .
+The existence of a checkpoint in a pool prohibits the following
+.Nm zpool
+commands:
+.Cm remove ,
+.Cm attach ,
+.Cm detach ,
+.Cm split ,
+and
+.Cm reguid .
+In addition, it may break reservation boundaries if the pool lacks free
+space.
+The
+.Nm zpool Cm status
+command indicates the existence of a checkpoint or the progress of discarding a
+checkpoint from a pool.
+The
+.Nm zpool Cm list
+command reports how much space the checkpoint takes from the pool.
+.Bl -tag -width Ds
+.It Fl d, -discard
+Discards an existing checkpoint from
+.Ar pool .
+.El
+.It Xo
+.Nm
.Cm clear
.Op Fl F Op Fl n
.Ar pool
@@ -1306,6 +1389,16 @@ importable again, but does not actually perform the pool recovery. For more
details about pool recovery mode, see the
.Fl F
option, above.
+.It Fl -rewind-to-checkpoint
+Rewinds pool to the checkpointed state.
+Once the pool is imported with this flag there is no way to undo the rewind.
+All changes and data that were written after the checkpoint are lost!
+The only exception is when the
+.Sy readonly
+mounting option is enabled.
+In this case, the checkpointed state of the pool is opened and an
+administrator can see how the pool would look if they were
+to fully rewind.
.El
.It Xo
.Nm
diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
index 5bc1f9c9c91c..ee6d1a8d54e5 100644
--- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
+++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
@@ -36,6 +36,7 @@
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
+#include <getopt.h>
#include <libgen.h>
#include <libintl.h>
#include <libuutil.h>
@@ -68,6 +69,8 @@ static int zpool_do_add(int, char **);
static int zpool_do_remove(int, char **);
static int zpool_do_labelclear(int, char **);
+static int zpool_do_checkpoint(int, char **);
+
static int zpool_do_list(int, char **);
static int zpool_do_iostat(int, char **);
static int zpool_do_status(int, char **);
@@ -120,6 +123,7 @@ typedef enum {
HELP_ATTACH,
HELP_CLEAR,
HELP_CREATE,
+ HELP_CHECKPOINT,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
@@ -167,6 +171,8 @@ static zpool_command_t command_table[] = {
{ NULL },
{ "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
{ NULL },
+ { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT },
+ { NULL },
{ "list", zpool_do_list, HELP_LIST },
{ "iostat", zpool_do_iostat, HELP_IOSTAT },
{ "status", zpool_do_status, HELP_STATUS },
@@ -216,6 +222,8 @@ get_usage(zpool_help_t idx)
"[-o property=value] ... \n"
"\t [-O file-system-property=value] ... \n"
"\t [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
+ case HELP_CHECKPOINT:
+ return (gettext("\tcheckpoint [--discard] <pool> ...\n"));
case HELP_DESTROY:
return (gettext("\tdestroy [-f] <pool>\n"));
case HELP_DETACH:
@@ -226,14 +234,13 @@ get_usage(zpool_help_t idx)
return (gettext("\thistory [-il] [<pool>] ...\n"));
case HELP_IMPORT:
return (gettext("\timport [-d dir] [-D]\n"
- "\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]] -a\n"
"\timport [-o mntopts] [-o property=value] ... \n"
"\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
"[-R root] [-F [-n]]\n"
- "\t <pool | id> [newpool]\n"));
+ "\t [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
case HELP_IOSTAT:
return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
"[count]]\n"));
@@ -2078,6 +2085,79 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
}
/*
+ * zpool checkpoint <pool>
+ * checkpoint --discard <pool>
+ *
+ * -d Discard the checkpoint from a checkpointed
+ * --discard pool.
+ *
+ * Checkpoints the specified pool, by taking a "snapshot" of its
+ * current state. A pool can only have one checkpoint at a time.
+ */
+int
+zpool_do_checkpoint(int argc, char **argv)
+{
+ boolean_t discard;
+ char *pool;
+ zpool_handle_t *zhp;
+ int c, err;
+
+ struct option long_options[] = {
+ {"discard", no_argument, NULL, 'd'},
+ {0, 0, 0, 0}
+ };
+
+ discard = B_FALSE;
+ while ((c = getopt_long(argc, argv, ":d", long_options, NULL)) != -1) {
+ switch (c) {
+ case 'd':
+ discard = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ pool = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
+ /* As a special case, check for use of '/' in the name */
+ if (strchr(pool, '/') != NULL)
+ (void) fprintf(stderr, gettext("'zpool checkpoint' "
+ "doesn't work on datasets. To save the state "
+ "of a dataset from a specific point in time "
+ "please use 'zfs snapshot'\n"));
+ return (1);
+ }
+
+ if (discard)
+ err = (zpool_discard_checkpoint(zhp) != 0);
+ else
+ err = (zpool_checkpoint(zhp) != 0);
+
+ zpool_close(zhp);
+
+ return (err);
+}
+
+#define CHECKPOINT_OPT 1024
+
+/*
* zpool import [-d dir] [-D]
* import [-o mntopts] [-o prop=value] ... [-R root] [-D]
* [-d dir | -c cachefile] [-f] -a
@@ -2118,6 +2198,9 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
*
* -o Set property=value and/or temporary mount options (without '=').
*
+ * --rewind-to-checkpoint
+ * Import the pool and revert back to the checkpoint.
+ *
* The import command scans for pools to import, and import pools based on pool
* name and GUID. The pool can also be renamed as part of the import process.
*/
@@ -2151,8 +2234,15 @@ zpool_do_import(int argc, char **argv)
importargs_t idata = { 0 };
char *endptr;
+
+ struct option long_options[] = {
+ {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
+ {0, 0, 0, 0}
+ };
+
/* check options */
- while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:T:VX")) != -1) {
+ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFmnNo:rR:T:VX",
+ long_options, NULL)) != -1) {
switch (c) {
case 'a':
do_all = B_TRUE;
@@ -2230,6 +2320,9 @@ zpool_do_import(int argc, char **argv)
case 'X':
xtreme_rewind = B_TRUE;
break;
+ case CHECKPOINT_OPT:
+ flags |= ZFS_IMPORT_CHECKPOINT;
+ break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
"'%c' option\n"), optopt);
@@ -3087,6 +3180,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
switch (prop) {
case ZPOOL_PROP_EXPANDSZ:
+ case ZPOOL_PROP_CHECKPOINT:
if (value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else
@@ -3159,6 +3253,8 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
toplevel);
print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
scripted, toplevel);
+ print_one_column(ZPOOL_PROP_CHECKPOINT,
+ vs->vs_checkpoint_space, scripted, toplevel);
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
B_TRUE);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
@@ -3273,8 +3369,8 @@ zpool_do_list(int argc, char **argv)
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
- "name,size,allocated,free,expandsize,fragmentation,capacity,"
- "dedupratio,health,altroot";
+ "name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+ "capacity,dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
@@ -3994,6 +4090,32 @@ typedef struct scrub_cbdata {
pool_scrub_cmd_t cb_scrub_cmd;
} scrub_cbdata_t;
+static boolean_t
+zpool_has_checkpoint(zpool_handle_t *zhp)
+{
+ nvlist_t *config, *nvroot;
+
+ config = zpool_get_config(zhp, NULL);
+
+ if (config != NULL) {
+ pool_checkpoint_stat_t *pcs = NULL;
+ uint_t c;
+
+ nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+
+ if (pcs == NULL || pcs->pcs_state == CS_NONE)
+ return (B_FALSE);
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
+ pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
int
scrub_callback(zpool_handle_t *zhp, void *data)
{
@@ -4011,6 +4133,13 @@ scrub_callback(zpool_handle_t *zhp, void *data)
err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
+ if (err == 0 && zpool_has_checkpoint(zhp) &&
+ cb->cb_type == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("warning: will not scrub state that "
+ "belongs to the checkpoint of pool '%s'\n"),
+ zpool_get_name(zhp));
+ }
+
return (err != 0);
}
@@ -4204,6 +4333,40 @@ print_scan_status(pool_scan_stat_t *ps)
}
/*
+ * As we don't scrub checkpointed blocks, we want to warn the
+ * user that we skipped scanning some blocks if a checkpoint exists
+ * or existed at any time during the scan.
+ */
+static void
+print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
+{
+ if (ps == NULL || pcs == NULL)
+ return;
+
+ if (pcs->pcs_state == CS_NONE ||
+ pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+ return;
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
+
+ if (ps->pss_state == DSS_NONE)
+ return;
+
+ if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) &&
+ ps->pss_end_time < pcs->pcs_start_time)
+ return;
+
+ if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) {
+ (void) printf(gettext(" scan warning: skipped blocks "
+ "that are only referenced by the checkpoint.\n"));
+ } else {
+ assert(ps->pss_state == DSS_SCANNING);
+ (void) printf(gettext(" scan warning: skipping blocks "
+ "that are only referenced by the checkpoint.\n"));
+ }
+}
+
+/*
* Print out detailed removal status.
*/
static void
@@ -4309,6 +4472,39 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
}
static void
+print_checkpoint_status(pool_checkpoint_stat_t *pcs)
+{
+ time_t start;
+ char space_buf[7];
+
+ if (pcs == NULL || pcs->pcs_state == CS_NONE)
+ return;
+
+ (void) printf(gettext("checkpoint: "));
+
+ start = pcs->pcs_start_time;
+ zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
+
+ if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) {
+ char *date = ctime(&start);
+
+ /*
+ * ctime() adds a newline at the end of the generated
+ * string, thus the weird format specifier and the
+ * strlen() call used to chop it off from the output.
+ */
+ (void) printf(gettext("created %.*s, consumes %s\n"),
+ strlen(date) - 1, date, space_buf);
+ return;
+ }
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+
+ (void) printf(gettext("discarding, %s remaining.\n"),
+ space_buf);
+}
+
+static void
print_error_log(zpool_handle_t *zhp)
{
nvlist_t *nverrlist = NULL;
@@ -4689,16 +4885,21 @@ status_callback(zpool_handle_t *zhp, void *data)
uint64_t nerr;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
+ pool_checkpoint_stat_t *pcs = NULL;
pool_scan_stat_t *ps = NULL;
pool_removal_stat_t *prs = NULL;
(void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+ (void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
- print_scan_status(ps);
-
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+
+ print_scan_status(ps);
+ print_checkpoint_scan_warning(ps, pcs);
print_removal_status(zhp, prs);
+ print_checkpoint_status(pcs);
namewidth = max_width(zhp, nvroot, 0, 0);
if (namewidth < 10)
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index de365a049560..e3f55ed459b8 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -346,6 +346,7 @@ ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
+ztest_func_t ztest_spa_checkpoint_create_discard;
uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
@@ -388,7 +389,8 @@ ztest_info_t ztest_info[] = {
{ ztest_vdev_aux_add_remove, 1,
&ztest_opts.zo_vdevtime },
{ ztest_device_removal, 1, &zopt_sometimes },
- { ztest_remap_blocks, 1, &zopt_sometimes }
+ { ztest_remap_blocks, 1, &zopt_sometimes },
+ { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }
};
#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -434,6 +436,7 @@ static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;
static kmutex_t ztest_vdev_lock;
+static kmutex_t ztest_checkpoint_lock;
/*
* The ztest_name_lock protects the pool and dataset namespace used by
@@ -2478,6 +2481,62 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}
+static void
+ztest_spa_checkpoint(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+ int error = spa_checkpoint(spa->spa_name);
+
+ switch (error) {
+ case 0:
+ case ZFS_ERR_DEVRM_IN_PROGRESS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ break;
+ case ENOSPC:
+ ztest_record_enospc(FTAG);
+ break;
+ default:
+ fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
+ }
+}
+
+static void
+ztest_spa_discard_checkpoint(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+ int error = spa_checkpoint_discard(spa->spa_name);
+
+ switch (error) {
+ case 0:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ case ZFS_ERR_NO_CHECKPOINT:
+ break;
+ default:
+ fatal(0, "spa_discard_checkpoint(%s) = %d",
+ spa->spa_name, error);
+ }
+
+}
+
+/* ARGSUSED */
+void
+ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+
+ mutex_enter(&ztest_checkpoint_lock);
+ if (ztest_random(2) == 0) {
+ ztest_spa_checkpoint(spa);
+ } else {
+ ztest_spa_discard_checkpoint(spa);
+ }
+ mutex_exit(&ztest_checkpoint_lock);
+}
+
+
static vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
@@ -2558,8 +2617,15 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
error = spa_vdev_remove(spa, guid, B_FALSE);
rw_exit(&ztest_name_lock);
- if (error && error != EEXIST)
+ switch (error) {
+ case 0:
+ case EEXIST:
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ break;
+ default:
fatal(0, "spa_vdev_remove() = %d", error);
+ }
} else {
spa_config_exit(spa, SCL_VDEV, FTAG);
@@ -2574,10 +2640,15 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
error = spa_vdev_add(spa, nvroot);
nvlist_free(nvroot);
- if (error == ENOSPC)
+ switch (error) {
+ case 0:
+ break;
+ case ENOSPC:
ztest_record_enospc("spa_vdev_add");
- else if (error != 0)
+ break;
+ default:
fatal(0, "spa_vdev_add() = %d", error);
+ }
}
mutex_exit(&ztest_vdev_lock);
@@ -2646,8 +2717,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
(ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
error = spa_vdev_add(spa, nvroot);
- if (error != 0)
+
+ switch (error) {
+ case 0:
+ break;
+ default:
fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
+ }
nvlist_free(nvroot);
} else {
/*
@@ -2659,8 +2735,16 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
(void) vdev_online(spa, guid, 0, NULL);
error = spa_vdev_remove(spa, guid, B_FALSE);
- if (error != 0 && error != EBUSY)
+
+ switch (error) {
+ case 0:
+ case EBUSY:
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ break;
+ default:
fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+ }
}
mutex_exit(&ztest_vdev_lock);
@@ -2759,7 +2843,6 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id)
--zs->zs_mirrors;
}
mutex_exit(&ztest_vdev_lock);
-
}
/*
@@ -2858,7 +2941,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
spa_config_exit(spa, SCL_ALL, FTAG);
error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
if (error != 0 && error != ENODEV && error != EBUSY &&
- error != ENOTSUP)
+ error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
+ error != ZFS_ERR_DISCARDING_CHECKPOINT)
fatal(0, "detach (%s) returned %d", oldpath, error);
mutex_exit(&ztest_vdev_lock);
return;
@@ -2950,6 +3034,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
if (error == EOVERFLOW || error == EBUSY)
expected_error = error;
+ if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
+ error == ZFS_ERR_DISCARDING_CHECKPOINT)
+ expected_error = error;
+
/* XXX workaround 6690467 */
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
@@ -3106,6 +3194,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
uint64_t top;
uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+ mutex_enter(&ztest_checkpoint_lock);
mutex_enter(&ztest_vdev_lock);
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
@@ -3116,8 +3205,9 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
* when the device removal completes).
*/
if (spa->spa_vdev_removal != NULL) {
- spa_config_exit(spa, SCL_STATE, FTAG);
+ spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
return;
}
@@ -3147,6 +3237,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
return;
}
ASSERT(psize > 0);
@@ -3172,6 +3263,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
}
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
return;
}
@@ -3206,6 +3298,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
}
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
return;
}
@@ -3236,6 +3329,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
spa_config_exit(spa, SCL_STATE, spa);
mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
}
/*
@@ -5040,7 +5134,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
*/
fd = open(pathrand, O_RDWR);
- if (fd == -1) /* we hit a gap in the device namespace */
+ if (fd == -1) /* we hit a gap in the device namespace */
return;
fsize = lseek(fd, 0, SEEK_END);
@@ -5750,6 +5844,7 @@ ztest_run(ztest_shared_t *zs)
/*
* Initialize parent/child shared state.
*/
+ mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
@@ -5811,7 +5906,7 @@ ztest_run(ztest_shared_t *zs)
NULL) == 0);
/*
- * Verify that we can safely inquire about about any object,
+ * Verify that we can safely inquire about any object,
* whether it's allocated or not. To make it interesting,
* we probe a 5-wide window around each power of two.
* This hits all edge cases, including zero and the max.
@@ -5914,6 +6009,7 @@ ztest_run(ztest_shared_t *zs)
rw_destroy(&ztest_name_lock);
mutex_destroy(&ztest_vdev_lock);
+ mutex_destroy(&ztest_checkpoint_lock);
}
static void
@@ -6058,6 +6154,7 @@ ztest_init(ztest_shared_t *zs)
nvlist_t *nvroot, *props;
mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
+ mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL);
rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
kernel_init(FREAD | FWRITE);
@@ -6098,6 +6195,7 @@ ztest_init(ztest_shared_t *zs)
rw_destroy(&ztest_name_lock);
mutex_destroy(&ztest_vdev_lock);
+ mutex_destroy(&ztest_checkpoint_lock);
}
static void
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
index 81b37a89fa75..e333723a2f1a 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
@@ -132,6 +132,11 @@ typedef enum zfs_error {
EZFS_POOLREADONLY, /* pool is in read-only mode */
EZFS_SCRUB_PAUSED, /* scrub currently paused */
EZFS_NO_PENDING, /* cannot cancel, no operation is pending */
+ EZFS_CHECKPOINT_EXISTS, /* checkpoint exists */
+ EZFS_DISCARDING_CHECKPOINT, /* currently discarding a checkpoint */
+ EZFS_NO_CHECKPOINT, /* pool has no checkpoint */
+ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */
+ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */
EZFS_UNKNOWN
} zfs_error_t;
@@ -420,6 +425,8 @@ extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *);
extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
nvlist_t *);
+extern int zpool_checkpoint(zpool_handle_t *);
+extern int zpool_discard_checkpoint(zpool_handle_t *);
/*
* Basic handle manipulations. These functions do not create or destroy the
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
index 7c93a0f69579..d946972854ef 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
@@ -318,6 +318,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
break;
case ZPOOL_PROP_BOOTSIZE:
case ZPOOL_PROP_EXPANDSZ:
+ case ZPOOL_PROP_CHECKPOINT:
if (intval == 0) {
(void) strlcpy(buf, "-", len);
} else if (literal) {
@@ -1282,6 +1283,48 @@ zpool_destroy(zpool_handle_t *zhp, const char *log_str)
}
/*
+ * Create a checkpoint in the given pool.
+ */
+int
+zpool_checkpoint(zpool_handle_t *zhp)
+{
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+ char msg[1024];
+ int error;
+
+ error = lzc_pool_checkpoint(zhp->zpool_name);
+ if (error != 0) {
+ (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+ "cannot checkpoint '%s'"), zhp->zpool_name);
+ (void) zpool_standard_error(hdl, error, msg);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Discard the checkpoint from the given pool.
+ */
+int
+zpool_discard_checkpoint(zpool_handle_t *zhp)
+{
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+ char msg[1024];
+ int error;
+
+ error = lzc_pool_checkpoint_discard(zhp->zpool_name);
+ if (error != 0) {
+ (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
+ "cannot discard checkpoint in '%s'"), zhp->zpool_name);
+ (void) zpool_standard_error(hdl, error, msg);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
* Add the given vdevs to the pool. The caller must have already performed the
* necessary verification to ensure that the vdev specification is well-formed.
*/
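
The zpool_checkpoint() and zpool_discard_checkpoint() wrappers added above are thin front-ends over the libzfs_core ioctls and report failures through zpool_standard_error(), so the new EZFS_* strings from libzfs_util.c are what a caller sees. A minimal consumer sketch, not part of this change; the pool name "tank" is a placeholder and error handling is abbreviated, with the message text fetched via libzfs_error_description():

#include <libzfs.h>
#include <stdio.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	if (hdl == NULL)
		return (1);

	/* "tank" is a placeholder pool name for this sketch. */
	zpool_handle_t *zhp = zpool_open(hdl, "tank");
	if (zhp == NULL) {
		libzfs_fini(hdl);
		return (1);
	}

	/* Take a pool-wide checkpoint before a risky procedure. */
	if (zpool_checkpoint(zhp) != 0)
		(void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));

	/* ... perform the destructive administrative steps here ... */

	/* Discard the checkpoint once the procedure has succeeded. */
	if (zpool_discard_checkpoint(zhp) != 0)
		(void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}
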
diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
index 6adab0bd1788..822c3af26ad0 100644
--- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
+++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_util.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
*/
@@ -243,6 +243,17 @@ libzfs_error_description(libzfs_handle_t *hdl)
case EZFS_NO_PENDING:
return (dgettext(TEXT_DOMAIN, "operation is not "
"in progress"));
+ case EZFS_CHECKPOINT_EXISTS:
+ return (dgettext(TEXT_DOMAIN, "checkpoint exists"));
+ case EZFS_DISCARDING_CHECKPOINT:
+ return (dgettext(TEXT_DOMAIN, "currently discarding "
+ "checkpoint"));
+ case EZFS_NO_CHECKPOINT:
+ return (dgettext(TEXT_DOMAIN, "checkpoint does not exist"));
+ case EZFS_DEVRM_IN_PROGRESS:
+ return (dgettext(TEXT_DOMAIN, "device removal in progress"));
+ case EZFS_VDEV_TOO_BIG:
+ return (dgettext(TEXT_DOMAIN, "device exceeds supported size"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@@ -494,7 +505,21 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ESRCH:
zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap);
break;
-
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap);
+ break;
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap);
+ break;
+ case ZFS_ERR_NO_CHECKPOINT:
+ zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap);
+ break;
+ case ZFS_ERR_DEVRM_IN_PROGRESS:
+ zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap);
+ break;
+ case ZFS_ERR_VDEV_TOO_BIG:
+ zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap);
+ break;
default:
zfs_error_aux(hdl, strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
index a7c973f0f634..3dda53395a1f 100644
--- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
+++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.c
@@ -999,6 +999,74 @@ lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
}
/*
+ * Creates a checkpoint for the specified pool.
+ *
+ * If this function returns 0 the pool was successfully checkpointed.
+ *
+ * This method may also return:
+ *
+ * ZFS_ERR_CHECKPOINT_EXISTS
+ * The pool already has a checkpoint. A pool can have at most one
+ * checkpoint at any given time.
+ *
+ * ZFS_ERR_DISCARDING_CHECKPOINT
+ * ZFS is in the middle of discarding a checkpoint for this pool.
+ * The pool can be checkpointed again once the discard is done.
+ *
+ * ZFS_ERR_DEVRM_IN_PROGRESS
+ * A vdev is currently being removed. The pool cannot be
+ * checkpointed until the device removal is done.
+ *
+ * ZFS_ERR_VDEV_TOO_BIG
+ * One or more top-level vdevs exceed the maximum vdev size
+ * supported for this feature.
+ */
+int
+lzc_pool_checkpoint(const char *pool)
+{
+ int error;
+
+ nvlist_t *result = NULL;
+ nvlist_t *args = fnvlist_alloc();
+
+ error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
+
+ fnvlist_free(args);
+ fnvlist_free(result);
+
+ return (error);
+}
+
+/*
+ * Discard the checkpoint from the specified pool.
+ *
+ * If this function returns 0 the checkpoint was successfully discarded.
+ *
+ * This method may also return:
+ *
+ * ZFS_ERR_NO_CHECKPOINT
+ * The pool does not have a checkpoint.
+ *
+ * ZFS_ERR_DISCARDING_CHECKPOINT
+ * ZFS is already in the middle of discarding the checkpoint.
+ */
+int
+lzc_pool_checkpoint_discard(const char *pool)
+{
+ int error;
+
+ nvlist_t *result = NULL;
+ nvlist_t *args = fnvlist_alloc();
+
+ error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
+
+ fnvlist_free(args);
+ fnvlist_free(result);
+
+ return (error);
+}
+
+/*
* Executes a read-only channel program.
*
* A read-only channel program works programmatically the same way as a
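
Callers that bypass libzfs can use the lzc_pool_checkpoint() and lzc_pool_checkpoint_discard() entry points added above directly; they return the raw error values documented in their comments, much as ztest does. A minimal sketch, not part of this change, assuming a placeholder pool named "tank" and the ZFS_ERR_* constants from sys/fs/zfs.h:

#include <libzfs_core.h>
#include <sys/fs/zfs.h>
#include <stdio.h>

int
main(void)
{
	if (libzfs_core_init() != 0)
		return (1);

	/* "tank" is a placeholder pool name for this sketch. */
	int error = lzc_pool_checkpoint("tank");
	if (error != 0 && error != ZFS_ERR_CHECKPOINT_EXISTS)
		(void) fprintf(stderr, "lzc_pool_checkpoint: %d\n", error);

	/* ... perform the critical procedure here ... */

	/* Drop the checkpoint once the procedure has completed. */
	error = lzc_pool_checkpoint_discard("tank");
	if (error != 0 && error != ZFS_ERR_NO_CHECKPOINT)
		(void) fprintf(stderr, "lzc_pool_checkpoint_discard: %d\n", error);

	libzfs_core_fini();
	return (0);
}
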
diff --git a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
index 5202fd135db8..0eaa89cc1e9e 100644
--- a/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
+++ b/cddl/contrib/opensolaris/lib/libzfs_core/common/libzfs_core.h
@@ -92,6 +92,9 @@ int lzc_channel_program(const char *, const char *, uint64_t,
int lzc_channel_program_nosync(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
+int lzc_pool_checkpoint(const char *);
+int lzc_pool_checkpoint_discard(const char *);
+
#ifdef __cplusplus
}
#endif