165 files changed, 59838 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am
new file mode 100644
index 000000000000..88d32b1c538c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/Makefile.am
@@ -0,0 +1,10 @@
+SUBDIRS  = zfs zpool zdb zhack zinject zstream zstreamdump ztest
+SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path
+
+if USING_PYTHON
+SUBDIRS += arcstat arc_summary dbufstat
+endif
+
+if BUILD_LINUX
+SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait
+endif
diff --git a/sys/contrib/openzfs/cmd/arc_summary/.gitignore b/sys/contrib/openzfs/cmd/arc_summary/.gitignore
new file mode 100644
index 000000000000..50ba15f034e2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/.gitignore
@@ -0,0 +1 @@
+arc_summary
diff --git a/sys/contrib/openzfs/cmd/arc_summary/Makefile.am b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am
new file mode 100644
index 000000000000..1a26c2c199f8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am
@@ -0,0 +1,13 @@
+bin_SCRIPTS = arc_summary
+
+CLEANFILES = arc_summary
+EXTRA_DIST = arc_summary2 arc_summary3
+
+if USING_PYTHON_2
+SCRIPT = arc_summary2
+else
+SCRIPT = arc_summary3
+endif
+
+arc_summary: $(SCRIPT)
+	cp $< $@
diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2
new file mode 100755
index 000000000000..5dc40d759dce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2
@@ -0,0 +1,1093 @@
+#!/usr/bin/env python2
+#
+# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# If you are having troubles when using this script from cron(8) please try
+# adjusting your PATH before reporting problems.
+#
+# Note some of this code uses older code (eg getopt instead of argparse,
+# subprocess.Popen() instead of subprocess.run()) because we need to support
+# some very old versions of Python.
+#
+
+"""Print statistics on the ZFS Adaptive Replacement Cache (ARC)
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the
+in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+"""
+
+import getopt
+import os
+import sys
+import time
+import errno
+
+from subprocess import Popen, PIPE
+from decimal import Decimal as D
+
+
+if sys.platform.startswith('freebsd'):
+    # Requires py27-sysctl on FreeBSD (it provides the sysctl module)
+    import sysctl
+
+    def load_kstats(namespace):
+        """Return (name, Decimal value) pairs for one kstat namespace"""
+        prefix = 'kstat.zfs.misc.%s.' % namespace
+        return [(node.name, D(node.value)) for node in sysctl.filter(prefix)]
+
+    def load_tunables():
+        """Return a dict of everything sysctl lists under 'vfs.zfs'"""
+        return dict((oid.name, oid.value) for oid in sysctl.filter('vfs.zfs'))
+
+elif sys.platform.startswith('linux'):
+
+    def load_kstats(namespace):
+        """Return (name, Decimal value) pairs for one kstat namespace"""
+        key_format = 'kstat.zfs.misc.%s.%%s' % namespace
+        stats = []
+        with open('/proc/spl/kstat/zfs/%s' % namespace) as kstat_file:
+            # The first two lines of the file are header, not counters
+            for row in [line.strip().split() for line in kstat_file][2:]:
+                name, _, value = row
+                stats.append((key_format % name, D(value)))
+        return stats
+
+    def load_tunables():
+        """Return a dict of the zfs module parameters found in sysfs"""
+        basepath = '/sys/module/zfs/parameters'
+        tunables = {}
+        for name in os.listdir(basepath):
+            if name:
+                with open('%s/%s' % (basepath, name)) as parameter:
+                    tunables[name] = parameter.read().strip()
+        return tunables
+
+
+show_tunable_descriptions = False
+alternate_tunable_layout = False
+
+
+def handle_Exception(ex_cls, ex, tb):
+    """Exit quietly on Ctrl-C or a closed pipe; report anything else"""
+    if isinstance(ex, KeyboardInterrupt) or (
+            isinstance(ex, IOError) and ex.errno == errno.EPIPE):
+        sys.exit()
+    # Installed as sys.excepthook: pass other errors to the default hook
+    sys.__excepthook__(ex_cls, ex, tb)
+
+
+sys.excepthook = handle_Exception
+
+
+def get_Kstat():
+    """Gather the ZFS statistics this script reports on into one dict.
+
+    Merges the 'arcstats', 'zfetchstats' and 'vdev_cache_stats'
+    namespaces as returned by load_kstats(). The name "kstat" is a
+    holdover from the Solaris utility of the same name.
+    """
+    Kstat = {}
+    for namespace in ('arcstats', 'zfetchstats', 'vdev_cache_stats'):
+        Kstat.update(load_kstats(namespace))
+    return Kstat
+
+
+def fBytes(b=0):
+    """Format a byte count using binary (power-of-two) unit prefixes.
+
+    Counts of one KiB or more are scaled to the largest fitting unit,
+    from "KiB" up to "YiB", and shown with two decimal places. Smaller
+    counts are shown as a whole number of "Bytes". The number and the
+    unit are separated by a tab.
+    """
+
+    # Binary exponents, largest first, paired with their unit names
+    scales = (
+        (80, "YiB"),  # yobibytes (yotta)
+        (70, "ZiB"),  # zebibytes (zetta)
+        (60, "EiB"),  # exbibytes (exa)
+        (50, "PiB"),  # pebibytes (peta)
+        (40, "TiB"),  # tebibytes (tera)
+        (30, "GiB"),  # gibibytes (giga)
+        (20, "MiB"),  # mebibytes (mega)
+        (10, "KiB"),  # kibibytes (kilo)
+    )
+
+    if b >= 2**10:
+        # Scale by the first (that is, the largest) unit that fits
+        for exponent, unit in scales:
+            limit = 2**exponent
+            if b >= limit:
+                value = b / limit
+                break
+        return "%0.2f\t%s" % (value, unit)
+
+    # Below one KiB: plain byte count without decimals
+    return "%d\tBytes" % b
+
+
+def fHits(hits=0):
+    """Format a count with an SI prefix symbol, for example "1.50k".
+
+    Counts of 1000 or more are scaled to the largest fitting decimal
+    prefix, from "k" up to "Y", and shown with two decimal places;
+    smaller counts are shown as a whole number. Single-letter SI
+    symbols are used because the English "short scale" and "long
+    scale" names use the same words for different values. See
+    https://en.wikipedia.org/wiki/Names_of_large_numbers and
+    https://physics.nist.gov/cuu/Units/prefixes.html
+    """
+
+    # Decimal exponents, largest first, paired with their SI symbols
+    scales = (
+        (24, 'Y'),  # yotta (septillion)
+        (21, 'Z'),  # zetta (sextillion)
+        (18, 'E'),  # exa   (quintillion)
+        (15, 'P'),  # peta  (quadrillion)
+        (12, 'T'),  # tera  (trillion)
+        (9, 'G'),   # giga  (billion)
+        (6, 'M'),   # mega  (million)
+        (3, 'k'),   # kilo  (thousand)
+    )
+
+    if hits >= 1000:
+        # Scale by the first (that is, the largest) prefix that fits
+        for exponent, symbol in scales:
+            limit = 10**exponent
+            if hits >= limit:
+                value = hits/limit
+                break
+        return "%0.2f%s" % (value, symbol)
+
+    return "%d" % hits
+
+
+def fPerc(lVal=0, rVal=0, Decimal=2):
+    """Format lVal as a percentage of rVal with `Decimal` decimal places"""
+
+    # Without a positive rVal there is nothing to divide by: report 100%
+    percent = 100 * (lVal / rVal) if rVal > 0 else 100
+    layout = "%0." + str(Decimal) + "f"
+    return layout % percent + "%"
+
+
+def get_arc_summary(Kstat):
+    """Collect general data on the ARC.
+
+    Kstat is indexed by full kstat name, for example
+    "kstat.zfs.misc.arcstats.size"; the values are used as numbers.
+
+    Returns a dict of display-ready values:
+      'health'                 'THROTTLED' or 'HEALTHY'
+      'memory_throttle_count'  fHits() string
+      'arc_misc'               fHits() strings for 'deleted',
+                               'mutex_miss' and 'evict_skips'
+      'arc_sizing'             {'per', 'num'} pairs per size or limit;
+                               'target_max_size' has 'ratio' (c_max
+                               divided by c_min) in place of 'per'
+      'arc_size_break'         {'per', 'num'} pairs for the MRU and MFU
+                               sizes, each relative to their sum
+      'arc_hash_break'         fHits() strings, plus a {'per', 'num'}
+                               pair for 'elements_current'
+    """
+
+    output = {}
+    memory_throttle_count = Kstat[
+        "kstat.zfs.misc.arcstats.memory_throttle_count"
+        ]
+
+    if memory_throttle_count > 0:
+        output['health'] = 'THROTTLED'
+    else:
+        output['health'] = 'HEALTHY'
+
+    output['memory_throttle_count'] = fHits(memory_throttle_count)
+
+    # ARC Misc. (raw counters)
+    deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
+    mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
+    evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
+
+    # ARC Misc. (formatted)
+    output["arc_misc"] = {}
+    output["arc_misc"]["deleted"] = fHits(deleted)
+    output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
+    output["arc_misc"]['evict_skips'] = fHits(evict_skip)
+
+    # ARC Sizing (raw counters)
+    arc_size = Kstat["kstat.zfs.misc.arcstats.size"]
+    mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"]
+    mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"]
+    meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"]
+    meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"]
+    dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"]
+    dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"]
+    target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"]
+    target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"]
+    target_size = Kstat["kstat.zfs.misc.arcstats.c"]
+
+    # NOTE(review): unguarded division; assumes c_min is never reported as 0
+    target_size_ratio = (target_max_size / target_min_size)
+
+    # ARC Sizing (formatted)
+    output['arc_sizing'] = {}
+    output['arc_sizing']['arc_size'] = {
+        'per': fPerc(arc_size, target_max_size),
+        'num': fBytes(arc_size),
+    }
+    output['arc_sizing']['target_max_size'] = {
+        'ratio': target_size_ratio,
+        'num': fBytes(target_max_size),
+    }
+    output['arc_sizing']['target_min_size'] = {
+        'per': fPerc(target_min_size, target_max_size),
+        'num': fBytes(target_min_size),
+    }
+    output['arc_sizing']['target_size'] = {
+        'per': fPerc(target_size, target_max_size),
+        'num': fBytes(target_size),
+    }
+    output['arc_sizing']['meta_limit'] = {
+        'per': fPerc(meta_limit, target_max_size),
+        'num': fBytes(meta_limit),
+    }
+    output['arc_sizing']['meta_size'] = {
+        'per': fPerc(meta_size, meta_limit),
+        'num': fBytes(meta_size),
+    }
+    output['arc_sizing']['dnode_limit'] = {
+        'per': fPerc(dnode_limit, meta_limit),
+        'num': fBytes(dnode_limit),
+    }
+    output['arc_sizing']['dnode_size'] = {
+        'per': fPerc(dnode_size, dnode_limit),
+        'num': fBytes(dnode_size),
+    }
+
+    output['arc_size_break'] = {}
+    output['arc_size_break']['recently_used_cache_size'] = {
+        'per': fPerc(mru_size, mru_size + mfu_size),
+        'num': fBytes(mru_size),
+    }
+    output['arc_size_break']['frequently_used_cache_size'] = {
+        'per': fPerc(mfu_size, mru_size + mfu_size),
+        'num': fBytes(mfu_size),
+    }
+
+    # ARC Hash Breakdown
+    hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"]
+    hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"]
+    hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"]
+    hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"]
+    hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"]
+
+    output['arc_hash_break'] = {}
+    output['arc_hash_break']['elements_max'] = fHits(hash_elements_max)
+    output['arc_hash_break']['elements_current'] = {
+        'per': fPerc(hash_elements, hash_elements_max),
+        'num': fHits(hash_elements),
+        }
+    output['arc_hash_break']['collisions'] = fHits(hash_collisions)
+    output['arc_hash_break']['chain_max'] = fHits(hash_chain_max)
+    output['arc_hash_break']['chains'] = fHits(hash_chains)
+
+    return output
+
+
+def _arc_summary(Kstat):
+    """Print the ARC health, misc counters, sizes and hash statistics"""
+
+    # Display-ready values for everything printed below
+    arc = get_arc_summary(Kstat)
+
+    sys.stdout.write("ARC Summary: (%s)\n" % arc['health'])
+
+    sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" %
+                     arc['memory_throttle_count'])
+    sys.stdout.write("\n")
+
+    # ARC Misc.
+    sys.stdout.write("ARC Misc:\n")
+    sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
+    sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
+                     arc['arc_misc']['mutex_miss'])
+    sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
+                     arc['arc_misc']['evict_skips'])
+    sys.stdout.write("\n")
+
+    # ARC Sizing
+    sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % (
+        arc['arc_sizing']['arc_size']['per'],
+        arc['arc_sizing']['arc_size']['num']
+        )
+    )
+    sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % (
+        arc['arc_sizing']['target_size']['per'],
+        arc['arc_sizing']['target_size']['num'],
+        )
+    )
+
+    sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % (
+        arc['arc_sizing']['target_min_size']['per'],
+        arc['arc_sizing']['target_min_size']['num'],
+        )
+    )
+    # %d prints only the integer part of the c_max / c_min ratio
+    sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % (
+        arc['arc_sizing']['target_max_size']['ratio'],
+        arc['arc_sizing']['target_max_size']['num'],
+        )
+    )
+
+    sys.stdout.write("\nARC Size Breakdown:\n")
+    sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % (
+        arc['arc_size_break']['recently_used_cache_size']['per'],
+        arc['arc_size_break']['recently_used_cache_size']['num'],
+        )
+    )
+    sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % (
+        arc['arc_size_break']['frequently_used_cache_size']['per'],
+        arc['arc_size_break']['frequently_used_cache_size']['num'],
+        )
+    )
+    sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % (
+        arc['arc_sizing']['meta_limit']['per'],
+        arc['arc_sizing']['meta_limit']['num'],
+        )
+    )
+    sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" % (
+        arc['arc_sizing']['meta_size']['per'],
+        arc['arc_sizing']['meta_size']['num'],
+        )
+    )
+    sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % (
+        arc['arc_sizing']['dnode_limit']['per'],
+        arc['arc_sizing']['dnode_limit']['num'],
+        )
+    )
+    sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % (
+        arc['arc_sizing']['dnode_size']['per'],
+        arc['arc_sizing']['dnode_size']['num'],
+        )
+    )
+
+    sys.stdout.write("\n")
+
+    # ARC Hash Breakdown
+    sys.stdout.write("ARC Hash Breakdown:\n")
+    sys.stdout.write("\tElements Max:\t\t\t\t%s\n" %
+                     arc['arc_hash_break']['elements_max'])
+    sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % (
+        arc['arc_hash_break']['elements_current']['per'],
+        arc['arc_hash_break']['elements_current']['num'],
+        )
+    )
+    sys.stdout.write("\tCollisions:\t\t\t\t%s\n" %
+                     arc['arc_hash_break']['collisions'])
+    sys.stdout.write("\tChain Max:\t\t\t\t%s\n" %
+                     arc['arc_hash_break']['chain_max'])
+    sys.stdout.write("\tChains:\t\t\t\t\t%s\n" %
+                     arc['arc_hash_break']['chains'])
+
+
+def get_arc_efficiency(Kstat):
+    """Collect ARC hit and miss statistics as display-ready strings"""
+
+    output = {}
+
+    arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"]
+    arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"]
+    demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"]
+    demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"]
+    demand_metadata_hits = Kstat[
+        "kstat.zfs.misc.arcstats.demand_metadata_hits"
+        ]
+    demand_metadata_misses = Kstat[
+        "kstat.zfs.misc.arcstats.demand_metadata_misses"
+        ]
+    mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"]
+    mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"]
+    mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"]
+    mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"]
+    prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"]
+    prefetch_data_misses = Kstat[
+        "kstat.zfs.misc.arcstats.prefetch_data_misses"
+        ]
+    prefetch_metadata_hits = Kstat[
+        "kstat.zfs.misc.arcstats.prefetch_metadata_hits"
+        ]
+    prefetch_metadata_misses = Kstat[
+        "kstat.zfs.misc.arcstats.prefetch_metadata_misses"
+        ]
+    # Hits that none of the four MRU/MFU (ghost) hit counters account for
+    anon_hits = arc_hits - (
+        mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits
+        )
+    arc_accesses_total = (arc_hits + arc_misses)
+    demand_data_total = (demand_data_hits + demand_data_misses)
+    prefetch_data_total = (prefetch_data_hits + prefetch_data_misses)
+    real_hits = (mfu_hits + mru_hits)
+
+    output["total_accesses"] = fHits(arc_accesses_total)
+    output["cache_hit_ratio"] = {
+        'per': fPerc(arc_hits, arc_accesses_total),
+        'num': fHits(arc_hits),
+    }
+    output["cache_miss_ratio"] = {
+        'per': fPerc(arc_misses, arc_accesses_total),
+        'num': fHits(arc_misses),
+    }
+    output["actual_hit_ratio"] = {
+        'per': fPerc(real_hits, arc_accesses_total),
+        'num': fHits(real_hits),
+    }
+    output["data_demand_efficiency"] = {
+        'per': fPerc(demand_data_hits, demand_data_total),
+        'num': fHits(demand_data_total),  # hits plus misses, not hits only
+    }
+    # The next two entries are optional; _arc_efficiency() checks for them
+    if prefetch_data_total > 0:
+        output["data_prefetch_efficiency"] = {
+            'per': fPerc(prefetch_data_hits, prefetch_data_total),
+            'num': fHits(prefetch_data_total),
+        }
+
+    if anon_hits > 0:
+        output["cache_hits_by_cache_list"] = {}
+        output["cache_hits_by_cache_list"]["anonymously_used"] = {
+            'per': fPerc(anon_hits, arc_hits),
+            'num': fHits(anon_hits),
+        }
+
+    output["most_recently_used"] = {
+        'per': fPerc(mru_hits, arc_hits),
+        'num': fHits(mru_hits),
+    }
+    output["most_frequently_used"] = {
+        'per': fPerc(mfu_hits, arc_hits),
+        'num': fHits(mfu_hits),
+    }
+    output["most_recently_used_ghost"] = {
+        'per': fPerc(mru_ghost_hits, arc_hits),
+        'num': fHits(mru_ghost_hits),
+    }
+    output["most_frequently_used_ghost"] = {
+        'per': fPerc(mfu_ghost_hits, arc_hits),
+        'num': fHits(mfu_ghost_hits),
+    }
+
+    output["cache_hits_by_data_type"] = {}
+    output["cache_hits_by_data_type"]["demand_data"] = {
+        'per': fPerc(demand_data_hits, arc_hits),
+        'num': fHits(demand_data_hits),
+    }
+    output["cache_hits_by_data_type"]["prefetch_data"] = {
+        'per': fPerc(prefetch_data_hits, arc_hits),
+        'num': fHits(prefetch_data_hits),
+    }
+    output["cache_hits_by_data_type"]["demand_metadata"] = {
+        'per': fPerc(demand_metadata_hits, arc_hits),
+        'num': fHits(demand_metadata_hits),
+    }
+    output["cache_hits_by_data_type"]["prefetch_metadata"] = {
+        'per': fPerc(prefetch_metadata_hits, arc_hits),
+        'num': fHits(prefetch_metadata_hits),
+    }
+
+    output["cache_misses_by_data_type"] = {}
+    output["cache_misses_by_data_type"]["demand_data"] = {
+        'per': fPerc(demand_data_misses, arc_misses),
+        'num': fHits(demand_data_misses),
+    }
+    output["cache_misses_by_data_type"]["prefetch_data"] = {
+        'per': fPerc(prefetch_data_misses, arc_misses),
+        'num': fHits(prefetch_data_misses),
+    }
+    output["cache_misses_by_data_type"]["demand_metadata"] = {
+        'per': fPerc(demand_metadata_misses, arc_misses),
+        'num': fHits(demand_metadata_misses),
+    }
+    output["cache_misses_by_data_type"]["prefetch_metadata"] = {
+        'per': fPerc(prefetch_metadata_misses, arc_misses),
+        'num': fHits(prefetch_metadata_misses),
+    }
+
+    return output
+
+
+def _arc_efficiency(Kstat):
+    """Print the hit/miss report built by get_arc_efficiency(Kstat)"""
+
+    arc = get_arc_efficiency(Kstat)
+
+    sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" %
+                     arc['total_accesses'])
+    sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % (
+        arc['cache_hit_ratio']['per'],
+        arc['cache_hit_ratio']['num'],
+        )
+    )
+    sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % (
+        arc['cache_miss_ratio']['per'],
+        arc['cache_miss_ratio']['num'],
+        )
+    )
+
+    sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % (
+        arc['actual_hit_ratio']['per'],
+        arc['actual_hit_ratio']['num'],
+        )
+    )
+
+    sys.stdout.write("\n")
+    sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % (
+        arc['data_demand_efficiency']['per'],
+        arc['data_demand_efficiency']['num'],
+        )
+    )
+
+    if 'data_prefetch_efficiency' in arc:
+        sys.stdout.write("\tData Prefetch Efficiency:\t%s\t%s\n" % (
+            arc['data_prefetch_efficiency']['per'],
+            arc['data_prefetch_efficiency']['num'],
+            )
+        )
+    sys.stdout.write("\n")
+    # 'cache_hits_by_cache_list' exists only if anonymous hits were counted
+    sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n")
+    if 'cache_hits_by_cache_list' in arc:
+        sys.stdout.write("\t  Anonymously Used:\t\t%s\t%s\n" % (
+            arc['cache_hits_by_cache_list']['anonymously_used']['per'],
+            arc['cache_hits_by_cache_list']['anonymously_used']['num'],
+            )
+        )
+    sys.stdout.write("\t  Most Recently Used:\t\t%s\t%s\n" % (
+        arc['most_recently_used']['per'],
+        arc['most_recently_used']['num'],
+        )
+    )
+    sys.stdout.write("\t  Most Frequently Used:\t\t%s\t%s\n" % (
+        arc['most_frequently_used']['per'],
+        arc['most_frequently_used']['num'],
+        )
+    )
+    sys.stdout.write("\t  Most Recently Used Ghost:\t%s\t%s\n" % (
+        arc['most_recently_used_ghost']['per'],
+        arc['most_recently_used_ghost']['num'],
+        )
+    )
+    sys.stdout.write("\t  Most Frequently Used Ghost:\t%s\t%s\n" % (
+        arc['most_frequently_used_ghost']['per'],
+        arc['most_frequently_used_ghost']['num'],
+        )
+    )
+
+    sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n")
+    sys.stdout.write("\t  Demand Data:\t\t\t%s\t%s\n" % (
+        arc["cache_hits_by_data_type"]['demand_data']['per'],
+        arc["cache_hits_by_data_type"]['demand_data']['num'],
+        )
+    )
+    sys.stdout.write("\t  Prefetch Data:\t\t%s\t%s\n" % (
+        arc["cache_hits_by_data_type"]['prefetch_data']['per'],
+        arc["cache_hits_by_data_type"]['prefetch_data']['num'],
+        )
+    )
+    sys.stdout.write("\t  Demand Metadata:\t\t%s\t%s\n" % (
+        arc["cache_hits_by_data_type"]['demand_metadata']['per'],
+        arc["cache_hits_by_data_type"]['demand_metadata']['num'],
+        )
+    )
+    sys.stdout.write("\t  Prefetch Metadata:\t\t%s\t%s\n" % (
+        arc["cache_hits_by_data_type"]['prefetch_metadata']['per'],
+        arc["cache_hits_by_data_type"]['prefetch_metadata']['num'],
+        )
+    )
+
+    sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n")
+    sys.stdout.write("\t  Demand Data:\t\t\t%s\t%s\n" % (
+        arc["cache_misses_by_data_type"]['demand_data']['per'],
+        arc["cache_misses_by_data_type"]['demand_data']['num'],
+        )
+    )
+    sys.stdout.write("\t  Prefetch Data:\t\t%s\t%s\n" % (
+        arc["cache_misses_by_data_type"]['prefetch_data']['per'],
+        arc["cache_misses_by_data_type"]['prefetch_data']['num'],
+        )
+    )
+    sys.stdout.write("\t  Demand Metadata:\t\t%s\t%s\n" % (
+        arc["cache_misses_by_data_type"]['demand_metadata']['per'],
+        arc["cache_misses_by_data_type"]['demand_metadata']['num'],
+        )
+    )
+    sys.stdout.write("\t  Prefetch Metadata:\t\t%s\t%s\n" % (
+        arc["cache_misses_by_data_type"]['prefetch_metadata']['per'],
+        arc["cache_misses_by_data_type"]['prefetch_metadata']['num'],
+        )
+    )
+
+def get_l2arc_summary(Kstat):
+    """Collect information on the L2ARC"""
+
+    output = {}
+
+    l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"]
+    l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"]
+    l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"]
+    l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"]
+    l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"]
+    l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"]
+    l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"]
+    l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"]
+    l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"]
+    l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"]
+    l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"]
+    l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"]
+    l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"]
+    l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"]
+    l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"]
+    l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"]
+
+    l2_access_total = (l2_hits + l2_misses)
+    output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error)
+
+    output['l2_access_total'] = l2_access_total
+    output['l2_size'] = l2_size
+    output['l2_asize'] = l2_asize
+    # The formatted report is built only when both values are positive
+    if l2_size > 0 and l2_access_total > 0:
+
+        if output['l2_health_count'] > 0:
+            output["health"] = "DEGRADED"
+        else:
+            output["health"] = "HEALTHY"
+
+        output["low_memory_aborts"] = fHits(l2_abort_lowmem)
+        output["free_on_write"] = fHits(l2_free_on_write)
+        output["rw_clashes"] = fHits(l2_rw_clash)
+        output["bad_checksums"] = fHits(l2_cksum_bad)
+        output["io_errors"] = fHits(l2_io_error)
+        # NOTE: the "adative" key below is read back by _l2arc_summary()
+        output["l2_arc_size"] = {}
+        output["l2_arc_size"]["adative"] = fBytes(l2_size)
+        output["l2_arc_size"]["actual"] = {
+            'per': fPerc(l2_asize, l2_size),
+            'num': fBytes(l2_asize)
+            }
+        output["l2_arc_size"]["head_size"] = {
+            'per': fPerc(l2_hdr_size, l2_size),
+            'num': fBytes(l2_hdr_size),
+        }
+
+        output["l2_arc_evicts"] = {}
+        output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry)
+        output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading)
+
+        output['l2_arc_breakdown'] = {}
+        output['l2_arc_breakdown']['value'] = fHits(l2_access_total)
+        output['l2_arc_breakdown']['hit_ratio'] = {
+            'per': fPerc(l2_hits, l2_access_total),
+            'num': fHits(l2_hits),
+        }
+        output['l2_arc_breakdown']['miss_ratio'] = {
+            'per': fPerc(l2_misses, l2_access_total),
+            'num': fHits(l2_misses),
+        }
+        output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds)
+
+        output['l2_arc_buffer'] = {}
+        # 'l2_arc_buffer' is left empty by this function
+        output['l2_arc_writes'] = {}
+        output['l2_writes_done'] = l2_writes_done
+        output['l2_writes_sent'] = l2_writes_sent
+        if l2_writes_done != l2_writes_sent:
+            output['l2_arc_writes']['writes_sent'] = {
+                'value': "FAULTED",
+                'num': fHits(l2_writes_sent),
+            }
+            output['l2_arc_writes']['done_ratio'] = {
+                'per': fPerc(l2_writes_done, l2_writes_sent),
+                'num': fHits(l2_writes_done),
+            }
+            output['l2_arc_writes']['error_ratio'] = {
+                'per': fPerc(l2_writes_error, l2_writes_sent),
+                'num': fHits(l2_writes_error),
+            }
+        else:
+            output['l2_arc_writes']['writes_sent'] = {
+                'per': fPerc(100),  # fPerc() yields 100% when rVal is 0
+                'num': fHits(l2_writes_sent),
+            }
+
+    return output
+
+
+def _l2arc_summary(Kstat):
+    """Print the L2ARC report; prints nothing unless l2_size and the
+    L2 access total from get_l2arc_summary(Kstat) are both positive"""
+    arc = get_l2arc_summary(Kstat)
+
+    if arc['l2_size'] > 0 and arc['l2_access_total'] > 0:
+        sys.stdout.write("L2 ARC Summary: ")
+        if arc['l2_health_count'] > 0:
+            sys.stdout.write("(DEGRADED)\n")
+        else:
+            sys.stdout.write("(HEALTHY)\n")
+        sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" %
+                         arc['low_memory_aborts'])
+        sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write'])
+        sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes'])
+        sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums'])
+        sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors'])
+        sys.stdout.write("\n")
+
+        sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" %
+                         arc["l2_arc_size"]["adative"])
+        sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["actual"]["per"],
+            arc["l2_arc_size"]["actual"]["num"],
+            )
+        )
+        sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["head_size"]["per"],
+            arc["l2_arc_size"]["head_size"]["num"],
+            )
+        )
+        sys.stdout.write("\n")
+        # fHits() returns strings, so "no evictions" is the string '0'
+        if arc["l2_arc_evicts"]['lock_retries'] != '0' or \
+           arc["l2_arc_evicts"]["reading"] != '0':
+            sys.stdout.write("L2 ARC Evicts:\n")
+            sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" %
+                             arc["l2_arc_evicts"]['lock_retries'])
+            sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" %
+                             arc["l2_arc_evicts"]["reading"])
+            sys.stdout.write("\n")
+
+        sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" %
+                         arc['l2_arc_breakdown']['value'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['l2_arc_breakdown']['hit_ratio']['per'],
+            arc['l2_arc_breakdown']['hit_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['l2_arc_breakdown']['miss_ratio']['per'],
+            arc['l2_arc_breakdown']['miss_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" %
+                         arc['l2_arc_breakdown']['feeds'])
+        sys.stdout.write("\n")
+
+        sys.stdout.write("L2 ARC Writes:\n")
+        if arc['l2_writes_done'] != arc['l2_writes_sent']:
+            sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % (
+                arc['l2_arc_writes']['writes_sent']['value'],
+                arc['l2_arc_writes']['writes_sent']['num'],
+                )
+            )
+            sys.stdout.write("\t  Done Ratio:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['done_ratio']['per'],
+                arc['l2_arc_writes']['done_ratio']['num'],
+                )
+            )
+            sys.stdout.write("\t  Error Ratio:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['error_ratio']['per'],
+                arc['l2_arc_writes']['error_ratio']['num'],
+                )
+            )
+        else:
+            sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['writes_sent']['per'],
+                arc['l2_arc_writes']['writes_sent']['num'],
+                )
+            )
+
+
+def get_dmu_summary(Kstat):
+    """Collect information on the DMU"""
+
+    output = {}
+
+    zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"]
+    zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"]
+
+    zfetch_access_total = (zfetch_hits + zfetch_misses)
+    output['zfetch_access_total'] = zfetch_access_total
+
+    if zfetch_access_total > 0:
+        output['dmu'] = {}
+        output['dmu']['efficiency'] = {}
+        output['dmu']['efficiency']['value'] = fHits(zfetch_access_total)
+        output['dmu']['efficiency']['hit_ratio'] = {
+            'per': fPerc(zfetch_hits, zfetch_access_total),
+            'num': fHits(zfetch_hits),
+        }
+        output['dmu']['efficiency']['miss_ratio'] = {
+            'per': fPerc(zfetch_misses, zfetch_access_total),
+            'num': fHits(zfetch_misses),
+        }
+
+    return output
+
+
+def _dmu_summary(Kstat):
+    """Print information on the DMU"""
+
+    arc = get_dmu_summary(Kstat)
+
+    if arc['zfetch_access_total'] > 0:
+        sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" %
+                         arc['dmu']['efficiency']['value'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['dmu']['efficiency']['hit_ratio']['per'],
+            arc['dmu']['efficiency']['hit_ratio']['num'],
+            )
+        )
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['dmu']['efficiency']['miss_ratio']['per'],
+            arc['dmu']['efficiency']['miss_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\n")
+
+
+def get_vdev_summary(Kstat):
+    """Collect information on the VDEVs"""
+
+    output = {}
+
+    vdev_cache_delegations = \
+        Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"]
+    vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"]
+    vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"]
+    vdev_cache_total = (vdev_cache_misses + vdev_cache_hits +
+                        vdev_cache_delegations)
+
+    output['vdev_cache_total'] = vdev_cache_total
+
+    if vdev_cache_total > 0:
+        output['summary'] = fHits(vdev_cache_total)
+        output['hit_ratio'] = {
+            'per': fPerc(vdev_cache_hits, vdev_cache_total),
+            'num': fHits(vdev_cache_hits),
+        }
+        output['miss_ratio'] = {
+            'per': fPerc(vdev_cache_misses, vdev_cache_total),
+            'num': fHits(vdev_cache_misses),
+        }
+        output['delegations'] = {
+            'per': fPerc(vdev_cache_delegations, vdev_cache_total),
+            'num': fHits(vdev_cache_delegations),
+        }
+
+    return output
+
+
+def _vdev_summary(Kstat):
+    """Print information on the VDEVs"""
+
+    arc = get_vdev_summary(Kstat)
+
+    if arc['vdev_cache_total'] > 0:
+        sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['hit_ratio']['per'],
+            arc['hit_ratio']['num'],
+        ))
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['miss_ratio']['per'],
+            arc['miss_ratio']['num'],
+        ))
+        sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % (
+            arc['delegations']['per'],
+            arc['delegations']['num'],
+        ))
+
+
+def _tunable_summary(Kstat):
+    """Print information on tunables, including descriptions if requested"""
+
+    global show_tunable_descriptions
+    global alternate_tunable_layout
+
+    tunables = load_tunables()
+    descriptions = {}
+
+    if show_tunable_descriptions:
+
+        command = ["/sbin/modinfo", "zfs", "-0"]
+
+        try:
+            p = Popen(command, stdin=PIPE, stdout=PIPE,
+                      stderr=PIPE, shell=False, close_fds=True)
+            p.wait()
+
+            # By default, Python 2 returns a string as the first element of the
+            # tuple from p.communicate(), while Python 3 returns bytes which
+            # must be decoded first. The better way to do this would be with
+            # subprocess.run() or at least .check_output(), but this fails on
+            # CentOS 6 because of its old version of Python 2
+            desc = bytes.decode(p.communicate()[0])
+            description_list = desc.strip().split('\0')
+
+            if p.returncode == 0:
+                for tunable in description_list:
+                    if tunable[0:5] == 'parm:':
+                        tunable = tunable[5:].strip()
+                        name, description = tunable.split(':', 1)
+                        if not description:
+                            description = "Description unavailable"
+                        descriptions[name] = description
+            else:
+                sys.stderr.write("%s: '%s' exited with code %i\n" %
+                                 (sys.argv[0], command[0], p.returncode))
+                sys.stderr.write("Tunable descriptions will be disabled.\n")
+        except OSError as e:
+            sys.stderr.write("%s: Cannot run '%s': %s\n" %
+                             (sys.argv[0], command[0], e.strerror))
+            sys.stderr.write("Tunable descriptions will be disabled.\n")
+
+    sys.stdout.write("ZFS Tunables:\n")
+
+    if alternate_tunable_layout:
+        fmt = "\t%s=%s\n"
+    else:
+        fmt = "\t%-50s%s\n"
+
+    for name in sorted(tunables.keys()):
+        if show_tunable_descriptions and name in descriptions:
+            sys.stdout.write("\t# %s\n" % descriptions[name])
+
+        sys.stdout.write(fmt % (name, tunables[name]))
+
+
+unSub = [  # report pages in output order; "-p N" selects unSub[N - 1]
+    _arc_summary,
+    _arc_efficiency,
+    _l2arc_summary,
+    _dmu_summary,
+    _vdev_summary,
+    _tunable_summary
+]
+
+
+def zfs_header():
+    """Print title string with date"""
+
+    daydate = time.strftime('%a %b %d %H:%M:%S %Y')
+
+    sys.stdout.write('\n'+'-'*72+'\n')
+    sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate)
+    sys.stdout.write('\n')
+
+
+def usage():
+    """Print usage information"""
+
+    sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n")
+    sys.stdout.write("\t -h, --help           : "
+                     "Print this help message and exit\n")
+    sys.stdout.write("\t -a, --alternate      : "
+                     "Show an alternate sysctl layout\n")
+    sys.stdout.write("\t -d, --description    : "
+                     "Show the sysctl descriptions\n")
+    sys.stdout.write("\t -p PAGE, --page=PAGE : "
+                     "Select a single output page to display,\n")
+    sys.stdout.write("\t                        "
+                     "should be an integer between 1 and " +
+                     str(len(unSub)) + "\n\n")
+    sys.stdout.write("Examples:\n")
+    sys.stdout.write("\tarc_summary -a\n")
+    sys.stdout.write("\tarc_summary -p 4\n")
+    sys.stdout.write("\tarc_summary -ad\n")
+    sys.stdout.write("\tarc_summary --page=2\n")
+
+
+def main():
+    """Main function"""
+
+    global show_tunable_descriptions
+    global alternate_tunable_layout
+
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            "adp:h", ["alternate", "description", "page=", "help"]
+        )
+    except getopt.error as e:
+        sys.stderr.write("Error: %s\n" % e.msg)
+        usage()
+        sys.exit(1)
+
+    args = {}
+    for opt, arg in opts:
+        if opt in ('-a', '--alternate'):
+            args['a'] = True
+        if opt in ('-d', '--description'):
+            args['d'] = True
+        if opt in ('-p', '--page'):
+            args['p'] = arg
+        if opt in ('-h', '--help'):
+            usage()
+            sys.exit(0)
+
+    Kstat = get_Kstat()
+
+    alternate_tunable_layout = 'a' in args
+    show_tunable_descriptions = 'd' in args
+
+    pages = []
+
+    if 'p' in args:
+        try:
+            pages.append(unSub[int(args['p']) - 1])
+        except IndexError:
+            sys.stderr.write('the argument to -p must be between 1 and ' +
+                             str(len(unSub)) + '\n')
+            sys.exit(1)
+    else:
+        pages = unSub
+
+    zfs_header()
+    for page in pages:
+        page(Kstat)
+        sys.stdout.write("\n")
+
+
+if __name__ == '__main__':  # run the report only when executed as a script
+    main()
diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3
new file mode 100755
index 000000000000..c920b8e5395d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3
@@ -0,0 +1,943 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# Copyright (c) 2017 Scot W. Stevenson <scot.stevenson@gmail.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+"""Print statistics on the ZFS ARC Cache and other information
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See
+the in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+The original introduction to arc_summary can be found at
+http://cuddletech.com/?p=454
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux'
+INDENT = ' '*8  # prefix of indented output rows
+LINE_LENGTH = 72  # width that output rows are padded to
+DATE_FORMAT = '%a %b %d %H:%M:%S %Y'  # time.strftime() format of the header
+TITLE = 'ZFS Subsystem Report'
+
+SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split()
+SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')'
+
+# Maps a section name to the kstat that get_kstats() loads for it. Tunables
+# and SPL are handled separately because they come from different sources
+SECTION_PATHS = {'arc': 'arcstats',
+                 'dmu': 'dmu_tx',
+                 'l2arc': 'arcstats',  # L2ARC stuff lives in arcstats
+                 'vdev': 'vdev_cache_stats',
+                 'xuio': 'xuio_stats',
+                 'zfetch': 'zfetchstats',
+                 'zil': 'zil'}
+
+parser = argparse.ArgumentParser(description=DESCRIPTION)
+parser.add_argument('-a', '--alternate', action='store_true', default=False,
+                    help='use alternate formatting for tunables and SPL',
+                    dest='alt')
+parser.add_argument('-d', '--description', action='store_true', default=False,
+                    help='print descriptions with tunables and SPL',
+                    dest='desc')
+parser.add_argument('-g', '--graph', action='store_true', default=False,
+                    help='print graph on ARC use and exit', dest='graph')
+parser.add_argument('-p', '--page', type=int, dest='page',
+                    help='print page by number (DEPRECATED, use "-s")')
+parser.add_argument('-r', '--raw', action='store_true', default=False,
+                    help='dump all available data with minimal formatting',
+                    dest='raw')
+parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP)
+ARGS = parser.parse_args()  # parsed at module level; read as a global
+
+
+if sys.platform.startswith('freebsd'):
+    # Requires py36-sysctl on FreeBSD
+    import sysctl
+
+    VDEV_CACHE_SIZE = 'vdev.cache_size'
+
+    def load_kstats(section):
+        base = 'kstat.zfs.misc.{section}.'.format(section=section)
+        # base is removed from the name
+        fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):],
+                                                      value=kstat.value)
+        return [fmt(kstat) for kstat in sysctl.filter(base)]
+
+    def get_params(base):
+        cut = 8 # = len('vfs.zfs.')
+        return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)}
+
+    def get_tunable_params():
+        return get_params('vfs.zfs')  # filters on the vfs.zfs root
+
+    def get_vdev_params():
+        return get_params('vfs.zfs.vdev')  # filters on the vdev subtree
+
+    def get_version_impl(request):
+        # FreeBSD reports versions for zpl and spa instead of zfs and spl.
+        name = {'zfs': 'zpl',
+                'spl': 'spa'}[request]
+        mib = 'vfs.zfs.version.{}'.format(name)
+        version = sysctl.filter(mib)[0].value
+        return '{} version {}'.format(name, version)
+
+    def get_descriptions(_request):
+        # py-sysctl doesn't give descriptions, so we have to shell out.
+        command = ['sysctl', '-d', 'vfs.zfs']
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        if 'run' in dir(subprocess):
+            info = subprocess.run(command, stdout=subprocess.PIPE,
+                                  universal_newlines=True)
+            lines = info.stdout.split('\n')
+        else:
+            info = subprocess.check_output(command, universal_newlines=True)
+            lines = info.split('\n')
+
+        def fmt(line):
+            name, desc = line.split(':', 1)
+            return (name.strip(), desc.strip())
+
+        return dict([fmt(line) for line in lines if len(line) > 0])
+
+
+elif sys.platform.startswith('linux'):
+    KSTAT_PATH = '/proc/spl/kstat/zfs'
+    SPL_PATH = '/sys/module/spl/parameters'
+    TUNABLES_PATH = '/sys/module/zfs/parameters'
+
+    VDEV_CACHE_SIZE = 'zfs_vdev_cache_size'
+
+    def load_kstats(section):
+        path = os.path.join(KSTAT_PATH, section)
+        with open(path) as f:
+            return list(f)[2:] # Get rid of header
+
+    def get_params(basepath):
+        """Collect information on the Solaris Porting Layer (SPL) or the
+        tunables, depending on the PATH given. Does not check if PATH is
+        legal.
+        """
+        result = {}
+        for name in os.listdir(basepath):
+            path = os.path.join(basepath, name)
+            with open(path) as f:
+                value = f.read()
+                result[name] = value.strip()
+        return result
+
+    def get_spl_params():
+        return get_params(SPL_PATH)  # one entry per file in SPL_PATH
+
+    def get_tunable_params():
+        return get_params(TUNABLES_PATH)
+
+    def get_vdev_params():
+        return get_params(TUNABLES_PATH)  # same directory as the tunables
+
+    def get_version_impl(request):
+        # The original arc_summary called /sbin/modinfo/{spl,zfs} to get
+        # the version information. We switch to /sys/module/{spl,zfs}/version
+        # to make sure we get what is really loaded in the kernel
+        command = ["cat", "/sys/module/{0}/version".format(request)]
+        req = request.upper()
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        if 'run' in dir(subprocess):
+            info = subprocess.run(command, stdout=subprocess.PIPE,
+                                  universal_newlines=True)
+            version = info.stdout.strip()
+        else:
+            info = subprocess.check_output(command, universal_newlines=True)
+            version = info.strip()
+
+        return version
+
+    def get_descriptions(request):
+        """Get the descriptions of the Solaris Porting Layer (SPL) or the
+        tunables, return with minimal formatting.
+        """
+
+        if request not in ('spl', 'zfs'):
+            print('ERROR: description of "{0}" requested)'.format(request))
+            sys.exit(1)
+
+        descs = {}
+        target_prefix = 'parm:'
+
+        # We would prefer to do this with /sys/modules -- see the discussion at
+        # get_version() -- but there isn't a way to get the descriptions from
+        # there, so we fall back on modinfo
+        command = ["/sbin/modinfo", request, "-0"]
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        info = ''
+
+        try:
+
+            if 'run' in dir(subprocess):
+                info = subprocess.run(command, stdout=subprocess.PIPE,
+                                      universal_newlines=True)
+                raw_output = info.stdout.split('\0')
+            else:
+                info = subprocess.check_output(command,
+                                               universal_newlines=True)
+                raw_output = info.split('\0')
+
+        except subprocess.CalledProcessError:
+            print("Error: Descriptions not available",
+                  "(can't access kernel module)")
+            sys.exit(1)
+
+        for line in raw_output:
+
+            if not line.startswith(target_prefix):
+                continue
+
+            line = line[len(target_prefix):].strip()
+            name, raw_desc = line.split(':', 1)
+            desc = raw_desc.rsplit('(', 1)[0]
+
+            if desc == '':
+                desc = '(No description found)'
+
+            descs[name.strip()] = desc.strip()
+
+        return descs
+
+
+def cleanup_line(single_line):
+    """Format a raw line of data from /proc and isolate the name value
+    part, returning a tuple with each. Currently, this gets rid of the
+    middle '4'. For example "arc_no_grow    4    0" returns the tuple
+    ("arc_no_grow", "0").
+    """
+    name, _, value = single_line.split()
+
+    return name, value
+
+
+def draw_graph(kstats_dict):
+    """Draw a primitive graph representing the basic information on the
+    ARC -- its size and the proportion used by MFU and MRU -- and quit.
+    We use max size of the ARC to calculate how full it is. This is a
+    very rough representation.
+    """
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+
+    GRAPH_INDENT = ' '*4
+    GRAPH_WIDTH = 60
+    arc_size = f_bytes(arc_stats['size'])
+    arc_perc = f_perc(arc_stats['size'], arc_stats['c_max'])
+    mfu_size = f_bytes(arc_stats['mfu_size'])
+    mru_size = f_bytes(arc_stats['mru_size'])
+    meta_limit = f_bytes(arc_stats['arc_meta_limit'])
+    meta_size = f_bytes(arc_stats['arc_meta_used'])
+    dnode_limit = f_bytes(arc_stats['arc_dnode_limit'])
+    dnode_size = f_bytes(arc_stats['dnode_size'])
+
+    info_form = ('ARC: {0} ({1})  MFU: {2}  MRU: {3}  META: {4} ({5}) '
+                 'DNODE {6} ({7})')
+    info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size,
+                                 meta_size, meta_limit, dnode_size,
+                                 dnode_limit)
+    info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2)
+    info_line = GRAPH_INDENT+info_spc+info_line
+
+    graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+'
+
+    mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max']))
+    mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max']))
+    arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max']))
+    total_ticks = float(arc_perc)*GRAPH_WIDTH
+    mfu_ticks = mfu_perc*GRAPH_WIDTH
+    mru_ticks = mru_perc*GRAPH_WIDTH
+    other_ticks = total_ticks-(mfu_ticks+mru_ticks)
+
+    core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks)
+    core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form)))
+    core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|'
+
+    for line in ('', info_line, graph_line, core_line, graph_line, ''):
+        print(line)
+
+
+def f_bytes(byte_string):
+    """Return human-readable representation of a byte value in
+    powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal
+    points. Values smaller than one KiB are returned without
+    decimal points. Note "bytes" is a reserved keyword.
+    """
+
+    prefixes = ([2**80, "YiB"],   # yobibytes (yotta)
+                [2**70, "ZiB"],   # zebibytes (zetta)
+                [2**60, "EiB"],   # exbibytes (exa)
+                [2**50, "PiB"],   # pebibytes (peta)
+                [2**40, "TiB"],   # tebibytes (tera)
+                [2**30, "GiB"],   # gibibytes (giga)
+                [2**20, "MiB"],   # mebibytes (mega)
+                [2**10, "KiB"])   # kibibytes (kilo)
+
+    bites = int(byte_string)
+
+    if bites >= 2**10:
+        for limit, unit in prefixes:
+
+            if bites >= limit:
+                value = bites / limit
+                break
+
+        result = '{0:.1f} {1}'.format(value, unit)
+    else:
+        result = '{0} Bytes'.format(bites)
+
+    return result
+
+
+def f_hits(hits_string):
+    """Create a human-readable representation of the number of hits.
+    The single-letter symbols used are SI to avoid the confusion caused
+    by the different "short scale" and "long scale" representations in
+    English, which use the same words for different values. See
+    https://en.wikipedia.org/wiki/Names_of_large_numbers and:
+    https://physics.nist.gov/cuu/Units/prefixes.html
+    """
+
+    numbers = ([10**24, 'Y'],  # yotta (septillion)
+               [10**21, 'Z'],  # zetta (sextillion)
+               [10**18, 'E'],  # exa   (quintrillion)
+               [10**15, 'P'],  # peta  (quadrillion)
+               [10**12, 'T'],  # tera  (trillion)
+               [10**9, 'G'],   # giga  (billion)
+               [10**6, 'M'],   # mega  (million)
+               [10**3, 'k'])   # kilo  (thousand)
+
+    hits = int(hits_string)
+
+    if hits >= 1000:
+        for limit, symbol in numbers:
+
+            if hits >= limit:
+                value = hits/limit
+                break
+
+        result = "%0.1f%s" % (value, symbol)
+    else:
+        result = "%d" % hits
+
+    return result
+
+
+def f_perc(value1, value2):
+    """Calculate percentage and return in human-readable form. If
+    rounding produces the result '0.0' though the first number is
+    not zero, include a 'less-than' symbol to avoid confusion.
+    Division by zero is handled by returning 'n/a'; no error
+    is called.
+    """
+
+    v1 = float(value1)
+    v2 = float(value2)
+
+    try:
+        perc = 100 * v1/v2
+    except ZeroDivisionError:
+        result = 'n/a'
+    else:
+        result = '{0:0.1f} %'.format(perc)
+
+    if result == '0.0 %' and v1 > 0:
+        result = '< 0.1 %'
+
+    return result
+
+
+def format_raw_line(name, value):
+    """For the --raw option for the tunable and SPL outputs, decide on the
+    correct formatting based on the --alternate flag.
+    """
+
+    if ARGS.alt:
+        result = '{0}{1}={2}'.format(INDENT, name, value)
+    else:
+        spc = LINE_LENGTH-(len(INDENT)+len(value))
+        result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc)
+
+    return result
+
+
+def get_kstats():
+    """Collect information on the ZFS subsystem. The step does not perform any
+    further processing, giving us the option to only work on what is actually
+    needed. The name "kstat" is a holdover from the Solaris utility of the same
+    name.
+    """
+
+    result = {}
+
+    for section in SECTION_PATHS.values():
+        if section not in result:
+            result[section] = load_kstats(section)
+
+    return result
+
+
+def get_version(request):
+    """Get the version number of ZFS or SPL on this machine for header.
+    Returns an error string, but does not raise an error, if we can't
+    get the ZFS/SPL version.
+    """
+
+    if request not in ('spl', 'zfs'):
+        error_msg = '(ERROR: "{0}" requested)'.format(request)
+        return error_msg
+
+    return get_version_impl(request)
+
+
+def print_header():
+    """Print the initial heading with date and time as well as info on the
+    kernel and ZFS versions. This is not called for the graph.
+    """
+
+    # datetime is now recommended over time but we keep the exact formatting
+    # from the older version of arc_summary in case there are scripts
+    # that expect it in this way
+    daydate = time.strftime(DATE_FORMAT)
+    spc_date = LINE_LENGTH-len(daydate)
+    sys_version = os.uname()
+
+    sys_msg = sys_version.sysname+' '+sys_version.release
+    zfs = get_version('zfs')
+    spc_zfs = LINE_LENGTH-len(zfs)
+
+    machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')'
+    spl = get_version('spl')
+    spc_spl = LINE_LENGTH-len(spl)
+
+    print('\n'+('-'*LINE_LENGTH))
+    print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date))
+    print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs))
+    print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl))
+
+
+def print_raw(kstats_dict):
+    """Print all available data from the system in a minimally sorted format.
+    This can be used as a source to be piped through 'grep'.
+    """
+
+    sections = sorted(kstats_dict.keys())
+
+    for section in sections:
+
+        print('\n{0}:'.format(section.upper()))
+        lines = sorted(kstats_dict[section])
+
+        for line in lines:
+            name, value = cleanup_line(line)
+            print(format_raw_line(name, value))
+
+    # Tunables and SPL must be handled separately because they come from a
+    # different source and have descriptions the user might request
+    print()
+    section_spl()
+    section_tunables()
+
+
+def isolate_section(section_name, kstats_dict):
+    """From the complete information on all sections, retrieve only those
+    for one section.
+    """
+
+    try:
+        section_data = kstats_dict[section_name]
+    except KeyError:
+        print('ERROR: Data on {0} not available'.format(section_data))
+        sys.exit(1)
+
+    section_dict = dict(cleanup_line(l) for l in section_data)
+
+    return section_dict
+
+
+# Formatted output helper functions
+
+
+def prt_1(text, value):
+    """Print text and one value, no indent"""
+    spc = ' '*(LINE_LENGTH-(len(text)+len(value)))
+    print('{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_i1(text, value):
+    """Print text and one value, with indent"""
+    spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value)))
+    print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_2(text, value1, value2):
+    """Print text and two values, no indent"""
+    values = '{0:>9}  {1:>9}'.format(value1, value2)
+    spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2))
+    print('{0}{spc}  {1}'.format(text, values, spc=spc))
+
+
+def prt_i2(text, value1, value2):
+    """Print text and two values, with indent"""
+    values = '{0:>9}  {1:>9}'.format(value1, value2)
+    spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2))
+    print(INDENT+'{0}{spc}  {1}'.format(text, values, spc=spc))
+
+
+# The section output concentrates on important parameters instead of
+# being exhaustive (that is what the --raw parameter is for)
+
+
+def section_arc(kstats_dict):
+    """Print the ARC status, its sizes and limits (including MRU and MFU),
+    the hash table counters and some misc counters, all from arcstats.
+    """
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+
+    throttle = arc_stats['memory_throttle_count']
+
+    if throttle == '0':  # values in arc_stats are strings
+        health = 'HEALTHY'
+    else:
+        health = 'THROTTLED'
+
+    prt_1('ARC status:', health)
+    prt_i1('Memory throttle count:', throttle)
+    print()
+
+    arc_size = arc_stats['size']
+    arc_target_size = arc_stats['c']
+    arc_max = arc_stats['c_max']
+    arc_min = arc_stats['c_min']
+    mfu_size = arc_stats['mfu_size']
+    mru_size = arc_stats['mru_size']
+    meta_limit = arc_stats['arc_meta_limit']
+    meta_size = arc_stats['arc_meta_used']
+    dnode_limit = arc_stats['arc_dnode_limit']
+    dnode_size = arc_stats['dnode_size']
+    target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min))
+
+    prt_2('ARC size (current):',
+          f_perc(arc_size, arc_max), f_bytes(arc_size))
+    prt_i2('Target size (adaptive):',
+           f_perc(arc_target_size, arc_max), f_bytes(arc_target_size))
+    prt_i2('Min size (hard limit):',
+           f_perc(arc_min, arc_max), f_bytes(arc_min))
+    prt_i2('Max size (high water):',
+           target_size_ratio, f_bytes(arc_max))
+    caches_size = int(mfu_size)+int(mru_size)  # base of the MFU/MRU shares
+    prt_i2('Most Frequently Used (MFU) cache size:',
+           f_perc(mfu_size, caches_size), f_bytes(mfu_size))
+    prt_i2('Most Recently Used (MRU) cache size:',
+           f_perc(mru_size, caches_size), f_bytes(mru_size))
+    prt_i2('Metadata cache size (hard limit):',
+           f_perc(meta_limit, arc_max), f_bytes(meta_limit))
+    prt_i2('Metadata cache size (current):',
+           f_perc(meta_size, meta_limit), f_bytes(meta_size))
+    prt_i2('Dnode cache size (hard limit):',
+           f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit))
+    prt_i2('Dnode cache size (current):',
+           f_perc(dnode_size, dnode_limit), f_bytes(dnode_size))
+    print()
+
+    print('ARC hash breakdown:')
+    prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max']))
+    prt_i2('Elements current:',
+           f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']),
+           f_hits(arc_stats['hash_elements']))
+    prt_i1('Collisions:', f_hits(arc_stats['hash_collisions']))
+
+    prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max']))
+    prt_i1('Chains:', f_hits(arc_stats['hash_chains']))
+    print()
+
+    print('ARC misc:')
+    prt_i1('Deleted:', f_hits(arc_stats['deleted']))
+    prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss']))
+    prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip']))
+    print()
+
+
+def section_archits(kstats_dict):
+    """Print how the caches are accessed ("arc hits"): overall ratios, then
+    hits by cache type and hits and misses by data type, from arcstats."""
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+    all_accesses = int(arc_stats['hits'])+int(arc_stats['misses'])
+    actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits'])
+
+    prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses))
+    ta_todo = (('Cache hit ratio:', arc_stats['hits']),
+               ('Cache miss ratio:', arc_stats['misses']),
+               ('Actual hit ratio (MFU + MRU hits):', actual_hits))
+
+    for title, value in ta_todo:  # each as a share of all accesses
+        prt_i2(title, f_perc(value, all_accesses), f_hits(value))
+
+    dd_total = int(arc_stats['demand_data_hits']) +\
+        int(arc_stats['demand_data_misses'])
+    prt_i2('Data demand efficiency:',
+           f_perc(arc_stats['demand_data_hits'], dd_total),
+           f_hits(dd_total))
+
+    dp_total = int(arc_stats['prefetch_data_hits']) +\
+        int(arc_stats['prefetch_data_misses'])
+    prt_i2('Data prefetch efficiency:',
+           f_perc(arc_stats['prefetch_data_hits'], dp_total),
+           f_hits(dp_total))
+
+    known_hits = int(arc_stats['mfu_hits']) +\
+        int(arc_stats['mru_hits']) +\
+        int(arc_stats['mfu_ghost_hits']) +\
+        int(arc_stats['mru_ghost_hits'])
+
+    anon_hits = int(arc_stats['hits'])-known_hits  # hits in none of the four
+
+    print()
+    print('Cache hits by cache type:')
+    cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']),
+               ('Most recently used (MRU):', arc_stats['mru_hits']),
+               ('Most frequently used (MFU) ghost:',
+                arc_stats['mfu_ghost_hits']),
+               ('Most recently used (MRU) ghost:',
+                arc_stats['mru_ghost_hits']))
+
+    for title, value in cl_todo:  # each as a share of all hits
+        prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+    # For some reason, anon_hits can turn negative, which is weird. Until we
+    # have figured out why this happens, we just hide the problem, following
+    # the behavior of the original arc_summary.
+    if anon_hits >= 0:
+        prt_i2('Anonymously used:',
+               f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits))
+
+    print()
+    print('Cache hits by data type:')
+    dt_todo = (('Demand data:', arc_stats['demand_data_hits']),
+               ('Demand prefetch data:', arc_stats['prefetch_data_hits']),
+               ('Demand metadata:', arc_stats['demand_metadata_hits']),
+               ('Demand prefetch metadata:',
+                arc_stats['prefetch_metadata_hits']))
+
+    for title, value in dt_todo:  # each as a share of all hits
+        prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+    print()
+    print('Cache misses by data type:')
+    dm_todo = (('Demand data:', arc_stats['demand_data_misses']),
+               ('Demand prefetch data:',
+                arc_stats['prefetch_data_misses']),
+               ('Demand metadata:', arc_stats['demand_metadata_misses']),
+               ('Demand prefetch metadata:',
+                arc_stats['prefetch_metadata_misses']))
+
+    for title, value in dm_todo:  # each as a share of all misses
+        prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value))
+
+    print()
+
+
+def section_dmu(kstats_dict):
+    """Print DMU prefetch (zfetch) hit and miss ratios from 'zfetchstats'"""
+
+    zfetch_stats = isolate_section('zfetchstats', kstats_dict)
+
+    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+
+    prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total))
+    prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total),
+           f_hits(zfetch_stats['hits']))
+    prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total),
+           f_hits(zfetch_stats['misses']))
+    print()
+
+
+def section_l2arc(kstats_dict):
+    """Print L2ARC health, size, hit/miss, write and evict statistics. If
+    'l2_size' is '0', print a note that the section is skipped and return.
+    """
+
+    # The L2ARC statistics live in the same section as the normal ARC stuff
+    arc_stats = isolate_section('arcstats', kstats_dict)
+
+    if arc_stats['l2_size'] == '0':
+        print('L2ARC not detected, skipping section\n')
+        return
+
+    l2_errors = int(arc_stats['l2_writes_error']) +\
+        int(arc_stats['l2_cksum_bad']) +\
+        int(arc_stats['l2_io_error'])
+
+    l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses'])
+    health = 'HEALTHY'
+
+    if l2_errors > 0:  # any write, checksum or I/O error was counted
+        health = 'DEGRADED'
+
+    prt_1('L2ARC status:', health)
+
+    l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'),
+               ('Free on write:', 'l2_free_on_write'),
+               ('R/W clashes:', 'l2_rw_clash'),
+               ('Bad checksums:', 'l2_cksum_bad'),
+               ('I/O errors:', 'l2_io_error'))
+
+    for title, value in l2_todo:
+        prt_i1(title, f_hits(arc_stats[value]))
+
+    print()
+    prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size']))
+    prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']),
+           f_bytes(arc_stats['l2_asize']))
+    prt_i2('Header size:',
+           f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
+           f_bytes(arc_stats['l2_hdr_size']))
+
+    print()
+    prt_1('L2ARC breakdown:', f_hits(l2_access_total))
+    prt_i2('Hit ratio:',
+           f_perc(arc_stats['l2_hits'], l2_access_total),
+           f_hits(arc_stats['l2_hits']))
+    prt_i2('Miss ratio:',
+           f_perc(arc_stats['l2_misses'], l2_access_total),
+           f_hits(arc_stats['l2_misses']))
+    prt_i1('Feeds:', f_hits(arc_stats['l2_feeds']))
+
+    print()
+    print('L2ARC writes:')
+
+    if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']:
+        prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent']))
+        prt_i2('Done ratio:',
+               f_perc(arc_stats['l2_writes_done'],
+                      arc_stats['l2_writes_sent']),
+               f_hits(arc_stats['l2_writes_done']))
+        prt_i2('Error ratio:',
+               f_perc(arc_stats['l2_writes_error'],
+                      arc_stats['l2_writes_sent']),
+               f_hits(arc_stats['l2_writes_error']))
+    else:
+        prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent']))
+
+    print()
+    print('L2ARC evicts:')
+    prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry']))
+    prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading']))
+    print()
+
+
+def section_spl(*_):
+    """Print the SPL parameters sorted by name, with their descriptions if
+    ARGS.desc is set. Takes no kstats (arguments ignored); no-op on FreeBSD.
+    """
+
+    if sys.platform.startswith('freebsd'):
+        # No SPL support in FreeBSD
+        return
+
+    spls = get_spl_params()
+    keylist = sorted(spls.keys())
+    print('Solaris Porting Layer (SPL):')
+
+    if ARGS.desc:
+        descriptions = get_descriptions('spl')
+
+    for key in keylist:
+        value = spls[key]
+
+        if ARGS.desc:
+            try:
+                print(INDENT+'#', descriptions[key])
+            except KeyError:
+                print(INDENT+'# (No description found)')  # paranoid
+
+        print(format_raw_line(key, value))
+
+    print()
+
+
+def section_tunables(*_):
+    """Print the tunables sorted by name, with their descriptions if
+    ARGS.desc is set. This does not use kstats (arguments are ignored).
+    """
+
+    tunables = get_tunable_params()
+    keylist = sorted(tunables.keys())
+    print('Tunables:')
+
+    if ARGS.desc:
+        descriptions = get_descriptions('zfs')
+
+    for key in keylist:
+        value = tunables[key]
+
+        if ARGS.desc:
+            try:
+                print(INDENT+'#', descriptions[key])
+            except KeyError:
+                print(INDENT+'# (No description found)')  # paranoid
+
+        print(format_raw_line(key, value))
+
+    print()
+
+
+def section_vdev(kstats_dict):
+    """Print VDEV cache hit/miss/delegation ratios unless it is disabled"""
+
+    # Currently [Nov 2017] the VDEV cache is disabled, because it is actually
+    # harmful. When this is the case, we just skip the whole entry. See
+    # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c
+    # for details
+    tunables = get_vdev_params()
+
+    if tunables[VDEV_CACHE_SIZE] == '0':
+        print('VDEV cache disabled, skipping section\n')
+        return
+
+    vdev_stats = isolate_section('vdev_cache_stats', kstats_dict)
+
+    vdev_cache_total = int(vdev_stats['hits']) +\
+        int(vdev_stats['misses']) +\
+        int(vdev_stats['delegations'])
+
+    prt_1('VDEV cache summary:', f_hits(vdev_cache_total))
+    prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total),
+           f_hits(vdev_stats['hits']))
+    prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total),
+           f_hits(vdev_stats['misses']))
+    prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total),
+           f_hits(vdev_stats['delegations']))
+    print()
+
+
+def section_zil(kstats_dict):
+    """Print ZFS Intent Log counters from the 'zil' section. Some information
+    taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h
+    """
+
+    zil_stats = isolate_section('zil', kstats_dict)
+
+    prt_1('ZIL committed transactions:',
+          f_hits(zil_stats['zil_itx_count']))
+    prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count']))
+    prt_i1('Flushes to stable storage:',
+           f_hits(zil_stats['zil_commit_writer_count']))
+    prt_i2('Transactions to SLOG storage pool:',
+           f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']),
+           f_hits(zil_stats['zil_itx_metaslab_slog_count']))
+    prt_i2('Transactions to non-SLOG storage pool:',
+           f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']),
+           f_hits(zil_stats['zil_itx_metaslab_normal_count']))
+    print()
+
+
+section_calls = {'arc': section_arc,  # section name -> print function
+                 'archits': section_archits,
+                 'dmu': section_dmu,
+                 'l2arc': section_l2arc,
+                 'spl': section_spl,
+                 'tunables': section_tunables,
+                 'vdev': section_vdev,
+                 'zil': section_zil}
+
+
+def main():
+    """Run program. The options to draw a graph and to print all data raw are
+    treated separately because they come with their own call. Exits with
+    status 1 if the requested section or page is unknown, otherwise with 0.
+    """
+    kstats = get_kstats()
+
+    if ARGS.graph:
+        draw_graph(kstats)
+        sys.exit(0)
+
+    print_header()
+
+    if ARGS.raw:
+        print_raw(kstats)
+
+    elif ARGS.section:
+        # Lookup only, so a KeyError inside the section is not masked
+        call = section_calls.get(ARGS.section)
+        if call is None:
+            print('Error: Section "{0}" unknown'.format(ARGS.section))
+            sys.exit(1)
+        call(kstats)
+
+    elif ARGS.page:
+        print('WARNING: Pages are deprecated, please use "--section"\n')
+
+        pages_to_calls = {1: 'arc',
+                          2: 'archits',
+                          3: 'l2arc',
+                          4: 'dmu',
+                          5: 'vdev',
+                          6: 'tunables'}
+
+        try:
+            call = pages_to_calls[ARGS.page]
+        except KeyError:
+            print('Error: Page "{0}" not supported'.format(ARGS.page))
+            sys.exit(1)
+        else:
+            section_calls[call](kstats)
+
+    else:
+        # If no parameters were given, we print all sections. We might want to
+        # change the sequence by hand
+        calls = sorted(section_calls.keys())
+
+        for section in calls:
+            section_calls[section](kstats)
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sys/contrib/openzfs/cmd/arcstat/.gitignore b/sys/contrib/openzfs/cmd/arcstat/.gitignore
new file mode 100644
index 000000000000..6d6cd1ab75fc
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/.gitignore
@@ -0,0 +1 @@
+arcstat
diff --git a/sys/contrib/openzfs/cmd/arcstat/Makefile.am b/sys/contrib/openzfs/cmd/arcstat/Makefile.am
new file mode 100644
index 000000000000..d1ba989a0cd8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = arcstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/sys/contrib/openzfs/cmd/arcstat/arcstat.in b/sys/contrib/openzfs/cmd/arcstat/arcstat.in
new file mode 100755
index 000000000000..c83a1c74599e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/arcstat.in
@@ -0,0 +1,494 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out ZFS ARC Statistics exported via kstat(1)
+# For a definition of fields, or usage, use arcstat -v
+#
+# This script was originally a fork of the original arcstat.pl (0.1)
+# by Neelakanth Nadgir, originally published on his Sun blog on
+# 09/18/2007
+#     http://blogs.sun.com/realneel/entry/zfs_arc_statistics
+#
+# A new version aimed to improve upon the original by adding features
+# and fixing bugs as needed.  This version was maintained by Mike
+# Harsch and was hosted in a public open source repository:
+#    http://github.com/mharsch/arcstat
+#
+# but has since moved to the illumos-gate repository.
+#
+# This Python port was written by John Hixson for FreeNAS, introduced
+# in commit e2c29f:
+#    https://github.com/freenas/freenas
+#
+# and has been improved by many people since.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Fields have a fixed width. Every interval, we fill the "v"
+# hash with its corresponding value (v[field]=value) using calculate().
+# @hdr is the array of fields that needs to be printed, so we
+# just iterate over this array and print the values using our pretty printer.
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import time
+import getopt
+import re
+import copy
+
+from signal import signal, SIGINT, SIGWINCH, SIG_DFL
+
+
+cols = {
+    # HDR:        [Size (column width), Scale (-1: print as is), Description]
+    "time":       [8, -1, "Time"],
+    "hits":       [4, 1000, "ARC reads per second"],
+    "miss":       [4, 1000, "ARC misses per second"],
+    "read":       [4, 1000, "Total ARC accesses per second"],
+    "hit%":       [4, 100, "ARC hit percentage"],
+    "miss%":      [5, 100, "ARC miss percentage"],
+    "dhit":       [4, 1000, "Demand hits per second"],
+    "dmis":       [4, 1000, "Demand misses per second"],
+    "dh%":        [3, 100, "Demand hit percentage"],
+    "dm%":        [3, 100, "Demand miss percentage"],
+    "phit":       [4, 1000, "Prefetch hits per second"],
+    "pmis":       [4, 1000, "Prefetch misses per second"],
+    "ph%":        [3, 100, "Prefetch hits percentage"],
+    "pm%":        [3, 100, "Prefetch miss percentage"],
+    "mhit":       [4, 1000, "Metadata hits per second"],
+    "mmis":       [4, 1000, "Metadata misses per second"],
+    "mread":      [5, 1000, "Metadata accesses per second"],
+    "mh%":        [3, 100, "Metadata hit percentage"],
+    "mm%":        [3, 100, "Metadata miss percentage"],
+    "arcsz":      [5, 1024, "ARC size"],
+    "size":       [4, 1024, "ARC size"],
+    "c":          [4, 1024, "ARC target size"],
+    "mfu":        [4, 1000, "MFU list hits per second"],
+    "mru":        [4, 1000, "MRU list hits per second"],
+    "mfug":       [4, 1000, "MFU ghost list hits per second"],
+    "mrug":       [4, 1000, "MRU ghost list hits per second"],
+    "eskip":      [5, 1000, "evict_skip per second"],
+    "mtxmis":     [6, 1000, "mutex_miss per second"],
+    "dread":      [5, 1000, "Demand accesses per second"],
+    "pread":      [5, 1000, "Prefetch accesses per second"],
+    "l2hits":     [6, 1000, "L2ARC hits per second"],
+    "l2miss":     [6, 1000, "L2ARC misses per second"],
+    "l2read":     [6, 1000, "Total L2ARC accesses per second"],
+    "l2hit%":     [6, 100, "L2ARC access hit percentage"],
+    "l2miss%":    [7, 100, "L2ARC access miss percentage"],
+    "l2asize":    [7, 1024, "Actual (compressed) size of the L2ARC"],
+    "l2size":     [6, 1024, "Size of the L2ARC"],
+    "l2bytes":    [7, 1024, "Bytes read per second from the L2ARC"],
+    "grow":       [4, 1000, "ARC grow disabled"],
+    "need":       [4, 1024, "ARC reclaim need"],
+    "free":       [4, 1024, "ARC free memory"],
+    "avail":      [5, 1024, "ARC available memory"],
+    "waste":      [5, 1024, "Wasted memory due to round up to pagesize"],
+}
+
+v = {}  # Values for the current output line, rebuilt by calculate()
+hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis",
+       "mm%", "size", "c", "avail"]
+xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread",
+        "pread", "read"]
+sint = 1               # Default interval is 1 second
+count = 1              # Default count is 1
+hdr_intr = 20          # Print header every 20 lines of output
+opfile = None  # Output file name given with -o, if any
+sep = "  "              # Default separator is 2 spaces
+version = "0.4"
+l2exist = False  # Set by init() when the l2_size kstat is non-zero
+cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval "
+       "[count]]\n")
+cur = {}  # Most recent kstat snapshot, see snap_stats()
+d = {}  # Change of each kstat since the previous snapshot
+out = None  # File object opened for opfile by init()
+kstat = None  # Name -> int value dict filled by kstat_update()
+
+
+if sys.platform.startswith('freebsd'):
+    # Requires py27-sysctl on FreeBSD
+    import sysctl
+
+    def kstat_update():  # FreeBSD: load arcstats from sysctl into kstat
+        global kstat
+
+        k = sysctl.filter('kstat.zfs.misc.arcstats')
+
+        if not k:
+            sys.exit(1)  # nothing was returned for the arcstats subtree
+
+        kstat = {}
+
+        for s in k:
+            if not s:
+                continue
+
+            name, value = s.name, s.value
+            # Trims the 24-char prefix 'kstat.zfs.misc.arcstats.' from the name
+            kstat[name[24:]] = int(value)
+
+elif sys.platform.startswith('linux'):
+    def kstat_update():
+        """Load the arcstats counters from procfs into the global kstat."""
+        global kstat
+
+        # Close the file right away; this runs once per sampling interval
+        with open('/proc/spl/kstat/zfs/arcstats') as f:
+            k = [line.strip() for line in f]
+        if not k:
+            sys.exit(1)
+
+        del k[0:2]  # the first two lines are not name/type/value rows
+        kstat = {}
+        for s in k:
+            if not s:
+                continue
+            name, unused, value = s.split()
+            kstat[name] = int(value)
+
+
+def detailed_usage():  # list every field on stderr, then exit 0
+    sys.stderr.write("%s\n" % cmd)
+    sys.stderr.write("Field definitions are as follows:\n")
+    for key in cols:
+        sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+    sys.stderr.write("\n")
+
+    sys.exit(0)
+
+
+def usage():  # print the option summary on stderr, then exit 1
+    sys.stderr.write("%s\n" % cmd)
+    sys.stderr.write("\t -h : Print this help message\n")
+    sys.stderr.write("\t -v : List all possible field headers and definitions"
+                     "\n")
+    sys.stderr.write("\t -x : Print extended stats\n")
+    sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+    sys.stderr.write("\t -o : Redirect output to the specified file\n")
+    sys.stderr.write("\t -s : Override default field separator with custom "
+                     "character or string\n")
+    sys.stderr.write("\nExamples:\n")
+    sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n")
+    sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n")
+    sys.stderr.write("\tarcstat -v\n")
+    sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n")
+    sys.stderr.write("\n")
+
+    sys.exit(1)
+
+
+def snap_stats():
+    """Snapshot kstats into cur; d gets the change since the last call."""
+    global cur, kstat
+
+    prev = copy.deepcopy(cur)
+    kstat_update()
+    cur = kstat
+    for key in cur:
+        # re takes (pattern, string); the reversed call also skipped "c"
+        if re.search("class", key):
+            continue
+        if key in prev:
+            d[key] = cur[key] - prev[key]
+        else:
+            d[key] = cur[key]
+
+
+def prettynum(sz, scale, num=0):  # num as text padded to sz columns
+    suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+    index = 0
+    save = 0
+
+    # Special case for date field
+    if scale == -1:
+        return "%s" % num
+
+    # Rounding error, return 0
+    elif 0 < num < 1:
+        num = 0
+
+    while abs(num) > scale and index < 5:  # divide down, at most to 'P'
+        save = num
+        num = num / scale
+        index += 1
+
+    if index == 0:  # never divided: print without a suffix
+        return "%*d" % (sz, num)
+
+    if abs(save / scale) < 10:  # below 10: keep one decimal place
+        return "%*.1f%s" % (sz - 1, num, suffix[index])
+    else:
+        return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values():  # write one line with the v[] value of each hdr column
+    global hdr
+    global sep
+    global v
+
+    sys.stdout.write(sep.join(
+      prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr))
+
+    sys.stdout.write("\n")
+    sys.stdout.flush()
+
+
+def print_header():  # write the hdr column names, padded to column width
+    global hdr
+    global sep
+
+    sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr))
+
+    sys.stdout.write("\n")
+
+
+def get_terminal_lines():  # first TIOCGWINSZ field of stdout, or None
+    try:
+        import fcntl
+        import termios
+        import struct
+        data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234')
+        sz = struct.unpack('hh', data)
+        return sz[0]
+    except Exception:
+        pass  # best effort: fall through and return None
+
+
+def update_hdr_intr():  # header interval = terminal lines - 3, if known
+    global hdr_intr
+
+    lines = get_terminal_lines()
+    if lines and lines > 3:
+        hdr_intr = lines - 3
+
+
+def resize_handler(signum, frame):  # signal handler; both args unused
+    update_hdr_intr()
+
+
+def init():
+    """Parse the command line and set up the module-level state.
+
+    Sets sint, count, hdr, sep and opfile, takes the first kstat snapshot to
+    detect an L2ARC, validates -f columns and points sys.stdout at -o file.
+    """
+    global sint
+    global count
+    global hdr
+    global xhdr
+    global opfile
+    global sep
+    global out
+    global l2exist
+
+    desired_cols = None
+    xflag = False
+    hflag = False
+    vflag = False
+
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            "xo:hvs:f:",
+            [
+                "extended",
+                "outfile=",
+                "help",
+                "verbose",
+                "separator=",
+                "columns="
+            ]
+        )
+    except getopt.error as msg:
+        sys.stderr.write("Error: %s\n" % str(msg))
+        usage()
+
+    for opt, arg in opts:
+        if opt in ('-x', '--extended'):
+            xflag = True
+        if opt in ('-o', '--outfile'):
+            opfile = arg
+        if opt in ('-h', '--help'):
+            hflag = True
+        if opt in ('-v', '--verbose'):
+            vflag = True
+        if opt in ('-s', '--separator'):
+            sep = arg
+        if opt in ('-f', '--columns'):
+            desired_cols = arg
+
+    # Use getopt's leftovers: counting argv by hand broke "-oFILE" forms
+    argv = args
+    sint = int(argv[0]) if argv else sint
+    count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1)
+
+    if hflag or (xflag and desired_cols):
+        usage()
+
+    if vflag:
+        detailed_usage()
+
+    if xflag:
+        hdr = xhdr
+
+    update_hdr_intr()
+
+    # check if L2ARC exists
+    snap_stats()
+    l2_size = cur.get("l2_size")
+    if l2_size:
+        l2exist = True
+
+    if desired_cols:
+        hdr = desired_cols.split(",")
+
+        invalid = []
+        incompat = []
+        for ele in hdr:
+            if ele not in cols:
+                invalid.append(ele)
+            elif not l2exist and ele.startswith("l2"):
+                sys.stdout.write("No L2ARC Here\n%s\n" % ele)
+                incompat.append(ele)
+
+        if len(invalid) > 0:
+            sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+            usage()
+
+        if len(incompat) > 0:
+            sys.stderr.write("Incompatible field specified! -- %s\n" %
+                             incompat)
+            usage()
+
+    if opfile:
+        try:
+            out = open(opfile, "w")
+            sys.stdout = out
+
+        except IOError:
+            sys.stderr.write("Cannot open %s for writing\n" % opfile)
+            sys.exit(1)
+
+
+def calculate():  # rebuild v from the deltas in d and the snapshot cur
+    global d
+    global v
+    global l2exist
+
+    v = dict()
+    v["time"] = time.strftime("%H:%M:%S", time.localtime())
+    v["hits"] = d["hits"] / sint
+    v["miss"] = d["misses"] / sint
+    v["read"] = v["hits"] + v["miss"]
+    v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0
+    v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0
+
+    v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint
+    v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint
+
+    v["dread"] = v["dhit"] + v["dmis"]
+    v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0
+    v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0
+
+    v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint
+    v["pmis"] = (d["prefetch_data_misses"] +
+                 d["prefetch_metadata_misses"]) / sint
+
+    v["pread"] = v["phit"] + v["pmis"]
+    v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0
+    v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0
+
+    v["mhit"] = (d["prefetch_metadata_hits"] +
+                 d["demand_metadata_hits"]) / sint
+    v["mmis"] = (d["prefetch_metadata_misses"] +
+                 d["demand_metadata_misses"]) / sint
+
+    v["mread"] = v["mhit"] + v["mmis"]
+    v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0
+    v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0
+
+    v["arcsz"] = cur["size"]
+    v["size"] = cur["size"]
+    v["c"] = cur["c"]
+    v["mfu"] = d["mfu_hits"] / sint
+    v["mru"] = d["mru_hits"] / sint
+    v["mrug"] = d["mru_ghost_hits"] / sint
+    v["mfug"] = d["mfu_ghost_hits"] / sint
+    v["eskip"] = d["evict_skip"] / sint
+    v["mtxmis"] = d["mutex_miss"] / sint
+
+    if l2exist:
+        v["l2hits"] = d["l2_hits"] / sint
+        v["l2miss"] = d["l2_misses"] / sint
+        v["l2read"] = v["l2hits"] + v["l2miss"]
+        v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0
+
+        v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0
+        v["l2asize"] = cur["l2_asize"]
+        v["l2size"] = cur["l2_size"]
+        v["l2bytes"] = d["l2_read_bytes"] / sint
+
+    v["grow"] = 0 if cur["arc_no_grow"] else 1  # 1 when arc_no_grow is 0
+    v["need"] = cur["arc_need_free"]
+    v["free"] = cur["memory_free_bytes"]
+    v["avail"] = cur["memory_available_bytes"]
+    v["waste"] = cur["abd_chunk_waste_size"]
+
+
+def main():  # sample and print every sint seconds until count is used up
+    global sint
+    global count
+    global hdr_intr
+
+    i = 0
+    count_flag = 0
+
+    init()
+    if count > 0:  # with count == 0 the loop below never breaks
+        count_flag = 1
+
+    signal(SIGINT, SIG_DFL)
+    signal(SIGWINCH, resize_handler)
+    while True:
+        if i == 0:
+            print_header()
+
+        snap_stats()
+        calculate()
+        print_values()
+
+        if count_flag == 1:
+            if count <= 1:
+                break
+            count -= 1
+
+        i = 0 if i >= hdr_intr else i + 1  # i == 0 reprints the header
+        time.sleep(sint)
+
+    if out:
+        out.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sys/contrib/openzfs/cmd/dbufstat/.gitignore b/sys/contrib/openzfs/cmd/dbufstat/.gitignore
new file mode 100644
index 000000000000..2c2e913cef70
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/.gitignore
@@ -0,0 +1 @@
+dbufstat
diff --git a/sys/contrib/openzfs/cmd/dbufstat/Makefile.am b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am
new file mode 100644
index 000000000000..e672a01a4227
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = dbufstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in
new file mode 100755
index 000000000000..98eb79057388
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in
@@ -0,0 +1,669 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out statistics for all cached dmu buffers.  This information
+# is available through the dbufs kstat and may be post-processed as
+# needed by the script.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (C) 2013 Lawrence Livermore National Security, LLC.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import getopt
+import errno
+import re
+
+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
+         "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
+         "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
+         "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
+         "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
+bincompat = ["cached", "direct", "indirect", "bonus", "spill"]
+
+dhdr = ["pool", "objset", "object", "dtype", "cached"]
+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
+         "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
+         "indirect", "bonus", "spill"]
+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
+             "dbc", "list", "atype", "flags", "count", "asize", "access",
+             "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+             "l2_comp", "aholds"]
+
+thdr = ["pool", "objset", "dtype", "cached"]
+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
+         "bonus", "spill"]
+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
+             "dbc", "dbholds", "list", "atype", "flags", "count", "asize",
+             "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
+             "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
+             "bsize", "lvls", "dholds", "blocks", "dsize"]
+
+cols = {
+    # hdr:        [size, scale, description]
+    "pool":       [15,   -1, "pool name"],
+    "objset":     [6,    -1, "dataset identification number"],
+    "object":     [10,   -1, "object number"],
+    "level":      [5,    -1, "indirection level of buffer"],
+    "blkid":      [8,    -1, "block number of buffer"],
+    "offset":     [12, 1024, "offset in object of buffer"],
+    "dbsize":     [7,  1024, "size of buffer"],
+    "meta":       [4,    -1, "is this buffer metadata?"],
+    "state":      [5,    -1, "state of buffer (read, cached, etc)"],
+    "dbholds":    [7,  1000, "number of holds on buffer"],
+    "dbc":        [3,    -1, "in dbuf cache"],
+    "list":       [4,    -1, "which ARC list contains this buffer"],
+    "atype":      [7,    -1, "ARC header type (data or metadata)"],
+    "flags":      [9,    -1, "ARC read flags"],
+    "count":      [5,    -1, "ARC data count"],
+    "asize":      [7,  1024, "size of this ARC buffer"],
+    "access":     [10,   -1, "time this ARC buffer was last accessed"],
+    "mru":        [5,  1000, "hits while on the ARC's MRU list"],
+    "gmru":       [5,  1000, "hits while on the ARC's MRU ghost list"],
+    "mfu":        [5,  1000, "hits while on the ARC's MFU list"],
+    "gmfu":       [5,  1000, "hits while on the ARC's MFU ghost list"],
+    "l2":         [5,  1000, "hits while on the L2ARC"],
+    "l2_dattr":   [8,    -1, "L2ARC disk address/offset"],
+    "l2_asize":   [8,  1024, "L2ARC alloc'd size (depending on compression)"],
+    "l2_comp":    [21,   -1, "L2ARC compression algorithm for buffer"],
+    "aholds":     [6,  1000, "number of holds on this ARC buffer"],
+    "dtype":      [27,   -1, "dnode type"],
+    "btype":      [27,   -1, "bonus buffer type"],
+    "data_bs":    [7,  1024, "data block size"],
+    "meta_bs":    [7,  1024, "metadata block size"],
+    "bsize":      [6,  1024, "bonus buffer size"],
+    "lvls":       [6,    -1, "number of indirection levels"],
+    "dholds":     [6,  1000, "number of holds on dnode"],
+    "blocks":     [8,  1000, "number of allocated blocks"],
+    "dsize":      [12, 1024, "size of dnode"],
+    "cached":     [6,  1024, "bytes cached for all blocks"],
+    "direct":     [6,  1024, "bytes cached for direct blocks"],
+    "indirect":   [8,  1024, "bytes cached for indirect blocks"],
+    "bonus":      [5,  1024, "bytes cached for bonus buffer"],
+    "spill":      [5,  1024, "bytes cached for spill block"],
+}
+
+hdr = None
+xhdr = None
+sep = "  "  # Default separator is 2 spaces
+cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] "
+       "[-s string] [-F filter]\n")
+raw = 0
+
+
+def print_incompat_helper(incompat):
+    """Write the sorted field names to stderr, tab-indented, 9 per row."""
+    cnt = 0
+    for key in sorted(incompat):
+        if cnt == 0:
+            sys.stderr.write("\t")
+        elif cnt > 8:
+            sys.stderr.write(",\n\t")
+            cnt = 0
+        else:
+            sys.stderr.write(", ")
+        sys.stderr.write("%s" % key)
+        cnt += 1
+
+    sys.stderr.write("\n\n")
+
+
+def detailed_usage():
+    sys.stderr.write("%s\n" % cmd)
+
+    sys.stderr.write("Field definitions incompatible with '-b' option:\n")
+    print_incompat_helper(bincompat)
+
+    sys.stderr.write("Field definitions incompatible with '-d' option:\n")
+    print_incompat_helper(dincompat)
+
+    sys.stderr.write("Field definitions incompatible with '-t' option:\n")
+    print_incompat_helper(tincompat)
+
+    sys.stderr.write("Field definitions are as follows:\n")
+    for key in sorted(cols.keys()):
+        sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+    sys.stderr.write("\n")
+
+    sys.exit(0)
+
+
+def usage():
+    sys.stderr.write("%s\n" % cmd)
+    sys.stderr.write("\t -b : Print table of information for each dbuf\n")
+    sys.stderr.write("\t -d : Print table of information for each dnode\n")
+    sys.stderr.write("\t -h : Print this help message\n")
+    sys.stderr.write("\t -n : Exclude header from output\n")
+    sys.stderr.write("\t -r : Print raw values\n")
+    sys.stderr.write("\t -t : Print table of information for each dnode type"
+                     "\n")
+    sys.stderr.write("\t -v : List all possible field headers and definitions"
+                     "\n")
+    sys.stderr.write("\t -x : Print extended stats\n")
+    sys.stderr.write("\t -i : Redirect input from the specified file\n")
+    sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+    sys.stderr.write("\t -o : Redirect output to the specified file\n")
+    sys.stderr.write("\t -s : Override default field separator with custom "
+                     "character or string\n")
+    sys.stderr.write("\t -F : Filter output by value or regex\n")
+    sys.stderr.write("\nExamples:\n")
+    sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n")
+    sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n")
+    sys.stderr.write("\tdbufstat -v\n")
+    sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n")
+    sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n")
+    sys.stderr.write("\n")
+
+    sys.exit(1)
+
+
+def prettynum(sz, scale, num=0):
+    """Right-justify num in sz columns, scaled to a K/M/G/T/P suffix."""
+    global raw
+
+    suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+
+    # Raw mode and unscaled (-1) columns are printed verbatim.
+    if raw or scale == -1:
+        return "%*s" % (sz, num)
+
+    # Rounding error, return 0
+    if 0 < num < 1:
+        num = 0
+
+    steps = 0
+    prev = 0
+    while num > scale and steps < 5:
+        prev, num = num, num / scale
+        steps += 1
+
+    if steps == 0:
+        return "%*d" % (sz, num)
+
+    # One decimal place while the scaled value is below 10.
+    fmt = "%*.1f%s" if (prev / scale) < 10 else "%*d%s"
+    return fmt % (sz - 1, num, suffix[steps])
+
+
+def print_values(v):
+    global hdr
+    global sep
+
+    try:
+        for col in hdr:
+            sys.stdout.write("%s%s" % (
+                prettynum(cols[col][0], cols[col][1], v[col]), sep))
+        sys.stdout.write("\n")
+    except IOError as e:
+        if e.errno == errno.EPIPE:
+            sys.exit(1)
+
+
+def print_header():
+    global hdr
+    global sep
+
+    try:
+        for col in hdr:
+            sys.stdout.write("%*s%s" % (cols[col][0], col, sep))
+        sys.stdout.write("\n")
+    except IOError as e:
+        if e.errno == errno.EPIPE:
+            sys.exit(1)
+
+
+def get_typestring(t):
+    ot_strings = [
+                    "DMU_OT_NONE",
+                    # general:
+                    "DMU_OT_OBJECT_DIRECTORY",
+                    "DMU_OT_OBJECT_ARRAY",
+                    "DMU_OT_PACKED_NVLIST",
+                    "DMU_OT_PACKED_NVLIST_SIZE",
+                    "DMU_OT_BPOBJ",
+                    "DMU_OT_BPOBJ_HDR",
+                    # spa:
+                    "DMU_OT_SPACE_MAP_HEADER",
+                    "DMU_OT_SPACE_MAP",
+                    # zil:
+                    "DMU_OT_INTENT_LOG",
+                    # dmu:
+                    "DMU_OT_DNODE",
+                    "DMU_OT_OBJSET",
+                    # dsl:
+                    "DMU_OT_DSL_DIR",
+                    "DMU_OT_DSL_DIR_CHILD_MAP",
+                    "DMU_OT_DSL_DS_SNAP_MAP",
+                    "DMU_OT_DSL_PROPS",
+                    "DMU_OT_DSL_DATASET",
+                    # zpl:
+                    "DMU_OT_ZNODE",
+                    "DMU_OT_OLDACL",
+                    "DMU_OT_PLAIN_FILE_CONTENTS",
+                    "DMU_OT_DIRECTORY_CONTENTS",
+                    "DMU_OT_MASTER_NODE",
+                    "DMU_OT_UNLINKED_SET",
+                    # zvol:
+                    "DMU_OT_ZVOL",
+                    "DMU_OT_ZVOL_PROP",
+                    # other; for testing only!
+                    "DMU_OT_PLAIN_OTHER",
+                    "DMU_OT_UINT64_OTHER",
+                    "DMU_OT_ZAP_OTHER",
+                    # new object types:
+                    "DMU_OT_ERROR_LOG",
+                    "DMU_OT_SPA_HISTORY",
+                    "DMU_OT_SPA_HISTORY_OFFSETS",
+                    "DMU_OT_POOL_PROPS",
+                    "DMU_OT_DSL_PERMS",
+                    "DMU_OT_ACL",
+                    "DMU_OT_SYSACL",
+                    "DMU_OT_FUID",
+                    "DMU_OT_FUID_SIZE",
+                    "DMU_OT_NEXT_CLONES",
+                    "DMU_OT_SCAN_QUEUE",
+                    "DMU_OT_USERGROUP_USED",
+                    "DMU_OT_USERGROUP_QUOTA",
+                    "DMU_OT_USERREFS",
+                    "DMU_OT_DDT_ZAP",
+                    "DMU_OT_DDT_STATS",
+                    "DMU_OT_SA",
+                    "DMU_OT_SA_MASTER_NODE",
+                    "DMU_OT_SA_ATTR_REGISTRATION",
+                    "DMU_OT_SA_ATTR_LAYOUTS",
+                    "DMU_OT_SCAN_XLATE",
+                    "DMU_OT_DEDUP",
+                    "DMU_OT_DEADLIST",
+                    "DMU_OT_DEADLIST_HDR",
+                    "DMU_OT_DSL_CLONES",
+                    "DMU_OT_BPOBJ_SUBOBJ"]
+    otn_strings = {
+                    0x80: "DMU_OTN_UINT8_DATA",
+                    0xc0: "DMU_OTN_UINT8_METADATA",
+                    0x81: "DMU_OTN_UINT16_DATA",
+                    0xc1: "DMU_OTN_UINT16_METADATA",
+                    0x82: "DMU_OTN_UINT32_DATA",
+                    0xc2: "DMU_OTN_UINT32_METADATA",
+                    0x83: "DMU_OTN_UINT64_DATA",
+                    0xc3: "DMU_OTN_UINT64_METADATA",
+                    0x84: "DMU_OTN_ZAP_DATA",
+                    0xc4: "DMU_OTN_ZAP_METADATA",
+                    0xa0: "DMU_OTN_UINT8_ENC_DATA",
+                    0xe0: "DMU_OTN_UINT8_ENC_METADATA",
+                    0xa1: "DMU_OTN_UINT16_ENC_DATA",
+                    0xe1: "DMU_OTN_UINT16_ENC_METADATA",
+                    0xa2: "DMU_OTN_UINT32_ENC_DATA",
+                    0xe2: "DMU_OTN_UINT32_ENC_METADATA",
+                    0xa3: "DMU_OTN_UINT64_ENC_DATA",
+                    0xe3: "DMU_OTN_UINT64_ENC_METADATA",
+                    0xa4: "DMU_OTN_ZAP_ENC_DATA",
+                    0xe4: "DMU_OTN_ZAP_ENC_METADATA"}
+
+    # If "-rr" option is used, don't convert to string representation
+    if raw > 1:
+        return "%i" % t
+
+    try:
+        if t < len(ot_strings):
+            return ot_strings[t]
+        else:
+            return otn_strings[t]
+    except (IndexError, KeyError):
+        return "(UNKNOWN)"
+
+
+def get_compstring(c):
+    comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON",
+                    "ZIO_COMPRESS_OFF",     "ZIO_COMPRESS_LZJB",
+                    "ZIO_COMPRESS_EMPTY",   "ZIO_COMPRESS_GZIP_1",
+                    "ZIO_COMPRESS_GZIP_2",  "ZIO_COMPRESS_GZIP_3",
+                    "ZIO_COMPRESS_GZIP_4",  "ZIO_COMPRESS_GZIP_5",
+                    "ZIO_COMPRESS_GZIP_6",  "ZIO_COMPRESS_GZIP_7",
+                    "ZIO_COMPRESS_GZIP_8",  "ZIO_COMPRESS_GZIP_9",
+                    "ZIO_COMPRESS_ZLE",     "ZIO_COMPRESS_LZ4",
+                    "ZIO_COMPRESS_ZSTD",    "ZIO_COMPRESS_FUNCTION"]
+
+    # If "-rr" option is used, don't convert to string representation
+    if raw > 1:
+        return "%i" % c
+
+    try:
+        return comp_strings[c]
+    except IndexError:
+        return "%i" % c
+
+
+def parse_line(line, labels):
+    """Convert one split kstat row into a dict keyed by the hdr columns."""
+    global hdr
+    new = dict()
+    for col in hdr:
+        # These are "special" fields computed in the update_dict function;
+        # they have no label to look up, so start them at 0 and move on.
+        if col in ['bonus', 'cached', 'direct', 'indirect', 'spill']:
+            new[col] = 0
+            continue
+        val = line[labels[col]]
+        if col in ['pool', 'flags']:
+            new[col] = str(val)
+        elif col in ['dtype', 'btype']:
+            new[col] = get_typestring(int(val))
+        elif col in ['l2_comp']:
+            new[col] = get_compstring(int(val))
+        else:
+            new[col] = int(val)
+
+    return new
+
+
+def update_dict(d, k, line, labels):
+    pool = line[labels['pool']]
+    objset = line[labels['objset']]
+    key = line[labels[k]]
+
+    dbsize = int(line[labels['dbsize']])
+    blkid = int(line[labels['blkid']])
+    level = int(line[labels['level']])
+
+    if pool not in d:
+        d[pool] = dict()
+
+    if objset not in d[pool]:
+        d[pool][objset] = dict()
+
+    if key not in d[pool][objset]:
+        d[pool][objset][key] = parse_line(line, labels)
+        d[pool][objset][key]['bonus'] = 0
+        d[pool][objset][key]['cached'] = 0
+        d[pool][objset][key]['direct'] = 0
+        d[pool][objset][key]['indirect'] = 0
+        d[pool][objset][key]['spill'] = 0
+
+    d[pool][objset][key]['cached'] += dbsize
+
+    if blkid == -1:
+        d[pool][objset][key]['bonus'] += dbsize
+    elif blkid == -2:
+        d[pool][objset][key]['spill'] += dbsize
+    else:
+        if level == 0:
+            d[pool][objset][key]['direct'] += dbsize
+        else:
+            d[pool][objset][key]['indirect'] += dbsize
+
+    return d
+
+
+def skip_line(vals, filters):
+    '''
+    Report whether a row fails any filter and must not be printed;
+    each filter is matched against its column's formatted text.
+    '''
+    if len(filters) == 0:
+        return False
+
+    for field, value in vals.items():
+        if field in filters:
+            shown = prettynum(cols[field][0], cols[field][1], value).strip()
+            # \Z anchors the end so the pattern must cover all of shown
+            if re.match("(?:" + filters[field] + r")\Z", shown) is None:
+                return True
+
+    return False
+
+
+def print_dict(d, filters, noheader):
+    if not noheader:
+        print_header()
+    for pool in list(d.keys()):
+        for objset in list(d[pool].keys()):
+            for v in list(d[pool][objset].values()):
+                if not skip_line(v, filters):
+                    print_values(v)
+
+
+def dnodes_build_dict(filehandle):
+    labels = dict()
+    dnodes = dict()
+
+    # First 3 lines are header information, skip the first two
+    for i in range(2):
+        next(filehandle)
+
+    # The third line contains the labels and index locations
+    for i, v in enumerate(next(filehandle).split()):
+        labels[v] = i
+
+    # The rest of the file is buffer information
+    for line in filehandle:
+        update_dict(dnodes, 'object', line.split(), labels)
+
+    return dnodes
+
+
+def types_build_dict(filehandle):
+    labels = dict()
+    types = dict()
+
+    # First 3 lines are header information, skip the first two
+    for i in range(2):
+        next(filehandle)
+
+    # The third line contains the labels and index locations
+    for i, v in enumerate(next(filehandle).split()):
+        labels[v] = i
+
+    # The rest of the file is buffer information
+    for line in filehandle:
+        update_dict(types, 'dtype', line.split(), labels)
+
+    return types
+
+
+def buffers_print_all(filehandle, filters, noheader):
+    labels = dict()
+
+    # First 3 lines are header information, skip the first two
+    for i in range(2):
+        next(filehandle)
+
+    # The third line contains the labels and index locations
+    for i, v in enumerate(next(filehandle).split()):
+        labels[v] = i
+
+    if not noheader:
+        print_header()
+
+    # The rest of the file is buffer information
+    for line in filehandle:
+        vals = parse_line(line.split(), labels)
+        if not skip_line(vals, filters):
+            print_values(vals)
+
+
+def main():
+    """Parse the command line, then read the dbufs input and print a table."""
+    global hdr
+    global sep
+    global raw
+
+    desired_cols = None
+    bflag = False
+    dflag = False
+    hflag = False
+    ifile = None
+    ofile = None
+    tflag = False
+    vflag = False
+    xflag = False
+    nflag = False
+    filters = dict()
+
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:],
+            "bdf:hi:o:rs:tvxF:n",
+            # A trailing '=' marks the long options that take an argument.
+            [
+                "buffers",
+                "dnodes",
+                "columns=",
+                "help",
+                "infile=",
+                "outfile=",
+                "raw",
+                "separator=",
+                "types",
+                "verbose",
+                "extended",
+                "noheader",
+                "filter="
+            ]
+        )
+    except getopt.error:
+        usage()
+
+    for opt, arg in opts:
+        if opt in ('-b', '--buffers'):
+            bflag = True
+        if opt in ('-d', '--dnodes'):
+            dflag = True
+        if opt in ('-f', '--columns'):
+            desired_cols = arg
+        if opt in ('-h', '--help'):
+            hflag = True
+        if opt in ('-i', '--infile'):
+            ifile = arg
+        if opt in ('-o', '--outfile'):
+            ofile = arg
+        if opt in ('-r', '--raw'):
+            raw += 1
+        if opt in ('-s', '--separator'):
+            sep = arg
+        if opt in ('-t', '--types'):
+            tflag = True
+        if opt in ('-v', '--verbose'):
+            vflag = True
+        if opt in ('-x', '--extended'):
+            xflag = True
+        if opt in ('-n', '--noheader'):
+            nflag = True
+        if opt in ('-F', '--filter'):
+            fils = [x.strip() for x in arg.split(",")]
+
+            for fil in fils:
+                f = [x.strip() for x in fil.split("=")]
+
+                if len(f) != 2:
+                    sys.stderr.write("Invalid filter '%s'.\n" % fil)
+                    sys.exit(1)
+
+                if f[0] not in cols:
+                    sys.stderr.write("Invalid field '%s' in filter.\n" % f[0])
+                    sys.exit(1)
+
+                if f[0] in filters:
+                    sys.stderr.write("Field '%s' specified multiple times in "
+                                     "filter.\n" % f[0])
+                    sys.exit(1)
+
+                try:
+                    re.compile("(?:" + f[1] + r")\Z")
+                except re.error:
+                    sys.stderr.write("Invalid regex for field '%s' in "
+                                     "filter.\n" % f[0])
+                    sys.exit(1)
+
+                filters[f[0]] = f[1]
+
+    if hflag or (xflag and desired_cols):
+        usage()
+
+    if vflag:
+        detailed_usage()
+
+    # Ensure at most only one of b, d, or t flags are set
+    if (bflag and dflag) or (bflag and tflag) or (dflag and tflag):
+        usage()
+
+    if bflag:
+        hdr = bxhdr if xflag else bhdr
+    elif tflag:
+        hdr = txhdr if xflag else thdr
+    else:  # Even if dflag is False, it's the default if none set
+        dflag = True
+        hdr = dxhdr if xflag else dhdr
+
+    if desired_cols:
+        hdr = desired_cols.split(",")
+
+        invalid = []
+        incompat = []
+        for ele in hdr:
+            if ele not in cols:
+                invalid.append(ele)
+            elif ((bflag and bincompat and ele in bincompat) or
+                  (dflag and dincompat and ele in dincompat) or
+                  (tflag and tincompat and ele in tincompat)):
+                    incompat.append(ele)
+
+        if len(invalid) > 0:
+            sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+            usage()
+
+        if len(incompat) > 0:
+            sys.stderr.write("Incompatible field specified! -- %s\n" %
+                             incompat)
+            usage()
+
+    if ofile:
+        try:
+            sys.stdout = open(ofile, "w")
+        except IOError:
+            sys.stderr.write("Cannot open %s for writing\n" % ofile)
+            sys.exit(1)
+
+    if not ifile:
+        ifile = '/proc/spl/kstat/zfs/dbufs'
+
+    if ifile != "-":
+        try:
+            sys.stdin = open(ifile, "r")
+        except IOError:
+            sys.stderr.write("Cannot open %s for reading\n" % ifile)
+            sys.exit(1)
+
+    if bflag:
+        buffers_print_all(sys.stdin, filters, nflag)
+
+    if dflag:
+        print_dict(dnodes_build_dict(sys.stdin), filters, nflag)
+
+    if tflag:
+        print_dict(types_build_dict(sys.stdin), filters, nflag)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am
new file mode 100644
index 000000000000..2380f56fa4d4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = fsck.zfs
diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs
new file mode 100755
index 000000000000..129a7f39c388
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs
@@ -0,0 +1,9 @@
+#!/bin/sh
+#
+# fsck.zfs: A fsck helper to accommodate distributions that expect
+# to be able to execute a fsck on all filesystem types.  Currently
+# this script does nothing but it could be extended to act as a
+# compatibility wrapper for 'zpool scrub'.
+#
+
+exit 0
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/.gitignore b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore
new file mode 100644
index 000000000000..cd9254bde3da
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore
@@ -0,0 +1 @@
+mount.zfs
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am
new file mode 100644
index 000000000000..6c4d6ff79f16
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+#
+# Ignore the prefix for the mount helper.  It must be installed in /sbin/
+# because this path is hardcoded in the mount(8) for security reasons.
+# However, if needed, the configure option --with-mounthelperdir= can be used
+# to override the default install location.
+#
+sbindir=$(mounthelperdir)
+sbin_PROGRAMS = mount.zfs
+
+mount_zfs_SOURCES = \
+	mount_zfs.c
+
+mount_zfs_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+mount_zfs_LDADD += $(LTLIBINTL)
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c
new file mode 100644
index 000000000000..87d2ccadcded
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c
@@ -0,0 +1,408 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Lawrence Livermore National Security, LLC.
+ */
+
+#include <libintl.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/mntent.h>
+#include <sys/stat.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <locale.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#define	ZS_COMMENT	0x00000000	/* comment */
+#define	ZS_ZFSUTIL	0x00000001	/* caller is zfs(8) */
+
+libzfs_handle_t *g_zfs;
+
+/*
+ * Return the pool/dataset to mount given the name passed to mount.  This
+ * is expected to be of the form pool/dataset, however may also refer to
+ * a block device if that device contains a valid zfs label.
+ */
+static char *
+parse_dataset(char *dataset)
+{
+	char cwd[PATH_MAX];
+	struct stat64 statbuf;
+	int error;
+	int len;
+
+	/*
+	 * We expect a pool/dataset to be provided, however if we're
+	 * given a device which is a member of a zpool we attempt to
+	 * extract the pool name stored in the label.  Given the pool
+	 * name we can mount the root dataset.
+	 */
+	error = stat64(dataset, &statbuf);
+	if (error == 0) {
+		nvlist_t *config;
+		char *name;
+		int fd;
+
+		fd = open(dataset, O_RDONLY);
+		if (fd < 0)
+			goto out;
+
+		error = zpool_read_label(fd, &config, NULL);
+		(void) close(fd);
+		if (error)
+			goto out;
+
+		error = nvlist_lookup_string(config,
+		    ZPOOL_CONFIG_POOL_NAME, &name);
+		if (error) {
+			nvlist_free(config);
+		} else {
+			dataset = strdup(name);
+			nvlist_free(config);
+			return (dataset);
+		}
+	}
+out:
+	/*
+	 * If a file or directory in your current working directory is
+	 * named 'dataset' then mount(8) will prepend your current working
+	 * directory to the dataset.  There is no way to prevent this
+	 * behavior so we simply check for it and strip the prepended
+	 * path when it is added.
+	 */
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return (dataset);
+
+	len = strlen(cwd);
+	/* Only strip at a '/' boundary; add one unless cwd ends in '/' */
+	if (len > 0 && strncmp(cwd, dataset, len) == 0 &&
+	    (cwd[len-1] == '/' || dataset[len] == '/'))
+		return (dataset + len + (cwd[len-1] != '/'));
+
+	return (dataset);
+}
+
+/*
+ * Update the mtab_* code to use the libmount library when it is commonly
+ * available otherwise fallback to legacy mode.  The mount(8) utility will
+ * manage the lock file for us to prevent racing updates to /etc/mtab.
+ */
+static int
+mtab_is_writeable(void)
+{
+	struct stat st;
+	int error, fd;
+
+	error = lstat("/etc/mtab", &st);
+	if (error || S_ISLNK(st.st_mode))
+		return (0);
+
+	fd = open("/etc/mtab", O_RDWR | O_CREAT, 0644);
+	if (fd < 0)
+		return (0);
+
+	close(fd);
+	return (1);
+}
+
+/* Append an entry for the newly mounted dataset to /etc/mtab. */
+static int
+mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts)
+{
+	struct mntent mnt;
+	FILE *fp;
+	int error;
+
+	mnt.mnt_fsname = dataset;
+	mnt.mnt_dir = mntpoint;
+	mnt.mnt_type = type;
+	mnt.mnt_opts = mntopts ? mntopts : "";
+	mnt.mnt_freq = 0;
+	mnt.mnt_passno = 0;
+
+	fp = setmntent("/etc/mtab", "a+");
+	if (!fp) {
+		(void) fprintf(stderr, gettext(
+		    "filesystem '%s' was mounted, but /etc/mtab "
+		    "could not be opened due to error %d\n"),
+		    dataset, errno);
+		return (MOUNT_FILEIO);
+	}
+
+	error = addmntent(fp, &mnt);
+	if (error) {
+		(void) fprintf(stderr, gettext(
+		    "filesystem '%s' was mounted, but /etc/mtab "
+		    "could not be updated due to error %d\n"),
+		    dataset, errno);
+	}
+	/* Close the stream on both paths; errno was reported above. */
+	(void) endmntent(fp);
+
+	return (error ? MOUNT_FILEIO : MOUNT_SUCCESS);
+}
+
+int
+main(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	char prop[ZFS_MAXPROPLEN];
+	uint64_t zfs_version = 0;
+	char mntopts[MNT_LINE_MAX] = { '\0' };
+	char badopt[MNT_LINE_MAX] = { '\0' };
+	char mtabopt[MNT_LINE_MAX] = { '\0' };
+	char mntpoint[PATH_MAX];
+	char *dataset;
+	unsigned long mntflags = 0, zfsflags = 0, remount = 0;
+	int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0;
+	int error, c;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	opterr = 0;
+
+	/* check options */
+	while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) {
+		switch (c) {
+		case 's':
+			sloppy = 1;
+			break;
+		case 'f':
+			fake = 1;
+			break;
+		case 'n':
+			nomtab = 1;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'o':
+			(void) strlcpy(mntopts, optarg, sizeof (mntopts));
+			break;
+		case 'h':
+		case '?':
+			(void) fprintf(stderr, gettext("Invalid option '%c'\n"),
+			    optopt);
+			(void) fprintf(stderr, gettext("Usage: mount.zfs "
+			    "[-sfnv] [-o options] <dataset> <mountpoint>\n"));
+			return (MOUNT_USAGE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check that we only have two arguments */
+	if (argc != 2) {
+		if (argc == 0)
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "argument\n"));
+		else if (argc == 1)
+			(void) fprintf(stderr,
+			    gettext("missing mountpoint argument\n"));
+		else
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+		(void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
+		return (MOUNT_USAGE);
+	}
+
+	dataset = parse_dataset(argv[0]);
+
+	/* canonicalize the mount point */
+	if (realpath(argv[1], mntpoint) == NULL) {
+		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+		    "mounted at '%s' due to canonicalization error %d.\n"),
+		    dataset, argv[1], errno);
+		return (MOUNT_SYSERR);
+	}
+
+	/* validate mount options and set mntflags */
+	error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy,
+	    badopt, mtabopt);
+	if (error) {
+		switch (error) {
+		case ENOMEM:
+			(void) fprintf(stderr, gettext("filesystem '%s' "
+			    "cannot be mounted due to a memory allocation "
+			    "failure.\n"), dataset);
+			return (MOUNT_SYSERR);
+		case ENOENT:
+			(void) fprintf(stderr, gettext("filesystem '%s' "
+			    "cannot be mounted due to invalid option "
+			    "'%s'.\n"), dataset, badopt);
+			(void) fprintf(stderr, gettext("Use the '-s' option "
+			    "to ignore the bad mount option.\n"));
+			return (MOUNT_USAGE);
+		default:
+			(void) fprintf(stderr, gettext("filesystem '%s' "
+			    "cannot be mounted due to internal error %d.\n"),
+			    dataset, error);
+			return (MOUNT_SOFTWARE);
+		}
+	}
+
+	if (verbose)
+		(void) fprintf(stdout, gettext("mount.zfs:\n"
+		    "  dataset:    \"%s\"\n  mountpoint: \"%s\"\n"
+		    "  mountflags: 0x%lx\n  zfsflags:   0x%lx\n"
+		    "  mountopts:  \"%s\"\n  mtabopts:   \"%s\"\n"),
+		    dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
+
+	if (mntflags & MS_REMOUNT) {
+		nomtab = 1;
+		remount = 1;
+	}
+
+	if (zfsflags & ZS_ZFSUTIL)
+		zfsutil = 1;
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (MOUNT_SYSERR);
+	}
+
+	/* try to open the dataset to access the mount point */
+	if ((zhp = zfs_open(g_zfs, dataset,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) {
+		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+		    "mounted, unable to open the dataset\n"), dataset);
+		libzfs_fini(g_zfs);
+		return (MOUNT_USAGE);
+	}
+
+	zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
+
+	/* treat all snapshots as legacy mount points */
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT)
+		(void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN);
+	else
+		(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop,
+		    sizeof (prop), NULL, NULL, 0, B_FALSE);
+
+	/*
+	 * Fetch the max supported zfs version in case we get ENOTSUP
+	 * back from the mount command, since we need the zfs handle
+	 * to do so.
+	 */
+	zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+	if (zfs_version == 0) {
+		fprintf(stderr, gettext("unable to fetch "
+		    "ZFS version for filesystem '%s'\n"), dataset);
+		return (MOUNT_SYSERR);
+	}
+
+	zfs_close(zhp);
+	libzfs_fini(g_zfs);
+
+	/*
+	 * Legacy mount points may only be mounted using 'mount', never using
+	 * 'zfs mount'.  However, since 'zfs mount' actually invokes 'mount'
+	 * we differentiate the two cases using the 'zfsutil' mount option.
+	 * This mount option should only be supplied by the 'zfs mount' util.
+	 *
+	 * The only exception to the above rule is '-o remount' which is
+	 * always allowed for non-legacy datasets.  This is done because when
+	 * using zfs as your root file system both rc.sysinit/umountroot and
+	 * systemd depend on 'mount -o remount <mountpoint>' to work.
+	 */
+	if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) {
+		(void) fprintf(stderr, gettext(
+		    "filesystem '%s' cannot be mounted using 'zfs mount'.\n"
+		    "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n"
+		    "See zfs(8) for more information.\n"),
+		    dataset, mntpoint, dataset, mntpoint);
+		return (MOUNT_USAGE);
+	}
+
+	if (!zfsutil && !(remount || fake) &&
+	    strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) {
+		(void) fprintf(stderr, gettext(
+		    "filesystem '%s' cannot be mounted using 'mount'.\n"
+		    "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n"
+		    "See zfs(8) for more information.\n"),
+		    dataset, "legacy", dataset);
+		return (MOUNT_USAGE);
+	}
+
+	if (!fake) {
+		error = mount(dataset, mntpoint, MNTTYPE_ZFS,
+		    mntflags, mntopts);
+	}
+
+	if (error) {
+		switch (errno) {
+		case ENOENT:
+			(void) fprintf(stderr, gettext("mount point "
+			    "'%s' does not exist\n"), mntpoint);
+			return (MOUNT_SYSERR);
+		case EBUSY:
+			(void) fprintf(stderr, gettext("filesystem "
+			    "'%s' is already mounted\n"), dataset);
+			return (MOUNT_BUSY);
+		case ENOTSUP:
+			if (zfs_version > ZPL_VERSION) {
+				(void) fprintf(stderr,
+				    gettext("filesystem '%s' (v%d) is not "
+				    "supported by this implementation of "
+				    "ZFS (max v%d).\n"), dataset,
+				    (int)zfs_version, (int)ZPL_VERSION);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("filesystem '%s' mount "
+				    "failed for unknown reason.\n"), dataset);
+			}
+			return (MOUNT_SYSERR);
+#ifdef MS_MANDLOCK
+		case EPERM:
+			if (mntflags & MS_MANDLOCK) {
+				(void) fprintf(stderr, gettext("filesystem "
+				    "'%s' has the 'nbmand=on' property set, "
+				    "this mount\noption may be disabled in "
+				    "your kernel.  Use 'zfs set nbmand=off'\n"
+				    "to disable this option and try to "
+				    "mount the filesystem again.\n"), dataset);
+				return (MOUNT_SYSERR);
+			}
+			/* fallthru */
+#endif
+		default:
+			(void) fprintf(stderr, gettext("filesystem "
+			    "'%s' can not be mounted: %s\n"), dataset,
+			    strerror(errno));
+			return (MOUNT_USAGE);
+		}
+	}
+
+	if (!nomtab && mtab_is_writeable()) {
+		error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt);
+		if (error)
+			return (error);
+	}
+
+	return (MOUNT_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/.gitignore b/sys/contrib/openzfs/cmd/raidz_test/.gitignore
new file mode 100644
index 000000000000..f8b83d9cce03
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/.gitignore
@@ -0,0 +1 @@
+/raidz_test
diff --git a/sys/contrib/openzfs/cmd/raidz_test/Makefile.am b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am
new file mode 100644
index 000000000000..72c914e641e4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+# Includes kernel code, generate warnings for large stack frames
+AM_CFLAGS += $(FRAME_LARGER_THAN)
+
+# Unconditionally enable ASSERTs
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+bin_PROGRAMS = raidz_test
+
+raidz_test_SOURCES = \
+	raidz_test.h \
+	raidz_test.c \
+	raidz_bench.c
+
+raidz_test_LDADD = \
+	$(abs_top_builddir)/lib/libzpool/libzpool.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la
+
+raidz_test_LDADD += -lm
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
new file mode 100644
index 000000000000..8a2cec4ca685
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <stdio.h>
+
+#include <sys/time.h>
+
+#include "raidz_test.h"
+
+#define	GEN_BENCH_MEMORY	(((uint64_t)1ULL)<<32)
+#define	REC_BENCH_MEMORY	(((uint64_t)1ULL)<<29)
+#define	BENCH_ASHIFT		12
+#define	MIN_CS_SHIFT		BENCH_ASHIFT
+#define	MAX_CS_SHIFT		SPA_MAXBLOCKSHIFT
+
+static zio_t zio_bench;
+static raidz_map_t *rm_bench;
+static size_t max_data_size = SPA_MAXBLOCKSIZE;
+
+static void
+bench_init_raidz_map(void)
+{
+	zio_bench.io_offset = 0;
+	zio_bench.io_size = max_data_size;
+
+	/*
+	 * To permit larger column sizes, the buffer has to be allocated
+	 * with raidz_alloc() (aligned alloc) instead of zio_abd_buf_alloc.
+	 */
+	zio_bench.io_abd = raidz_alloc(max_data_size);
+
+	init_zio_abd(&zio_bench);
+}
+
+static void
+bench_fini_raidz_maps(void)
+{
+	/* release the benchmark buffer and reset the shared zio_bench */
+	raidz_free(zio_bench.io_abd, max_data_size);
+	bzero(&zio_bench, sizeof (zio_t));
+}
+
+static inline void
+run_gen_bench_impl(const char *impl)
+{
+	int fn, ncols;
+	uint64_t ds, iter_cnt, iter, disksize;
+	hrtime_t start;
+	double elapsed, d_bw;
+
+	/* Benchmark each generate method; method fn uses fn+1 parity cols */
+	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+		/* sweep I/O sizes from 2^MIN_CS_SHIFT to 2^MAX_CS_SHIFT */
+		for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+			/* map with rto_dcols data plus fn+1 parity columns */
+			ncols = rto_opts.rto_dcols + fn + 1;
+			zio_bench.io_size = 1ULL << ds;
+			rm_bench = vdev_raidz_map_alloc(&zio_bench,
+			    BENCH_ASHIFT, ncols, fn+1);
+
+			/* iterations needed to process GEN_BENCH_MEMORY */
+			iter_cnt = GEN_BENCH_MEMORY;
+			iter_cnt /= zio_bench.io_size;
+
+			start = gethrtime();
+			for (iter = 0; iter < iter_cnt; iter++)
+				vdev_raidz_generate_parity(rm_bench);
+			elapsed = NSEC2SEC((double)(gethrtime() - start));
+			/* per-disk bandwidth in MiB/s */
+			disksize = (1ULL << ds) / rto_opts.rto_dcols;
+			d_bw = (double)iter_cnt * (double)disksize;
+			d_bw /= (1024.0 * 1024.0 * elapsed);
+
+			LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+			    impl,
+			    raidz_gen_name[fn],
+			    rto_opts.rto_dcols,
+			    (1ULL<<ds),
+			    d_bw,
+			    d_bw * (double)(ncols),
+			    (unsigned)iter_cnt);
+
+			vdev_raidz_map_free(rm_bench);
+		}
+	}
+}
+
+static void
+run_gen_bench(void)
+{
+	char **impl_name;
+
+	LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n");
+	LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+	/* walk the NULL-terminated list of implementation names */
+	for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+	    impl_name++) {
+		/* skip implementations that cannot be selected */
+		if (vdev_raidz_impl_set(*impl_name) != 0)
+			continue;
+
+		run_gen_bench_impl(*impl_name);
+	}
+}
+
+static void
+run_rec_bench_impl(const char *impl)
+{
+	int fn, ncols, nbad;
+	uint64_t ds, iter_cnt, iter, disksize;
+	hrtime_t start;
+	double elapsed, d_bw;
+	static const int tgt[7][3] = {
+		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
+		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
+		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
+		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
+		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
+		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
+		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
+	};
+	/* benchmark every reconstruction method over all I/O sizes */
+	for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
+		for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+
+			/* always a triple-parity map: dcols + PARITY_PQR */
+			ncols = rto_opts.rto_dcols + PARITY_PQR;
+			zio_bench.io_size = 1ULL << ds;
+
+			/*
+			 * raidz block is too short to test
+			 * the requested method
+			 */
+			if (zio_bench.io_size / rto_opts.rto_dcols <
+			    (1ULL << BENCH_ASHIFT))
+				continue;
+
+			rm_bench = vdev_raidz_map_alloc(&zio_bench,
+			    BENCH_ASHIFT, ncols, PARITY_PQR);
+
+			/* iterations needed to process REC_BENCH_MEMORY */
+			iter_cnt = (REC_BENCH_MEMORY);
+			iter_cnt /= zio_bench.io_size;
+
+			/* calculate how many bad columns there are */
+			nbad = MIN(3, raidz_ncols(rm_bench) -
+			    raidz_parity(rm_bench));
+
+			start = gethrtime();
+			for (iter = 0; iter < iter_cnt; iter++)
+				vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad);
+			elapsed = NSEC2SEC((double)(gethrtime() - start));
+			/* per-disk bandwidth in MiB/s */
+			disksize = (1ULL << ds) / rto_opts.rto_dcols;
+			d_bw = (double)iter_cnt * (double)(disksize);
+			d_bw /= (1024.0 * 1024.0 * elapsed);
+
+			LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+			    impl,
+			    raidz_rec_name[fn],
+			    rto_opts.rto_dcols,
+			    (1ULL<<ds),
+			    d_bw,
+			    d_bw * (double)ncols,
+			    (unsigned)iter_cnt);
+
+			vdev_raidz_map_free(rm_bench);
+		}
+	}
+}
+
+static void
+run_rec_bench(void)
+{
+	char **impl_name;
+
+	LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n");
+	LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+	/* walk the NULL-terminated list of implementation names */
+	for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+	    impl_name++) {
+		/* skip implementations that cannot be selected */
+		if (vdev_raidz_impl_set(*impl_name) != 0)
+			continue;
+
+		run_rec_bench_impl(*impl_name);
+	}
+}
+
+void
+run_raidz_benchmark(void)
+{
+	bench_init_raidz_map();
+	/* generation first, then reconstruction, sharing zio_bench */
+	run_gen_bench();
+	run_rec_bench();
+
+	bench_fini_raidz_maps();
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
new file mode 100644
index 000000000000..66f36b0d56ca
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
@@ -0,0 +1,782 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <umem.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <assert.h>
+#include <stdio.h>
+#include "raidz_test.h"
+
+static int *rand_data;
+raidz_test_opts_t rto_opts;
+
+static char gdb[256];
+static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d";
+
+static void sig_handler(int signo)
+{
+	struct sigaction action;
+	/*
+	 * Restore the default action and re-raise the signal at the end, so
+	 * a fatal signal such as SIGSEGV can still trigger a core dump.
+	 */
+	action.sa_handler = SIG_DFL;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	(void) sigaction(signo, &action, NULL);
+	/* -D: run the prepared gdb command; its exit status is ignored */
+	if (rto_opts.rto_gdb)
+		if (system(gdb)) { }
+
+	raise(signo);
+}
+
+static void print_opts(raidz_test_opts_t *opts, boolean_t force)
+{
+	char *verbose;
+	switch (opts->rto_v) {
+		case 0:
+			verbose = "no";
+			break;
+		case 1:
+			verbose = "info";
+			break;
+		default:
+			verbose = "debug";
+			break;
+	}
+	/* print only when forced or when verbosity is at least D_INFO */
+	if (force || opts->rto_v >= D_INFO) {
+		(void) fprintf(stdout, DBLSEP "Running with options:\n"
+		    "  (-a) zio ashift                   : %zu\n"
+		    "  (-o) zio offset                   : 1 << %zu\n"
+		    "  (-d) number of raidz data columns : %zu\n"
+		    "  (-s) size of DATA                 : 1 << %zu\n"
+		    "  (-S) sweep parameters             : %s \n"
+		    "  (-v) verbose                      : %s \n\n",
+		    opts->rto_ashift,			/* -a */
+		    ilog2(opts->rto_offset),		/* -o */
+		    opts->rto_dcols,			/* -d */
+		    ilog2(opts->rto_dsize),		/* -s */
+		    opts->rto_sweep ? "yes" : "no",	/* -S */
+		    verbose);				/* -v */
+	}
+}
+
+static void usage(boolean_t requested)
+{
+	const raidz_test_opts_t *o = &rto_opts_defaults;
+	/* help goes to stdout on request, to stderr on a usage error */
+	FILE *fp = requested ? stdout : stderr;
+
+	(void) fprintf(fp, "Usage:\n"
+	    "\t[-a zio ashift (default: %zu)]\n"
+	    "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
+	    "\t[-d number of raidz data columns (default: %zu)]\n"
+	    "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
+	    "\t[-S parameter sweep (default: %s)]\n"
+	    "\t[-t timeout for parameter sweep test]\n"
+	    "\t[-B benchmark all raidz implementations]\n"
+	    "\t[-v increase verbosity (default: %zu)]\n"
+	    "\t[-h (print help)]\n"
+	    "\t[-T test the test, see if failure would be detected]\n"
+	    "\t[-D debug (attach gdb on SIGSEGV)]\n"
+	    "",
+	    o->rto_ashift,				/* -a */
+	    ilog2(o->rto_offset),			/* -o */
+	    o->rto_dcols,				/* -d */
+	    ilog2(o->rto_dsize),			/* -s */
+	    o->rto_sweep ? "yes" : "no",		/* -S */
+	    o->rto_v);					/* -v */
+
+	exit(requested ? 0 : 1);
+}
+
+static void process_options(int argc, char **argv)
+{
+	size_t value;
+	int opt;
+	/* start from the defaults, then apply command line overrides */
+	raidz_test_opts_t *o = &rto_opts;
+
+	bcopy(&rto_opts_defaults, o, sizeof (*o));
+
+	while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
+		value = 0;
+
+		switch (opt) {
+		case 'a':	/* ashift, clamped to [9, 13] */
+			value = strtoull(optarg, NULL, 0);
+			o->rto_ashift = MIN(13, MAX(9, value));
+			break;
+		case 'o':	/* offset = 2^min(12, arg); 0 when arg < 9 */
+			value = strtoull(optarg, NULL, 0);
+			o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
+			break;
+		case 'd':	/* data columns, clamped to [1, 255] */
+			value = strtoull(optarg, NULL, 0);
+			o->rto_dcols = MIN(255, MAX(1, value));
+			break;
+		case 's':	/* size = 2^arg, arg clamped to block shifts */
+			value = strtoull(optarg, NULL, 0);
+			o->rto_dsize = 1ULL <<  MIN(SPA_MAXBLOCKSHIFT,
+			    MAX(SPA_MINBLOCKSHIFT, value));
+			break;
+		case 't':	/* sweep timeout in seconds */
+			value = strtoull(optarg, NULL, 0);
+			o->rto_sweep_timeout = value;
+			break;
+		case 'v':
+			o->rto_v++;
+			break;
+		case 'S':
+			o->rto_sweep = 1;
+			break;
+		case 'B':
+			o->rto_benchmark = 1;
+			break;
+		case 'D':
+			o->rto_gdb = 1;
+			break;
+		case 'T':
+			o->rto_sanity = 1;
+			break;
+		case 'h':
+			usage(B_TRUE);
+			break;
+		case '?':
+		default:
+			usage(B_FALSE);
+			break;
+		}
+	}
+}
+
+#define	DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
+#define	DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
+
+#define	CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
+#define	CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
+
+static int
+cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
+{
+	int col, mismatches = 0;
+
+	VERIFY(parity >= 1 && parity <= 3);
+
+	/* count parity columns of rm that differ from the golden map */
+	for (col = 0; col < parity; col++) {
+		if (abd_cmp(CODE_COL(rm, col),
+		    CODE_COL(opts->rm_golden, col)) == 0)
+			continue;
+		mismatches++;
+		LOG_OPT(D_DEBUG, opts, "\nParity block [%d] different!\n", col);
+	}
+	return (mismatches);
+}
+
+static int
+cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
+{
+	raidz_map_t *golden = opts->rm_golden;
+	int col, mismatches = 0;
+	const int ndata = golden->rm_cols - raidz_parity(golden);
+
+	/* count data columns of rm that differ from the golden map */
+	for (col = 0; col < ndata; col++) {
+		if (abd_cmp(DATA_COL(golden, col), DATA_COL(rm, col)) == 0)
+			continue;
+		mismatches++;
+		LOG_OPT(D_DEBUG, opts, "\nData block [%d] different!\n", col);
+	}
+
+	return (mismatches);
+}
+
+static int
+init_rand(void *data, size_t size, void *private)
+{
+	int i;
+	int *dst = (int *)data;
+	/* abd_iterate_func() callback: fill this chunk from rand_data[] */
+	for (i = 0; i < size / sizeof (int); i++)
+		dst[i] = rand_data[i];
+
+	return (0);
+}
+
+static void
+corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
+{
+	int i;
+	raidz_col_t *col;
+	/* overwrite each of the cnt target columns with random data */
+	for (i = 0; i < cnt; i++) {
+		col = &rm->rm_col[tgts[i]];
+		abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
+	}
+}
+
+void
+init_zio_abd(zio_t *zio)	/* fill io_abd[0, io_size) via init_rand() */
+{
+	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
+}
+
+static void
+fini_raidz_map(zio_t **zio, raidz_map_t **rm)
+{
+	vdev_raidz_map_free(*rm);
+	raidz_free((*zio)->io_abd, (*zio)->io_size);
+	umem_free(*zio, sizeof (zio_t));
+	/* clear the caller's pointers so stale handles are not reused */
+	*zio = NULL;
+	*rm = NULL;
+}
+
+static int
+init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
+{
+	int err = 0;
+	zio_t *zio_test;
+	raidz_map_t *rm_test;
+	const size_t total_ncols = opts->rto_dcols + parity;
+	/* drop a golden map left over from a previous run */
+	if (opts->rm_golden) {
+		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+	}
+	/* two zios with identical geometry and identical random data */
+	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+
+	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
+	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;
+
+	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
+	zio_test->io_abd = raidz_alloc(opts->rto_dsize);
+
+	init_zio_abd(opts->zio_golden);
+	init_zio_abd(zio_test);
+	/* the golden parity is produced by the "original" implementation */
+	VERIFY0(vdev_raidz_impl_set("original"));
+
+	opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
+	    opts->rto_ashift, total_ncols, parity);
+	rm_test = vdev_raidz_map_alloc(zio_test,
+	    opts->rto_ashift, total_ncols, parity);
+
+	VERIFY(opts->zio_golden);
+	VERIFY(opts->rm_golden);
+
+	vdev_raidz_generate_parity(opts->rm_golden);
+	vdev_raidz_generate_parity(rm_test);
+
+	/* sanity check: both maps must hold identical data and parity */
+	err |= cmp_data(opts, rm_test);
+	err |= cmp_code(opts, rm_test, parity);
+
+	if (err)
+		ERR("initializing the golden copy ... [FAIL]!\n");
+
+	/* tear down raidz_map of test zio */
+	fini_raidz_map(&zio_test, &rm_test);
+
+	return (err);
+}
+
+static raidz_map_t *
+init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
+{
+	raidz_map_t *rm = NULL;
+	const size_t alloc_dsize = opts->rto_dsize;
+	const size_t total_ncols = opts->rto_dcols + parity;
+	const int ccols[] = { 0, 1, 2 };
+
+	VERIFY(zio);
+	VERIFY(parity <= 3 && parity >= 1);
+
+	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+	/* note: the test zio always uses offset 0, not opts->rto_offset */
+	(*zio)->io_offset = 0;
+	(*zio)->io_size = alloc_dsize;
+	(*zio)->io_abd = raidz_alloc(alloc_dsize);
+	init_zio_abd(*zio);
+
+	rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
+	    total_ncols, parity);
+	VERIFY(rm);
+
+	/* Overwrite the first `parity` (code) columns with random data */
+	corrupt_colums(rm, ccols, parity);
+
+	return (rm);
+}
+
+static int
+run_gen_check(raidz_test_opts_t *opts)
+{
+	char **impl_name;
+	int fn, err = 0;
+	zio_t *zio_test;
+	raidz_map_t *rm_test;
+
+	err = init_raidz_golden_map(opts, PARITY_PQR);
+	if (0 != err)
+		return (err);
+
+	LOG(D_INFO, DBLSEP);
+	LOG(D_INFO, "Testing parity generation...\n");
+	/* entry 0 ("original") produced the golden map, so start at 1 */
+	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
+	    impl_name++) {
+
+		LOG(D_INFO, SEP);
+		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
+
+		if (0 != vdev_raidz_impl_set(*impl_name)) {
+			LOG(D_INFO, "[SKIP]\n");
+			continue;
+		} else {
+			LOG(D_INFO, "[SUPPORTED]\n");
+		}
+
+		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+
+			/* Check if should stop */
+			if (rto_opts.rto_should_stop)
+				return (err);
+
+			/* fresh map with fn+1 corrupted parity columns */
+			rm_test = init_raidz_map(opts, &zio_test, fn+1);
+			VERIFY(rm_test);
+
+			LOG(D_INFO, "\t\tTesting method [%s] ...",
+			    raidz_gen_name[fn]);
+			/* -T (rto_sanity): skip generation, check must fail */
+			if (!opts->rto_sanity)
+				vdev_raidz_generate_parity(rm_test);
+
+			if (cmp_code(opts, rm_test, fn+1) != 0) {
+				LOG(D_INFO, "[FAIL]\n");
+				err++;
+			} else
+				LOG(D_INFO, "[PASS]\n");
+
+			fini_raidz_map(&zio_test, &rm_test);
+		}
+	}
+
+	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+
+	return (err);
+}
+
+static int
+run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
+{
+	int x0, x1, x2;
+	int tgtidx[3];
+	int err = 0;
+	static const int rec_tgts[7][3] = {
+		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
+		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
+		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
+		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
+		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
+		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
+		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
+	};
+	/* work on a copy; data-column targets are overwritten below */
+	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));
+
+	if (fn < RAIDZ_REC_PQ) {
+		/* can reconstruct 1 failed data disk */
+		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+			if (x0 >= rm->rm_cols - raidz_parity(rm))
+				continue;
+
+			/* Check if should stop */
+			if (rto_opts.rto_should_stop)
+				return (err);
+
+			LOG(D_DEBUG, "[%d] ", x0);
+
+			tgtidx[2] = x0 + raidz_parity(rm);
+			/* corrupt the target column, then try to rebuild it */
+			corrupt_colums(rm, tgtidx+2, 1);
+
+			if (!opts->rto_sanity)
+				vdev_raidz_reconstruct(rm, tgtidx, 3);
+
+			if (cmp_data(opts, rm) != 0) {
+				err++;
+				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
+			}
+		}
+
+	} else if (fn < RAIDZ_REC_PQR) {
+		/* can reconstruct 2 failed data disk */
+		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+			if (x0 >= rm->rm_cols - raidz_parity(rm))
+				continue;
+			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
+				if (x1 >= rm->rm_cols - raidz_parity(rm))
+					continue;
+
+				/* Check if should stop */
+				if (rto_opts.rto_should_stop)
+					return (err);
+
+				LOG(D_DEBUG, "[%d %d] ", x0, x1);
+
+				tgtidx[1] = x0 + raidz_parity(rm);
+				tgtidx[2] = x1 + raidz_parity(rm);
+
+				corrupt_colums(rm, tgtidx+1, 2);
+
+				if (!opts->rto_sanity)
+					vdev_raidz_reconstruct(rm, tgtidx, 3);
+
+				if (cmp_data(opts, rm) != 0) {
+					err++;
+					LOG(D_DEBUG, "\nREC D[%d %d]... "
+					    "[FAIL]\n", x0, x1);
+				}
+			}
+		}
+	} else {
+		/* can reconstruct 3 failed data disk */
+		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+			if (x0 >= rm->rm_cols - raidz_parity(rm))
+				continue;
+			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
+				if (x1 >= rm->rm_cols - raidz_parity(rm))
+					continue;
+				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
+					if (x2 >=
+					    rm->rm_cols - raidz_parity(rm))
+						continue;
+
+					/* Check if should stop */
+					if (rto_opts.rto_should_stop)
+						return (err);
+
+					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);
+
+					tgtidx[0] = x0 + raidz_parity(rm);
+					tgtidx[1] = x1 + raidz_parity(rm);
+					tgtidx[2] = x2 + raidz_parity(rm);
+
+					corrupt_colums(rm, tgtidx, 3);
+
+					if (!opts->rto_sanity)
+						vdev_raidz_reconstruct(rm,
+						    tgtidx, 3);
+
+					if (cmp_data(opts, rm) != 0) {
+						err++;
+						LOG(D_DEBUG,
+						    "\nREC D[%d %d %d]... "
+						    "[FAIL]\n", x0, x1, x2);
+					}
+				}
+			}
+		}
+	}
+	return (err);
+}
+
+static int
+run_rec_check(raidz_test_opts_t *opts)
+{
+	char **impl_name;
+	unsigned fn, err = 0;
+	zio_t *zio_test;
+	raidz_map_t *rm_test;
+
+	err = init_raidz_golden_map(opts, PARITY_PQR);
+	if (0 != err)
+		return (err);
+
+	LOG(D_INFO, DBLSEP);
+	LOG(D_INFO, "Testing data reconstruction...\n");
+	/* entry 0 ("original") produced the golden map, so start at 1 */
+	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
+	    impl_name++) {
+
+		LOG(D_INFO, SEP);
+		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
+
+		if (vdev_raidz_impl_set(*impl_name) != 0) {
+			LOG(D_INFO, "[SKIP]\n");
+			continue;
+		} else
+			LOG(D_INFO, "[SUPPORTED]\n");
+
+		/* one triple-parity map is shared by all methods below */
+		/* create suitable raidz_map */
+		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
+		/* generate parity */
+		vdev_raidz_generate_parity(rm_test);
+
+		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
+
+			LOG(D_INFO, "\t\tTesting method [%s] ...",
+			    raidz_rec_name[fn]);
+
+			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
+				LOG(D_INFO, "[FAIL]\n");
+				err++;
+
+			} else
+				LOG(D_INFO, "[PASS]\n");
+
+		}
+		/* tear down test raidz_map */
+		fini_raidz_map(&zio_test, &rm_test);
+	}
+
+	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+
+	return (err);
+}
+
+static int
+run_test(raidz_test_opts_t *opts)
+{
+	int err = 0;
+	/* NULL selects the global options parsed from the command line */
+	if (opts == NULL)
+		opts = &rto_opts;
+
+	print_opts(opts, B_FALSE);
+	/* nonzero if the generation or the reconstruction check failed */
+	err |= run_gen_check(opts);
+	err |= run_rec_check(opts);
+
+	return (err);
+}
+
+#define	SWEEP_RUNNING	0
+#define	SWEEP_FINISHED	1
+#define	SWEEP_ERROR	2
+#define	SWEEP_TIMEOUT	3
+
+static int sweep_state = 0;
+static raidz_test_opts_t failed_opts;
+
+static kmutex_t sem_mtx;
+static kcondvar_t sem_cv;
+static int max_free_slots;
+static int free_slots;
+
+static void
+sweep_thread(void *arg)
+{
+	int err = 0;
+	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
+	VERIFY(opts != NULL);
+
+	err = run_test(opts);
+
+	if (rto_opts.rto_sanity) {
+		/* 25% chance that a sweep test fails */
+		if (rand() < (RAND_MAX/4))
+			err = 1;
+	}
+	/* remember the failing options for run_sweep() and flag the error */
+	if (0 != err) {
+		mutex_enter(&sem_mtx);
+		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
+		sweep_state = SWEEP_ERROR;
+		mutex_exit(&sem_mtx);
+	}
+
+	umem_free(opts, sizeof (raidz_test_opts_t));
+
+	/* release our slot and wake run_sweep(), which may be waiting */
+	mutex_enter(&sem_mtx);
+	free_slots++;
+	cv_signal(&sem_cv);
+	mutex_exit(&sem_mtx);
+
+	thread_exit();
+}
+
+static int
+run_sweep(void)
+{
+	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
+	static const size_t ashift_v[] = { 9, 12, 14 };
+	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
+		1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };
+	/* test every size/ashift/dcols combination in worker threads */
+	(void) setvbuf(stdout, NULL, _IONBF, 0);
+
+	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
+	    ARRAY_SIZE(dcols_v);
+	ulong_t tried_comb = 0;
+	hrtime_t time_diff, start_time = gethrtime();
+	raidz_test_opts_t *opts;
+	int a, d, s;
+	/* at most one worker per CPU (minimum two) runs at a time */
+	max_free_slots = free_slots = MAX(2, boot_ncpus);
+
+	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);
+
+	for (s = 0; s < ARRAY_SIZE(size_v); s++)
+	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
+	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {
+		/* skip I/O sizes smaller than 1 << ashift */
+		if (size_v[s] < (1 << ashift_v[a])) {
+			total_comb--;
+			continue;
+		}
+
+		if (++tried_comb % 20 == 0)
+			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);
+
+		/* wait for signal to start new thread */
+		mutex_enter(&sem_mtx);
+		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
+		    ddi_get_lbolt() + hz)) {
+
+			/* check if should stop the test (timeout) */
+			time_diff = (gethrtime() - start_time) / NANOSEC;
+			if (rto_opts.rto_sweep_timeout > 0 &&
+			    time_diff >= rto_opts.rto_sweep_timeout) {
+				sweep_state = SWEEP_TIMEOUT;
+				rto_opts.rto_should_stop = B_TRUE;
+				mutex_exit(&sem_mtx);
+				goto exit;
+			}
+
+			/* check if should stop the test (error) */
+			if (sweep_state != SWEEP_RUNNING) {
+				mutex_exit(&sem_mtx);
+				goto exit;
+			}
+
+			/* exit loop if a slot is available */
+			if (free_slots > 0) {
+				break;
+			}
+		}
+
+		free_slots--;
+		mutex_exit(&sem_mtx);
+		/* opts is freed by the worker; 1ULL avoids int overflow below */
+		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
+		opts->rto_ashift = ashift_v[a];
+		opts->rto_dcols = dcols_v[d];
+		opts->rto_offset = (1ULL << ashift_v[a]) * rand();
+		opts->rto_dsize = size_v[s];
+		opts->rto_v = 0; /* be quiet */
+
+		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
+		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
+	}
+
+exit:
+	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
+	mutex_enter(&sem_mtx);
+	VERIFY(free_slots <= max_free_slots);
+	while (free_slots < max_free_slots) {
+		(void) cv_wait(&sem_cv, &sem_mtx);
+	}
+	mutex_exit(&sem_mtx);
+
+	if (sweep_state == SWEEP_ERROR) {
+		ERR("Sweep test failed! Failed option: \n");
+		print_opts(&failed_opts, B_TRUE);
+	} else {
+		if (sweep_state == SWEEP_TIMEOUT)
+			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
+			    (ulong_t)rto_opts.rto_sweep_timeout);
+
+		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
+		    (ulong_t)tried_comb);
+	}
+
+	mutex_destroy(&sem_mtx);
+	/* a timeout is not treated as a failure */
+	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
+}
+
+int
+main(int argc, char **argv)
+{
+	size_t i;
+	struct sigaction action;
+	int err = 0;
+
+	/* init gdb string early (used by sig_handler); bounded by the buffer */
+	(void) snprintf(gdb, sizeof (gdb), gdb_tmpl, getpid());
+
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	dprintf_setup(&argc, argv);
+
+	process_options(argc, argv);
+
+	kernel_init(SPA_MODE_READ);
+
+	/* setup random data because rand() is not reentrant */
+	rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	srand((unsigned)time(NULL) * getpid());
+	for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
+		rand_data[i] = rand();
+	/* best effort: make the shared random data read-only */
+	(void) mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);
+
+	if (rto_opts.rto_benchmark) {
+		run_raidz_benchmark();
+	} else if (rto_opts.rto_sweep) {
+		err = run_sweep();
+	} else {
+		err = run_test(NULL);
+	}
+
+	umem_free(rand_data, SPA_MAXBLOCKSIZE);
+	kernel_fini();
+
+	return (err);
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
new file mode 100644
index 000000000000..09c825ae43c7
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef	RAIDZ_TEST_H
+#define	RAIDZ_TEST_H
+
+#include <sys/spa.h>
+
+/*
+ * NULL-terminated table of raidz implementation names.
+ * NOTE(review): because the table is static and lives in a header, every
+ * file that includes this header gets its own private copy.
+ */
+static const char *raidz_impl_names[] = {
+	"original",
+	"scalar",
+	"sse2",
+	"ssse3",
+	"avx2",
+	"avx512f",
+	"avx512bw",
+	"aarch64_neon",
+	"aarch64_neonx2",
+	"powerpc_altivec",
+	NULL
+};
+
+/*
+ * Options for one raidz_test run.  The process-wide instance is the
+ * extern rto_opts declared below.
+ */
+typedef struct raidz_test_opts {
+	size_t rto_ashift;
+	size_t rto_offset;
+	size_t rto_dcols;
+	size_t rto_dsize;
+	size_t rto_v;		/* verbosity; LOG() prints if >= level */
+	size_t rto_sweep;	/* non-zero: main() runs the sweep */
+	size_t rto_sweep_timeout; /* reported in seconds by the sweep */
+	size_t rto_benchmark;	/* non-zero: main() runs the benchmark */
+	size_t rto_sanity;
+	size_t rto_gdb;
+
+	/* non-user options */
+	boolean_t rto_should_stop;
+
+	zio_t *zio_golden;
+	raidz_map_t *rm_golden;
+} raidz_test_opts_t;
+
+/*
+ * Default option values.  Members without an initializer here
+ * (rto_sweep_timeout, zio_golden, rm_golden) are implicitly zero/NULL.
+ */
+static const raidz_test_opts_t rto_opts_defaults = {
+	.rto_ashift = 9,
+	.rto_offset = 1ULL << 0,
+	.rto_dcols = 8,
+	.rto_dsize = 1<<19,
+	.rto_v = 0,
+	.rto_sweep = 0,
+	.rto_benchmark = 0,
+	.rto_sanity = 0,
+	.rto_gdb = 0,
+	.rto_should_stop = B_FALSE
+};
+
+extern raidz_test_opts_t rto_opts;
+
+/*
+ * Integer base-2 logarithm, rounded down: the number of times a can be
+ * halved before it drops to 1 or below.  Returns 0 for a == 0 and a == 1.
+ */
+static inline size_t ilog2(size_t a)
+{
+	size_t halvings = 0;
+
+	while (a > 1) {
+		a >>= 1;
+		halvings++;
+	}
+	return (halvings);
+}
+
+
+#define	D_ALL	0
+#define	D_INFO	1
+#define	D_DEBUG	2
+
+/*
+ * Print a printf-style message to stdout when the global verbosity
+ * (rto_opts.rto_v) is at least lvl.  The level is parenthesized so that
+ * any expression may be passed.  The expansion is kept as a plain brace
+ * block, as before, so existing call sites are unaffected.
+ */
+#define	LOG(lvl, a...)				\
+{						\
+	if (rto_opts.rto_v >= (lvl))		\
+		(void) fprintf(stdout, a);	\
+}
+
+/*
+ * Same as LOG(), but the verbosity is read from the options structure
+ * pointed to by opt.  opt is parenthesized so that expressions such as
+ * &opts are dereferenced as a whole.
+ */
+#define	LOG_OPT(lvl, opt, a...)			\
+{						\
+	if ((opt)->rto_v >= (lvl))		\
+		(void) fprintf(stdout, a);	\
+}
+
+#define	ERR(a...)	(void) fprintf(stderr, a)
+
+
+#define	DBLSEP "================\n"
+#define	SEP    "----------------\n"
+
+
+#define	raidz_alloc(size)	abd_alloc(size, B_FALSE)
+#define	raidz_free(p, size)	abd_free(p)
+
+
+void init_zio_abd(zio_t *zio);
+
+void run_raidz_benchmark(void);
+
+#endif /* RAIDZ_TEST_H */
diff --git a/sys/contrib/openzfs/cmd/vdev_id/Makefile.am b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am
new file mode 100644
index 000000000000..fb815faad084
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am
@@ -0,0 +1 @@
+dist_udev_SCRIPTS = vdev_id
diff --git a/sys/contrib/openzfs/cmd/vdev_id/vdev_id b/sys/contrib/openzfs/cmd/vdev_id/vdev_id
new file mode 100755
index 000000000000..8a75e638b67e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/vdev_id/vdev_id
@@ -0,0 +1,605 @@
+#!/bin/sh
+#
+# vdev_id: udev helper to generate user-friendly names for JBOD disks
+#
+# This script parses the file /etc/zfs/vdev_id.conf to map a
+# physical path in a storage topology to a channel name.  The
+# channel name is combined with a disk enclosure slot number to
+# create an alias that reflects the physical location of the drive.
+# This is particularly helpful when it comes to tasks like replacing
+# failed drives.  Slot numbers may also be re-mapped in case the
+# default numbering is unsatisfactory.  The drive aliases will be
+# created as symbolic links in /dev/disk/by-vdev.
+#
+# The currently supported topologies are sas_direct, sas_switch and scsi.
+# A multipath mode is supported in which dm-mpath devices are
+# handled by examining the first-listed running component disk.  In
+# multipath mode the configuration file should contain a channel
+# definition with the same name for each path to a given enclosure.
+#
+# The alias keyword provides a simple way to map already-existing
+# device symlinks to more convenient names.  It is suitable for
+# small, static configurations or for sites that have some automated
+# way to generate the mapping file.
+#
+#
+# Some example configuration files are given below.
+
+# #
+# # Example vdev_id.conf - sas_direct.
+# #
+#
+# multipath     no
+# topology      sas_direct
+# phys_per_port 4
+# slot          bay
+#
+# #       PCI_ID  HBA PORT  CHANNEL NAME
+# channel 85:00.0 1         A
+# channel 85:00.0 0         B
+# channel 86:00.0 1         C
+# channel 86:00.0 0         D
+#
+# # Custom mapping for Channel A
+#
+# #    Linux      Mapped
+# #    Slot       Slot      Channel
+# slot 1          7         A
+# slot 2          10        A
+# slot 3          3         A
+# slot 4          6         A
+#
+# # Default mapping for B, C, and D
+# slot 1          4
+# slot 2          2
+# slot 3          1
+# slot 4          3
+
+# #
+# # Example vdev_id.conf - sas_switch
+# #
+#
+# topology      sas_switch
+#
+# #       SWITCH PORT  CHANNEL NAME
+# channel 1            A
+# channel 2            B
+# channel 3            C
+# channel 4            D
+
+# #
+# # Example vdev_id.conf - multipath
+# #
+#
+# multipath yes
+#
+# #       PCI_ID  HBA PORT  CHANNEL NAME
+# channel 85:00.0 1         A
+# channel 85:00.0 0         B
+# channel 86:00.0 1         A
+# channel 86:00.0 0         B
+
+# #
+# # Example vdev_id.conf - alias
+# #
+#
+# #     by-vdev
+# #     name     fully qualified or base name of device link
+# alias d1       /dev/disk/by-id/wwn-0x5000c5002de3b9ca
+# alias d2       wwn-0x5000c5002def789e
+
+PATH=/bin:/sbin:/usr/bin:/usr/sbin
+CONFIG=/etc/zfs/vdev_id.conf
+PHYS_PER_PORT=
+DEV=
+MULTIPATH=
+TOPOLOGY=
+BAY=
+
+# Print the command-line summary to stdout and exit with status 0.
+# The defaults shown are the values of CONFIG, TOPOLOGY and
+# PHYS_PER_PORT at the time of the call.
+usage() {
+	cat << EOF
+Usage: vdev_id [-h]
+       vdev_id <-d device> [-c config_file] [-p phys_per_port]
+               [-g sas_direct|sas_switch|scsi] [-m]
+       vdev_id [-c config_file] -e
+
+  -c    specify name of an alternative config file [default=$CONFIG]
+  -d    specify basename of device (i.e. sda)
+  -e    Create enclosure device symlinks only (/dev/by-enclosure)
+  -g    Storage network topology [default="$TOPOLOGY"]
+  -m    Run in multipath mode
+  -p    number of phy's per switch port [default=$PHYS_PER_PORT]
+  -h    show this summary
+EOF
+	exit 0
+}
+
+# Print the slot number that $CONFIG maps Linux slot $1 to on channel $2.
+# The first "slot" line whose second column equals $1 and whose channel
+# column is either $2 or absent wins.  If no line matches, the Linux
+# slot number itself is printed.
+map_slot() {
+	LINUX_SLOT=$1
+	CHANNEL=$2
+
+	# Hand the values to awk with -v instead of splicing them into the
+	# program text: an empty slot then no longer produces an awk syntax
+	# error, and the channel name is compared literally rather than
+	# being interpreted as a regular expression.
+	MAPPED_SLOT=$(awk -v slot="$LINUX_SLOT" -v chan="$CHANNEL" '
+		$1 == "slot" && $2 == slot &&
+		    (($4 "") == (chan "") || $4 == "") {
+			print $3
+			exit
+		}' "$CONFIG")
+	if [ -z "$MAPPED_SLOT" ] ; then
+		MAPPED_SLOT=$LINUX_SLOT
+	fi
+	# Deliberately unquoted: with an empty value printf gets no
+	# argument and prints 0.
+	printf "%d" ${MAPPED_SLOT}
+}
+
+map_channel() {
+	MAPPED_CHAN=
+	PCI_ID=$1
+	PORT=$2
+
+	case $TOPOLOGY in
+		"sas_switch")
+		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \
+			{ print \\$3; exit }" $CONFIG`
+		;;
+		"sas_direct"|"scsi")
+		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \
+			\\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \
+			{ print \\$4; exit }" $CONFIG`
+		;;
+	esac
+	printf "%s" ${MAPPED_CHAN}
+}
+
+# Build the by-vdev name for $DEV in sas_direct / sas_switch topology.
+# Walks the device's sysfs path to find the SCSI host, the HBA/switch
+# port and the slot (selected by $BAY), maps them through $CONFIG with
+# map_channel/map_slot and prints "<channel><slot>[-partN]".  Prints
+# nothing when any of the pieces cannot be determined.
+sas_handler() {
+	if [ -z "$PHYS_PER_PORT" ] ; then
+		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
+	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
+		exit 1
+	fi
+
+	if [ -z "$MULTIPATH_MODE" ] ; then
+		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+
+	# Use first running component device if we're handling a dm-mpath device
+	if [ "$MULTIPATH_MODE" = "yes" ] ; then
+		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
+		if [ -z "$DM_NAME" ] ; then
+			DM_NAME=`ls -l --full-time /dev/mapper |
+				awk "/\/$DEV$/{print \\$9}"`
+		fi
+
+		# For raw disks udev exports DEVTYPE=partition when
+		# handling partitions, and the rules can be written to
+		# take advantage of this to append a -part suffix.  For
+		# dm devices we get DEVTYPE=disk even for partitions so
+		# we have to append the -part suffix directly in the
+		# helper.
+		if [ "$DEVTYPE" != "partition" ] ; then
+			# Only a trailing p<digits> is a partition suffix.
+			# Splitting on every 'p' would turn a name such
+			# as "mpatha" into the bogus suffix "-partatha".
+			PART=`echo $DM_NAME | awk '{
+				if (match($0, /p[0-9]+$/))
+					print "-part" substr($0, RSTART + 1)
+			}'`
+		fi
+
+		# Strip off partition information.
+		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+		if [ -z "$DM_NAME" ] ; then
+			return
+		fi
+
+		# Get the raw scsi device name from multipath -ll. Strip off
+		# leading pipe symbols to make field numbering consistent.
+		DEV=`multipath -ll $DM_NAME |
+			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+		if [ -z "$DEV" ] ; then
+			return
+		fi
+	fi
+
+	if echo $DEV | grep -q ^/devices/ ; then
+		sys_path=$DEV
+	else
+		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+	fi
+
+	# Use positional parameters as an ad-hoc array
+	set -- $(echo "$sys_path" | tr / ' ')
+	num_dirs=$#
+	scsi_host_dir="/sys"
+
+	# Get path up to /sys/.../hostX
+	i=1
+	while [ $i -le $num_dirs ] ; do
+		d=$(eval echo \${$i})
+		scsi_host_dir="$scsi_host_dir/$d"
+		echo $d | grep -q -E '^host[0-9]+$' && break
+		i=$(($i + 1))
+	done
+
+	# Give up if hostX is the last component or was not found at all.
+	# In the latter case the loop leaves i at num_dirs + 1, which an
+	# equality test against num_dirs would miss.
+	if [ $i -ge $num_dirs ] ; then
+		return
+	fi
+
+	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+
+	# In sas_switch mode, the directory four levels beneath
+	# /sys/.../hostX contains symlinks to phy devices that reveal
+	# the switch port number.  In sas_direct mode, the phy links one
+	# directory down reveal the HBA port.
+	port_dir=$scsi_host_dir
+	case $TOPOLOGY in
+		"sas_switch") j=$(($i + 4)) ;;
+		"sas_direct") j=$(($i + 1)) ;;
+	esac
+
+	i=$(($i + 1))
+	while [ $i -le $j ] ; do
+		port_dir="$port_dir/$(eval echo \${$i})"
+		i=$(($i + 1))
+	done
+
+	PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'`
+	if [ -z "$PHY" ] ; then
+		PHY=0
+	fi
+	PORT=$(( $PHY / $PHYS_PER_PORT ))
+
+	# Look in /sys/.../sas_device/end_device-X for the bay_identifier
+	# attribute.
+	end_device_dir=$port_dir
+	while [ $i -lt $num_dirs ] ; do
+		d=$(eval echo \${$i})
+		end_device_dir="$end_device_dir/$d"
+		if echo $d | grep -q '^end_device' ; then
+			end_device_dir="$end_device_dir/sas_device/$d"
+			break
+		fi
+		i=$(($i + 1))
+	done
+
+	# $BAY selects which attribute of the device supplies the slot.
+	SLOT=
+	case $BAY in
+	"bay")
+		SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null`
+		;;
+	"phy")
+		SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null`
+		;;
+	"port")
+		d=$(eval echo \${$i})
+		SLOT=`echo $d | sed -e 's/^.*://'`
+		;;
+	"id")
+		i=$(($i + 1))
+		d=$(eval echo \${$i})
+		SLOT=`echo $d | sed -e 's/^.*://'`
+		;;
+	"lun")
+		i=$(($i + 2))
+		d=$(eval echo \${$i})
+		SLOT=`echo $d | sed -e 's/^.*://'`
+		;;
+	"ses")
+		# look for this SAS path in all SCSI Enclosure Services
+		# (SES) enclosures
+		sas_address=`cat $end_device_dir/sas_address 2>/dev/null`
+		enclosures=`lsscsi -g | \
+			sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'`
+		for enclosure in $enclosures; do
+			set -- $(sg_ses -p aes $enclosure | \
+				awk "/device slot number:/{slot=\$12} \
+					/SAS address: $sas_address/\
+					{print slot}")
+			SLOT=$1
+			if [ -n "$SLOT" ] ; then
+				break
+			fi
+		done
+		;;
+	esac
+	if [ -z "$SLOT" ] ; then
+		return
+	fi
+
+	CHAN=`map_channel $PCI_ID $PORT`
+	SLOT=`map_slot $SLOT $CHAN`
+	if [ -z "$CHAN" ] ; then
+		return
+	fi
+	echo ${CHAN}${SLOT}${PART}
+}
+
+scsi_handler() {
+	if [ -z "$FIRST_BAY_NUMBER" ] ; then
+		FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+	FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0}
+
+	if [ -z "$PHYS_PER_PORT" ] ; then
+		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
+	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
+		exit 1
+	fi
+
+	if [ -z "$MULTIPATH_MODE" ] ; then
+		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+
+	# Use first running component device if we're handling a dm-mpath device
+	if [ "$MULTIPATH_MODE" = "yes" ] ; then
+		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
+		if [ -z "$DM_NAME" ] ; then
+			DM_NAME=`ls -l --full-time /dev/mapper |
+				awk "/\/$DEV$/{print \\$9}"`
+		fi
+
+		# For raw disks udev exports DEVTYPE=partition when
+		# handling partitions, and the rules can be written to
+		# take advantage of this to append a -part suffix.  For
+		# dm devices we get DEVTYPE=disk even for partitions so
+		# we have to append the -part suffix directly in the
+		# helper.
+		if [ "$DEVTYPE" != "partition" ] ; then
+			PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+		fi
+
+		# Strip off partition information.
+		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+		if [ -z "$DM_NAME" ] ; then
+			return
+		fi
+
+		# Get the raw scsi device name from multipath -ll. Strip off
+		# leading pipe symbols to make field numbering consistent.
+		DEV=`multipath -ll $DM_NAME |
+			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+		if [ -z "$DEV" ] ; then
+			return
+		fi
+	fi
+
+	if echo $DEV | grep -q ^/devices/ ; then
+		sys_path=$DEV
+	else
+		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+	fi
+
+	# expect sys_path like this, for example:
+	# /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv
+
+	# Use positional parameters as an ad-hoc array
+	set -- $(echo "$sys_path" | tr / ' ')
+	num_dirs=$#
+	scsi_host_dir="/sys"
+
+	# Get path up to /sys/.../hostX
+	i=1
+	while [ $i -le $num_dirs ] ; do
+		d=$(eval echo \${$i})
+		scsi_host_dir="$scsi_host_dir/$d"
+		echo $d | grep -q -E '^host[0-9]+$' && break
+		i=$(($i + 1))
+	done
+
+	if [ $i = $num_dirs ] ; then
+		return
+	fi
+
+	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+
+	# In scsi mode, the directory two levels beneath
+	# /sys/.../hostX reveals the port and slot.
+	port_dir=$scsi_host_dir
+	j=$(($i + 2))
+
+	i=$(($i + 1))
+	while [ $i -le $j ] ; do
+		port_dir="$port_dir/$(eval echo \${$i})"
+		i=$(($i + 1))
+	done
+
+	set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/')
+	PORT=$1
+	SLOT=$(($2 + $FIRST_BAY_NUMBER))
+
+	if [ -z "$SLOT" ] ; then
+		return
+	fi
+
+	CHAN=`map_channel $PCI_ID $PORT`
+	SLOT=`map_slot $SLOT $CHAN`
+	if [ -z "$CHAN" ] ; then
+		return
+	fi
+	echo ${CHAN}${SLOT}${PART}
+}
+
+# Figure out the name for the enclosure symlink.
+#
+# Resolves the enclosure behind udev's $DEVPATH, derives the PCI address
+# and port number of the HBA port it hangs off, and prints the matching
+# channel name from $CONFIG with a per-channel-name counter appended.
+# Prints an empty line when no channel matches; prints nothing when the
+# device is not an enclosure.
+enclosure_handler () {
+	# We get all the info we need from udev's DEVPATH variable:
+	#
+	# DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0
+
+	# Get the enclosure ID ("0:0:0:0")
+	ENC=$(basename $(readlink -m "/sys/$DEVPATH/../.."))
+	if [ ! -d /sys/class/enclosure/$ENC ] ; then
+		# Not an enclosure, bail out
+		return
+	fi
+
+	# Get the long sysfs device path to our enclosure. Looks like:
+	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0
+
+	ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC)
+
+	# Grab the full path to the hosts port dir:
+	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0
+	PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+')
+
+	# Get the port number
+	PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$")
+
+	# The PCI directory is two directories up from the port directory
+	# /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0
+	PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../.."))
+
+	# Strip down the PCI address from 0000:05:00.0 to 05:00.0.  The
+	# domain is hexadecimal, so hex letters must be accepted as well.
+	PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9A-Fa-f]+://')
+
+	# Name our device according to vdev_id.conf (like "L0" or "U1").
+	NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \
+		\$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG)
+
+	echo "${NAME}"
+}
+
+# Print the alias that $CONFIG defines for one of the device's $DEVLINKS,
+# with a -partN suffix appended for partitions of device-mapper devices.
+# Prints nothing when no alias matches.
+alias_handler () {
+	# Special handling is needed to correctly append a -part suffix
+	# to partitions of device mapper devices.  The DEVTYPE attribute
+	# is normally set to "disk" instead of "partition" in this case,
+	# so the udev rules won't handle that for us as they do for
+	# "plain" block devices.
+	#
+	# For example, we may have the following links for a device and its
+	# partitions,
+	#
+	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0   -> ../../dm-0
+	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1
+	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3
+	#
+	# and the following alias in vdev_id.conf.
+	#
+	#   alias A0 dm-name-isw_dibgbfcije_ARRAY0
+	#
+	# The desired outcome is for the following links to be created
+	# without having explicitly defined aliases for the partitions.
+	#
+	#  /dev/disk/by-vdev/A0       -> ../../dm-0
+	#  /dev/disk/by-vdev/A0-part1 -> ../../dm-1
+	#  /dev/disk/by-vdev/A0-part2 -> ../../dm-3
+	#
+	# Warning: The following grep pattern will misidentify whole-disk
+	#          devices whose names end with 'p' followed by a string of
+	#          digits as partitions, causing alias creation to fail. This
+	#          ambiguity seems unavoidable, so devices using this facility
+	#          must not use such names.
+	DM_PART=
+	if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then
+		if [ "$DEVTYPE" != "partition" ] ; then
+			# Take the digits after the trailing 'p' only.
+			# Splitting on every 'p' would pick the wrong field
+			# for names that contain an earlier 'p'.
+			DM_PART=`echo $DM_NAME | awk '{
+				if (match($0, /p[0-9]+$/))
+					print "-part" substr($0, RSTART + 1)
+			}'`
+		fi
+	fi
+
+	# DEVLINKS attribute must have been populated by already-run udev rules.
+	for link in $DEVLINKS ; do
+		# Remove partition information to match key of top-level device.
+		if [ -n "$DM_PART" ] ; then
+			link=`echo $link | sed 's/p[0-9][0-9]*$//'`
+		fi
+		# Check both the fully qualified and the base name of link.
+		for l in $link `basename $link` ; do
+			alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \
+					{ print \\$2; exit }" $CONFIG`
+			if [ -n "$alias" ] ; then
+				echo ${alias}${DM_PART}
+				return
+			fi
+		done
+	done
+}
+
+# Parse the command line.  -e is only recorded here and evaluated after
+# the loop, so that it reads the config file named by -c no matter in
+# which order the two options were given.
+ENCLOSURE_OPT=
+while getopts 'c:d:eg:mp:h' OPTION; do
+	case ${OPTION} in
+	c)
+		CONFIG=${OPTARG}
+		;;
+	d)
+		DEV=${OPTARG}
+		;;
+	e)
+		ENCLOSURE_OPT=yes
+		;;
+	g)
+		TOPOLOGY=$OPTARG
+		;;
+	p)
+		PHYS_PER_PORT=${OPTARG}
+		;;
+	m)
+		MULTIPATH_MODE=yes
+		;;
+	h)
+		usage
+		;;
+	esac
+done
+
+# When udev sees a scsi_generic device, it calls this script with -e to
+# create the enclosure device symlinks only.  We also need
+# "enclosure_symlinks yes" set in vdev_id.conf to actually create the
+# symlink.
+if [ -n "$ENCLOSURE_OPT" ] ; then
+	ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' \
+	    "$CONFIG")
+	if [ "$ENCLOSURE_MODE" != "yes" ] ; then
+		exit 0
+	fi
+fi
+
+# Without a readable config file there is nothing to do.
+if [ ! -r $CONFIG ] ; then
+	exit 0
+fi
+
+# A device is mandatory unless we run in enclosure mode.
+if [ -z "${DEV}${ENCLOSURE_MODE}" ] ; then
+	echo "Error: missing required option -d"
+	exit 1
+fi
+
+# Settings not given on the command line come from the config file.
+if [ -z "$TOPOLOGY" ] ; then
+	TOPOLOGY=$(awk '$1 == "topology" {print $2; exit}' $CONFIG)
+fi
+
+if [ -z "$BAY" ] ; then
+	BAY=$(awk '$1 == "slot" {print $2; exit}' $CONFIG)
+fi
+
+TOPOLOGY=${TOPOLOGY:-sas_direct}
+
+# Should we create /dev/by-enclosure symlinks?
+if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then
+	ID_ENCLOSURE=$(enclosure_handler)
+	if [ -z "$ID_ENCLOSURE" ] ; then
+		exit 0
+	fi
+
+	# Just create the symlinks to the enclosure devices and then exit.
+	ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG)
+	ENCLOSURE_PREFIX=${ENCLOSURE_PREFIX:-enc}
+	echo "ID_ENCLOSURE=$ID_ENCLOSURE"
+	echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE"
+	exit 0
+fi
+
+# An explicit alias wins; otherwise fall back to the topology handler.
+ID_VDEV=$(alias_handler)
+
+if [ -z "$ID_VDEV" ] ; then
+	BAY=${BAY:-bay}
+	case $TOPOLOGY in
+		sas_direct|sas_switch)
+			ID_VDEV=$(sas_handler)
+			;;
+		scsi)
+			ID_VDEV=$(scsi_handler)
+			;;
+		*)
+			echo "Error: unknown topology $TOPOLOGY"
+			exit 1
+			;;
+	esac
+fi
+
+# Emit the key=value pairs only when a name was found.
+if [ -n "$ID_VDEV" ] ; then
+	echo "ID_VDEV=${ID_VDEV}"
+	echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}"
+fi
diff --git a/sys/contrib/openzfs/cmd/zdb/.gitignore b/sys/contrib/openzfs/cmd/zdb/.gitignore
new file mode 100644
index 000000000000..f64a3fc5a160
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/.gitignore
@@ -0,0 +1 @@
+/zdb
diff --git a/sys/contrib/openzfs/cmd/zdb/Makefile.am b/sys/contrib/openzfs/cmd/zdb/Makefile.am
new file mode 100644
index 000000000000..b325cb060bd2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/Makefile.am
@@ -0,0 +1,16 @@
+include $(top_srcdir)/config/Rules.am
+
+# Unconditionally enable debugging for zdb
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = zdb
+
+zdb_SOURCES = \
+	zdb.c \
+	zdb_il.c \
+	zdb.h
+
+zdb_LDADD = \
+	$(abs_top_builddir)/lib/libzpool/libzpool.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
new file mode 100644
index 000000000000..e7211711a41c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -0,0 +1,8606 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015, 2017, Intel Corporation.
+ * Copyright (c) 2020 Datto Inc.
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ *     under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfs_fuid.h>
+#include <sys/arc.h>
+#include <sys/arc_impl.h>
+#include <sys/ddt.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_scan.h>
+#include <sys/btree.h>
+#include <zfs_comutil.h>
+#include <sys/zstd/zstd.h>
+
+#include <libnvpair.h>
+#include <libzutil.h>
+
+#include "zdb.h"
+
+/*
+ * Bounds-checked name lookups: yield the ci_name of entry idx in the
+ * compression / checksum table, or "UNKNOWN" when idx is outside the
+ * table.
+ */
+#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
+	zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
+	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+/*
+ * Fold an object type into the range [0, DMU_OT_NUMTYPES]: legacy types
+ * map to themselves, the DMU_OTN_ZAP_* types to DMU_OT_ZAP_OTHER, the
+ * DMU_OTN_UINT64_* types to DMU_OT_UINT64_OTHER, and anything else to
+ * DMU_OT_NUMTYPES.
+ */
+#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
+	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
+	DMU_OT_ZAP_OTHER : \
+	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
+	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
+
+/*
+ * Return a printable name for a DMU object type.  Types below
+ * DMU_OT_NUMTYPES are named by the dmu_ot table; types carrying the
+ * DMU_OT_NEWTYPE bit are named after their byteswap function, provided
+ * its index is valid.  Everything else is "UNKNOWN".
+ */
+static char *
+zdb_ot_name(dmu_object_type_t type)
+{
+	if (type < DMU_OT_NUMTYPES)
+		return (dmu_ot[type].ot_name);
+
+	if ((type & DMU_OT_NEWTYPE) == 0)
+		return ("UNKNOWN");
+
+	if ((type & DMU_OT_BYTESWAP_MASK) >= DMU_BSWAP_NUMFUNCS)
+		return ("UNKNOWN");
+
+	return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
+}
+
+extern int reference_tracking_enable;
+extern int zfs_recover;
+extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
+extern boolean_t spa_load_verify_dryrun;
+extern int zfs_reconstruct_indirect_combinations_max;
+extern int zfs_btree_verify_intensity;
+
+static const char cmdname[] = "zdb";
+uint8_t dump_opt[256];
+
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+uint64_t *zopt_metaslab = NULL;
+static unsigned zopt_metaslab_args = 0;
+
+typedef struct zopt_object_range {
+	uint64_t zor_obj_start;
+	uint64_t zor_obj_end;
+	uint64_t zor_flags;
+} zopt_object_range_t;
+zopt_object_range_t *zopt_object_ranges = NULL;
+static unsigned zopt_object_args = 0;
+
+static int flagbits[256];
+
+#define	ZOR_FLAG_PLAIN_FILE	0x0001
+#define	ZOR_FLAG_DIRECTORY	0x0002
+#define	ZOR_FLAG_SPACE_MAP	0x0004
+#define	ZOR_FLAG_ZAP		0x0008
+#define	ZOR_FLAG_ALL_TYPES	-1
+#define	ZOR_SUPPORTED_FLAGS	(ZOR_FLAG_PLAIN_FILE	| \
+				ZOR_FLAG_DIRECTORY	| \
+				ZOR_FLAG_SPACE_MAP	| \
+				ZOR_FLAG_ZAP)
+
+#define	ZDB_FLAG_CHECKSUM	0x0001
+#define	ZDB_FLAG_DECOMPRESS	0x0002
+#define	ZDB_FLAG_BSWAP		0x0004
+#define	ZDB_FLAG_GBH		0x0008
+#define	ZDB_FLAG_INDIRECT	0x0010
+#define	ZDB_FLAG_RAW		0x0020
+#define	ZDB_FLAG_PRINT_BLKPTR	0x0040
+#define	ZDB_FLAG_VERBOSE	0x0080
+
+uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
+static int leaked_objects = 0;
+static range_tree_t *mos_refd_objs;
+
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
+    boolean_t);
+static void mos_obj_refd(uint64_t);
+static void mos_obj_refd_multiple(uint64_t);
+static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
+    dmu_tx_t *tx);
+
+/*
+ * Working state for livelist verification.  It is handed as the callback
+ * argument to sublivelist_verify_func() and sublivelist_verify_blkptr().
+ * The first three trees are created and destroyed for every sub-livelist
+ * by sublivelist_verify_func(); sv_leftover is owned by the caller.
+ */
+typedef struct sublivelist_verify {
+	/* all ALLOC'd blkptr_t in one sub-livelist */
+	zfs_btree_t sv_all_allocs;
+
+	/* all FREE'd blkptr_t in one sub-livelist */
+	zfs_btree_t sv_all_frees;
+
+	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
+	zfs_btree_t sv_pair;
+
+	/* ALLOC's without a matching FREE, accumulates across sub-livelists */
+	zfs_btree_t sv_leftover;
+} sublivelist_verify_t;
+
+/*
+ * Btree comparator for blkptr_t entries.  Orders by the vdev of dva[0],
+ * then by the offset of dva[0], then by blk_birth.  Returns -1, 0 or +1.
+ */
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+	const blkptr_t *lbp = larg;
+	const blkptr_t *rbp = rarg;
+
+	/* Primary key: the vdev of dva[0] */
+	const uint64_t lvdev = DVA_GET_VDEV(&lbp->blk_dva[0]);
+	const uint64_t rvdev = DVA_GET_VDEV(&rbp->blk_dva[0]);
+	if (lvdev != rvdev)
+		return (lvdev < rvdev ? -1 : +1);
+
+	/* Secondary key: the offset of dva[0] */
+	const uint64_t loffset = DVA_GET_OFFSET(&lbp->blk_dva[0]);
+	const uint64_t roffset = DVA_GET_OFFSET(&rbp->blk_dva[0]);
+	if (loffset != roffset)
+		return (loffset < roffset ? -1 : +1);
+
+	/*
+	 * FREE/ALLOC pairs are stored without being cancelled out, so two
+	 * entries may share an offset; the birth txg breaks the tie.
+	 */
+	if (lbp->blk_birth != rbp->blk_birth)
+		return (lbp->blk_birth < rbp->blk_birth ? -1 : +1);
+
+	return (0);
+}
+
+/*
+ * One DVA of an ALLOC that had no matching FREE; this is the element
+ * type stored in sv_leftover and ordered by livelist_block_compare().
+ */
+typedef struct sublivelist_verify_block {
+	dva_t svb_dva;
+
+	/*
+	 * We need this to check if the block marked as allocated
+	 * in the livelist was freed (and potentially reallocated)
+	 * in the metaslab spacemaps at a later TXG.
+	 */
+	uint64_t svb_allocated_txg;
+} sublivelist_verify_block_t;
+
+static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+
+/*
+ * Per-blkptr callback passed to bpobj_iterate_nofree() by
+ * sublivelist_verify_func().  arg is the sublivelist_verify_t being
+ * filled in, and tx is asserted to be NULL.
+ *
+ * A FREE is recorded in sv_pair and in sv_all_frees.  An ALLOC either
+ * cancels a previously seen FREE in sv_pair or has each of its
+ * non-empty DVAs recorded in sv_leftover, and is then recorded in
+ * sv_all_allocs.  Duplicate FREEs and ALLOCs are reported on stdout.
+ * Always returns 0.
+ */
+static int
+sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
+    dmu_tx_t *tx)
+{
+	ASSERT3P(tx, ==, NULL);
+	struct sublivelist_verify *sv = arg;
+	char blkbuf[BP_SPRINTF_LEN];
+	/* insertion point filled in by zfs_btree_find(), reused by add_idx */
+	zfs_btree_index_t where;
+	if (free) {
+		zfs_btree_add(&sv->sv_pair, bp);
+		/* Check if the FREE is a duplicate */
+		if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
+			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+			    free);
+			(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+		} else {
+			zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+		}
+	} else {
+		/* Check if the ALLOC has been freed */
+		if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
+			zfs_btree_remove_idx(&sv->sv_pair, &where);
+		} else {
+			/* no FREE seen for it: remember every DVA in use */
+			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
+					break;
+				sublivelist_verify_block_t svb = {
+				    .svb_dva = bp->blk_dva[i],
+				    .svb_allocated_txg = bp->blk_birth
+				};
+
+				if (zfs_btree_find(&sv->sv_leftover, &svb,
+				    &where) == NULL) {
+					zfs_btree_add_idx(&sv->sv_leftover,
+					    &svb, &where);
+				}
+			}
+		}
+		/* Check if the ALLOC is a duplicate */
+		if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
+			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+			    free);
+			(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
+		} else {
+			zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Verify one sub-livelist (deadlist entry).  Creates the per-entry trees
+ * sv_all_allocs, sv_all_frees and sv_pair, runs
+ * sublivelist_verify_blkptr() over every block pointer of the entry's
+ * bpobj, and tears the trees down again.  Every FREE still left in
+ * sv_pair is reported as unmatched.  sv_leftover is neither created nor
+ * destroyed here, so the caller must supply it.  Returns the result of
+ * bpobj_iterate_nofree().
+ */
+static int
+sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
+{
+	int err;
+	char blkbuf[BP_SPRINTF_LEN];
+	struct sublivelist_verify *sv = args;
+
+	zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
+	    sizeof (blkptr_t));
+
+	zfs_btree_create(&sv->sv_all_frees, livelist_compare,
+	    sizeof (blkptr_t));
+
+	zfs_btree_create(&sv->sv_pair, livelist_compare,
+	    sizeof (blkptr_t));
+
+	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
+	    sv, NULL);
+
+	zfs_btree_clear(&sv->sv_all_allocs);
+	zfs_btree_destroy(&sv->sv_all_allocs);
+
+	zfs_btree_clear(&sv->sv_all_frees);
+	zfs_btree_destroy(&sv->sv_all_frees);
+
+	/* drain sv_pair, reporting each FREE that never met its ALLOC */
+	blkptr_t *e;
+	zfs_btree_index_t *cookie = NULL;
+	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
+		(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+	}
+	zfs_btree_destroy(&sv->sv_pair);
+
+	return (err);
+}
+
+/*
+ * Btree comparator for sublivelist_verify_block_t entries.  Orders by
+ * the DVA's vdev, then its offset, then its asize.  Returns -1, 0 or +1.
+ */
+static int
+livelist_block_compare(const void *larg, const void *rarg)
+{
+	const dva_t *ldva = &((const sublivelist_verify_block_t *)larg)->svb_dva;
+	const dva_t *rdva = &((const sublivelist_verify_block_t *)rarg)->svb_dva;
+
+	const uint64_t lvdev = DVA_GET_VDEV(ldva);
+	const uint64_t rvdev = DVA_GET_VDEV(rdva);
+	if (lvdev != rvdev)
+		return (lvdev < rvdev ? -1 : +1);
+
+	const uint64_t loffset = DVA_GET_OFFSET(ldva);
+	const uint64_t roffset = DVA_GET_OFFSET(rdva);
+	if (loffset != roffset)
+		return (loffset < roffset ? -1 : +1);
+
+	const uint64_t lasize = DVA_GET_ASIZE(ldva);
+	const uint64_t rasize = DVA_GET_ASIZE(rdva);
+	if (lasize != rasize)
+		return (lasize < rasize ? -1 : +1);
+
+	return (0);
+}
+
+/*
+ * Run sublivelist_verify_func() over every entry of the livelist dl.
+ * arg is the caller's sublivelist_verify_t; ALLOCs that are never freed
+ * accumulate in its sv_leftover tree.
+ */
+static void
+livelist_verify(dsl_deadlist_t *dl, void *arg)
+{
+	dsl_deadlist_iterate(dl, sublivelist_verify_func, arg);
+}
+
+/*
+ * Verify a single livelist entry using a throw-away verification state:
+ * errors are reported, but the leftover ALLOCs are discarded instead of
+ * being handed back to the caller.  args is unused.
+ */
+/* ARGSUSED */
+static int
+sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
+{
+	sublivelist_verify_t scratch;
+	int result;
+
+	zfs_btree_create(&scratch.sv_leftover, livelist_block_compare,
+	    sizeof (sublivelist_verify_block_t));
+
+	result = sublivelist_verify_func(&scratch, dle);
+
+	zfs_btree_clear(&scratch.sv_leftover);
+	zfs_btree_destroy(&scratch.sv_leftover);
+
+	return (result);
+}
+
+/*
+ * Per-metaslab state used when checking leftover livelist ALLOCs
+ * against what is allocated in a metaslab.
+ */
+typedef struct metaslab_verify {
+	/*
+	 * Tree containing all the leftover ALLOCs from the livelists
+	 * that are part of this metaslab.
+	 */
+	zfs_btree_t mv_livelist_allocs;
+
+	/*
+	 * Metaslab information.
+	 * mv_vdid is the vdev id used to build lookup DVAs.
+	 * NOTE(review): mv_msid / mv_start / mv_end are presumably the
+	 * metaslab id and its offset range -- confirm against the users.
+	 */
+	uint64_t mv_vdid;
+	uint64_t mv_msid;
+	uint64_t mv_start;
+	uint64_t mv_end;
+
+	/*
+	 * What's currently allocated for this metaslab.
+	 */
+	range_tree_t *mv_allocated;
+} metaslab_verify_t;
+
+/* Function type accepted by iterate_deleted_livelists(). */
+typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
+
+/*
+ * Callback type for iterate_through_spacemap_logs(): called once per log
+ * space map entry, with the txg of the log the entry was read from and the
+ * caller's opaque argument.
+ */
+typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
+    void *arg);
+
+/*
+ * Bundle passed through space_map_iterate() so that
+ * iterate_through_spacemap_logs_cb() can call a zdb_log_sm_cb_t.
+ */
+typedef struct unflushed_iter_cb_arg {
+	spa_t *uic_spa;		/* pool handed to uic_cb */
+	uint64_t uic_txg;	/* sls_txg of the log being iterated */
+	void *uic_arg;		/* caller's opaque argument for uic_cb */
+	zdb_log_sm_cb_t uic_cb;	/* callback to invoke for each entry */
+} unflushed_iter_cb_arg_t;
+
+/*
+ * space_map_iterate() adapter: unpack the unflushed_iter_cb_arg_t in arg and
+ * forward the entry to the zdb_log_sm_cb_t stored in it, returning whatever
+ * that callback returns.
+ */
+static int
+iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
+{
+	unflushed_iter_cb_arg_t *ctx = arg;
+	spa_t *spa = ctx->uic_spa;
+	uint64_t txg = ctx->uic_txg;
+
+	return (ctx->uic_cb(spa, sme, txg, ctx->uic_arg));
+}
+
+/*
+ * Invoke cb(spa, entry, log txg, arg) for every entry of every log space
+ * map listed in spa->spa_sm_logs_by_txg.  Does nothing when the log space
+ * map feature is not active.  Failure to open or iterate a log is fatal
+ * (VERIFY0).
+ */
+static void
+iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
+{
+	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	spa_log_sm_t *log = avl_first(&spa->spa_sm_logs_by_txg);
+	while (log != NULL) {
+		unflushed_iter_cb_arg_t ctx = {
+			.uic_spa = spa,
+			.uic_txg = log->sls_txg,
+			.uic_arg = arg,
+			.uic_cb = cb
+		};
+		space_map_t *log_sm = NULL;
+
+		VERIFY0(space_map_open(&log_sm, spa_meta_objset(spa),
+		    log->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+		VERIFY0(space_map_iterate(log_sm, space_map_length(log_sm),
+		    iterate_through_spacemap_logs_cb, &ctx));
+		space_map_close(log_sm);
+
+		log = AVL_NEXT(&spa->spa_sm_logs_by_txg, log);
+	}
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * Called for a range [offset, offset + size) that a space map freed at
+ * TXG txg.  Reports every entry of mv->mv_livelist_allocs on this
+ * metaslab's vdev that starts at or after the probe <vdev, offset, size>
+ * in tree order and before offset + size, and whose recorded allocation
+ * TXG is not later than txg.
+ */
+static void
+verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
+    uint64_t offset, uint64_t size)
+{
+	/* Probe key: only the DVA fields read by the comparator are set. */
+	sublivelist_verify_block_t svb;
+	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
+	DVA_SET_OFFSET(&svb.svb_dva, offset);
+	DVA_SET_ASIZE(&svb.svb_dva, size);
+	zfs_btree_index_t where;
+	uint64_t end_offset = offset + size;
+
+	/*
+	 *  Look for an exact match for spacemap entry in the livelist entries.
+	 *  Then, look for other livelist entries that fall within the range
+	 *  of the spacemap entry as it may have been condensed
+	 */
+	sublivelist_verify_block_t *found =
+	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
+	if (found == NULL) {
+		/* No exact match: continue from the position find reported. */
+		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
+	}
+	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
+	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
+	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+		if (found->svb_allocated_txg <= txg) {
+			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
+			    "from TXG %llx FREED at TXG %llx\n",
+			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
+			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
+			    (u_longlong_t)found->svb_allocated_txg,
+			    (u_longlong_t)txg);
+		}
+	}
+}
+
+/*
+ * space_map_iterate() callback that replays one space map entry against
+ * mv->mv_allocated.  An ALLOC of a range that is already present, or a
+ * FREE of a range that is not, is reported instead of being applied.
+ * Every FREE is additionally cross-checked against the livelist ALLOCs
+ * recorded for this metaslab.  Always returns 0.
+ */
+static int
+metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
+{
+	metaslab_verify_t *mv = arg;
+	const uint64_t txg = sme->sme_txg;
+	const uint64_t off = sme->sme_offset;
+	const uint64_t run = sme->sme_run;
+	boolean_t present = range_tree_contains(mv->mv_allocated, off, run);
+
+	if (sme->sme_type == SM_ALLOC) {
+		if (present) {
+			(void) printf("ERROR: DOUBLE ALLOC: "
+			    "%llu [%llx:%llx] "
+			    "%llu:%llu LOG_SM\n",
+			    (u_longlong_t)txg, (u_longlong_t)off,
+			    (u_longlong_t)run, (u_longlong_t)mv->mv_vdid,
+			    (u_longlong_t)mv->mv_msid);
+		} else {
+			range_tree_add(mv->mv_allocated, off, run);
+		}
+		return (0);
+	}
+
+	/* Everything that is not an ALLOC is handled as a FREE. */
+	if (present) {
+		range_tree_remove(mv->mv_allocated, off, run);
+	} else {
+		(void) printf("ERROR: DOUBLE FREE: "
+		    "%llu [%llx:%llx] "
+		    "%llu:%llu LOG_SM\n",
+		    (u_longlong_t)txg, (u_longlong_t)off,
+		    (u_longlong_t)run, (u_longlong_t)mv->mv_vdid,
+		    (u_longlong_t)mv->mv_msid);
+	}
+
+	/*
+	 * If something is freed in the spacemap, verify that
+	 * it is not listed as allocated in the livelist.
+	 */
+	verify_livelist_allocs(mv, txg, off, run);
+	return (0);
+}
+
+/*
+ * zdb_log_sm_cb_t callback: feed a log space map entry to
+ * metaslab_spacemap_validation_cb(), but only when it belongs to the
+ * metaslab described by arg (a metaslab_verify_t) and the log's txg is not
+ * older than the metaslab's unflushed txg.  Entries filtered out return 0.
+ */
+static int
+spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	metaslab_verify_t *mv = arg;
+	uint64_t offset = sme->sme_offset;
+	uint64_t vdev_id = sme->sme_vdev;
+
+	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+	/* skip indirect vdevs */
+	if (!vdev_is_concrete(vd))
+		return (0);
+
+	/* entry is for a different vdev than the one being verified */
+	if (vdev_id != mv->mv_vdid)
+		return (0);
+
+	/* map the entry's offset to its metaslab; skip other metaslabs */
+	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	if (ms->ms_id != mv->mv_msid)
+		return (0);
+
+	/* skip logs older than the metaslab's unflushed txg */
+	if (txg < metaslab_unflushed_txg(ms))
+		return (0);
+
+
+	ASSERT3U(txg, ==, sme->sme_txg);
+	return (metaslab_spacemap_validation_cb(sme, mv));
+}
+
+/*
+ * Apply spacemap_check_sm_log_cb() to every log space map entry of the
+ * pool, with mv as the callback argument.
+ */
+static void
+spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
+{
+	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
+}
+
+/*
+ * Replay every entry of a metaslab's own space map through
+ * metaslab_spacemap_validation_cb().  A NULL space map is silently
+ * accepted; an iteration failure is fatal (VERIFY0).
+ */
+static void
+spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
+{
+	if (sm != NULL) {
+		VERIFY0(space_map_iterate(sm, space_map_length(sm),
+		    metaslab_spacemap_validation_cb, mv));
+	}
+}
+
+static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
+
+/*
+ * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
+ * they are part of that metaslab (mv_msid).
+ *
+ * A block qualifies when it is on mv_vdid and lies entirely within
+ * [mv_start, mv_end).  Blocks on this vdev that straddle either end of
+ * that range are reported and left in sv_leftover.  mv_livelist_allocs
+ * must be empty on entry.
+ */
+static void
+mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
+{
+	zfs_btree_index_t where;
+	sublivelist_verify_block_t *svb;
+	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
+	/* Pass 1: copy the qualifying blocks into mv_livelist_allocs. */
+	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
+	    svb != NULL;
+	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
+		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
+			continue;
+
+		/* starts before the metaslab but extends into it */
+		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
+		    (DVA_GET_OFFSET(&svb->svb_dva) +
+		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
+			(void) printf("ERROR: Found block that crosses "
+			    "metaslab boundary: <%llu:%llx:%llx>\n",
+			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+			continue;
+		}
+
+		/* entirely before this metaslab */
+		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
+			continue;
+
+		/* entirely after this metaslab */
+		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
+			continue;
+
+		/* starts inside the metaslab but runs past its end */
+		if ((DVA_GET_OFFSET(&svb->svb_dva) +
+		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
+			(void) printf("ERROR: Found block that crosses "
+			    "metaslab boundary: <%llu:%llx:%llx>\n",
+			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+			continue;
+		}
+
+		zfs_btree_add(&mv->mv_livelist_allocs, svb);
+	}
+
+	/*
+	 * Pass 2: remove the copied blocks from sv_leftover.  This is done
+	 * by walking mv_livelist_allocs rather than by removing entries
+	 * during the walk of sv_leftover above.
+	 */
+	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
+	    svb != NULL;
+	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+		zfs_btree_remove(&sv->sv_leftover, svb);
+	}
+}
+
+/*
+ * [Livelist Check]
+ * Iterate through all the sublivelists and:
+ * - report leftover frees
+ * - report double ALLOCs/FREEs
+ * - record leftover ALLOCs together with their TXG [see Cross Check]
+ *
+ * [Spacemap Check]
+ * for each metaslab:
+ * - iterate over spacemap and then the metaslab's entries in the
+ *   spacemap log, then report any double FREEs and ALLOCs (do not
+ *   blow up).
+ *
+ * [Cross Check]
+ * After finishing the Livelist Check phase and while being in the
+ * Spacemap Check phase, we find all the recorded leftover ALLOCs
+ * of the livelist check that are part of the metaslab that we are
+ * currently looking at in the Spacemap Check. We report any entries
+ * that are marked as ALLOCs in the livelists but have been actually
+ * freed (and potentially allocated again) after their TXG stamp in
+ * the spacemaps. Also report any ALLOCs from the livelists that
+ * belong to indirect vdevs (e.g. their vdev completed removal).
+ *
+ * Note that this will miss Log Spacemap entries that cancelled each other
+ * out before being flushed to the metaslab, so we are not guaranteed
+ * to match all erroneous ALLOCs.
+ *
+ * Findings go to stdout; the per-metaslab progress line goes to stderr.
+ */
+static void
+livelist_metaslab_validate(spa_t *spa)
+{
+	(void) printf("Verifying deleted livelist entries\n");
+
+	/* [Livelist Check]: collect the leftover ALLOCs in sv.sv_leftover. */
+	sublivelist_verify_t sv;
+	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+	    sizeof (sublivelist_verify_block_t));
+	iterate_deleted_livelists(spa, livelist_verify, &sv);
+
+	(void) printf("Verifying metaslab entries\n");
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+
+		if (!vdev_is_concrete(vd))
+			continue;
+
+		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
+			metaslab_t *m = vd->vdev_ms[mid];
+
+			/* %llu takes an unsigned argument, so cast to match. */
+			(void) fprintf(stderr,
+			    "\rverifying concrete vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)mid,
+			    (u_longlong_t)vd->vdev_ms_count);
+
+			uint64_t shift, start;
+			range_seg_type_t type =
+			    metaslab_calculate_range_tree_type(vd, m,
+			    &start, &shift);
+			metaslab_verify_t mv;
+			mv.mv_allocated = range_tree_create(NULL,
+			    type, NULL, start, shift);
+			mv.mv_vdid = vd->vdev_id;
+			mv.mv_msid = m->ms_id;
+			mv.mv_start = m->ms_start;
+			mv.mv_end = m->ms_start + m->ms_size;
+			zfs_btree_create(&mv.mv_livelist_allocs,
+			    livelist_block_compare,
+			    sizeof (sublivelist_verify_block_t));
+
+			/* Move this metaslab's blocks out of sv.sv_leftover. */
+			mv_populate_livelist_allocs(&mv, &sv);
+
+			/* The metaslab's own space map first, then the logs. */
+			spacemap_check_ms_sm(m->ms_sm, &mv);
+			spacemap_check_sm_log(spa, &mv);
+
+			range_tree_vacate(mv.mv_allocated, NULL, NULL);
+			range_tree_destroy(mv.mv_allocated);
+			zfs_btree_clear(&mv.mv_livelist_allocs);
+			zfs_btree_destroy(&mv.mv_livelist_allocs);
+		}
+	}
+	(void) fprintf(stderr, "\n");
+
+	/*
+	 * If there are any segments in the leftover tree after we walked
+	 * through all the metaslabs in the concrete vdevs then this means
+	 * that we have segments in the livelists that belong to indirect
+	 * vdevs and are marked as allocated.
+	 */
+	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
+		zfs_btree_destroy(&sv.sv_leftover);
+		return;
+	}
+	(void) printf("ERROR: Found livelist blocks marked as allocated "
+	    "for indirect vdevs:\n");
+
+	zfs_btree_index_t *where = NULL;
+	sublivelist_verify_block_t *svb;
+	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
+	    NULL) {
+		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
+		ASSERT3U(vdev_id, <, rvd->vdev_children);
+		vdev_t *vd = rvd->vdev_child[vdev_id];
+		ASSERT(!vdev_is_concrete(vd));
+		(void) printf("<%d:%llx:%llx> TXG %llx\n",
+		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
+		    (u_longlong_t)svb->svb_allocated_txg);
+	}
+	(void) printf("\n");
+	zfs_btree_destroy(&sv.sv_leftover);
+}
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ *
+ * _umem_debug_init() returns the string to use as the $UMEM_DEBUG setting.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+/* libumem hook: returns the string to use as the $UMEM_LOGGING setting. */
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+/*
+ * Print the command synopsis and the option summary to stderr, then exit
+ * with status 1.  Does not return.
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
+	    "[-I <inflight I/Os>]\n"
+	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
+	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
+	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
+	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
+	    "\t%s [-v] <bookmark>\n"
+	    "\t%s -C [-A] [-U <cache>]\n"
+	    "\t%s -l [-Aqu] <device>\n"
+	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
+	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
+	    "\t%s -O <dataset> <path>\n"
+	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
+	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
+	    "\t%s -E [-A] word0:word1:...:word15\n"
+	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
+	    "<poolname>\n\n",
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
+	    cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, "    Dataset name must include at least one "
+	    "separator character '/' or '@'\n");
+	(void) fprintf(stderr, "    If dataset name is specified, only that "
+	    "dataset is dumped\n");
+	(void) fprintf(stderr,  "    If object numbers or object number "
+	    "ranges are specified, only those\n"
+	    "    objects or ranges are dumped.\n\n");
+	(void) fprintf(stderr,
+	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
+	    "        start    Starting object number\n"
+	    "        end      Ending object number, or -1 for no upper bound\n"
+	    "        flags    Optional flags to select object types:\n"
+	    "            A     All objects (this is the default)\n"
+	    "            d     ZFS directories\n"
+	    "            f     ZFS files \n"
+	    "            m     SPA space maps\n"
+	    "            z     ZAPs\n"
+	    "            -     Negate effect of next flag\n\n");
+	(void) fprintf(stderr, "    Options to control amount of output:\n");
+	(void) fprintf(stderr, "        -b block statistics\n");
+	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
+	    "all data) blocks\n");
+	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
+	(void) fprintf(stderr, "        -d dataset(s)\n");
+	(void) fprintf(stderr, "        -D dedup statistics\n");
+	(void) fprintf(stderr, "        -E decode and display block from an "
+	    "embedded block pointer\n");
+	(void) fprintf(stderr, "        -h pool history\n");
+	(void) fprintf(stderr, "        -i intent logs\n");
+	(void) fprintf(stderr, "        -l read label contents\n");
+	(void) fprintf(stderr, "        -k examine the checkpointed state "
+	    "of the pool\n");
+	(void) fprintf(stderr, "        -L disable leak tracking (do not "
+	    "load spacemaps)\n");
+	(void) fprintf(stderr, "        -m metaslabs\n");
+	(void) fprintf(stderr, "        -M metaslab groups\n");
+	(void) fprintf(stderr, "        -O perform object lookups by path\n");
+	(void) fprintf(stderr, "        -R read and display block from a "
+	    "device\n");
+	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
+	(void) fprintf(stderr, "        -v verbose (applies to all "
+	    "others)\n");
+	(void) fprintf(stderr, "        -y perform livelist and metaslab "
+	    "validation on any livelists being deleted\n\n");
+	(void) fprintf(stderr, "    Below options are intended for use "
+	    "with other options:\n");
+	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
+	    "panic recovery (-AA) or both (-AAA)\n");
+	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
+	    "has altroot/not in a cachefile\n");
+	(void) fprintf(stderr, "        -F attempt automatic rewind within "
+	    "safe range of transaction groups\n");
+	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
+	    "exiting\n");
+	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
+	    "specify the maximum number of\n           "
+	    "checksumming I/Os [default is 200]\n");
+	(void) fprintf(stderr, "        -o <variable>=<value> set global "
+	    "variable to an unsigned 32-bit integer\n");
+	(void) fprintf(stderr, "        -p <path> -- use one or more with "
+	    "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
+	(void) fprintf(stderr, "        -q don't print label contents\n");
+	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
+	    "searching for uberblocks\n");
+	(void) fprintf(stderr, "        -u uberblock\n");
+	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
+	    "cachefile\n");
+	(void) fprintf(stderr, "        -V do verbatim import\n");
+	(void) fprintf(stderr, "        -x <dumpdir> -- "
+	    "dump all read blocks into specified directory\n");
+	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
+	    "work with dataset)\n");
+	(void) fprintf(stderr, "        -Y attempt all reconstruction "
+	    "combinations for split blocks\n");
+	(void) fprintf(stderr, "        -Z show ZSTD headers \n");
+	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+	    "to make only that option verbose\n");
+	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+	exit(1);
+}
+
+/*
+ * When -G was given, terminate and flush pending stdout output and then
+ * print the zfs_dbgmsg buffer via zfs_dbgmsg_print().
+ */
+static void
+dump_debug_buffer(void)
+{
+	if (!dump_opt['G'])
+		return;
+
+	(void) printf("\n");
+	(void) fflush(stdout);
+	zfs_dbgmsg_print("zdb");
+}
+
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
+ *
+ * Prints "<cmdname>: <formatted message>\n" to stderr, dumps the debug
+ * buffer (if requested with -G) and exits with status 1.  fmt and the
+ * following arguments are interpreted as by printf().
+ */
+
+static void
+fatal(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+
+	dump_debug_buffer();
+
+	exit(1);
+}
+
+/*
+ * Print the packed nvlist stored in an object.  The first 64-bit word of
+ * data is taken as the packed size; that many bytes are read from offset 0
+ * of the object, unpacked, and printed with dump_nvlist().  Read or unpack
+ * failures are fatal (VERIFY).  The size argument is not used.
+ */
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	nvlist_t *nv;
+	size_t nvsize = *(uint64_t *)data;
+	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
+
+	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+	/* the packed copy is no longer needed once unpacked */
+	umem_free(packed, nvsize);
+
+	dump_nvlist(nv, 8);
+
+	nvlist_free(nv);
+}
+
+/*
+ * Print the fields of the spa_history_phys_t passed in data.  Does nothing
+ * when data is NULL.  os, object and size are not used.
+ */
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	spa_history_phys_t *shp = data;
+
+	if (shp == NULL)
+		return;
+
+	(void) printf("\t\tpool_create_len = %llu\n",
+	    (u_longlong_t)shp->sh_pool_create_len);
+	(void) printf("\t\tphys_max_off = %llu\n",
+	    (u_longlong_t)shp->sh_phys_max_off);
+	(void) printf("\t\tbof = %llu\n",
+	    (u_longlong_t)shp->sh_bof);
+	(void) printf("\t\teof = %llu\n",
+	    (u_longlong_t)shp->sh_eof);
+	(void) printf("\t\trecords_lost = %llu\n",
+	    (u_longlong_t)shp->sh_records_lost);
+}
+
+/*
+ * Format num into buf, which has room for buflen bytes.  With -P the exact
+ * decimal value is written; otherwise formatting is delegated to nicenum().
+ *
+ * buf is a pointer parameter, so sizeof (buf) would only be the size of
+ * the pointer; the real capacity, buflen, is what must be handed on.
+ */
+static void
+zdb_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	if (dump_opt['P'])
+		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+	else
+		nicenum(num, buf, buflen);
+}
+
+/*
+ * Full-width histogram bar and its length in characters (excluding the
+ * terminating NUL).  dump_histogram() prints a suffix of histo_stars to
+ * draw a shorter bar.
+ */
+static const char histo_stars[] = "****************************************";
+static const uint64_t histo_width = sizeof (histo_stars) - 1;
+
+/*
+ * Print one line per bucket of histo[0..size-1], from the lowest to the
+ * highest non-empty bucket.  Each line shows the bucket index plus offset,
+ * the count, and a bar of stars scaled against the largest count (or
+ * against the full bar width, whichever is greater).
+ */
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
+{
+	int first = size - 1;
+	int last = 0;
+	uint64_t peak = 0;
+
+	/* Locate the non-empty range and the largest bucket. */
+	for (int b = 0; b < size; b++) {
+		if (histo[b] == 0)
+			continue;
+		if (histo[b] > peak)
+			peak = histo[b];
+		if (b > last)
+			last = b;
+		if (b < first)
+			first = b;
+	}
+
+	/* Never scale against less than one star per count. */
+	if (peak < histo_width)
+		peak = histo_width;
+
+	for (int b = first; b <= last; b++) {
+		/* number of leading stars to skip; the rest is the bar */
+		uint64_t skip = (peak - histo[b]) * histo_width / peak;
+
+		(void) printf("\t\t\t%3u: %6llu %s\n",
+		    b + offset, (u_longlong_t)histo[b], &histo_stars[skip]);
+	}
+}
+
+/*
+ * Print the statistics zap_get_stats() reports for a ZAP object.  Nothing
+ * is printed if the statistics cannot be obtained.  A ZAP with an empty
+ * pointer table is reported as a microzap with a single summary line;
+ * otherwise the pointer table fields, totals and histograms are printed.
+ */
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+	int error;
+	zap_stats_t zs;
+
+	error = zap_get_stats(os, object, &zs);
+	if (error)
+		return;
+
+	/* zs_ptrtbl_len == 0 is treated as a microzap */
+	if (zs.zs_ptrtbl_len == 0) {
+		ASSERT(zs.zs_num_blocks == 1);
+		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+		    (u_longlong_t)zs.zs_blocksize,
+		    (u_longlong_t)zs.zs_num_entries);
+		return;
+	}
+
+	(void) printf("\tFat ZAP stats:\n");
+
+	(void) printf("\t\tPointer table:\n");
+	(void) printf("\t\t\t%llu elements\n",
+	    (u_longlong_t)zs.zs_ptrtbl_len);
+	(void) printf("\t\t\tzt_blk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
+	(void) printf("\t\t\tzt_numblks: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
+	(void) printf("\t\t\tzt_shift: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
+	(void) printf("\t\t\tzt_blks_copied: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
+	(void) printf("\t\t\tzt_nextblk: %llu\n",
+	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
+
+	(void) printf("\t\tZAP entries: %llu\n",
+	    (u_longlong_t)zs.zs_num_entries);
+	(void) printf("\t\tLeaf blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_leafs);
+	(void) printf("\t\tTotal blocks: %llu\n",
+	    (u_longlong_t)zs.zs_num_blocks);
+	(void) printf("\t\tzap_block_type: 0x%llx\n",
+	    (u_longlong_t)zs.zs_block_type);
+	(void) printf("\t\tzap_magic: 0x%llx\n",
+	    (u_longlong_t)zs.zs_magic);
+	(void) printf("\t\tzap_salt: 0x%llx\n",
+	    (u_longlong_t)zs.zs_salt);
+
+	/* each histogram has ZAP_HISTOGRAM_SIZE buckets */
+	(void) printf("\t\tLeafs with 2^n pointers:\n");
+	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBlocks with n*5 entries:\n");
+	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBlocks n/10 full:\n");
+	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tEntries with n chunks:\n");
+	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
+
+	(void) printf("\t\tBuckets with n entries:\n");
+	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
+}
+
+/* Object dump routine that intentionally prints nothing. */
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/* Object dump routine that only prints an "unknown object type" marker. */
+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	(void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/* Object dump routine with an empty body: nothing is printed. */
+/*ARGSUSED*/
+static void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Print an array of 64-bit words in hex, four per line.  Only active when
+ * dump_opt['d'] is at least 6.
+ *
+ * When data is NULL the words are read from the start of the object and
+ * size is taken from the object's doi_max_offset; otherwise data/size
+ * describe a caller-owned buffer.  In both cases at most 1 MiB is printed,
+ * and truncation is marked with ", ... ".
+ *
+ * Any buffer allocated here is released on every return path, including
+ * the empty-object case.
+ */
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	uint64_t *arr;
+	uint64_t oursize;
+	if (dump_opt['d'] < 6)
+		return;
+
+	if (data == NULL) {
+		dmu_object_info_t doi;
+
+		VERIFY0(dmu_object_info(os, object, &doi));
+		size = doi.doi_max_offset;
+		/*
+		 * We cap the size at 1 mebibyte here to prevent
+		 * allocation failures and nigh-infinite printing if the
+		 * object is extremely large.
+		 */
+		oursize = MIN(size, 1 << 20);
+		arr = kmem_alloc(oursize, KM_SLEEP);
+
+		int err = dmu_read(os, object, 0, oursize, arr, 0);
+		if (err != 0) {
+			(void) printf("got error %u from dmu_read\n", err);
+			kmem_free(arr, oursize);
+			return;
+		}
+	} else {
+		/*
+		 * Even though the allocation is already done in this code path,
+		 * we still cap the size to prevent excessive printing.
+		 */
+		oursize = MIN(size, 1 << 20);
+		arr = data;
+	}
+
+	if (size == 0) {
+		/* Only free what this function allocated itself. */
+		if (data == NULL)
+			kmem_free(arr, oursize);
+		(void) printf("\t\t[]\n");
+		return;
+	}
+
+	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
+	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
+		if (i % 4 != 0)
+			(void) printf(", %0llx", (u_longlong_t)arr[i]);
+		else
+			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
+	}
+	if (oursize != size)
+		(void) printf(", ... ");
+	(void) printf("]\n");
+
+	if (data == NULL)
+		kmem_free(arr, oursize);
+}
+
+/*
+ * Print the statistics of a ZAP object followed by one "name = value" line
+ * per attribute.  Values with an integer length of 1 are printed as a
+ * string; 2-, 4- and 8-byte integers are printed as space-separated
+ * decimal numbers.  data and size are not used.
+ *
+ * The lookup result is deliberately ignored: the value buffer is
+ * zero-filled, so a failed lookup prints an empty string or zeros.
+ */
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	void *prop;
+	unsigned i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		prop = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+		(void) zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length, attr.za_num_integers, prop);
+		if (attr.za_integer_length == 1) {
+			/*
+			 * The buffer holds exactly za_num_integers bytes and
+			 * is not guaranteed to be NUL-terminated, so bound
+			 * the read to the buffer length.
+			 */
+			(void) printf("%.*s", (int)attr.za_num_integers,
+			    (char *)prop);
+		} else {
+			for (i = 0; i < attr.za_num_integers; i++) {
+				switch (attr.za_integer_length) {
+				case 2:
+					(void) printf("%u ",
+					    ((uint16_t *)prop)[i]);
+					break;
+				case 4:
+					(void) printf("%u ",
+					    ((uint32_t *)prop)[i]);
+					break;
+				case 8:
+					/* signed cast to match %lld */
+					(void) printf("%lld ",
+					    (longlong_t)((int64_t *)prop)[i]);
+					break;
+				}
+			}
+		}
+		(void) printf("\n");
+		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Print the header of a bpobj whose bpobj_phys_t is passed in data.  Which
+ * fields are printed depends on size, so that shorter (older) layouts are
+ * handled.  When dump_opt['d'] is at least 5 every block pointer stored in
+ * the object is read and printed as well.  Does nothing when data is NULL.
+ */
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	bpobj_phys_t *bpop = data;
+	uint64_t i;
+	char bytes[32], comp[32], uncomp[32];
+
+	/* make sure the output won't get truncated */
+	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+
+	if (bpop == NULL)
+		return;
+
+	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
+	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
+	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
+
+	(void) printf("\t\tnum_blkptrs = %llu\n",
+	    (u_longlong_t)bpop->bpo_num_blkptrs);
+	(void) printf("\t\tbytes = %s\n", bytes);
+	/* later fields are only printed when size covers them */
+	if (size >= BPOBJ_SIZE_V1) {
+		(void) printf("\t\tcomp = %s\n", comp);
+		(void) printf("\t\tuncomp = %s\n", uncomp);
+	}
+	if (size >= BPOBJ_SIZE_V2) {
+		(void) printf("\t\tsubobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_subobjs);
+		(void) printf("\t\tnum_subobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_num_subobjs);
+	}
+	if (size >= sizeof (*bpop)) {
+		(void) printf("\t\tnum_freed = %llu\n",
+		    (u_longlong_t)bpop->bpo_num_freed);
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+	/* block pointers are stored back to back from offset 0 */
+	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
+		char blkbuf[BP_SPRINTF_LEN];
+		blkptr_t bp;
+
+		int err = dmu_read(os, object,
+		    i * sizeof (bp), sizeof (bp), &bp, 0);
+		if (err != 0) {
+			(void) printf("got error %u from dmu_read\n", err);
+			break;
+		}
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
+		    BP_GET_FREE(&bp));
+		(void) printf("\t%s\n", blkbuf);
+	}
+}
+
+/*
+ * Print the contents of an object as a list of 64-bit object numbers, one
+ * per line, stopping after the last non-zero entry.  The whole object
+ * (doi_max_offset bytes) is read in one go.  data and size are not used.
+ */
+/* ARGSUSED */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dmu_object_info_t doi;
+	int64_t i;
+
+	VERIFY0(dmu_object_info(os, object, &doi));
+	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+	if (err != 0) {
+		(void) printf("got error %u from dmu_read\n", err);
+		kmem_free(subobjs, doi.doi_max_offset);
+		return;
+	}
+
+	/* find the last non-zero entry so trailing zeros are not printed */
+	int64_t last_nonzero = -1;
+	for (i = 0; i < doi.doi_max_offset / 8; i++) {
+		if (subobjs[i] != 0)
+			last_nonzero = i;
+	}
+
+	for (i = 0; i <= last_nonzero; i++) {
+		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
+	}
+	kmem_free(subobjs, doi.doi_max_offset);
+}
+
+/*
+ * Print only the ZAP statistics of a DDT ZAP object.  data and size are
+ * not used.
+ */
+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dump_zap_stats(os, object);
+	/* contents are printed elsewhere, properly decoded */
+}
+
+/*
+ * Print the ZAP statistics of an object followed by one line per entry.
+ * For entries that have a value, the first integer is printed in hex
+ * together with the fields extracted from it by ATTR_LENGTH(),
+ * ATTR_BSWAP() and ATTR_NUM(), in that order.  data and size are not used.
+ */
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		/* entry without a value: print the name only */
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		(void) printf(" %llx : [%d:%d:%d]\n",
+		    (u_longlong_t)attr.za_first_integer,
+		    (int)ATTR_LENGTH(attr.za_first_integer),
+		    (int)ATTR_BSWAP(attr.za_first_integer),
+		    (int)ATTR_NUM(attr.za_first_integer));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Print the ZAP statistics of an object followed by one line per entry,
+ * each listing the entry's value as an array of 16-bit integers.  Entries
+ * with a value must have an integer length of 2, and the lookup must
+ * succeed (both enforced with VERIFY).  data and size are not used.
+ */
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	uint16_t *layout_attrs;
+	unsigned i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = [", attr.za_name);
+		/* NOTE(review): this path ends the line without a "]" */
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+
+		VERIFY(attr.za_integer_length == 2);
+		layout_attrs = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+
+		VERIFY(zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length,
+		    attr.za_num_integers, layout_attrs) == 0);
+
+		for (i = 0; i != attr.za_num_integers; i++)
+			(void) printf(" %d ", (int)layout_attrs[i]);
+		(void) printf("]\n");
+		umem_free(layout_attrs,
+		    attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Print the ZAP statistics of a directory object followed by one line per
+ * directory entry: the entry name, the object number extracted with
+ * ZFS_DIRENT_OBJ() and the name of the type extracted with
+ * ZFS_DIRENT_TYPE().  data and size are not used.
+ */
+/*ARGSUSED*/
+static void
+dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	/* indexed by the value of ZFS_DIRENT_TYPE() */
+	const char *typenames[] = {
+		/* 0 */ "not specified",
+		/* 1 */ "FIFO",
+		/* 2 */ "Character Device",
+		/* 3 */ "3 (invalid)",
+		/* 4 */ "Directory",
+		/* 5 */ "5 (invalid)",
+		/* 6 */ "Block Device",
+		/* 7 */ "7 (invalid)",
+		/* 8 */ "Regular File",
+		/* 9 */ "9 (invalid)",
+		/* 10 */ "Symbolic Link",
+		/* 11 */ "11 (invalid)",
+		/* 12 */ "Socket",
+		/* 13 */ "Door",
+		/* 14 */ "Event Port",
+		/* 15 */ "15 (invalid)",
+	};
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		/*
+		 * Cast explicitly so the argument type matches %lld no
+		 * matter how uint64_t is defined on the platform.
+		 */
+		(void) printf("\t\t%s = %lld (type: %s)\n",
+		    attr.za_name,
+		    (longlong_t)ZFS_DIRENT_OBJ(attr.za_first_integer),
+		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Count the leaf vdevs at or below vd whose DTL space map exists and has a
+ * dbuf of exactly sizeof (space_map_phys_t) bytes.
+ */
+static int
+get_dtl_refcount(vdev_t *vd)
+{
+	if (!vd->vdev_ops->vdev_op_leaf) {
+		int total = 0;
+
+		/* interior vdev: sum over all children */
+		for (unsigned child = 0; child < vd->vdev_children; child++)
+			total += get_dtl_refcount(vd->vdev_child[child]);
+		return (total);
+	}
+
+	space_map_t *dtl = vd->vdev_dtl_sm;
+
+	if (dtl == NULL)
+		return (0);
+	return (dtl->sm_dbuf->db_size == sizeof (space_map_phys_t) ? 1 : 0);
+}
+
+/*
+ * Count the metaslab space maps at or below vd whose dbuf is exactly
+ * sizeof (space_map_phys_t) bytes.  Metaslabs are only inspected on vdevs
+ * that are their own top-level vdev; children are always visited.
+ */
+static int
+get_metaslab_refcount(vdev_t *vd)
+{
+	int total = 0;
+
+	if (vd == vd->vdev_top) {
+		for (uint64_t idx = 0; idx < vd->vdev_ms_count; idx++) {
+			space_map_t *ms_sm = vd->vdev_ms[idx]->ms_sm;
+
+			if (ms_sm == NULL)
+				continue;
+			if (ms_sm->sm_dbuf->db_size !=
+			    sizeof (space_map_phys_t))
+				continue;
+			total++;
+		}
+	}
+
+	for (unsigned child = 0; child < vd->vdev_children; child++)
+		total += get_metaslab_refcount(vd->vdev_child[child]);
+
+	return (total);
+}
+
+/*
+ * Count the obsolete space map objects at or below vd whose bonus buffer
+ * is exactly sizeof (space_map_phys_t) bytes.  Only top-level vdevs are
+ * expected to have an obsolete space map object; this is asserted for
+ * every vdev that does not qualify.
+ */
+static int
+get_obsolete_refcount(vdev_t *vd)
+{
+	uint64_t sm_obj;
+	int total = 0;
+
+	VERIFY0(vdev_obsolete_sm_object(vd, &sm_obj));
+	if (vd->vdev_top != vd || sm_obj == 0) {
+		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+		ASSERT3U(sm_obj, ==, 0);
+	} else {
+		dmu_object_info_t info;
+
+		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
+		    sm_obj, &info));
+		if (info.doi_bonus_size == sizeof (space_map_phys_t))
+			total++;
+	}
+
+	for (unsigned child = 0; child < vd->vdev_children; child++)
+		total += get_obsolete_refcount(vd->vdev_child[child]);
+
+	return (total);
+}
+
+/*
+ * Return 1 if the pool records a previous obsolete space map object
+ * (scip_prev_obsolete_sm_object) whose bonus buffer is exactly
+ * sizeof (space_map_phys_t) bytes, and 0 otherwise.
+ */
+static int
+get_prev_obsolete_spacemap_refcount(spa_t *spa)
+{
+	dmu_object_info_t info;
+	uint64_t prev_sm_obj =
+	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
+
+	if (prev_sm_obj == 0)
+		return (0);
+
+	VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_sm_obj, &info));
+	return (info.doi_bonus_size == sizeof (space_map_phys_t) ? 1 : 0);
+}
+
+/*
+ * Count the top-level vdevs at or below vd whose top-level ZAP contains a
+ * VDEV_TOP_ZAP_POOL_CHECKPOINT_SM entry.
+ */
+static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+	int total = 0;
+
+	if (vd->vdev_top == vd && vd->vdev_top_zap != 0) {
+		int rc = zap_contains(spa_meta_objset(vd->vdev_spa),
+		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM);
+
+		/* zap_contains() returning 0 means the entry exists */
+		if (rc == 0)
+			total++;
+	}
+
+	for (uint64_t child = 0; child < vd->vdev_children; child++)
+		total += get_checkpoint_refcount(vd->vdev_child[child]);
+
+	return (total);
+}
+
+/* Return the number of nodes in the pool's spa_sm_logs_by_txg AVL tree. */
+static int
+get_log_spacemap_refcount(spa_t *spa)
+{
+	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
+}
+
+/*
+ * Compare the refcount recorded for SPA_FEATURE_SPACEMAP_HISTOGRAM with
+ * the number of space maps counted by the get_*_refcount() helpers.
+ * Returns 0 when they agree; otherwise prints both values and returns 2.
+ */
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+	uint64_t expected = 0;
+	uint64_t actual = 0;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	/* On failure expected simply keeps its initial value of 0. */
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
+	    &expected);
+
+	actual += get_dtl_refcount(rvd);
+	actual += get_metaslab_refcount(rvd);
+	actual += get_obsolete_refcount(rvd);
+	actual += get_prev_obsolete_spacemap_refcount(spa);
+	actual += get_checkpoint_refcount(rvd);
+	actual += get_log_spacemap_refcount(spa);
+
+	if (expected == actual)
+		return (0);
+
+	(void) printf("space map refcount mismatch: expected %lld != "
+	    "actual %lld\n",
+	    (longlong_t)expected,
+	    (longlong_t)actual);
+	return (2);
+}
+
+/*
+ * Print a summary of space map 'sm', which lives in objset 'os': its object
+ * number, smp_length and smp_alloc.  A NULL 'sm' is silently ignored.
+ *
+ * When dump_opt['d'] >= 6 or dump_opt['m'] >= 4 the map is additionally
+ * walked one 64-bit word at a time and every debug, single-word and
+ * two-word entry is printed in decoded form.  The net space accumulated
+ * from those entries (allocations minus frees) is then compared against
+ * space_map_allocated() and a message is printed if they differ.
+ */
+static void
+dump_spacemap(objset_t *os, space_map_t *sm)
+{
+	/* Names indexed by the action field decoded from a debug entry. */
+	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+	    "INVALID", "INVALID", "INVALID", "INVALID" };
+
+	if (sm == NULL)
+		return;
+
+	(void) printf("space map object %llu:\n",
+	    (longlong_t)sm->sm_object);
+	(void) printf("  smp_length = 0x%llx\n",
+	    (longlong_t)sm->sm_phys->smp_length);
+	(void) printf("  smp_alloc = 0x%llx\n",
+	    (longlong_t)sm->sm_phys->smp_alloc);
+
+	/* The per-entry dump below is only produced at high verbosity. */
+	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+		return;
+
+	/*
+	 * Print out the freelist entries in both encoded and decoded form.
+	 */
+	uint8_t mapshift = sm->sm_shift;
+	int64_t alloc = 0;
+	uint64_t word, entry_id = 0;
+	for (uint64_t offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (word)) {
+
+		VERIFY0(dmu_read(os, space_map_object(sm), offset,
+		    sizeof (word), &word, DMU_READ_PREFETCH));
+
+		/* Debug entries carry no range; print them and move on. */
+		if (sm_entry_is_debug(word)) {
+			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
+			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
+			if (de_txg == 0) {
+				(void) printf(
+				    "\t    [%6llu] PADDING\n",
+				    (u_longlong_t)entry_id);
+			} else {
+				(void) printf(
+				    "\t    [%6llu] %s: txg %llu pass %llu\n",
+				    (u_longlong_t)entry_id,
+				    ddata[SM_DEBUG_ACTION_DECODE(word)],
+				    (u_longlong_t)de_txg,
+				    (u_longlong_t)de_sync_pass);
+			}
+			entry_id++;
+			continue;
+		}
+
+		uint8_t words;
+		char entry_type;
+		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
+
+		if (sm_entry_is_single_word(word)) {
+			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
+			    sm->sm_start;
+			entry_run = SM_RUN_DECODE(word) << mapshift;
+			words = 1;
+		} else {
+			/* it is a two-word entry so we read another word */
+			ASSERT(sm_entry_is_double_word(word));
+
+			/*
+			 * Advance 'offset' here as well; together with the
+			 * loop increment this skips both words of the entry.
+			 */
+			uint64_t extra_word;
+			offset += sizeof (extra_word);
+			VERIFY0(dmu_read(os, space_map_object(sm), offset,
+			    sizeof (extra_word), &extra_word,
+			    DMU_READ_PREFETCH));
+
+			ASSERT3U(offset, <=, space_map_length(sm));
+
+			/* Run and vdev come from word one, the rest from two */
+			entry_run = SM2_RUN_DECODE(word) << mapshift;
+			entry_vdev = SM2_VDEV_DECODE(word);
+			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
+			    mapshift) + sm->sm_start;
+			words = 2;
+		}
+
+		(void) printf("\t    [%6llu]    %c  range:"
+		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
+		    (u_longlong_t)entry_id,
+		    entry_type, (u_longlong_t)entry_off,
+		    (u_longlong_t)(entry_off + entry_run),
+		    (u_longlong_t)entry_run,
+		    (u_longlong_t)entry_vdev, words);
+
+		/* Track net allocated space: 'A' adds, anything else frees. */
+		if (entry_type == 'A')
+			alloc += entry_run;
+		else
+			alloc -= entry_run;
+		entry_id++;
+	}
+	/* Cross-check the walked total against the map's own accounting. */
+	if (alloc != space_map_allocated(sm)) {
+		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
+		    "with space map summary (%lld)\n",
+		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
+	}
+}
+
+/*
+ * Print in-memory statistics for metaslab 'msp': the node count of its
+ * by-size allocatable btree, the largest allocatable size, the percentage
+ * of the metaslab covered by its allocatable range tree, and that range
+ * tree's histogram.
+ */
+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char largest[32];
+	range_tree_t *allocatable = msp->ms_allocatable;
+	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
+	int pct_free;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (largest) >= NN_NUMBUF_SZ);
+
+	pct_free = range_tree_space(allocatable) * 100 / msp->ms_size;
+
+	zdb_nicenum(metaslab_largest_allocatable(msp), largest,
+	    sizeof (largest));
+
+	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
+	    "segments", zfs_btree_numnodes(by_size),
+	    "maxsize", largest,
+	    "freepct", pct_free);
+
+	(void) printf("\tIn-memory histogram:\n");
+	dump_histogram(allocatable->rt_histogram,
+	    RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+/*
+ * Print a one-line description of metaslab 'msp' (id, start offset, space
+ * map object and free space), followed by progressively more detail:
+ *   - dump_opt['m'] > 2 without -L: load the metaslab and print its
+ *     in-memory statistics;
+ *   - dump_opt['m'] > 1 with a space map and the spacemap-histogram feature
+ *     active: print the on-disk histogram;
+ *   - always: the space map itself via dump_spacemap();
+ *   - log-spacemap feature active: the metaslab's unflushed txg.
+ */
+static void
+dump_metaslab(metaslab_t *msp)
+{
+	char free_str[32];
+	space_map_t *ms_sm = msp->ms_sm;
+	vdev_t *owner = msp->ms_group->mg_vd;
+	spa_t *pool = owner->vdev_spa;
+
+	zdb_nicenum(msp->ms_size - space_map_allocated(ms_sm), free_str,
+	    sizeof (free_str));
+
+	(void) printf(
+	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
+	    (u_longlong_t)msp->ms_id,
+	    (u_longlong_t)msp->ms_start,
+	    (u_longlong_t)space_map_object(ms_sm),
+	    free_str);
+
+	/* The metaslab is loaded only for the duration of the stats dump. */
+	if (!dump_opt['L'] && dump_opt['m'] > 2) {
+		mutex_enter(&msp->ms_lock);
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
+		dump_metaslab_stats(msp);
+		metaslab_unload(msp);
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (dump_opt['m'] > 1 && ms_sm != NULL) {
+		if (spa_feature_is_active(pool,
+		    SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+			/*
+			 * The space map histogram represents free space in
+			 * chunks of sm_shift (i.e. bucket 0 refers to
+			 * 2^sm_shift).
+			 */
+			(void) printf(
+			    "\tOn-disk histogram:\t\tfragmentation %llu\n",
+			    (u_longlong_t)msp->ms_fragmentation);
+			dump_histogram(ms_sm->sm_phys->smp_histogram,
+			    SPACE_MAP_HISTOGRAM_SIZE, ms_sm->sm_shift);
+		}
+	}
+
+	ASSERT(msp->ms_size == (1ULL << owner->vdev_ms_shift));
+	dump_spacemap(pool->spa_meta_objset, msp->ms_sm);
+
+	if (!spa_feature_is_active(pool, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
+	    (u_longlong_t)metaslab_unflushed_txg(msp));
+}
+
+/*
+ * Print the header that precedes a vdev's metaslab listing: the vdev id,
+ * its allocation-bias name (if any), the object stored under
+ * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS in the top ZAP (if present), the
+ * metaslab count and the column titles.
+ */
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+	const char *bias_name = "";
+	uint64_t unflushed_obj = 0;
+
+	/* A log vdev is labelled as such regardless of its bias value. */
+	if (vd->vdev_islog) {
+		bias_name = VDEV_ALLOC_BIAS_LOG;
+	} else {
+		switch (vd->vdev_alloc_bias) {
+		case VDEV_BIAS_LOG:
+			bias_name = VDEV_ALLOC_BIAS_LOG;
+			break;
+		case VDEV_BIAS_SPECIAL:
+			bias_name = VDEV_ALLOC_BIAS_SPECIAL;
+			break;
+		case VDEV_BIAS_DEDUP:
+			bias_name = VDEV_ALLOC_BIAS_DEDUP;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (vd->vdev_top_zap != 0) {
+		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
+		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+		    sizeof (uint64_t), 1, &unflushed_obj);
+		/* A missing entry is fine; any other failure is not. */
+		if (error != ENOENT) {
+			ASSERT0(error);
+		}
+	}
+
+	(void) printf("\tvdev %10llu   %s",
+	    (u_longlong_t)vd->vdev_id, bias_name);
+
+	if (unflushed_obj != 0) {
+		(void) printf("   ms_unflushed_phys object %llu",
+		    (u_longlong_t)unflushed_obj);
+	}
+
+	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
+	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+	    "offset", "spacemap", "free");
+	(void) printf("\t%15s   %19s   %15s   %12s\n",
+	    "---------------", "-------------------",
+	    "---------------", "------------");
+}
+
+/*
+ * For every child of the root vdev whose metaslab group belongs to the
+ * pool's normal class, print the vdev id, metaslab count, fragmentation
+ * and group histogram; then print the class-wide fragmentation and
+ * histogram for the pool.
+ */
+static void
+dump_metaslab_groups(spa_t *spa)
+{
+	metaslab_class_t *normal = spa_normal_class(spa);
+	vdev_t *root = spa->spa_root_vdev;
+
+	metaslab_class_histogram_verify(normal);
+
+	for (unsigned idx = 0; idx < root->vdev_children; idx++) {
+		vdev_t *top = root->vdev_child[idx];
+		metaslab_group_t *grp = top->vdev_mg;
+
+		/* Skip vdevs with no group or a group in another class. */
+		if (grp == NULL || grp->mg_class != normal)
+			continue;
+
+		metaslab_group_histogram_verify(grp);
+		grp->mg_fragmentation = metaslab_group_fragmentation(grp);
+
+		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
+		    "fragmentation",
+		    (u_longlong_t)top->vdev_id,
+		    (u_longlong_t)top->vdev_ms_count);
+		if (grp->mg_fragmentation != ZFS_FRAG_INVALID) {
+			(void) printf("%3llu%%\n",
+			    (u_longlong_t)grp->mg_fragmentation);
+		} else {
+			(void) printf("%3s\n", "-");
+		}
+		dump_histogram(grp->mg_histogram,
+		    RANGE_TREE_HISTOGRAM_SIZE, 0);
+	}
+
+	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
+
+	uint64_t class_frag = metaslab_class_fragmentation(normal);
+	if (class_frag != ZFS_FRAG_INVALID)
+		(void) printf("\t%3llu%%\n", (u_longlong_t)class_frag);
+	else
+		(void) printf("\t%3s\n", "-");
+
+	dump_histogram(normal->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+/*
+ * Print the indirect-vdev state of 'vd'.  Does nothing when the vdev has no
+ * indirect mapping.
+ *
+ * Always printed: the indirect births object with each (offset -> txg)
+ * entry, and the summary fields of the indirect mapping object.
+ *
+ * Only when dump_opt['d'] > 5 or dump_opt['m'] > 3: every mapping entry
+ * together with its obsolete count, and the obsolete space map if the vdev
+ * has one.
+ */
+static void
+print_vdev_indirect(vdev_t *vd)
+{
+	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+	if (vim == NULL) {
+		ASSERT3P(vib, ==, NULL);
+		return;
+	}
+
+	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
+	    vic->vic_mapping_object);
+	ASSERT3U(vdev_indirect_births_object(vib), ==,
+	    vic->vic_births_object);
+
+	/*
+	 * All values below are printed with unsigned conversions (%llu,
+	 * %llx), so they are cast to u_longlong_t to match.
+	 */
+	(void) printf("indirect births obj %llu:\n",
+	    (u_longlong_t)vic->vic_births_object);
+	(void) printf("    vib_count = %llu\n",
+	    (u_longlong_t)vdev_indirect_births_count(vib));
+	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
+		vdev_indirect_birth_entry_phys_t *cur_vibe =
+		    &vib->vib_entries[i];
+		(void) printf("\toffset %llx -> txg %llu\n",
+		    (u_longlong_t)cur_vibe->vibe_offset,
+		    (u_longlong_t)cur_vibe->vibe_phys_birth_txg);
+	}
+	(void) printf("\n");
+
+	(void) printf("indirect mapping obj %llu:\n",
+	    (u_longlong_t)vic->vic_mapping_object);
+	(void) printf("    vim_max_offset = 0x%llx\n",
+	    (u_longlong_t)vdev_indirect_mapping_max_offset(vim));
+	(void) printf("    vim_bytes_mapped = 0x%llx\n",
+	    (u_longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
+	(void) printf("    vim_count = %llu\n",
+	    (u_longlong_t)vdev_indirect_mapping_num_entries(vim));
+
+	/* The per-entry dump is only produced at high verbosity. */
+	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
+		return;
+
+	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+
+	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+		vdev_indirect_mapping_entry_phys_t *vimep =
+		    &vim->vim_entries[i];
+		(void) printf("\t<%llx:%llx:%llx> -> "
+		    "<%llx:%llx:%llx> (%x obsolete)\n",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+		    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+		    (u_longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
+		    (u_longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
+		    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+		    counts[i]);
+	}
+	(void) printf("\n");
+
+	/*
+	 * The counts array is allocated by the load call above and is not
+	 * used past this point; release it so it is not leaked once per
+	 * indirect vdev.
+	 */
+	vdev_indirect_mapping_free_obsolete_counts(vim, counts);
+
+	uint64_t obsolete_sm_object;
+	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+	if (obsolete_sm_object != 0) {
+		objset_t *mos = vd->vdev_spa->spa_meta_objset;
+		(void) printf("obsolete space map object %llu:\n",
+		    (u_longlong_t)obsolete_sm_object);
+		ASSERT(vd->vdev_obsolete_sm != NULL);
+		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
+		    obsolete_sm_object);
+		dump_spacemap(mos, vd->vdev_obsolete_sm);
+		(void) printf("\n");
+	}
+}
+
+/*
+ * Print the "Metaslabs:" section for the pool.
+ *
+ * With no metaslab arguments (or when -d is also given) every child of the
+ * root vdev is dumped: header, indirect state, then each metaslab.
+ *
+ * Otherwise zopt_metaslab[0] selects a single top-level vdev.  If further
+ * arguments are present they are metaslab numbers within that vdev and only
+ * those metaslabs are dumped; out-of-range numbers are reported on stderr
+ * and skipped.  An out-of-range vdev id is fatal.
+ */
+static void
+dump_metaslabs(spa_t *spa)
+{
+	vdev_t *vd, *rvd = spa->spa_root_vdev;
+	/* [c, children) is the range of top-level vdevs the loop below walks */
+	uint64_t m, c = 0, children = rvd->vdev_children;
+
+	(void) printf("\nMetaslabs:\n");
+
+	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
+		c = zopt_metaslab[0];
+
+		if (c >= children)
+			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
+
+		if (zopt_metaslab_args > 1) {
+			vd = rvd->vdev_child[c];
+			print_vdev_metaslab_header(vd);
+
+			/* Arguments after the first are metaslab numbers. */
+			for (m = 1; m < zopt_metaslab_args; m++) {
+				if (zopt_metaslab[m] < vd->vdev_ms_count)
+					dump_metaslab(
+					    vd->vdev_ms[zopt_metaslab[m]]);
+				else
+					(void) fprintf(stderr, "bad metaslab "
+					    "number %llu\n",
+					    (u_longlong_t)zopt_metaslab[m]);
+			}
+			(void) printf("\n");
+			return;
+		}
+		/* Only a vdev was named: narrow the walk to that one vdev. */
+		children = c + 1;
+	}
+	for (; c < children; c++) {
+		vd = rvd->vdev_child[c];
+		print_vdev_metaslab_header(vd);
+
+		print_vdev_indirect(vd);
+
+		for (m = 0; m < vd->vdev_ms_count; m++)
+			dump_metaslab(vd->vdev_ms[m]);
+		(void) printf("\n");
+	}
+}
+
+/*
+ * If the log-spacemap feature is active, open and dump every log space map
+ * recorded in the pool's spa_sm_logs_by_txg tree, in tree order.
+ */
+static void
+dump_log_spacemaps(spa_t *spa)
+{
+	spa_log_sm_t *entry;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	(void) printf("\nLog Space Maps in Pool:\n");
+
+	entry = avl_first(&spa->spa_sm_logs_by_txg);
+	while (entry != NULL) {
+		space_map_t *log_sm = NULL;
+
+		VERIFY0(space_map_open(&log_sm, spa_meta_objset(spa),
+		    entry->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+		(void) printf("Log Spacemap object %llu txg %llu\n",
+		    (u_longlong_t)entry->sls_sm_obj,
+		    (u_longlong_t)entry->sls_txg);
+		dump_spacemap(spa->spa_meta_objset, log_sm);
+		space_map_close(log_sm);
+
+		entry = AVL_NEXT(&spa->spa_sm_logs_by_txg, entry);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print one line for each physical slot of DDT entry 'dde' whose birth txg
+ * is non-zero: the walk index, the slot's refcount, the slot's type name
+ * and the block pointer reconstructed from the entry's key.
+ */
+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const char *slot_names[4] = { "ditto", "single", "double", "triple" };
+	const ddt_key_t *key = &dde->dde_key;
+	char bpstr[BP_SPRINTF_LEN];
+	blkptr_t bp;
+
+	for (int slot = 0; slot < DDT_PHYS_TYPES; slot++) {
+		const ddt_phys_t *phys = &dde->dde_phys[slot];
+
+		/* A zero birth txg marks an unused slot. */
+		if (phys->ddp_phys_birth == 0)
+			continue;
+
+		ddt_bp_create(ddt->ddt_checksum, key, phys, &bp);
+		snprintf_blkptr(bpstr, sizeof (bpstr), &bp);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index,
+		    (u_longlong_t)phys->ddp_refcnt,
+		    slot_names[slot], bpstr);
+	}
+}
+
+/*
+ * Print the dedup, compress and copies ratios derived from 'dds', plus the
+ * combined figure dedup * compress / copies.  Prints nothing when the
+ * statistics cover zero blocks.
+ */
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	if (dds->dds_blocks == 0)
+		return;
+
+	double ref_lsize = (double)dds->dds_ref_lsize;
+	double ref_psize = (double)dds->dds_ref_psize;
+	double ref_dsize = (double)dds->dds_ref_dsize;
+	double dsize = (double)dds->dds_dsize;
+
+	double dedup = ref_dsize / dsize;
+	double compress = ref_lsize / ref_psize;
+	double copies = ref_dsize / ref_psize;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+/*
+ * Dump one DDT object, identified by (type, class) within 'ddt'.
+ *
+ * Nothing is printed if the object does not exist or holds no entries.
+ * Otherwise a summary line is printed; dump_opt['D'] >= 3 adds the
+ * histogram, and dump_opt['D'] >= 4 adds the individual entries (the
+ * DDT_CLASS_UNIQUE class requires dump_opt['D'] >= 5).
+ */
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char ddt_name[DDT_NAMELEN];
+	dmu_object_info_t info;
+	ddt_entry_t entry;
+	uint64_t cursor = 0;
+	uint64_t nentries, disk_bytes, core_bytes;
+	int err;
+
+	err = ddt_object_info(ddt, type, class, &info);
+	if (err == ENOENT)
+		return;
+	ASSERT(err == 0);
+
+	err = ddt_object_count(ddt, type, class, &nentries);
+	ASSERT(err == 0);
+	if (nentries == 0)
+		return;
+
+	disk_bytes = info.doi_physical_blocks_512 << 9;
+	core_bytes = info.doi_fill_count * info.doi_data_block_size;
+
+	ddt_object_name(ddt, type, class, ddt_name);
+
+	/* The sizes reported are per-entry averages. */
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    ddt_name,
+	    (u_longlong_t)nentries,
+	    (u_longlong_t)(disk_bytes / nentries),
+	    (u_longlong_t)(core_bytes / nentries));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4 ||
+	    (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE))
+		return;
+
+	(void) printf("%s contents:\n\n", ddt_name);
+
+	for (;;) {
+		err = ddt_object_walk(ddt, type, class, &cursor, &entry);
+		if (err != 0)
+			break;
+		dump_dde(ddt, &entry, cursor);
+	}
+
+	/* The walk is expected to end with ENOENT. */
+	ASSERT3U(err, ==, ENOENT);
+
+	(void) printf("\n");
+}
+
+/*
+ * Dump every DDT object in the pool (all checksum functions, types and
+ * classes), then the pool-wide dedup statistics: the aggregated histogram
+ * when dump_opt['D'] > 1, followed by the dedup ratios.  If the pool-wide
+ * statistics cover zero blocks, only "All DDTs are empty" is printed.
+ */
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t total_hist;
+	ddt_stat_t total_stat;
+
+	(void) memset(&total_hist, 0, sizeof (total_hist));
+	(void) memset(&total_stat, 0, sizeof (total_stat));
+
+	for (enum zio_checksum cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS;
+	    cksum++) {
+		ddt_t *table = spa->spa_ddt[cksum];
+
+		for (enum ddt_type t = 0; t < DDT_TYPES; t++) {
+			for (enum ddt_class cl = 0; cl < DDT_CLASSES; cl++)
+				dump_ddt(table, t, cl);
+		}
+	}
+
+	ddt_get_dedup_stats(spa, &total_stat);
+
+	if (total_stat.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		ddt_get_dedup_histogram(spa, &total_hist);
+		zpool_dump_ddt(&total_stat, &total_hist);
+	}
+
+	dump_dedup_ratio(&total_stat);
+}
+
+/*
+ * Callback passed to range_tree_walk(): print one segment as the half-open
+ * interval [start, start + size) with its length.  'arg' is the string
+ * printed in front of each line.
+ */
+static void
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
+{
+	const char *label = arg;
+	uint64_t end = start + size;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    label, (u_longlong_t)start, (u_longlong_t)end,
+	    (u_longlong_t)size);
+}
+
+/*
+ * Recursively print the dirty time logs of 'vd' and its children.
+ *
+ * 'indent' is the current indentation in columns; a value of 0 marks the
+ * outermost call and causes the section title to be printed.  Each vdev is
+ * labelled by its path, else (if it has a parent) its vdev type, else the
+ * pool name, and tagged DTL-required or DTL-expendable.  Every non-empty
+ * DTL range tree is then listed segment by segment; for leaf vdevs at
+ * dump_opt['d'] > 5 the DTL space map is dumped as well.
+ */
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+	const char *dtl_names[DTL_TYPES] = { "missing", "partial", "scrub",
+		"outage" };
+	spa_t *pool = vd->vdev_spa;
+	char line_prefix[256];
+	const char *label;
+	boolean_t needed;
+
+	/* vdev_dtl_required() is called with the vdev state held. */
+	spa_vdev_state_enter(pool, SCL_NONE);
+	needed = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(pool, NULL, 0);
+
+	if (indent == 0)
+		(void) printf("\nDirty time logs:\n\n");
+
+	if (vd->vdev_path)
+		label = vd->vdev_path;
+	else if (vd->vdev_parent)
+		label = vd->vdev_ops->vdev_op_type;
+	else
+		label = spa_name(pool);
+
+	(void) printf("\t%*s%s [%s]\n", indent, "", label,
+	    needed ? "DTL-required" : "DTL-expendable");
+
+	for (int type = 0; type < DTL_TYPES; type++) {
+		range_tree_t *tree = vd->vdev_dtl[type];
+
+		if (range_tree_space(tree) != 0) {
+			(void) snprintf(line_prefix, sizeof (line_prefix),
+			    "\t%*s%s", indent + 2, "", dtl_names[type]);
+			range_tree_walk(tree, dump_dtl_seg, line_prefix);
+			if (dump_opt['d'] > 5 && vd->vdev_children == 0) {
+				dump_spacemap(pool->spa_meta_objset,
+				    vd->vdev_dtl_sm);
+			}
+		}
+	}
+
+	for (unsigned child = 0; child < vd->vdev_children; child++)
+		dump_dtl(vd->vdev_child[child], indent + 4);
+}
+
+/*
+ * Print the pool's command history.
+ *
+ * The on-disk history is read in SPA_OLD_MAXBLOCKSIZE chunks and unpacked
+ * into an array of nvlists.  Each record that carries a timestamp and
+ * either a command string or a recognised legacy internal event is printed
+ * as "<time> <text>".  With dump_opt['h'] > 1 every record's nvlist is
+ * dumped too, and records that could not be printed are flagged as
+ * unrecognized.
+ *
+ * Allocation failure of the I/O buffer or a read error is reported on
+ * stderr and ends the dump.
+ */
+static void
+dump_history(spa_t *spa)
+{
+	nvlist_t **events = NULL;
+	char *buf;
+	uint64_t resid, len, off = 0;
+	uint_t num = 0;
+	int error;
+	time_t tsec;
+	struct tm t;
+	char tbuf[30];
+	char internalstr[MAXPATHLEN];
+
+	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
+		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
+		    __func__);
+		return;
+	}
+
+	do {
+		len = SPA_OLD_MAXBLOCKSIZE;
+
+		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+			(void) fprintf(stderr, "Unable to read history: "
+			    "error %d\n", error);
+			/* Records unpacked so far must still be released. */
+			goto out;
+		}
+
+		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+			break;
+
+		/* Re-read any partial trailing record on the next pass. */
+		off -= resid;
+	} while (len != 0);
+
+	(void) printf("\nHistory:\n");
+	for (unsigned i = 0; i < num; i++) {
+		uint64_t time, txg, ievent;
+		char *cmd, *intstr;
+		boolean_t printed = B_FALSE;
+
+		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+		    &time) != 0)
+			goto next;
+		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+		    &cmd) != 0) {
+			/* Not a command record: try a legacy internal event */
+			if (nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+				goto next;
+			verify(nvlist_lookup_uint64(events[i],
+			    ZPOOL_HIST_TXG, &txg) == 0);
+			verify(nvlist_lookup_string(events[i],
+			    ZPOOL_HIST_INT_STR, &intstr) == 0);
+			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
+				goto next;
+
+			(void) snprintf(internalstr,
+			    sizeof (internalstr),
+			    "[internal %s txg:%lld] %s",
+			    zfs_history_event_names[ievent],
+			    (longlong_t)txg, intstr);
+			cmd = internalstr;
+		}
+		tsec = time;
+		(void) localtime_r(&tsec, &t);
+		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		(void) printf("%s %s\n", tbuf, cmd);
+		printed = B_TRUE;
+
+next:
+		if (dump_opt['h'] > 1) {
+			if (!printed)
+				(void) printf("unrecognized record:\n");
+			dump_nvlist(events[i], 2);
+		}
+	}
+
+out:
+	/*
+	 * The record array and the nvlists in it were allocated on our
+	 * behalf by zpool_history_unpack(); release them so neither the
+	 * normal path nor the read-error path leaks them.  free(NULL) is a
+	 * no-op when nothing was ever unpacked.
+	 */
+	for (uint_t i = 0; i < num; i++)
+		nvlist_free(events[i]);
+	free(events);
+	free(buf);
+}
+
+/*
+ * Dump routine that intentionally prints nothing; all four arguments are
+ * ignored.
+ * NOTE(review): presumably installed as the no-op entry in a table of
+ * per-object-type dump callbacks - confirm against the caller.
+ */
+/*ARGSUSED*/
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Convert the block id in bookmark 'zb' into an offset.
+ *
+ * Without a dnode ('dnp' == NULL) the bookmark level must be negative; the
+ * result is the block id itself for object 0, otherwise the block id
+ * multiplied by the logical size of 'bp'.
+ *
+ * With a dnode the block id is first scaled up by the number of block
+ * pointers per indirect block for each level above 0, then multiplied by
+ * the dnode's data block size (dn_datablkszsec << SPA_MINBLOCKSHIFT).
+ */
+static uint64_t
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_phys_t *zb)
+{
+	if (dnp != NULL) {
+		int64_t level_shift;
+		uint64_t l0_blkid;
+
+		ASSERT(zb->zb_level >= 0);
+
+		level_shift = zb->zb_level *
+		    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+		l0_blkid = zb->zb_blkid << level_shift;
+
+		return ((l0_blkid * dnp->dn_datablkszsec) <<
+		    SPA_MINBLOCKSHIFT);
+	}
+
+	ASSERT(zb->zb_level < 0);
+
+	return (zb->zb_object == 0 ? zb->zb_blkid :
+	    zb->zb_blkid * BP_GET_LSIZE(bp));
+}
+
+/*
+ * Append the ZSTD header fields (compressed size, version, level) of the
+ * block referenced by 'bp' to the NUL-terminated string already in
+ * 'blkbuf', whose total capacity is 'buflen'.
+ *
+ * Nothing is appended if the block is not ZSTD-compressed or is a hole.
+ * For an embedded block pointer the payload is decoded from the pointer
+ * itself and the text ends in ":EMBEDDED"; otherwise the block is read
+ * without decompression and the text ends in ":NORMAL".  A failed read is
+ * reported on stderr and leaves 'blkbuf' untouched.  Running out of memory
+ * in the embedded case terminates the program.
+ */
+static void
+snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
+    const blkptr_t *bp)
+{
+	abd_t *pabd;
+	void *buf;
+	zio_t *zio;
+	zfs_zstdhdr_t zstd_hdr;
+	int error;
+
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
+		return;
+
+	if (BP_IS_HOLE(bp))
+		return;
+
+	if (BP_IS_EMBEDDED(bp)) {
+		buf = malloc(SPA_MAXBLOCKSIZE);
+		if (buf == NULL) {
+			(void) fprintf(stderr, "out of memory\n");
+			exit(1);
+		}
+		decode_embedded_bp_compressed(bp, buf);
+		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+		free(buf);
+		/* Header fields are converted with BE_32() before use. */
+		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
+		    zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+		return;
+	}
+
+	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+	zio = zio_root(spa, NULL, NULL, 0);
+
+	/* Decrypt but don't decompress so we can read the compression header */
+	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
+	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
+	    NULL));
+	error = zio_wait(zio);
+	if (error) {
+		(void) fprintf(stderr, "read failed: %d\n", error);
+		/* The I/O has completed; don't leak the read buffer. */
+		abd_free(pabd);
+		return;
+	}
+	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
+	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+
+	(void) snprintf(blkbuf + strlen(blkbuf),
+	    buflen - strlen(blkbuf),
+	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
+	    zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+
+	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
+	/* Release the buffer allocated above now that the header is copied. */
+	abd_free(pabd);
+}
+
+/*
+ * Format block pointer 'bp' into 'blkbuf' (capacity 'buflen') in a compact
+ * single-line form.
+ *
+ *   - dump_opt['b'] >= 6: defer to the full snprintf_blkptr() output,
+ *     appending " FREE" when 'bp_freed' is set.
+ *   - embedded block pointer: embedded type, logical/physical size, birth.
+ *   - otherwise: one "vdev:offset:asize" triple per DVA (all DVAs when
+ *     dump_opt['d'] > 5, else just the first), followed by size and birth
+ *     for a hole, or sizes, fill, births, an optional " FREE" and the
+ *     checksum for a regular block.
+ *
+ * Every write is bounded by 'buflen'.
+ */
+static void
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
+    boolean_t bp_freed)
+{
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+	int i;
+
+	if (dump_opt['b'] >= 6) {
+		snprintf_blkptr(blkbuf, buflen, bp);
+		if (bp_freed) {
+			(void) snprintf(blkbuf + strlen(blkbuf),
+			    buflen - strlen(blkbuf), " %s", "FREE");
+		}
+		return;
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		/*
+		 * Use snprintf() so the caller-supplied length is honoured
+		 * like everywhere else in this function, and pass an
+		 * unsigned value to match the %u conversion.
+		 */
+		(void) snprintf(blkbuf, buflen,
+		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
+		    (unsigned)BPE_GET_ETYPE(bp),
+		    (u_longlong_t)BPE_GET_LSIZE(bp),
+		    (u_longlong_t)BPE_GET_PSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+		return;
+	}
+
+	/* Start from an empty string; everything below appends to it. */
+	blkbuf[0] = '\0';
+
+	for (i = 0; i < ndvas; i++)
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+	if (BP_IS_HOLE(bp)) {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL B=%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+	} else {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL/%llxP F=%llu B=%llu/%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)BP_GET_PSIZE(bp),
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    (u_longlong_t)bp->blk_birth,
+		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+		if (bp_freed)
+			(void) snprintf(blkbuf + strlen(blkbuf),
+			    buflen - strlen(blkbuf), " %s", "FREE");
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx",
+		    (u_longlong_t)bp->blk_cksum.zc_word[0],
+		    (u_longlong_t)bp->blk_cksum.zc_word[1],
+		    (u_longlong_t)bp->blk_cksum.zc_word[2],
+		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+	}
+}
+
+/*
+ * Print one line for block pointer 'bp' at bookmark 'zb' of the object
+ * described by 'dnp': the block's offset, an "L<level>" marker placed in
+ * the column matching its level, and the compact block pointer text (with
+ * the ZSTD header appended when -Z is given and the block uses ZSTD).
+ */
+static void
+print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
+    const dnode_phys_t *dnp)
+{
+	char line[BP_SPRINTF_LEN];
+
+	if (!BP_IS_EMBEDDED(bp)) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
+
+	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
+
+	ASSERT(zb->zb_level >= 0);
+
+	/* One column per level from the top of the tree down to -1. */
+	for (int level = dnp->dn_nlevels - 1; level >= -1; level--) {
+		if (level != zb->zb_level)
+			(void) printf(" ");
+		else
+			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
+	}
+
+	snprintf_blkptr_compact(line, sizeof (line), bp, B_FALSE);
+	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
+		snprintf_zstd_header(spa, line, sizeof (line), bp);
+	(void) printf("%s\n", line);
+}
+
+/*
+ * Print block pointer 'bp' (located at bookmark 'zb' of the object
+ * described by 'dnp') and, if it is a non-hole indirect block, read it and
+ * recursively visit every block pointer it contains.
+ *
+ * Block pointers with a zero birth txg are skipped entirely.
+ *
+ * Returns 0 on success, or the first error returned by arc_read() or by a
+ * recursive visit; the remaining children of that block are then skipped.
+ */
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+    blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	int err = 0;
+
+	if (bp->blk_birth == 0)
+		return (0);
+
+	print_indirect(spa, bp, zb, dnp);
+
+	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		int i;
+		blkptr_t *cbp;
+		/* number of block pointers held by this indirect block */
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
+		uint64_t fill = 0;
+		ASSERT(!BP_IS_REDACTED(bp));
+
+		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+		ASSERT(buf->b_data);
+
+		/* recursively visit blocks below this */
+		cbp = buf->b_data;
+		for (i = 0; i < epb; i++, cbp++) {
+			zbookmark_phys_t czb;
+
+			/* Child i sits one level down at blkid * epb + i. */
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			err = visit_indirect(spa, dnp, cbp, &czb);
+			if (err)
+				break;
+			fill += BP_GET_FILL(cbp);
+		}
+		/* The children's fill counts must add up to the parent's. */
+		if (!err)
+			ASSERT3U(fill, ==, BP_GET_FILL(bp));
+		arc_buf_destroy(buf, &buf);
+	}
+
+	return (err);
+}
+
+/*
+ * Print the "Indirect blocks:" section for dnode 'dn' by visiting the block
+ * tree below each of the dnode's top-level block pointers.  Errors from the
+ * individual visits are ignored.
+ */
+/*ARGSUSED*/
+static void
+dump_indirect(dnode_t *dn)
+{
+	dnode_phys_t *phys = dn->dn_phys;
+	zbookmark_phys_t top;
+
+	(void) printf("Indirect blocks:\n");
+
+	/* Top-level pointers live at the dnode's highest level. */
+	SET_BOOKMARK(&top, dmu_objset_id(dn->dn_objset),
+	    dn->dn_object, phys->dn_nlevels - 1, 0);
+
+	for (int slot = 0; slot < phys->dn_nblkptr; slot++) {
+		top.zb_blkid = slot;
+		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), phys,
+		    &phys->dn_blkptr[slot], &top);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print the fields of a dsl_dir_phys_t found in 'data' ('size' bytes, which
+ * must be at least sizeof (dsl_dir_phys_t)).  Byte counts are rendered with
+ * zdb_nicenum().  Does nothing when 'data' is NULL.  'os' and 'object' are
+ * unused.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dir_phys_t *dd = data;
+	time_t crtime;
+	char nice[32];
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
+
+	if (dd == NULL)
+		return;
+
+	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
+
+	crtime = dd->dd_creation_time;
+	/* No "\n" in the format: the line break is expected from ctime(). */
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\thead_dataset_obj = %llu\n",
+	    (u_longlong_t)dd->dd_head_dataset_obj);
+	(void) printf("\t\tparent_dir_obj = %llu\n",
+	    (u_longlong_t)dd->dd_parent_obj);
+	(void) printf("\t\torigin_obj = %llu\n",
+	    (u_longlong_t)dd->dd_origin_obj);
+	(void) printf("\t\tchild_dir_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_child_dir_zapobj);
+	/* 'nice' is reused as scratch space for each formatted byte count. */
+	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
+	(void) printf("\t\tused_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
+	(void) printf("\t\tcompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
+	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
+	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
+	(void) printf("\t\tquota = %s\n", nice);
+	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
+	(void) printf("\t\treserved = %s\n", nice);
+	(void) printf("\t\tprops_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_props_zapobj);
+	(void) printf("\t\tdeleg_zapobj = %llu\n",
+	    (u_longlong_t)dd->dd_deleg_zapobj);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)dd->dd_flags);
+
+/* Print dd_used_breakdown[DD_USED_<which>], labelled with <which>. */
+#define	DO(which) \
+	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
+	    sizeof (nice)); \
+	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
+	DO(HEAD);
+	DO(SNAP);
+	DO(CHILD);
+	DO(CHILD_RSRV);
+	DO(REFRSRV);
+#undef DO
+	(void) printf("\t\tclones = %llu\n",
+	    (u_longlong_t)dd->dd_clones);
+}
+
+/*
+ * Print the fields of a dsl_dataset_phys_t found in 'data' ('size' bytes,
+ * which must equal sizeof (dsl_dataset_phys_t)).  Byte counts are rendered
+ * with zdb_nicenum() and the dataset's block pointer with
+ * snprintf_blkptr().  Does nothing when 'data' is NULL.  'os' and 'object'
+ * are unused.
+ */
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dsl_dataset_phys_t *ds = data;
+	time_t crtime;
+	char used[32], compressed[32], uncompressed[32], unique[32];
+	char blkbuf[BP_SPRINTF_LEN];
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
+
+	if (ds == NULL)
+		return;
+
+	ASSERT(size == sizeof (*ds));
+	/* Format every derived string up front; printing follows below. */
+	crtime = ds->ds_creation_time;
+	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
+	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
+	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
+	    sizeof (uncompressed));
+	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
+
+	(void) printf("\t\tdir_obj = %llu\n",
+	    (u_longlong_t)ds->ds_dir_obj);
+	(void) printf("\t\tprev_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_obj);
+	(void) printf("\t\tprev_snap_txg = %llu\n",
+	    (u_longlong_t)ds->ds_prev_snap_txg);
+	(void) printf("\t\tnext_snap_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_snap_obj);
+	(void) printf("\t\tsnapnames_zapobj = %llu\n",
+	    (u_longlong_t)ds->ds_snapnames_zapobj);
+	(void) printf("\t\tnum_children = %llu\n",
+	    (u_longlong_t)ds->ds_num_children);
+	(void) printf("\t\tuserrefs_obj = %llu\n",
+	    (u_longlong_t)ds->ds_userrefs_obj);
+	/* No "\n" in the format: the line break is expected from ctime(). */
+	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
+	(void) printf("\t\tcreation_txg = %llu\n",
+	    (u_longlong_t)ds->ds_creation_txg);
+	(void) printf("\t\tdeadlist_obj = %llu\n",
+	    (u_longlong_t)ds->ds_deadlist_obj);
+	/* "used_bytes" is the formatted ds_referenced_bytes value. */
+	(void) printf("\t\tused_bytes = %s\n", used);
+	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
+	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+	(void) printf("\t\tunique = %s\n", unique);
+	(void) printf("\t\tfsid_guid = %llu\n",
+	    (u_longlong_t)ds->ds_fsid_guid);
+	(void) printf("\t\tguid = %llu\n",
+	    (u_longlong_t)ds->ds_guid);
+	(void) printf("\t\tflags = %llx\n",
+	    (u_longlong_t)ds->ds_flags);
+	(void) printf("\t\tnext_clones_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_clones_obj);
+	(void) printf("\t\tprops_obj = %llu\n",
+	    (u_longlong_t)ds->ds_props_obj);
+	(void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/*
+ * bptree_iterate() callback: print the block pointer on its own line unless
+ * its birth txg is zero.  'arg' and 'tx' are unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char bpstr[BP_SPRINTF_LEN];
+
+	if (bp->blk_birth == 0)
+		return (0);
+
+	snprintf_blkptr(bpstr, sizeof (bpstr), bp);
+	(void) printf("\t%s\n", bpstr);
+
+	return (0);
+}
+
+/*
+ * Print a summary of the bptree stored in object 'obj' of objset 'os',
+ * labelled with 'name': the number of datasets (bt_end - bt_begin) and the
+ * byte total.  Requires dump_opt['d'] >= 3; at dump_opt['d'] >= 5 every
+ * block pointer in the tree is printed as well.
+ */
+static void
+dump_bptree(objset_t *os, uint64_t obj, const char *name)
+{
+	char nice_bytes[32];
+	dmu_buf_t *bonus;
+	bptree_phys_t *phys;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (nice_bytes) >= NN_NUMBUF_SZ);
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	/* The bptree header is read from the object's bonus buffer. */
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &bonus));
+	phys = bonus->db_data;
+	zdb_nicenum(phys->bt_bytes, nice_bytes, sizeof (nice_bytes));
+	(void) printf("\n    %s: %llu datasets, %s\n",
+	    name, (unsigned long long)(phys->bt_end - phys->bt_begin),
+	    nice_bytes);
+	dmu_buf_rele(bonus, FTAG);
+
+	if (dump_opt['d'] >= 5) {
+		(void) printf("\n");
+		(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb,
+		    NULL, NULL);
+	}
+}
+
+/*
+ * bpobj iteration callback: print the block pointer in compact form on its
+ * own line, marked as freed when 'bp_freed' is set.  'arg' and 'tx' are
+ * unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+	char compact[BP_SPRINTF_LEN];
+
+	/* Entries handed to this callback must have a birth txg. */
+	ASSERT(bp->blk_birth != 0);
+
+	snprintf_blkptr_compact(compact, sizeof (compact), bp, bp_freed);
+	(void) printf("\t%s\n", compact);
+
+	return (0);
+}
+
+/*
+ * Print a summary of block pointer object 'bpo', labelled 'name', and
+ * recurse into its sub-objects.  'indent' is the nesting depth (0 for the
+ * outermost call) and widens the field in which 'name' is printed by eight
+ * columns per level.
+ *
+ * Requires dump_opt['d'] >= 3.  The summary line comes in four variants,
+ * depending on whether the bpobj has sub-objects and whether it tracks
+ * freed block pointers.  A sub-object that cannot be opened is reported
+ * and skipped.  At dump_opt['d'] >= 5 the outermost call additionally
+ * prints every block pointer via bpobj_iterate_nofree().
+ */
+static void
+dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+	uint64_t i;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
+	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
+		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
+		if (bpo->bpo_havefreed) {
+			(void) printf("    %*s: object %llu, %llu local "
+			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
+			    "%s (%s/%s comp)\n",
+			    indent * 8, name,
+			    (u_longlong_t)bpo->bpo_object,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+			    bytes, comp, uncomp);
+		} else {
+			(void) printf("    %*s: object %llu, %llu local "
+			    "blkptrs, %llu subobjs in object %llu, "
+			    "%s (%s/%s comp)\n",
+			    indent * 8, name,
+			    (u_longlong_t)bpo->bpo_object,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+			    bytes, comp, uncomp);
+		}
+
+		/*
+		 * bpo_subobjs is read as an array of uint64_t object
+		 * numbers, one per sub-bpobj; open and dump each in turn.
+		 */
+		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+			uint64_t subobj;
+			bpobj_t subbpo;
+			int error;
+			VERIFY0(dmu_read(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs,
+			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+			if (error != 0) {
+				(void) printf("ERROR %u while trying to open "
+				    "subobj id %llu\n",
+				    error, (u_longlong_t)subobj);
+				continue;
+			}
+			dump_full_bpobj(&subbpo, "subobj", indent + 1);
+			bpobj_close(&subbpo);
+		}
+	} else {
+		if (bpo->bpo_havefreed) {
+			(void) printf("    %*s: object %llu, %llu blkptrs, "
+			    "%llu freed, %s\n",
+			    indent * 8, name,
+			    (u_longlong_t)bpo->bpo_object,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+			    bytes);
+		} else {
+			(void) printf("    %*s: object %llu, %llu blkptrs, "
+			    "%s\n",
+			    indent * 8, name,
+			    (u_longlong_t)bpo->bpo_object,
+			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+			    bytes);
+		}
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+
+	/* Only the outermost call lists the individual block pointers. */
+	if (indent == 0) {
+		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+		(void) printf("\n");
+	}
+}
+
+/*
+ * Look up the bookmark called name (a string containing '#') and print its
+ * guid, creation txg/time and redaction object.  When print_redact is set
+ * and the bookmark has a redaction object, the redaction progress, snapshot
+ * list and entry count are printed too; print_list additionally dumps every
+ * redaction list entry.  print_list implies print_redact.
+ *
+ * Returns 0 on success, or the error from dsl_bookmark_lookup() or
+ * dmu_read().
+ */
+static int
+dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
+    boolean_t print_list)
+{
+	zfs_bookmark_phys_t bm;
+	objset_t *mos = dp->dp_spa->spa_meta_objset;
+	redaction_list_t *rl;
+	redaction_list_phys_t *phys;
+	redact_block_phys_t *entries;
+	dmu_object_info_t doi;
+	uint64_t bufsize;
+	int err;
+
+	err = dsl_bookmark_lookup(dp, name, NULL, &bm);
+	if (err != 0)
+		return (err);
+
+	(void) printf("\t#%s: ", strchr(name, '#') + 1);
+	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
+	    "%llu redaction_obj: %llu}\n", (u_longlong_t)bm.zbm_guid,
+	    (u_longlong_t)bm.zbm_creation_txg,
+	    (u_longlong_t)bm.zbm_creation_time,
+	    (u_longlong_t)bm.zbm_redaction_obj);
+
+	IMPLY(print_list, print_redact);
+	if (bm.zbm_redaction_obj == 0 || !print_redact)
+		return (0);
+
+	VERIFY0(dsl_redaction_list_hold_obj(dp,
+	    bm.zbm_redaction_obj, FTAG, &rl));
+	phys = rl->rl_phys;
+
+	(void) printf("\tRedacted:\n\t\tProgress: ");
+	if (phys->rlp_last_object == UINT64_MAX &&
+	    phys->rlp_last_blkid == UINT64_MAX) {
+		(void) printf("complete\n");
+	} else {
+		(void) printf("%llu %llu (incomplete)\n",
+		    (u_longlong_t)phys->rlp_last_object,
+		    (u_longlong_t)phys->rlp_last_blkid);
+	}
+
+	(void) printf("\t\tSnapshots: [");
+	for (unsigned int s = 0; s < phys->rlp_num_snaps; s++) {
+		if (s != 0)
+			(void) printf(", ");
+		(void) printf("%0llu",
+		    (u_longlong_t)phys->rlp_snaps[s]);
+	}
+	(void) printf("]\n\t\tLength: %llu\n",
+	    (u_longlong_t)phys->rlp_num_entries);
+
+	/* Nothing further to read: drop the hold and finish. */
+	if (!print_list || phys->rlp_num_entries == 0) {
+		dsl_redaction_list_rele(rl, FTAG);
+		if (print_list)
+			(void) printf("\t\tRedaction List: []\n\n");
+		return (0);
+	}
+
+	/* Read the whole redaction object into one buffer. */
+	VERIFY0(dmu_object_info(mos, bm.zbm_redaction_obj, &doi));
+	bufsize = doi.doi_max_offset;
+	entries = kmem_alloc(bufsize, KM_SLEEP);
+
+	err = dmu_read(mos, bm.zbm_redaction_obj, 0, bufsize,
+	    entries, 0);
+	if (err != 0) {
+		dsl_redaction_list_rele(rl, FTAG);
+		kmem_free(entries, bufsize);
+		return (err);
+	}
+
+	/* The first entry opens the list; later ones are comma-separated. */
+	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
+	    "%llx, blksz: %x, count: %llx}",
+	    (u_longlong_t)entries[0].rbp_object,
+	    (u_longlong_t)entries[0].rbp_blkid,
+	    (uint_t)(redact_block_get_size(&entries[0])),
+	    (u_longlong_t)redact_block_get_count(&entries[0]));
+
+	for (size_t n = 1; n < phys->rlp_num_entries; n++) {
+		redact_block_phys_t *rbp = &entries[n];
+
+		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
+		    "blksz: %x, count: %llx}",
+		    (u_longlong_t)rbp->rbp_object,
+		    (u_longlong_t)rbp->rbp_blkid,
+		    (uint_t)(redact_block_get_size(rbp)),
+		    (u_longlong_t)redact_block_get_count(rbp));
+	}
+	dsl_redaction_list_rele(rl, FTAG);
+	kmem_free(entries, bufsize);
+	(void) printf("]\n\n");
+	return (0);
+}
+
+/*
+ * Walk the bookmarks ZAP of the dataset behind os and dump every bookmark
+ * found.  Does nothing below verbosity 4; redaction details are requested
+ * from verbosity 5 and the full redaction list from verbosity 6.
+ */
+static void
+dump_bookmarks(objset_t *os, int verbosity)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	dsl_dataset_t *ds = dmu_objset_ds(os);
+	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+	objset_t *mos = os->os_spa->spa_meta_objset;
+	boolean_t show_redact = (verbosity >= 5);
+	boolean_t show_list = (verbosity >= 6);
+
+	if (verbosity < 4)
+		return;
+
+	dsl_pool_config_enter(dp, FTAG);
+
+	zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
+	while (zap_cursor_retrieve(&zc, &attr) == 0) {
+		char osname[ZFS_MAX_DATASET_NAME_LEN];
+		char fullname[ZFS_MAX_DATASET_NAME_LEN];
+
+		/* Build "<dataset>#<bookmark>" for the lookup. */
+		dmu_objset_name(os, osname);
+		VERIFY3S(0, <=, snprintf(fullname, sizeof (fullname), "%s#%s",
+		    osname, attr.za_name));
+		(void) dump_bookmark(dp, fullname, show_redact, show_list);
+
+		zap_cursor_advance(&zc);
+	}
+	zap_cursor_fini(&zc);
+
+	dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Mark a bpobj, its subobj array object and, recursively, every sub-bpobj
+ * as referenced via mos_obj_refd().  A sub-bpobj that cannot be opened is
+ * reported on stdout and skipped.
+ */
+static void
+bpobj_count_refd(bpobj_t *bpo)
+{
+	uint64_t idx;
+
+	mos_obj_refd(bpo->bpo_object);
+
+	if (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+		return;
+
+	mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
+
+	for (idx = 0; idx < bpo->bpo_phys->bpo_num_subobjs; idx++) {
+		uint64_t subobj;
+		bpobj_t child;
+		int error;
+
+		VERIFY0(dmu_read(bpo->bpo_os,
+		    bpo->bpo_phys->bpo_subobjs,
+		    idx * sizeof (subobj), sizeof (subobj), &subobj, 0));
+		error = bpobj_open(&child, bpo->bpo_os, subobj);
+		if (error == 0) {
+			bpobj_count_refd(&child);
+			bpobj_close(&child);
+		} else {
+			(void) printf("ERROR %u while trying to open "
+			    "subobj id %llu\n",
+			    error, (u_longlong_t)subobj);
+		}
+	}
+}
+
+/*
+ * dsl_deadlist_iterate() callback; arg is the spa_t.  Counts the MOS objects
+ * referenced by one deadlist entry's bpobj, skipping entries that point at
+ * the pool's dp_empty_bpobj.  Always returns 0.
+ */
+static int
+dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
+{
+	spa_t *spa = arg;
+
+	if (dle->dle_bpobj.bpo_object == spa->spa_dsl_pool->dp_empty_bpobj)
+		return (0);
+
+	bpobj_count_refd(&dle->dle_bpobj);
+	return (0);
+}
+
+/*
+ * dsl_deadlist_iterate() callback that prints one deadlist entry; arg must
+ * be NULL.  At -d verbosity 5 and above the entry's bpobj is dumped in full
+ * with the "mintxg -> obj" text as its label; otherwise only that text is
+ * printed.  Always returns 0.
+ */
+static int
+dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
+{
+	ASSERT(arg == NULL);
+	if (dump_opt['d'] >= 5) {
+		char buf[128];
+		/*
+		 * %llu takes an unsigned argument, so cast to u_longlong_t
+		 * as the other printf calls in this file do.
+		 */
+		(void) snprintf(buf, sizeof (buf),
+		    "mintxg %llu -> obj %llu",
+		    (u_longlong_t)dle->dle_mintxg,
+		    (u_longlong_t)dle->dle_bpobj.bpo_object);
+
+		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+	} else {
+		(void) printf("mintxg %llu -> obj %llu\n",
+		    (u_longlong_t)dle->dle_mintxg,
+		    (u_longlong_t)dle->dle_bpobj.bpo_object);
+	}
+	return (0);
+}
+
+/*
+ * Count the MOS objects referenced by a deadlist (or livelist) and, from -d
+ * verbosity 3 upward, print a summary of it under the heading name.  An
+ * old-format deadlist is dumped as a single bpobj.  From verbosity 4 each
+ * entry of a new-format list is printed as well.
+ */
+static void
+dump_blkptr_list(dsl_deadlist_t *dl, char *name)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+	char entries[32];
+	spa_t *spa = dmu_objset_spa(dl->dl_os);
+	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
+
+	/* Reference counting runs before any verbosity check. */
+	if (!dl->dl_oldfmt) {
+		mos_obj_refd(dl->dl_object);
+		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
+	} else if (dl->dl_bpobj.bpo_object != empty_bpobj) {
+		bpobj_count_refd(&dl->dl_bpobj);
+	}
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	if (dl->dl_oldfmt) {
+		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+		return;
+	}
+
+	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
+	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
+	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
+	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
+	    name, bytes, comp, uncomp, entries);
+
+	if (dump_opt['d'] >= 4) {
+		(void) printf("\n");
+		dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
+	}
+}
+
+/*
+ * Cross-check the space totals of a dataset's livelist against the space
+ * written since its origin.  Returns 0 when no livelist is open or when the
+ * figures agree, and 1 (after printing both sets of figures) when they do
+ * not.  Must not be called on a snapshot.
+ */
+static int
+verify_dd_livelist(objset_t *os)
+{
+	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
+	char nice_used[32], nice_comp[32], nice_uncomp[32];
+	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+	dsl_dir_t  *dd = os->os_dsl_dataset->ds_dir;
+	dsl_dataset_t *origin_ds;
+
+	ASSERT(!dmu_objset_is_snapshot(os));
+	if (!dsl_deadlist_is_open(&dd->dd_livelist))
+		return (0);
+
+	/* Iterate through the livelist to check for duplicates */
+	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
+	    NULL);
+
+	dsl_pool_config_enter(dp, FTAG);
+	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
+	    &ll_comp, &ll_uncomp);
+
+	ASSERT(dsl_pool_config_held(dp));
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
+	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
+	    &used, &comp, &uncomp));
+	dsl_dataset_rele(origin_ds, FTAG);
+	dsl_pool_config_exit(dp, FTAG);
+
+	/*
+	 *  It's possible that the dataset's uncomp space is larger than the
+	 *  livelist's because livelists do not track embedded block pointers
+	 */
+	if (used == ll_used && comp == ll_comp && uncomp >= ll_uncomp)
+		return (0);
+
+	(void) printf("Discrepancy in space accounting:\n");
+
+	zdb_nicenum(used, nice_used, sizeof (nice_used));
+	zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
+	zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
+	(void) printf("dir: used %s, comp %s, uncomp %s\n",
+	    nice_used, nice_comp, nice_uncomp);
+
+	zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
+	zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
+	zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
+	(void) printf("livelist: used %s, comp %s, uncomp %s\n",
+	    nice_used, nice_comp, nice_uncomp);
+
+	return (1);
+}
+
+/*
+ * File-scope state used by the objset open/close and znode dump routines.
+ */
+/* FUID trees, created and loaded on first use by dump_uidgid(). */
+static avl_tree_t idx_tree;
+static avl_tree_t domain_tree;
+/* B_TRUE while idx_tree and domain_tree hold a loaded FUID table. */
+static boolean_t fuid_table_loaded;
+/* Objset recorded by open_objset() and cleared by close_objset(). */
+static objset_t *sa_os = NULL;
+/* Attribute table filled in by sa_setup() from open_objset(). */
+static sa_attr_type_t *sa_attr_table = NULL;
+
+/*
+ * Hold the objset named by path, take a long hold on its dataset and drop
+ * the pool hold.  For an unencrypted ZPL objset the SA attribute table is
+ * set up as well so that znodes can be decoded.
+ *
+ * On success returns 0, stores the objset in *osp and records it in sa_os.
+ * On failure a message is written to stderr and the error is returned; if
+ * sa_setup() was the step that failed, the dataset holds are released and
+ * *osp is set to NULL.
+ */
+static int
+open_objset(const char *path, void *tag, objset_t **osp)
+{
+	int err;
+	uint64_t sa_attrs = 0;
+	uint64_t version = 0;
+
+	VERIFY3P(sa_os, ==, NULL);
+	/*
+	 * We can't own an objset if it's redacted.  Therefore, we do this
+	 * dance: hold the objset, then acquire a long hold on its dataset, then
+	 * release the pool (which is held as part of holding the objset).
+	 */
+	err = dmu_objset_hold(path, tag, osp);
+	if (err != 0) {
+		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
+		    path, strerror(err));
+		return (err);
+	}
+	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
+	dsl_pool_rele(dmu_objset_pool(*osp), tag);
+
+	if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) {
+		/* Lookup failures leave version/sa_attrs at 0. */
+		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+		    8, 1, &version);
+		if (version >= ZPL_VERSION_SA) {
+			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+			    8, 1, &sa_attrs);
+		}
+		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
+		    &sa_attr_table);
+		if (err != 0) {
+			(void) fprintf(stderr, "sa_setup failed: %s\n",
+			    strerror(err));
+			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
+			dsl_dataset_rele(dmu_objset_ds(*osp), tag);
+			*osp = NULL;
+		}
+	}
+	sa_os = *osp;
+
+	/*
+	 * Propagate a sa_setup() failure instead of reporting success with
+	 * *osp == NULL; err is still 0 here on every successful path.
+	 */
+	return (err);
+}
+
+/*
+ * Undo open_objset(): tear down SA state if the objset has any, release the
+ * dataset's long hold and hold, and clear the sa_os / sa_attr_table globals.
+ * os must be the objset currently recorded in sa_os.
+ */
+static void
+close_objset(objset_t *os, void *tag)
+{
+	VERIFY3P(os, ==, sa_os);
+
+	if (os->os_sa != NULL) {
+		sa_tear_down(os);
+	}
+
+	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
+	dsl_dataset_rele(dmu_objset_ds(os), tag);
+
+	sa_os = NULL;
+	sa_attr_table = NULL;
+}
+
+/*
+ * Destroy the FUID trees if they were loaded and mark them as not loaded.
+ */
+static void
+fuid_table_destroy(void)
+{
+	if (!fuid_table_loaded)
+		return;
+
+	zfs_fuid_table_destroy(&idx_tree, &domain_tree);
+	fuid_table_loaded = B_FALSE;
+}
+
+/*
+ * Print one uid or gid line, labelled with id_type.
+ * An id whose FUID index is zero is printed as a plain decimal number.
+ * Otherwise the id is printed in hex, followed by the domain string looked
+ * up in idx_tree and the rid part of the id.
+ */
+static void
+print_idstr(uint64_t id, const char *id_type)
+{
+	char *domain;
+
+	if (FUID_INDEX(id) == 0) {
+		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
+		return;
+	}
+
+	domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
+	(void) printf("\t%s     %llx [%s-%d]\n", id_type,
+	    (u_longlong_t)id, domain, (int)FUID_RID(id));
+}
+
+/*
+ * Print the uid and gid lines of a znode.  If either id carries a non-zero
+ * FUID index and the FUID table has not been loaded yet, it is loaded from
+ * the object named by ZFS_FUID_TABLES in the master node first.
+ */
+static void
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
+{
+	uint32_t uid_idx = FUID_INDEX(uid);
+	uint32_t gid_idx = FUID_INDEX(gid);
+
+	/* Load domain table, if not already loaded */
+	if ((uid_idx != 0 || gid_idx != 0) && !fuid_table_loaded) {
+		uint64_t fuid_obj;
+
+		/* first find the fuid object.  It lives in the master node */
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
+		    8, 1, &fuid_obj) == 0);
+
+		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
+		(void) zfs_fuid_table_load(os, fuid_obj,
+		    &idx_tree, &domain_tree);
+		fuid_table_loaded = B_TRUE;
+	}
+
+	print_idstr(uid, "uid");
+	print_idstr(gid, "gid");
+}
+
+/*
+ * Print the xattrs stored in a znode's ZPL_DXATTR system attribute, which
+ * holds a packed nvlist.  Each pair is printed as "name = value", with
+ * non-printable bytes shown as octal escapes.  Returns silently if the
+ * attribute is absent or empty, or if allocation, lookup or unpacking fails.
+ */
+static void
+dump_znode_sa_xattr(sa_handle_t *hdl)
+{
+	nvlist_t *sa_xattr;
+	nvpair_t *elem = NULL;
+	int sa_xattr_size = 0;
+	int sa_xattr_entries = 0;
+	int error;
+	char *sa_xattr_packed;
+
+	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
+	if (error || sa_xattr_size == 0)
+		return;
+
+	sa_xattr_packed = malloc(sa_xattr_size);
+	if (sa_xattr_packed == NULL)
+		return;
+
+	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
+	    sa_xattr_packed, sa_xattr_size);
+	if (error) {
+		free(sa_xattr_packed);
+		return;
+	}
+
+	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
+	if (error) {
+		free(sa_xattr_packed);
+		return;
+	}
+
+	/* First pass only counts; elem is NULL again when it finishes. */
+	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
+		sa_xattr_entries++;
+
+	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
+	    sa_xattr_size, sa_xattr_entries);
+	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
+		uchar_t *value;
+		uint_t cnt, idx;
+
+		(void) printf("\t\t%s = ", nvpair_name(elem));
+		/*
+		 * value and cnt are only set on success, so a pair of any
+		 * other type must not reach the loop below.
+		 */
+		if (nvpair_value_byte_array(elem, &value, &cnt) != 0) {
+			(void) printf("<not a byte array>\n");
+			continue;
+		}
+		for (idx = 0; idx < cnt; ++idx) {
+			if (isprint(value[idx]))
+				(void) putchar(value[idx]);
+			else
+				(void) printf("\\%3.3o", value[idx]);
+		}
+		(void) putchar('\n');
+	}
+
+	nvlist_free(sa_xattr);
+	free(sa_xattr_packed);
+}
+
+/*
+ * Print the target of a symlink znode, read from its ZPL_SYMLINK system
+ * attribute.  Returns silently if the attribute is absent, empty or cannot
+ * be read; a target too large for the local buffer is reported and skipped.
+ */
+static void
+dump_znode_symlink(sa_handle_t *hdl)
+{
+	int sa_symlink_size = 0;
+	char linktarget[MAXPATHLEN];
+	int error;
+
+	error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
+	if (error || sa_symlink_size == 0) {
+		return;
+	}
+	/*
+	 * Leave room for the terminator; the cast also rejects a negative
+	 * size.
+	 */
+	if ((size_t)sa_symlink_size >= sizeof (linktarget)) {
+		(void) printf("symlink size %d is too large\n",
+		    sa_symlink_size);
+		return;
+	}
+	/*
+	 * sa_lookup() is asked for exactly sa_symlink_size bytes, so the
+	 * terminator is placed here before the buffer is printed as a string.
+	 */
+	linktarget[sa_symlink_size] = '\0';
+	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
+	    linktarget, sa_symlink_size) == 0)
+		(void) printf("\ttarget	%s\n", linktarget);
+}
+
+/*
+ * Object viewer for znodes: fetch the znode's system attributes in one bulk
+ * lookup and print them, followed by the optional projid, xattr and rdev
+ * attributes and any SA xattrs.  At -d verbosity above 4 the object's path
+ * is resolved and printed first.  The data and size arguments are unused.
+ */
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	sa_handle_t *hdl;
+	sa_bulk_attr_t bulk[12];
+	uint64_t uid, gid, links, gen, mode, parent, fsize, pflags;
+	uint64_t xattr, rdev;
+	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+	time_t z_crtime, z_atime, z_mtime, z_ctime;
+	int count = 0;
+
+	VERIFY3P(os, ==, sa_os);
+	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl) != 0) {
+		(void) printf("Failed to get handle for SA znode\n");
+		return;
+	}
+
+	/* Queue up all twelve attributes for a single bulk lookup. */
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_LINKS], NULL,
+	    &links, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_MODE], NULL,
+	    &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_PARENT],
+	    NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_SIZE], NULL,
+	    &fsize, 8);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_ATIME], NULL,
+	    acctm, 16);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_MTIME], NULL,
+	    modtm, 16);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_CRTIME], NULL,
+	    crtm, 16);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_CTIME], NULL,
+	    chgtm, 16);
+	SA_ADD_BULK_ATTR(bulk, count, sa_attr_table[ZPL_FLAGS], NULL,
+	    &pflags, 8);
+
+	if (sa_bulk_lookup(hdl, bulk, count) != 0) {
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
+
+	/* Only the first word of each timestamp pair is printed. */
+	z_atime = (time_t)acctm[0];
+	z_mtime = (time_t)modtm[0];
+	z_ctime = (time_t)chgtm[0];
+	z_crtime = (time_t)crtm[0];
+
+	if (dump_opt['d'] > 4) {
+		switch (zfs_obj_to_path(os, object, path, sizeof (path))) {
+		case 0:
+			break;
+		case ESTALE:
+			(void) snprintf(path, sizeof (path), "on delete queue");
+			break;
+		default:
+			leaked_objects++;
+			(void) snprintf(path, sizeof (path),
+			    "path not found, possibly leaked");
+			break;
+		}
+		(void) printf("\tpath	%s\n", path);
+	}
+
+	if (S_ISLNK(mode))
+		dump_znode_symlink(hdl);
+	dump_uidgid(os, uid, gid);
+
+	(void) printf("\tatime	%s", ctime(&z_atime));
+	(void) printf("\tmtime	%s", ctime(&z_mtime));
+	(void) printf("\tctime	%s", ctime(&z_ctime));
+	(void) printf("\tcrtime	%s", ctime(&z_crtime));
+	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
+	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
+	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
+	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
+	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
+	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
+
+	/* The remaining attributes are printed only when they can be read. */
+	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
+		uint64_t projid;
+
+		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
+		    sizeof (uint64_t)) == 0) {
+			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
+		}
+	}
+	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+	    sizeof (uint64_t)) == 0) {
+		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
+	}
+	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+	    sizeof (uint64_t)) == 0) {
+		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
+	}
+
+	dump_znode_sa_xattr(hdl);
+	sa_handle_destroy(hdl);
+}
+
+/*
+ * Object viewer used for the ACL entries of the object_viewer table.  The
+ * body is empty, so nothing is printed for these objects.
+ */
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Object viewer used for the "DMU objset" entry of the object_viewer table.
+ * The body is empty, so nothing is printed for these objects.
+ */
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Table of type-specific dump routines with DMU_OT_NUMTYPES + 1 slots; the
+ * final slot is dump_unknown.  dump_object() indexes it with ZDB_OT_TYPE()
+ * of an object's bonus type and of its object type.  Entries are positional,
+ * so their order must not change.
+ */
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
+	dump_none,		/* unallocated			*/
+	dump_zap,		/* object directory		*/
+	dump_uint64,		/* object array			*/
+	dump_none,		/* packed nvlist		*/
+	dump_packed_nvlist,	/* packed nvlist size		*/
+	dump_none,		/* bpobj			*/
+	dump_bpobj,		/* bpobj header			*/
+	dump_none,		/* SPA space map header		*/
+	dump_none,		/* SPA space map		*/
+	dump_none,		/* ZIL intent log		*/
+	dump_dnode,		/* DMU dnode			*/
+	dump_dmu_objset,	/* DMU objset			*/
+	dump_dsl_dir,		/* DSL directory		*/
+	dump_zap,		/* DSL directory child map	*/
+	dump_zap,		/* DSL dataset snap map		*/
+	dump_zap,		/* DSL props			*/
+	dump_dsl_dataset,	/* DSL dataset			*/
+	dump_znode,		/* ZFS znode			*/
+	dump_acl,		/* ZFS V0 ACL			*/
+	dump_uint8,		/* ZFS plain file		*/
+	dump_zpldir,		/* ZFS directory		*/
+	dump_zap,		/* ZFS master node		*/
+	dump_zap,		/* ZFS delete queue		*/
+	dump_uint8,		/* zvol object			*/
+	dump_zap,		/* zvol prop			*/
+	dump_uint8,		/* other uint8[]		*/
+	dump_uint64,		/* other uint64[]		*/
+	dump_zap,		/* other ZAP			*/
+	dump_zap,		/* persistent error log		*/
+	dump_uint8,		/* SPA history			*/
+	dump_history_offsets,	/* SPA history offsets		*/
+	dump_zap,		/* Pool properties		*/
+	dump_zap,		/* DSL permissions		*/
+	dump_acl,		/* ZFS ACL			*/
+	dump_uint8,		/* ZFS SYSACL			*/
+	dump_none,		/* FUID nvlist			*/
+	dump_packed_nvlist,	/* FUID nvlist size		*/
+	dump_zap,		/* DSL dataset next clones	*/
+	dump_zap,		/* DSL scrub queue		*/
+	dump_zap,		/* ZFS user/group/project used	*/
+	dump_zap,		/* ZFS user/group/project quota	*/
+	dump_zap,		/* snapshot refcount tags	*/
+	dump_ddt_zap,		/* DDT ZAP object		*/
+	dump_zap,		/* DDT statistics		*/
+	dump_znode,		/* SA object			*/
+	dump_zap,		/* SA Master Node		*/
+	dump_sa_attrs,		/* SA attribute registration	*/
+	dump_sa_layouts,	/* SA attribute layouts		*/
+	dump_zap,		/* DSL scrub translations	*/
+	dump_none,		/* fake dedup BP		*/
+	dump_zap,		/* deadlist			*/
+	dump_none,		/* deadlist hdr			*/
+	dump_zap,		/* dsl clones			*/
+	dump_bpobj_subobjs,	/* bpobj subobjs		*/
+	dump_unknown,		/* Unknown type, must be last	*/
+};
+
+/*
+ * Decide whether an object of type obj_type is selected by the ZOR_FLAG_*
+ * bits in flags.  Directories, plain files, space maps and ZAP objects each
+ * have their own flag; every other type is selected only by the all-types
+ * flag, optionally combined with negated type flags.
+ */
+static boolean_t
+match_object_type(dmu_object_type_t obj_type, uint64_t flags)
+{
+	switch (obj_type) {
+	case DMU_OT_DIRECTORY_CONTENTS:
+		return ((flags & ZOR_FLAG_DIRECTORY) ? B_TRUE : B_FALSE);
+	case DMU_OT_PLAIN_FILE_CONTENTS:
+		return ((flags & ZOR_FLAG_PLAIN_FILE) ? B_TRUE : B_FALSE);
+	case DMU_OT_SPACE_MAP:
+		return ((flags & ZOR_FLAG_SPACE_MAP) ? B_TRUE : B_FALSE);
+	default:
+		break;
+	}
+
+	/* ZAP objects are recognised by their printable type name. */
+	if (strcmp(zdb_ot_name(obj_type), "zap") == 0)
+		return ((flags & ZOR_FLAG_ZAP) ? B_TRUE : B_FALSE);
+
+	/*
+	 * If all bits except some of the supported flags are
+	 * set, the user combined the all-types flag (A) with
+	 * a negated flag to exclude some types (e.g. A-f to
+	 * show all object types except plain files).
+	 */
+	if ((flags | ZOR_SUPPORTED_FLAGS) == ZOR_FLAG_ALL_TYPES)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Print one row of the object table for object in os and, depending on
+ * verbosity, further detail: the bonus type line (above 3), dnode flags and
+ * the type-specific viewers (4 and up), and the indirect blocks and data
+ * segments (5 and up).
+ *
+ * print_header	in/out: when set, the column header is printed first and
+ *		the flag is cleared; it is set again after detailed output.
+ * dnode_slots_used	optional out: doi_dnodesize / DNODE_MIN_SIZE.
+ * flags		ZOR_FLAG_* type filter; 0 or ZOR_FLAG_ALL_TYPES
+ *		selects every object type.
+ *
+ * Failure to get the object info or to hold the object is fatal.
+ */
+static void
+dump_object(objset_t *os, uint64_t object, int verbosity,
+    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
+{
+	dmu_buf_t *db = NULL;
+	dmu_object_info_t doi;
+	dnode_t *dn;
+	boolean_t dnode_held = B_FALSE;
+	void *bonus = NULL;
+	size_t bsize = 0;
+	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
+	char bonus_size[32];
+	char aux[50];
+	int error;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (dnsize) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
+
+	if (*print_header) {
+		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
+		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
+		    "lsize", "%full", "type");
+		*print_header = B_FALSE;
+	}
+
+	if (object == 0) {
+		/* Object 0 is the meta dnode; no hold is taken for it. */
+		dn = DMU_META_DNODE(os);
+		dmu_object_info_from_dnode(dn, &doi);
+	} else {
+		/*
+		 * Encrypted datasets will have sensitive bonus buffers
+		 * encrypted. Therefore we cannot hold the bonus buffer and
+		 * must hold the dnode itself instead.
+		 */
+		error = dmu_object_info(os, object, &doi);
+		if (error)
+			fatal("dmu_object_info() failed, errno %u", error);
+
+		if (os->os_encrypted &&
+		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
+			error = dnode_hold(os, object, FTAG, &dn);
+			if (error)
+				fatal("dnode_hold() failed, errno %u", error);
+			dnode_held = B_TRUE;
+		} else {
+			error = dmu_bonus_hold(os, object, FTAG, &db);
+			if (error)
+				fatal("dmu_bonus_hold(%llu) failed, errno %u",
+				    (u_longlong_t)object, error);
+			bonus = db->db_data;
+			bsize = db->db_size;
+			dn = DB_DNODE((dmu_buf_impl_t *)db);
+		}
+	}
+
+	/*
+	 * Default to showing all object types if no flags were specified.
+	 */
+	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
+	    !match_object_type(doi.doi_type, flags))
+		goto out;
+
+	if (dnode_slots_used)
+		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
+
+	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
+	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
+	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
+	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
+	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
+	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
+	/*
+	 * Bounded write: the value is a double computed from on-disk
+	 * counters, so its printed width is not limited to sizeof (fill).
+	 */
+	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
+	    doi.doi_fill_count * doi.doi_data_block_size /
+	    (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
+
+	/* aux collects the optional " (K=...)" and " (Z=...)" suffixes. */
+	aux[0] = '\0';
+
+	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
+		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
+	}
+
+	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
+	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
+		const char *compname = NULL;
+		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
+		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
+		    &compname) == 0) {
+			(void) snprintf(aux + strlen(aux),
+			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
+			    compname);
+		} else {
+			(void) snprintf(aux + strlen(aux),
+			    sizeof (aux) - strlen(aux),
+			    " (Z=inherit=%s-unknown)",
+			    ZDB_COMPRESS_NAME(os->os_compress));
+		}
+	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
+		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
+	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
+		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
+	}
+
+	(void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
+	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
+
+	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
+		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
+		    "", "", "", "", "", "", bonus_size, "bonus",
+		    zdb_ot_name(doi.doi_bonus_type));
+	}
+
+	if (verbosity >= 4) {
+		(void) printf("\tdnode flags: %s%s%s%s\n",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
+		    "USED_BYTES " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
+		    "USERUSED_ACCOUNTED " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
+		    "USEROBJUSED_ACCOUNTED " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+		    "SPILL_BLKPTR" : "");
+		(void) printf("\tdnode maxblkid: %llu\n",
+		    (u_longlong_t)dn->dn_phys->dn_maxblkid);
+
+		/* The bonus buffer is only available when it was held. */
+		if (!dnode_held) {
+			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
+			    object, bonus, bsize);
+		} else {
+			(void) printf("\t\t(bonus encrypted)\n");
+		}
+
+		if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) {
+			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
+			    NULL, 0);
+		} else {
+			(void) printf("\t\t(object encrypted)\n");
+		}
+
+		*print_header = B_TRUE;
+	}
+
+	if (verbosity >= 5)
+		dump_indirect(dn);
+
+	if (verbosity >= 5) {
+		/*
+		 * Report the list of segments that comprise the object.
+		 */
+		uint64_t start = 0;
+		uint64_t end;
+		uint64_t blkfill = 1;
+		int minlvl = 1;
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			minlvl = 0;
+			blkfill = DNODES_PER_BLOCK;
+		}
+
+		for (;;) {
+			char segsize[32];
+			/* make sure nicenum has enough space */
+			CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
+			/* Find the start of the next segment ... */
+			error = dnode_next_offset(dn,
+			    0, &start, minlvl, blkfill, 0);
+			if (error)
+				break;
+			/* ... and the hole that ends it. */
+			end = start;
+			error = dnode_next_offset(dn,
+			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
+			zdb_nicenum(end - start, segsize, sizeof (segsize));
+			(void) printf("\t\tsegment [%016llx, %016llx)"
+			    " size %5s\n", (u_longlong_t)start,
+			    (u_longlong_t)end, segsize);
+			if (error)
+				break;
+			start = end;
+		}
+	}
+
+out:
+	/* Release whichever hold was taken above (none for object 0). */
+	if (db != NULL)
+		dmu_buf_rele(db, FTAG);
+	if (dnode_held)
+		dnode_rele(dn, FTAG);
+}
+
+/*
+ * Mark the MOS objects belonging to a dsl_dir as referenced: the directory
+ * object itself, its child-dir, delegation and props ZAPs, its clones
+ * object and its crypto object.
+ */
+static void
+count_dir_mos_objects(dsl_dir_t *dd)
+{
+	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
+
+	mos_obj_refd(dd->dd_object);
+	mos_obj_refd(ddp->dd_child_dir_zapobj);
+	mos_obj_refd(ddp->dd_deleg_zapobj);
+	mos_obj_refd(ddp->dd_props_zapobj);
+	mos_obj_refd(ddp->dd_clones);
+
+	/*
+	 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
+	 * Ignore the references after the first one.
+	 */
+	mos_obj_refd_multiple(dd->dd_crypto_obj);
+}
+
+/*
+ * Mark the MOS objects belonging to a dataset as referenced.  For a dataset
+ * that is not a snapshot, the objects of its dsl_dir are counted as well.
+ */
+static void
+count_ds_mos_objects(dsl_dataset_t *ds)
+{
+	dsl_dataset_phys_t *dsp = dsl_dataset_phys(ds);
+
+	mos_obj_refd(ds->ds_object);
+	mos_obj_refd(dsp->ds_next_clones_obj);
+	mos_obj_refd(dsp->ds_props_obj);
+	mos_obj_refd(dsp->ds_userrefs_obj);
+	mos_obj_refd(dsp->ds_snapnames_zapobj);
+	mos_obj_refd(ds->ds_bookmarks_obj);
+
+	if (dsl_dataset_is_snapshot(ds))
+		return;
+
+	count_dir_mos_objects(ds->ds_dir);
+}
+
+/*
+ * Printable names for objset types.  dump_objset() indexes this array with
+ * dds_type after checking that it is below DMU_OST_NUMTYPES.
+ */
+static const char *objset_types[DMU_OST_NUMTYPES] = {
+	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+/*
+ * Parse a string denoting a range of object IDs of the form
+ * <start>[:<end>[:flags]], and store the results in zor.
+ * Return 0 on success. On error, return 1 and update the msg
+ * pointer to point to a descriptive error message.
+ *
+ * A string without a colon sets start and end to the same ID and leaves
+ * zor_flags untouched.  With <start>:<end> and no flags field, zor_flags is
+ * set to ZOR_FLAG_ALL_TYPES.  In the flags field each character is looked
+ * up in flagbits[]; a leading '-' clears that flag's bit instead of setting
+ * it.
+ */
+static int
+parse_object_range(char *range, zopt_object_range_t *zor, char **msg)
+{
+	uint64_t flags = 0;
+	char *p, *s, *dup, *flagstr;
+	size_t len;
+	int i;
+	int rc = 0;
+
+	if (strchr(range, ':') == NULL) {
+		zor->zor_obj_start = strtoull(range, &p, 0);
+		if (*p != '\0') {
+			*msg = "Invalid characters in object ID";
+			rc = 1;
+		}
+		zor->zor_obj_end = zor->zor_obj_start;
+		return (rc);
+	}
+
+	if (strchr(range, ':') == range) {
+		*msg = "Invalid leading colon";
+		rc = 1;
+		return (rc);
+	}
+
+	len = strlen(range);
+	if (range[len - 1] == ':') {
+		*msg = "Invalid trailing colon";
+		rc = 1;
+		return (rc);
+	}
+
+	/* strtok() modifies its argument, so tokenize a private copy. */
+	dup = strdup(range);
+	if (dup == NULL) {
+		*msg = "Out of memory";
+		rc = 1;
+		return (rc);
+	}
+
+	/*
+	 * The colon checks above guarantee at least two non-empty tokens,
+	 * so neither of the first two strtok() calls returns NULL.
+	 */
+	s = strtok(dup, ":");
+	zor->zor_obj_start = strtoull(s, &p, 0);
+
+	if (*p != '\0') {
+		*msg = "Invalid characters in start object ID";
+		rc = 1;
+		goto out;
+	}
+
+	s = strtok(NULL, ":");
+	zor->zor_obj_end = strtoull(s, &p, 0);
+
+	if (*p != '\0') {
+		*msg = "Invalid characters in end object ID";
+		rc = 1;
+		goto out;
+	}
+
+	if (zor->zor_obj_start > zor->zor_obj_end) {
+		*msg = "Start object ID may not exceed end object ID";
+		rc = 1;
+		goto out;
+	}
+
+	s = strtok(NULL, ":");
+	if (s == NULL) {
+		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
+		goto out;
+	} else if (strtok(NULL, ":") != NULL) {
+		*msg = "Invalid colon-delimited field after flags";
+		rc = 1;
+		goto out;
+	}
+
+	flagstr = s;
+	for (i = 0; flagstr[i]; i++) {
+		int bit;
+		boolean_t negation = (flagstr[i] == '-');
+
+		if (negation) {
+			/* The flag being negated is the next character. */
+			i++;
+			if (flagstr[i] == '\0') {
+				*msg = "Invalid trailing negation operator";
+				rc = 1;
+				goto out;
+			}
+		}
+		bit = flagbits[(uchar_t)flagstr[i]];
+		if (bit == 0) {
+			*msg = "Invalid flag";
+			rc = 1;
+			goto out;
+		}
+		if (negation)
+			flags &= ~bit;
+		else
+			flags |= bit;
+	}
+	zor->zor_flags = flags;
+
+out:
+	free(dup);
+	return (rc);
+}
+
+/*
+ * Print a one-line summary of an objset and then dump its contents.
+ *
+ * When object ranges were requested (zopt_object_args > 0) only the objects
+ * in those ranges are dumped.  Otherwise the intent log, dead/live lists and
+ * bookmarks are dumped as applicable and, at verbosity >= 2, every object in
+ * the objset followed by dnode slot usage statistics.
+ */
+static void
+dump_objset(objset_t *os)
+{
+	dmu_objset_stats_t dds = { 0 };
+	uint64_t object, object_count;
+	uint64_t refdbytes, usedobjs, scratch;
+	char numbuf[32];
+	char blkbuf[BP_SPRINTF_LEN + 20];
+	char osname[ZFS_MAX_DATASET_NAME_LEN];
+	const char *type = "UNKNOWN";
+	int verbosity = dump_opt['d'];
+	boolean_t print_header;
+	unsigned i;
+	int error;
+	uint64_t total_slots_used = 0;
+	uint64_t max_slot_used = 0;
+	uint64_t dnode_slots;
+	uint64_t obj_start;
+	uint64_t obj_end;
+	uint64_t flags;
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
+
+	/* dmu_objset_fast_stat() is called with the pool config entered */
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+
+	print_header = B_TRUE;
+
+	/* keep "UNKNOWN" when the type is outside the objset_types[] table */
+	if (dds.dds_type < DMU_OST_NUMTYPES)
+		type = objset_types[dds.dds_type];
+
+	/*
+	 * The meta objset takes its object count from the root block
+	 * pointer's fill count and its referenced bytes from the MOS
+	 * directory; every other objset asks dmu_objset_space().
+	 */
+	if (dds.dds_type == DMU_OST_META) {
+		dds.dds_creation_txg = TXG_INITIAL;
+		usedobjs = BP_GET_FILL(os->os_rootbp);
+		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+		    dd_used_bytes;
+	} else {
+		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
+	}
+
+	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
+
+	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
+
+	/* at verbosity >= 4 the summary line also carries the root bp */
+	if (verbosity >= 4) {
+		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
+		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
+		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
+	} else {
+		blkbuf[0] = '\0';
+	}
+
+	dmu_objset_name(os, osname);
+
+	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
+	    "%s, %llu objects%s%s\n",
+	    osname, type, (u_longlong_t)dmu_objset_id(os),
+	    (u_longlong_t)dds.dds_creation_txg,
+	    numbuf, (u_longlong_t)usedobjs, blkbuf,
+	    (dds.dds_inconsistent) ? " (inconsistent)" : "");
+
+	/* dump each requested object range, if any were given */
+	for (i = 0; i < zopt_object_args; i++) {
+		obj_start = zopt_object_ranges[i].zor_obj_start;
+		obj_end = zopt_object_ranges[i].zor_obj_end;
+		flags = zopt_object_ranges[i].zor_flags;
+
+		/*
+		 * Object 0 and single-object ranges are dumped directly.
+		 * Otherwise step back one so that the dmu_object_next()
+		 * walk below can report obj_start itself (presumably it
+		 * returns the first object after *object -- confirm).
+		 */
+		object = obj_start;
+		if (object == 0 || obj_start == obj_end)
+			dump_object(os, object, verbosity, &print_header, NULL,
+			    flags);
+		else
+			object--;
+
+		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
+		    object <= obj_end) {
+			dump_object(os, object, verbosity, &print_header, NULL,
+			    flags);
+		}
+	}
+
+	/* explicit object ranges replace the full dump below */
+	if (zopt_object_args > 0) {
+		(void) printf("\n");
+		return;
+	}
+
+	if (dump_opt['i'] != 0 || verbosity >= 2)
+		dump_intent_log(dmu_objset_zil(os));
+
+	if (dmu_objset_ds(os) != NULL) {
+		dsl_dataset_t *ds = dmu_objset_ds(os);
+		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
+		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+		    !dmu_objset_is_snapshot(os)) {
+			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
+			if (verify_dd_livelist(os) != 0)
+				fatal("livelist is incorrect");
+		}
+
+		if (dsl_dataset_remap_deadlist_exists(ds)) {
+			(void) printf("ds_remap_deadlist:\n");
+			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
+		}
+		count_ds_mos_objects(ds);
+	}
+
+	if (dmu_objset_ds(os) != NULL)
+		dump_bookmarks(os, verbosity);
+
+	if (verbosity < 2)
+		return;
+
+	if (BP_IS_HOLE(os->os_rootbp))
+		return;
+
+	/* object 0 and the accounting objects are dumped but not counted */
+	dump_object(os, 0, verbosity, &print_header, NULL, 0);
+	object_count = 0;
+	if (DMU_USERUSED_DNODE(os) != NULL &&
+	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
+		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
+		    NULL, 0);
+		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
+		    NULL, 0);
+	}
+
+	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
+	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
+		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
+		    &print_header, NULL, 0);
+
+	/*
+	 * Walk every object, counting objects and the dnode slots that
+	 * dump_object() reports for each one.  max_slot_used ends up as the
+	 * last slot of the last object visited.
+	 */
+	object = 0;
+	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+		dump_object(os, object, verbosity, &print_header, &dnode_slots,
+		    0);
+		object_count++;
+		total_slots_used += dnode_slots;
+		max_slot_used = object + dnode_slots - 1;
+	}
+
+	(void) printf("\n");
+
+	/*
+	 * NOTE(review): if the walk above visited no objects, max_slot_used
+	 * is still 0 and "Percent empty" is computed as 0.0 / 0.0.
+	 */
+	(void) printf("    Dnode slots:\n");
+	(void) printf("\tTotal used:    %10llu\n",
+	    (u_longlong_t)total_slots_used);
+	(void) printf("\tMax used:      %10llu\n",
+	    (u_longlong_t)max_slot_used);
+	(void) printf("\tPercent empty: %10lf\n",
+	    (double)(max_slot_used - total_slots_used)*100 /
+	    (double)max_slot_used);
+	(void) printf("\n");
+
+	/* the walk is expected to end with ESRCH; anything else is fatal */
+	if (error != ESRCH) {
+		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
+		abort();
+	}
+
+	ASSERT3U(object_count, ==, usedobjs);
+
+	/* report and reset the leak counter (maintained outside this block) */
+	if (leaked_objects != 0) {
+		(void) printf("%d potentially leaked objects detected\n",
+		    leaked_objects);
+		leaked_objects = 0;
+	}
+}
+
+/*
+ * Print the fields of an uberblock.  The optional header and footer strings
+ * (either may be NULL) are printed before and after the fields.  The MMP
+ * fields are shown only when MMP_VALID(ub), and the root block pointer only
+ * when dump_opt['u'] >= 4.
+ */
+static void
+dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
+{
+	time_t when = ub->ub_timestamp;
+
+	if (header != NULL)
+		(void) printf("%s", header);
+
+	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
+	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
+	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
+	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
+	/* the human-readable form is rendered via localtime() */
+	(void) printf("\ttimestamp = %llu UTC = %s",
+	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&when)));
+
+	(void) printf("\tmmp_magic = %016llx\n",
+	    (u_longlong_t)ub->ub_mmp_magic);
+
+	if (MMP_VALID(ub)) {
+		(void) printf("\tmmp_delay = %0llu\n",
+		    (u_longlong_t)ub->ub_mmp_delay);
+
+		/* each optional MMP field is gated by its own validity bit */
+		if (MMP_SEQ_VALID(ub)) {
+			(void) printf("\tmmp_seq = %u\n",
+			    (unsigned int) MMP_SEQ(ub));
+		}
+		if (MMP_FAIL_INT_VALID(ub)) {
+			(void) printf("\tmmp_fail = %u\n",
+			    (unsigned int) MMP_FAIL_INT(ub));
+		}
+		if (MMP_INTERVAL_VALID(ub)) {
+			(void) printf("\tmmp_write = %u\n",
+			    (unsigned int) MMP_INTERVAL(ub));
+		}
+
+		/* After MMP_* to make summarize_uberblock_mmp cleaner */
+		(void) printf("\tmmp_valid = %x\n",
+		    (unsigned int) ub->ub_mmp_config & 0xFF);
+	}
+
+	if (dump_opt['u'] >= 4) {
+		char bpstr[BP_SPRINTF_LEN];
+
+		snprintf_blkptr(bpstr, sizeof (bpstr), &ub->ub_rootbp);
+		(void) printf("\trootbp = %s\n", bpstr);
+	}
+
+	(void) printf("\tcheckpoint_txg = %llu\n",
+	    (u_longlong_t)ub->ub_checkpoint_txg);
+
+	if (footer != NULL)
+		(void) printf("%s", footer);
+}
+
+/*
+ * Print the pool configuration stored in the MOS config object.
+ *
+ * The packed nvlist's size is read from the config object's bonus buffer and
+ * handed to dump_packed_nvlist() together with the object itself.  If the
+ * bonus buffer cannot be held, an error is reported on stderr and nothing
+ * is dumped.
+ */
+static void
+dump_config(spa_t *spa)
+{
+	dmu_buf_t *db;
+	/*
+	 * The bonus buffer stores the size as a uint64_t and a pointer to
+	 * this variable is passed on, so keep the full 64-bit width instead
+	 * of narrowing to size_t (which is 32 bits on ILP32 platforms).
+	 */
+	uint64_t nvsize = 0;
+	int error;
+
+	error = dmu_bonus_hold(spa->spa_meta_objset,
+	    spa->spa_config_object, FTAG, &db);
+	if (error != 0) {
+		(void) fprintf(stderr,
+		    "dmu_bonus_hold(%llu) failed, errno %d\n",
+		    (u_longlong_t)spa->spa_config_object, error);
+		return;
+	}
+
+	/* copy the size out before dropping the hold on the buffer */
+	nvsize = *(uint64_t *)db->db_data;
+	dmu_buf_rele(db, FTAG);
+
+	(void) printf("\nMOS Configuration:\n");
+	dump_packed_nvlist(spa->spa_meta_objset,
+	    spa->spa_config_object, (void *)&nvsize, 1);
+}
+
+/*
+ * Read the file named by cachefile in full, unpack it as a packed nvlist
+ * and print it with dump_nvlist().  Any failure (open, stat, allocation,
+ * short read, unpack) prints a message and exits with status 1.
+ */
+static void
+dump_cachefile(const char *cachefile)
+{
+	struct stat64 st;
+	nvlist_t *nvl;
+	char *data;
+	int cfd;
+
+	cfd = open64(cachefile, O_RDONLY);
+	if (cfd < 0) {
+		(void) printf("cannot open '%s': %s\n", cachefile,
+		    strerror(errno));
+		exit(1);
+	}
+
+	if (fstat64(cfd, &st) != 0) {
+		(void) printf("failed to stat '%s': %s\n", cachefile,
+		    strerror(errno));
+		exit(1);
+	}
+
+	data = malloc(st.st_size);
+	if (data == NULL) {
+		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
+		    (u_longlong_t)st.st_size);
+		exit(1);
+	}
+
+	/* a single read() must return the whole file */
+	if (read(cfd, data, st.st_size) != st.st_size) {
+		(void) fprintf(stderr, "failed to read %llu bytes\n",
+		    (u_longlong_t)st.st_size);
+		exit(1);
+	}
+
+	(void) close(cfd);
+
+	if (nvlist_unpack(data, st.st_size, &nvl, 0) != 0) {
+		(void) fprintf(stderr, "failed to unpack nvlist\n");
+		exit(1);
+	}
+
+	/* the raw bytes are no longer needed once unpacked */
+	free(data);
+
+	dump_nvlist(nvl, 0);
+
+	nvlist_free(nvl);
+}
+
+/*
+ * ZFS label nvlist stats
+ *
+ * Accumulator filled in by collect_nvlist_stats() and reported by
+ * dump_nvlist_stats().
+ */
+typedef struct zdb_nvl_stats {
+	/* number of nvlists visited, including nested ones */
+	int		zns_list_count;
+	/* number of nvlists found in arrays named "children" */
+	int		zns_leaf_count;
+	/* largest XDR-encoded size among those "children" entries */
+	size_t		zns_leaf_largest;
+	/* sum of the XDR-encoded sizes of those "children" entries */
+	size_t		zns_leaf_total;
+	/* every string pair encountered */
+	nvlist_t	*zns_string;
+	/* every uint64 pair encountered */
+	nvlist_t	*zns_uint64;
+	/* every boolean pair encountered */
+	nvlist_t	*zns_boolean;
+} zdb_nvl_stats_t;
+
+/*
+ * Recursively walk nvl and accumulate statistics in stats: string, uint64
+ * and boolean pairs are copied into the per-type lists, nested nvlists and
+ * nvlist arrays are descended into, and the elements of arrays named
+ * "children" additionally contribute to the leaf counters.  Any other pair
+ * type is reported and skipped.
+ */
+static void
+collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
+{
+	nvpair_t *pair;
+
+	stats->zns_list_count++;
+
+	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(nvl, pair)) {
+		char *key = nvpair_name(pair);
+		nvlist_t *child, **children;
+		uint_t nchildren, c;
+
+		switch (nvpair_type(pair)) {
+		case DATA_TYPE_STRING:
+			fnvlist_add_string(stats->zns_string, key,
+			    fnvpair_value_string(pair));
+			break;
+
+		case DATA_TYPE_UINT64:
+			fnvlist_add_uint64(stats->zns_uint64, key,
+			    fnvpair_value_uint64(pair));
+			break;
+
+		case DATA_TYPE_BOOLEAN:
+			fnvlist_add_boolean(stats->zns_boolean, key);
+			break;
+
+		case DATA_TYPE_NVLIST:
+			if (nvpair_value_nvlist(pair, &child) != 0)
+				break;
+			collect_nvlist_stats(child, stats);
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY:
+			if (nvpair_value_nvlist_array(pair, &children,
+			    &nchildren) != 0)
+				break;
+
+			for (c = 0; c < nchildren; c++) {
+				size_t encoded;
+
+				collect_nvlist_stats(children[c], stats);
+
+				/* collect stats on leaf vdev */
+				if (strcmp(key, "children") != 0)
+					continue;
+
+				(void) nvlist_size(children[c], &encoded,
+				    NV_ENCODE_XDR);
+				stats->zns_leaf_total += encoded;
+				if (encoded > stats->zns_leaf_largest)
+					stats->zns_leaf_largest = encoded;
+				stats->zns_leaf_count++;
+			}
+			break;
+
+		default:
+			(void) printf("skip type %d!\n",
+			    (int)nvpair_type(pair));
+		}
+	}
+}
+
+/*
+ * Print one per-type line of the label nvlist statistics: the number of
+ * pairs in 'pairs' and their XDR-encoded size less the size of an empty
+ * nvlist ('noise'), also expressed as a percentage of 'total'.  Returns the
+ * adjusted size.
+ */
+static size_t
+dump_nvlist_type_stats(const char *label, nvlist_t *pairs, size_t noise,
+    size_t total)
+{
+	size_t bytes;
+
+	VERIFY0(nvlist_size(pairs, &bytes, NV_ENCODE_XDR));
+	bytes -= noise;
+	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", label,
+	    (int)fnvlist_num_pairs(pairs),
+	    (int)bytes, 100.0 * bytes / total);
+
+	return (bytes);
+}
+
+/*
+ * Print space usage statistics for a label config nvlist: how much of the
+ * 'cap' bytes available it occupies, a breakdown by pair type, and a summary
+ * of the leaf vdev entries found by collect_nvlist_stats().
+ */
+static void
+dump_nvlist_stats(nvlist_t *nvl, size_t cap)
+{
+	zdb_nvl_stats_t stats = { 0 };
+	size_t accounted = 0;
+	size_t empty_size, total, overhead;
+
+	/* requires nvlist with non-unique names for stat collection */
+	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
+	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
+	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
+
+	/* encoded size of a still-empty nvlist, subtracted from each type */
+	VERIFY0(nvlist_size(stats.zns_boolean, &empty_size, NV_ENCODE_XDR));
+
+	(void) printf("\n\nZFS Label NVList Config Stats:\n");
+
+	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
+	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
+	    (int)total, (int)(cap - total), 100.0 * total / cap);
+
+	collect_nvlist_stats(nvl, &stats);
+
+	accounted += dump_nvlist_type_stats("integers:", stats.zns_uint64,
+	    empty_size, total);
+	accounted += dump_nvlist_type_stats("strings:", stats.zns_string,
+	    empty_size, total);
+	accounted += dump_nvlist_type_stats("booleans:", stats.zns_boolean,
+	    empty_size, total);
+
+	/* treat remainder as nvlist overhead */
+	overhead = total - accounted;
+	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
+	    stats.zns_list_count, (int)overhead, 100.0 * overhead / total);
+
+	if (stats.zns_leaf_count > 0) {
+		size_t leaf_avg = stats.zns_leaf_total / stats.zns_leaf_count;
+
+		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
+		    stats.zns_leaf_count, (int)leaf_avg);
+		(void) printf("%24d bytes largest\n",
+		    (int)stats.zns_leaf_largest);
+
+		/* estimate how many more average-sized leaves would fit */
+		if (dump_opt['l'] >= 3 && leaf_avg > 0) {
+			(void) printf("  space for %d additional leaf vdevs\n",
+			    (int)((cap - total) / leaf_avg));
+		}
+	}
+	(void) printf("\n");
+
+	nvlist_free(stats.zns_string);
+	nvlist_free(stats.zns_uint64);
+	nvlist_free(stats.zns_boolean);
+}
+
+/*
+ * AVL tree node keyed by checksum (see cksum_record_compare()), recording
+ * which labels contained data with that checksum.
+ */
+typedef struct cksum_record {
+	zio_cksum_t cksum;		/* sort key */
+	boolean_t labels[VDEV_LABELS];	/* B_TRUE for each label seen */
+	avl_node_t link;		/* AVL linkage */
+} cksum_record_t;
+
+/*
+ * AVL comparator for cksum_record_t: order records by comparing the words
+ * of their checksums in index order, returning the TREE_CMP() result of the
+ * first pair of words that differ, or 0 if all words match.
+ */
+static int
+cksum_record_compare(const void *x1, const void *x2)
+{
+	const cksum_record_t *lhs = (cksum_record_t *)x1;
+	const cksum_record_t *rhs = (cksum_record_t *)x2;
+	int nwords = ARRAY_SIZE(lhs->cksum.zc_word);
+	int w;
+
+	for (w = 0; w < nwords; w++) {
+		int cmp = TREE_CMP(lhs->cksum.zc_word[w],
+		    rhs->cksum.zc_word[w]);
+
+		if (cmp != 0)
+			return (cmp);
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate a zeroed cksum_record_t for the given checksum and mark label
+ * 'l' as having it.  The caller owns the returned record.
+ */
+static cksum_record_t *
+cksum_record_alloc(zio_cksum_t *cksum, int l)
+{
+	cksum_record_t *fresh = umem_zalloc(sizeof (*fresh), UMEM_NOFAIL);
+
+	fresh->labels[l] = B_TRUE;
+	fresh->cksum = *cksum;
+
+	return (fresh);
+}
+
+/*
+ * Look up the record for 'cksum' in 'tree'; returns whatever avl_find()
+ * returns for a search key carrying that checksum.
+ */
+static cksum_record_t *
+cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
+{
+	/* only the checksum matters to cksum_record_compare() */
+	cksum_record_t key = { .cksum = *cksum };
+	avl_index_t unused;
+	cksum_record_t *found;
+
+	found = avl_find(tree, &key, &unused);
+
+	return (found);
+}
+
+/*
+ * Note that label 'l' carries data with checksum 'cksum': mark the label in
+ * the existing record for that checksum, or add a new record to the tree if
+ * there is none yet.  Returns the record.
+ */
+static cksum_record_t *
+cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
+{
+	cksum_record_t *existing = cksum_record_lookup(tree, cksum);
+	cksum_record_t *fresh;
+
+	if (existing != NULL) {
+		existing->labels[l] = B_TRUE;
+		return (existing);
+	}
+
+	fresh = cksum_record_alloc(cksum, l);
+	avl_add(tree, fresh);
+
+	return (fresh);
+}
+
+/*
+ * Return the lowest label index set in rec->labels[], or -1 if none is set.
+ */
+static int
+first_label(cksum_record_t *rec)
+{
+	int idx = 0;
+
+	while (idx < VDEV_LABELS) {
+		if (rec->labels[idx])
+			return (idx);
+		idx++;
+	}
+
+	return (-1);
+}
+
+/*
+ * Print 'prefix' followed by the space-separated indices of every label
+ * marked B_TRUE in rec->labels[], then a newline.
+ */
+static void
+print_label_numbers(char *prefix, cksum_record_t *rec)
+{
+	int idx;
+
+	printf("%s", prefix);
+
+	for (idx = 0; idx < VDEV_LABELS; idx++) {
+		if (rec->labels[idx] != B_TRUE)
+			continue;
+		printf("%d ", idx);
+	}
+
+	printf("\n");
+}
+
+/* number of uberblocks[] entries kept per label in zdb_label_t */
+#define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
+
+/*
+ * Per-label state used by dump_label().  'label' must stay the first member:
+ * dump_label() computes uberblock addresses from the zdb_label_t pointer.
+ */
+typedef struct zdb_label {
+	vdev_label_t label;		/* raw label as read from the device */
+	nvlist_t *config_nv;		/* unpacked config, NULL if none */
+	cksum_record_t *config;		/* config_tree record for the config */
+	/* uberblock_tree record per slot; NULL where verification failed */
+	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
+	boolean_t header_printed;	/* set by print_label_header() */
+	boolean_t read_failed;		/* reading the label failed */
+} zdb_label_t;
+
+/*
+ * Print the "LABEL <l>" banner for a label once; later calls for the same
+ * label, and all calls when dump_opt['q'] is set, print nothing.
+ */
+static void
+print_label_header(zdb_label_t *label, int l)
+{
+	if (dump_opt['q'] || label->header_printed == B_TRUE)
+		return;
+
+	/* remember that the banner has been emitted for this label */
+	label->header_printed = B_TRUE;
+
+	(void) printf("------------------------------------\n");
+	(void) printf("LABEL %d\n", l);
+	(void) printf("------------------------------------\n");
+}
+
+/*
+ * Print the banner that introduces the L2ARC device header section.
+ */
+static void
+print_l2arc_header(void)
+{
+	const char *rule = "------------------------------------\n";
+
+	(void) printf("%s", rule);
+	(void) printf("L2ARC device header\n");
+	(void) printf("%s", rule);
+}
+
+/*
+ * Print the banner that introduces the L2ARC device log blocks section.
+ */
+static void
+print_l2arc_log_blocks(void)
+{
+	const char *rule = "------------------------------------\n";
+
+	(void) printf("%s", rule);
+	(void) printf("L2ARC device log blocks\n");
+	(void) printf("%s", rule);
+}
+
+/*
+ * Print the first 'log_entries' entries of the log entry array 'le', which
+ * belongs to log block number 'i'.  Entries are numbered from 1 in the
+ * output.
+ *
+ * The loop index is a uint64_t so that it is compared against the unsigned
+ * 64-bit entry count without a signed/unsigned mismatch or overflow.
+ */
+static void
+dump_l2arc_log_entries(uint64_t log_entries,
+    l2arc_log_ent_phys_t *le, uint64_t i)
+{
+	for (uint64_t j = 0; j < log_entries; j++) {
+		dva_t dva = le[j].le_dva;
+		(void) printf("lb[%4llu]\tle[%4llu]\tDVA asize: %llu, "
+		    "vdev: %llu, offset: %llu\n",
+		    (u_longlong_t)i, (u_longlong_t)(j + 1),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva),
+		    (u_longlong_t)DVA_GET_VDEV(&dva),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva));
+		(void) printf("|\t\t\t\tbirth: %llu\n",
+		    (u_longlong_t)le[j].le_birth);
+		/* the following fields are decoded from the packed le_prop */
+		(void) printf("|\t\t\t\tlsize: %llu\n",
+		    (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tpsize: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tcompr: %llu\n",
+		    (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tcomplevel: %llu\n",
+		    (u_longlong_t)(&le[j])->le_complevel);
+		(void) printf("|\t\t\t\ttype: %llu\n",
+		    (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tprotected: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
+		(void) printf("|\t\t\t\tprefetch: %llu\n",
+		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
+		(void) printf("|\t\t\t\taddress: %llu\n",
+		    (u_longlong_t)le[j].le_daddr);
+		(void) printf("|\n");
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print the fields of an L2ARC log block pointer.  Note that the value
+ * labelled "asize" is the one extracted with L2BLK_GET_PSIZE().
+ */
+static void
+dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
+{
+	const l2arc_log_blkptr_t *lbp = &lbps;
+
+	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbp->lbp_daddr);
+	(void) printf("|\t\tpayload_asize: %llu\n",
+	    (u_longlong_t)lbp->lbp_payload_asize);
+	(void) printf("|\t\tpayload_start: %llu\n",
+	    (u_longlong_t)lbp->lbp_payload_start);
+
+	/* the remaining fields are decoded from the packed lbp_prop */
+	(void) printf("|\t\tlsize: %llu\n",
+	    (u_longlong_t)L2BLK_GET_LSIZE(lbp->lbp_prop));
+	(void) printf("|\t\tasize: %llu\n",
+	    (u_longlong_t)L2BLK_GET_PSIZE(lbp->lbp_prop));
+	(void) printf("|\t\tcompralgo: %llu\n",
+	    (u_longlong_t)L2BLK_GET_COMPRESS(lbp->lbp_prop));
+	(void) printf("|\t\tcksumalgo: %llu\n",
+	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbp->lbp_prop));
+	(void) printf("|\n\n");
+}
+
+/*
+ * Walk the chain of L2ARC log blocks on the device open at 'fd', starting
+ * from the log block pointers in the device header 'l2dhdr'.  Each block is
+ * read, checksummed, decompressed if necessary and validated by magic; the
+ * count and total aligned size of the accepted blocks are accumulated in
+ * rebuild->dh_lb_count and rebuild->dh_lb_asize.  Blocks and their entries
+ * are printed according to dump_opt['l'] unless dump_opt['q'] is set.
+ *
+ * The walk stops at the first pointer that fails validation, read error,
+ * checksum mismatch, decompression failure or bad magic.
+ */
+static void
+dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr,
+    l2arc_dev_hdr_phys_t *rebuild)
+{
+	l2arc_log_blk_phys_t this_lb;
+	uint64_t asize;
+	uint64_t compress;
+	l2arc_log_blkptr_t lbps[2];
+	abd_t *abd;
+	zio_cksum_t cksum;
+	int failed = 0;
+	int err;
+	l2arc_dev_t dev;
+
+	if (!dump_opt['q'])
+		print_l2arc_log_blocks();
+	bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));
+
+	/* only the fields needed for pointer validation are filled in */
+	dev.l2ad_evict = l2dhdr.dh_evict;
+	dev.l2ad_start = l2dhdr.dh_start;
+	dev.l2ad_end = l2dhdr.dh_end;
+
+	if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
+		/* no log blocks to read */
+		if (!dump_opt['q']) {
+			(void) printf("No log blocks to read\n");
+			(void) printf("\n");
+		}
+		return;
+	} else {
+		dev.l2ad_hand = lbps[0].lbp_daddr +
+		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+	}
+
+	dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+	for (;;) {
+		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
+			break;
+
+		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
+		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
+			if (!dump_opt['q']) {
+				(void) printf("Error while reading next log "
+				    "block\n\n");
+			}
+			break;
+		}
+
+		/* the checksum covers the block as stored on the device */
+		fletcher_4_native_varsize(&this_lb, asize, &cksum);
+		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
+			failed++;
+			if (!dump_opt['q']) {
+				(void) printf("Invalid cksum\n");
+				dump_l2arc_log_blkptr(lbps[0]);
+			}
+			break;
+		}
+
+		/*
+		 * Decompress in place via a temporary copy of the on-disk
+		 * bytes.  A failed decompression leaves this_lb undefined,
+		 * so report it and stop instead of parsing the buffer.
+		 */
+		compress = L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop);
+		if (compress != ZIO_COMPRESS_OFF) {
+			abd = abd_alloc_for_io(asize, B_TRUE);
+			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
+			err = zio_decompress_data(compress, abd, &this_lb,
+			    asize, sizeof (this_lb), NULL);
+			abd_free(abd);
+			if (err != 0) {
+				if (!dump_opt['q']) {
+					(void) printf("L2ARC block "
+					    "decompression failed\n\n");
+				}
+				break;
+			}
+		}
+
+		/* accept blocks written with the opposite byte order */
+		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+			byteswap_uint64_array(&this_lb, sizeof (this_lb));
+		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
+			if (!dump_opt['q'])
+				(void) printf("Invalid log block magic\n\n");
+			break;
+		}
+
+		rebuild->dh_lb_count++;
+		rebuild->dh_lb_asize += asize;
+		if (dump_opt['l'] > 1 && !dump_opt['q']) {
+			(void) printf("lb[%4llu]\tmagic: %llu\n",
+			    (u_longlong_t)rebuild->dh_lb_count,
+			    (u_longlong_t)this_lb.lb_magic);
+			dump_l2arc_log_blkptr(lbps[0]);
+		}
+
+		if (dump_opt['l'] > 2 && !dump_opt['q'])
+			dump_l2arc_log_entries(l2dhdr.dh_log_entries,
+			    this_lb.lb_entries,
+			    rebuild->dh_lb_count);
+
+		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
+		    !dev.l2ad_first)
+			break;
+
+		/* advance: the block just read names the one before lbps[1] */
+		lbps[0] = lbps[1];
+		lbps[1] = this_lb.lb_prev_lbp;
+	}
+
+	if (!dump_opt['q']) {
+		(void) printf("log_blk_count:\t %llu with valid cksum\n",
+		    (u_longlong_t)rebuild->dh_lb_count);
+		(void) printf("\t\t %d with invalid cksum\n", failed);
+		(void) printf("log_blk_asize:\t %llu\n\n",
+		    (u_longlong_t)rebuild->dh_lb_asize);
+	}
+}
+
+/*
+ * Read the L2ARC device header located at VDEV_LABEL_START_SIZE on 'fd',
+ * print it (unless dump_opt['q']) and walk the log blocks it points to.
+ *
+ * Returns 0 when no valid header is found or when the header's log block
+ * accounting does not exceed what the walk found, and 1 when the header
+ * claims more log blocks (by count or size) than were actually found.
+ */
+static int
+dump_l2arc_header(int fd)
+{
+	l2arc_dev_hdr_phys_t l2dhdr, rebuild;
+	boolean_t hdr_ok = B_FALSE;
+
+	bzero(&l2dhdr, sizeof (l2dhdr));
+	bzero(&rebuild, sizeof (rebuild));
+
+	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
+	    VDEV_LABEL_START_SIZE) == sizeof (l2dhdr)) {
+		/* accept a header written with the opposite byte order */
+		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
+
+		if (l2dhdr.dh_magic == L2ARC_DEV_HDR_MAGIC)
+			hdr_ok = B_TRUE;
+	}
+
+	if (!hdr_ok) {
+		(void) printf("L2ARC device header not found\n\n");
+		/* Do not return an error here for backward compatibility */
+		return (0);
+	}
+
+	if (!dump_opt['q']) {
+		print_l2arc_header();
+
+		(void) printf("    magic: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_magic);
+		(void) printf("    version: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_version);
+		(void) printf("    pool_guid: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_spa_guid);
+		(void) printf("    flags: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_flags);
+		(void) printf("    start_lbps[0]: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_start_lbps[0].lbp_daddr);
+		(void) printf("    start_lbps[1]: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_start_lbps[1].lbp_daddr);
+		(void) printf("    log_blk_ent: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_log_entries);
+		(void) printf("    start: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_start);
+		(void) printf("    end: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_end);
+		(void) printf("    evict: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_evict);
+		(void) printf("    lb_asize_refcount: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_lb_asize);
+		(void) printf("    lb_count_refcount: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_lb_count);
+		(void) printf("    trim_action_time: %llu\n",
+		    (u_longlong_t)l2dhdr.dh_trim_action_time);
+		(void) printf("    trim_state: %llu\n\n",
+		    (u_longlong_t)l2dhdr.dh_trim_state);
+	}
+
+	dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
+
+	/*
+	 * The total aligned size of log blocks and the number of log blocks
+	 * reported in the header of the device may be less than what zdb
+	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
+	 * This happens because dump_l2arc_log_blocks() lacks the memory
+	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
+	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
+	 * and dh_lb_count will be lower to begin with than what exists on the
+	 * device. This is normal and zdb should not exit with an error. The
+	 * opposite case should never happen though, the values reported in the
+	 * header should never be higher than what dump_l2arc_log_blocks() and
+	 * l2arc_rebuild() report. If this happens there is a leak in the
+	 * accounting of log blocks.
+	 */
+	return ((l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
+	    l2dhdr.dh_lb_count > rebuild.dh_lb_count) ? 1 : 0);
+}
+
+/*
+ * Print the unpacked config of label 'l' together with the list of labels
+ * sharing the same config checksum.  Nothing is printed when dump_opt['q']
+ * is set or, below dump_opt['l'] == 3, when a lower-numbered label already
+ * carries the same config.  With dump_opt['l'] >= 2 the nvlist statistics
+ * are printed as well, using 'buflen' as the available capacity.
+ */
+static void
+dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
+{
+	boolean_t show_stats = (dump_opt['l'] >= 2);
+
+	if (dump_opt['q'] ||
+	    ((dump_opt['l'] < 3) && (first_label(label->config) != l)))
+		return;
+
+	print_label_header(label, l);
+	dump_nvlist(label->config_nv, 4);
+	print_label_numbers("    labels = ", label->config);
+
+	if (show_stats)
+		dump_nvlist_stats(label->config_nv, buflen);
+}
+
+/* size of the buffer holding the "Uberblock[N]" heading */
+#define	ZDB_MAX_UB_HEADER_SIZE 32
+
+/*
+ * Print the uberblocks of one label.  Slots without a checksum record are
+ * reported as invalid at dump_opt['u'] >= 2.  Below dump_opt['u'] == 3 an
+ * uberblock is shown only for the first label that carries it, and below
+ * dump_opt['u'] == 4 MMP uberblocks in the last MMP_BLOCKS_PER_LABEL slots
+ * are skipped.
+ */
+static void
+dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
+{
+	char title[ZDB_MAX_UB_HEADER_SIZE];
+	vdev_t fake_vd;
+	int slot;
+
+	/* a stand-in vdev carrying just what the VDEV_UBERBLOCK_* macros get */
+	fake_vd.vdev_ashift = ashift;
+	fake_vd.vdev_top = &fake_vd;
+
+	for (slot = 0; slot < VDEV_UBERBLOCK_COUNT(&fake_vd); slot++) {
+		cksum_record_t *rec = label->uberblocks[slot];
+		uberblock_t *ub;
+
+		if (rec == NULL) {
+			if (dump_opt['u'] >= 2) {
+				print_label_header(label, label_num);
+				(void) printf("    Uberblock[%d] invalid\n",
+				    slot);
+			}
+			continue;
+		}
+
+		if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
+			continue;
+
+		ub = (void *)((char *)&label->label +
+		    VDEV_UBERBLOCK_OFFSET(&fake_vd, slot));
+
+		if ((dump_opt['u'] < 4) &&
+		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
+		    (slot >= VDEV_UBERBLOCK_COUNT(&fake_vd) -
+		    MMP_BLOCKS_PER_LABEL))
+			continue;
+
+		print_label_header(label, label_num);
+		(void) snprintf(title, sizeof (title),
+		    "    Uberblock[%d]\n", slot);
+		dump_uberblock(ub, title, "");
+		print_label_numbers("        labels = ", rec);
+	}
+}
+
+/* path resolved so far, used in messages printed by dump_path_impl() */
+static char curpath[PATH_MAX];
+
+/*
+ * Iterate through the path components, recursively passing
+ * current one's obj and remaining path until we find the obj
+ * for the last one.
+ *
+ * 'name' is modified in place: the first '/' is overwritten with a NUL to
+ * isolate the leading component, which is looked up in directory 'obj'.
+ * When the final component is reached the object is dumped.
+ *
+ * Returns 0 on success, the zap_lookup() error if the component is not
+ * found, or EINVAL for any other failure.
+ */
+static int
+dump_path_impl(objset_t *os, uint64_t obj, char *name)
+{
+	int err;
+	boolean_t header = B_TRUE;
+	uint64_t child_obj;
+	char *s;
+	dmu_buf_t *db;
+	dmu_object_info_t doi;
+
+	/* split off the first path component */
+	if ((s = strchr(name, '/')) != NULL)
+		*s = '\0';
+	err = zap_lookup(os, obj, name, 8, 1, &child_obj);
+
+	/* extend curpath before reporting so the message names this entry */
+	(void) strlcat(curpath, name, sizeof (curpath));
+
+	if (err != 0) {
+		(void) fprintf(stderr, "failed to lookup %s: %s\n",
+		    curpath, strerror(err));
+		return (err);
+	}
+
+	/* extract the object number from the directory entry value */
+	child_obj = ZFS_DIRENT_OBJ(child_obj);
+	err = sa_buf_hold(os, child_obj, FTAG, &db);
+	if (err != 0) {
+		(void) fprintf(stderr,
+		    "failed to get SA dbuf for obj %llu: %s\n",
+		    (u_longlong_t)child_obj, strerror(err));
+		return (EINVAL);
+	}
+	dmu_object_info_from_db(db, &doi);
+	sa_buf_rele(db, FTAG);
+
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    doi.doi_bonus_type != DMU_OT_ZNODE) {
+		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
+		    doi.doi_bonus_type, (u_longlong_t)child_obj);
+		return (EINVAL);
+	}
+
+	if (dump_opt['v'] > 6) {
+		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
+		    (u_longlong_t)child_obj, curpath, doi.doi_type,
+		    doi.doi_bonus_type);
+	}
+
+	(void) strlcat(curpath, "/", sizeof (curpath));
+
+	switch (doi.doi_type) {
+	case DMU_OT_DIRECTORY_CONTENTS:
+		/* descend only if something follows the '/' */
+		if (s != NULL && *(s + 1) != '\0')
+			return (dump_path_impl(os, child_obj, s + 1));
+		/*FALLTHROUGH*/
+	case DMU_OT_PLAIN_FILE_CONTENTS:
+		dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0);
+		return (0);
+	default:
+		/* doi describes child_obj, so that is the object to name */
+		(void) fprintf(stderr, "object %llu has non-file/directory "
+		    "type %d\n", (u_longlong_t)child_obj, doi.doi_type);
+		break;
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Dump the blocks for the object specified by path inside the dataset.
+ *
+ * Returns the open_objset() error if the dataset cannot be opened, EINVAL
+ * if the root znode cannot be looked up, and otherwise whatever
+ * dump_path_impl() returns.
+ */
+static int
+dump_path(char *ds, char *path)
+{
+	objset_t *os;
+	uint64_t root_obj;
+	int rc;
+
+	rc = open_objset(ds, FTAG, &os);
+	if (rc != 0)
+		return (rc);
+
+	rc = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
+	if (rc == 0) {
+		/* seed the path shown in dump_path_impl()'s messages */
+		(void) snprintf(curpath, sizeof (curpath),
+		    "dataset=%s path=/", ds);
+		rc = dump_path_impl(os, root_obj, path);
+	} else {
+		(void) fprintf(stderr, "can't lookup root znode: %s\n",
+		    strerror(rc));
+		rc = EINVAL;
+	}
+
+	close_objset(os, FTAG);
+	return (rc);
+}
+
+/*
+ * Read the VDEV_LABELS labels of the device 'dev' and print their configs
+ * and (with dump_opt['u']) uberblocks, grouping identical copies by
+ * checksum so each distinct config/uberblock is listed with the labels
+ * that carry it.  For cache devices the L2ARC header is dumped as well.
+ *
+ * Returns 2 if no label config could be unpacked, 1 if any error was
+ * noted, and 0 otherwise.  Failure to open or stat the device exits.
+ */
+static int
+dump_label(const char *dev)
+{
+	char path[MAXPATHLEN];
+	zdb_label_t labels[VDEV_LABELS];
+	uint64_t psize, ashift, l2cache;
+	struct stat64 statbuf;
+	boolean_t config_found = B_FALSE;
+	boolean_t error = B_FALSE;
+	boolean_t read_l2arc_header = B_FALSE;
+	avl_tree_t config_tree;
+	avl_tree_t uberblock_tree;
+	void *node, *cookie;
+	int fd;
+
+	bzero(labels, sizeof (labels));
+
+	/*
+	 * Check if we were given absolute path and use it as is.
+	 * Otherwise if the provided vdev name doesn't point to a file,
+	 * try prepending expected disk paths and partition numbers.
+	 */
+	(void) strlcpy(path, dev, sizeof (path));
+	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
+		/* NOTE(review): shadows the function-level 'error' flag */
+		int error;
+
+		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
+		if (error == 0 && zfs_dev_is_whole_disk(path)) {
+			if (zfs_append_partition(path, MAXPATHLEN) == -1)
+				error = ENOENT;
+		}
+
+		if (error || (stat64(path, &statbuf) != 0)) {
+			(void) printf("failed to find device %s, try "
+			    "specifying absolute path instead\n", dev);
+			return (1);
+		}
+	}
+
+	if ((fd = open64(path, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
+		exit(1);
+	}
+
+	if (fstat64_blk(fd, &statbuf) != 0) {
+		(void) printf("failed to stat '%s': %s\n", path,
+		    strerror(errno));
+		(void) close(fd);
+		exit(1);
+	}
+
+	/* a failed flush is reported but does not stop the dump */
+	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
+		(void) printf("failed to invalidate cache '%s' : %s\n", path,
+		    strerror(errno));
+
+	avl_create(&config_tree, cksum_record_compare,
+	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+	avl_create(&uberblock_tree, cksum_record_compare,
+	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+
+	/* round the device size down to a multiple of the label size */
+	psize = statbuf.st_size;
+	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+	ashift = SPA_MINBLOCKSHIFT;
+
+	/*
+	 * 1. Read the label from disk
+	 * 2. Unpack the configuration and insert in config tree.
+	 * 3. Traverse all uberblocks and insert in uberblock tree.
+	 */
+	for (int l = 0; l < VDEV_LABELS; l++) {
+		zdb_label_t *label = &labels[l];
+		char *buf = label->label.vl_vdev_phys.vp_nvlist;
+		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+		nvlist_t *config;
+		cksum_record_t *rec;
+		zio_cksum_t cksum;
+		vdev_t vd;
+
+		if (pread64(fd, &label->label, sizeof (label->label),
+		    vdev_label_offset(psize, l, 0)) != sizeof (label->label)) {
+			if (!dump_opt['q'])
+				(void) printf("failed to read label %d\n", l);
+			label->read_failed = B_TRUE;
+			error = B_TRUE;
+			continue;
+		}
+
+		label->read_failed = B_FALSE;
+
+		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
+			nvlist_t *vdev_tree = NULL;
+			size_t size;
+
+			/* fall back to the minimum ashift if none is stored */
+			if ((nvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+			    (nvlist_lookup_uint64(vdev_tree,
+			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+				ashift = SPA_MINBLOCKSHIFT;
+
+			/* checksum only the bytes the packed config occupies */
+			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
+				size = buflen;
+
+			/*
+			 * If the device is a cache device, remember to dump
+			 * the L2ARC header once the labels have been printed.
+			 */
+			if (!read_l2arc_header) {
+				if (nvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
+				    l2cache == POOL_STATE_L2CACHE) {
+					read_l2arc_header = B_TRUE;
+				}
+			}
+
+			fletcher_4_native_varsize(buf, size, &cksum);
+			rec = cksum_record_insert(&config_tree, &cksum, l);
+
+			label->config = rec;
+			label->config_nv = config;
+			config_found = B_TRUE;
+		} else {
+			error = B_TRUE;
+		}
+
+		/* stand-in vdev for the VDEV_UBERBLOCK_* macros */
+		vd.vdev_ashift = ashift;
+		vd.vdev_top = &vd;
+
+		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
+			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
+			/*
+			 * 'label' points at a zdb_label_t whose first member
+			 * is the raw vdev_label_t, so the offset applies to
+			 * the raw label.
+			 */
+			uberblock_t *ub = (void *)((char *)label + uoff);
+
+			/* slots that fail verification stay NULL */
+			if (uberblock_verify(ub))
+				continue;
+
+			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
+			rec = cksum_record_insert(&uberblock_tree, &cksum, l);
+
+			label->uberblocks[i] = rec;
+		}
+	}
+
+	/*
+	 * Dump the label and uberblocks.
+	 */
+	for (int l = 0; l < VDEV_LABELS; l++) {
+		zdb_label_t *label = &labels[l];
+		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+
+		if (label->read_failed == B_TRUE)
+			continue;
+
+		if (label->config_nv) {
+			dump_config_from_label(label, buflen, l);
+		} else {
+			if (!dump_opt['q'])
+				(void) printf("failed to unpack label %d\n", l);
+		}
+
+		/*
+		 * NOTE(review): 'ashift' here is whatever the read loop
+		 * above left behind for the last label it processed.
+		 */
+		if (dump_opt['u'])
+			dump_label_uberblocks(label, ashift, l);
+
+		nvlist_free(label->config_nv);
+	}
+
+	/*
+	 * Dump the L2ARC header, if existent.
+	 */
+	if (read_l2arc_header)
+		error |= dump_l2arc_header(fd);
+
+	/* free every checksum record before destroying the trees */
+	cookie = NULL;
+	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
+		umem_free(node, sizeof (cksum_record_t));
+
+	cookie = NULL;
+	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
+		umem_free(node, sizeof (cksum_record_t));
+
+	avl_destroy(&config_tree);
+	avl_destroy(&uberblock_tree);
+
+	(void) close(fd);
+
+	return (config_found == B_FALSE ? 2 :
+	    (error == B_TRUE ? 1 : 0));
+}
+
+/* feature usage tallies accumulated by dump_one_objset() */
+static uint64_t dataset_feature_count[SPA_FEATURES];
+static uint64_t global_feature_count[SPA_FEATURES];
+static uint64_t remap_deadlist_count = 0;
+
+/*
+ * Open the dataset 'dsname', tally the features it has active (per-dataset
+ * features, remap deadlist, redaction/written bookmarks, livelist), dump
+ * it with dump_objset() and close it again.  Always returns 0; a dataset
+ * that cannot be opened is skipped.
+ */
+/*ARGSUSED*/
+static int
+dump_one_objset(const char *dsname, void *arg)
+{
+	objset_t *os;
+	dsl_dataset_t *ds;
+	avl_tree_t *bookmarks;
+	dsl_bookmark_node_t *dbn;
+	spa_feature_t f;
+
+	if (open_objset(dsname, FTAG, &os) != 0)
+		return (0);
+
+	ds = dmu_objset_ds(os);
+
+	for (f = 0; f < SPA_FEATURES; f++) {
+		if (dsl_dataset_feature_is_active(ds, f)) {
+			ASSERT(spa_feature_table[f].fi_flags &
+			    ZFEATURE_FLAG_PER_DATASET);
+			dataset_feature_count[f]++;
+		}
+	}
+
+	if (dsl_dataset_remap_deadlist_exists(ds))
+		remap_deadlist_count++;
+
+	/* account for each bookmark's redaction object and feature use */
+	bookmarks = &ds->ds_bookmarks;
+	for (dbn = avl_first(bookmarks); dbn != NULL;
+	    dbn = AVL_NEXT(bookmarks, dbn)) {
+		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
+		if (dbn->dbn_phys.zbm_redaction_obj != 0)
+			global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
+		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
+			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
+	}
+
+	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+	    !dmu_objset_is_snapshot(os))
+		global_feature_count[SPA_FEATURE_LIVELIST]++;
+
+	dump_objset(os);
+	close_objset(os, FTAG);
+	fuid_table_destroy();
+
+	return (0);
+}
+
+/*
+ * Block statistics.
+ *
+ * NOTE(review): the code that fills these counters is outside this chunk;
+ * the field notes below are inferred from the names and should be
+ * confirmed against it.
+ */
+/* number of zb_psize_histogram[] buckets */
+#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
+typedef struct zdb_blkstats {
+	uint64_t zb_asize;		/* presumably total allocated size */
+	uint64_t zb_lsize;		/* presumably total logical size */
+	uint64_t zb_psize;		/* presumably total physical size */
+	uint64_t zb_count;		/* presumably number of blocks */
+	uint64_t zb_gangs;		/* presumably number of gang blocks */
+	uint64_t zb_ditto_samevdev;	/* presumably copies on the same vdev */
+	uint64_t zb_ditto_same_ms;	/* presumably copies in one metaslab */
+	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ * They are numbered consecutively starting at DMU_OT_NUMTYPES.
+ */
+#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
+#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
+#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
+#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
+
+/*
+ * Display names for the extended types, listed in the same order as the
+ * ZDB_OT_* values above (presumably indexed by type - DMU_OT_NUMTYPES;
+ * confirm at the point of use).
+ */
+static const char *zdb_ot_extname[] = {
+	"deferred free",
+	"dedup ditto",
+	"other",
+	"Total",
+};
+
+/* index of the extra first-dimension slot in zcb_type[][] */
+#define	ZB_TOTAL	DN_MAX_LEVELS
+/* number of buckets in the zcb_*size_count[] / zcb_*size_len[] arrays */
+#define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)
+
+/*
+ * State accumulated while zdb gathers block statistics.
+ *
+ * NOTE(review): the code that fills this structure is outside this chunk;
+ * comments below are limited to what the declarations show, with inferred
+ * meanings marked as such.
+ */
+typedef struct zdb_cb {
+	/*
+	 * One zdb_blkstats_t per [0..ZB_TOTAL][0..ZDB_OT_TOTAL] pair
+	 * (presumably block level by object type, with totals last).
+	 */
+	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+	uint64_t	zcb_removing_size;
+	uint64_t	zcb_checkpoint_size;
+	uint64_t	zcb_dedup_asize;
+	uint64_t	zcb_dedup_blocks;
+	/* per-bucket counts and lengths for psize, lsize and asize */
+	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
+	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
+	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
+	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
+	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
+	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
+	uint64_t	zcb_psize_total;
+	uint64_t	zcb_lsize_total;
+	uint64_t	zcb_asize_total;
+	/* one entry (and one histogram row) per embedded block pointer type */
+	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+	    [BPE_PAYLOAD_SIZE + 1];
+	uint64_t	zcb_start;
+	hrtime_t	zcb_lastprint;
+	uint64_t	zcb_totalasize;
+	uint64_t	zcb_errors[256];
+	int		zcb_readfails;
+	int		zcb_haderrors;
+	spa_t		*zcb_spa;
+	uint32_t	**zcb_vd_obsolete_counts;
+} zdb_cb_t;
+
+/*
+ * Return B_TRUE when two DVA offsets on top-level vdev `vdev` fall in the
+ * same metaslab, i.e. when they are equal once shifted right by that
+ * vdev's vdev_ms_shift.
+ */
+static boolean_t
+same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
+{
+	vdev_t *tvd = vdev_lookup_top(spa, vdev);
+	uint64_t first_ms = off1 >> tvd->vdev_ms_shift;
+	uint64_t second_ms = off2 >> tvd->vdev_ms_shift;
+
+	return (first_ms == second_ms);
+}
+
+/*
+ * Used to simplify reporting of the histogram data.
+ *
+ * One instance describes a single histogram (psize, lsize or asize) as
+ * printed by dump_size_histograms().
+ */
+typedef struct one_histo {
+	/* column heading; only ever points at a string literal */
+	const char *name;
+	/* per-bin block counts */
+	uint64_t *count;
+	/* per-bin byte lengths */
+	uint64_t *len;
+	/* running sum of len[] while the rows are printed */
+	uint64_t cumulative;
+} one_histo_t;
+
+/*
+ * The number of separate histograms processed for psize, lsize and asize.
+ */
+#define	NUM_HISTO 3
+
+/*
+ * Format one histogram value with zdb_nicenum() and print it as a table
+ * cell: tab-prefixed when parseable output was requested, otherwise
+ * right-aligned in a 7 character column.
+ */
+static void
+dump_size_histo_cell(uint64_t value, boolean_t parseable)
+{
+	char cell[32];
+
+	zdb_nicenum(value, cell, sizeof (cell));
+	if (parseable)
+		(void) printf("\t%s", cell);
+	else
+		(void) printf("%7s", cell);
+}
+
+/*
+ * Print the "Block Size Histogram" table.
+ *
+ * There is one row per power-of-two block size, from
+ * 2^SPA_MINBLOCKSHIFT up to (but not including) 2^SPA_MAX_FOR_16M.  Each
+ * row carries three groups of columns (psize, lsize and asize) and each
+ * group shows the block count, the byte length and the cumulative byte
+ * length up to and including that row.
+ *
+ * All numbers are formatted by zdb_nicenum().  Without '-P' the table uses
+ * fixed width columns; with '-P' the fields are tab separated so that the
+ * output can be parsed.
+ */
+static void
+dump_size_histograms(zdb_cb_t *zcb)
+{
+	const boolean_t parseable = (dump_opt['P'] != 0) ? B_TRUE : B_FALSE;
+
+	/* scratch space for the block size printed in the first column */
+	char sizebuf[32];
+
+	/* the three histograms, in the order they appear on each row */
+	one_histo_t histos[NUM_HISTO] = {
+		{ "psize", zcb->zcb_psize_count, zcb->zcb_psize_len, 0 },
+		{ "lsize", zcb->zcb_lsize_count, zcb->zcb_lsize_len, 0 },
+		{ "asize", zcb->zcb_asize_count, zcb->zcb_asize_len, 0 },
+	};
+
+	(void) printf("\nBlock Size Histogram\n");
+
+	/*
+	 * Header line 1: "block" followed by the histogram names.
+	 */
+	if (parseable)
+		(void) printf("\n%s\t", "block");
+	else
+		(void) printf("\n%7s   ", "block");
+
+	for (int h = 0; h < NUM_HISTO; h++) {
+		const char *heading = histos[h].name;
+
+		if (h == NUM_HISTO - 1) {
+			/* last heading: nothing is printed after the name */
+			if (parseable)
+				(void) printf("  %s", heading);
+			else
+				(void) printf("%s", heading);
+		} else if (parseable) {
+			(void) printf("%s\t\t\t", heading);
+		} else {
+			/* left aligned, padded out to the next group */
+			(void) printf("%-7s              ", heading);
+		}
+	}
+	(void) printf("\n");
+
+	/*
+	 * Header line 2: "size" followed by the three column titles,
+	 * repeated once per histogram.
+	 */
+	if (parseable)
+		(void) printf("%s\t", "size");
+	else
+		(void) printf("%7s ", "size");
+
+	for (int h = 0; h < NUM_HISTO; h++) {
+		if (parseable)
+			(void) printf("%s\t%s\t%s\t", "Count", "Size", "Cum.");
+		else
+			(void) printf("%7s%7s%7s", "Count", "Size", "Cum.");
+	}
+	(void) printf("\n");
+
+	/*
+	 * Body: one row per bin.
+	 */
+	for (int shift = SPA_MINBLOCKSHIFT; shift < SPA_MAX_FOR_16M; shift++) {
+		zdb_nicenum((1ULL << shift), sizebuf, sizeof (sizebuf));
+		if (parseable)
+			(void) printf("%s", sizebuf);
+		else
+			(void) printf("%7s:", sizebuf);
+
+		for (int h = 0; h < NUM_HISTO; h++) {
+			one_histo_t *oh = &histos[h];
+
+			oh->cumulative += oh->len[shift];
+
+			dump_size_histo_cell(oh->count[shift], parseable);
+			dump_size_histo_cell(oh->len[shift], parseable);
+			dump_size_histo_cell(oh->cumulative, parseable);
+		}
+		(void) printf("\n");
+	}
+}
+
+/*
+ * Account for one block pointer in the statistics kept in zcb and, unless
+ * leak detection was disabled with -L, claim the block.
+ *
+ * zcb   - traversal state; zcb->zcb_spa must already be set.
+ * zilog - when non-NULL, bp is first handed to zil_bp_tree_add(); a
+ *	   non-zero return makes this call a no-op.
+ * bp    - the block pointer to count.
+ * type  - statistics slot to charge: a DMU object type or one of the
+ *	   ZDB_OT_* pseudo-types below ZDB_OT_TOTAL.
+ */
+static void
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+    dmu_object_type_t type)
+{
+	uint64_t refcnt = 0;
+	int i;
+
+	ASSERT(type < ZDB_OT_TOTAL);
+
+	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+		return;
+
+	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
+	/*
+	 * Charge the block to four slots: (its level, all types),
+	 * (its level, its type), (all levels, all types) and
+	 * (all levels, its type).
+	 */
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+		int t = (i & 1) ? type : ZDB_OT_TOTAL;
+		int equal;
+		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_count++;
+
+		/*
+		 * The histogram is only big enough to record blocks up to
+		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+		 * "other", bucket.
+		 */
+		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+		zb->zb_psize_histogram[idx]++;
+
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		/*
+		 * For blocks with more than one DVA, note when copies share
+		 * a vdev and, if so, whether one such pair also shares a
+		 * metaslab.
+		 */
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) {
+				zb->zb_ditto_samevdev++;
+
+				if (same_metaslab(zcb->zcb_spa,
+				    DVA_GET_VDEV(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[1])))
+					zb->zb_ditto_same_ms++;
+			}
+			break;
+		case 3:
+			/* number of DVA pairs that live on the same vdev */
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal != 0) {
+				zb->zb_ditto_samevdev++;
+
+				/* at most one same-metaslab hit per block */
+				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
+				    same_metaslab(zcb->zcb_spa,
+				    DVA_GET_VDEV(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[1])))
+					zb->zb_ditto_same_ms++;
+				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
+				    same_metaslab(zcb->zcb_spa,
+				    DVA_GET_VDEV(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[0]),
+				    DVA_GET_OFFSET(&bp->blk_dva[2])))
+					zb->zb_ditto_same_ms++;
+				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
+				    same_metaslab(zcb->zcb_spa,
+				    DVA_GET_VDEV(&bp->blk_dva[1]),
+				    DVA_GET_OFFSET(&bp->blk_dva[1]),
+				    DVA_GET_OFFSET(&bp->blk_dva[2])))
+					zb->zb_ditto_same_ms++;
+			}
+			break;
+		}
+	}
+
+	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
+
+	/*
+	 * Embedded blocks are only tallied by embedded type and payload
+	 * size; they are neither binned below nor claimed.
+	 */
+	if (BP_IS_EMBEDDED(bp)) {
+		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+		    [BPE_GET_PSIZE(bp)]++;
+		return;
+	}
+	/*
+	 * The binning histogram bins by powers of two up to
+	 * SPA_MAXBLOCKSIZE rather than creating bins for
+	 * every possible blocksize found in the pool.
+	 *
+	 * NOTE(review): bin is not range checked against SPA_MAX_FOR_16M
+	 * (nor against a size of 0, which would give -1) -- confirm that
+	 * none of psize, lsize and asize can fall outside the arrays.
+	 */
+	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
+
+	zcb->zcb_psize_count[bin]++;
+	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
+	zcb->zcb_psize_total += BP_GET_PSIZE(bp);
+
+	bin = highbit64(BP_GET_LSIZE(bp)) - 1;
+
+	zcb->zcb_lsize_count[bin]++;
+	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
+	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
+
+	bin = highbit64(BP_GET_ASIZE(bp)) - 1;
+
+	zcb->zcb_asize_count[bin]++;
+	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
+	zcb->zcb_asize_total += BP_GET_ASIZE(bp);
+
+	/* with -L there is no leak detection, hence nothing to claim */
+	if (dump_opt['L'])
+		return;
+
+	/*
+	 * For a dedup block, drop one reference from its in-core DDT
+	 * entry and remember how many are left; the entry is removed once
+	 * its total reference count reaches zero.
+	 */
+	if (BP_GET_DEDUP(bp)) {
+		ddt_t *ddt;
+		ddt_entry_t *dde;
+
+		ddt = ddt_select(zcb->zcb_spa, bp);
+		ddt_enter(ddt);
+		dde = ddt_lookup(ddt, bp, B_FALSE);
+
+		if (dde == NULL) {
+			refcnt = 0;
+		} else {
+			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+			ddt_phys_decref(ddp);
+			refcnt = ddp->ddp_refcnt;
+			if (ddt_phys_total_refcnt(dde) == 0)
+				ddt_remove(ddt, dde);
+		}
+		ddt_exit(ddt);
+	}
+
+	/*
+	 * Claim the block: with txg 0 while dedup references remain,
+	 * otherwise with spa_min_claim_txg().  Failure is fatal.
+	 */
+	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
+	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
+/*
+ * Completion callback for the verification reads issued by
+ * zdb_blkptr_cb().
+ *
+ * Frees the read buffer, gives the block's psize back to the in-flight
+ * byte budget (spa_load_verify_bytes) and wakes any waiter.  A failed
+ * read that was not flagged ZIO_FLAG_SPECULATIVE is recorded in
+ * zcb_errors[] / zcb_haderrors and reported on stdout.
+ */
+static void
+zdb_blkptr_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	int ioerr = zio->io_error;
+	zdb_cb_t *zcb = zio->io_private;
+	zbookmark_phys_t *zb = &zio->io_bookmark;
+
+	abd_free(zio->io_abd);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		char blkbuf[BP_SPRINTF_LEN];
+
+		zcb->zcb_haderrors = 1;
+		zcb->zcb_errors[ioerr]++;
+
+		/* the block pointer itself is only shown from -bb upwards */
+		if (dump_opt['b'] >= 2)
+			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		else
+			blkbuf[0] = '\0';
+
+		/*
+		 * zb_level is printed with the signed %lld conversion, so
+		 * it is cast to longlong_t just as zdb_blkptr_cb() does.
+		 */
+		(void) printf("zdb_blkptr_cb: "
+		    "Got error %d reading "
+		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+		    ioerr,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid,
+		    blkbuf);
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Block pointer callback: count the block, optionally read it back for
+ * verification, and periodically print a progress line on stderr.
+ *
+ * Entries at ZB_DNODE_LEVEL, holes and redacted blocks are not counted.
+ * With -c metadata blocks are read, with -cc every block is; embedded
+ * blocks are never read.  Reads are asynchronous and are throttled so
+ * that spa_load_verify_bytes stays around max_inflight_bytes; they
+ * complete in zdb_blkptr_done().
+ *
+ * Always returns 0.
+ */
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	zdb_cb_t *zcb = arg;
+	dmu_object_type_t type;
+	boolean_t is_metadata;
+
+	if (zb->zb_level == ZB_DNODE_LEVEL)
+		return (0);
+
+	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("objset %llu object %llu "
+		    "level %lld offset 0x%llx %s\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level,
+		    (u_longlong_t)blkid2offset(dnp, bp, zb),
+		    blkbuf);
+	}
+
+	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+		return (0);
+
+	type = BP_GET_TYPE(bp);
+
+	/* all DMU_OT_NEWTYPE types share the single ZDB_OT_OTHER slot */
+	zdb_count_block(zcb, zilog, bp,
+	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
+
+	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
+
+	if (!BP_IS_EMBEDDED(bp) &&
+	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
+		size_t size = BP_GET_PSIZE(bp);
+		abd_t *abd = abd_alloc(size, B_FALSE);
+		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+		/* If it's an intent log block, failure is expected. */
+		if (zb->zb_level == ZB_ZIL_LEVEL)
+			flags |= ZIO_FLAG_SPECULATIVE;
+
+		/* wait until the in-flight reads drop below the limit */
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_load_verify_bytes > max_inflight_bytes)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_load_verify_bytes += size;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		zio_nowait(zio_read(NULL, spa, bp, abd, size,
+		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+	}
+
+	zcb->zcb_readfails = 0;
+
+	/* only call gethrtime() every 100 blocks */
+	static int iters;
+	if (++iters > 100)
+		iters = 0;
+	else
+		return (0);
+
+	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+		uint64_t now = gethrtime();
+		char buf[10];
+		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
+
+		/*
+		 * All of the arithmetic stays in 64 bits: the byte counts
+		 * involved do not fit in an int.  The elapsed time is
+		 * divided by 1000 * 1000, so this is bytes per
+		 * (presumably) millisecond, i.e. roughly KB/s; the "1 +"
+		 * terms keep both divisions away from zero.
+		 */
+		uint64_t kb_per_sec =
+		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
+
+		/* never let the unsigned subtraction wrap around */
+		uint64_t bytes_left = (zcb->zcb_totalasize > bytes) ?
+		    zcb->zcb_totalasize - bytes : 0;
+		uint64_t sec_remaining = bytes_left / 1024 / kb_per_sec;
+
+		/* make sure nicenum has enough space */
+		CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
+
+		zfs_nicebytes(bytes, buf, sizeof (buf));
+		(void) fprintf(stderr,
+		    "\r%5s completed (%4lluMB/s) "
+		    "estimated time remaining: "
+		    "%lluhr %02llumin %02llusec        ",
+		    buf, (u_longlong_t)(kb_per_sec / 1024),
+		    (u_longlong_t)(sec_remaining / 60 / 60),
+		    (u_longlong_t)(sec_remaining / 60 % 60),
+		    (u_longlong_t)(sec_remaining % 60));
+
+		zcb->zcb_lastprint = now;
+	}
+
+	return (0);
+}
+
+/*
+ * Range tree callback: report one segment [start, start + size) of the
+ * vdev passed in arg as leaked space.
+ */
+static void
+zdb_leak(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_t *leaky_vd = arg;
+	u_longlong_t id = (u_longlong_t)leaky_vd->vdev_id;
+	u_longlong_t off = (u_longlong_t)start;
+	u_longlong_t len = (u_longlong_t)size;
+
+	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+	    id, off, len);
+}
+
+/*
+ * Metaslab ops table without an alloc routine.  zdb_leak_init() installs
+ * it on the pool's normal and log metaslab classes so that the allocator
+ * does not try to use ms_allocatable while zdb overloads that tree.
+ */
+static metaslab_ops_t zdb_metaslab_ops = {
+	NULL	/* alloc */
+};
+
+/*
+ * iterate_through_spacemap_logs() callback used by zdb_claim_removing().
+ *
+ * Applies one log space map entry to svr->svr_allocd_segs (arg is the
+ * spa_vdev_removal_t).  Entries are ignored when they belong to another
+ * vdev, when txg is older than the metaslab's unflushed txg, or when they
+ * start at or past the end of the vdev's indirect mapping.
+ */
+/* ARGSUSED */
+static int
+load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	spa_vdev_removal_t *removal = arg;
+
+	/* skip vdevs we don't care about */
+	if (removal->svr_vdev_id != sme->sme_vdev)
+		return (0);
+
+	const uint64_t seg_start = sme->sme_offset;
+	const uint64_t seg_len = sme->sme_run;
+	vdev_t *tvd = vdev_lookup_top(spa, sme->sme_vdev);
+	metaslab_t *msp = tvd->vdev_ms[seg_start >> tvd->vdev_ms_shift];
+
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+	if (txg < metaslab_unflushed_txg(msp))
+		return (0);
+
+	vdev_indirect_mapping_t *vim = tvd->vdev_indirect_mapping;
+	ASSERT(vim != NULL);
+	if (seg_start >= vdev_indirect_mapping_max_offset(vim))
+		return (0);
+
+	if (sme->sme_type != SM_ALLOC) {
+		range_tree_remove(removal->svr_allocd_segs,
+		    seg_start, seg_len);
+	} else {
+		range_tree_add(removal->svr_allocd_segs,
+		    seg_start, seg_len);
+	}
+
+	return (0);
+}
+
+/*
+ * Remap callback handed to vdev_op_remap() by claim_segment_cb(): claim
+ * the segment [offset, offset + size) on vd using spa_min_claim_txg().
+ * A failed claim is fatal (VERIFY0).  inner_offset and arg are unused.
+ */
+/* ARGSUSED */
+static void
+claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+    uint64_t size, void *arg)
+{
+	/*
+	 * This callback was called through a remap from
+	 * a device being removed. Therefore, the vdev that
+	 * this callback is applied to is a concrete
+	 * vdev.
+	 */
+	ASSERT(vdev_is_concrete(vd));
+
+	VERIFY0(metaslab_claim_impl(vd, offset, size,
+	    spa_min_claim_txg(vd->vdev_spa)));
+}
+
+/*
+ * Range tree callback: remap the segment [offset, offset + size) of the
+ * vdev passed in arg through the indirect vdev ops, claiming each piece
+ * it maps to via claim_segment_impl_cb().
+ */
+static void
+claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+	vdev_t *removing_vd = arg;
+
+	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
+	    claim_segment_impl_cb, NULL);
+}
+
+/*
+ * Once every directly referenced block has been accounted for, blocks
+ * that are only referenced through a partially complete (and therefore
+ * unused) indirect mapping object may still be missing.  Make a second
+ * pass over the metaslabs of the vdev being removed that have already
+ * been mapped and claim the destination blocks.
+ *
+ * Nothing is done under -L or when no removal is in progress.  The space
+ * claimed here is added to zcb->zcb_removing_size.
+ */
+static void
+zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
+{
+	if (dump_opt['L'] || spa->spa_vdev_removal == NULL)
+		return;
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	range_tree_t *allocs;
+	uint64_t msi;
+
+	ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+	/*
+	 * Gather the allocated segments of every metaslab that starts
+	 * below the end of the indirect mapping, one metaslab at a time,
+	 * moving them from the scratch tree into svr_allocd_segs.
+	 */
+	allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+	msi = 0;
+	while (msi < vd->vdev_ms_count) {
+		metaslab_t *msp = vd->vdev_ms[msi];
+
+		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+			break;
+
+		ASSERT0(range_tree_space(allocs));
+		if (msp->ms_sm != NULL)
+			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
+		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
+		msi++;
+	}
+	range_tree_destroy(allocs);
+
+	/* then apply whatever is still sitting in the log space maps */
+	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
+
+	/*
+	 * Mappings have only been allocated for what has been synced, so
+	 * drop everything beyond that point.
+	 */
+	range_tree_clear(svr->svr_allocd_segs,
+	    vdev_indirect_mapping_max_offset(vim),
+	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
+
+	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
+	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * bpobj iteration callback used by zdb_leak_init() on the pool's obsolete
+ * bpobj: add the range described by the first DVA of bp to the obsolete
+ * counts of the vdev that DVA points at.  arg is the zdb_cb_t; tx is
+ * unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	zdb_cb_t *zcb = arg;
+	spa_t *spa = zcb->zcb_spa;
+	const dva_t *dva = &bp->blk_dva[0];
+	vdev_t *vd;
+
+	ASSERT(!bp_freed);
+	ASSERT(!dump_opt['L']);
+	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
+
+	/* the vdev lookup is done under the SCL_VDEV config lock */
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+	ASSERT3P(vd, !=, NULL);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
+
+	uint32_t *vd_counts = zcb->zcb_vd_obsolete_counts[vd->vdev_id];
+	vdev_indirect_mapping_increment_obsolete_count(
+	    vd->vdev_indirect_mapping,
+	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), vd_counts);
+
+	return (0);
+}
+
+/*
+ * Build the obsolete counts array for indirect vdev vd.
+ *
+ * The counts are loaded from the vdev's indirect mapping, then the
+ * vdev's obsolete space map (if there is one) is added, and finally, when
+ * this is the vdev recorded in spa_condensing_indirect_phys with a
+ * previous obsolete space map object, that space map is added as well.
+ *
+ * Returns the array; zdb_check_for_obsolete_leaks() releases it.
+ */
+static uint32_t *
+zdb_load_obsolete_counts(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	spa_condensing_indirect_phys_t *scip =
+	    &spa->spa_condensing_indirect_phys;
+	uint64_t obsolete_sm_object;
+
+	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
+
+	uint32_t *loaded = vdev_indirect_mapping_load_obsolete_counts(vim);
+
+	if (vd->vdev_obsolete_sm != NULL) {
+		vdev_indirect_mapping_load_obsolete_spacemap(vim, loaded,
+		    vd->vdev_obsolete_sm);
+	}
+
+	/* no previous obsolete space map recorded for this vdev: done */
+	if (scip->scip_vdev != vd->vdev_id ||
+	    scip->scip_prev_obsolete_sm_object == 0)
+		return (loaded);
+
+	space_map_t *prev_obsolete_sm = NULL;
+	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+	vdev_indirect_mapping_load_obsolete_spacemap(vim, loaded,
+	    prev_obsolete_sm);
+	space_map_close(prev_obsolete_sm);
+
+	return (loaded);
+}
+
+/*
+ * Walk the pool's dedup tables before the block traversal starts.
+ *
+ * For every entry visited, each physical variant with a non-zero birth
+ * txg is turned back into a block pointer.  Ditto variants are counted
+ * right away as ZDB_OT_DITTO; for the others the space saved by dedup
+ * (asize * (refcnt - 1)) and the block count are accumulated in zcb.
+ * The walk stops as soon as the DDT_CLASS_UNIQUE class is reached.
+ *
+ * Must not be called under -L.
+ */
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	ddt_bookmark_t ddb;
+	ddt_entry_t dde;
+	int error;
+	int p;
+
+	ASSERT(!dump_opt['L']);
+
+	bzero(&ddb, sizeof (ddb));
+	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+		blkptr_t blk;
+		ddt_phys_t *ddp = dde.dde_phys;
+
+		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+			return;
+
+		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+			if (ddp->ddp_phys_birth == 0)
+				continue;
+			ddt_bp_create(ddb.ddb_checksum,
+			    &dde.dde_key, ddp, &blk);
+			if (p == DDT_PHYS_DITTO) {
+				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+			} else {
+				zcb->zcb_dedup_asize +=
+				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+				zcb->zcb_dedup_blocks++;
+			}
+		}
+		/*
+		 * Look the entry up with the third argument set to B_TRUE
+		 * (presumably "add", so that the entry is in core when
+		 * zdb_count_block() later drops references from it).
+		 *
+		 * NOTE(review): blk is only initialized if at least one
+		 * variant above had a non-zero birth txg -- confirm that
+		 * this always holds for the entries walked here.
+		 */
+		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+		ddt_enter(ddt);
+		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+		ddt_exit(ddt);
+	}
+
+	/* the walk is expected to end by running out of entries */
+	ASSERT(error == ENOENT);
+}
+
+/*
+ * Argument passed to checkpoint_sm_exclude_entry_cb() while iterating
+ * over one vdev's checkpoint space map.
+ */
+typedef struct checkpoint_sm_exclude_entry_arg {
+	vdev_t *cseea_vd;	/* vdev whose checkpoint space map is walked */
+	uint64_t cseea_checkpoint_size;	/* sum of sme_run over the entries */
+} checkpoint_sm_exclude_entry_arg_t;
+
+/*
+ * space_map_iterate() callback for a vdev's checkpoint space map: take
+ * the entry's range out of the ms_allocatable tree of the metaslab that
+ * contains it and add its length to cseea_checkpoint_size.
+ *
+ * arg is a checkpoint_sm_exclude_entry_arg_t.  Always returns 0.
+ */
+static int
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
+{
+	checkpoint_sm_exclude_entry_arg_t *state = arg;
+	vdev_t *tvd = state->cseea_vd;
+	uint64_t ms_index = sme->sme_offset >> tvd->vdev_ms_shift;
+	metaslab_t *ms = tvd->vdev_ms[ms_index];
+	uint64_t end = sme->sme_offset + sme->sme_run;
+
+	ASSERT(sme->sme_type == SM_FREE);
+
+	/*
+	 * The vdev_checkpoint_sm lives at the vdev level while the ms_sm
+	 * space maps live at the metaslab level, so in theory an entry of
+	 * the checkpoint space map could straddle the boundaries of the
+	 * metaslab it belongs to.
+	 *
+	 * In practice, given how the checkpoint's space maps are currently
+	 * populated and manipulated, no entry should cross metaslabs, and
+	 * that is what is verified below.
+	 *
+	 * Nothing fundamentally requires this, though; support for
+	 * metaslab-crossing segments could be added later if needed.
+	 */
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * Removing the entry from the allocated segments doubles as a
+	 * check that the entry was there in the first place.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+	mutex_exit(&ms->ms_lock);
+
+	state->cseea_checkpoint_size += sme->sme_run;
+
+	return (0);
+}
+
+/*
+ * Remove every entry of vd's checkpoint space map from the ms_allocatable
+ * trees of vd's metaslabs and add the space those entries cover to
+ * zcb->zcb_checkpoint_size.  Does nothing when vd has no checkpoint
+ * space map.
+ */
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	/*
+	 * Two reasons to return early:
+	 *
+	 * - There is no vdev_top_zap, which means the pool's version
+	 *   predates the pool checkpoint feature.
+	 *
+	 * - The vdev_top_zap holds no reference to a vdev_checkpoint_sm.
+	 *   Then one of the following is true:
+	 *
+	 *   1] There is no checkpoint
+	 *   2] There is a checkpoint, but no checkpointed blocks
+	 *      have been freed yet
+	 *   3] The current vdev is indirect
+	 */
+	if (vd->vdev_top_zap == 0 ||
+	    zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+		return;
+
+	uint64_t checkpoint_sm_obj;
+	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+	    &checkpoint_sm_obj));
+
+	checkpoint_sm_exclude_entry_arg_t cseea = {
+		.cseea_vd = vd,
+		.cseea_checkpoint_size = 0,
+	};
+
+	space_map_t *checkpoint_sm = NULL;
+	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+
+	/* walk the whole space map, excluding entry after entry */
+	VERIFY0(space_map_iterate(checkpoint_sm,
+	    space_map_length(checkpoint_sm),
+	    checkpoint_sm_exclude_entry_cb, &cseea));
+	space_map_close(checkpoint_sm);
+
+	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+/*
+ * Run zdb_leak_init_vdev_exclude_checkpoint() on every child of the
+ * pool's root vdev.  Must not be called under -L.
+ */
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t c = 0;
+
+	ASSERT(!dump_opt['L']);
+
+	while (c < rvd->vdev_children) {
+		/* children are expected to be indexed by their vdev id */
+		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+		c++;
+	}
+}
+
+/*
+ * iterate_through_spacemap_logs() callback that keeps a signed running
+ * total in *arg (an int64_t): SM_ALLOC entries add their length and
+ * SM_FREE entries subtract it.  Entries on non-concrete vdevs, and
+ * entries whose txg is older than the metaslab's unflushed txg, are
+ * skipped.  Always returns 0.
+ */
+static int
+count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	int64_t *totalp = arg;
+	vdev_t *tvd = vdev_lookup_top(spa, sme->sme_vdev);
+
+	if (!vdev_is_concrete(tvd))
+		return (0);
+
+	metaslab_t *msp = tvd->vdev_ms[sme->sme_offset >> tvd->vdev_ms_shift];
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+	if (txg < metaslab_unflushed_txg(msp))
+		return (0);
+
+	if (sme->sme_type != SM_ALLOC)
+		*totalp -= sme->sme_run;
+	else
+		*totalp += sme->sme_run;
+
+	return (0);
+}
+
+/*
+ * Return the net space (allocations minus frees) recorded in the pool's
+ * log space maps, as summed by count_unflushed_space_cb().  Under -L the
+ * logs are not read and 0 is returned.
+ */
+static int64_t
+get_unflushed_alloc_space(spa_t *spa)
+{
+	int64_t net_alloc = 0;
+
+	if (!dump_opt['L']) {
+		iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
+		    &net_alloc);
+	}
+
+	return (net_alloc);
+}
+
+/*
+ * iterate_through_spacemap_logs() callback that applies one log space map
+ * entry to the ms_allocatable tree of the metaslab containing it.
+ *
+ * arg points at the maptype_t the trees were loaded with: an entry of the
+ * same type is added to the tree, an entry of the other type is removed.
+ * Entries on non-concrete vdevs, and entries whose txg is older than the
+ * metaslab's unflushed txg, are skipped.  Always returns 0.
+ */
+static int
+load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
+{
+	maptype_t *uic_maptype = arg;
+	vdev_t *tvd = vdev_lookup_top(spa, sme->sme_vdev);
+
+	/* skip indirect vdevs */
+	if (!vdev_is_concrete(tvd))
+		return (0);
+
+	const uint64_t seg_start = sme->sme_offset;
+	const uint64_t seg_len = sme->sme_run;
+	metaslab_t *msp = tvd->vdev_ms[seg_start >> tvd->vdev_ms_shift];
+
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
+
+	if (txg < metaslab_unflushed_txg(msp))
+		return (0);
+
+	if (*uic_maptype != sme->sme_type)
+		range_tree_remove(msp->ms_allocatable, seg_start, seg_len);
+	else
+		range_tree_add(msp->ms_allocatable, seg_start, seg_len);
+
+	return (0);
+}
+
+/*
+ * Apply every entry of the pool's log space maps to the ms_allocatable
+ * trees, interpreting the entries relative to maptype (see
+ * load_unflushed_cb()).
+ */
+static void
+load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
+{
+	maptype_t *loaded_as = &maptype;
+
+	iterate_through_spacemap_logs(spa, load_unflushed_cb, loaded_as);
+}
+
+/*
+ * For every metaslab of every top-level vdev that is not an indirect
+ * vdev, empty ms_allocatable and reload it from the metaslab's space map
+ * with the segments of type maptype, then apply the pool's log space
+ * maps on top.  Progress is reported on stderr.
+ */
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		ASSERT3U(i, ==, vd->vdev_id);
+
+		if (vd->vdev_ops == &vdev_indirect_ops)
+			continue;
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			/* %llu takes an unsigned argument, so cast to match */
+			(void) fprintf(stderr,
+			    "\rloading concrete vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)msp->ms_id,
+			    (u_longlong_t)vd->vdev_ms_count);
+
+			mutex_enter(&msp->ms_lock);
+			range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+			/*
+			 * We don't want to spend the CPU manipulating the
+			 * size-ordered tree, so clear the range_tree ops.
+			 */
+			msp->ms_allocatable->rt_ops = NULL;
+
+			if (msp->ms_sm != NULL) {
+				VERIFY0(space_map_load(msp->ms_sm,
+				    msp->ms_allocatable, maptype));
+			}
+			if (!msp->ms_loaded)
+				msp->ms_loaded = B_TRUE;
+			mutex_exit(&msp->ms_lock);
+		}
+	}
+
+	load_unflushed_to_ms_allocatables(spa, maptype);
+}
+
+/*
+ * Rebuild msp->ms_allocatable for a metaslab of indirect vdev vd from the
+ * vdev's indirect mapping: every mapping entry whose source offset lies
+ * inside the metaslab is added to the tree.
+ *
+ * vim_idxp is an in-out parameter.  On entry it is the index in
+ * vim_entries of the first entry belonging to this metaslab; on return
+ * it is the index of the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+    uint64_t *vim_idxp)
+{
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+	mutex_enter(&msp->ms_lock);
+	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+	/*
+	 * Clearing the range_tree ops avoids spending CPU on keeping the
+	 * size-ordered tree up to date.
+	 */
+	msp->ms_allocatable->rt_ops = NULL;
+
+	while (*vim_idxp < vdev_indirect_mapping_num_entries(vim)) {
+		vdev_indirect_mapping_entry_phys_t *entry =
+		    &vim->vim_entries[*vim_idxp];
+		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(entry);
+		uint64_t ent_len = DVA_GET_ASIZE(&entry->vimep_dst);
+
+		ASSERT3U(ent_offset, >=, msp->ms_start);
+
+		/* first entry of a later metaslab: leave the index on it */
+		if (ent_offset >= msp->ms_start + msp->ms_size)
+			break;
+
+		/*
+		 * Mappings are created by walking the metaslabs, so they
+		 * never cross a metaslab boundary.
+		 */
+		ASSERT3U(ent_offset + ent_len, <=,
+		    msp->ms_start + msp->ms_size);
+		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+
+		(*vim_idxp)++;
+	}
+
+	if (!msp->ms_loaded)
+		msp->ms_loaded = B_TRUE;
+	mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Prepare every indirect top-level vdev for leak detection: load its
+ * obsolete counts into zcb->zcb_vd_obsolete_counts, create metaslabs for
+ * it, and fill each metaslab's ms_allocatable from the vdev's indirect
+ * mapping.  Progress is reported on stderr.  Must not be called under -L.
+ */
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+	ASSERT(!dump_opt['L']);
+
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+
+		ASSERT3U(c, ==, vd->vdev_id);
+
+		if (vd->vdev_ops != &vdev_indirect_ops)
+			continue;
+
+		/*
+		 * Note: we don't check for mapping leaks on
+		 * removing vdevs because their ms_allocatable's
+		 * are used to look for leaks in allocated space.
+		 */
+		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+		/*
+		 * Normally, indirect vdevs don't have any
+		 * metaslabs.  We want to set them up for
+		 * zio_claim().
+		 */
+		VERIFY0(vdev_metaslab_init(vd, 0));
+
+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+		uint64_t vim_idx = 0;
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+			/* %llu takes an unsigned argument, so cast to match */
+			(void) fprintf(stderr,
+			    "\rloading indirect vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)vd->vdev_ms[m]->ms_id,
+			    (u_longlong_t)vd->vdev_ms_count);
+
+			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+			    &vim_idx);
+		}
+		/* every mapping entry must have been consumed */
+		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+	}
+}
+
+/*
+ * Set up zcb and the pool for leak detection.
+ *
+ * zcb->zcb_spa is always recorded.  Unless -L was given, this also
+ * replaces the ops of the normal and log metaslab classes, allocates the
+ * per-vdev obsolete counts table, loads the allocated segments of every
+ * metaslab into ms_allocatable, excludes the space held by a pool
+ * checkpoint, processes the pool's obsolete bpobj when it is open, and
+ * walks the dedup tables.
+ */
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	zcb->zcb_spa = spa;
+
+	if (dump_opt['L'])
+		return;
+
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	/*
+	 * We are going to be changing the meaning of the metaslab's
+	 * ms_allocatable.  Ensure that the allocator doesn't try to
+	 * use the tree.
+	 */
+	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+
+	/* one slot per top-level vdev; stays NULL for concrete vdevs */
+	zcb->zcb_vd_obsolete_counts =
+	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+	    UMEM_NOFAIL);
+
+	/*
+	 * For leak detection, we overload the ms_allocatable trees
+	 * to contain allocated segments instead of free segments.
+	 * As a result, we can't use the normal metaslab_load/unload
+	 * interfaces.
+	 */
+	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+
+	/*
+	 * On load_concrete_ms_allocatable_trees() we loaded all the
+	 * allocated entries from the ms_sm to the ms_allocatable for
+	 * each metaslab. If the pool has a checkpoint or is in the
+	 * middle of discarding a checkpoint, some of these blocks
+	 * may have been freed but their ms_sm may not have been
+	 * updated because they are referenced by the checkpoint. In
+	 * order to avoid false-positives during leak-detection, we
+	 * go through the vdev's checkpoint space map and exclude all
+	 * its entries from their relevant ms_allocatable.
+	 *
+	 * We also aggregate the space held by the checkpoint and add
+	 * it to zcb_checkpoint_size.
+	 *
+	 * Note that at this point we are also verifying that all the
+	 * entries on the checkpoint_sm are marked as allocated in
+	 * the ms_sm of their relevant metaslab.
+	 * [see comment in checkpoint_sm_exclude_entry_cb()]
+	 */
+	zdb_leak_init_exclude_checkpoint(spa, zcb);
+	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+
+	/* fold the obsolete bpobj into the per-vdev obsolete counts */
+	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+		ASSERT(spa_feature_is_enabled(spa,
+		    SPA_FEATURE_DEVICE_REMOVAL));
+		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+		    increment_indirect_mapping_cb, zcb, NULL);
+	}
+
+	/* the DDT walk runs with the SCL_CONFIG lock held as reader */
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	zdb_ddt_leak_init(spa, zcb);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * Compare, for every entry of vd's indirect mapping, the number of bytes
+ * still present in ms_allocatable with the obsolete count loaded for
+ * that entry, and report mismatches.
+ *
+ * Per-entry mismatches are printed when the vdev's obsolete counts are
+ * precise, or at -ddddd and above.  When the counts are not precise a
+ * non-zero total is only reported as unverifiable; when they are, it is
+ * reported as a mismatch and counted as a leak.
+ *
+ * The obsolete counts array of vd is released and its slot in
+ * zcb->zcb_vd_obsolete_counts cleared before returning.
+ *
+ * Returns B_TRUE if a leak was detected, B_FALSE otherwise.
+ */
+static boolean_t
+zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
+{
+	boolean_t leaks = B_FALSE;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	const uint64_t sector = 1ULL << vd->vdev_ashift;
+	uint64_t total_leaked = 0;
+	boolean_t are_precise = B_FALSE;
+
+	ASSERT(vim != NULL);
+
+	/*
+	 * Whether the counts are precise depends only on vd, so it is
+	 * looked up once instead of once per mapping entry.
+	 */
+	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+
+	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+		vdev_indirect_mapping_entry_phys_t *vimep =
+		    &vim->vim_entries[i];
+		uint64_t obsolete_bytes = 0;
+		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+		/*
+		 * This is not very efficient but it's easy to
+		 * verify correctness.
+		 */
+		for (uint64_t inner_offset = 0;
+		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
+		    inner_offset += sector) {
+			if (range_tree_contains(msp->ms_allocatable,
+			    offset + inner_offset, sector)) {
+				obsolete_bytes += sector;
+			}
+		}
+
+		int64_t bytes_leaked = obsolete_bytes -
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
+		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
+
+		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
+			(void) printf("obsolete indirect mapping count "
+			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+			    (u_longlong_t)bytes_leaked);
+		}
+		total_leaked += ABS(bytes_leaked);
+	}
+
+	if (!are_precise && total_leaked > 0) {
+		int pct_leaked = total_leaked * 100 /
+		    vdev_indirect_mapping_bytes_mapped(vim);
+		/*
+		 * Adjacent string literals are concatenated as-is, so the
+		 * space before "unreferenced" has to be spelled out.
+		 */
+		(void) printf("cannot verify obsolete indirect mapping "
+		    "counts of vdev %llu because precise feature was not "
+		    "enabled when it was removed: %d%% (%llx bytes) of mapping "
+		    "unreferenced\n",
+		    (u_longlong_t)vd->vdev_id, pct_leaked,
+		    (u_longlong_t)total_leaked);
+	} else if (total_leaked > 0) {
+		(void) printf("obsolete indirect mapping count mismatch "
+		    "for vdev %llu -- %llx total bytes mismatched\n",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)total_leaked);
+		leaks |= B_TRUE;
+	}
+
+	vdev_indirect_mapping_free_obsolete_counts(vim,
+	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
+
+	return (leaks);
+}
+
+/*
+ * Finish leak detection: check obsolete counts of every top-level vdev
+ * that has them, empty each metaslab's ms_allocatable tree (reporting the
+ * leftovers through zdb_leak for non-indirect vdevs), and free the
+ * zcb_vd_obsolete_counts array.
+ *
+ * Returns B_TRUE if zdb_check_for_obsolete_leaks() reported a leak for
+ * any vdev; always B_FALSE when dump_opt['L'] is set.
+ */
+static boolean_t
+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
+{
+	vdev_t *rvd;
+	boolean_t leaks = B_FALSE;
+
+	/* Leak detection is disabled when dump_opt['L'] is set. */
+	if (dump_opt['L'])
+		return (B_FALSE);
+
+	rvd = spa->spa_root_vdev;
+	for (unsigned c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
+		boolean_t is_indirect;
+
+		if (zcb->zcb_vd_obsolete_counts[c] != NULL)
+			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+
+		is_indirect = (vd->vdev_ops == &vdev_indirect_ops);
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			ASSERT3P(mg, ==, msp->ms_group);
+
+			/*
+			 * During the traversal ms_allocatable held the
+			 * allocated segments, and every claimed block was
+			 * removed from it.  Whatever is still in the tree
+			 * is therefore an allocated block nobody claimed.
+			 * On an indirect vdev the remainder is just the
+			 * unreferenced part of the mapping, which is not
+			 * a bug, so it is discarded without a callback.
+			 */
+			if (is_indirect) {
+				range_tree_vacate(msp->ms_allocatable,
+				    NULL, NULL);
+			} else {
+				range_tree_vacate(msp->ms_allocatable,
+				    zdb_leak, vd);
+			}
+
+			if (msp->ms_loaded)
+				msp->ms_loaded = B_FALSE;
+		}
+	}
+
+	umem_free(zcb->zcb_vd_obsolete_counts,
+	    rvd->vdev_children * sizeof (uint32_t *));
+	zcb->zcb_vd_obsolete_counts = NULL;
+
+	return (leaks);
+}
+
+/*
+ * Block-pointer callback: account bp in the zdb_cb_t passed as arg under
+ * the ZDB_OT_DEFERRED type.  tx is unused.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	/* When dump_opt['b'] is 5 or more, print each block as it is counted. */
+	if (dump_opt['b'] >= 5) {
+		char bp_text[BP_SPRINTF_LEN];
+
+		snprintf_blkptr(bp_text, sizeof (bp_text), bp);
+		(void) printf("[%s] %s\n", "deferred free", bp_text);
+	}
+
+	zdb_count_block((zdb_cb_t *)arg, NULL, bp, ZDB_OT_DEFERRED);
+	return (0);
+}
+
+/*
+ * Walk the livelists that the user has destroyed but that are still
+ * recorded in the MOS awaiting their free, calling func(livelist, arg)
+ * on each one.  Does nothing if the pool directory has no
+ * DMU_POOL_DELETED_CLONES entry.
+ */
+static void
+iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t zap_obj;
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	dsl_deadlist_t ll;
+	int err;
+
+	err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+	if (err == ENOENT)
+		return;
+	ASSERT0(err);
+
+	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
+	ll.dl_os = NULL;
+
+	zap_cursor_init(&zc, mos, zap_obj);
+	while (zap_cursor_retrieve(&zc, &attr) == 0) {
+		/* Each ZAP entry's first integer is the livelist object. */
+		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
+		func(&ll, arg);
+		dsl_deadlist_close(&ll);
+
+		(void) zap_cursor_advance(&zc);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Adapter that lets count_block_cb() be used where a callback taking the
+ * extra bp_freed argument is required (see the bpobj_iterate_nofree()
+ * calls in dump_block_stats()).  Freed block pointers are not expected
+ * here, which is asserted.  Returns whatever count_block_cb() returns.
+ */
+static int
+bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	ASSERT(!bp_freed);
+	return (count_block_cb(arg, bp, tx));
+}
+
+/*
+ * dsl_deadlist_iterate() callback: count the blocks of one sub-livelist
+ * that were allocated but not freed.  args is the zdb_cb_t that
+ * count_block_cb() updates.  Always returns 0.
+ */
+static int
+livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
+{
+	bplist_t blks;
+
+	bplist_create(&blks);
+
+	/* Collect the blocks that were alloc'd but never freed ... */
+	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
+
+	/* ... and feed each of them to the block counter. */
+	(void) bplist_iterate(&blks, count_block_cb, args, NULL);
+
+	bplist_destroy(&blks);
+	return (0);
+}
+
+/*
+ * Per-livelist callback for iterate_deleted_livelists(): run
+ * livelist_entry_count_blocks_cb() over every entry of ll, forwarding
+ * arg unchanged.
+ */
+static void
+livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
+{
+	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
+}
+
+/*
+ * Count the blocks in the livelists that have been destroyed by the user
+ * but haven't yet been freed.  The counts are accumulated in zbc through
+ * livelist_count_blocks().
+ */
+static void
+deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
+{
+	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
+}
+
+/*
+ * Per-livelist callback used by deleted_livelists_dump_mos(): bump the
+ * SPA_FEATURE_LIVELIST entry of global_feature_count, print the list
+ * under the heading "Deleted Livelist", and run
+ * sublivelist_verify_lightweight() over each of its entries.
+ * arg is not used and must be NULL (asserted).
+ */
+static void
+dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
+{
+	ASSERT3P(arg, ==, NULL);
+	global_feature_count[SPA_FEATURE_LIVELIST]++;
+	dump_blkptr_list(ll, "Deleted Livelist");
+	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
+}
+
+/*
+ * Print out, register object references to, and increment feature counts for
+ * livelists that have been destroyed by the user but haven't yet been freed.
+ *
+ * Does nothing if the pool directory has no DMU_POOL_DELETED_CLONES entry.
+ */
+static void
+deleted_livelists_dump_mos(spa_t *spa)
+{
+	uint64_t zap_obj;
+	objset_t *mos = spa->spa_meta_objset;
+	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+	if (err == ENOENT)
+		return;
+	/*
+	 * On any other failure zap_obj was never filled in; catch that
+	 * before it is handed to mos_obj_refd(), matching the check in
+	 * iterate_deleted_livelists().
+	 */
+	ASSERT0(err);
+	mos_obj_refd(zap_obj);
+	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
+}
+
+/*
+ * Traverse every block in the pool, accumulate per-type and per-level
+ * statistics in a zdb_cb_t, and print the resulting report.  Unless
+ * dump_opt['L'] is set, the traversal doubles as a leak check: the bytes
+ * found are compared against the bytes the metaslab classes report as
+ * allocated.
+ *
+ * Returns 0 on success, 2 if leaks were detected or no block pointers
+ * were counted at all, and 3 if the traversal recorded errors.
+ */
+static int
+dump_block_stats(spa_t *spa)
+{
+	zdb_cb_t zcb;
+	zdb_blkstats_t *zb, *tzb;
+	uint64_t norm_alloc, norm_space, total_alloc, total_found;
+	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
+	boolean_t leaks = B_FALSE;
+	int e, c, err;
+	bp_embedded_type_t i;
+
+	bzero(&zcb, sizeof (zcb));
+	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
+	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+	    (dump_opt['c'] == 1) ? "metadata " : "",
+	    dump_opt['c'] ? "checksums " : "",
+	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+	    !dump_opt['L'] ? "nothing leaked " : "");
+
+	/*
+	 * When leak detection is enabled we load all space maps as SM_ALLOC
+	 * maps, then traverse the pool claiming each block we discover. If
+	 * the pool is perfectly consistent, the segment trees will be empty
+	 * when we're done. Anything left over is a leak; any block we can't
+	 * claim (because it's not part of any space map) is a double
+	 * allocation, reference to a freed block, or an unclaimed log block.
+	 *
+	 * When leak detection is disabled (-L option) we still traverse the
+	 * pool claiming each block we discover, but we skip opening any space
+	 * maps.
+	 */
+	zdb_leak_init(spa, &zcb);
+
+	/*
+	 * If there's a deferred-free bplist, process that first.
+	 */
+	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+	    bpobj_count_block_cb, &zcb, NULL);
+
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+		    bpobj_count_block_cb, &zcb, NULL);
+	}
+
+	zdb_claim_removing(spa, &zcb);
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+		    &zcb, NULL));
+	}
+
+	deleted_livelists_count_blocks(spa, &zcb);
+
+	if (dump_opt['c'] > 1)
+		flags |= TRAVERSE_PREFETCH_DATA;
+
+	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
+	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
+	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
+
+	/*
+	 * If we've traversed the data blocks then we need to wait for those
+	 * I/Os to complete. We leverage "The Godfather" zio to wait on
+	 * all async I/Os to complete.
+	 */
+	if (dump_opt['c']) {
+		for (c = 0; c < max_ncpus; c++) {
+			(void) zio_wait(spa->spa_async_zio_root[c]);
+			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+			    ZIO_FLAG_GODFATHER);
+		}
+	}
+	ASSERT0(spa->spa_load_verify_bytes);
+
+	/*
+	 * Done after zio_wait() since zcb_haderrors is modified in
+	 * zdb_blkptr_done()
+	 */
+	zcb.zcb_haderrors |= err;
+
+	if (zcb.zcb_haderrors) {
+		(void) printf("\nError counts:\n\n");
+		(void) printf("\t%5s  %s\n", "errno", "count");
+		for (e = 0; e < 256; e++) {
+			if (zcb.zcb_errors[e] != 0) {
+				(void) printf("\t%5d  %llu\n",
+				    e, (u_longlong_t)zcb.zcb_errors[e]);
+			}
+		}
+	}
+
+	/*
+	 * Report any leaked segments.
+	 */
+	leaks |= zdb_leak_fini(spa, &zcb);
+
+	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	total_alloc = norm_alloc +
+	    metaslab_class_get_alloc(spa_log_class(spa)) +
+	    metaslab_class_get_alloc(spa_special_class(spa)) +
+	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
+	    get_unflushed_alloc_space(spa);
+	total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
+	    zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
+
+	/* The space accounting is only compared when leak detection is on. */
+	if (total_found == total_alloc && !dump_opt['L']) {
+		(void) printf("\n\tNo leaks (block sum matches space"
+		    " maps exactly)\n");
+	} else if (!dump_opt['L']) {
+		(void) printf("block traversal size %llu != alloc %llu "
+		    "(%s %lld)\n",
+		    (u_longlong_t)total_found,
+		    (u_longlong_t)total_alloc,
+		    "leaked",
+		    (longlong_t)(total_alloc - total_found));
+		leaks = B_TRUE;
+	}
+
+	/* Also guards the divisions by zb_count in the averages below. */
+	if (tzb->zb_count == 0)
+		return (2);
+
+	(void) printf("\n");
+	(void) printf("\t%-16s %14llu\n", "bp count:",
+	    (u_longlong_t)tzb->zb_count);
+	(void) printf("\t%-16s %14llu\n", "ganged count:",
+	    (u_longlong_t)tzb->zb_gangs);
+	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
+	    (u_longlong_t)tzb->zb_lsize,
+	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
+	    "bp physical:", (u_longlong_t)tzb->zb_psize,
+	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_psize);
+	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
+	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
+	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_asize);
+	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
+	    "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
+	    (u_longlong_t)zcb.zcb_dedup_blocks,
+	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
+	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+
+	if (spa_special_class(spa)->mc_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_special_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_special_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Special class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
+	if (spa_dedup_class(spa)->mc_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_dedup_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_dedup_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Dedup class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
+	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+		if (zcb.zcb_embedded_blocks[i] == 0)
+			continue;
+		(void) printf("\n");
+		(void) printf("\tadditional, non-pointer bps of type %u: "
+		    "%10llu\n",
+		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+		if (dump_opt['b'] >= 3) {
+			(void) printf("\t number of (compressed) bytes:  "
+			    "number of bps\n");
+			dump_histogram(zcb.zcb_embedded_histogram[i],
+			    sizeof (zcb.zcb_embedded_histogram[i]) /
+			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+		}
+	}
+
+	if (tzb->zb_ditto_samevdev != 0) {
+		(void) printf("\tDittoed blocks on same vdev: %llu\n",
+		    (u_longlong_t)tzb->zb_ditto_samevdev);
+	}
+	if (tzb->zb_ditto_same_ms != 0) {
+		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
+		    (u_longlong_t)tzb->zb_ditto_same_ms);
+	}
+
+	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+		if (vim == NULL) {
+			continue;
+		}
+
+		/*
+		 * Format the mapping's size for the "in memory" figure;
+		 * the third argument is the length of the output buffer.
+		 */
+		char mem[32];
+		zdb_nicenum(vdev_indirect_mapping_size(vim),
+		    mem, sizeof (mem));
+
+		(void) printf("\tindirect vdev id %llu has %llu segments "
+		    "(%s in memory)\n",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
+	}
+
+	if (dump_opt['b'] >= 2) {
+		int l, t, level;
+		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+		    "\t  avg\t comp\t%%Total\tType\n");
+
+		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+			char csize[32], lsize[32], psize[32], asize[32];
+			char avg[32], gang[32];
+			const char *typename;
+
+			/* make sure nicenum has enough space */
+			CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
+
+			/* Types past DMU_OT_NUMTYPES are zdb's own extras. */
+			if (t < DMU_OT_NUMTYPES)
+				typename = dmu_ot[t].ot_name;
+			else
+				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
+
+			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+				(void) printf("%6s\t%5s\t%5s\t%5s"
+				    "\t%5s\t%5s\t%6s\t%s\n",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    typename);
+				continue;
+			}
+
+			/*
+			 * Walk the levels from highest to lowest; the final
+			 * pass (l == -1) prints the ZB_TOTAL row.
+			 */
+			for (l = ZB_TOTAL - 1; l >= -1; l--) {
+				level = (l == -1 ? ZB_TOTAL : l);
+				zb = &zcb.zcb_type[level][t];
+
+				if (zb->zb_asize == 0)
+					continue;
+
+				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+					continue;
+
+				/* An L0 row equal to the total adds nothing. */
+				if (level == 0 && zb->zb_asize ==
+				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+					continue;
+
+				zdb_nicenum(zb->zb_count, csize,
+				    sizeof (csize));
+				zdb_nicenum(zb->zb_lsize, lsize,
+				    sizeof (lsize));
+				zdb_nicenum(zb->zb_psize, psize,
+				    sizeof (psize));
+				zdb_nicenum(zb->zb_asize, asize,
+				    sizeof (asize));
+				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
+				    sizeof (avg));
+				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
+
+				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+				    "\t%5.2f\t%6.2f\t",
+				    csize, lsize, psize, asize, avg,
+				    (double)zb->zb_lsize / zb->zb_psize,
+				    100.0 * zb->zb_asize / tzb->zb_asize);
+
+				if (level == ZB_TOTAL)
+					(void) printf("%s\n", typename);
+				else
+					(void) printf("    L%d %s\n",
+					    level, typename);
+
+				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+					(void) printf("\t number of ganged "
+					    "blocks: %s\n", gang);
+				}
+
+				if (dump_opt['b'] >= 4) {
+					(void) printf("psize "
+					    "(in 512-byte sectors): "
+					    "number of blocks\n");
+					dump_histogram(zb->zb_psize_histogram,
+					    PSIZE_HISTO_SIZE, 0);
+				}
+			}
+		}
+
+		/* Output a table summarizing block sizes in the pool */
+		dump_size_histograms(&zcb);
+	}
+
+	(void) printf("\n");
+
+	if (leaks)
+		return (2);
+
+	if (zcb.zcb_haderrors)
+		return (3);
+
+	return (0);
+}
+
+/*
+ * One node of the AVL tree built by zdb_ddt_add_cb() for the simulated
+ * dedup table: a dedup key plus running totals over every block pointer
+ * seen with that key.
+ */
+typedef struct zdb_ddt_entry {
+	ddt_key_t	zdde_key;	/* filled by ddt_key_fill() from a bp */
+	uint64_t	zdde_ref_blocks; /* number of bps seen with this key */
+	uint64_t	zdde_ref_lsize;	/* sum of BP_GET_LSIZE() over them */
+	uint64_t	zdde_ref_psize;	/* sum of BP_GET_PSIZE() over them */
+	uint64_t	zdde_ref_dsize;	/* sum of bp_get_dsize_sync() */
+	avl_node_t	zdde_node;	/* AVL linkage (see avl_create()) */
+} zdb_ddt_entry_t;
+
+/*
+ * traverse_pool() callback for the simulated DDT.  arg is the AVL tree of
+ * zdb_ddt_entry_t; each eligible block pointer is folded into the entry
+ * matching its dedup key, creating the entry on first sight.  Holes,
+ * embedded bps, dnode-level bookmarks, bps without a checksum, indirect
+ * (level > 0) bps and metadata are skipped.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	avl_tree_t *tree = arg;
+	avl_index_t slot;
+	zdb_ddt_entry_t key_holder;
+	zdb_ddt_entry_t *entry;
+
+	if (zb->zb_level == ZB_DNODE_LEVEL)
+		return (0);
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+
+	/* Progress line, once per objset root, when dump_opt['S'] > 1. */
+	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+		(void) printf("traversing objset %llu, %llu objects, "
+		    "%lu blocks so far\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    avl_numnodes(tree));
+	}
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF)
+		return (0);
+	if (BP_GET_LEVEL(bp) > 0)
+		return (0);
+	if (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+		return (0);
+
+	/* Only the key of key_holder is initialised; it is a search probe. */
+	ddt_key_fill(&key_holder.zdde_key, bp);
+
+	entry = avl_find(tree, &key_holder, &slot);
+	if (entry == NULL) {
+		entry = umem_zalloc(sizeof (*entry), UMEM_NOFAIL);
+		entry->zdde_key = key_holder.zdde_key;
+		avl_insert(tree, entry, slot);
+	}
+
+	entry->zdde_ref_blocks += 1;
+	entry->zdde_ref_lsize += BP_GET_LSIZE(bp);
+	entry->zdde_ref_psize += BP_GET_PSIZE(bp);
+	entry->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+	return (0);
+}
+
+/*
+ * Build a dedup table in memory by traversing the whole pool with
+ * zdb_ddt_add_cb(), turn it into a histogram bucketed by
+ * highbit64(reference count) - 1, and print the histogram followed by
+ * the resulting dedup ratio.
+ */
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+	avl_tree_t t;
+	void *cookie = NULL;
+	zdb_ddt_entry_t *zdde;
+	ddt_histogram_t ddh_total;
+	ddt_stat_t dds_total;
+
+	bzero(&ddh_total, sizeof (ddh_total));
+	bzero(&dds_total, sizeof (dds_total));
+	avl_create(&t, ddt_entry_compare,
+	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+	/* Hold the config lock as reader for the duration of the walk. */
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	/* Drain the tree, folding each entry into the histogram. */
+	for (;;) {
+		ddt_stat_t dds;
+		uint64_t refcnt;
+
+		zdde = avl_destroy_nodes(&t, &cookie);
+		if (zdde == NULL)
+			break;
+
+		refcnt = zdde->zdde_ref_blocks;
+		ASSERT(refcnt != 0);
+
+		/* Referenced totals are taken as accumulated ... */
+		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+		dds.dds_ref_psize = zdde->zdde_ref_psize;
+		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+		/* ... and the per-copy figures are those divided by refcnt. */
+		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+		    &dds, 0);
+
+		umem_free(zdde, sizeof (*zdde));
+	}
+	avl_destroy(&t);
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	(void) printf("Simulated DDT histogram:\n");
+	zpool_dump_ddt(&dds_total, &ddh_total);
+	dump_dedup_ratio(&dds_total);
+}
+
+/*
+ * Cross-check the refcounts of the device_removal and obsolete_counts
+ * features against what can be counted in the pool: indirect vdevs for
+ * the former; precise vdevs, obsolete space maps, mappings with counts,
+ * an in-progress condense, the obsolete bpobj and remap deadlists for
+ * the latter.  Prints the outcome of both checks.
+ *
+ * Returns 0 if both refcounts match, 1 otherwise.
+ */
+static int
+verify_device_removal_feature_counts(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	spa_condensing_indirect_phys_t *scip =
+	    &spa->spa_condensing_indirect_phys;
+	uint64_t dr_feature_refcount = 0, oc_feature_refcount = 0;
+	uint64_t indirect_vdev_count = 0, precise_vdev_count = 0;
+	uint64_t obsolete_sm_count = 0, obsolete_counts_count = 0;
+	uint64_t scip_count = 0, obsolete_bpobj_count = 0;
+	uint64_t obsolete_counts_object_count;
+	int ret = 0;
+
+	/* A condense in progress contributes two objects to the count. */
+	if (scip->scip_next_mapping_object != 0) {
+		vdev_t *vd = rvd->vdev_child[scip->scip_vdev];
+
+		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+		(void) printf("Condensing indirect vdev %llu: new mapping "
+		    "object %llu, prev obsolete sm %llu\n",
+		    (u_longlong_t)scip->scip_vdev,
+		    (u_longlong_t)scip->scip_next_mapping_object,
+		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
+
+		if (scip->scip_prev_obsolete_sm_object != 0) {
+			space_map_t *prev_obsolete_sm = NULL;
+
+			VERIFY0(space_map_open(&prev_obsolete_sm,
+			    spa->spa_meta_objset,
+			    scip->scip_prev_obsolete_sm_object,
+			    0, vd->vdev_asize, 0));
+			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
+			(void) printf("\n");
+			space_map_close(prev_obsolete_sm);
+		}
+
+		scip_count += 2;
+	}
+
+	/* Tally the per-vdev contributions. */
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+		boolean_t are_precise;
+		uint64_t obsolete_sm_object;
+
+		if (vic->vic_mapping_object != 0) {
+			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
+			    vd->vdev_removing);
+			indirect_vdev_count++;
+
+			if (vd->vdev_indirect_mapping->vim_havecounts)
+				obsolete_counts_count++;
+		}
+
+		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+		if (are_precise) {
+			ASSERT(vic->vic_mapping_object != 0);
+			precise_vdev_count++;
+		}
+
+		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+		if (obsolete_sm_object != 0) {
+			ASSERT(vic->vic_mapping_object != 0);
+			obsolete_sm_count++;
+		}
+	}
+
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
+	    &dr_feature_refcount);
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
+	    &oc_feature_refcount);
+
+	if (dr_feature_refcount == indirect_vdev_count) {
+		(void) printf("Verified device_removal feature refcount "
+		    "of %llu is correct\n",
+		    (u_longlong_t)dr_feature_refcount);
+	} else {
+		ret = 1;
+		(void) printf("Number of indirect vdevs (%llu) "
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)indirect_vdev_count,
+		    (u_longlong_t)dr_feature_refcount);
+	}
+
+	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
+		obsolete_bpobj_count++;
+	}
+
+	obsolete_counts_object_count = precise_vdev_count +
+	    obsolete_sm_count +
+	    obsolete_counts_count +
+	    scip_count +
+	    obsolete_bpobj_count +
+	    remap_deadlist_count;
+
+	if (oc_feature_refcount == obsolete_counts_object_count) {
+		(void) printf("Verified indirect_refcount feature refcount "
+		    "of %llu is correct\n",
+		    (u_longlong_t)oc_feature_refcount);
+	} else {
+		ret = 1;
+		(void) printf("Number of obsolete counts objects (%llu) "
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)obsolete_counts_object_count,
+		    (u_longlong_t)oc_feature_refcount);
+		/* Break the expected total down into its contributors. */
+		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
+		    "ob:%llu rd:%llu\n",
+		    (u_longlong_t)precise_vdev_count,
+		    (u_longlong_t)obsolete_sm_count,
+		    (u_longlong_t)obsolete_counts_count,
+		    (u_longlong_t)scip_count,
+		    (u_longlong_t)obsolete_bpobj_count,
+		    (u_longlong_t)remap_deadlist_count);
+	}
+
+	return (ret);
+}
+
+/*
+ * Set ZFS_IMPORT_SKIP_MMP in the import flags of the spa named target,
+ * if such a spa is found, so that the activity check is disabled and
+ * active pools can be examined.  The lookup and the update are done
+ * under spa_namespace_lock.
+ */
+static void
+zdb_set_skip_mmp(char *target)
+{
+	spa_t *spa;
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_lookup(target);
+	if (spa != NULL)
+		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+	mutex_exit(&spa_namespace_lock);
+}
+
+#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appended to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ *
+ * NULL is returned if one of the string allocations fails; nothing
+ * allocated by this function is left for the caller to free in that
+ * case. Failures of spa_get_stats() or spa_import() are fatal.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+	int error = 0;
+	char *poolname, *bogus_name = NULL;
+	/* Set when cfg was fetched here and is therefore ours to free. */
+	boolean_t freecfg = B_FALSE;
+
+	/* If the target is not a pool, then extract the pool name */
+	char *path_start = strchr(target, '/');
+	if (path_start != NULL) {
+		size_t poolname_len = path_start - target;
+		poolname = strndup(target, poolname_len);
+		if (poolname == NULL)
+			return (NULL);
+	} else {
+		poolname = target;
+	}
+
+	if (cfg == NULL) {
+		zdb_set_skip_mmp(poolname);
+		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		if (error != 0) {
+			fatal("Tried to read config of pool \"%s\" but "
+			    "spa_get_stats() failed with error %d\n",
+			    poolname, error);
+		}
+		freecfg = B_TRUE;
+	}
+
+	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
+		/* poolname is a private copy only if it was strndup()ed. */
+		if (target != poolname)
+			free(poolname);
+		if (freecfg)
+			nvlist_free(cfg);
+		return (NULL);
+	}
+	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+	error = spa_import(bogus_name, cfg, NULL,
+	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+	    ZFS_IMPORT_SKIP_MMP);
+	/* The config is not used past this point. */
+	if (freecfg)
+		nvlist_free(cfg);
+	if (error != 0) {
+		fatal("Tried to import pool \"%s\" but spa_import() failed "
+		    "with error %d\n", bogus_name, error);
+	}
+
+	if (new_path != NULL && path_start != NULL) {
+		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+			/* path_start != NULL, so poolname is our copy. */
+			free(poolname);
+			free(bogus_name);
+			return (NULL);
+		}
+	}
+
+	if (target != poolname)
+		free(poolname);
+
+	return (bogus_name);
+}
+
+/*
+ * State handed to verify_checkpoint_sm_entry_cb() through
+ * space_map_iterate().
+ */
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+	/* vdev whose metaslabs each space map entry is checked against */
+	vdev_t *vcsec_vd;
+
+	/* the following fields are only used for printing progress */
+	uint64_t vcsec_entryid;		/* entries processed so far */
+	uint64_t vcsec_num_entries;	/* total printed as "of N" */
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+/* A progress line is printed once every this many space map entries. */
+#define	ENTRIES_PER_PROGRESS_UPDATE 10000
+
+/*
+ * space_map_iterate() callback: verify that the range described by one
+ * entry of a vdev's checkpoint space map lies within a single metaslab
+ * of vcsec_vd and is absent from that metaslab's ms_allocatable tree.
+ * arg is a verify_checkpoint_sm_entry_cb_arg_t.  Always returns 0.
+ */
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+	vdev_t *vd = vcsec->vcsec_vd;
+	uint64_t ms_index = sme->sme_offset >> vd->vdev_ms_shift;
+	metaslab_t *ms = vd->vdev_ms[ms_index];
+	uint64_t end = sme->sme_offset + sme->sme_run;
+
+	ASSERT(sme->sme_type == SM_FREE);
+
+	/* Emit a progress line every ENTRIES_PER_PROGRESS_UPDATE entries. */
+	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+		(void) fprintf(stderr,
+		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
+		    (longlong_t)vd->vdev_id,
+		    (longlong_t)vcsec->vcsec_entryid,
+		    (longlong_t)vcsec->vcsec_num_entries);
+	}
+	vcsec->vcsec_entryid++;
+
+	/*
+	 * The entry must not cross a metaslab boundary.
+	 * See comment in checkpoint_sm_exclude_entry_cb()
+	 */
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * Everything recorded in the vdev_checkpoint_sm is allocated in
+	 * the checkpointed state of the pool, so none of it may show up
+	 * in the corresponding ms_allocatable tree.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_verify_not_present(ms->ms_allocatable,
+	    sme->sme_offset, sme->sme_run);
+	mutex_exit(&ms->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Verify that every segment recorded in a vdev_checkpoint_sm is
+ * allocated according to the checkpoint's ms_sm, i.e. is not present in
+ * the checkpoint's ms_allocatable.
+ *
+ * This is done by walking the checkpoint space map of each top-level
+ * vdev of the current pool state and checking each entry against the
+ * metaslabs of the matching vdev in the checkpointed pool state.
+ *
+ * Note that the function changes the state of the ms_allocatable trees
+ * of the checkpoint spa_t: they are reloaded through
+ * load_concrete_ms_allocatable_trees() with SM_FREE.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+		verify_checkpoint_sm_entry_cb_arg_t vcsec;
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * Device removal is not allowed while a pool has
+			 * a checkpoint, so any removed vdev must already
+			 * have been removed when the checkpoint was taken.
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		/*
+		 * No checkpoint space map means nothing on this vdev is
+		 * checkpointed, hence nothing to verify.
+		 */
+		if (current_vd->vdev_top_zap == 0)
+			continue;
+		if (zap_contains(spa_meta_objset(current),
+		    current_vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(current),
+		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
+		    current_vd->vdev_ashift));
+
+		/* Entries come from current, metaslabs from the checkpoint. */
+		vcsec.vcsec_vd = ckpoint_vd;
+		vcsec.vcsec_entryid = 0;
+		vcsec.vcsec_num_entries =
+		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
+
+		VERIFY0(space_map_iterate(checkpoint_sm,
+		    space_map_length(checkpoint_sm),
+		    verify_checkpoint_sm_entry_cb, &vcsec));
+
+		if (dump_opt['m'] > 3)
+			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+
+	/*
+	 * Vdevs added after the checkpoint was taken must not have a
+	 * checkpoint space map.  The loop is empty when no vdev was added.
+	 */
+	for (uint64_t c = ckpoint_rvd->vdev_children;
+	    c < current_rvd->vdev_children; c++) {
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+		ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verify that all space allocated in the checkpoint is still allocated
+ * in the current state of the pool: nothing in the checkpoint's
+ * ms_allocatable (loaded here with the *allocated* segments) may be
+ * present in current's ms_allocatable (loaded with the free segments).
+ *
+ * Note that the function changes the state of the ms_allocatable trees
+ * of both spas: they are reloaded through
+ * load_concrete_ms_allocatable_trees(), with SM_ALLOC for the checkpoint
+ * and SM_FREE for the current state.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+	load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+		vdev_t *current_vd = current_rvd->vdev_child[i];
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * See comment in verify_checkpoint_vdev_spacemaps()
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+			metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+			(void) fprintf(stderr,
+			    "\rverifying vdev %llu of %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)current_vd->vdev_id,
+			    (longlong_t)current_rvd->vdev_children,
+			    (longlong_t)current_msp->ms_id,
+			    (longlong_t)current_vd->vdev_ms_count);
+
+			/*
+			 * Every range in the checkpoint's tree is allocated
+			 * in the checkpoint; every range in the current
+			 * tree is free right now.  Walking the former and
+			 * verifying each range is absent from the latter
+			 * proves that no block belonging to the checkpoint
+			 * was freed by mistake.
+			 */
+			range_tree_walk(ckpoint_msp->ms_allocatable,
+			    (range_tree_func_t *)range_tree_verify_not_present,
+			    current_msp->ms_allocatable);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Import the checkpointed state of the pool under a different name and
+ * cross-check its space maps against those of the current state.
+ * Must not be called when -L was given.
+ */
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+	ASSERT(!dump_opt['L']);
+
+	spa_t *ckp_spa;
+	char *ckp_name;
+	int err;
+
+	/*
+	 * The checkpointed state is imported as a separate pool so that
+	 * it can be opened side by side with the current state.
+	 */
+	ckp_name = import_checkpointed_state(spa->spa_name, NULL, NULL);
+	ASSERT(strcmp(spa->spa_name, ckp_name) != 0);
+
+	err = spa_open(ckp_name, &ckp_spa, FTAG);
+	if (err != 0) {
+		fatal("Tried to open pool \"%s\" but spa_open() failed with "
+		    "error %d\n", ckp_name, err);
+	}
+
+	/*
+	 * Step 1: every range recorded in a vdev's checkpoint space map
+	 * must be allocated according to the metaslab space maps of the
+	 * checkpointed state.
+	 */
+	verify_checkpoint_vdev_spacemaps(ckp_spa, spa);
+
+	/*
+	 * Step 2: every range allocated in the checkpoint's metaslab
+	 * space maps must still be allocated in the current state.
+	 */
+	verify_checkpoint_ms_spacemaps(ckp_spa, spa);
+
+	/* Drop the checkpointed pool and the name allocated for it. */
+	spa_close(ckp_spa, FTAG);
+	free(ckp_name);
+}
+
+/*
+ * For every top-level vdev that still has a checkpoint space map entry
+ * in its top ZAP, open that space map and dump it.
+ */
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+	vdev_t *root = spa->spa_root_vdev;
+	uint64_t c;
+
+	for (c = 0; c < root->vdev_children; c++) {
+		vdev_t *top = root->vdev_child[c];
+		space_map_t *sm = NULL;
+		uint64_t sm_obj;
+
+		/* nothing to dump without a top ZAP or a checkpoint entry */
+		if (top->vdev_top_zap == 0 ||
+		    zap_contains(spa_meta_objset(spa), top->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(spa), top->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &sm_obj));
+
+		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
+		    sm_obj, 0, top->vdev_asize, top->vdev_ashift));
+		dump_spacemap(spa->spa_meta_objset, sm);
+		space_map_close(sm);
+	}
+}
+
+/*
+ * Print and sanity-check the checkpointed uberblock, if the pool has
+ * an active checkpoint, and (unless -L was given) verify the
+ * checkpointed blocks.
+ *
+ * Returns 0 on success or when there is nothing to verify, the
+ * zap_lookup() error if the uberblock lookup fails, or 3 if the
+ * checkpointed uberblock has no ub_checkpoint_txg.
+ */
+static int
+verify_checkpoint(spa_t *spa)
+{
+	uberblock_t ckp_ub;
+	int err;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+		return (0);
+
+	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+	    sizeof (uberblock_t) / sizeof (uint64_t), &ckp_ub);
+
+	if (err == ENOENT && !dump_opt['L']) {
+		/*
+		 * Feature active but no uberblock: the checkpoint must
+		 * be in the middle of being discarded.
+		 */
+		(void) printf("\nPartially discarded checkpoint "
+		    "state found:\n");
+		if (dump_opt['m'] > 3)
+			dump_leftover_checkpoint_blocks(spa);
+		return (0);
+	}
+
+	if (err != 0) {
+		(void) printf("lookup error %d when looking for "
+		    "checkpointed uberblock in MOS\n", err);
+		return (err);
+	}
+
+	dump_uberblock(&ckp_ub, "\nCheckpointed uberblock found:\n", "\n");
+
+	if (ckp_ub.ub_checkpoint_txg == 0) {
+		(void) printf("\nub_checkpoint_txg not set in checkpointed "
+		    "uberblock\n");
+		return (3);
+	}
+
+	if (!dump_opt['L'])
+		verify_checkpoint_blocks(spa);
+
+	return (0);
+}
+
+/*
+ * range_tree_walk() callback used by dump_mos_leaks(): report every
+ * object number in the segment [start, start + size) as referenced
+ * but not allocated.  `arg' is unused.
+ */
+/* ARGSUSED */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
+	/*
+	 * `size' is the length of the segment (entries are added with
+	 * range_tree_add(tree, obj, 1)), not its end, so the upper bound
+	 * is start + size.
+	 */
+	for (uint64_t obj = start; obj < start + size; obj++) {
+		(void) printf("MOS object %llu referenced but not allocated\n",
+		    (u_longlong_t)obj);
+	}
+}
+
+/*
+ * Record MOS object `obj' as referenced.  Object 0 is ignored, as is
+ * every call made while the mos_refd_objs tree does not exist.
+ */
+static void
+mos_obj_refd(uint64_t obj)
+{
+	if (obj == 0 || mos_refd_objs == NULL)
+		return;
+
+	range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Like mos_obj_refd(), but safe to call on a MOS object that may
+ * already have been recorded: the object is added only if it is not
+ * yet in the tree.
+ */
+static void
+mos_obj_refd_multiple(uint64_t obj)
+{
+	if (obj == 0 || mos_refd_objs == NULL)
+		return;
+
+	if (range_tree_contains(mos_refd_objs, obj, 1))
+		return;
+
+	range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Record the object stored under VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS in
+ * the vdev's top ZAP as referenced.  A missing entry is not an error.
+ */
+static void
+mos_leak_vdev_top_zap(vdev_t *vd)
+{
+	objset_t *mos = spa_meta_objset(vd->vdev_spa);
+	uint64_t flush_obj;
+	int err;
+
+	err = zap_lookup(mos, vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+	    sizeof (flush_obj), 1, &flush_obj);
+	if (err != ENOENT) {
+		ASSERT0(err);
+		mos_obj_refd(flush_obj);
+	}
+}
+
+/*
+ * Record every MOS object referenced by `vd' and, recursively, by all
+ * of its children.
+ */
+static void
+mos_leak_vdev(vdev_t *vd)
+{
+	uint64_t idx;
+
+	/* objects referenced directly from the vdev */
+	mos_obj_refd(vd->vdev_dtl_object);
+	mos_obj_refd(vd->vdev_ms_array);
+	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
+	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
+	mos_obj_refd(vd->vdev_leaf_zap);
+
+	/* objects reachable through optional in-core structures */
+	if (vd->vdev_checkpoint_sm != NULL)
+		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
+	if (vd->vdev_indirect_mapping != NULL) {
+		mos_obj_refd(vd->vdev_indirect_mapping->
+		    vim_phys->vimp_counts_object);
+	}
+	if (vd->vdev_obsolete_sm != NULL)
+		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
+
+	/* one space map object per metaslab */
+	for (idx = 0; idx < vd->vdev_ms_count; idx++)
+		mos_obj_refd(space_map_object(vd->vdev_ms[idx]->ms_sm));
+
+	if (vd->vdev_top_zap != 0) {
+		mos_obj_refd(vd->vdev_top_zap);
+		mos_leak_vdev_top_zap(vd);
+	}
+
+	for (idx = 0; idx < vd->vdev_children; idx++)
+		mos_leak_vdev(vd->vdev_child[idx]);
+}
+
+/*
+ * Record the log space map ZAP and the object of every log space map
+ * in spa_sm_logs_by_txg as referenced.  Does nothing when the pool
+ * directory has no DMU_POOL_LOG_SPACEMAP_ZAP entry.
+ */
+static void
+mos_leak_log_spacemaps(spa_t *spa)
+{
+	uint64_t log_zap;
+	spa_log_sm_t *sls;
+	int err;
+
+	err = zap_lookup(spa_meta_objset(spa),
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
+	    sizeof (log_zap), 1, &log_zap);
+	if (err == ENOENT)
+		return;
+	ASSERT0(err);
+
+	mos_obj_refd(log_zap);
+
+	sls = avl_first(&spa->spa_sm_logs_by_txg);
+	while (sls != NULL) {
+		mos_obj_refd(sls->sls_sm_obj);
+		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
+	}
+}
+
+/*
+ * Cross-check the set of MOS objects that zdb found references to
+ * (accumulated in the global mos_refd_objs range tree) against the set
+ * of objects actually allocated in the MOS.
+ *
+ * Prints one line for every allocated-but-unreferenced ("leaked")
+ * object and one line for every referenced-but-unallocated object.
+ * The mos_refd_objs tree is emptied and destroyed before returning.
+ *
+ * Returns 0 if both sets match, 2 otherwise.
+ */
+static int
+dump_mos_leaks(spa_t *spa)
+{
+	int rv = 0;
+	objset_t *mos = spa->spa_meta_objset;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+
+	/* Visit and mark all referenced objects in the MOS */
+
+	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
+	mos_obj_refd(spa->spa_pool_props_object);
+	mos_obj_refd(spa->spa_config_object);
+	mos_obj_refd(spa->spa_ddt_stat_object);
+	mos_obj_refd(spa->spa_feat_desc_obj);
+	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
+	mos_obj_refd(spa->spa_feat_for_read_obj);
+	mos_obj_refd(spa->spa_feat_for_write_obj);
+	mos_obj_refd(spa->spa_history);
+	mos_obj_refd(spa->spa_errlog_last);
+	mos_obj_refd(spa->spa_errlog_scrub);
+	mos_obj_refd(spa->spa_all_vdev_zaps);
+	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
+	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
+	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
+	bpobj_count_refd(&spa->spa_deferred_bpobj);
+	mos_obj_refd(dp->dp_empty_bpobj);
+	bpobj_count_refd(&dp->dp_obsolete_bpobj);
+	bpobj_count_refd(&dp->dp_free_bpobj);
+	mos_obj_refd(spa->spa_l2cache.sav_object);
+	mos_obj_refd(spa->spa_spares.sav_object);
+
+	/* log space maps: the one currently syncing plus all logged ones */
+	if (spa->spa_syncing_log_sm != NULL)
+		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
+	mos_leak_log_spacemaps(spa);
+
+	/* objects owned by an in-progress indirect mapping condense */
+	mos_obj_refd(spa->spa_condensing_indirect_phys.
+	    scip_next_mapping_object);
+	mos_obj_refd(spa->spa_condensing_indirect_phys.
+	    scip_prev_obsolete_sm_object);
+	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
+		/*
+		 * The next mapping must be opened to learn the object
+		 * number of its counts object.
+		 */
+		vdev_indirect_mapping_t *vim =
+		    vdev_indirect_mapping_open(mos,
+		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
+		mos_obj_refd(vim->vim_phys->vimp_counts_object);
+		vdev_indirect_mapping_close(vim);
+	}
+	deleted_livelists_dump_mos(spa);
+
+	if (dp->dp_origin_snap != NULL) {
+		dsl_dataset_t *ds;
+
+		/* the dataset following the origin snapshot ... */
+		dsl_pool_config_enter(dp, FTAG);
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
+		    FTAG, &ds));
+		count_ds_mos_objects(ds);
+		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
+		dsl_dataset_rele(ds, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
+
+		/* ... and the origin snapshot itself */
+		count_ds_mos_objects(dp->dp_origin_snap);
+		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
+	}
+	count_dir_mos_objects(dp->dp_mos_dir);
+	if (dp->dp_free_dir != NULL)
+		count_dir_mos_objects(dp->dp_free_dir);
+	if (dp->dp_leak_dir != NULL)
+		count_dir_mos_objects(dp->dp_leak_dir);
+
+	/* the whole vdev tree, starting at the root */
+	mos_leak_vdev(spa->spa_root_vdev);
+
+	/* every DDT object, for each (class, type, checksum) combination */
+	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
+		for (uint64_t type = 0; type < DDT_TYPES; type++) {
+			for (uint64_t cksum = 0;
+			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
+				ddt_t *ddt = spa->spa_ddt[cksum];
+				mos_obj_refd(ddt->ddt_object[type][class]);
+			}
+		}
+	}
+
+	/*
+	 * Visit all allocated objects and make sure they are referenced.
+	 * Each allocated object found in the tree is removed from it, so
+	 * whatever remains afterwards was referenced but is not allocated.
+	 */
+	uint64_t object = 0;
+	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
+		if (range_tree_contains(mos_refd_objs, object, 1)) {
+			range_tree_remove(mos_refd_objs, object, 1);
+		} else {
+			dmu_object_info_t doi;
+			const char *name;
+			dmu_object_info(mos, object, &doi);
+			/* pick the type name from the matching table */
+			if (doi.doi_type & DMU_OT_NEWTYPE) {
+				dmu_object_byteswap_t bswap =
+				    DMU_OT_BYTESWAP(doi.doi_type);
+				name = dmu_ot_byteswap[bswap].ob_name;
+			} else {
+				name = dmu_ot[doi.doi_type].ot_name;
+			}
+
+			(void) printf("MOS object %llu (%s) leaked\n",
+			    (u_longlong_t)object, name);
+			rv = 2;
+		}
+	}
+	/* report the leftovers: referenced but not allocated */
+	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
+	if (!range_tree_is_empty(mos_refd_objs))
+		rv = 2;
+	range_tree_vacate(mos_refd_objs, NULL, NULL);
+	range_tree_destroy(mos_refd_objs);
+	return (rv);
+}
+
+/*
+ * Accumulator passed to log_spacemap_obsolete_stats_cb() while
+ * iterating over the log space maps.
+ */
+typedef struct log_sm_obsolete_stats_arg {
+	/* txg of the log being processed; 0 before the first entry */
+	uint64_t lsos_current_txg;
+
+	/* counters over all logs */
+	uint64_t lsos_total_entries;
+	uint64_t lsos_valid_entries;
+
+	/* counters for the current log only; reset when the txg changes */
+	uint64_t lsos_sm_entries;
+	uint64_t lsos_valid_sm_entries;
+} log_sm_obsolete_stats_arg_t;
+
+/*
+ * Per-entry callback for iterate_through_spacemap_logs().  Counts the
+ * entry in the per-log and total counters of `arg' (a
+ * log_sm_obsolete_stats_arg_t), and counts it as valid when its vdev
+ * is concrete and `txg' is not older than the unflushed txg of the
+ * metaslab it falls in.  When the log (txg) changes, the statistics of
+ * the previous log are printed and the per-log counters reset.
+ *
+ * Always returns 0.
+ */
+static int
+log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	log_sm_obsolete_stats_arg_t *stats = arg;
+	uint64_t sm_offset = sme->sme_offset;
+	uint64_t top_id = sme->sme_vdev;
+	vdev_t *vd;
+	metaslab_t *ms;
+
+	if (stats->lsos_current_txg == 0) {
+		/* very first entry of the very first log */
+		stats->lsos_current_txg = txg;
+	} else if (stats->lsos_current_txg < txg) {
+		/* moved on to the next log: flush the per-log numbers */
+		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+		    (u_longlong_t)stats->lsos_valid_sm_entries,
+		    (u_longlong_t)stats->lsos_sm_entries,
+		    (u_longlong_t)stats->lsos_current_txg);
+		stats->lsos_valid_sm_entries = 0;
+		stats->lsos_sm_entries = 0;
+		stats->lsos_current_txg = txg;
+	}
+	ASSERT3U(stats->lsos_current_txg, ==, txg);
+
+	stats->lsos_total_entries++;
+	stats->lsos_sm_entries++;
+
+	vd = vdev_lookup_top(spa, top_id);
+	if (!vdev_is_concrete(vd))
+		return (0);
+
+	ms = vd->vdev_ms[sm_offset >> vd->vdev_ms_shift];
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+	if (txg >= metaslab_unflushed_txg(ms)) {
+		stats->lsos_valid_sm_entries++;
+		stats->lsos_valid_entries++;
+	}
+	return (0);
+}
+
+/*
+ * Print, per log space map and in total, how many entries are still
+ * valid.  Does nothing unless the log spacemap feature is active.
+ */
+static void
+dump_log_spacemap_obsolete_stats(spa_t *spa)
+{
+	log_sm_obsolete_stats_arg_t stats = { 0 };
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
+
+	iterate_through_spacemap_logs(spa,
+	    log_spacemap_obsolete_stats_cb, &stats);
+
+	/*
+	 * The callback only prints a log's numbers when the next log
+	 * starts, so the last log has to be printed here.
+	 */
+	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+	    (u_longlong_t)stats.lsos_valid_sm_entries,
+	    (u_longlong_t)stats.lsos_sm_entries,
+	    (u_longlong_t)stats.lsos_current_txg);
+
+	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
+	    (u_longlong_t)stats.lsos_valid_entries,
+	    (u_longlong_t)stats.lsos_total_entries);
+}
+
+/*
+ * Top-level driver for dumping a pool: runs every dump and verification
+ * step selected through dump_opt[] in a fixed order.
+ *
+ * `rc' carries the first failure; once it is non-zero the remaining
+ * verification steps are skipped, and at the end the debug buffer is
+ * dumped and the process exits with `rc'.
+ */
+static void
+dump_zpool(spa_t *spa)
+{
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	int rc = 0;
+
+	if (dump_opt['y']) {
+		livelist_metaslab_validate(spa);
+	}
+
+	/* -S is exclusive: simulate dedup and do nothing else */
+	if (dump_opt['S']) {
+		dump_simulated_ddt(spa);
+		return;
+	}
+
+	if (!dump_opt['e'] && dump_opt['C'] > 1) {
+		(void) printf("\nCached configuration:\n");
+		dump_nvlist(spa->spa_config, 8);
+	}
+
+	if (dump_opt['C'])
+		dump_config(spa);
+
+	if (dump_opt['u'])
+		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
+
+	if (dump_opt['D'])
+		dump_all_ddts(spa);
+
+	if (dump_opt['d'] > 2 || dump_opt['m'])
+		dump_metaslabs(spa);
+	if (dump_opt['M'])
+		dump_metaslab_groups(spa);
+	if (dump_opt['d'] > 2 || dump_opt['m']) {
+		dump_log_spacemaps(spa);
+		dump_log_spacemap_obsolete_stats(spa);
+	}
+
+	if (dump_opt['d'] || dump_opt['i']) {
+		spa_feature_t f;
+		/*
+		 * Tree of referenced MOS objects; filled in while dumping
+		 * and consumed (and destroyed) by dump_mos_leaks().
+		 */
+		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+		    0);
+		dump_objset(dp->dp_meta_objset);
+
+		if (dump_opt['d'] >= 3) {
+			/* NOTE: shadows the function-level `dp' */
+			dsl_pool_t *dp = spa->spa_dsl_pool;
+			dump_full_bpobj(&spa->spa_deferred_bpobj,
+			    "Deferred frees", 0);
+			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+				dump_full_bpobj(&dp->dp_free_bpobj,
+				    "Pool snapshot frees", 0);
+			}
+			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+				ASSERT(spa_feature_is_enabled(spa,
+				    SPA_FEATURE_DEVICE_REMOVAL));
+				dump_full_bpobj(&dp->dp_obsolete_bpobj,
+				    "Pool obsolete blocks", 0);
+			}
+
+			if (spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY)) {
+				dump_bptree(spa->spa_meta_objset,
+				    dp->dp_bptree_obj,
+				    "Pool dataset frees");
+			}
+			dump_dtl(spa->spa_root_vdev, 0);
+		}
+
+		/*
+		 * UINT64_MAX marks a global feature count as "not
+		 * tracked"; such features are skipped in the refcount
+		 * check below.  Only the three features reset to 0 here
+		 * are counted.  (This loop's `f' shadows the outer one.)
+		 */
+		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
+			global_feature_count[f] = UINT64_MAX;
+		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
+		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
+		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
+
+		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
+		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		if (rc == 0 && !dump_opt['L'])
+			rc = dump_mos_leaks(spa);
+
+		/*
+		 * Compare the number of consumers zdb counted for each
+		 * enabled feature with the refcount stored in the pool.
+		 */
+		for (f = 0; f < SPA_FEATURES; f++) {
+			uint64_t refcount;
+
+			/* the counter array that applies to this feature */
+			uint64_t *arr;
+			if (!(spa_feature_table[f].fi_flags &
+			    ZFEATURE_FLAG_PER_DATASET)) {
+				if (global_feature_count[f] == UINT64_MAX)
+					continue;
+				if (!spa_feature_is_enabled(spa, f)) {
+					ASSERT0(global_feature_count[f]);
+					continue;
+				}
+				arr = global_feature_count;
+			} else {
+				if (!spa_feature_is_enabled(spa, f)) {
+					ASSERT0(dataset_feature_count[f]);
+					continue;
+				}
+				arr = dataset_feature_count;
+			}
+			if (feature_get_refcount(spa, &spa_feature_table[f],
+			    &refcount) == ENOTSUP)
+				continue;
+			if (arr[f] != refcount) {
+				(void) printf("%s feature refcount mismatch: "
+				    "%lld consumers != %lld refcount\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)arr[f], (longlong_t)refcount);
+				rc = 2;
+			} else {
+				(void) printf("Verified %s feature refcount "
+				    "of %llu is correct\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)refcount);
+			}
+		}
+
+		if (rc == 0)
+			rc = verify_device_removal_feature_counts(spa);
+	}
+
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
+		rc = dump_block_stats(spa);
+
+	if (rc == 0)
+		rc = verify_spacemap_refcounts(spa);
+
+	if (dump_opt['s'])
+		show_pool_stats(spa);
+
+	if (dump_opt['h'])
+		dump_history(spa);
+
+	if (rc == 0)
+		rc = verify_checkpoint(spa);
+
+	/* any failure terminates zdb with the failing step's code */
+	if (rc != 0) {
+		dump_debug_buffer();
+		exit(rc);
+	}
+}
+
+/*
+ * Option bits for zdb_read_block() and its helpers, selected through
+ * the flag characters of the block descriptor.
+ */
+/* calculate and display each checksum of the block */
+#define	ZDB_FLAG_CHECKSUM	0x0001
+/* decompress the data before dumping it */
+#define	ZDB_FLAG_DECOMPRESS	0x0002
+/* byteswap the data before dumping it */
+#define	ZDB_FLAG_BSWAP		0x0004
+/* treat the block as a gang block header */
+#define	ZDB_FLAG_GBH		0x0008
+/* display the block as an array of block pointers */
+#define	ZDB_FLAG_INDIRECT	0x0010
+/* write the raw data to stdout */
+#define	ZDB_FLAG_RAW		0x0020
+/* decode a single block pointer at an offset within the block */
+#define	ZDB_FLAG_PRINT_BLKPTR	0x0040
+/* print progress information while trying to decompress */
+#define	ZDB_FLAG_VERBOSE	0x0080
+
+/*
+ * flagbits[] is indexed by a flag character and yields the matching
+ * ZDB_FLAG_* bit, or 0 if the character is not a flag.  flagbitstr is
+ * searched with strchr() to recognise flag characters; presumably it
+ * holds the set of valid flag characters.  Neither is initialised in
+ * this part of the file.
+ */
+static int flagbits[256];
+static char flagbitstr[16];
+
+/*
+ * Print the textual form of block pointer `bp' on its own line.  With
+ * ZDB_FLAG_BSWAP the block pointer is byteswapped in place first.
+ */
+static void
+zdb_print_blkptr(const blkptr_t *bp, int flags)
+{
+	char text[BP_SPRINTF_LEN];
+
+	if ((flags & ZDB_FLAG_BSWAP) != 0) {
+		/* note: modifies the caller's copy despite the const */
+		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+	}
+
+	snprintf_blkptr(text, sizeof (text), bp);
+	(void) printf("%s\n", text);
+}
+
+/*
+ * Print `nbps' consecutive block pointers starting at `bp'.
+ */
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	blkptr_t *cur = bp;
+	int left = nbps;
+
+	while (left > 0) {
+		zdb_print_blkptr(cur, flags);
+		cur++;
+		left--;
+	}
+}
+
+/*
+ * Print `buf' as a gang block header, i.e. as SPA_GBH_NBLKPTRS block
+ * pointers.
+ */
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	blkptr_t *gbh_bps = (blkptr_t *)buf;
+
+	zdb_dump_indirect(gbh_bps, SPA_GBH_NBLKPTRS, flags);
+}
+
+/*
+ * Write `size' bytes of `buf' to stdout unformatted, byteswapping the
+ * buffer in place first if ZDB_FLAG_BSWAP is set.  A short or failed
+ * write trips VERIFY().
+ */
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	int out_fd;
+
+	if ((flags & ZDB_FLAG_BSWAP) != 0)
+		byteswap_uint64_array(buf, size);
+
+	out_fd = fileno(stdout);
+	VERIFY(write(out_fd, buf, size) == size);
+}
+
+/*
+ * Hex-dump `size' bytes of `buf' to stdout, 16 bytes per line: offset,
+ * two 64-bit words and the printable-ASCII rendering of the bytes.
+ *
+ *	label - heading printed above the dump
+ *	buf   - data to dump; read as an array of uint64_t
+ *	size  - number of bytes; words are consumed in pairs
+ *	flags - ZDB_FLAG_BSWAP selects the byteswapped word layout
+ */
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	unsigned nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	unsigned i, j;
+	const char *hdr;
+	const unsigned char *c;
+
+
+	/* the column header shows the byte order of the words below */
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
+
+#ifdef _LITTLE_ENDIAN
+	/* correct the endianness */
+	do_bswap = !do_bswap;
+#endif
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx:  %016llx  %016llx  ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		/*
+		 * The bytes are examined as unsigned char: passing a
+		 * negative plain char (any byte >= 0x80 where char is
+		 * signed) to isprint() is undefined behavior.
+		 */
+		c = (const unsigned char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * Find the vdev named by `path' below `vdev'.
+ *
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*    - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy.  For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ *
+ * A leaf name matches a vdev's full vdev_path, the last component of
+ * that path, or the last component with a trailing "s0" removed.
+ *
+ * Returns the matching vdev, or NULL if `vdev' is NULL, a child index
+ * is out of range, or no vdev matches.
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, const char *path)
+{
+	char *s, *p, *q;
+	unsigned i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (s && *s == '\0')
+		return (vdev);
+	/* descend with whatever follows the '.' */
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+		size_t pathlen;
+
+		/* no path of its own: search below this child instead */
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		/* p: last component of the vdev's path */
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+
+		/*
+		 * q: the last two characters of the vdev's path.  A path
+		 * shorter than that cannot end in "s0", and computing q
+		 * for it would point before the start of the string.
+		 */
+		pathlen = strlen(vc->vdev_path);
+		if (pathlen < 2)
+			continue;
+		q = &vc->vdev_path[pathlen - 2];
+
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Store the name of the dataset with object number `objset_id' in
+ * `outstr'.
+ *
+ * Returns 0 on success.  If the dataset cannot be held, a message is
+ * printed to stderr and the error from dsl_dataset_hold_obj() is
+ * returned; `outstr' is not written in that case.
+ */
+static int
+name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
+{
+	dsl_pool_t *pool = spa->spa_dsl_pool;
+	dsl_dataset_t *ds;
+	int err;
+
+	dsl_pool_config_enter(pool, FTAG);
+
+	err = dsl_dataset_hold_obj(pool, objset_id, NULL, &ds);
+	if (err == 0) {
+		dsl_dataset_name(ds, outstr);
+		dsl_dataset_rele(ds, NULL);
+	} else {
+		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
+		    (u_longlong_t)objset_id, strerror(err));
+	}
+
+	dsl_pool_config_exit(pool, FTAG);
+	return (err);
+}
+
+/*
+ * Parse a "[lsize/]psize" specification (hexadecimal) into *lsize and
+ * *psize.  With a single number both sizes are set to it.  `sizes' is
+ * modified by strtok().
+ *
+ * Returns B_TRUE if the sizes are usable, i.e. psize is non-zero and
+ * not larger than lsize; B_FALSE otherwise or if `sizes' is NULL or
+ * contains no token.
+ */
+static boolean_t
+zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
+{
+	char *first, *second;
+
+	if (sizes == NULL)
+		return (B_FALSE);
+
+	first = strtok(sizes, "/");
+	if (first == NULL)
+		return (B_FALSE);
+	second = strtok(NULL, "/");
+
+	*lsize = strtoull(first, NULL, 16);
+	if (second != NULL)
+		*psize = strtoull(second, NULL, 16);
+	else
+		*psize = *lsize;
+
+	if (*psize == 0)
+		return (B_FALSE);
+	return (*lsize >= *psize ? B_TRUE : B_FALSE);
+}
+
+/* Bit for compression function `alg' in a mask of ZIO_COMPRESS_* values */
+#define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
+
+/*
+ * Try to decompress the psize bytes in `pabd' into `lbuf' by brute
+ * force, since the compression function (and possibly the logical
+ * size) is unknown.
+ *
+ *	pabd  - the compressed data
+ *	buf   - only assigned locally; the caller does not see a change
+ *	lbuf  - destination buffer for the decompressed data
+ *	lsize - logical size to try; if equal to psize, every size from
+ *	        psize + SPA_MINBLOCKSIZE up to SPA_MAXBLOCKSIZE is tried
+ *	psize - physical (compressed) size
+ *	flags - ZDB_FLAG_VERBOSE prints each attempt to stderr
+ *
+ * Returns B_TRUE if no (size, function) combination decompressed
+ * successfully, B_FALSE if one did.
+ */
+static boolean_t
+zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
+    uint64_t psize, int flags)
+{
+	boolean_t exceeded = B_FALSE;
+	/*
+	 * We don't know how the data was compressed, so just try
+	 * every decompress function at every inflated blocksize.
+	 */
+	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	/* zero-terminated list of compression functions to try, in order */
+	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
+	int *cfuncp = cfuncs;
+	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
+	/* functions that must not be appended by the loop below */
+	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
+	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
+	    (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
+	/* LZ4 and LZJB are tried first; masking them avoids duplicates */
+	*cfuncp++ = ZIO_COMPRESS_LZ4;
+	*cfuncp++ = ZIO_COMPRESS_LZJB;
+	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
+	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
+		if (((1ULL << c) & mask) == 0)
+			*cfuncp++ = c;
+
+	/*
+	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
+	 * could take a while and we should let the user know
+	 * we are not stuck.  On the other hand, printing progress
+	 * info gets old after a while.  User can specify 'v' flag
+	 * to see the progression.
+	 */
+	if (lsize == psize)
+		lsize += SPA_MINBLOCKSIZE;
+	else
+		maxlsize = lsize;
+	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
+		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
+			if (flags & ZDB_FLAG_VERBOSE) {
+				(void) fprintf(stderr,
+				    "Trying %05llx -> %05llx (%s)\n",
+				    (u_longlong_t)psize,
+				    (u_longlong_t)lsize,
+				    zio_compress_table[*cfuncp].\
+				    ci_name);
+			}
+
+			/*
+			 * We randomize lbuf2, and decompress to both
+			 * lbuf and lbuf2. This way, we will know if
+			 * decompression fill exactly to lsize.
+			 */
+			VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));
+
+			if (zio_decompress_data(*cfuncp, pabd,
+			    lbuf, psize, lsize, NULL) == 0 &&
+			    zio_decompress_data(*cfuncp, pabd,
+			    lbuf2, psize, lsize, NULL) == 0 &&
+			    bcmp(lbuf, lbuf2, lsize) == 0)
+				break;
+		}
+		/* a non-zero entry means the inner loop found a match */
+		if (*cfuncp != 0)
+			break;
+	}
+	umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+	/* every size was tried without success */
+	if (lsize > maxlsize) {
+		exceeded = B_TRUE;
+	}
+	/* NOTE(review): assigns the by-value parameter only */
+	buf = lbuf;
+	if (*cfuncp == ZIO_COMPRESS_ZLE) {
+		printf("\nZLE decompression was selected. If you "
+		    "suspect the results are wrong,\ntry avoiding ZLE "
+		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
+	}
+
+	return (exceeded);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:[lsize/]psize[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		 c: Calculate and display checksums
+ *		 d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		 g: Display data as a gang block header
+ *		 i: Display as an indirect block
+ *		 r: Dump raw data to stdout
+ *		 v: Verbose
+ *
+ * `thing' is parsed starting at the vdev specifier, i.e. it is
+ * expected without the leading "pool:" part; `spa' is the pool to
+ * read from.  `thing' itself is not modified.  Errors are reported on
+ * stdout and end the function without reading or dumping anything
+ * further.
+ */
+static void
+zdb_read_block(char *thing, spa_t *spa)
+{
+	blkptr_t blk, *bp = &blk;
+	dva_t *dva = bp->blk_dva;
+	int flags = 0;
+	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	abd_t *pabd;
+	void *lbuf, *buf;
+	char *s, *p, *dup, *vdev, *flagstr, *sizes;
+	int i, error;
+	boolean_t borrowed = B_FALSE, found = B_FALSE;
+
+	/* split a private copy into vdev:offset:sizes:flags */
+	dup = strdup(thing);
+	s = strtok(dup, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	sizes = strtok(NULL, ":");
+	s = strtok(NULL, ":");
+	flagstr = strdup(s ? s : "");
+
+	/* s now carries the (last) validation error, if any */
+	s = NULL;
+	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
+		s = "invalid size(s)";
+	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
+		goto done;
+	}
+
+	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+		for (i = 0; i < strlen(flagstr); i++) {
+			int bit = flagbits[(uchar_t)flagstr[i]];
+
+			if (bit == 0) {
+				(void) printf("***Ignoring flag: %c\n",
+				    (uchar_t)flagstr[i]);
+				continue;
+			}
+			found = B_TRUE;
+			flags |= bit;
+
+			p = &flagstr[i + 1];
+			if (*p != ':' && *p != '\0') {
+				int j = 0, nextbit = flagbits[(uchar_t)*p];
+				char *end, offstr[8] = { 0 };
+				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
+				    (nextbit == 0)) {
+					/* look ahead to isolate the offset */
+					while (nextbit == 0 &&
+					    strchr(flagbitstr, *p) == NULL) {
+						/*
+						 * Keep room for the
+						 * terminating NUL; a longer
+						 * offset would overrun
+						 * offstr.
+						 */
+						if (j >= (int)sizeof (offstr) -
+						    1) {
+							(void) printf("Block "
+							    "pointer offset "
+							    "too long: '%s'\n",
+							    flagstr);
+							goto done;
+						}
+						offstr[j] = *p;
+						j++;
+						if (i + j > strlen(flagstr))
+							break;
+						p++;
+						nextbit = flagbits[(uchar_t)*p];
+					}
+					blkptr_offset = strtoull(offstr, &end,
+					    16);
+					/* skip the consumed offset digits */
+					i += j;
+				} else if (nextbit == 0) {
+					(void) printf("***Ignoring flag arg:"
+					    " '%c'\n", (uchar_t)*p);
+				}
+			}
+		}
+	}
+	if (blkptr_offset % sizeof (blkptr_t)) {
+		printf("Block pointer offset 0x%llx "
+		    "must be divisible by 0x%x\n",
+		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
+		goto done;
+	}
+	if (found == B_FALSE && strlen(flagstr) > 0) {
+		printf("Invalid flag arg: '%s'\n", flagstr);
+		goto done;
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		/* leave through the common path so flagstr is freed too */
+		goto done;
+	} else {
+		if (vd->vdev_path)
+			(void) fprintf(stderr, "Found vdev: %s\n",
+			    vd->vdev_path);
+		else
+			(void) fprintf(stderr, "Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+	/*
+	 * Build a synthetic block pointer describing the requested
+	 * region: no compression, no checksum, a single DVA.
+	 */
+	BP_ZERO(bp);
+
+	DVA_SET_VDEV(&dva[0], vd->vdev_id);
+	DVA_SET_OFFSET(&dva[0], offset);
+	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+	BP_SET_LSIZE(bp, lsize);
+	BP_SET_PSIZE(bp, psize);
+	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_LEVEL(bp, 0);
+	BP_SET_DEDUP(bp, 0);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	zio = zio_root(spa, NULL, NULL, 0);
+
+	if (vd == vd->vdev_top) {
+		/*
+		 * Treat this as a normal block read.
+		 */
+		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+	} else {
+		/*
+		 * Treat this as a vdev child I/O.
+		 */
+		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+		    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+		    ZIO_FLAG_OPTIONAL, NULL, NULL));
+	}
+
+	error = zio_wait(zio);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	uint64_t orig_lsize = lsize;
+	buf = lbuf;
+	if (flags & ZDB_FLAG_DECOMPRESS) {
+		boolean_t failed = zdb_decompress_block(pabd, buf, lbuf,
+		    lsize, psize, flags);
+		if (failed) {
+			(void) printf("Decompress of %s failed\n", thing);
+			goto out;
+		}
+	} else {
+		/* must be handed back with abd_return_buf_copy() */
+		buf = abd_borrow_buf_copy(pabd, lsize);
+		borrowed = B_TRUE;
+	}
+	/*
+	 * Try to detect invalid block pointer.  If invalid, try
+	 * decompressing.
+	 */
+	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
+	    !(flags & ZDB_FLAG_DECOMPRESS)) {
+		const blkptr_t *b = (const blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+		if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
+		    B_FALSE) {
+			abd_return_buf_copy(pabd, buf, lsize);
+			borrowed = B_FALSE;
+			buf = lbuf;
+			boolean_t failed = zdb_decompress_block(pabd, buf,
+			    lbuf, lsize, psize, flags);
+			b = (const blkptr_t *)(void *)
+			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+			if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
+			    BLK_VERIFY_LOG) == B_FALSE) {
+				printf("invalid block pointer at this DVA\n");
+				goto out;
+			}
+		}
+	}
+
+	/* exactly one output format is used; the first match wins */
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+	else if (flags & ZDB_FLAG_RAW)
+		zdb_dump_block_raw(buf, lsize, flags);
+	else if (flags & ZDB_FLAG_INDIRECT)
+		zdb_dump_indirect((blkptr_t *)buf,
+		    orig_lsize / sizeof (blkptr_t), flags);
+	else if (flags & ZDB_FLAG_GBH)
+		zdb_dump_gbh(buf, flags);
+	else
+		zdb_dump_block(thing, buf, lsize, flags);
+
+	/*
+	 * If :c was specified, iterate through the checksum table to
+	 * calculate and display each checksum for our specified
+	 * DVA and length.
+	 */
+	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
+	    !(flags & ZDB_FLAG_GBH)) {
+		zio_t *czio;
+		(void) printf("\n");
+		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
+		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
+
+			/* embedded checksums and NOPARITY are skipped */
+			if ((zio_checksum_table[ck].ci_flags &
+			    ZCHECKSUM_FLAG_EMBEDDED) ||
+			    ck == ZIO_CHECKSUM_NOPARITY) {
+				continue;
+			}
+			BP_SET_CHECKSUM(bp, ck);
+			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+			czio->io_bp = bp;
+
+			if (vd == vd->vdev_top) {
+				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
+				    NULL, NULL,
+				    ZIO_PRIORITY_SYNC_READ,
+				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+				    ZIO_FLAG_DONT_RETRY, NULL));
+			} else {
+				zio_nowait(zio_vdev_child_io(czio, bp, vd,
+				    offset, pabd, psize, ZIO_TYPE_READ,
+				    ZIO_PRIORITY_SYNC_READ,
+				    ZIO_FLAG_DONT_CACHE |
+				    ZIO_FLAG_DONT_PROPAGATE |
+				    ZIO_FLAG_DONT_RETRY |
+				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+				    ZIO_FLAG_SPECULATIVE |
+				    ZIO_FLAG_OPTIONAL, NULL, NULL));
+			}
+			error = zio_wait(czio);
+			/*
+			 * A checksum mismatch is expected here: the
+			 * checksum is computed and printed regardless.
+			 */
+			if (error == 0 || error == ECKSUM) {
+				zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
+				ck_zio->io_offset =
+				    DVA_GET_OFFSET(&bp->blk_dva[0]);
+				ck_zio->io_bp = bp;
+				zio_checksum_compute(ck_zio, ck, pabd, lsize);
+				printf("%12s\tcksum=%llx:%llx:%llx:%llx\n",
+				    zio_checksum_table[ck].ci_name,
+				    (u_longlong_t)bp->blk_cksum.zc_word[0],
+				    (u_longlong_t)bp->blk_cksum.zc_word[1],
+				    (u_longlong_t)bp->blk_cksum.zc_word[2],
+				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+				zio_wait(ck_zio);
+			} else {
+				printf("error %d reading block\n", error);
+			}
+			spa_config_exit(spa, SCL_STATE, FTAG);
+		}
+	}
+
+	if (borrowed)
+		abd_return_buf_copy(pabd, buf, lsize);
+
+out:
+	abd_free(pabd);
+	umem_free(lbuf, SPA_MAXBLOCKSIZE);
+done:
+	free(flagstr);
+	free(dup);
+}
+
+/*
+ * Decode an embedded block pointer given as 16 colon-separated
+ * hexadecimal 64-bit words in `thing' and write its payload to stdout
+ * as raw data.  Exits with status 1 on malformed input, on a logical
+ * size larger than SPA_MAXBLOCKSIZE, on allocation failure, or if
+ * decoding fails.
+ */
+static void
+zdb_embedded_block(char *thing)
+{
+	blkptr_t bp;
+	/* the block pointer is filled in word by word through this alias */
+	unsigned long long *words = (void *)&bp;
+	char *buf;
+	int err;
+
+	bzero(&bp, sizeof (bp));
+	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
+	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
+	    words + 0, words + 1, words + 2, words + 3,
+	    words + 4, words + 5, words + 6, words + 7,
+	    words + 8, words + 9, words + 10, words + 11,
+	    words + 12, words + 13, words + 14, words + 15);
+	if (err != 16) {
+		(void) fprintf(stderr, "invalid input format\n");
+		exit(1);
+	}
+	/*
+	 * The logical size comes straight from user input and bounds how
+	 * much is decoded into the SPA_MAXBLOCKSIZE buffer below, so it
+	 * is checked at run time rather than with an ASSERT that is
+	 * compiled out of non-debug builds.
+	 */
+	if (BPE_GET_LSIZE(&bp) > SPA_MAXBLOCKSIZE) {
+		(void) fprintf(stderr, "invalid input: embedded block "
+		    "logical size too large\n");
+		exit(1);
+	}
+	buf = malloc(SPA_MAXBLOCKSIZE);
+	if (buf == NULL) {
+		(void) fprintf(stderr, "out of memory\n");
+		exit(1);
+	}
+	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
+	if (err != 0) {
+		(void) fprintf(stderr, "decode failed: %d\n", err);
+		exit(1);
+	}
+	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+	free(buf);
+}
+
+/*
+ * zdb entry point.
+ *
+ * Parses the command-line options into dump_opt[] and a few tunables, calls
+ * kernel_init(SPA_MODE_READ), handles the modes that return before the
+ * pool-open logic (-E, -C with no target, -l, -O), and otherwise opens the
+ * target pool or dataset and dispatches to dump_objset(), dump_zpool() or,
+ * with -R, zdb_read_block() for each remaining argument.
+ *
+ * Returns the value of "error" at the end of the run.
+ */
+int
+main(int argc, char **argv)
+{
+	int c;
+	struct rlimit rl = { 1024, 1024 };
+	spa_t *spa = NULL;
+	objset_t *os = NULL;
+	int dump_all = 1;
+	int verbose = 0;
+	int error = 0;
+	char **searchdirs = NULL;
+	int nsearch = 0;
+	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *policy = NULL;
+	uint64_t max_txg = UINT64_MAX;
+	int64_t objset_id = -1;
+	int flags = ZFS_IMPORT_MISSING_LOG;
+	int rewind = ZPOOL_NEVER_REWIND;
+	char *spa_config_path_env, *objset_str;
+	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
+	nvlist_t *cfg = NULL;
+
+	/* set both open-file limits to 1024; a failure here is ignored */
+	(void) setrlimit(RLIMIT_NOFILE, &rl);
+	(void) enable_extended_FILE_stdio(-1, -1);
+
+	dprintf_setup(&argc, argv);
+
+	/*
+	 * If there is an environment variable SPA_CONFIG_PATH it overrides
+	 * default spa_config_path setting. If -U flag is specified it will
+	 * override this environment variable settings once again.
+	 */
+	spa_config_path_env = getenv("SPA_CONFIG_PATH");
+	if (spa_config_path_env != NULL)
+		spa_config_path = spa_config_path_env;
+
+	/*
+	 * For performance reasons, we set this tunable down. We do so before
+	 * the arg parsing section so that the user can override this value if
+	 * they choose.
+	 */
+	zfs_btree_verify_intensity = 3;
+
+	while ((c = getopt(argc, argv,
+	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) {
+		switch (c) {
+		/* options that select what to dump; any one clears dump_all */
+		case 'b':
+		case 'c':
+		case 'C':
+		case 'd':
+		case 'D':
+		case 'E':
+		case 'G':
+		case 'h':
+		case 'i':
+		case 'l':
+		case 'm':
+		case 'M':
+		case 'O':
+		case 'R':
+		case 's':
+		case 'S':
+		case 'u':
+		case 'y':
+		case 'Z':
+			dump_opt[c]++;
+			dump_all = 0;
+			break;
+		/* modifier options: counted, but dump_all is left alone */
+		case 'A':
+		case 'e':
+		case 'F':
+		case 'k':
+		case 'L':
+		case 'P':
+		case 'q':
+		case 'X':
+			dump_opt[c]++;
+			break;
+		case 'Y':
+			zfs_reconstruct_indirect_combinations_max = INT_MAX;
+			zfs_deadman_enabled = 0;
+			break;
+		/* NB: Sort single match options below. */
+		case 'I':
+			max_inflight_bytes = strtoull(optarg, NULL, 0);
+			if (max_inflight_bytes == 0) {
+				(void) fprintf(stderr, "maximum number "
+				    "of inflight bytes must be greater "
+				    "than 0\n");
+				usage();
+			}
+			break;
+		case 'o':
+			error = set_global_var(optarg);
+			if (error != 0)
+				usage();
+			break;
+		case 'p':
+			/* grow the search-directory array by one per -p */
+			if (searchdirs == NULL) {
+				searchdirs = umem_alloc(sizeof (char *),
+				    UMEM_NOFAIL);
+			} else {
+				char **tmp = umem_alloc((nsearch + 1) *
+				    sizeof (char *), UMEM_NOFAIL);
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				umem_free(searchdirs,
+				    nsearch * sizeof (char *));
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = optarg;
+			break;
+		case 't':
+			max_txg = strtoull(optarg, NULL, 0);
+			if (max_txg < TXG_INITIAL) {
+				(void) fprintf(stderr, "incorrect txg "
+				    "specified: %s\n", optarg);
+				usage();
+			}
+			break;
+		case 'U':
+			spa_config_path = optarg;
+			if (spa_config_path[0] != '/') {
+				(void) fprintf(stderr,
+				    "cachefile must be an absolute path "
+				    "(i.e. start with a slash)\n");
+				usage();
+			}
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'V':
+			flags = ZFS_IMPORT_VERBATIM;
+			break;
+		case 'x':
+			vn_dumpdir = optarg;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	if (!dump_opt['e'] && searchdirs != NULL) {
+		(void) fprintf(stderr, "-p option requires use of -e\n");
+		usage();
+	}
+	if (dump_opt['d']) {
+		/*
+		 * <pool>[/<dataset | objset id>] is accepted.
+		 *
+		 * NOTE(review): this looks at argv[2] before argc/argv are
+		 * advanced by optind further down, so it only sees the
+		 * target when the target is the third word of the command
+		 * line -- confirm this is intended.
+		 */
+		if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL &&
+		    objset_str++ != NULL) {
+			char *endptr;
+			errno = 0;
+			objset_id = strtoull(objset_str, &endptr, 0);
+			/* dataset 0 is the same as opening the pool */
+			if (errno == 0 && endptr != objset_str &&
+			    objset_id != 0) {
+				target_is_spa = B_FALSE;
+				dataset_lookup = B_TRUE;
+			} else if (objset_id != 0) {
+				printf("failed to open objset %s "
+				    "%llu %s", objset_str,
+				    (u_longlong_t)objset_id,
+				    strerror(errno));
+				exit(1);
+			}
+			/* normal dataset name not an objset ID */
+			if (endptr == objset_str) {
+				objset_id = -1;
+			}
+		}
+	}
+
+#if defined(_LP64)
+	/*
+	 * ZDB does not typically re-read blocks; therefore limit the ARC
+	 * to 256 MB, which can be used entirely for metadata.
+	 */
+	zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT;
+	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+#endif
+
+	/*
+	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+	 * "zdb -b" uses traversal prefetch which uses async reads.
+	 * For good performance, let several of them be active at once.
+	 */
+	zfs_vdev_async_read_max_active = 10;
+
+	/*
+	 * Disable reference tracking for better performance.
+	 */
+	reference_tracking_enable = B_FALSE;
+
+	/*
+	 * Do not fail spa_load when spa_load_verify fails. This is needed
+	 * to load non-idle pools.
+	 */
+	spa_load_verify_dryrun = B_TRUE;
+
+	kernel_init(SPA_MODE_READ);
+
+	/* with no explicit selection, run with a verbosity of at least 1 */
+	if (dump_all)
+		verbose = MAX(verbose, 1);
+
+	/*
+	 * When dumping everything, switch on every option except the ones
+	 * listed in the string below; then add the -v count to each option
+	 * that is enabled.
+	 */
+	for (c = 0; c < 256; c++) {
+		if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
+			dump_opt[c] = 1;
+		if (dump_opt[c])
+			dump_opt[c] += verbose;
+	}
+
+	/* both flags derive from the verbosity-adjusted -A count */
+	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+	zfs_recover = (dump_opt['A'] > 1);
+
+	/* from here on argv[0] is the first non-option argument */
+	argc -= optind;
+	argv += optind;
+	if (argc < 2 && dump_opt['R'])
+		usage();
+
+	/* -E takes exactly one argument and needs no pool */
+	if (dump_opt['E']) {
+		if (argc != 1)
+			usage();
+		zdb_embedded_block(argv[0]);
+		return (0);
+	}
+
+	if (argc < 1) {
+		if (!dump_opt['e'] && dump_opt['C']) {
+			dump_cachefile(spa_config_path);
+			return (0);
+		}
+		usage();
+	}
+
+	if (dump_opt['l'])
+		return (dump_label(argv[0]));
+
+	if (dump_opt['O']) {
+		if (argc != 2)
+			usage();
+		dump_opt['v'] = verbose + 3;
+		return (dump_path(argv[0], argv[1]));
+	}
+
+	if (dump_opt['X'] || dump_opt['F'])
+		rewind = ZPOOL_DO_REWIND |
+		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
+
+	/* load policy: the requested txg limit plus the rewind mode */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
+		fatal("internal error: %s", strerror(ENOMEM));
+
+	error = 0;
+	target = argv[0];
+
+	/*
+	 * A target containing '/' or '@' names a dataset or snapshot: copy
+	 * it, cut the copy at that character to get the pool name, and
+	 * strip a single trailing '/' from the target itself.
+	 */
+	if (strpbrk(target, "/@") != NULL) {
+		size_t targetlen;
+
+		target_pool = strdup(target);
+		*strpbrk(target_pool, "/@") = '\0';
+
+		target_is_spa = B_FALSE;
+		targetlen = strlen(target);
+		if (targetlen && target[targetlen - 1] == '/')
+			target[targetlen - 1] = '\0';
+	} else {
+		target_pool = target;
+	}
+
+	if (dump_opt['e']) {
+		importargs_t args = { 0 };
+
+		args.paths = nsearch;
+		args.path = searchdirs;
+		args.can_be_active = B_TRUE;
+
+		error = zpool_find_config(NULL, target_pool, &cfg, &args,
+		    &libzpool_config_ops);
+
+		if (error == 0) {
+
+			if (nvlist_add_nvlist(cfg,
+			    ZPOOL_LOAD_POLICY, policy) != 0) {
+				fatal("can't open '%s': %s",
+				    target, strerror(ENOMEM));
+			}
+
+			if (dump_opt['C'] > 1) {
+				(void) printf("\nConfiguration for import:\n");
+				dump_nvlist(cfg, 8);
+			}
+
+			/*
+			 * Disable the activity check to allow examination of
+			 * active pools.
+			 */
+			error = spa_import(target_pool, cfg, NULL,
+			    flags | ZFS_IMPORT_SKIP_MMP);
+		}
+	}
+
+	/*
+	 * import_checkpointed_state makes the assumption that the
+	 * target pool that we pass it is already part of the spa
+	 * namespace. Because of that we need to make sure to call
+	 * it always after the -e option has been processed, which
+	 * imports the pool to the namespace if it's not in the
+	 * cachefile.
+	 */
+	char *checkpoint_pool = NULL;
+	char *checkpoint_target = NULL;
+	if (dump_opt['k']) {
+		checkpoint_pool = import_checkpointed_state(target, cfg,
+		    &checkpoint_target);
+
+		if (checkpoint_target != NULL)
+			target = checkpoint_target;
+	}
+
+	/* target_pool is a strdup'd copy only when it differs from target */
+	if (target_pool != target)
+		free(target_pool);
+
+	if (error == 0) {
+		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+			ASSERT(checkpoint_pool != NULL);
+			ASSERT(checkpoint_target == NULL);
+
+			error = spa_open(checkpoint_pool, &spa, FTAG);
+			if (error != 0) {
+				fatal("Tried to open pool \"%s\" but "
+				    "spa_open() failed with error %d\n",
+				    checkpoint_pool, error);
+			}
+
+		} else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
+			zdb_set_skip_mmp(target);
+			error = spa_open_rewind(target, &spa, FTAG, policy,
+			    NULL);
+			if (error) {
+				/*
+				 * If we're missing the log device then
+				 * try opening the pool after clearing the
+				 * log state.
+				 */
+				mutex_enter(&spa_namespace_lock);
+				if ((spa = spa_lookup(target)) != NULL &&
+				    spa->spa_log_state == SPA_LOG_MISSING) {
+					spa->spa_log_state = SPA_LOG_CLEAR;
+					error = 0;
+				}
+				mutex_exit(&spa_namespace_lock);
+
+				if (!error) {
+					error = spa_open_rewind(target, &spa,
+					    FTAG, policy, NULL);
+				}
+			}
+		} else if (strpbrk(target, "#") != NULL) {
+			/* a '#' in the target selects a bookmark dump */
+			dsl_pool_t *dp;
+			error = dsl_pool_hold(target, FTAG, &dp);
+			if (error != 0) {
+				fatal("can't dump '%s': %s", target,
+				    strerror(error));
+			}
+			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
+			dsl_pool_rele(dp, FTAG);
+			if (error != 0) {
+				fatal("can't dump '%s': %s", target,
+				    strerror(error));
+			}
+			return (error);
+		} else {
+			zdb_set_skip_mmp(target);
+			if (dataset_lookup == B_TRUE) {
+				/*
+				 * Use the supplied id to get the name
+				 * for open_objset.
+				 */
+				error = spa_open(target, &spa, FTAG);
+				if (error == 0) {
+					error = name_from_objset_id(spa,
+					    objset_id, dsname);
+					spa_close(spa, FTAG);
+					if (error == 0)
+						target = dsname;
+				}
+			}
+			if (error == 0)
+				error = open_objset(target, FTAG, &os);
+			if (error == 0)
+				spa = dmu_objset_spa(os);
+		}
+	}
+	nvlist_free(policy);
+
+	if (error)
+		fatal("can't open '%s': %s", target, strerror(error));
+
+	/*
+	 * Set the pool failure mode to panic in order to prevent the pool
+	 * from suspending.  A suspended I/O will have no way to resume and
+	 * can prevent the zdb(8) command from terminating as expected.
+	 */
+	if (spa != NULL)
+		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+	/* step past the target; what remains are per-mode arguments */
+	argv++;
+	argc--;
+	if (!dump_opt['R']) {
+		flagbits['d'] = ZOR_FLAG_DIRECTORY;
+		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
+		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
+		flagbits['z'] = ZOR_FLAG_ZAP;
+		flagbits['A'] = ZOR_FLAG_ALL_TYPES;
+
+		if (argc > 0 && dump_opt['d']) {
+			/* with -d the remaining arguments are object ranges */
+			zopt_object_args = argc;
+			zopt_object_ranges = calloc(zopt_object_args,
+			    sizeof (zopt_object_range_t));
+			for (unsigned i = 0; i < zopt_object_args; i++) {
+				int err;
+				char *msg = NULL;
+
+				err = parse_object_range(argv[i],
+				    &zopt_object_ranges[i], &msg);
+				if (err != 0)
+					fatal("Bad object or range: '%s': %s\n",
+					    argv[i], msg ? msg : "");
+			}
+		} else if (argc > 0 && dump_opt['m']) {
+			/* with -m they are metaslab numbers */
+			zopt_metaslab_args = argc;
+			zopt_metaslab = calloc(zopt_metaslab_args,
+			    sizeof (uint64_t));
+			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
+				errno = 0;
+				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
+				if (zopt_metaslab[i] == 0 && errno != 0)
+					fatal("bad number %s: %s", argv[i],
+					    strerror(errno));
+			}
+		}
+		if (os != NULL) {
+			dump_objset(os);
+		} else if (zopt_object_args > 0 && !dump_opt['m']) {
+			dump_objset(spa->spa_meta_objset);
+		} else {
+			dump_zpool(spa);
+		}
+	} else {
+		/* -R: each remaining argument is handed to zdb_read_block() */
+		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+		flagbits['c'] = ZDB_FLAG_CHECKSUM;
+		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+		flagbits['e'] = ZDB_FLAG_BSWAP;
+		flagbits['g'] = ZDB_FLAG_GBH;
+		flagbits['i'] = ZDB_FLAG_INDIRECT;
+		flagbits['r'] = ZDB_FLAG_RAW;
+		flagbits['v'] = ZDB_FLAG_VERBOSE;
+
+		for (int i = 0; i < argc; i++)
+			zdb_read_block(argv[i], spa);
+	}
+
+	if (dump_opt['k']) {
+		free(checkpoint_pool);
+		if (!target_is_spa)
+			free(checkpoint_target);
+	}
+
+	/* release whichever handle was taken when the target was opened */
+	if (os != NULL) {
+		close_objset(os, FTAG);
+	} else {
+		spa_close(spa, FTAG);
+	}
+
+	fuid_table_destroy();
+
+	dump_debug_buffer();
+
+	kernel_fini();
+
+	return (error);
+}
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.h b/sys/contrib/openzfs/cmd/zdb/zdb.h
new file mode 100644
index 000000000000..49579811efbb
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.h
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2017 Spectra Logic Corp Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#ifndef	_ZDB_H
+#define	_ZDB_H
+
+/* Print the intent log header and, at higher verbosity, its records. */
+void dump_intent_log(zilog_t *);
+/* Per-option counters, indexed by the option character. */
+extern uint8_t dump_opt[256];
+
+#endif	/* _ZDB_H */
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb_il.c b/sys/contrib/openzfs/cmd/zdb/zdb_il.c
new file mode 100644
index 000000000000..c12178effae0
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb_il.c
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Print intent log header and statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/abd.h>
+
+#include "zdb.h"
+
+/* Per-option counters, indexed by option character; also declared in zdb.h. */
+extern uint8_t dump_opt[256];
+
+/* Indentation printed in front of every per-record detail line. */
+static char tab_prefix[4] = "\t\t\t";
+
+/*
+ * Render a block pointer with snprintf_blkptr() and print it on a line of
+ * its own, preceded by the caller's prefix.
+ */
+static void
+print_log_bp(const blkptr_t *bp, const char *prefix)
+{
+	char text[BP_SPRINTF_LEN];
+
+	snprintf_blkptr(text, sizeof (text), bp);
+	(void) printf("%s%s\n", prefix, text);
+}
+
+/*
+ * Print the details of a create-type record (lr_create_t): the object name
+ * (plus the link target for TX_SYMLINK; nothing for TX_MKXATTR), the
+ * creation time, and the id, mode, ownership, generation and rdev fields.
+ *
+ * The name follows the fixed-size record.  For TX_CREATE_ATTR and
+ * TX_MKDIR_ATTR an lr_attr_t sits between the record and the name and is
+ * skipped using its mask size.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_create_t *lr = arg;
+	time_t crtime = lr->lr_crtime[0];
+	char *name, *link;
+	lr_attr_t *lrattr;
+
+	name = (char *)(lr + 1);
+
+	if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+	    lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+		lrattr = (lr_attr_t *)(lr + 1);
+		name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+	}
+
+	if (txtype == TX_SYMLINK) {
+		/* the link target is stored right after the name's NUL */
+		link = name + strlen(name) + 1;
+		(void) printf("%s%s -> %s\n", tab_prefix, name, link);
+	} else if (txtype != TX_MKXATTR) {
+		(void) printf("%s%s\n", tab_prefix, name);
+	}
+
+	/* ctime() output already ends in a newline */
+	(void) printf("%s%s", tab_prefix, ctime(&crtime));
+	/* %llo takes an unsigned argument, so cast the mode accordingly */
+	(void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n",
+	    tab_prefix, (u_longlong_t)lr->lr_doid,
+	    (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
+	    (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
+	    (u_longlong_t)lr->lr_mode);
+	(void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+	    tab_prefix,
+	    (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
+	    (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
+}
+
+/*
+ * Print a remove record (lr_remove_t): the directory object id and the
+ * entry name stored immediately after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_remove_t *rec = arg;
+	char *entry = (char *)(rec + 1);
+
+	(void) printf("%sdoid %llu, name %s\n", tab_prefix,
+	    (u_longlong_t)rec->lr_doid, entry);
+}
+
+/*
+ * Print a link record (lr_link_t): the directory object id, the object
+ * being linked and the new name stored after the fixed-size record.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_link_t *rec = arg;
+	char *entry = (char *)(rec + 1);
+
+	(void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix,
+	    (u_longlong_t)rec->lr_doid, (u_longlong_t)rec->lr_link_obj,
+	    entry);
+}
+
+/*
+ * Print a rename record (lr_rename_t): the source and target directory ids
+ * followed by the two names, which are stored back to back after the
+ * fixed-size record (source first, then target after its NUL).
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_rename_t *rec = arg;
+	char *from, *to;
+
+	from = (char *)(rec + 1);
+	to = from + strlen(from) + 1;
+
+	(void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
+	    (u_longlong_t)rec->lr_sdoid, (u_longlong_t)rec->lr_tdoid);
+	(void) printf("%ssrc %s tgt %s\n", tab_prefix, from, to);
+}
+
+/*
+ * abd_iterate_func() callback: print "len" bytes of write data.
+ *
+ * Printable bytes are written as the character followed by a space, all
+ * others as two hexadecimal digits.  The bytes are read as unsigned char:
+ * isprint() is only defined for values representable as unsigned char (or
+ * EOF), and a negative plain char would otherwise be sign-extended and
+ * printed as eight hex digits by %2X.
+ *
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+	const unsigned char *bytes = data;
+
+	for (size_t i = 0; i < len; i++) {
+		if (isprint(bytes[i]))
+			(void) printf("%c ", bytes[i]);
+		else
+			(void) printf("%2X", bytes[i]);
+	}
+	return (0);
+}
+
+/*
+ * Print a write record (lr_write_t): always the object id, offset and
+ * length; at verbosity 5 and above (and not for TX_WRITE2) also the
+ * written data.
+ *
+ * The data comes from one of two places.  A record that is exactly
+ * sizeof (lr_write_t) long carries a block pointer, which is printed and
+ * then read with zio_read(); otherwise the data is stored directly after
+ * the record and is copied from there.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_write_t *lr = arg;
+	abd_t *data;
+	blkptr_t *bp = &lr->lr_blkptr;
+	zbookmark_phys_t zb;
+	int verbose = MAX(dump_opt['d'], dump_opt['i']);
+	int error;
+
+	(void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
+
+	/* the payload is only dumped at verbosity 5+, never for TX_WRITE2 */
+	if (txtype == TX_WRITE2 || verbose < 5)
+		return;
+
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		(void) printf("%shas blkptr, %s\n", tab_prefix,
+		    !BP_IS_HOLE(bp) &&
+		    bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+		    "will claim" : "won't claim");
+		print_log_bp(bp, tab_prefix);
+
+		/* nothing to read for a hole; returns before data is set */
+		if (BP_IS_HOLE(bp)) {
+			(void) printf("\t\t\tLSIZE 0x%llx\n",
+			    (u_longlong_t)BP_GET_LSIZE(bp));
+			(void) printf("%s<hole>\n", tab_prefix);
+			return;
+		}
+		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+			(void) printf("%s<block already committed>\n",
+			    tab_prefix);
+			return;
+		}
+
+		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
+		error = zio_wait(zio_read(NULL, zilog->zl_spa,
+		    bp, data, BP_GET_LSIZE(bp), NULL, NULL,
+		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+		/* on a read error skip the dump but still free the buffer */
+		if (error)
+			goto out;
+	} else {
+		/* data is stored after the end of the lr_write record */
+		data = abd_alloc(lr->lr_length, B_FALSE);
+		abd_copy_from_buf(data, lr + 1, lr->lr_length);
+	}
+
+	/*
+	 * Dump at most 20 bytes below verbosity 6.
+	 * NOTE(review): in the block-pointer case the buffer holds
+	 * BP_GET_LSIZE(bp) bytes but the length iterated here is bounded by
+	 * lr_length -- presumably lr_length never exceeds the block size;
+	 * confirm.
+	 */
+	(void) printf("%s", tab_prefix);
+	(void) abd_iterate_func(data,
+	    0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+	    zil_prt_rec_write_cb, NULL);
+	(void) printf("\n");
+
+out:
+	abd_free(data);
+}
+
+/*
+ * Print a truncate record (lr_truncate_t): the object id plus the offset
+ * and length of the truncated range, both in hexadecimal.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_truncate_t *lr = arg;
+
+	/* %llx takes an unsigned argument; cast the offset like the rest */
+	(void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+	    (u_longlong_t)lr->lr_length);
+}
+
+/*
+ * Print a setattr record (lr_setattr_t): the object id and attribute mask,
+ * then one line for each attribute whose bit is set in the mask (mode,
+ * uid, gid, size, atime, mtime).
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_setattr_t *lr = arg;
+	time_t atime = (time_t)lr->lr_atime[0];
+	time_t mtime = (time_t)lr->lr_mtime[0];
+
+	(void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix,
+	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+	if (lr->lr_mask & AT_MODE) {
+		/* %llo takes an unsigned argument */
+		(void) printf("%sAT_MODE  %llo\n", tab_prefix,
+		    (u_longlong_t)lr->lr_mode);
+	}
+
+	if (lr->lr_mask & AT_UID) {
+		(void) printf("%sAT_UID   %llu\n", tab_prefix,
+		    (u_longlong_t)lr->lr_uid);
+	}
+
+	if (lr->lr_mask & AT_GID) {
+		(void) printf("%sAT_GID   %llu\n", tab_prefix,
+		    (u_longlong_t)lr->lr_gid);
+	}
+
+	if (lr->lr_mask & AT_SIZE) {
+		(void) printf("%sAT_SIZE  %llu\n", tab_prefix,
+		    (u_longlong_t)lr->lr_size);
+	}
+
+	/* the times print as sec.nsec; ctime() supplies the newline */
+	if (lr->lr_mask & AT_ATIME) {
+		(void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix,
+		    (u_longlong_t)lr->lr_atime[0],
+		    (u_longlong_t)lr->lr_atime[1],
+		    ctime(&atime));
+	}
+
+	if (lr->lr_mask & AT_MTIME) {
+		(void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix,
+		    (u_longlong_t)lr->lr_mtime[0],
+		    (u_longlong_t)lr->lr_mtime[1],
+		    ctime(&mtime));
+	}
+}
+
+/*
+ * Print an ACL record (lr_acl_t): the object id and the ACL entry count.
+ */
+/* ARGSUSED */
+static void
+zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg)
+{
+	lr_acl_t *rec = arg;
+	u_longlong_t foid = (u_longlong_t)rec->lr_foid;
+	u_longlong_t aclcnt = (u_longlong_t)rec->lr_aclcnt;
+
+	(void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, foid, aclcnt);
+}
+
+/* Signature shared by all per-record detail printers in this file. */
+typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *);
+/* One entry per transaction type: how to print it, its label, its tally. */
+typedef struct zil_rec_info {
+	zil_prt_rec_func_t	zri_print;	/* detail printer, or NULL */
+	const char		*zri_name;	/* fixed-width display name */
+	uint64_t		zri_count;	/* records seen of this type */
+} zil_rec_info_t;
+
+/*
+ * Table indexed by transaction type.  Entry 0 is not a record type: it has
+ * no printer and its count accumulates the total over all records.
+ */
+static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
+	{.zri_print = NULL,		    .zri_name = "Total              "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE          "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR           "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKXATTR         "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_SYMLINK         "},
+	{.zri_print = zil_prt_rec_remove,   .zri_name = "TX_REMOVE          "},
+	{.zri_print = zil_prt_rec_remove,   .zri_name = "TX_RMDIR           "},
+	{.zri_print = zil_prt_rec_link,	    .zri_name = "TX_LINK            "},
+	{.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME          "},
+	{.zri_print = zil_prt_rec_write,    .zri_name = "TX_WRITE           "},
+	{.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE        "},
+	{.zri_print = zil_prt_rec_setattr,  .zri_name = "TX_SETATTR         "},
+	{.zri_print = zil_prt_rec_acl,	    .zri_name = "TX_ACL_V0          "},
+	{.zri_print = zil_prt_rec_acl,	    .zri_name = "TX_ACL_ACL         "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ACL      "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ATTR     "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_CREATE_ACL_ATTR "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ACL       "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ATTR      "},
+	{.zri_print = zil_prt_rec_create,   .zri_name = "TX_MKDIR_ACL_ATTR  "},
+	{.zri_print = zil_prt_rec_write,    .zri_name = "TX_WRITE2          "},
+};
+
+/*
+ * zil_parse() record callback: print a one-line summary of the record, its
+ * type-specific details at verbosity 3 and above (or "(encrypted)" when the
+ * objset is encrypted), and bump the per-type and total counters.
+ *
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
+{
+	int detail = MAX(dump_opt['d'], dump_opt['i']);
+	/* reduce size of txtype to strip off TX_CI bit */
+	int txtype = lr->lrc_txtype;
+	zil_rec_info_t *info;
+
+	ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
+	ASSERT(lr->lrc_txg);
+
+	info = &zil_rec_info[txtype];
+
+	(void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
+	    (lr->lrc_txtype & TX_CI) ? "CI-" : "",
+	    info->zri_name,
+	    (u_longlong_t)lr->lrc_reclen,
+	    (u_longlong_t)lr->lrc_txg,
+	    (u_longlong_t)lr->lrc_seq);
+
+	if (txtype != 0 && detail >= 3) {
+		if (zilog->zl_os->os_encrypted)
+			(void) printf("%s(encrypted)\n", tab_prefix);
+		else
+			info->zri_print(zilog, txtype, lr);
+	}
+
+	/* tally this record under its own type and under the total */
+	info->zri_count++;
+	zil_rec_info[0].zri_count++;
+
+	return (0);
+}
+
+/*
+ * zil_parse() block callback: at verbosity 4 and above print the block's
+ * sequence number and claim status; at verbosity 5 and above also append
+ * the formatted block pointer.
+ *
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	char desc[BP_SPRINTF_LEN + 10];
+	int detail = MAX(dump_opt['d'], dump_opt['i']);
+	const char *status;
+
+	if (detail < 4)
+		return (0);
+
+	/* the description is empty unless the block pointer is wanted */
+	desc[0] = '\0';
+	if (detail >= 5) {
+		size_t used;
+
+		(void) strcpy(desc, ", ");
+		used = strlen(desc);
+		snprintf_blkptr(desc + used, sizeof (desc) - used, bp);
+	}
+
+	if (claim_txg != 0) {
+		status = "already claimed";
+	} else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) {
+		status = "will claim";
+	} else {
+		status = "won't claim";
+	}
+
+	(void) printf("\tBlock seqno %llu, %s%s\n",
+	    (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], status, desc);
+
+	return (0);
+}
+
+/*
+ * Print the per-type record counts gathered in zil_rec_info[].
+ *
+ * Nothing but an optional blank line (verbosity above 3) is printed when no
+ * records were counted.  Otherwise each type with a non-zero count is
+ * listed -- every type at verbosity 3 and above -- with the counts
+ * right-aligned to the width of the total.
+ */
+static void
+print_log_stats(int verbose)
+{
+	uint64_t total = zil_rec_info[0].zri_count;
+	int width = 1;	/* '*' in a printf format expects an int */
+
+	if (verbose > 3)
+		(void) printf("\n");
+
+	if (total == 0)
+		return;
+
+	/*
+	 * Count the decimal digits of the total by repeated division.  A
+	 * growing power of ten held in a narrower type than the counter
+	 * could wrap around and never exceed the total.
+	 */
+	for (uint64_t rest = total; rest >= 10; rest /= 10)
+		width++;
+
+	for (unsigned i = 0; i < TX_MAX_TYPE; i++) {
+		if (zil_rec_info[i].zri_count || verbose >= 3)
+			(void) printf("\t\t%s %*llu\n",
+			    zil_rec_info[i].zri_name, width,
+			    (u_longlong_t)zil_rec_info[i].zri_count);
+	}
+	(void) printf("\n");
+}
+
+/*
+ * Print the intent log of a dataset.
+ *
+ * Does nothing when the log header's block pointer is a hole or the
+ * verbosity (the larger of the -d and -i levels) is below 1.  Otherwise
+ * prints the ZIL header, resets the per-type counters and, at verbosity 2
+ * and above, walks the log with zil_parse() and prints the statistics.
+ */
+/* ARGSUSED */
+void
+dump_intent_log(zilog_t *zilog)
+{
+	const zil_header_t *hdr = zilog->zl_header;
+	int detail = MAX(dump_opt['d'], dump_opt['i']);
+
+	if (BP_IS_HOLE(&hdr->zh_log) || detail < 1)
+		return;
+
+	(void) printf("\n    ZIL header: claim_txg %llu, "
+	    "claim_blk_seq %llu, claim_lr_seq %llu",
+	    (u_longlong_t)hdr->zh_claim_txg,
+	    (u_longlong_t)hdr->zh_claim_blk_seq,
+	    (u_longlong_t)hdr->zh_claim_lr_seq);
+	(void) printf(" replay_seq %llu, flags 0x%llx\n",
+	    (u_longlong_t)hdr->zh_replay_seq, (u_longlong_t)hdr->zh_flags);
+
+	/* start every dataset with fresh counters */
+	for (int type = 0; type < TX_MAX_TYPE; type++)
+		zil_rec_info[type].zri_count = 0;
+
+	/* see comment in zil_claim() or zil_check_log_chain() */
+	if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+	    hdr->zh_claim_txg == 0)
+		return;
+
+	if (detail < 2)
+		return;
+
+	(void) printf("\n");
+	(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
+	    hdr->zh_claim_txg, B_FALSE);
+	print_log_stats(detail);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/.gitignore b/sys/contrib/openzfs/cmd/zed/.gitignore
new file mode 100644
index 000000000000..76557bb6bb3a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/.gitignore
@@ -0,0 +1 @@
+/zed
diff --git a/sys/contrib/openzfs/cmd/zed/Makefile.am b/sys/contrib/openzfs/cmd/zed/Makefile.am
new file mode 100644
index 000000000000..4bd8ac4a53e6
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/Makefile.am
@@ -0,0 +1,49 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)
+
+SUBDIRS = zed.d
+
+sbin_PROGRAMS = zed
+
+ZED_SRC = \
+	zed.c \
+	zed.h \
+	zed_conf.c \
+	zed_conf.h \
+	zed_disk_event.c \
+	zed_disk_event.h \
+	zed_event.c \
+	zed_event.h \
+	zed_exec.c \
+	zed_exec.h \
+	zed_file.c \
+	zed_file.h \
+	zed_log.c \
+	zed_log.h \
+	zed_strings.c \
+	zed_strings.h
+
+FMA_SRC = \
+	agents/zfs_agents.c \
+	agents/zfs_agents.h \
+	agents/zfs_diagnosis.c \
+	agents/zfs_mod.c \
+	agents/zfs_retire.c \
+	agents/fmd_api.c \
+	agents/fmd_api.h \
+	agents/fmd_serd.c \
+	agents/fmd_serd.h
+
+zed_SOURCES = $(ZED_SRC) $(FMA_SRC)
+
+zed_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+	$(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
+zed_LDFLAGS = -pthread
+
+EXTRA_DIST = agents/README.md
diff --git a/sys/contrib/openzfs/cmd/zed/agents/README.md b/sys/contrib/openzfs/cmd/zed/agents/README.md
new file mode 100644
index 000000000000..e35b97668a9d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/README.md
@@ -0,0 +1,112 @@
+## Fault Management Logic for ZED ##
+
+The integration of Fault Management Daemon (FMD) logic from illumos
+is being deployed in three phases. This logic is encapsulated in
+several software modules inside ZED.
+
+### ZED+FM Phase 1 ###
+
+All of the phase 1 work is in the current master branch. Phase 1 work includes:
+
+* Add new paths to the persistent VDEV label for device matching.
+* Add a disk monitor for generating _disk-add_ and _disk-change_ events.
+* Add support for automated VDEV auto-online, auto-replace and auto-expand.
+* Expand the statechange event to include all VDEV state transitions.
+
+### ZED+FM Phase 2 (WIP) ###
+
+The phase 2 work primarily entails the _Diagnosis Engine_ and the
+_Retire Agent_ modules. It also includes infrastructure to support a
+crude FMD environment to host these modules. For additional
+information see the **FMD Components in ZED** and **Implementation
+Notes** sections below.
+
+### ZED+FM Phase 3 ###
+
+Future work will add additional functionality and will likely include:
+
+* Add FMD module garbage collection (periodically call `fmd_module_gc()`).
+* Add real module property retrieval (currently hard-coded in accessors).
+* Additional diagnosis telemetry (like latency outliers and SMART data).
+* Export FMD module statistics.
+* Zedlet parallel execution and resiliency (add watchdog).
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is composed of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events which are consumed by a response agent.
+
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+  1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+  2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+  3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm which will generate a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window. The
+initial N and T values for the SERD algorithm are estimates inherited
+from illumos (10 errors in 10 minutes).
+
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal it will replace the device with an appropriate
+spare if available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+  in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+  module registration, memory allocation, module property accessors, basic
+  case management, one-shot timers and SERD engines.
+  For detailed information on the FMD module API, see the document --
+  _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module specific
+  configuration file on illumos) are currently hard-coded into the ZED
+  `zfs_agent_dispatch()` function.
+
+* The FMD modules are called one at a time from a single thread that
+  consumes events queued to the modules. These events are sourced from
+  the normal ZED events and also include events posted from the diagnosis
+  engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+  as similar as possible to their upstream source files.
+
+* The sysevent namespace in ZED differs from illumos. For example:
+    * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"`
+    * Linux uses `"sysevent.fs.zfs.vdev_remove"`
+
+* The FMD Modules port was produced by Intel Federal, LLC under award
+  number B609815 between the U.S. Department of Energy (DOE) and Intel
+  Federal, LLC.
+
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c
new file mode 100644
index 000000000000..607b387ca3a8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * This file implements the minimal FMD module API required to support the
+ * fault logic modules in ZED. This support includes module registration,
+ * memory allocation, module property accessors, basic case management,
+ * one-shot timers and SERD engines.
+ *
+ * In the ZED runtime, the modules are called from a single thread so no
+ * locking is required in this emulated FMD environment.
+ */
+
+#include <sys/types.h>
+#include <sys/fm/protocol.h>
+#include <uuid/uuid.h>
+#include <signal.h>
+#include <strings.h>
+#include <time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+typedef struct fmd_modstat {
+	fmd_stat_t	ms_accepted;	/* events passed to fmdo_recv so far */
+	fmd_stat_t	ms_caseopen;	/* cases opened and not yet closed */
+	fmd_stat_t	ms_casesolved;	/* total cases solved by module */
+	fmd_stat_t	ms_caseclosed;	/* total cases closed by module */
+} fmd_modstat_t;	/* this file only uses each stat's ui64 value */
+
+typedef struct fmd_module {
+	const char	*mod_name;	/* fmdi_desc past its 'ZFS ' prefix (ro) */
+	const fmd_hdl_info_t *mod_info;	/* fmd_hdl_register() info, else NULL */
+	void		*mod_spec;	/* fmd_hdl_get/setspecific data value */
+	fmd_stat_t	*mod_ustat;	/* caller-owned custom stats array */
+	uint_t		mod_ustat_cnt;	/* count of ustat stats */
+	fmd_modstat_t	mod_stats;	/* fmd built-in per-module statistics */
+	fmd_serd_hash_t	mod_serds;	/* hash of serd engs owned by module */
+	char		*mod_vers;	/* version string copy; not set in this file */
+} fmd_module_t;
+
+/*
+ * ZED has two hardwired FMD module instances
+ */
+fmd_module_t	zfs_retire_module;
+fmd_module_t	zfs_diagnosis_module;
+
+/*
+ * DEBUG builds only: provide default $UMEM_DEBUG and $UMEM_LOGGING settings
+ * for libumem debugging through _umem_debug_init()/_umem_logging_init().
+ */
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+/*
+ * Register a module with fmd and finish module initialization: record 'mip',
+ * derive the module name, label the built-in stats and create the SERD hash.
+ * The 'version' argument is not examined. Always returns zero (success).
+ */
+int
+fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+
+	mp->mod_info = mip;
+	mp->mod_name = mip->fmdi_desc + 4;	/* drop 'ZFS ' prefix */
+	mp->mod_spec = NULL;
+
+	/* bare minimum module stats: only the names are filled in here */
+	(void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted");
+	(void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen");
+	(void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved");
+	(void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed");
+
+	fmd_serd_hash_create(&mp->mod_serds);
+
+	fmd_hdl_debug(hdl, "register module");
+
+	return (0);
+}
+
+void
+fmd_hdl_unregister(fmd_hdl_t *hdl)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+	fmd_modstat_t *msp = &mp->mod_stats;
+	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+
+	/* dump generic module stats; "%llu" takes an unsigned long long */
+	fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name,
+	    (unsigned long long)msp->ms_accepted.fmds_value.ui64);
+	if (ops->fmdo_close != NULL) {
+		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name,
+		    (unsigned long long)msp->ms_caseopen.fmds_value.ui64);
+		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name,
+		    (unsigned long long)msp->ms_casesolved.fmds_value.ui64);
+		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name,
+		    (unsigned long long)msp->ms_caseclosed.fmds_value.ui64);
+	}
+
+	/* dump module specific stats */
+	if (mp->mod_ustat != NULL) {
+		uint_t i;	/* unsigned, like mod_ustat_cnt */
+
+		for (i = 0; i < mp->mod_ustat_cnt; i++) {
+			const fmd_stat_t *sp = &mp->mod_ustat[i];
+			fmd_hdl_debug(hdl, "%s: %llu", sp->fmds_name,
+			    (unsigned long long)sp->fmds_value.ui64);
+		}
+	}
+
+	fmd_serd_hash_destroy(&mp->mod_serds);
+
+	fmd_hdl_debug(hdl, "unregister module");
+}
+
+/*
+ * fmd_hdl_setspecific() stores a caller-supplied data pointer in the
+ * module behind the given handle, replacing any previous value.
+ * The pointer can be read back with fmd_hdl_getspecific().
+ */
+void
+fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+
+	module->mod_spec = spec;
+}
+
+/*
+ * Return the data pointer most recently stored on this handle by
+ * fmd_hdl_setspecific() (NULL right after fmd_hdl_register()).
+ */
+void *
+fmd_hdl_getspecific(fmd_hdl_t *hdl)
+{
+	const fmd_module_t *module = (const fmd_module_t *)hdl;
+
+	return (module->mod_spec);
+}
+
+void *
+fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+	return (umem_alloc(size, flags));	/* 'hdl' is not consulted */
+}
+
+void *
+fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+	return (umem_zalloc(size, flags));	/* 'hdl' is not consulted */
+}
+
+void
+fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size)
+{
+	umem_free(data, size);	/* 'hdl' is not consulted */
+}
+
+/*
+ * Format a module debug message and log it at LOG_INFO.
+ */
+void
+fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+	char text[256];		/* longer messages are truncated */
+	va_list ap;
+
+	va_start(ap, format);
+	(void) vsnprintf(text, sizeof (text), format, ap);
+	va_end(ap);
+
+	/* the logged line is "<module name>: <message>" */
+	zed_log_msg(LOG_INFO, "%s: %s", module->mod_name, text);
+}
+
+/* Property Retrieval */
+
+int32_t
+fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
+{
+	/*
+	 * Property values are hard coded for phase 2 rather than looked
+	 * up in mp->mod_info->fmdi_props, so 'hdl' is unused here.
+	 * In the future, there can be a ZED based override.
+	 */
+	if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
+		return (10);	/* N = 10 events */
+
+	if (strcmp(name, "spare_on_remove") == 0)
+		return (1);
+
+	return (0);	/* unrecognized property name */
+}
+
+int64_t
+fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
+{
+	/*
+	 * Property values (in nanoseconds) are hard coded for phase 2
+	 * rather than looked up in mp->mod_info->fmdi_props.
+	 * In the future, there can be a ZED based override.
+	 */
+	if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
+		return (600ULL * 1000ULL * 1000ULL * 1000ULL);	/* 10 min */
+
+	if (strcmp(name, "remove_timeout") == 0)
+		return (15ULL * 1000ULL * 1000ULL * 1000ULL);	/* 15 sec */
+
+	return (0);	/* unrecognized property name */
+}
+
+/* FMD Statistics */
+
+fmd_stat_t *
+fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+
+	if (flags == FMD_STAT_NOALLOC) {	/* use the caller's memory */
+		module->mod_ustat_cnt = nstats;
+		module->mod_ustat = statv;
+	}
+
+	return (statv);	/* other flag values only echo 'statv' back */
+}
+
+/* Case Management */
+
+fmd_case_t *
+fmd_case_open(fmd_hdl_t *hdl, void *data)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+	fmd_case_t *newcase;
+	uuid_t id;
+
+	/* FMD_SLEEP is UMEM_NOFAIL; the result is used without a NULL check */
+	newcase = fmd_hdl_zalloc(hdl, sizeof (*newcase), FMD_SLEEP);
+	newcase->ci_bufsiz = 0;
+	newcase->ci_bufptr = NULL;
+	newcase->ci_data = data;
+	newcase->ci_flags = FMD_CF_DIRTY;
+	newcase->ci_state = FMD_CASE_UNSOLVED;
+	newcase->ci_mod = hdl;
+
+	uuid_generate(id);
+	uuid_unparse(id, newcase->ci_uuid);	/* case uuid kept as text */
+
+	fmd_hdl_debug(hdl, "case opened (%s)", newcase->ci_uuid);
+	module->mod_stats.ms_caseopen.fmds_value.ui64++;
+
+	return (newcase);
+}
+
+void
+fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+
+	/*
+	 * No event is posted here: for ZED it was already sent from
+	 * fmd_case_add_suspect(). A repeated solve is logged, not refused.
+	 */
+	if (FMD_CASE_SOLVED <= cp->ci_state)
+		fmd_hdl_debug(hdl, "case is already solved or closed");
+
+	cp->ci_state = FMD_CASE_SOLVED;
+
+	fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid);
+	module->mod_stats.ms_casesolved.fmds_value.ui64++;
+}
+
+void
+fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	fmd_module_t *module = (fmd_module_t *)hdl;
+	fmd_modstat_t *stats = &module->mod_stats;
+
+	fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid);
+
+	if (module->mod_info->fmdi_ops->fmdo_close != NULL)
+		module->mod_info->fmdi_ops->fmdo_close(hdl, cp);
+
+	stats->ms_caseclosed.fmds_value.ui64++;
+	stats->ms_caseopen.fmds_value.ui64--;
+
+	if (cp->ci_bufsiz > 0 && cp->ci_bufptr != NULL)
+		fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz);
+
+	fmd_hdl_free(hdl, cp, sizeof (fmd_case_t));	/* 'cp' is gone */
+}
+
+void
+fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid)
+{	/* only logs the uuid; no case is looked up or changed here */
+	fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid);
+}
+
+int
+fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	return ((cp->ci_state < FMD_CASE_SOLVED) ? FMD_B_FALSE : FMD_B_TRUE);
+}
+
+void
+fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep)
+{	/* no-op: nothing is recorded on the case in this emulation */
+}
+
+static void
+zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code)
+{	/* log, at LOG_INFO, whichever fault fields are present in 'nvl' */
+	nvlist_t *rsrc;
+	char *strval;
+	uint64_t guid;
+	uint8_t byte;	/* promoted to int in varargs: print with "%hhu" */
+
+	zed_log_msg(LOG_INFO, "\nzed_fault_event:");
+
+	if (uuid != NULL)
+		zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid);
+	if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval);
+	if (code != NULL)
+		zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code);
+	if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0)
+		zed_log_msg(LOG_INFO, "\t%s: %hhu", FM_FAULT_CERTAINTY, byte);
+	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
+		if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0)
+			zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME,
+			    strval);
+		if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0)
+			zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL,
+			    (unsigned long long)guid);
+		if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0)
+			zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV,
+			    (unsigned long long)guid);
+	}
+}
+
+static const char *
+fmd_fault_mkcode(nvlist_t *fault)
+{
+	/*
+	 * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po
+	 */
+	static const char *const codes[][2] = {
+		{ "fault.fs.zfs.vdev.io", "ZFS-8000-FD" },
+		{ "fault.fs.zfs.vdev.checksum", "ZFS-8000-GH" },
+		{ "fault.fs.zfs.io_failure_wait", "ZFS-8000-HC" },
+		{ "fault.fs.zfs.io_failure_continue", "ZFS-8000-JQ" },
+		{ "fault.fs.zfs.log_replay", "ZFS-8000-K4" },
+		{ "fault.fs.zfs.pool", "ZFS-8000-CS" },
+		{ "fault.fs.zfs.device", "ZFS-8000-D3" },
+	};
+	char *class;
+	size_t i;
+
+	if (nvlist_lookup_string(fault, FM_CLASS, &class) != 0)
+		return ("-");	/* the fault carries no class string */
+	for (i = 0; i < sizeof (codes) / sizeof (codes[0]); i++) {
+		if (strcmp(class, codes[i][0]) == 0)
+			return (codes[i][1]);
+	}
+
+	return ("-");	/* class has no known message code */
+}
+
+void
+fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault)
+{
+	const char *code = fmd_fault_mkcode(fault);
+	nvlist_t *ev;
+	int64_t when[2];
+	int rc = 0;
+
+	/*
+	 * Build and post an FM_LIST_SUSPECT_CLASS event around 'fault'
+	 * (payload derived from fmd_protocol_list()); 'fault' is freed here.
+	 */
+	(void) gettimeofday(&cp->ci_tv, NULL);
+	when[0] = cp->ci_tv.tv_sec;
+	when[1] = cp->ci_tv.tv_usec;
+
+	ev = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+	rc |= nvlist_add_uint8(ev, FM_VERSION, FM_SUSPECT_VERSION);
+	rc |= nvlist_add_string(ev, FM_CLASS, FM_LIST_SUSPECT_CLASS);
+	rc |= nvlist_add_string(ev, FM_SUSPECT_UUID, cp->ci_uuid);
+	rc |= nvlist_add_string(ev, FM_SUSPECT_DIAG_CODE, code);
+	rc |= nvlist_add_int64_array(ev, FM_SUSPECT_DIAG_TIME, when, 2);
+	rc |= nvlist_add_uint32(ev, FM_SUSPECT_FAULT_SZ, 1);
+	rc |= nvlist_add_nvlist_array(ev, FM_SUSPECT_FAULT_LIST, &fault, 1);
+
+	if (rc != 0)
+		zed_log_die("failed to populate nvlist");
+
+	zed_log_fault(fault, cp->ci_uuid, code);
+	zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, ev);
+
+	nvlist_free(ev);
+	nvlist_free(fault);
+}
+
+void
+fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data)
+{
+	cp->ci_data = data;	/* overrides the fmd_case_open() value */
+}
+
+void *
+fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	return (cp->ci_data);	/* from fmd_case_open()/setspecific() */
+}
+
+void
+fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size)
+{
+	assert(strcmp(name, "data") == 0);	/* only name supported */
+	assert(cp->ci_bufptr == NULL);		/* one buffer per case */
+	assert(size < (1024 * 1024));		/* must be under 1 MiB */
+
+	cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP);
+	cp->ci_bufsiz = size;	/* needed to free it in fmd_case_close() */
+}
+
+void
+fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp,
+    const char *name, void *buf, size_t size)
+{
+	assert(strcmp(name, "data") == 0);	/* only name supported */
+	assert(cp->ci_bufptr != NULL);		/* buffer must exist */
+	assert(size <= cp->ci_bufsiz);		/* no read past its end */
+
+	bcopy(cp->ci_bufptr, buf, size);	/* case buffer -> 'buf' */
+}
+
+void
+fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp,
+    const char *name, const void *buf, size_t size)
+{
+	assert(strcmp(name, "data") == 0);	/* only name supported */
+	assert(cp->ci_bufptr != NULL);		/* buffer must exist */
+	assert(cp->ci_bufsiz >= size);		/* no write past its end */
+
+	bcopy(buf, cp->ci_bufptr, size);	/* 'buf' -> case buffer */
+}
+
+/* SERD Engines */
+
+void
+fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t)
+{
+	fmd_serd_hash_t *engines = &((fmd_module_t *)hdl)->mod_serds;
+
+	if (fmd_serd_eng_lookup(engines, name) == NULL) {
+		(void) fmd_serd_eng_insert(engines, name, n, t);
+		return;
+	}
+	/* an engine of that name already exists and is left untouched */
+	zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': "
+	    " name already exists", name);
+}
+
+void
+fmd_serd_destroy(fmd_hdl_t *hdl, const char *name)
+{
+	fmd_serd_hash_t *engines = &((fmd_module_t *)hdl)->mod_serds;
+
+	fmd_serd_eng_delete(engines, name);
+
+	fmd_hdl_debug(hdl, "serd_destroy %s", name);
+}
+
+int
+fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
+{
+	fmd_serd_hash_t *engines = &((fmd_module_t *)hdl)->mod_serds;
+
+	return (fmd_serd_eng_lookup(engines, name) != NULL);
+}
+
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+	fmd_serd_hash_t *engines = &((fmd_module_t *)hdl)->mod_serds;
+	fmd_serd_eng_t *eng = fmd_serd_eng_lookup(engines, name);
+
+	if (eng == NULL) {
+		zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+		return;
+	}
+
+	fmd_serd_eng_reset(eng);
+
+	fmd_hdl_debug(hdl, "serd_reset %s", name);
+}
+
+int
+fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
+{
+	fmd_serd_hash_t *engines = &((fmd_module_t *)hdl)->mod_serds;
+	fmd_serd_eng_t *eng = fmd_serd_eng_lookup(engines, name);
+
+	if (eng != NULL) {
+		/* hand the event's timestamp to the named engine */
+		return (fmd_serd_eng_record(eng, ep->ev_hrt));
+	}
+
+	zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
+	    name);
+
+	return (FMD_B_FALSE);
+}
+
+/* FMD Timers */
+
+static void
+_timer_notify(union sigval sv)	/* SIGEV_THREAD notify function */
+{
+	fmd_timer_t *ftp = sv.sival_ptr;	/* set by fmd_timer_install() */
+	fmd_hdl_t *hdl = ftp->ft_hdl;
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+	struct itimerspec its;
+
+	fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+
+	/* disarm the timer: fmd_timer_install() also set a repeat interval */
+	bzero(&its, sizeof (struct itimerspec));
+	timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+	/* fmdo_timeout can remove (free) this timer: no 'ftp' use after it */
+	if (ops->fmdo_timeout != NULL)
+		ops->fmdo_timeout(hdl, ftp, ftp->ft_arg);
+}
+
+/*
+ * Install a new timer which will fire at least delta nanoseconds after the
+ * current time; _timer_notify() then calls the module's fmdo_timeout entry
+ * point. NOTE(review): timer_create()/timer_settime() results go unchecked.
+ */
+fmd_timer_t *
+fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta)
+{
+	struct sigevent sev;
+	struct itimerspec its;
+	fmd_timer_t *ftp;
+
+	ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP);
+	ftp->ft_arg = arg;	/* handed back to fmdo_timeout */
+	ftp->ft_hdl = hdl;
+
+	its.it_value.tv_sec = delta / 1000000000;	/* ns -> sec */
+	its.it_value.tv_nsec = delta % 1000000000;	/* leftover ns */
+	its.it_interval.tv_sec = its.it_value.tv_sec;
+	its.it_interval.tv_nsec = its.it_value.tv_nsec;	/* repeat period */
+
+	sev.sigev_notify = SIGEV_THREAD;
+	sev.sigev_notify_function = _timer_notify;
+	sev.sigev_notify_attributes = NULL;
+	sev.sigev_value.sival_ptr = ftp;	/* _timer_notify() argument */
+
+	timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid);
+	timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+	fmd_hdl_debug(hdl, "installing timer for %d secs (%p)",
+	    (int)its.it_value.tv_sec, ftp->ft_tid);
+
+	return (ftp);	/* released by fmd_timer_remove(); 'ep' is unused */
+}
+
+void
+fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp)
+{
+	fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid);
+
+	timer_delete(ftp->ft_tid);	/* return value is not checked */
+
+	fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t));	/* 'ftp' is gone */
+}
+
+/* Name-Value Pair Lists */
+
+nvlist_t *
+fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty,
+    nvlist_t *asru, nvlist_t *fru, nvlist_t *resource)
+{
+	nvlist_t *fault;
+	int rc;		/* OR of the nvlist_add_*() return codes */
+
+	if (nvlist_alloc(&fault, NV_UNIQUE_NAME, 0) != 0)
+		zed_log_die("failed to xalloc fault nvlist");
+
+	rc = nvlist_add_uint8(fault, FM_VERSION, FM_FAULT_VERSION);
+	rc |= nvlist_add_string(fault, FM_CLASS, class);
+	rc |= nvlist_add_uint8(fault, FM_FAULT_CERTAINTY, certainty);
+	/* asru, fru and resource are optional: NULL ones are left out */
+	if (asru != NULL)
+		rc |= nvlist_add_nvlist(fault, FM_FAULT_ASRU, asru);
+	if (fru != NULL)
+		rc |= nvlist_add_nvlist(fault, FM_FAULT_FRU, fru);
+	if (resource != NULL)
+		rc |= nvlist_add_nvlist(fault, FM_FAULT_RESOURCE, resource);
+
+	if (rc != 0)
+		zed_log_die("failed to populate nvlist: %s\n", strerror(rc));
+
+	return (fault);
+}
+
+/*
+ * Wildcard ('*') match of 's' against 'p': 1 or 0; from fmd_string.c
+ */
+static int
+fmd_strmatch(const char *s, const char *p)
+{
+	char c;
+
+	if (p == NULL)
+		return (0);	/* a NULL pattern matches nothing */
+
+	if (s == NULL)
+		s = ""; /* treat NULL string as the empty string */
+
+	do {
+		if ((c = *p++) == '\0')
+			return (*s == '\0');	/* both must end together */
+
+		if (c == '*') {
+			while (*p == '*')
+				p++; /* consecutive *'s can be collapsed */
+
+			if (*p == '\0')
+				return (1);	/* trailing '*' takes the rest */
+
+			while (*s != '\0') {	/* try each suffix of 's' */
+				if (fmd_strmatch(s++, p) != 0)
+					return (1);
+			}
+
+			return (0);
+		}
+	} while (c == *s++);	/* literal characters must be equal */
+
+	return (0);
+}
+
+int
+fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern)
+{
+	char *class;
+
+	if (nvl == NULL || nvlist_lookup_string(nvl, FM_CLASS, &class) != 0)
+		return (0);	/* no list, or no class string in it */
+	return (fmd_strmatch(class, pattern) != 0);
+}
+
+nvlist_t *
+fmd_nvl_alloc(fmd_hdl_t *hdl, int flags)
+{
+	nvlist_t *list = NULL;
+
+	/* 'hdl' and 'flags' are ignored; allocation failure yields NULL */
+	if (nvlist_alloc(&list, NV_UNIQUE_NAME, 0) == 0)
+		return (list);
+	return (NULL);
+}
+
+
+/*
+ * ZED Agent specific APIs
+ */
+
+fmd_hdl_t *
+fmd_module_hdl(const char *name)
+{
+	if (strcmp(name, "zfs-diagnosis") == 0)
+		return ((fmd_hdl_t *)&zfs_diagnosis_module);
+	if (strcmp(name, "zfs-retire") == 0)
+		return ((fmd_hdl_t *)&zfs_retire_module);
+
+	return (NULL);	/* not one of the two hardwired modules */
+}
+
+boolean_t
+fmd_module_initialized(fmd_hdl_t *hdl)
+{
+	const fmd_module_t *module = (const fmd_module_t *)hdl;
+
+	return (module->mod_info != NULL);	/* set when registered */
+}
+
+/*
+ * fmd_module_recv is called for each event whose class matches one of the
+ * module's subscriptions. It hands the event to the module's fmdo_recv
+ * with ev_hrt taken from FM_EREPORT_TIME, then counts it as accepted.
+ */
+void
+fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+	fmd_event_t faux_event = {0};
+	int64_t *tv;
+	uint_t n;
+
+	/*
+	 * Will need to normalize this if we persistently store the case data
+	 */
+	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0 &&
+	    n >= 2)	/* both tv[0] and tv[1] must be present */
+		faux_event.ev_hrt = tv[0] * NANOSEC + tv[1];
+	/* else: ev_hrt keeps the zero from the initializer above */
+
+	ops->fmdo_recv(hdl, &faux_event, nvl, class);
+
+	mp->mod_stats.ms_accepted.fmds_value.ui64++;
+
+	/* TBD - should we initiate fm_module_gc() periodically? */
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h
new file mode 100644
index 000000000000..4f06fb244b7b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef	_FMD_API_H
+#define	_FMD_API_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <libnvpair.h>
+#include <stdarg.h>
+#include <umem.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Fault Management Daemon Client Interfaces
+ */
+
+#define	FMD_API_VERSION		5
+
+typedef struct fmd_hdl fmd_hdl_t;
+
+typedef struct fmd_timer {
+	timer_t		ft_tid;		/* id filled in by timer_create() */
+	void		*ft_arg;	/* passed back to fmdo_timeout */
+	fmd_hdl_t	*ft_hdl;	/* handle of the installing module */
+} fmd_timer_t;
+
+#define	id_t	fmd_timer_t *	/* timer ids here are fmd_timer_t pointers */
+
+
+typedef struct fmd_event {
+	hrtime_t	ev_hrt;		/* event time used by SERD engines */
+} fmd_event_t;
+
+typedef struct fmd_case {
+	char		ci_uuid[48];	/* uuid string for this case */
+	fmd_hdl_t	*ci_mod;	/* module that owns this case */
+	void		*ci_data;	/* data from fmd_case_setspecific() */
+	ushort_t	ci_state;	/* case state (see below) */
+	ushort_t	ci_flags;	/* case flags (see below) */
+	struct timeval	ci_tv;		/* time of original diagnosis */
+	void		*ci_bufptr;	/* case data serialization buffer */
+	size_t		ci_bufsiz;
+} fmd_case_t;
+
+
+#define	FMD_B_FALSE	0		/* false value for booleans as int */
+#define	FMD_B_TRUE	1		/* true value for booleans as int */
+
+
+#define	FMD_CASE_UNSOLVED	0	/* case is not yet solved (waiting) */
+#define	FMD_CASE_SOLVED		1	/* case is solved (suspects added) */
+#define	FMD_CASE_CLOSE_WAIT	2	/* case is executing fmdo_close() */
+#define	FMD_CASE_CLOSED		3	/* case is closed (reconfig done) */
+#define	FMD_CASE_REPAIRED	4	/* case is repaired */
+#define	FMD_CASE_RESOLVED	5	/* case is resolved (can be freed) */
+
+#define	FMD_CF_DIRTY		0x01	/* case is in need of checkpoint */
+#define	FMD_CF_SOLVED		0x02	/* case has been solved */
+#define	FMD_CF_ISOLATED		0x04	/* case has been isolated */
+#define	FMD_CF_REPAIRED		0x08	/* case has been repaired */
+#define	FMD_CF_RESOLVED		0x10	/* case has been resolved */
+
+
+#define	FMD_TYPE_BOOL	0		/* int */
+#define	FMD_TYPE_INT32	1		/* int32_t */
+#define	FMD_TYPE_UINT32	2		/* uint32_t */
+#define	FMD_TYPE_INT64	3		/* int64_t */
+#define	FMD_TYPE_UINT64	4		/* uint64_t */
+#define	FMD_TYPE_TIME	5		/* uint64_t */
+#define	FMD_TYPE_SIZE	6		/* uint64_t */
+
+typedef struct fmd_prop {
+	const char *fmdp_name;		/* property name */
+	uint_t fmdp_type;		/* property type (see above) */
+	const char *fmdp_defv;		/* default value */
+} fmd_prop_t;
+
+typedef struct fmd_stat {
+	char fmds_name[32];		/* statistic name */
+	uint_t fmds_type;		/* statistic type (see above) */
+	char fmds_desc[64];		/* statistic description */
+	union {
+		int bool;		/* FMD_TYPE_BOOL */
+		int32_t i32;		/* FMD_TYPE_INT32 */
+		uint32_t ui32;		/* FMD_TYPE_UINT32 */
+		int64_t i64;		/* FMD_TYPE_INT64 */
+		uint64_t ui64;		/* FMD_TYPE_UINT64 */
+	} fmds_value;
+} fmd_stat_t;
+
+typedef struct fmd_hdl_ops {
+	void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *);
+	void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *);
+	void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *);
+	void (*fmdo_stats)(fmd_hdl_t *);
+	void (*fmdo_gc)(fmd_hdl_t *);
+} fmd_hdl_ops_t;
+
+#define	FMD_SEND_SUCCESS	0	/* fmdo_send queued event */
+#define	FMD_SEND_FAILED		1	/* fmdo_send unrecoverable error */
+#define	FMD_SEND_RETRY		2	/* fmdo_send requests retry */
+
+typedef struct fmd_hdl_info {
+	const char *fmdi_desc;		/* fmd client description string */
+	const char *fmdi_vers;		/* fmd client version string */
+	const fmd_hdl_ops_t *fmdi_ops;	/* ops vector for client */
+	const fmd_prop_t *fmdi_props;	/* array of configuration props */
+} fmd_hdl_info_t;
+
+extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *);
+extern void fmd_hdl_unregister(fmd_hdl_t *);
+
+extern void fmd_hdl_setspecific(fmd_hdl_t *, void *);
+extern void *fmd_hdl_getspecific(fmd_hdl_t *);
+
+#define	FMD_SLEEP	UMEM_NOFAIL
+
+extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int);
+extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int);
+extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t);
+
+extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int);
+extern void fmd_hdl_strfree(fmd_hdl_t *, char *);
+
+extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
+extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
+
+extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
+extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
+
+#define	FMD_STAT_NOALLOC	0x0	/* fmd should use caller's memory */
+#define	FMD_STAT_ALLOC		0x1	/* fmd should allocate stats memory */
+
+extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *);
+extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *);
+extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *);
+
+extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *);
+extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *);
+
+extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *);
+extern void fmd_case_uuclose(fmd_hdl_t *, const char *);
+extern int fmd_case_uuclosed(fmd_hdl_t *, const char *);
+extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *);
+extern void fmd_case_uuresolved(fmd_hdl_t *, const char *);
+
+extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *);
+extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *);
+extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *);
+
+extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *);
+extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *);
+
+extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t);
+extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *,
+    const char *, void *, size_t);
+extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *,
+    const char *, const void *, size_t);
+extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
+
+extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
+extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
+extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern void fmd_serd_reset(fmd_hdl_t *, const char *);
+extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
+extern int fmd_serd_fired(fmd_hdl_t *, const char *);
+extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+
+extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
+extern void fmd_timer_remove(fmd_hdl_t *, id_t);
+
+extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *,
+    const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *);
+
+extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *);
+
+#define	FMD_HAS_FAULT_FRU	0
+#define	FMD_HAS_FAULT_ASRU	1
+#define	FMD_HAS_FAULT_RESOURCE	2
+
+extern void fmd_repair_fru(fmd_hdl_t *, const char *);
+extern int fmd_repair_asru(fmd_hdl_t *, const char *);
+
+extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int);
+extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int);
+
+/*
+ * ZED Specific Interfaces
+ */
+
+extern fmd_hdl_t *fmd_module_hdl(const char *);
+extern boolean_t fmd_module_initialized(fmd_hdl_t *);
+extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *);
+
+/* ZFS FMA Retire Agent */
+extern void _zfs_retire_init(fmd_hdl_t *);
+extern void _zfs_retire_fini(fmd_hdl_t *);
+
+/* ZFS FMA Diagnosis Engine */
+extern void _zfs_diagnosis_init(fmd_hdl_t *);
+extern void _zfs_diagnosis_fini(fmd_hdl_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FMD_API_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c
new file mode 100644
index 000000000000..d4ec37fb7691
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/list.h>
+#include <sys/time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+#include "../zed_log.h"
+
+
+#define	FMD_STR_BUCKETS		211
+
+
+#ifdef SERD_ENG_DEBUG	/* debug builds: send SERD traces to the ZED log */
+#define	serd_log_msg(fmt, ...) \
+	zed_log_msg(LOG_INFO, fmt, __VA_ARGS__)
+#else	/* otherwise serd_log_msg() expands to nothing */
+#define	serd_log_msg(fmt, ...)
+#endif	/* SERD_ENG_DEBUG */
+
+
+/*
+ * SERD Engine Backend
+ */
+
+/*
+ * Return the nanoseconds elapsed from 't1' to 't2'.  Replayed old events can
+ * make 't2' numerically smaller than 't1'; the stamps are compared as unsigned
+ * 64-bit values and that case is treated as a wrapped high-res clock.
+ */
+static hrtime_t
+fmd_event_delta(hrtime_t t1, hrtime_t t2)
+{
+	const uint64_t earlier = t1, later = t2;
+
+	if (later < earlier)
+		return ((UINT64_MAX - earlier) + later + 1);
+	return (later - earlier);
+}
+
+/*
+ * Allocate a zeroed SERD engine named 'name' (copied) with N = 'n' events
+ * and T = 't' nanoseconds.  Out of memory is reported through zed_log_die().
+ */
+static fmd_serd_eng_t *
+fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t)
+{
+	fmd_serd_eng_t *sgp = calloc(1, sizeof (fmd_serd_eng_t));
+
+	if (sgp == NULL || (sgp->sg_name = strdup(name)) == NULL)
+		zed_log_die("Failed to allocate SERD engine %s", name);
+	sgp->sg_flags = FMD_SERD_DIRTY;	/* a new engine needs checkpointing */
+	sgp->sg_n = n;
+	sgp->sg_t = t;
+	list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t),
+	    offsetof(fmd_serd_elem_t, se_list));
+	return (sgp);
+}
+
+/* Tear down an engine: drop its events, then its list, name and itself. */
+static void fmd_serd_eng_free(fmd_serd_eng_t *sgp)
+{
+	fmd_serd_eng_reset(sgp);	/* may still log sg_name, so free it later */
+	list_destroy(&sgp->sg_list);
+	free(sgp->sg_name);
+	free(sgp);
+}
+
+/*
+ * String hash, sourced from fmd_string.c.  Each character is added to the
+ * running value after shifting it left by four bits; whenever bits under the
+ * 0xf0000000 mask become set they are folded into lower bits and cleared.
+ */
+static ulong_t
+fmd_strhash(const char *key)
+{
+	ulong_t hash = 0, top;
+
+	while (*key != '\0') {
+		hash = (hash << 4) + *key++;
+		top = hash & 0xf0000000;
+		if (top != 0) {
+			hash ^= (top >> 24);
+			hash ^= top;
+		}
+	}
+	return (hash);
+}
+
+/* Initialize 'shp' as an empty hash of FMD_STR_BUCKETS zeroed buckets. */
+void fmd_serd_hash_create(fmd_serd_hash_t *shp)
+{
+	shp->sh_count = 0;
+	shp->sh_hashlen = FMD_STR_BUCKETS;
+	shp->sh_hash = calloc(FMD_STR_BUCKETS, sizeof (void *));
+}
+
+/* Free every engine on every chain, then the bucket array; zeroes 'shp'. */
+void fmd_serd_hash_destroy(fmd_serd_hash_t *shp)
+{
+	uint_t bucket;
+
+	for (bucket = 0; bucket < shp->sh_hashlen; bucket++) {
+		fmd_serd_eng_t *eng = shp->sh_hash[bucket];
+		while (eng != NULL) {
+			fmd_serd_eng_t *next = eng->sg_next;
+			fmd_serd_eng_free(eng);	/* 'next' was read first */
+			eng = next;
+		}
+	}
+	free(shp->sh_hash);
+	bzero(shp, sizeof (fmd_serd_hash_t));
+}
+
+/* Invoke func(engine, arg) on every engine in the hash, bucket by bucket. */
+void
+fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg)
+{
+	uint_t bucket = 0;
+	while (bucket < shp->sh_hashlen) {
+		fmd_serd_eng_t *eng = shp->sh_hash[bucket++];
+		for (; eng != NULL; eng = eng->sg_next)
+			func(eng, arg);
+	}
+}
+
+/* Create engine 'name' (N = 'n', T = 't') and push it on its hash chain. */
+fmd_serd_eng_t *
+fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name,
+    uint_t n, hrtime_t t)
+{
+	fmd_serd_eng_t **bucket, *eng;
+
+	bucket = &shp->sh_hash[fmd_strhash(name) % shp->sh_hashlen];
+	eng = fmd_serd_eng_alloc(name, n, t);
+	serd_log_msg("  SERD Engine: inserting  %s N %d T %llu",
+	    name, (int)n, (long long unsigned)t);
+	eng->sg_next = *bucket;	/* the new engine becomes the chain head */
+	*bucket = eng;
+	shp->sh_count++;
+	return (eng);
+}
+
+/*
+ * Return the engine named 'name', or NULL if the hash has no such engine.
+ */
+fmd_serd_eng_t *
+fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name)
+{
+	fmd_serd_eng_t *eng;
+
+	eng = shp->sh_hash[fmd_strhash(name) % shp->sh_hashlen];
+	while (eng != NULL && strcmp(name, eng->sg_name) != 0)
+		eng = eng->sg_next;
+	return (eng);	/* NULL once the chain is exhausted */
+}
+
+/*
+ * Unlink the engine named 'name' from its hash chain and free it.  Deleting
+ * a name that is not in the hash is a no-op.
+ */
+void
+fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name)
+{
+	fmd_serd_eng_t **link, *eng;
+
+	link = &shp->sh_hash[fmd_strhash(name) % shp->sh_hashlen];
+	serd_log_msg("  SERD Engine: deleting %s", name);
+
+	/* walk until 'link' is the pointer that refers to the match */
+	while ((eng = *link) != NULL && strcmp(eng->sg_name, name) != 0)
+		link = &eng->sg_next;
+	if (eng == NULL)
+		return;
+	*link = eng->sg_next;
+	fmd_serd_eng_free(eng);
+	assert(shp->sh_count != 0);
+	shp->sh_count--;
+}
+
+/* Unlink event 'sep' from engine 'sgp', free it and drop the event count. */
+static void
+fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep)
+{
+	list_remove(&sgp->sg_list, sep);
+	free(sep);
+
+	sgp->sg_count -= 1;
+	serd_log_msg("  SERD Engine: discarding %s, %d remaining",
+	    sgp->sg_name, (int)sgp->sg_count);
+}
+
+/*
+ * Record an event stamped 'hrt'.  Returns FMD_B_TRUE only on the call that
+ * fires the engine (N events within T ns); FMD_B_FALSE otherwise, including
+ * every call made after firing until fmd_serd_eng_reset() is called.
+ */
+int
+fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt)
+{
+	fmd_serd_elem_t *sep, *oep;
+
+	/* A fired engine discards new events until it is reset. */
+	if (sgp->sg_flags & FMD_SERD_FIRED) {
+		serd_log_msg("  SERD Engine: record %s already fired!",
+		    sgp->sg_name);
+		return (FMD_B_FALSE);
+	}
+
+	/*
+	 * Make room by dropping the oldest events.  The emptiness check keeps
+	 * an engine created with N == 0 from discarding from an empty list.
+	 */
+	while (sgp->sg_count != 0 && sgp->sg_count >= sgp->sg_n)
+		fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list));
+
+	sep = malloc(sizeof (fmd_serd_elem_t));
+	if (sep == NULL)	/* out of memory: do not write through NULL */
+		zed_log_die("Failed to allocate SERD event");
+	sep->se_hrt = hrt;
+	list_insert_head(&sgp->sg_list, sep);
+	sgp->sg_count++;
+	sgp->sg_flags |= FMD_SERD_DIRTY;
+
+	serd_log_msg("  SERD Engine: recording %s of %d (%llu)",
+	    sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt);
+
+	/* Fetch the oldest event only now: it may be 'sep' itself. */
+	oep = list_tail(&sgp->sg_list);
+	if (sgp->sg_count < sgp->sg_n ||
+	    fmd_event_delta(oep->se_hrt, sep->se_hrt) > sgp->sg_t)
+		return (FMD_B_FALSE);
+
+	sgp->sg_flags |= FMD_SERD_FIRED;
+	serd_log_msg("  SERD Engine: fired %s", sgp->sg_name);
+	return (FMD_B_TRUE);
+}
+
+/* Return nonzero if the engine has fired and has not been reset since. */
+int fmd_serd_eng_fired(fmd_serd_eng_t *sgp)
+{
+	return (FMD_SERD_FIRED & sgp->sg_flags);
+}
+
+/* Return nonzero if the engine currently holds no recorded events. */
+int fmd_serd_eng_empty(fmd_serd_eng_t *sgp)
+{
+	return (!sgp->sg_count);
+}
+
+/* Discard every recorded event, clear the fired state and mark it dirty. */
+void
+fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
+{
+	serd_log_msg("  SERD Engine: resetting %s", sgp->sg_name);
+
+	while (!fmd_serd_eng_empty(sgp))
+		fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list));
+
+	sgp->sg_flags = (sgp->sg_flags & ~FMD_SERD_FIRED) | FMD_SERD_DIRTY;
+}
+
+/*
+ * Discard events more than T older than the newest one.  New events go on the
+ * list head, so scan from the tail (oldest); a head-first scan frees nothing.
+ */
+void
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+{
+	fmd_serd_elem_t *sep, *pep;
+	hrtime_t hrt;
+
+	if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED))
+		return; /* no garbage collection needed if empty or fired */
+
+	if ((sep = list_head(&sgp->sg_list)) == NULL)
+		return;
+	hrt = sep->se_hrt - sgp->sg_t;	/* oldest stamp still within T */
+	for (sep = list_tail(&sgp->sg_list); sep != NULL; sep = pep) {
+		if (sep->se_hrt >= hrt)
+			break; /* sep and all newer events are within T */
+		pep = list_prev(&sgp->sg_list, sep);	/* read before free */
+		fmd_serd_eng_discard(sgp, sep);
+		sgp->sg_flags |= FMD_SERD_DIRTY;
+	}
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h
new file mode 100644
index 000000000000..c35c9acc7785
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef	_FMD_SERD_H
+#define	_FMD_SERD_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/list.h>
+#include <sys/time.h>
+
+typedef struct fmd_serd_elem {
+	list_node_t	se_list;	/* linkage on the engine's sg_list */
+	hrtime_t	se_hrt;		/* upper bound on event hrtime (ns) */
+} fmd_serd_elem_t;
+
+typedef struct fmd_serd_eng {
+	char		*sg_name;	/* engine name; hash lookup key */
+	struct fmd_serd_eng *sg_next;	/* next engine on hash chain */
+	list_t		sg_list;	/* fmd_serd_elem_t's, newest at head */
+	uint_t		sg_count;	/* count of events in sg_list */
+	uint_t		sg_flags;	/* FMD_SERD_FIRED | FMD_SERD_DIRTY */
+	uint_t		sg_n;		/* engine N parameter (event count) */
+	hrtime_t	sg_t;		/* engine T parameter (nanoseconds) */
+} fmd_serd_eng_t;
+
+#define	FMD_SERD_FIRED	0x1		/* error rate has exceeded threshold */
+#define	FMD_SERD_DIRTY	0x2		/* engine needs to be checkpointed */
+
+typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *);
+
+typedef struct fmd_serd_hash {
+	fmd_serd_eng_t	**sh_hash;	/* hash bucket array of engine chains */
+	uint_t		sh_hashlen;	/* length of hash bucket array */
+	uint_t		sh_count;	/* count of engines in hash */
+} fmd_serd_hash_t;
+
+extern void fmd_serd_hash_create(fmd_serd_hash_t *);
+extern void fmd_serd_hash_destroy(fmd_serd_hash_t *);
+extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *);
+
+extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *,
+    const char *, uint32_t, hrtime_t);
+
+extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *);
+extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *);
+
+extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t);
+extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
+extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
+
+extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FMD_SERD_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
new file mode 100644
index 000000000000..006e0ab99f47
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
@@ -0,0 +1,422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+#include "../zed_log.h"
+
+/*
+ * agent dispatch code
+ */
+
+static pthread_mutex_t	agent_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t	agent_cond = PTHREAD_COND_INITIALIZER;
+static list_t		agent_events;	/* list of pending events */
+static int		agent_exiting;
+
+typedef struct agent_event {
+	char		ae_class[64];	/* event class, truncated to fit */
+	char		ae_subclass[32]; /* event subclass, "" when none */
+	nvlist_t	*ae_nvl;	/* private copy of the event payload */
+	list_node_t	ae_node;	/* linkage on agent_events */
+} agent_event_t;
+
+pthread_t g_agents_tid;
+
+libzfs_handle_t *g_zfs_hdl;
+
+/* guid search data */
+typedef enum device_type {
+	DEVICE_TYPE_L2ARC,	/* l2arc device; 0, so a zeroed search has it */
+	DEVICE_TYPE_SPARE,	/* spare device */
+	DEVICE_TYPE_PRIMARY	/* any primary pool storage device */
+} device_type_t;
+
+typedef struct guid_search {
+	uint64_t	gs_pool_guid;	/* out: guid of the matching pool */
+	uint64_t	gs_vdev_guid;	/* out: guid of the matching vdev */
+	char		*gs_devid;	/* in: devid to look for, may be NULL */
+	device_type_t	gs_vdev_type;	/* out: class of the matching vdev */
+	uint64_t	gs_vdev_expandtime;	/* vdev expansion time */
+} guid_search_t;
+
+/*
+ * Walks the vdev tree recursively looking for a vdev whose devid equals
+ * gsp->gs_devid.  A match fills in the vdev guid and expansion time, and the
+ * caller that reached it records whether it sits among the regular children
+ * (PRIMARY), the spares (SPARE) or the cache devices (L2ARC).
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
+ */
+static boolean_t
+zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
+{
+	guid_search_t *gsp = arg;
+	char *path = NULL;
+	uint_t c, children;
+	nvlist_t **child;
+
+	/* First iterate over any children. */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
+				return (B_TRUE);
+			}
+		}
+	}
+	/* Then any spares; a match here is a spare, not an l2arc device. */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
+				return (B_TRUE);
+			}
+		}
+	}
+	/* Finally any cache devices, which are the l2arc ones. */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
+				return (B_TRUE);
+			}
+		}
+	}
+	/*
+	 * On a devid match, grab the vdev guid and expansion time, if any.
+	 */
+	if (gsp->gs_devid != NULL &&
+	    (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
+	    (strcmp(gsp->gs_devid, path) == 0)) {
+		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+		    &gsp->gs_vdev_guid);
+		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
+		    &gsp->gs_vdev_expandtime);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * zpool_iter() callback: search one pool's vdev tree for gsp->gs_devid and,
+ * once a vdev guid has been found, record the pool guid as well.  Always
+ * closes 'zhp'; returns nonzero iff a vdev guid has been found.
+ */
+static int
+zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
+{
+	guid_search_t *search = arg;
+	nvlist_t *vdev_tree, *config = zpool_get_config(zhp, NULL);
+	int found;
+
+	/* For each vdev in this pool, look for a match by devid */
+	if (config != NULL && nvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0)
+		(void) zfs_agent_iter_vdev(zhp, vdev_tree, search);
+
+	/* if a match was found then grab the pool guid */
+	found = (search->gs_vdev_guid != 0);
+	if (found)
+		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &search->gs_pool_guid);
+
+	zpool_close(zhp);
+	return (found);
+}
+
+/*
+ * Queue a copy of 'nvl' for the agent consumer thread; the caller keeps 'nvl'.
+ * EC_DEV_REMOVE disk events are remapped to "resource.fs.zfs.removed".
+ */
+void
+zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	agent_event_t *event;
+
+	if (subclass == NULL)
+		subclass = "";
+	event = malloc(sizeof (agent_event_t));
+	if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
+		free(event);	/* free(NULL) is a no-op */
+		return;
+	}
+	if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
+		class = EC_ZFS;
+		subclass = ESC_ZFS_VDEV_CHECK;
+	}
+
+	/*
+	 * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED
+	 * ereport from vdev_disk layer after a hot unplug. Fortunately we
+	 * get a EC_DEV_REMOVE from our disk monitor and it is a suitable
+	 * proxy so we remap it here for the benefit of the diagnosis engine.
+	 */
+	if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
+	    (strcmp(subclass, ESC_DISK) == 0) &&
+	    (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
+	    nvlist_exists(nvl, DEV_IDENTIFIER))) {
+		nvlist_t *payload = event->ae_nvl;
+		struct timeval tv;
+		int64_t tod[2];
+		uint64_t pool_guid = 0, vdev_guid = 0;
+		guid_search_t search = { 0 };
+		device_type_t devtype = DEVICE_TYPE_PRIMARY;
+
+		class = "resource.fs.zfs.removed";
+		subclass = "";
+		(void) nvlist_add_string(payload, FM_CLASS, class);
+		(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
+		(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
+		(void) gettimeofday(&tv, NULL);
+		tod[0] = tv.tv_sec;
+		tod[1] = tv.tv_usec;
+		(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
+		/*
+		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
+		 * ZFS_EV_POOL_GUID may be missing, so search by devid.  Guids
+		 * from the event win; the type is taken only from a real match.
+		 */
+		(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
+		    &search.gs_devid);
+		(void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+		if (pool_guid == 0)
+			pool_guid = search.gs_pool_guid;
+		if (vdev_guid == 0)
+			vdev_guid = search.gs_vdev_guid;
+		if (search.gs_vdev_guid != 0)
+			devtype = search.gs_vdev_type;
+
+		/*
+		 * We want to avoid reporting "remove" events coming from
+		 * libudev for VDEVs which were expanded recently (10s) and
+		 * avoid activating spares in response to partitions being
+		 * deleted and created in rapid succession.
+		 */
+		if (search.gs_vdev_expandtime != 0 &&
+		    search.gs_vdev_expandtime + 10 > tv.tv_sec) {
+			zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
+			    "for recently expanded device '%s'", EC_DEV_REMOVE,
+			    search.gs_devid);
+			nvlist_free(event->ae_nvl);	/* never queued: free */
+			free(event);
+			goto out;
+		}
+		(void) nvlist_add_uint64(payload,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
+		(void) nvlist_add_uint64(payload,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
+		switch (devtype) {
+		case DEVICE_TYPE_L2ARC:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+			    VDEV_TYPE_L2CACHE);
+			break;
+		case DEVICE_TYPE_SPARE:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
+			break;
+		case DEVICE_TYPE_PRIMARY:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
+			break;
+		}
+		zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
+		    EC_DEV_REMOVE, class);
+	}
+
+	(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
+	(void) strlcpy(event->ae_subclass, subclass,
+	    sizeof (event->ae_subclass));
+	(void) pthread_mutex_lock(&agent_lock);
+	list_insert_tail(&agent_events, event);
+	(void) pthread_mutex_unlock(&agent_lock);
+out:
+	(void) pthread_cond_signal(&agent_cond);
+}
+
+/* Deliver one event to every agent whose subscription matches 'class'. */
+static void
+zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	/* vdev_remove is wanted by both the diagnosis and retire modules */
+	const int vdev_remove = !strcmp(class, "sysevent.fs.zfs.vdev_remove");
+
+	/*
+	 * The diagnosis engine subscribes to the following events.
+	 * On illumos these subscriptions reside in:
+	 * 	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
+	 */
+	if (vdev_remove ||
+	    strstr(class, "ereport.fs.zfs.") != NULL ||
+	    strstr(class, "resource.fs.zfs.") != NULL ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0)
+		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
+
+	/*
+	 * The retire agent subscribes to the following events.
+	 * On illumos these subscriptions reside in:
+	 * 	/usr/lib/fm/fmd/plugins/zfs-retire.conf
+	 *
+	 * NOTE: faults events come directly from our diagnosis engine
+	 * and will not pass through the zfs kernel module.
+	 */
+	if (vdev_remove ||
+	    strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
+	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
+	    strcmp(class, "resource.fs.zfs.statechange") == 0)
+		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
+
+	/*
+	 * The SLM module only consumes disk events and vdev check events
+	 *
+	 * NOTE: disk events come directly from disk monitor and will
+	 * not pass through the zfs kernel module.
+	 */
+	if (strstr(class, "EC_dev_") != NULL || strcmp(class, EC_ZFS) == 0)
+		(void) zfs_slm_event(class, subclass, nvl);
+}
+
+/*
+ * Consumer thread: dequeues one pending event at a time and dispatches it.
+ * agent_lock is dropped before calling the agents because an agent can
+ * itself post an event.  Returns NULL once agent_exiting is observed; events
+ * still queued at that point are left on agent_events.  'arg' is unused.
+ */
+static void *
+zfs_agent_consumer_thread(void *arg)
+{
+	for (;;) {
+		agent_event_t *event;
+
+		(void) pthread_mutex_lock(&agent_lock);
+
+		/* wait for an event to show up */
+		while (!agent_exiting && list_is_empty(&agent_events))
+			(void) pthread_cond_wait(&agent_cond, &agent_lock);
+
+		if (agent_exiting) {
+			(void) pthread_mutex_unlock(&agent_lock);
+			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
+			    "exiting");
+			return (NULL);
+		}
+
+		if ((event = (list_head(&agent_events))) != NULL) {
+			list_remove(&agent_events, event);
+			/* off the queue: drop the lock before dispatching */
+			(void) pthread_mutex_unlock(&agent_lock);
+
+			/* dispatch to all event subscribers */
+			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
+			    event->ae_nvl);
+			/* this thread owns the dequeued event: release it */
+			nvlist_free(event->ae_nvl);
+			free(event);
+			continue;
+		}
+		/* nothing was queued after all: unlock and wait again */
+		(void) pthread_mutex_unlock(&agent_lock);
+	}
+	/* not reached: the loop above only exits by returning */
+	return (NULL);
+}
+
+void
+zfs_agent_init(libzfs_handle_t *zfs_hdl)
+{
+	fmd_hdl_t *hdl;
+	/* Start SLM, both fmd modules, the event queue, then the consumer. */
+	g_zfs_hdl = zfs_hdl;
+
+	if (zfs_slm_init() != 0)
+		zed_log_die("Failed to initialize zfs slm");
+	zed_log_msg(LOG_INFO, "Add Agent: init");
+	/* diagnosis engine; fmd_module_initialized() checks the init result */
+	hdl = fmd_module_hdl("zfs-diagnosis");
+	_zfs_diagnosis_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs diagnosis");
+	/* retire agent, checked the same way */
+	hdl = fmd_module_hdl("zfs-retire");
+	_zfs_retire_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs retire");
+	/* create the queue before the thread that consumes it */
+	list_create(&agent_events, sizeof (agent_event_t),
+	    offsetof(struct agent_event, ae_node));
+
+	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
+	    NULL) != 0) {
+		list_destroy(&agent_events);
+		zed_log_die("Failed to initialize agents");
+	}
+}
+
+/* Stop the consumer thread, free queued events and shut the agents down. */
+void
+zfs_agent_fini(void)
+{
+	fmd_hdl_t *hdl;
+	agent_event_t *event;
+
+	/* Flag + signal under agent_lock, or the consumer can miss the wakeup. */
+	(void) pthread_mutex_lock(&agent_lock);
+	agent_exiting = 1;
+	(void) pthread_cond_signal(&agent_cond);
+	(void) pthread_mutex_unlock(&agent_lock);
+
+	/* wait for the agent consumer thread to complete */
+	(void) pthread_join(g_agents_tid, NULL);
+
+	/* drain any pending events */
+	while ((event = (list_head(&agent_events))) != NULL) {
+		list_remove(&agent_events, event);
+		nvlist_free(event->ae_nvl);
+		free(event);
+	}
+	list_destroy(&agent_events);
+	if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
+		_zfs_retire_fini(hdl);
+		fmd_hdl_unregister(hdl);
+	}
+	if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
+		_zfs_diagnosis_fini(hdl);
+		fmd_hdl_unregister(hdl);
+	}
+	zed_log_msg(LOG_INFO, "Add Agent: fini");
+	zfs_slm_fini();
+	g_zfs_hdl = NULL;
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h
new file mode 100644
index 000000000000..d1a459139b1e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef	ZFS_AGENTS_H
+#define	ZFS_AGENTS_H
+
+#include <libzfs.h>
+#include <libnvpair.h>
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Agent abstraction presented to ZED
+ */
+extern void zfs_agent_init(libzfs_handle_t *);
+extern void zfs_agent_fini(void);
+extern void zfs_agent_post_event(const char *, const char *, nvlist_t *);
+
+/*
+ * ZFS Sysevent Linkable Module (SLM)
+ */
+extern int zfs_slm_init(void);
+extern void zfs_slm_fini(void);
+extern void zfs_slm_event(const char *, const char *, nvlist_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* !ZFS_AGENTS_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
new file mode 100644
index 000000000000..0b27f6702ee8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <libzfs.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+/*
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
+ * #define reserves enough space for two 64-bit hex values plus the length of
+ * the longest string.
+ */
+#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
+
+/*
+ * On-disk case structure.  This must maintain backwards compatibility with
+ * previous versions of the DE.  By default, any members appended to the end
+ * will be filled with zeros if they don't exist in a previous version.
+ */
+typedef struct zfs_case_data {
+	uint64_t	zc_version;	/* one of CASE_DATA_VERSION_* */
+	uint64_t	zc_ena;		/* presumably an ereport ENA; confirm */
+	uint64_t	zc_pool_guid;	/* pool the case is recorded against */
+	uint64_t	zc_vdev_guid;	/* vdev guid; 0 for a pool-wide case */
+	int		zc_pool_state;	/* NOTE(review): use not visible here */
+	char		zc_serd_checksum[MAX_SERDLEN]; /* SERD engine name */
+	char		zc_serd_io[MAX_SERDLEN];	/* SERD engine name */
+	int		zc_has_remove_timer; /* nonzero: timer re-installed */
+} zfs_case_data_t;
+
+/*
+ * Time-of-day, filled from the two elements of a pool config time array.
+ */
+typedef struct er_timeval {
+	uint64_t	ertv_sec;	/* element 0; seconds, per the name */
+	uint64_t	ertv_nsec;	/* element 1; presumably nanoseconds */
+} er_timeval_t;
+
+/*
+ * In-core case structure.
+ */
+typedef struct zfs_case {
+	boolean_t	zc_present;	/* pool/vdev seen by the last purge */
+	uint32_t	zc_version;	/* NOTE(review): use not visible here */
+	zfs_case_data_t	zc_data;	/* persistent part, read via CASE_DATA */
+	fmd_case_t	*zc_case;	/* the fmd case this state belongs to */
+	uu_list_node_t	zc_node;	/* linkage on zfs_cases */
+	id_t		zc_remove_timer; /* id from fmd_timer_install() */
+	char		*zc_fru;	/* NOTE(review): use not visible here */
+	er_timeval_t	zc_when;	/* load time of the case's pool */
+} zfs_case_t;
+
+#define	CASE_DATA			"data"
+#define	CASE_FRU			"fru"
+#define	CASE_DATA_VERSION_INITIAL	1
+#define	CASE_DATA_VERSION_SERD		2
+
+typedef struct zfs_de_stats {	/* one counter per entry of zfs_stats below */
+	fmd_stat_t	old_drops;
+	fmd_stat_t	dev_drops;
+	fmd_stat_t	vdev_drops;
+	fmd_stat_t	import_drops;
+	fmd_stat_t	resource_drops;
+} zfs_de_stats_t;
+
+zfs_de_stats_t zfs_stats = {
+	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
+	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
+	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
+	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
+	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+};
+
+static hrtime_t zfs_remove_timeout;
+
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+#define	ZFS_MAKE_RSRC(type)	\
+    FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
+#define	ZFS_MAKE_EREPORT(type)	\
+    FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+
+/*
+ * Stamp the case data with CASE_DATA_VERSION_SERD; nothing is written out.
+ */
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
+}
+
+/*
+ * Rebuild the in-core state of case 'cp' from its CASE_DATA buffer, link it
+ * onto zfs_cases and attach it to the case.  Returns NULL, after freeing the
+ * allocation, if the stored version is newer than CASE_DATA_VERSION_SERD.
+ */
+static zfs_case_t *
+zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	zfs_case_t *zcp = fmd_hdl_zalloc(hdl, sizeof (*zcp), FMD_SLEEP);
+	zfs_case_data_t *data = &zcp->zc_data;
+
+	zcp->zc_case = cp;
+	fmd_buf_read(hdl, cp, CASE_DATA, data, sizeof (*data));
+
+	/* a version above the newest one known here cannot be interpreted */
+	if (data->zc_version > CASE_DATA_VERSION_SERD) {
+		fmd_hdl_free(hdl, zcp, sizeof (*zcp));
+		return (NULL);
+	}
+
+	/*
+	 * fmd_buf_read() will have already zeroed out the remainder of the
+	 * buffer, so we don't have to do anything special if the version
+	 * doesn't include the SERD engine name.
+	 */
+	if (data->zc_has_remove_timer != 0) {
+		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
+		    NULL, zfs_remove_timeout);
+	}
+
+	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
+	(void) uu_list_insert_before(zfs_cases, NULL, zcp);
+	fmd_case_setspecific(hdl, cp, zcp);
+
+	return (zcp);
+}
+
+/*
+ * Mark as present every case recorded against (pool_guid, guid of 'vd') and
+ * stamp it with the pool load time 'loaded', then do the same for each vdev
+ * in the children, l2cache and spares arrays of 'vd'.  Cases that are still
+ * unmarked after all pools were walked are closed by zfs_purge_cases().
+ */
+static void
+zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
+{
+	const char *subtrees[] = {
+		ZPOOL_CONFIG_CHILDREN,
+		ZPOOL_CONFIG_L2CACHE,
+		ZPOOL_CONFIG_SPARES
+	};
+	uint64_t vdev_guid = 0;
+	uint_t kids, k, s;
+	nvlist_t **kid;
+	zfs_case_t *zcp;
+
+	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
+
+	/*
+	 * Mark any cases associated with this (pool, vdev) pair.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid != pool_guid ||
+		    zcp->zc_data.zc_vdev_guid != vdev_guid)
+			continue;
+
+		zcp->zc_present = B_TRUE;
+		zcp->zc_when = *loaded;
+	}
+
+	/*
+	 * Recurse into each vdev array that 'vd' carries, in the order listed
+	 * in subtrees[]; an array that is absent is simply skipped.
+	 */
+	for (s = 0; s < sizeof (subtrees) / sizeof (subtrees[0]); s++) {
+		if (nvlist_lookup_nvlist_array(vd, subtrees[s], &kid,
+		    &kids) != 0)
+			continue;
+
+		for (k = 0; k < kids; k++)
+			zfs_mark_vdev(pool_guid, kid[k], loaded);
+	}
+}
+
+/*
+ * zpool_iter() callback: mark every case that belongs to this pool, or to
+ * one of its vdevs, as present.  Pool-wide cases (vdev guid 0) also get the
+ * pool's load time when the config provides one.  Always closes 'zhp';
+ * returns 0 on success and -1 if the config or vdev tree is unavailable.
+ */
+/*ARGSUSED*/
+static int
+zfs_mark_pool(zpool_handle_t *zhp, void *unused)
+{
+	er_timeval_t loaded = { 0 };
+	nvlist_t *config, *vdev_tree;
+	uint64_t *tod, pool_guid;
+	zfs_case_t *zcp;
+	uint_t nelem = 0;
+	int err = -1;
+
+	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+
+	/* Mark any cases associated with just this pool. */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid &&
+		    zcp->zc_data.zc_vdev_guid == 0)
+			zcp->zc_present = B_TRUE;
+	}
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL)
+		goto done;
+
+	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem);
+	/* stamp the pool-wide cases with the pool's load time */
+	if (nelem == 2) {
+		loaded.ertv_sec = tod[0];
+		loaded.ertv_nsec = tod[1];
+		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+		    zcp = uu_list_next(zfs_cases, zcp)) {
+			if (zcp->zc_data.zc_pool_guid == pool_guid &&
+			    zcp->zc_data.zc_vdev_guid == 0)
+				zcp->zc_when = loaded;
+		}
+	}
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &vdev_tree) == 0) {
+		zfs_mark_vdev(pool_guid, vdev_tree, &loaded);
+		err = 0;
+	}
+
+done:
+	zpool_close(zhp);
+	return (err);
+}
+
+struct load_time_arg {
+	uint64_t lt_guid;	/* in: guid of the pool to look for */
+	er_timeval_t *lt_time;	/* out: that pool's load time */
+	boolean_t lt_found;	/* out: set once lt_time has been filled in */
+};
+
+/*
+ * zpool_iter() callback: when 'zhp' is the pool with guid lta->lt_guid, copy
+ * its ZPOOL_CONFIG_LOADED_TIME into lta->lt_time and set lta->lt_found.
+ * Always closes 'zhp'.  Returns -1 only if that pool has no config, else 0.
+ */
+static int
+zpool_find_load_time(zpool_handle_t *zhp, void *arg)
+{
+	struct load_time_arg *lta = arg;
+	nvlist_t *config;
+	uint64_t *tod;
+	uint_t nelem;
+	int err = 0;
+
+	/* nothing to do once found, or for any other pool */
+	if (lta->lt_found ||
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL) != lta->lt_guid)
+		goto done;
+
+	config = zpool_get_config(zhp, NULL);
+	if (config == NULL) {
+		err = -1;
+		goto done;
+	}
+
+	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem) == 0 && nelem == 2) {
+		lta->lt_found = B_TRUE;
+		lta->lt_time->ertv_sec = tod[0];
+		lta->lt_time->ertv_nsec = tod[1];
+	}
+
+done:
+	zpool_close(zhp);
+	return (err);
+}
+
+/*
+ * Close every case whose pool or vdev can no longer be found.  Cases are
+ * left untouched when the pools cannot be enumerated reliably.
+ */
+static void
+zfs_purge_cases(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+	uu_list_walk_t *walk;
+	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+
+	/*
+	 * There is no way to open a pool by GUID, or lookup a vdev by GUID, so
+	 * an O(vdevs * cases) search is unavoidable; both quantities are
+	 * likely too small to matter.  Iterating over pools costs more than
+	 * iterating over the in-memory case list, so each case carries a
+	 * 'present' flag: clear them all, walk the pools marking what is
+	 * still there, then close whatever was left unmarked.  (Building an
+	 * FMRI for fmd_nvl_fmri_present() would end up doing the same search.)
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp))
+		zcp->zc_present = B_FALSE;
+
+	/*
+	 * Iterate over all pools and mark the pools and vdevs found.  If this
+	 * fails (most probably because we're out of memory), then don't close
+	 * any of the cases, as we cannot be sure the marks are accurate.
+	 */
+	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
+		return;
+
+	/*
+	 * Remove those cases which were not found.  The walk is robust because
+	 * closing a case is expected to reach zfs_fm_close(), which unlinks it
+	 * from zfs_cases.  Bail out if the walker cannot be allocated.
+	 */
+	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+	if (walk == NULL)
+		return;
+	while ((zcp = uu_list_walk_next(walk)) != NULL) {
+		if (!zcp->zc_present)
+			fmd_case_close(hdl, zcp->zc_case);
+	}
+	uu_list_walk_end(walk);
+}
+
+/*
+ * Format the SERD engine name "zfs_<pool guid>_<vdev guid>_<type>" (guids in
+ * hex) into buf (MAX_SERDLEN bytes).  Callers pass "io" or "checksum".
+ */
+static void
+zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
+    const char *type)
+{
+	unsigned long long pool = pool_guid, vdev = vdev_guid;
+
+	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", pool, vdev, type);
+}
+
+/*
+ * Solve a given ZFS case: attach one suspect named 'faultname' whose
+ * detector is the pool or vdev recorded in the case data, mark the case
+ * solved, and cancel any removal timer still pending for it.  The
+ * 'checkunusable' argument is accepted but not consulted.
+ */
+static void
+zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
+    boolean_t checkunusable)
+{
+	nvlist_t *fru = NULL;	/* no FRU is ever supplied */
+	nvlist_t *detector;
+	nvlist_t *fault;
+
+	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
+
+	/*
+	 * The detector is a ZFS-scheme FMRI naming the pool, plus the vdev
+	 * when this is a vdev-level case.
+	 */
+	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
+	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
+	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
+	    zcp->zc_data.zc_pool_guid);
+	if (zcp->zc_data.zc_vdev_guid != 0) {
+		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
+		    zcp->zc_data.zc_vdev_guid);
+	}
+
+	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
+	    fru, detector);
+	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+
+	nvlist_free(fru);
+
+	fmd_case_solve(hdl, zcp->zc_case);
+
+	/* A solved case no longer waits for a device-removal event. */
+	if (zcp->zc_data.zc_has_remove_timer) {
+		fmd_timer_remove(hdl, zcp->zc_remove_timer);
+		zcp->zc_data.zc_has_remove_timer = 0;
+		/* Persist the cleared timer flag with the case data. */
+		zfs_case_serialize(hdl, zcp);
+	}
+
+	nvlist_free(detector);
+}
+
+static boolean_t	/* B_TRUE iff *a is strictly earlier than *b */
+timeval_earlier(er_timeval_t *a, er_timeval_t *b)
+{
+	return (a->ertv_sec != b->ertv_sec ?
+	    a->ertv_sec < b->ertv_sec : a->ertv_nsec < b->ertv_nsec);
+}
+
+/*ARGSUSED*/
+static void
+zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
+{
+	int64_t *stamp;
+	uint_t count;
+
+	/* Both fields stay UINT64_MAX unless a two-element time is found. */
+	when->ertv_sec = when->ertv_nsec = UINT64_MAX;
+	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &stamp,
+	    &count) == 0 && count == 2) {
+		when->ertv_sec = stamp[0];
+		when->ertv_nsec = stamp[1];
+	}
+}
+
+/*
+ * Main fmd entry point.  Handles sysevents, resource events and ereports.
+ */
+/*ARGSUSED*/
+static void
+zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+	zfs_case_t *zcp, *dcp, *next;
+	int32_t pool_state = SPA_LOAD_NONE;	/* if the payload lacks it */
+	uint64_t ena, pool_guid = 0, vdev_guid;
+	er_timeval_t pool_load;
+	er_timeval_t er_when;
+	nvlist_t *detector;
+	boolean_t pool_found = B_FALSE;
+	boolean_t isresource;
+	char *type;
+
+	/*
+	 * We subscribe to notifications for vdev or pool removal.  In these
+	 * cases, there may be cases that no longer apply.  Purge any cases
+	 * that no longer apply.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
+		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
+		    strrchr(class, '.') + 1);
+		zfs_purge_cases(hdl);
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
+
+	if (isresource) {
+		/*
+		 * For resources, we don't have a normal payload.
+		 */
+		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+		    &vdev_guid) != 0)
+			pool_state = SPA_LOAD_OPEN;
+		else
+			pool_state = SPA_LOAD_NONE;
+		detector = NULL;
+	} else {
+		(void) nvlist_lookup_nvlist(nvl,
+		    FM_EREPORT_DETECTOR, &detector);
+		(void) nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
+	}
+
+	/*
+	 * We also ignore all ereports generated during an import of a pool,
+	 * since the only possible fault (.pool) would result in import failure,
+	 * and hence no persistent fault.  Some day we may want to do something
+	 * with these ereports, so we continue generating them internally.
+	 */
+	if (pool_state == SPA_LOAD_IMPORT) {
+		zfs_stats.import_drops.fmds_value.ui64++;
+		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
+		return;
+	}
+
+	/*
+	 * Device I/O errors are ignored during pool open.
+	 */
+	if (pool_state == SPA_LOAD_OPEN &&
+	    (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
+		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
+		zfs_stats.dev_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * We ignore ereports for anything except disks and files.
+	 */
+	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+	    &type) == 0) {
+		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
+		    strcmp(type, VDEV_TYPE_FILE) != 0) {
+			zfs_stats.vdev_drops.fmds_value.ui64++;
+			return;
+		}
+	}
+
+	/*
+	 * Determine if this ereport corresponds to an open case.
+	 * Each vdev or pool can have a single case.
+	 */
+	(void) nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+	if (nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+		vdev_guid = 0;
+	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
+		ena = 0;
+
+	zfs_ereport_when(hdl, nvl, &er_when);
+
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid) {
+			pool_found = B_TRUE;
+			pool_load = zcp->zc_when;
+		}
+		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
+			break;
+	}
+
+	/*
+	 * Avoid falsely accusing a pool of being faulty.  Do so by
+	 * not replaying ereports that were generated prior to the
+	 * current import.  If the failure that generated them was
+	 * transient because the device was actually removed but we
+	 * didn't receive the normal asynchronous notification, we
+	 * don't want to mark it as faulted and potentially panic. If
+	 * there is still a problem we'd expect not to be able to
+	 * import the pool, or that new ereports will be generated
+	 * once the pool is used.
+	 */
+	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
+		fmd_hdl_debug(hdl, "ignoring pool %llx, "
+		    "ereport time %lld.%lld, pool load time = %lld.%lld",
+		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
+		    pool_load.ertv_sec, pool_load.ertv_nsec);
+		zfs_stats.old_drops.fmds_value.ui64++;
+		return;
+	}
+
+	if (!pool_found) {
+		/*
+		 * Haven't yet seen this pool, but same situation
+		 * may apply.
+		 */
+		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+		struct load_time_arg la;
+
+		la.lt_guid = pool_guid;
+		la.lt_time = &pool_load;
+		la.lt_found = B_FALSE;
+
+		if (zhdl != NULL &&
+		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
+		    la.lt_found == B_TRUE) {
+			pool_found = B_TRUE;
+
+			if (timeval_earlier(&er_when, &pool_load)) {
+				fmd_hdl_debug(hdl, "ignoring pool %llx, "
+				    "ereport time %lld.%lld, "
+				    "pool load time = %lld.%lld",
+				    pool_guid, er_when.ertv_sec,
+				    er_when.ertv_nsec, pool_load.ertv_sec,
+				    pool_load.ertv_nsec);
+				zfs_stats.old_drops.fmds_value.ui64++;
+				return;
+			}
+		}
+	}
+
+	if (zcp == NULL) {
+		fmd_case_t *cs;
+		zfs_case_data_t data = { 0 };
+
+		/*
+		 * If this is one of our 'fake' resource ereports, and there is
+		 * no case open, simply discard it.
+		 */
+		if (isresource) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			fmd_hdl_debug(hdl, "discarding '%s' for vdev %llu",
+			    class, vdev_guid);
+			return;
+		}
+
+		/*
+		 * Skip tracking some ereports
+		 */
+		if (strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			return;
+		}
+
+		/*
+		 * Open a new case.
+		 */
+		cs = fmd_case_open(hdl, NULL);
+
+		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
+		    vdev_guid, class);
+
+		/*
+		 * Initialize the case buffer.  To commonize code, we actually
+		 * create the buffer with existing data, and then call
+		 * zfs_case_unserialize() to instantiate the in-core structure.
+		 */
+		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
+
+		data.zc_version = CASE_DATA_VERSION_SERD;
+		data.zc_ena = ena;
+		data.zc_pool_guid = pool_guid;
+		data.zc_vdev_guid = vdev_guid;
+		data.zc_pool_state = (int)pool_state;
+
+		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
+
+		zcp = zfs_case_unserialize(hdl, cs);
+		assert(zcp != NULL);
+		if (pool_found)
+			zcp->zc_when = pool_load;
+	}
+
+	if (isresource) {
+		fmd_hdl_debug(hdl, "resource event '%s'", class);
+
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
+			/*
+			 * The 'resource.fs.zfs.autoreplace' event indicates
+			 * that the pool was loaded with the 'autoreplace'
+			 * property set.  In this case, any pending device
+			 * failures should be ignored, as the asynchronous
+			 * autoreplace handling will take care of them.
+			 */
+			fmd_case_close(hdl, zcp->zc_case);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
+			/*
+			 * The 'resource.fs.zfs.removed' event indicates that
+			 * device removal was detected, and the device was
+			 * closed asynchronously.  If this is the case, we
+			 * assume that any recent I/O errors were due to the
+			 * device removal, not any fault of the device itself.
+			 * We reset the SERD engine, and cancel any pending
+			 * timers.
+			 */
+			if (zcp->zc_data.zc_has_remove_timer) {
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+				zcp->zc_data.zc_has_remove_timer = 0;
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (zcp->zc_data.zc_serd_io[0] != '\0')
+				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
+			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+				fmd_serd_reset(hdl,
+				    zcp->zc_data.zc_serd_checksum);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
+			uint64_t state = 0;
+
+			if (zcp != NULL &&
+			    nvlist_lookup_uint64(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
+			    state == VDEV_STATE_HEALTHY) {
+				fmd_hdl_debug(hdl, "closing case after a "
+				    "device statechange to healthy");
+				fmd_case_close(hdl, zcp->zc_case);
+			}
+		}
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * Associate the ereport with this case.
+	 */
+	fmd_case_add_ereport(hdl, zcp->zc_case, ep);
+
+	/*
+	 * Don't do anything else if this case is already solved.
+	 */
+	if (fmd_case_solved(hdl, zcp->zc_case))
+		return;
+
+	fmd_hdl_debug(hdl, "error event '%s'", class);
+
+	/*
+	 * Determine if we should solve the case and generate a fault.  We solve
+	 * a case if:
+	 *
+	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
+	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
+	 *	   was up and running.
+	 *
+	 * We may see a series of ereports associated with a pool open, all
+	 * chained together by the same ENA.  If the pool open succeeds, then
+	 * we'll see no further ereports.  To detect when a pool open has
+	 * succeeded, we associate a timer with the event.  When it expires, we
+	 * close the case.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
+		/*
+		 * Pool level fault.  First close any pending device cases of
+		 * the pool; closing may free dcp, so grab its successor first.
+		 */
+		for (dcp = uu_list_first(zfs_cases); dcp != NULL; dcp = next) {
+			next = uu_list_next(zfs_cases, dcp);
+			if (dcp->zc_data.zc_pool_guid ==
+			    zcp->zc_data.zc_pool_guid &&
+			    dcp->zc_data.zc_vdev_guid != 0)
+				fmd_case_close(hdl, dcp->zc_case);
+		}
+
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
+		/*
+		 * Pool level fault for reading the intent logs.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
+		/*
+		 * Device fault.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device",  B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+		char *failmode = NULL;
+		boolean_t checkremove = B_FALSE;
+
+		/*
+		 * If this is a checksum or I/O error, then toss it into the
+		 * appropriate SERD engine and check to see if it has fired.
+		 * Ideally, we want to do something more sophisticated,
+		 * (persistent errors for a single data block, etc).  For now,
+		 * a single SERD engine is sufficient.
+		 */
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
+			if (zcp->zc_data.zc_serd_io[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_io,
+				    pool_guid, vdev_guid, "io");
+				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
+				    fmd_prop_get_int32(hdl, "io_N"),
+				    fmd_prop_get_int64(hdl, "io_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+				checkremove = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
+				    pool_guid, vdev_guid, "checksum");
+				fmd_serd_create(hdl,
+				    zcp->zc_data.zc_serd_checksum,
+				    fmd_prop_get_int32(hdl, "checksum_N"),
+				    fmd_prop_get_int64(hdl, "checksum_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl,
+			    zcp->zc_data.zc_serd_checksum, ep)) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.vdev.checksum", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
+		    (nvlist_lookup_string(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
+		    failmode != NULL) {
+			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
+			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_continue",
+				    B_FALSE);
+			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
+			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_wait", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+#ifndef __linux__
+			/* This causes an unexpected fault diagnosis on linux */
+			checkremove = B_TRUE;
+#endif
+		}
+
+		/*
+		 * Because I/O errors may be due to device removal, we postpone
+		 * any diagnosis until we're sure that we aren't about to
+		 * receive a 'resource.fs.zfs.removed' event.
+		 */
+		if (checkremove) {
+			if (zcp->zc_data.zc_has_remove_timer)
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
+			    zfs_remove_timeout);
+			if (!zcp->zc_data.zc_has_remove_timer) {
+				zcp->zc_data.zc_has_remove_timer = 1;
+				zfs_case_serialize(hdl, zcp);
+			}
+		}
+	}
+}
+
+/*
+ * fmd timeout entry point.  A removal timer that fires was not cancelled
+ * by a device-removal event, so its case is solved as a vdev I/O fault.
+ */
+/* ARGSUSED */
+static void
+zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+	zfs_case_t *expired = data;
+
+	if (expired->zc_remove_timer == id)
+		zfs_case_solve(hdl, expired, "fault.fs.zfs.vdev.io", B_FALSE);
+}
+
+/*
+ * fmd close entry point: destroy the SERD engines and removal timer that
+ * the closed case still owns, then unlink and free its in-core zfs_case_t.
+ */
+static void
+zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
+{
+	zfs_case_t *closing = fmd_case_getspecific(hdl, cs);
+
+	if (*closing->zc_data.zc_serd_checksum != '\0')
+		fmd_serd_destroy(hdl, closing->zc_data.zc_serd_checksum);
+	if (*closing->zc_data.zc_serd_io != '\0')
+		fmd_serd_destroy(hdl, closing->zc_data.zc_serd_io);
+	if (closing->zc_data.zc_has_remove_timer != 0)
+		fmd_timer_remove(hdl, closing->zc_remove_timer);
+
+	uu_list_remove(zfs_cases, closing);
+	uu_list_node_fini(closing, &closing->zc_node, zfs_case_pool);
+	fmd_hdl_free(hdl, closing, sizeof (*closing));
+}
+
+/*
+ * fmd gc entry point.  We use it to purge old cases that no longer apply,
+ * which keeps our set of case data small in a long running system.
+ */
+static void
+zfs_fm_gc(fmd_hdl_t *hdl)
+{
+	zfs_purge_cases(hdl);
+}
+
+static const fmd_hdl_ops_t fmd_ops = {	/* referenced by fmd_info below */
+	zfs_fm_recv,	/* fmdo_recv */
+	zfs_fm_timeout,	/* fmdo_timeout */
+	zfs_fm_close,	/* fmdo_close */
+	NULL,		/* fmdo_stats (none provided) */
+	zfs_fm_gc,	/* fmdo_gc */
+};
+
+static const fmd_prop_t fmd_props[] = {
+	{ "checksum_N", FMD_TYPE_UINT32, "10" },	/* checksum SERD N */
+	{ "checksum_T", FMD_TYPE_TIME, "10min" },	/* checksum SERD T */
+	{ "io_N", FMD_TYPE_UINT32, "10" },		/* io SERD N */
+	{ "io_T", FMD_TYPE_TIME, "10min" },		/* io SERD T */
+	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },	/* removal timer */
+	{ NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {	/* for fmd_hdl_register() */
+	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
+};
+
+/*
+ * Module load: open libzfs, create the case list, register with fmd and
+ * set up the module statistics.  Any failure undoes the earlier steps.
+ */
+void
+_zfs_diagnosis_init(fmd_hdl_t *hdl)
+{
+	libzfs_handle_t *zhdl = libzfs_init();
+
+	if (zhdl == NULL)
+		return;
+
+	zfs_case_pool = uu_list_pool_create("zfs_case_pool",
+	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
+	    NULL, UU_LIST_POOL_DEBUG);
+	if (zfs_case_pool == NULL)
+		goto fail_libzfs;
+	zfs_cases = uu_list_create(zfs_case_pool, NULL, UU_LIST_DEBUG);
+	if (zfs_cases == NULL)
+		goto fail_pool;
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
+		goto fail_list;
+
+	fmd_hdl_setspecific(hdl, zhdl);
+	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
+	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
+	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
+	return;
+
+fail_list:
+	uu_list_destroy(zfs_cases);
+fail_pool:
+	uu_list_pool_destroy(zfs_case_pool);
+fail_libzfs:
+	libzfs_fini(zhdl);
+}
+
+/*
+ * Module unload: free every in-core case, destroy the case list and its
+ * pool, and release the libzfs handle attached to the fmd handle.
+ */
+void
+_zfs_diagnosis_fini(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+
+	/*
+	 * Remove all active cases.  Popping the head until the list is empty
+	 * needs no walker, so it cannot fail for lack of memory.
+	 */
+	while ((zcp = uu_list_first(zfs_cases)) != NULL) {
+		fmd_hdl_debug(hdl, "removing case ena %llu",
+		    (long long unsigned)zcp->zc_data.zc_ena);
+		uu_list_remove(zfs_cases, zcp);
+		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+	}
+
+	uu_list_destroy(zfs_cases);
+	uu_list_pool_destroy(zfs_case_pool);
+
+	libzfs_fini(fmd_hdl_getspecific(hdl));
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
new file mode 100644
index 000000000000..8d0a3b420086
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
@@ -0,0 +1,956 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ */
+
+/*
+ * ZFS syseventd module.
+ *
+ * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
+ *
+ * The purpose of this module is to identify when devices are added to the
+ * system, and appropriately online or replace the affected vdevs.
+ *
+ * When a device is added to the system:
+ *
+ * 	1. Search for any vdevs whose devid matches that of the newly added
+ *	   device.
+ *
+ * 	2. If no vdevs are found, then search for any vdevs whose udev path
+ *	   matches that of the new device.
+ *
+ *	3. If no vdevs match by either method, then ignore the event.
+ *
+ * 	4. Attempt to online the device with a flag to indicate that it should
+ *	   be unspared when resilvering completes.  If this succeeds, then the
+ *	   same device was inserted and we should continue normally.
+ *
+ *	5. If the pool does not have the 'autoreplace' property set, attempt to
+ *	   online the device again without the unspare flag, which will
+ *	   generate an FMA fault.
+ *
+ *	6. If the pool has the 'autoreplace' property set, and the matching vdev
+ *	   is a whole disk, then label the new disk and attempt a 'zpool
+ *	   replace'.
+ *
+ * The module responds to EC_DEV_ADD events.  The special ESC_ZFS_VDEV_CHECK
+ * event indicates that a device failed to open during pool load, but the
+ * autoreplace property was set.  In this case, we deferred the associated
+ * FMA fault until our module had a chance to process the autoreplace logic.
+ * If the device could not be replaced, then the second online attempt will
+ * trigger the FMA fault that we skipped earlier.
+ *
+ * ZFS on Linux porting notes:
+ *	Linux udev provides a disk insert for both the disk and the partition
+ *
+ */
+
+#include <ctype.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <thread_pool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+#define	DEV_BYID_PATH	"/dev/disk/by-id/"
+#define	DEV_BYPATH_PATH	"/dev/disk/by-path/"	/* prefix for a physpath */
+#define	DEV_BYVDEV_PATH	"/dev/disk/by-vdev/"	/* used for scsi_debug */
+
+typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
+
+libzfs_handle_t *g_zfshdl;	/* libzfs handle used by this module */
+list_t g_pool_list;	/* list of unavailable pools at initialization */
+list_t g_device_list;	/* list of disks with asynchronous label request */
+tpool_t *g_tpool;
+boolean_t g_enumeration_done;
+pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */
+
+typedef struct unavailpool {	/* entry queued by zfs_unavail_pool() */
+	zpool_handle_t	*uap_zhp;	/* pool handle, left open */
+	list_node_t	uap_node;	/* list linkage */
+} unavailpool_t;
+
+typedef struct pendingdev {	/* entry on g_device_list */
+	char		pd_physpath[128];	/* copy of the vdev physpath */
+	list_node_t	pd_node;		/* list linkage */
+} pendingdev_t;
+
+/* Return the vs_state of the pool's root vdev; verify()s that it exists. */
+static int
+zfs_toplevel_state(zpool_handle_t *zhp)
+{
+	nvlist_t *config = zpool_get_config(zhp, NULL), *top;
+	vdev_stat_t *stats;
+	unsigned int nstats;
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &top) == 0);
+	verify(nvlist_lookup_uint64_array(top, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+	return (stats->vs_state);
+}
+
+static int	/* zpool_iter() callback; always returns 0 */
+zfs_unavail_pool(zpool_handle_t *zhp, void *data)
+{
+	unavailpool_t *uap;
+
+	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
+	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
+	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED &&
+	    (uap = malloc(sizeof (unavailpool_t))) != NULL) {
+		uap->uap_zhp = zhp;	/* stays open while on the list */
+		list_insert_tail((list_t *)data, uap);
+	} else {
+		zpool_close(zhp);	/* usable pool, or out of memory */
+	}
+	return (0);
+}
+
+/*
+ * Two stage replace on Linux
+ * since we get disk notifications
+ * we can wait for partitioned disk slice to show up!
+ *
+ * First stage tags the disk, initiates async partitioning, and returns
+ * Second stage finds the tag and proceeds to ZFS labeling/replace
+ *
+ * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
+ *
+ * 1. physical match with no fs, no partition
+ *	tag it top, partition disk
+ *
+ * 2. physical match again, see partition and tag
+ *
+ */
+
+/*
+ * The device associated with the given vdev (either by devid or physical path)
+ * has been added to the system.  If 'isdisk' is set, then we only attempt a
+ * replacement if it's a whole disk.  This also implies that we should label the
+ * disk first.
+ *
+ * First, we attempt to online the device (making sure to undo any spare
+ * operation when finished).  If this succeeds, then we're done.  If it fails,
+ * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
+ * but that the label was not what we expected.  If the 'autoreplace' property
+ * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
+ * replace'.  If the online is successful, but the new state is something else
+ * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
+ * race, and we should avoid attempting to relabel the disk.
+ *
+ * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
+ */
+static void
+zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
+{
+	char *path;
+	vdev_state_t newstate;
+	nvlist_t *nvroot, *newvd;
+	pendingdev_t *device;
+	uint64_t wholedisk = 0ULL;
+	uint64_t offline = 0ULL;
+	uint64_t guid = 0ULL;
+	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
+	char rawpath[PATH_MAX], fullpath[PATH_MAX];
+	char devpath[PATH_MAX];
+	int ret;
+	boolean_t is_dm = B_FALSE;
+	boolean_t is_sd = B_FALSE;
+	uint_t c;
+	vdev_stat_t *vs;
+
+	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
+		return;
+
+	/* Skip healthy disks */
+	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+	if (vs->vs_state == VDEV_STATE_HEALTHY) {
+		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
+		    __func__, path);
+		return;
+	}
+
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+	    &enc_sysfs_path);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
+
+	if (offline)
+		return;  /* don't intervene if it was taken offline */
+
+	is_dm = zfs_dev_is_dm(path);
+	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
+	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
+	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+	    (long long unsigned int)guid);
+
+	/*
+	 * The VDEV guid is preferred for identification (gets passed in path)
+	 */
+	if (guid != 0) {
+		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
+		    (long long unsigned int)guid);
+	} else {
+		/*
+		 * otherwise use path sans partition suffix for whole disks
+		 */
+		(void) strlcpy(fullpath, path, sizeof (fullpath));
+		if (wholedisk) {
+			char *spath = zfs_strip_partition(fullpath);
+			if (!spath) {
+				zed_log_msg(LOG_INFO, "%s: Can't alloc",
+				    __func__);
+				return;
+			}
+
+			(void) strlcpy(fullpath, spath, sizeof (fullpath));
+			free(spath);
+		}
+	}
+
+	/*
+	 * Attempt to online the device.
+	 */
+	if (zpool_vdev_online(zhp, fullpath,
+	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
+	    (newstate == VDEV_STATE_HEALTHY ||
+	    newstate == VDEV_STATE_DEGRADED)) {
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
+		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+		    "HEALTHY" : "DEGRADED");
+		return;
+	}
+
+	/*
+	 * vdev_id alias rule for using scsi_debug devices (FMA automated
+	 * testing)
+	 */
+	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
+		is_sd = B_TRUE;
+
+	/*
+	 * If the pool doesn't have the autoreplace property set, then use
+	 * vdev online to trigger a FMA fault by posting an ereport.
+	 */
+	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
+	    !(wholedisk || is_dm) || (physpath == NULL)) {
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
+		    "not a whole disk for '%s'", fullpath);
+		return;
+	}
+
+	/*
+	 * Convert physical path into its current device node.  Rawpath
+	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
+	 * /dev/disk/by-path will not be present.
+	 */
+	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
+	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
+
+	if (realpath(rawpath, devpath) == NULL && !is_dm) {
+		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
+		    rawpath, strerror(errno));
+
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
+		    fullpath, libzfs_error_description(g_zfshdl));
+		return;
+	}
+
+	/* Only autoreplace bad disks */
+	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
+	    (vs->vs_state != VDEV_STATE_FAULTED) &&
+	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+		return;
+	}
+
+	nvlist_lookup_string(vdev, "new_devid", &new_devid);
+
+	if (is_dm) {
+		/* Don't label device mapper or multipath disks. */
+	} else if (!labeled) {
+		/*
+		 * we're auto-replacing a raw disk, so label it first
+		 */
+		char *leafname;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label.  Before we can label the disk, we need
+		 * to map the physical string that was matched on to the under
+		 * lying device node.
+		 *
+		 * If any part of this process fails, then do a force online
+		 * to trigger a ZFS fault for the device (and any hot spare
+		 * replacement).
+		 */
+		leafname = strrchr(devpath, '/') + 1;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label.
+		 */
+		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
+			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
+			    "label '%s' (%s)", leafname,
+			    libzfs_error_description(g_zfshdl));
+
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		/*
+		 * The disk labeling is asynchronous on Linux. Just record
+		 * this label request and return as there will be another
+		 * disk add event for the partition after the labeling is
+		 * completed.
+		 */
+		device = malloc(sizeof (pendingdev_t));
+		(void) strlcpy(device->pd_physpath, physpath,
+		    sizeof (device->pd_physpath));
+		list_insert_tail(&g_device_list, device);
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
+		    leafname, (u_longlong_t)guid);
+
+		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */
+
+	} else /* labeled */ {
+		boolean_t found = B_FALSE;
+		/*
+		 * match up with request above to label the disk
+		 */
+		for (device = list_head(&g_device_list); device != NULL;
+		    device = list_next(&g_device_list, device)) {
+			if (strcmp(physpath, device->pd_physpath) == 0) {
+				list_remove(&g_device_list, device);
+				free(device);
+				found = B_TRUE;
+				break;
+			}
+			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
+			    physpath, device->pd_physpath);
+		}
+		if (!found) {
+			/* unexpected partition slice encountered */
+			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
+			    fullpath);
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
+		    physpath, (u_longlong_t)guid);
+
+		(void) snprintf(devpath, sizeof (devpath), "%s%s",
+		    DEV_BYID_PATH, new_devid);
+	}
+
+	/*
+	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
+	 * the entire vdev structure is harmless, we construct a reduced set of
+	 * path/physpath/wholedisk to keep it simple.
+	 */
+	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		return;
+	}
+	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		nvlist_free(nvroot);
+		return;
+	}
+
+	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
+	    (physpath != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
+	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
+	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
+	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
+	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
+	    1) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
+		nvlist_free(newvd);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	nvlist_free(newvd);
+
+	/*
+	 * Wait for udev to verify the links exist, then auto-replace
+	 * the leaf disk at same physical location.
+	 */
+	if (zpool_label_disk_wait(path, 3000) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
+		    "disk %s is missing", path);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+
+	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
+	    fullpath, path, (ret == 0) ? "no errors" :
+	    libzfs_error_description(g_zfshdl));
+
+	nvlist_free(nvroot);
+}
+
+/*
+ * Utility functions to find a vdev matching given criteria.
+ */
+typedef struct dev_data {
+	/* value compared (strcmp) against the vdev's dd_prop string */
+	const char		*dd_compare;
+	/* name of the vdev config string property to compare against */
+	const char		*dd_prop;
+	/* callback invoked by zfs_iter_vdev() on the selected vdev */
+	zfs_process_func_t	dd_func;
+	/* set once a vdev has been matched by GUID or by dd_prop */
+	boolean_t		dd_found;
+	/* handed to dd_func as its third argument */
+	boolean_t		dd_islabeled;
+	/* when non-zero, only the pool with this GUID is searched */
+	uint64_t		dd_pool_guid;
+	/* when non-zero, vdevs are matched by this GUID, not by dd_prop */
+	uint64_t		dd_vdev_guid;
+	/* optional devid stored on the matched vdev as "new_devid" */
+	const char		*dd_new_devid;
+} dev_data_t;
+
+/*
+ * Walk the vdev tree rooted at 'nvl' and apply dp->dd_func to the vdev
+ * selected by the search criteria in 'data' (a dev_data_t).
+ *
+ * Children, spares and l2cache devices are visited before the node itself.
+ * A node is selected when:
+ *   - dd_vdev_guid is non-zero and equals the node's ZPOOL_CONFIG_GUID, or
+ *   - dd_vdev_guid is zero, dd_compare is set and it equals the node's
+ *     dd_prop string; in this case dd_new_devid (if set) is stored on the
+ *     node under "new_devid" for the callback to pick up.
+ * When neither criterion is set, dd_func is applied to every node visited.
+ * After dd_found has been set no further node is processed.
+ */
+static void
+zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
+{
+	dev_data_t *dp = data;
+	char *path = NULL;
+	uint_t c, children;
+	nvlist_t **child;
+
+	/*
+	 * First iterate over any children.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+
+	/*
+	 * Iterate over any spares and cache devices
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_iter_vdev(zhp, child[c], data);
+	}
+
+	/* once a vdev was matched and processed there is nothing left to do */
+	if (dp->dd_found)
+		return;
+
+	/*
+	 * Match by GUID if available otherwise fallback to devid or physical
+	 */
+	if (dp->dd_vdev_guid != 0) {
+		uint64_t guid;
+
+		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+		    &guid) != 0 || guid != dp->dd_vdev_guid) {
+			return;
+		}
+		/* cast: uint64_t is not 'unsigned long long' everywhere */
+		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu",
+		    (u_longlong_t)guid);
+		dp->dd_found = B_TRUE;
+
+	} else if (dp->dd_compare != NULL) {
+		/*
+		 * NOTE: On Linux there is an event for partition, so unlike
+		 * illumos, substring matching is not required to accommodate
+		 * the partition suffix. An exact match will be present in
+		 * the dp->dd_compare value.
+		 */
+		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
+		    strcmp(dp->dd_compare, path) != 0)
+			return;
+
+		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
+		    dp->dd_prop, path);
+		dp->dd_found = B_TRUE;
+
+		/* pass the new devid for use by replacing code */
+		if (dp->dd_new_devid != NULL) {
+			(void) nvlist_add_string(nvl, "new_devid",
+			    dp->dd_new_devid);
+		}
+	}
+
+	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
+}
+
+/*
+ * Thread pool task: enable the datasets of the pool referenced by the
+ * unavailpool_t passed in 'arg' (see tpool_dispatch() in zfs_iter_pool()),
+ * then close the pool handle and release the list entry.
+ */
+static void
+zfs_enable_ds(void *arg)
+{
+	unavailpool_t *uap = arg;
+	zpool_handle_t *zhp = uap->uap_zhp;
+
+	(void) zpool_enable_datasets(zhp, NULL, 0);
+	zpool_close(zhp);
+	free(uap);
+}
+
+/*
+ * zpool_iter() callback: search the vdev tree of pool 'zhp' for the vdev
+ * described by 'data' (a dev_data_t).  In addition, once the initial pool
+ * enumeration has finished, a g_pool_list entry with the same pool name is
+ * handed to the thread pool (zfs_enable_ds) when the pool's top-level
+ * state is at least VDEV_STATE_DEGRADED.
+ *
+ * The pool handle is always closed before returning.  Returns dd_found, so
+ * a match (non-zero) ends the pool iteration.
+ */
+static int
+zfs_iter_pool(zpool_handle_t *zhp, void *data)
+{
+	nvlist_t *config, *nvl;
+	dev_data_t *dp = data;
+	uint64_t pool_guid;
+	unavailpool_t *pool;
+
+	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
+	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
+
+	/*
+	 * For each vdev in this pool, look for a match to apply dd_func
+	 */
+	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+		if (dp->dd_pool_guid == 0 ||
+		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
+			/*
+			 * Only descend when the vdev tree is really there;
+			 * on lookup failure 'nvl' would be uninitialized.
+			 */
+			if (nvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0)
+				zfs_iter_vdev(zhp, nvl, data);
+		}
+	}
+
+	/*
+	 * if this pool was originally unavailable,
+	 * then enable its datasets asynchronously
+	 */
+	if (g_enumeration_done)  {
+		for (pool = list_head(&g_pool_list); pool != NULL;
+		    pool = list_next(&g_pool_list, pool)) {
+
+			if (strcmp(zpool_get_name(zhp),
+			    zpool_get_name(pool->uap_zhp)))
+				continue;
+			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
+				/* ownership of 'pool' moves to the task */
+				list_remove(&g_pool_list, pool);
+				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
+				    pool);
+				break;
+			}
+		}
+	}
+
+	zpool_close(zhp);
+	return (dp->dd_found);	/* cease iteration after a match */
+}
+
+/*
+ * Search every pool for a vdev whose ZPOOL_CONFIG_PHYS_PATH equals
+ * 'physical' and run 'func' on it.  'devid' is passed along so the
+ * matched vdev can be tagged with it; 'is_slice' is forwarded to 'func'.
+ * Returns B_TRUE when a vdev matched.
+ */
+static boolean_t
+devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
+    boolean_t is_slice)
+{
+	/* members not named here (pool/vdev GUIDs) start out as zero */
+	dev_data_t search = {
+		.dd_compare = physical,
+		.dd_prop = ZPOOL_CONFIG_PHYS_PATH,
+		.dd_func = func,
+		.dd_found = B_FALSE,
+		.dd_islabeled = is_slice,
+		.dd_new_devid = devid,	/* used by auto replace code */
+	};
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &search);
+
+	return (search.dd_found);
+}
+
+/*
+ * Search every pool for a vdev whose ZPOOL_CONFIG_DEVID equals 'devid'
+ * and run 'func' on it; 'is_slice' is forwarded to 'func'.
+ * On Linux the devid can be matched directly and always names a whole
+ * disk.  Returns B_TRUE when a vdev matched.
+ */
+static boolean_t
+devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
+{
+	/* members not named here (pool/vdev GUIDs) start out as zero */
+	dev_data_t search = {
+		.dd_compare = devid,
+		.dd_prop = ZPOOL_CONFIG_DEVID,
+		.dd_func = func,
+		.dd_found = B_FALSE,
+		.dd_islabeled = is_slice,
+		.dd_new_devid = devid,
+	};
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &search);
+
+	return (search.dd_found);
+}
+
+/*
+ * Handle a EC_DEV_ADD.ESC_DISK event.
+ *
+ * illumos
+ *	Expects: DEV_PHYS_PATH string in schema
+ *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ *      path: '/dev/dsk/c0t1d0s0' (persistent)
+ *     devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
+ * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
+ *
+ * linux
+ *	provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
+ *	Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ *      path: '/dev/sdc1' (not persistent)
+ *     devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
+ * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
+ */
+static int
+zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
+{
+	char *physpath = NULL;
+	char *devid;
+	boolean_t is_slice;
+
+	/*
+	 * The devid string is mandatory, the physical location is optional.
+	 * Note that 'is_lofi' is accepted but not consulted here.
+	 */
+	if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
+		return (-1);
+
+	(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &physpath);
+
+	/* the presence of DEV_IS_PART marks the event as a partition */
+	is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
+
+	zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
+	    devid, physpath != NULL ? physpath : "NULL", is_slice);
+
+	/*
+	 * Look for a matching vdev, trying in this order:
+	 * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
+	 * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
+	 *
+	 * For disks, we only want to pay attention to vdevs marked as whole
+	 * disks or are a multipath device.
+	 */
+	if (devid_iter(devid, zfs_process_add, is_slice))
+		return (0);
+
+	if (physpath != NULL) {
+		(void) devphys_iter(physpath, devid, zfs_process_add,
+		    is_slice);
+	}
+
+	return (0);
+}
+
+/*
+ * Called when we receive a VDEV_CHECK event, which indicates a device could not
+ * be opened during initial pool open, but the autoreplace property was set on
+ * the pool.  In this case, we treat it as if it were an add event: the vdev
+ * named by the event's pool and vdev GUIDs is handed to zfs_process_add().
+ *
+ * Always returns 0; an event lacking either GUID (or carrying a zero vdev
+ * GUID) is silently ignored.
+ */
+static int
+zfs_deliver_check(nvlist_t *nvl)
+{
+	dev_data_t data = { 0 };
+
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
+	    &data.dd_pool_guid) != 0 ||
+	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
+	    &data.dd_vdev_guid) != 0 ||
+	    data.dd_vdev_guid == 0)
+		return (0);
+
+	/* cast: uint64_t is not 'unsigned long long' on every platform */
+	zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
+	    (u_longlong_t)data.dd_pool_guid,
+	    (u_longlong_t)data.dd_vdev_guid);
+
+	data.dd_func = zfs_process_add;
+
+	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+	return (0);
+}
+
+/*
+ * zpool_iter() callback used for device expansion (DLE) events.
+ *
+ * 'data' is the device name string to look for in pool 'zhp' via
+ * zpool_find_vdev_by_physpath().  When the vdev is found and the pool has
+ * the autoexpand property set (and is not UNAVAIL), the vdev is onlined.
+ * For whole-disk vdevs the name passed to zpool_vdev_online() is the leaf
+ * name of the vdev path with the partition stripped, and the pool is
+ * reopened first.
+ *
+ * The pool handle is closed on every path.  Returns 1 when the vdev was
+ * found and processed (which ends the iteration), 0 otherwise.
+ */
+static int
+zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
+{
+	char *devname = data;
+	boolean_t avail_spare, l2cache;
+	nvlist_t *tgt;
+	int error;
+
+	zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
+	    devname, zpool_get_name(zhp));
+
+	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
+	    &avail_spare, &l2cache, NULL)) != NULL) {
+		char *path, fullpath[MAXPATHLEN];
+		uint64_t wholedisk;
+
+		/* a vdev without a path cannot be onlined by name */
+		error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
+		if (error) {
+			zpool_close(zhp);
+			return (0);
+		}
+
+		/* a missing whole_disk property is treated as "not whole" */
+		error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk);
+		if (error)
+			wholedisk = 0;
+
+		if (wholedisk) {
+			/* reduce the path to its leaf, minus the partition */
+			path = strrchr(path, '/');
+			if (path != NULL) {
+				path = zfs_strip_partition(path + 1);
+				if (path == NULL) {
+					zpool_close(zhp);
+					return (0);
+				}
+			} else {
+				zpool_close(zhp);
+				return (0);
+			}
+
+			/* 'path' now owns the stripped copy; free after use */
+			(void) strlcpy(fullpath, path, sizeof (fullpath));
+			free(path);
+
+			/*
+			 * We need to reopen the pool associated with this
+			 * device so that the kernel can update the size of
+			 * the expanded device.  When expanding there is no
+			 * need to restart the scrub from the beginning.
+			 */
+			boolean_t scrub_restart = B_FALSE;
+			(void) zpool_reopen_one(zhp, &scrub_restart);
+		} else {
+			(void) strlcpy(fullpath, path, sizeof (fullpath));
+		}
+
+		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+			vdev_state_t newstate;
+
+			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+				error = zpool_vdev_online(zhp, fullpath, 0,
+				    &newstate);
+				zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
+				    "setting device '%s' to ONLINE state "
+				    "in pool '%s': %d", fullpath,
+				    zpool_get_name(zhp), error);
+			}
+		}
+		/* found: report 1 even when no online was attempted */
+		zpool_close(zhp);
+		return (1);
+	}
+	zpool_close(zhp);
+	return (0);
+}
+
+/*
+ * This function handles the ESC_DEV_DLE device change event.  Use the
+ * provided vdev guid when looking up a disk or partition, when the guid
+ * is not present assume the entire disk is owned by ZFS and append the
+ * expected -part1 partition information then lookup by physical path.
+ *
+ * Returns 0 when a matching vdev was found in some pool, 1 when it was
+ * not found or when the event carries neither a guid nor a physical path.
+ */
+static int
+zfs_deliver_dle(nvlist_t *nvl)
+{
+	char *devname, name[MAXPATHLEN];
+	uint64_t guid;
+
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
+		(void) snprintf(name, sizeof (name), "%llu",
+		    (u_longlong_t)guid);
+	} else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
+		strlcpy(name, devname, MAXPATHLEN);
+		zfs_append_partition(name, MAXPATHLEN);
+	} else {
+		/*
+		 * Nothing to search for.  Bail out here: 'name' was never
+		 * filled in and must not be handed to the pool iteration.
+		 */
+		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
+		return (1);
+	}
+
+	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
+		zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
+		    "found", name);
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * syseventd daemon module event handler
+ *
+ * Routes a zfs device related event to its handler:
+ *
+ *	EC_DEV_ADD.ESC_DISK		-> zfs_deliver_add (is_lofi false)
+ *	EC_DEV_ADD.ESC_LOFI		-> zfs_deliver_add (is_lofi true)
+ *	EC_ZFS.ESC_ZFS_VDEV_CHECK	-> zfs_deliver_check
+ *	EC_DEV_STATUS.ESC_DEV_DLE	-> zfs_deliver_dle
+ *
+ * Any other class/subclass pair is ignored and 0 is returned; otherwise
+ * the handler's return value is passed back.
+ *
+ * Note: assumes only one thread active at a time (not thread safe)
+ */
+static int
+zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	if (strcmp(class, EC_DEV_ADD) == 0) {
+		/*
+		 * We're mainly interested in disk additions, but we also listen
+		 * for new loop devices, to allow for simplified testing.
+		 */
+		if (strcmp(subclass, ESC_DISK) == 0)
+			return (zfs_deliver_add(nvl, B_FALSE));
+		if (strcmp(subclass, ESC_LOFI) == 0)
+			return (zfs_deliver_add(nvl, B_TRUE));
+		return (0);
+	}
+
+	/*
+	 * A VDEV_CHECK event signifies that a device failed to open
+	 * during pool load, but the 'autoreplace' property was
+	 * set, so we should pretend it's just been added.
+	 */
+	if (strcmp(class, EC_ZFS) == 0 &&
+	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0)
+		return (zfs_deliver_check(nvl));
+
+	if (strcmp(class, EC_DEV_STATUS) == 0 &&
+	    strcmp(subclass, ESC_DEV_DLE) == 0)
+		return (zfs_deliver_dle(nvl));
+
+	return (0);
+}
+
+/*
+ * Thread entry point started by zfs_slm_init(): run zfs_unavail_pool over
+ * every pool with g_pool_list as its argument, then flag the enumeration
+ * as finished.  'arg' is unused; always returns NULL.
+ */
+/*ARGSUSED*/
+static void *
+zfs_enum_pools(void *arg)
+{
+	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, &g_pool_list);
+
+	/*
+	 * Linux - instead of using a thread pool, each list entry
+	 * will spawn a thread when an unavailable pool transitions
+	 * to available. zfs_slm_fini will wait for these threads.
+	 */
+	g_enumeration_done = B_TRUE;
+
+	return (NULL);
+}
+
+/*
+ * Called from the zed daemon at startup.
+ *
+ * Opens this agent's own libzfs handle (for now, each agent has its own
+ * libzfs instance), creates the unavailable-pool and pending-device lists
+ * and starts the thread that enumerates the pools.
+ *
+ * Returns 0 on success, -1 when libzfs or the thread could not be set up.
+ */
+int
+zfs_slm_init()
+{
+	g_zfshdl = libzfs_init();
+	if (g_zfshdl == NULL)
+		return (-1);
+
+	/*
+	 * collect a list of unavailable pools (asynchronously,
+	 * since this can take a while)
+	 */
+	list_create(&g_pool_list, sizeof (struct unavailpool),
+	    offsetof(struct unavailpool, uap_node));
+
+	if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) == 0) {
+		list_create(&g_device_list, sizeof (struct pendingdev),
+		    offsetof(struct pendingdev, pd_node));
+		return (0);
+	}
+
+	/* the enumeration thread did not start: undo the setup above */
+	list_destroy(&g_pool_list);
+	libzfs_fini(g_zfshdl);
+	return (-1);
+}
+
+/*
+ * Tear down what zfs_slm_init() set up: join the enumeration thread,
+ * drain and destroy the thread pool (if one exists), release every entry
+ * still on the pool and device lists, and close the libzfs handle.
+ */
+void
+zfs_slm_fini()
+{
+	unavailpool_t *uap;
+	pendingdev_t *pend;
+
+	/* wait for zfs_enum_pools thread to complete */
+	(void) pthread_join(g_zfs_tid, NULL);
+
+	/* destroy the thread pool */
+	if (g_tpool != NULL) {
+		tpool_wait(g_tpool);
+		tpool_destroy(g_tpool);
+	}
+
+	/* pools still listed were never enabled: close their handles */
+	uap = list_head(&g_pool_list);
+	while (uap != NULL) {
+		list_remove(&g_pool_list, uap);
+		zpool_close(uap->uap_zhp);
+		free(uap);
+		uap = list_head(&g_pool_list);
+	}
+	list_destroy(&g_pool_list);
+
+	/* drop label requests that were still pending */
+	pend = list_head(&g_device_list);
+	while (pend != NULL) {
+		list_remove(&g_device_list, pend);
+		free(pend);
+		pend = list_head(&g_device_list);
+	}
+	list_destroy(&g_device_list);
+
+	libzfs_fini(g_zfshdl);
+}
+
+/*
+ * Public entry point for a single event: log its class and subclass, then
+ * hand it to zfs_slm_deliver_event().  The handler's result is discarded.
+ */
+void
+zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
+
+	(void) zfs_slm_deliver_event(class, subclass, nvl);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c
new file mode 100644
index 000000000000..9e95e20d5683
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c
@@ -0,0 +1,557 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+/*
+ * The ZFS retire agent is responsible for managing hot spares across all pools.
+ * When we see a device fault or a device removal, we try to open the associated
+ * pool and look for any hot spares.  We iterate over any available hot spares
+ * and attempt a 'zpool replace' for each one.
+ *
+ * For vdevs diagnosed as faulty, the agent is also responsible for proactively
+ * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
+ */
+
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <libzfs.h>
+#include <string.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+
+/*
+ * Node of the singly linked list of (pool, vdev) GUID pairs that
+ * zfs_vdev_repair() has already recorded.
+ */
+typedef struct zfs_retire_repaired {
+	struct zfs_retire_repaired	*zrr_next;	/* next node or NULL */
+	uint64_t			zrr_pool;	/* pool GUID */
+	uint64_t			zrr_vdev;	/* vdev GUID */
+} zfs_retire_repaired_t;
+
+/*
+ * Per-module state, attached to the fmd handle with fmd_hdl_setspecific().
+ */
+typedef struct zfs_retire_data {
+	libzfs_handle_t			*zrd_hdl;	/* libzfs handle */
+	zfs_retire_repaired_t		*zrd_repaired;	/* list head */
+} zfs_retire_data_t;
+
+/*
+ * Free every node of the repaired list hanging off 'zdp', leaving the
+ * list empty.
+ */
+static void
+zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
+{
+	zfs_retire_repaired_t *cur, *next;
+
+	for (cur = zdp->zrd_repaired; cur != NULL; cur = next) {
+		/* unlink the head before it is released */
+		next = cur->zrr_next;
+		zdp->zrd_repaired = next;
+		fmd_hdl_free(hdl, cur, sizeof (*cur));
+	}
+}
+
+/*
+ * Find a pool with a matching GUID.
+ */
+typedef struct find_cbdata {
+	uint64_t	cb_guid;	/* in: pool GUID to look for */
+	zpool_handle_t	*cb_zhp;	/* out: handle of the matching pool */
+	nvlist_t	*cb_vdev;	/* not referenced by find_pool() */
+} find_cbdata_t;
+
+/*
+ * zpool_iter() callback: when the GUID of 'zhp' equals cb_guid, keep the
+ * handle open, store it in cb_zhp and return 1 to end the iteration.
+ * Otherwise close the handle and return 0.
+ */
+static int
+find_pool(zpool_handle_t *zhp, void *data)
+{
+	find_cbdata_t *cbp = data;
+	uint64_t guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+
+	if (guid != cbp->cb_guid) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	cbp->cb_zhp = zhp;
+	return (1);
+}
+
+/*
+ * Find a vdev within a tree with a matching GUID.
+ *
+ * The node 'nv' itself is checked first, then - recursively - each entry
+ * of its children, l2cache and spares arrays, in that order.  An array
+ * that is absent is simply skipped, so spares are still searched on a
+ * pool that has no l2cache devices.
+ *
+ * Returns the matching nvlist (owned by the tree, not a copy) or NULL.
+ */
+static nvlist_t *
+find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
+{
+	static const char *const vdev_lists[] = {
+		ZPOOL_CONFIG_CHILDREN,
+		ZPOOL_CONFIG_L2CACHE,
+		ZPOOL_CONFIG_SPARES
+	};
+	uint64_t guid;
+	nvlist_t **child;
+	uint_t c, children;
+	nvlist_t *ret;
+	size_t l;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+	    guid == search_guid) {
+		/* cast: uint64_t is not 'unsigned long long' everywhere */
+		fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
+		    "matched vdev %llu", (unsigned long long)guid);
+		return (nv);
+	}
+
+	for (l = 0; l < sizeof (vdev_lists) / sizeof (vdev_lists[0]); l++) {
+		/* a missing array must not hide the ones after it */
+		if (nvlist_lookup_nvlist_array(nv, vdev_lists[l],
+		    &child, &children) != 0)
+			continue;
+
+		for (c = 0; c < children; c++) {
+			ret = find_vdev(zhdl, child[c], search_guid);
+			if (ret != NULL)
+				return (ret);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
+ *
+ * On success the open pool handle is returned (the caller closes it) and,
+ * when 'vdev_guid' is non-zero, '*vdevp' points at the vdev's nvlist.
+ * With a zero 'vdev_guid' only the pool is looked up and '*vdevp' is left
+ * untouched.  NULL is returned when the pool, its vdev tree or the vdev
+ * cannot be found; no handle stays open in that case.
+ */
+static zpool_handle_t *
+find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
+    nvlist_t **vdevp)
+{
+	find_cbdata_t search;
+	zpool_handle_t *pool;
+	nvlist_t *tree;
+
+	/*
+	 * Find the corresponding pool and make sure the vdev still exists.
+	 */
+	search.cb_guid = pool_guid;
+	if (zpool_iter(zhdl, find_pool, &search) != 1)
+		return (NULL);
+	pool = search.cb_zhp;
+
+	if (nvlist_lookup_nvlist(zpool_get_config(pool, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE, &tree) == 0) {
+		if (vdev_guid == 0)
+			return (pool);
+
+		*vdevp = find_vdev(zhdl, tree, vdev_guid);
+		if (*vdevp != NULL)
+			return (pool);
+	}
+
+	/* no vdev tree, or the vdev is gone: do not leak the handle */
+	zpool_close(pool);
+	return (NULL);
+}
+
+/*
+ * Given a vdev, attempt to replace it with every known spare until one
+ * succeeds or we run out of devices to try.
+ *
+ * Returns B_TRUE when an attach succeeded, B_FALSE when the pool has no
+ * vdev tree, no spares, or no spare could be attached.
+ */
+static boolean_t
+replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+	nvlist_t *config, *nvroot, *replacement;
+	nvlist_t **spares;
+	uint_t s, nspares;
+	char *dev_name;
+	zprop_source_t source;
+	int ashift;
+	boolean_t replaced = B_FALSE;
+
+	config = zpool_get_config(zhp, NULL);
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) != 0)
+		return (B_FALSE);
+
+	/*
+	 * Find out if there are any hot spares available in the pool.
+	 */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) != 0)
+		return (B_FALSE);
+
+	/*
+	 * lookup "ashift" pool property, we may need it for the replacement
+	 */
+	ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);
+
+	/* root vdev wrapper; its single child is swapped in per attempt */
+	replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);
+	(void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT);
+
+	dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+	/*
+	 * Try each spare in turn, stopping at the first successful attach.
+	 */
+	for (s = 0; s < nspares && !replaced; s++) {
+		char *spare_name;
+
+		if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
+		    &spare_name) != 0)
+			continue;
+
+		/* if set, add the "ashift" pool property to the spare nvlist */
+		if (source != ZPROP_SRC_DEFAULT)
+			(void) nvlist_add_uint64(spares[s],
+			    ZPOOL_CONFIG_ASHIFT, ashift);
+
+		(void) nvlist_add_nvlist_array(replacement,
+		    ZPOOL_CONFIG_CHILDREN, &spares[s], 1);
+
+		fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
+		    dev_name, basename(spare_name));
+
+		if (zpool_vdev_attach(zhp, dev_name, spare_name,
+		    replacement, B_TRUE, B_FALSE) == 0)
+			replaced = B_TRUE;
+	}
+
+	free(dev_name);
+	nvlist_free(replacement);
+
+	return (replaced);
+}
+
+/*
+ * Record that the vdev named by the event's (pool, vdev) GUID pair has been
+ * seen as repaired.  The pair is pushed onto the module's repaired list
+ * unless it is already there; events lacking either GUID are ignored.
+ */
+/*ARGSUSED*/
+static void
+zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
+{
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+	zfs_retire_repaired_t *zrp;
+	uint64_t pool_guid, vdev_guid;
+
+	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+	    &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+		return;
+
+	/*
+	 * Before checking the state of the ASRU, go through and see if we've
+	 * already made an attempt to repair this ASRU.  This list is cleared
+	 * whenever we receive any kind of list event, and is designed to
+	 * prevent us from generating a feedback loop when we attempt repairs
+	 * against a faulted pool.  The problem is that checking the unusable
+	 * state of the ASRU can involve opening the pool, which can post
+	 * statechange events but otherwise leave the pool in the faulted
+	 * state.  This list allows us to detect when a statechange event is
+	 * due to our own request.
+	 */
+	for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
+		if (zrp->zrr_pool == pool_guid &&
+		    zrp->zrr_vdev == vdev_guid)
+			return;
+	}
+
+	/* not seen before: push a new node onto the head of the list */
+	zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
+	zrp->zrr_next = zdp->zrd_repaired;
+	zrp->zrr_pool = pool_guid;
+	zrp->zrr_vdev = vdev_guid;
+	zdp->zrd_repaired = zrp;
+
+	/* cast: uint64_t is not 'unsigned long long' on every platform */
+	fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
+	    (unsigned long long)vdev_guid, (unsigned long long)pool_guid);
+}
+
+/*
+ * Event receive hook of the retire agent (installed through fmd_ops).
+ *
+ * Depending on 'class' the event is handled as follows:
+ *  - "resource.fs.zfs.removed", or a statechange to VDEV_STATE_REMOVED:
+ *    an l2cache vdev is taken offline; any other vdev is replaced by a hot
+ *    spare when the "spare_on_remove" property is set.
+ *  - FM_LIST_RESOLVED_CLASS: ignored.
+ *  - a statechange to VDEV_STATE_HEALTHY, or "sysevent.fs.zfs.vdev_remove":
+ *    the vdev is recorded via zfs_vdev_repair().
+ *  - anything else: the repaired list is cleared and every entry of the
+ *    suspect fault list is processed.  For repair lists the pool or vdev
+ *    is cleared; otherwise the vdev is faulted or degraded according to
+ *    the fault class and a hot spare replacement is attempted.
+ */
+/*ARGSUSED*/
+static void
+zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
+    const char *class)
+{
+	uint64_t pool_guid, vdev_guid;
+	zpool_handle_t *zhp;
+	nvlist_t *resource, *fault;
+	nvlist_t **faults;
+	uint_t f, nfaults;
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+	libzfs_handle_t *zhdl = zdp->zrd_hdl;
+	boolean_t fault_device, degrade_device;
+	boolean_t is_repair;
+	char *scheme;
+	nvlist_t *vdev = NULL;
+	char *uuid;
+	int repair_done = 0;
+	boolean_t retire;
+	boolean_t is_disk;
+	vdev_aux_t aux;
+	uint64_t state = 0;
+
+	fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
+
+	/* 'state' stays 0 when the event carries no vdev state */
+	nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
+
+	/*
+	 * If this is a resource notifying us of device removal then simply
+	 * check for an available spare and continue unless the device is a
+	 * l2arc vdev, in which case we just offline it.
+	 */
+	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
+	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+	    state == VDEV_STATE_REMOVED)) {
+		char *devtype;
+		char *devname;
+
+		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+		    &pool_guid) != 0 ||
+		    nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+		    &vdev_guid) != 0)
+			return;
+
+		if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+		    &vdev)) == NULL)
+			return;
+
+		devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+		/* Can't replace l2arc with a spare: offline the device */
+		if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+		    &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
+			fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
+			zpool_vdev_offline(zhp, devname, B_TRUE);
+		} else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+		    replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
+			/* Could not handle with spare */
+			fmd_hdl_debug(hdl, "no spare for '%s'", devname);
+		}
+
+		free(devname);
+		zpool_close(zhp);
+		return;
+	}
+
+	if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
+		return;
+
+	/*
+	 * Note: on zfsonlinux statechange events are more than just
+	 * healthy ones so we need to confirm the actual state value.
+	 */
+	if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+	    state == VDEV_STATE_HEALTHY) {
+		zfs_vdev_repair(hdl, nvl);
+		return;
+	}
+	if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+		zfs_vdev_repair(hdl, nvl);
+		return;
+	}
+
+	/* any other event resets the list kept by zfs_vdev_repair() */
+	zfs_retire_clear_data(hdl, zdp);
+
+	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
+		is_repair = B_TRUE;
+	else
+		is_repair = B_FALSE;
+
+	/*
+	 * We subscribe to zfs faults as well as all repair events.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
+	    &faults, &nfaults) != 0)
+		return;
+
+	for (f = 0; f < nfaults; f++) {
+		fault = faults[f];
+
+		fault_device = B_FALSE;
+		degrade_device = B_FALSE;
+		is_disk = B_FALSE;
+
+		/* skip faults explicitly marked as not to be retired */
+		if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
+		    &retire) == 0 && retire == 0)
+			continue;
+
+		/*
+		 * While we subscribe to fault.fs.zfs.*, we only take action
+		 * for faults targeting a specific vdev (open failure or SERD
+		 * failure).  We also subscribe to fault.io.* events, so that
+		 * faulty disks will be faulted in the ZFS configuration.
+		 */
+		if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
+			fault_device = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, fault,
+		    "fault.fs.zfs.vdev.checksum")) {
+			degrade_device = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, fault,
+		    "fault.fs.zfs.device")) {
+			fault_device = B_FALSE;
+		} else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
+			is_disk = B_TRUE;
+			fault_device = B_TRUE;
+		} else {
+			continue;
+		}
+
+		if (is_disk) {
+			/* fault.io.* is recognized but not acted upon here */
+			continue;
+		} else {
+			/*
+			 * This is a ZFS fault.  Lookup the resource, and
+			 * attempt to find the matching vdev.
+			 */
+			if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
+			    &resource) != 0 ||
+			    nvlist_lookup_string(resource, FM_FMRI_SCHEME,
+			    &scheme) != 0)
+				continue;
+
+			if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
+				continue;
+
+			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
+			    &pool_guid) != 0)
+				continue;
+
+			/* only repair events may target a whole pool */
+			if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
+			    &vdev_guid) != 0) {
+				if (is_repair)
+					vdev_guid = 0;
+				else
+					continue;
+			}
+
+			if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+			    &vdev)) == NULL)
+				continue;
+
+			aux = VDEV_AUX_ERR_EXCEEDED;
+		}
+
+		if (vdev_guid == 0) {
+			/*
+			 * For pool-level repair events, clear the entire pool.
+			 */
+			fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
+			    zpool_get_name(zhp));
+			(void) zpool_clear(zhp, NULL, NULL);
+			zpool_close(zhp);
+			continue;
+		}
+
+		/*
+		 * If this is a repair event, then mark the vdev as repaired and
+		 * continue.
+		 */
+		if (is_repair) {
+			repair_done = 1;
+			fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
+			    zpool_get_name(zhp), vdev_guid);
+			(void) zpool_vdev_clear(zhp, vdev_guid);
+			zpool_close(zhp);
+			continue;
+		}
+
+		/*
+		 * Actively fault the device if needed.
+		 */
+		if (fault_device)
+			(void) zpool_vdev_fault(zhp, vdev_guid, aux);
+		if (degrade_device)
+			(void) zpool_vdev_degrade(zhp, vdev_guid, aux);
+
+		if (fault_device || degrade_device)
+			fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
+			    fault_device ? "fault" : "degrade", vdev_guid,
+			    zpool_get_name(zhp));
+
+		/*
+		 * Attempt to substitute a hot spare.
+		 */
+		(void) replace_with_spare(hdl, zhp, vdev);
+		zpool_close(zhp);
+	}
+
+	/* resolve the case once at least one vdev was actually cleared */
+	if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
+	    nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
+		fmd_case_uuresolved(hdl, uuid);
+}
+
+/* Module entry points; only the event receive hook is provided. */
+static const fmd_hdl_ops_t fmd_ops = {
+	zfs_retire_recv,	/* fmdo_recv */
+	NULL,			/* fmdo_timeout */
+	NULL,			/* fmdo_close */
+	NULL,			/* fmdo_stats */
+	NULL,			/* fmdo_gc */
+};
+
+/*
+ * Module properties.  "spare_on_remove" (default "true") is read by
+ * zfs_retire_recv() before a removed vdev is replaced with a spare.
+ */
+static const fmd_prop_t fmd_props[] = {
+	{ "spare_on_remove", FMD_TYPE_BOOL, "true" },
+	{ NULL, 0, NULL }
+};
+
+/* Registration record passed to fmd_hdl_register() by _zfs_retire_init(). */
+static const fmd_hdl_info_t fmd_info = {
+	"ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
+};
+
+/*
+ * Module initialization: open a libzfs handle, register the module with
+ * fmd and attach a zeroed zfs_retire_data_t holding the handle.  When
+ * either libzfs or the registration fails nothing is left allocated.
+ */
+void
+_zfs_retire_init(fmd_hdl_t *hdl)
+{
+	zfs_retire_data_t *zdp;
+	libzfs_handle_t *zhdl = libzfs_init();
+
+	if (zhdl == NULL)
+		return;
+
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) == 0) {
+		zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t),
+		    FMD_SLEEP);
+		zdp->zrd_hdl = zhdl;
+		fmd_hdl_setspecific(hdl, zdp);
+		return;
+	}
+
+	/* registration failed: the handle is of no further use */
+	libzfs_fini(zhdl);
+}
+
+/*
+ * Module teardown: release the repaired list, the libzfs handle and the
+ * module data itself.  Does nothing when no module data was attached.
+ */
+void
+_zfs_retire_fini(fmd_hdl_t *hdl)
+{
+	zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+
+	if (zdp == NULL)
+		return;
+
+	zfs_retire_clear_data(hdl, zdp);
+	libzfs_fini(zdp->zrd_hdl);
+	fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed.c b/sys/contrib/openzfs/cmd/zed/zed.c
new file mode 100644
index 000000000000..0784e3834733
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.c
@@ -0,0 +1,306 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_event.h"
+#include "zed_file.h"
+#include "zed_log.h"
+
+static volatile sig_atomic_t _got_exit = 0;
+static volatile sig_atomic_t _got_hup = 0;
+
+/*
+ * Signal handler for SIGINT & SIGTERM.
+ */
+static void
+_exit_handler(int signum)
+{
+	_got_exit = 1;
+}
+
+/*
+ * Signal handler for SIGHUP.
+ */
+static void
+_hup_handler(int signum)
+{
+	_got_hup = 1;
+}
+
+/*
+ * Register signal handlers.
+ */
+static void
+_setup_sig_handlers(void)
+{
+	struct sigaction sa;
+
+	if (sigemptyset(&sa.sa_mask) < 0)
+		zed_log_die("Failed to initialize sigset");
+
+	sa.sa_flags = SA_RESTART;
+	sa.sa_handler = SIG_IGN;
+
+	if (sigaction(SIGPIPE, &sa, NULL) < 0)
+		zed_log_die("Failed to ignore SIGPIPE");
+
+	sa.sa_handler = _exit_handler;
+	if (sigaction(SIGINT, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGINT handler");
+
+	if (sigaction(SIGTERM, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGTERM handler");
+
+	sa.sa_handler = _hup_handler;
+	if (sigaction(SIGHUP, &sa, NULL) < 0)
+		zed_log_die("Failed to register SIGHUP handler");
+}
+
+/*
+ * Pin the whole address space (present and future mappings) in RAM with
+ * mlockall() so that touching a locked page never waits on a page fault.
+ *
+ * mlockall() is attempted up to max_tries times while it fails with EAGAIN;
+ * any other errno, or running out of tries, is reported via zed_log_die().
+ *
+ * Memory locks are not inherited across fork() and are dropped by execve(),
+ * so when daemonizing this must run only after the daemon has fork()ed.
+ */
+static void
+_lock_memory(void)
+{
+#if HAVE_MLOCKALL
+	const int max_tries = 10;
+	int attempt;
+
+	for (attempt = 0; attempt < max_tries; attempt++) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
+			if (errno == EAGAIN)
+				continue;
+			break;
+		}
+		zed_log_msg(LOG_INFO, "Locked all pages in memory");
+		return;
+	}
+	zed_log_die("Failed to lock memory pages: %s", strerror(errno));
+#else /* HAVE_MLOCKALL */
+	zed_log_die("Failed to lock memory pages: mlockall() not supported");
+#endif /* HAVE_MLOCKALL */
+}
+
+/*
+ * Start daemonization of the process including the double fork().
+ *
+ * The parent process will block here until _finish_daemonize() is called
+ * (in the grandchild process), at which point the parent process will exit.
+ * This prevents the parent process from exiting until initialization is
+ * complete.
+ */
+static void
+_start_daemonize(void)
+{
+	pid_t pid;
+	struct sigaction sa;
+
+	/* Create pipe for communicating with child during daemonization. */
+	zed_log_pipe_open();
+
+	/* Background process and ensure child is not process group leader. */
+	pid = fork();
+	if (pid < 0) {
+		zed_log_die("Failed to create child process: %s",
+		    strerror(errno));
+	} else if (pid > 0) {
+
+		/* Close writes since parent will only read from pipe. */
+		zed_log_pipe_close_writes();
+
+		/* Wait for notification that daemonization is complete. */
+		zed_log_pipe_wait();
+
+		zed_log_pipe_close_reads();
+		_exit(EXIT_SUCCESS);
+	}
+
+	/* Close reads since child will only write to pipe. */
+	zed_log_pipe_close_reads();
+
+	/* Create independent session and detach from terminal. */
+	if (setsid() < 0)
+		zed_log_die("Failed to create new session: %s",
+		    strerror(errno));
+
+	/* Prevent child from terminating on HUP when session leader exits. */
+	if (sigemptyset(&sa.sa_mask) < 0)
+		zed_log_die("Failed to initialize sigset");
+
+	sa.sa_flags = 0;
+	sa.sa_handler = SIG_IGN;
+
+	if (sigaction(SIGHUP, &sa, NULL) < 0)
+		zed_log_die("Failed to ignore SIGHUP");
+
+	/* Ensure process cannot re-acquire terminal. */
+	pid = fork();
+	if (pid < 0) {
+		zed_log_die("Failed to create grandchild process: %s",
+		    strerror(errno));
+	} else if (pid > 0) {
+		_exit(EXIT_SUCCESS);
+	}
+}
+
+/*
+ * Finish daemonization of the process by closing stdin/stdout/stderr.
+ *
+ * This must be called at the end of initialization after all external
+ * communication channels are established and accessible.
+ */
+static void
+_finish_daemonize(void)
+{
+	int devnull;
+
+	/* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */
+	devnull = open("/dev/null", O_RDWR);
+	if (devnull < 0)
+		zed_log_die("Failed to open /dev/null: %s", strerror(errno));
+
+	if (dup2(devnull, STDIN_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stdin: %s",
+		    strerror(errno));
+
+	if (dup2(devnull, STDOUT_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stdout: %s",
+		    strerror(errno));
+
+	if (dup2(devnull, STDERR_FILENO) < 0)
+		zed_log_die("Failed to dup /dev/null onto stderr: %s",
+		    strerror(errno));
+
+	if ((devnull > STDERR_FILENO) && (close(devnull) < 0))
+		zed_log_die("Failed to close /dev/null: %s", strerror(errno));
+
+	/* Notify parent that daemonization is complete. */
+	zed_log_pipe_close_writes();
+}
+
+/*
+ * ZFS Event Daemon (ZED).
+ */
+int
+main(int argc, char *argv[])
+{
+	struct zed_conf *zcp;
+	uint64_t saved_eid;
+	int64_t saved_etime[2];
+
+	zed_log_init(argv[0]);
+	zed_log_stderr_open(LOG_NOTICE);
+	zcp = zed_conf_create();
+	zed_conf_parse_opts(zcp, argc, argv);
+	if (zcp->do_verbose)
+		zed_log_stderr_open(LOG_INFO);
+
+	if (geteuid() != 0)
+		zed_log_die("Must be run as root");
+
+	zed_conf_parse_file(zcp);
+
+	zed_file_close_from(STDERR_FILENO + 1);
+
+	(void) umask(0);
+
+	if (chdir("/") < 0)
+		zed_log_die("Failed to change to root directory");
+
+	if (zed_conf_scan_dir(zcp) < 0)
+		exit(EXIT_FAILURE);
+
+	if (!zcp->do_foreground) {
+		_start_daemonize();
+		zed_log_syslog_open(LOG_DAEMON);
+	}
+	_setup_sig_handlers();
+
+	if (zcp->do_memlock)
+		_lock_memory();
+
+	if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force))
+		exit(EXIT_FAILURE);
+
+	if (!zcp->do_foreground)
+		_finish_daemonize();
+
+	zed_log_msg(LOG_NOTICE,
+	    "ZFS Event Daemon %s-%s (PID %d)",
+	    ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid());
+
+	if (zed_conf_open_state(zcp) < 0)
+		exit(EXIT_FAILURE);
+
+	if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0)
+		exit(EXIT_FAILURE);
+
+idle:
+	/*
+	 * If -I is specified, attempt to open /dev/zfs repeatedly until
+	 * successful.
+	 */
+	do {
+		if (!zed_event_init(zcp))
+			break;
+		/* Wait for some time and try again. tunable? */
+		sleep(30);
+	} while (!_got_exit && zcp->do_idle);
+
+	if (_got_exit)
+		goto out;
+
+	zed_event_seek(zcp, saved_eid, saved_etime);
+
+	while (!_got_exit) {
+		int rv;
+		if (_got_hup) {
+			_got_hup = 0;
+			(void) zed_conf_scan_dir(zcp);
+		}
+		rv = zed_event_service(zcp);
+
+		/* ENODEV: When kernel module is unloaded (osx) */
+		if (rv == ENODEV)
+			break;
+	}
+
+	zed_log_msg(LOG_NOTICE, "Exiting");
+	zed_event_fini(zcp);
+
+	if (zcp->do_idle && !_got_exit)
+		goto idle;
+
+out:
+	zed_conf_destroy(zcp);
+	zed_log_fini();
+	exit(EXIT_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore
new file mode 100644
index 000000000000..46a00945aa7c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore
@@ -0,0 +1 @@
+history_event-zfs-list-cacher.sh
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am
new file mode 100644
index 000000000000..8b2d0c200286
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am
@@ -0,0 +1,53 @@
+include $(top_srcdir)/config/Rules.am
+include $(top_srcdir)/config/Substfiles.am
+
+EXTRA_DIST += README
+
+zedconfdir = $(sysconfdir)/zfs/zed.d
+
+dist_zedconf_DATA = \
+	zed-functions.sh \
+	zed.rc
+
+zedexecdir = $(zfsexecdir)/zed.d
+
+dist_zedexec_SCRIPTS = \
+	all-debug.sh \
+	all-syslog.sh \
+	data-notify.sh \
+	generic-notify.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh \
+	trim_finish-notify.sh
+
+nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh
+
+SUBSTFILES += $(nodist_zedexec_SCRIPTS)
+
+zedconfdefaults = \
+	all-syslog.sh \
+	data-notify.sh \
+	history_event-zfs-list-cacher.sh \
+	resilver_finish-notify.sh \
+	scrub_finish-notify.sh \
+	statechange-led.sh \
+	statechange-notify.sh \
+	vdev_clear-led.sh \
+	vdev_attach-led.sh \
+	pool_import-led.sh \
+	resilver_finish-start-scrub.sh
+
+install-data-hook:
+	$(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
+	for f in $(zedconfdefaults); do \
+	  test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
+	       -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
+	    ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
+	done
+	chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/README b/sys/contrib/openzfs/cmd/zed/zed.d/README
new file mode 100644
index 000000000000..7279b93704e2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/README
@@ -0,0 +1,30 @@
+Shell scripts are the recommended choice for ZEDLETs that mostly call
+other utilities and do relatively little data manipulation.
+
+Shell scripts MUST work on both bash and dash.
+
+Shell scripts MUST run cleanly through ShellCheck:
+  http://www.shellcheck.net/
+
+General functions reside in "zed-functions.sh".  Use them where applicable.
+
+Additional references that may be of use:
+
+  Google Shell Style Guide
+  https://github.com/google/styleguide/blob/gh-pages/shell.xml
+
+  Dash as /bin/sh
+  https://wiki.ubuntu.com/DashAsBinSh
+
+  Common shell script mistakes
+  http://www.pixelbeat.org/programming/shell_script_mistakes.html
+
+  Filenames and Pathnames in Shell: How to do it Correctly
+  http://www.dwheeler.com/essays/filenames-in-shell.html
+
+  Autoconf: Portable Shell Programming
+  https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell
+
+Please BE CONSISTENT with the existing style, check for errors,
+minimize dependencies where possible, try to be portable,
+and comment anything non-obvious.  Festina lente.
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh
new file mode 100755
index 000000000000..14b39caacd9d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# Log all environment variables to ZED_DEBUG_LOG.
+#
+# This can be a useful aid when developing/debugging ZEDLETs since it shows the
+# environment variables defined for each zevent.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}"
+
+zed_exit_if_ignoring_this_event
+
+lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock"
+
+umask 077
+zed_lock "${lockfile}"
+exec >> "${ZED_DEBUG_LOG}"
+
+printenv | sort
+echo
+
+exec >&-
+zed_unlock "${lockfile}"
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh
new file mode 100755
index 000000000000..cb9286500136
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# Log the zevent via syslog.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+
+zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \
+    "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \
+    "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \
+    "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}"
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh
new file mode 100755
index 000000000000..639b459bdd3b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# Send notification in response to a DATA error.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool/[vdev] combination.  This protects against spamming the recipient
+# should multiple events occur together in time for the same pool/[vdev].
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   3: notification suppressed
+#   9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+[ -n "${ZED_NOTIFY_DATA}" ] || exit 3
+
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify"
+zed_rate_limit "${rate_limit_tag}" || exit 3
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    echo "ZFS has detected a data error:"
+    echo
+    echo "   eid: ${ZEVENT_EID}"
+    echo " class: ${ZEVENT_SUBCLASS}"
+    echo "  host: $(hostname)"
+    echo "  time: ${ZEVENT_TIME_STRING}"
+    echo " error: ${ZEVENT_ZIO_ERR}"
+    echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}"
+    echo "  pool: ${ZEVENT_POOL}"
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh
new file mode 100755
index 000000000000..e438031a088a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# Send notification in response to a given zevent.
+#
+# This is a generic script that can be symlinked to a file in the
+# enabled-zedlets directory to have a notification sent when a particular
+# class of zevents occurs.  The symlink filename must begin with the zevent
+# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure"
+# subclass).  Refer to the zed(8) manpage for details.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool combination.  This protects against spamming the recipient
+# should multiple events occur together in time for the same pool.
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   3: notification suppressed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# Rate-limit the notification based in part on the filename.
+#
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")"
+rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}"
+zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3
+
+umask 077
+pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}"
+host_str=" on $(hostname)"
+note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    echo "ZFS has posted the following event:"
+    echo
+    echo "   eid: ${ZEVENT_EID}"
+    echo " class: ${ZEVENT_SUBCLASS}"
+    echo "  host: $(hostname)"
+    echo "  time: ${ZEVENT_TIME_STRING}"
+
+    [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
+    [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
+    [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
+
+    [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \
+        && "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
new file mode 100755
index 000000000000..053b4414a768
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
@@ -0,0 +1,85 @@
+#!/bin/sh
+#
+# Track changes to enumerated pools for use in early-boot
+set -ef
+
+FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache"
+FSLIST_TMP="@runstatedir@/zfs-list.cache.new"
+FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}"
+
+# If the pool specific cache file is not writeable, abort
+[ -w "${FSLIST}" ] || exit 0
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+zed_check_cmd "${ZFS}" sort diff grep
+
+# If we are acting on a snapshot, we have nothing to do
+printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0
+
+# We obtain a lock on zfs-list to avoid any simultaneous writes.
+# If we run into trouble, log and drop the lock
+abort_alter() {
+  zed_log_msg "Error updating zfs-list.cache!"
+  zed_unlock zfs-list
+}
+
+finished() {
+  zed_unlock zfs-list
+  trap - EXIT
+  exit 0
+}
+
+case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
+    create|"finish receiving"|import|destroy|rename)
+      ;;
+
+    export)
+        zed_lock zfs-list
+        trap abort_alter EXIT
+        echo > "${FSLIST}"
+        finished
+      ;;
+
+    set|inherit)
+        # Only act if one of the tracked properties is altered.
+        case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in
+            canmount|mountpoint|atime|relatime|devices|exec|readonly| \
+              setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \
+              org.openzfs.systemd:requires-mounts-for| \
+              org.openzfs.systemd:before|org.openzfs.systemd:after| \
+              org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \
+              org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \
+            ) ;;
+            *) exit 0 ;;
+        esac
+      ;;
+
+    *)
+        # Ignore all other events.
+        exit 0
+      ;;
+esac
+
+zed_lock zfs-list
+trap abort_alter EXIT
+
+PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\
+,readonly,setuid,nbmand,encroot,keylocation\
+,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\
+,org.openzfs.systemd:before,org.openzfs.systemd:after\
+,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\
+,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore"
+
+"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}"
+
+# Sort the output so that it is stable
+sort "${FSLIST_TMP}" -o "${FSLIST_TMP}"
+
+# Don't modify the file if it hasn't changed
+diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}"
+rm -f "${FSLIST_TMP}"
+
+finished
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh
@@ -0,0 +1 @@
+statechange-led.sh
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh
new file mode 120000
index 000000000000..e4c56bc5f816
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh
@@ -0,0 +1 @@
+scrub_finish-notify.sh
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh
new file mode 100755
index 000000000000..c7cfd1ddba80
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# resilver_finish-start-scrub.sh
+# Run a scrub after a resilver
+#
+# Exit codes:
+# 1: Internal error
+# 2: Script wasn't enabled in zed.rc
+# 3: Scrubs are automatically started for sequential resilvers
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
+[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
+[ -n "${ZEVENT_POOL}" ] || exit 1
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 1
+zed_check_cmd "${ZPOOL}" || exit 1
+
+zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}"
+"${ZPOOL}" scrub "${ZEVENT_POOL}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh
new file mode 100755
index 000000000000..2145a100a3fa
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+#
+# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH.
+#
+# By default, "zpool status" output will only be included for a scrub_finish
+# zevent if the pool is not healthy; to always include its output, set
+# ZED_NOTIFY_VERBOSE=1.
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   3: notification suppressed
+#   9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+if   [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then
+    action="resilver"
+elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+    action="scrub"
+else
+    zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\""
+    exit 9
+fi
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+# For scrub, suppress notification if the pool is healthy
+# and verbosity is not enabled.
+#
+if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+    healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \
+        | grep "'${ZEVENT_POOL}' is healthy")"
+    [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3
+fi
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    echo "ZFS has finished a ${action}:"
+    echo
+    echo "   eid: ${ZEVENT_EID}"
+    echo " class: ${ZEVENT_SUBCLASS}"
+    echo "  host: $(hostname)"
+    echo "  time: ${ZEVENT_TIME_STRING}"
+
+    "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh
new file mode 100755
index 000000000000..e656e125d378
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh
@@ -0,0 +1,177 @@
+#!/bin/sh
+#
+# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
+#
+# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL.
+# Turn the LED off when it's back ONLINE again.
+#
+# This script runs in two basic modes:
+#
+# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then
+# only set the LED for that particular VDEV. This is the case for statechange
+# events and some vdev_* events.
+#
+# 2. If those vars are not set, then check the state of all VDEVs in the pool
+# and set the LEDs accordingly.  This is the case for pool_import events.
+#
+# Note that this script requires that your enclosure be supported by the
+# Linux SCSI enclosure services (ses) driver.  The script will do nothing
+# if you have no enclosure, or if your enclosure isn't supported.
+#
+# Exit codes:
+#   0: enclosure led successfully set
+#   1: enclosure leds not available
+#   2: enclosure leds administratively disabled
+#   3: The led sysfs path passed from ZFS does not exist
+#   4: $ZPOOL not set
+#   5: awk is not installed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+if [ ! -d /sys/class/enclosure ] ; then
+	exit 1
+fi
+
+if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then
+	exit 2
+fi
+
+zed_check_cmd "$ZPOOL" || exit 4
+zed_check_cmd awk || exit 5
+
+# Global used in set_led debug print
+vdev=""
+
+# check_and_set_led (file, val)
+#
+# Read an enclosure sysfs file, and write it if it's not already set to 'val'
+#
+# Arguments
+#   file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault)
+#   val: value to set it to
+#
+# Return
+#  0 on success, 3 on missing sysfs path
+#
+check_and_set_led()
+{
+	file="$1"
+	val="$2"
+
+	if [ ! -e "$file" ] ; then
+		return 3
+	fi
+
+	# If another process is accessing the LED when we attempt to update it,
+	# the update will be lost so retry (at most 5 times, spelled out so that
+	# no external seq(1) is needed) until the LED actually changes.
+	for _ in 1 2 3 4 5; do
+		# We want to check the current state first, since writing to the
+		# 'fault' entry always causes a SES command, even if the
+		# current state is already what you want.
+		current=$(cat "${file}")
+
+		# On some enclosures if you write 1 to fault, and read it back,
+		# it will return 2.  Treat all non-zero values as 1 for
+		# simplicity.
+		if [ "$current" != "0" ] ; then
+			current=1
+		fi
+
+		if [ "$current" != "$val" ] ; then
+			echo "$val" > "$file"
+			zed_log_msg "vdev $vdev set '$file' LED to $val"
+		else
+			break
+		fi
+	done
+}
+
+state_to_val()
+{
+	state="$1"
+	case "$state" in
+	FAULTED|DEGRADED|UNAVAIL)
+		echo 1 ;;	# fault LED on
+	ONLINE)
+		echo 0 ;;	# fault LED off
+	esac
+}
+
+# process_pool ([pool])
+#
+# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to
+# the VDEV's state.
+#
+# Arguments
+#   pool:	Optional pool name.  If not specified, iterate through all pools.
+#
+# Return
+#  0 on success, 3 if a sysfs LED entry was missing or could not be set
+#
+process_pool()
+{
+	pool="$1"
+	rc=0
+
+	# Lookup all the current LED values and paths in parallel
+	#shellcheck disable=SC2016
+	cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",'
+	out=$($ZPOOL status -vc "$cmd" ${pool:+"$pool"} | grep 'led_token=')
+
+	# Feed the loop from a here-document instead of a pipe: a piped loop
+	# runs in a subshell, so its updates to $rc would never be seen below.
+	#shellcheck disable=SC2034
+	while read -r vdev state read write chksum therest; do
+		# Read out current LED value and path
+		tmp=$(echo "$therest" | sed 's/^.*led_token=//g')
+		vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}')
+		current_val=$(echo "$tmp" | awk -F ',' '{print $1}')
+
+		if [ "$current_val" != "0" ] ; then
+			current_val=1
+		fi
+
+		if [ -z "$vdev_enc_sysfs_path" ] ; then
+			# Skip anything with no sysfs LED entries
+			continue
+		fi
+
+		if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
+			rc=1
+			zed_log_msg "vdev $vdev '$vdev_enc_sysfs_path/fault' doesn't exist"
+			continue;
+		fi
+
+		val=$(state_to_val "$state")
+		if [ "$current_val" = "$val" ] ; then
+			# LED is already set correctly
+			continue;
+		fi
+
+		if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
+			rc=1
+		fi
+	done <<EOF
+$out
+EOF
+
+	if [ "$rc" = "0" ] ; then
+		return 0
+	else
+		# We didn't see a sysfs entry that we wanted to set
+		return 3
+	fi
+}
+
+if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then
+	# Got a statechange for an individual VDEV
+	val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
+	vdev=$(basename "$ZEVENT_VDEV_PATH")
+	check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
+else
+	# Process the entire pool
+	poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")
+	process_pool "$poolname"
+fi
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh
new file mode 100755
index 000000000000..f46080a03239
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# CDDL HEADER END
+#
+
+#
+# Send notification in response to a fault induced statechange
+#
+# ZEVENT_SUBCLASS: 'statechange'
+# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED'
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   3: statechange not relevant
+#   4: statechange string missing (unexpected)
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4
+
+if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \
+        && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \
+        && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then
+    exit 3
+fi
+
+umask 077
+note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then
+        echo "The number of I/O errors associated with a ZFS device exceeded"
+        echo "acceptable levels. ZFS has marked the device as faulted."
+    elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then
+        echo "The number of checksum errors associated with a ZFS device"
+        echo "exceeded acceptable levels. ZFS has marked the device as"
+        echo "degraded."
+    else
+        echo "ZFS has detected that a device was removed."
+    fi
+
+    echo
+    echo " impact: Fault tolerance of the pool may be compromised."
+    echo "    eid: ${ZEVENT_EID}"
+    echo "  class: ${ZEVENT_SUBCLASS}"
+    echo "  state: ${ZEVENT_VDEV_STATE_STR}"
+    echo "   host: $(hostname)"
+    echo "   time: ${ZEVENT_TIME_STRING}"
+
+    [ -n "${ZEVENT_VDEV_TYPE}" ] && echo "  vtype: ${ZEVENT_VDEV_TYPE}"
+    [ -n "${ZEVENT_VDEV_PATH}" ] && echo "  vpath: ${ZEVENT_VDEV_PATH}"
+    [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo "  vphys: ${ZEVENT_VDEV_PHYSPATH}"
+    [ -n "${ZEVENT_VDEV_GUID}" ] && echo "  vguid: ${ZEVENT_VDEV_GUID}"
+    [ -n "${ZEVENT_VDEV_DEVID}" ] && echo "  devid: ${ZEVENT_VDEV_DEVID}"
+
+    echo "   pool: ${ZEVENT_POOL_GUID}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh
new file mode 100755
index 000000000000..5075302997e3
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+#
+# Send notification in response to a TRIM_FINISH. The event
+# will be received for each vdev in the pool which was trimmed.
+#
+# Exit codes:
+#   0: notification sent
+#   1: notification failed
+#   2: notification not configured
+#   9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+    echo "ZFS has finished a trim:"
+    echo
+    echo "   eid: ${ZEVENT_EID}"
+    echo " class: ${ZEVENT_SUBCLASS}"
+    echo "  host: $(hostname)"
+    echo "  time: ${ZEVENT_TIME_STRING}"
+
+    "${ZPOOL}" status -t "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh
@@ -0,0 +1 @@
+statechange-led.sh
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh
@@ -0,0 +1 @@
+statechange-led.sh
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh
new file mode 100755
index 000000000000..44a9b8d23303
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh
@@ -0,0 +1,538 @@
+#!/bin/sh
+# shellcheck disable=SC2039
+# zed-functions.sh
+#
+# ZED helper functions for use in ZEDLETs
+
+
+# Variable Defaults
+#
+: "${ZED_LOCKDIR:="/var/lock"}"
+: "${ZED_NOTIFY_INTERVAL_SECS:=3600}"
+: "${ZED_NOTIFY_VERBOSE:=0}"
+: "${ZED_RUNDIR:="/var/run"}"
+: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}"
+: "${ZED_SYSLOG_TAG:="zed"}"
+
+ZED_FLOCK_FD=8
+
+
+# zed_check_cmd (cmd, ...)
+#
+# For each argument given, search PATH for the executable command [cmd].
+# Log a message if [cmd] is not found.
+#
+# Arguments
+#   cmd: name of executable command for which to search
+#
+# Return
+#   0 if all commands are found in PATH and are executable
+#   n for a count of the command executables that are not found
+#
+zed_check_cmd()
+{
+    local prog
+    local missing=0
+
+    # Probe every argument so that each absent command gets its own log line.
+    for prog in "$@"; do
+        command -v "${prog}" >/dev/null 2>&1 && continue
+        zed_log_err "\"${prog}\" not installed"
+        missing=$((missing + 1))
+    done
+    return "${missing}"
+}
+
+
+# zed_log_msg (msg, ...)
+#
+# Write all argument strings to the system log via logger(1).
+#
+# Globals
+#   ZED_SYSLOG_PRIORITY
+#   ZED_SYSLOG_TAG
+#
+# Return
+#   exit status of logger(1)
+#
+zed_log_msg()
+{
+    logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@"
+}
+
+
+# zed_log_err (msg, ...)
+#
+# Write an error message to the system log.  The message is prefixed with
+# "error:", the script name, and " eid=N:" when ZEVENT_EID is non-empty.
+#
+# Globals
+#   ZED_SYSLOG_PRIORITY
+#   ZED_SYSLOG_TAG
+#   ZEVENT_EID
+#
+# Return
+#   exit status of logger(1)
+#
+zed_log_err()
+{
+    logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \
+        "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@"
+}
+
+
+# zed_lock (lockfile, [fd])
+#
+# Obtain an exclusive (write) lock on [lockfile].  If the lock cannot be
+# immediately acquired, wait until it becomes available.
+#
+# Every zed_lock() must be paired with a corresponding zed_unlock().
+#
+# By default, flock-style locks associate the lockfile with file descriptor 8.
+# The bash manpage warns that file descriptors >9 should be used with care as
+# they may conflict with file descriptors used internally by the shell.  File
+# descriptor 9 is reserved for zed_rate_limit().  If concurrent locks are held
+# within the same process, they must use different file descriptors (preferably
+# decrementing from 8); otherwise, obtaining a new lock with a given file
+# descriptor will release the previous lock associated with that descriptor.
+#
+# Arguments
+#   lockfile: pathname of the lock file; the lock will be stored in
+#     ZED_LOCKDIR unless the pathname contains a "/".
+#   fd: integer for the file descriptor used by flock (OPTIONAL unless holding
+#     concurrent locks)
+#
+# Globals
+#   ZED_FLOCK_FD
+#   ZED_LOCKDIR
+#
+# Return
+#   nothing
+#
+zed_lock()
+{
+    local lockfile="$1"
+    local fd="${2:-${ZED_FLOCK_FD}}"
+    local umask_bak
+    local err
+    # Do nothing without a lockfile; names without a "/" live in ZED_LOCKDIR.
+    [ -n "${lockfile}" ] || return
+    if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+        lockfile="${ZED_LOCKDIR}/${lockfile}"
+    fi
+    # Create the lock file with owner-only permissions; restored on exit.
+    umask_bak="$(umask)"
+    umask 077
+
+    # Obtain a lock on the file bound to the given file descriptor.
+    # eval is required because the descriptor number is held in a variable.
+    eval "exec ${fd}> '${lockfile}'"
+    err="$(flock --exclusive "${fd}" 2>&1)"
+    # shellcheck disable=SC2181
+    if [ $? -ne 0 ]; then
+        zed_log_err "failed to lock \"${lockfile}\": ${err}"
+    fi
+    # A failed flock is only logged; the caller is not told about it.
+    umask "${umask_bak}"
+}
+
+
+# zed_unlock (lockfile, [fd])
+#
+# Release the lock on [lockfile].
+#
+# Arguments
+#   lockfile: pathname of the lock file
+#   fd: integer for the file descriptor used by flock (must match the file
+#     descriptor passed to the zed_lock function call)
+#
+# Globals
+#   ZED_FLOCK_FD
+#   ZED_LOCKDIR
+#
+# Return
+#   nothing
+#
+zed_unlock()
+{
+    local lockfile="$1"
+    local fd="${2:-${ZED_FLOCK_FD}}"
+    local err
+    # The resolved pathname is only used in the error message below.
+    [ -n "${lockfile}" ] || return
+    if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+        lockfile="${ZED_LOCKDIR}/${lockfile}"
+    fi
+
+    # Release the lock and close the file descriptor.
+    err="$(flock --unlock "${fd}" 2>&1)"
+    # shellcheck disable=SC2181
+    if [ $? -ne 0 ]; then
+        zed_log_err "failed to unlock \"${lockfile}\": ${err}"
+    fi
+    eval "exec ${fd}>&-"    # closed even when flock reported an error
+}
+
+
+# zed_notify (subject, pathname)
+#
+# Send a notification via all available methods.
+#
+# Arguments
+#   subject: notification subject
+#   pathname: pathname containing the notification message (OPTIONAL)
+#
+# Return
+#   0: notification succeeded via at least one method
+#   1: notification failed
+#   2: no notification methods configured
+#
+zed_notify()
+{
+    local subject="$1"
+    local pathname="$2"
+    local num_success=0
+    local num_failure=0
+    local method
+    local rv
+
+    # Try every backend in turn.  Each one returns 0 (sent), 1 (failed) or
+    # 2 (not configured); a status of 2 counts as neither success nor
+    # failure.  rv is local so that the caller's variable is not clobbered.
+    for method in email pushbullet slack_webhook; do
+        "zed_notify_${method}" "${subject}" "${pathname}"; rv=$?
+        [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+        [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+    done
+
+    # One delivered notification is enough to report overall success.
+    [ "${num_success}" -gt 0 ] && return 0
+    [ "${num_failure}" -gt 0 ] && return 1
+    return 2
+}
+
+
+# zed_notify_email (subject, pathname)
+#
+# Send a notification via email to the address specified by ZED_EMAIL_ADDR.
+#
+# Requires the mail executable to be installed in the standard PATH, or
+# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of
+# reading a message body from stdin.
+#
+# Command-line options to the mail executable can be specified in
+# ZED_EMAIL_OPTS.  This undergoes the following keyword substitutions:
+# - @ADDRESS@ is replaced with the space-delimited recipient email address(es)
+# - @SUBJECT@ is replaced with the notification subject
+#
+# Arguments
+#   subject: notification subject
+#   pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+#   ZED_EMAIL_PROG
+#   ZED_EMAIL_OPTS
+#   ZED_EMAIL_ADDR
+#
+# Return
+#   0: notification sent
+#   1: notification failed
+#   2: not configured
+#
+zed_notify_email()
+{
+    local subject="$1"
+    local pathname="${2:-"/dev/null"}"
+    local opts
+    local rv
+
+    : "${ZED_EMAIL_PROG:="mail"}"
+    : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}"
+
+    # For backward compatibility with ZED_EMAIL.
+    if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then
+        ZED_EMAIL_ADDR="${ZED_EMAIL}"
+    fi
+    [ -n "${ZED_EMAIL_ADDR}" ] || return 2
+    zed_check_cmd "${ZED_EMAIL_PROG}" || return 1
+    [ -n "${subject}" ] || return 1
+    if [ ! -r "${pathname}" ]; then
+        zed_log_err \
+                "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\""
+        return 1
+    fi
+
+    # Expand the keywords into a local copy so that ZED_EMAIL_OPTS keeps its
+    # placeholders for later calls.  NOTE: values are substituted unescaped.
+    opts="$(echo "${ZED_EMAIL_OPTS}" \
+        | sed   -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \
+                -e "s/@SUBJECT@/${subject}/g")"
+    # shellcheck disable=SC2086
+    eval "${ZED_EMAIL_PROG}" ${opts} < "${pathname}" >/dev/null 2>&1; rv=$?
+    if [ "${rv}" -ne 0 ]; then
+        zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}"
+        return 1
+    fi
+    return 0
+}
+
+
+# zed_notify_pushbullet (subject, pathname)
+#
+# Send a notification via Pushbullet <https://www.pushbullet.com/>.
+# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the
+# Pushbullet server.  The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is
+# for pushing to notification feeds that can be subscribed to; if a channel is
+# not defined, push notifications will instead be sent to all devices
+# associated with the account specified by the access token.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+#   https://docs.pushbullet.com/
+#   https://www.pushbullet.com/security
+#
+# Arguments
+#   subject: notification subject
+#   pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+#   ZED_PUSHBULLET_ACCESS_TOKEN
+#   ZED_PUSHBULLET_CHANNEL_TAG
+#
+# Return
+#   0: notification sent
+#   1: notification failed
+#   2: not configured
+#
+zed_notify_pushbullet()
+{
+    local subject="$1"
+    local pathname="${2:-"/dev/null"}"
+    local msg_body
+    local msg_tag
+    local msg_json
+    local msg_out
+    local msg_err
+    local rv
+    local url="https://api.pushbullet.com/v2/pushes"
+
+    [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2
+
+    [ -n "${subject}" ] || return 1
+    if [ ! -r "${pathname}" ]; then
+        zed_log_err "pushbullet cannot read \"${pathname}\""
+        return 1
+    fi
+
+    zed_check_cmd "awk" "curl" "sed" || return 1
+
+    # Escape the following characters in the message body for JSON:
+    # newline, backslash, double quote, horizontal tab, form feed,
+    # and carriage return.  The subject is inserted without escaping.
+    msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+        gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+        "${pathname}")"
+
+    # Push to a channel if one is configured.
+    #
+    [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \
+        '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")"
+
+    # Construct the JSON message for pushing a note.
+    #
+    msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \
+        "${msg_tag}" "${subject}" "${msg_body}")"
+
+    # Send the POST request and check for errors.
+    #
+    msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \
+        --header "Content-Type: application/json" --data-binary "${msg_json}" \
+        2>/dev/null)"; rv=$?
+    if [ "${rv}" -ne 0 ]; then
+        zed_log_err "curl exit=${rv}"
+        return 1
+    fi
+    msg_err="$(echo "${msg_out}" \
+        | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+    if [ -n "${msg_err}" ]; then
+        zed_log_err "pushbullet \"${msg_err}\""
+        return 1
+    fi
+    return 0
+}
+
+
+# zed_notify_slack_webhook (subject, pathname)
+#
+# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>.
+# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the
+# Slack channel.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+#   https://api.slack.com/incoming-webhooks
+#
+# Arguments
+#   subject: notification subject
+#   pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+#   ZED_SLACK_WEBHOOK_URL
+#
+# Return
+#   0: notification sent
+#   1: notification failed
+#   2: not configured
+#
+zed_notify_slack_webhook()
+{
+    [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2
+
+    local subject="$1"
+    local pathname="${2:-"/dev/null"}"
+    local msg_body
+    local msg_json
+    local msg_out
+    local msg_err
+    local rv
+    local url="${ZED_SLACK_WEBHOOK_URL}"
+
+    [ -n "${subject}" ] || return 1
+    if [ ! -r "${pathname}" ]; then
+        zed_log_err "slack webhook cannot read \"${pathname}\""
+        return 1
+    fi
+
+    zed_check_cmd "awk" "curl" "sed" || return 1
+
+    # Escape the following characters in the message body for JSON:
+    # newline, backslash, double quote, horizontal tab, form feed,
+    # and carriage return.  The subject is inserted without escaping.
+    #
+    msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+        gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+        "${pathname}")"
+
+    # Construct the JSON message for posting.
+    #
+    msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )"
+
+    # Send the POST request and check for errors.
+    #
+    msg_out="$(curl -X POST "${url}" \
+        --header "Content-Type: application/json" --data-binary "${msg_json}" \
+        2>/dev/null)"; rv=$?
+    if [ "${rv}" -ne 0 ]; then
+        zed_log_err "curl exit=${rv}"
+        return 1
+    fi
+    msg_err="$(echo "${msg_out}" \
+        | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+    if [ -n "${msg_err}" ]; then
+        zed_log_err "slack webhook \"${msg_err}\""
+        return 1
+    fi
+    return 0
+}
+
+# zed_rate_limit (tag, [interval])
+#
+# Check whether an event of a given type [tag] has already occurred within the
+# last [interval] seconds.
+#
+# This function obtains a lock on the statefile using file descriptor 9.
+#
+# Arguments
+#   tag: arbitrary string for grouping related events to rate-limit
+#   interval: time interval in seconds (OPTIONAL)
+#
+# Globals
+#   ZED_NOTIFY_INTERVAL_SECS
+#   ZED_RUNDIR
+#
+# Return
+#   0 if the event should be processed
+#   1 if the event should be dropped
+#
+# State File Format
+#   time;tag
+#
+zed_rate_limit()
+{
+    local tag="$1"
+    local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}"
+    local lockfile="zed.zedlet.state.lock"
+    local lockfile_fd=9
+    local statefile="${ZED_RUNDIR}/zed.zedlet.state"
+    local time_now
+    local time_prev
+    local umask_bak
+    local rv=0
+
+    [ -n "${tag}" ] || return 0
+    # Hold the lock (fd 9) while the state file is read and rewritten.
+    zed_lock "${lockfile}" "${lockfile_fd}"
+    time_now="$(date +%s)"
+    time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+        | tail -n 1 | cut -d\; -f1)"
+    # NOTE: ${tag} is interpolated into the ERE above without escaping.
+    if [ -n "${time_prev}" ] \
+            && [ "$((time_now - time_prev))" -lt "${interval}" ]; then
+        rv=1
+    else
+        umask_bak="$(umask)"
+        umask 077
+        grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+            > "${statefile}.$$"
+        echo "${time_now};${tag}" >> "${statefile}.$$"
+        mv -f "${statefile}.$$" "${statefile}"
+        umask "${umask_bak}"
+    fi
+
+    zed_unlock "${lockfile}" "${lockfile_fd}"
+    return "${rv}"
+}
+
+
+# zed_guid_to_pool (guid)
+#
+# Print the name (like "tank") of the pool whose GUID is [guid], using ZPOOL.
+# Arguments
+#   guid: pool GUID (decimal or hex)
+#
+# Return
+#   Pool name on stdout; nothing is printed if [guid] is empty or unmatched
+#
+zed_guid_to_pool()
+{
+	local guid
+
+	[ -n "$1" ] || return 0
+
+	# "%u" is the portable conversion; it also turns hex into decimal.
+	guid="$(printf "%u" "$1")"
+	[ -n "${guid}" ] || return 0
+	"${ZPOOL}" get -H -ovalue,name guid | awk '$1=='"${guid}"' {print $2}'
+}
+
+# zed_exit_if_ignoring_this_event
+#
+# Exit the script if we should ignore this event, as determined by
+# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc.
+# This function assumes you've imported the normal zed variables.
+zed_exit_if_ignoring_this_event()
+{
+	if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then  # exit 0 unless matched
+	    eval "case ${ZEVENT_SUBCLASS} in
+	    ${ZED_SYSLOG_SUBCLASS_INCLUDE});;
+	    *) exit 0;;
+	    esac"
+	elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then  # only if INCLUDE is empty
+	    eval "case ${ZEVENT_SUBCLASS} in
+	    ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;;
+	    *);;
+	    esac"
+	fi
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc
new file mode 100644
index 000000000000..1b220d28db20
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc
@@ -0,0 +1,122 @@
+##
+# zed.rc
+#
+# This file should be owned by root and permissioned 0600.
+##
+
+##
+# Absolute path to the debug output file.
+#
+#ZED_DEBUG_LOG="/tmp/zed.debug.log"
+
+##
+# Email address of the zpool administrator for receipt of notifications;
+#   multiple addresses can be specified if they are delimited by whitespace.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+# Disabled by default; uncomment to enable.
+#
+#ZED_EMAIL_ADDR="root"
+
+##
+# Name or path of executable responsible for sending notifications via email;
+#   the mail program must be capable of reading a message body from stdin.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_PROG="mail"
+
+##
+# Command-line options for ZED_EMAIL_PROG.
+# The string @ADDRESS@ will be replaced with the recipient email address(es).
+# The string @SUBJECT@ will be replaced with the notification subject;
+#   this should be protected with quotes to prevent word-splitting.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
+
+##
+# Default directory for zed lock files.
+#
+#ZED_LOCKDIR="/var/lock"
+
+##
+# Minimum number of seconds between notifications for a similar event.
+#
+#ZED_NOTIFY_INTERVAL_SECS=3600
+
+##
+# Notification verbosity.
+#   If set to 0, suppress notification if the pool is healthy.
+#   If set to 1, send notification regardless of pool health.
+#
+#ZED_NOTIFY_VERBOSE=0
+
+##
+# Send notifications for 'ereport.fs.zfs.data' events.
+# Disabled by default, any non-empty value will enable the feature.
+#
+#ZED_NOTIFY_DATA=
+
+##
+# Pushbullet access token.
+# This grants full access to your account -- protect it accordingly!
+#   <https://www.pushbullet.com/get-started>
+#   <https://www.pushbullet.com/account>
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_ACCESS_TOKEN=""
+
+##
+# Pushbullet channel tag for push notification feeds that can be subscribed to.
+#   <https://www.pushbullet.com/my-channel>
+# If not defined, push notifications will instead be sent to all devices
+#   associated with the account specified by the access token.
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_CHANNEL_TAG=""
+
+##
+# Slack Webhook URL.
+# This allows posting to the given channel and includes an access token.
+#   <https://api.slack.com/incoming-webhooks>
+# Disabled by default; uncomment to enable.
+#
+#ZED_SLACK_WEBHOOK_URL=""
+
+##
+# Default directory for zed state files.
+#
+#ZED_RUNDIR="/var/run"
+
+##
+# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED.  This works for
+# device mapper and multipath devices as well.  Your enclosure must be
+# supported by the Linux SES driver for this to work.
+#
+ZED_USE_ENCLOSURE_LEDS=1
+
+##
+# Run a scrub after every resilver.
+# Disabled by default; set to 1 to enable or 0 to disable.
+#ZED_SCRUB_AFTER_RESILVER=0
+
+##
+# The syslog priority (e.g., specified as a "facility.level" pair).
+#
+#ZED_SYSLOG_PRIORITY="daemon.notice"
+
+##
+# The syslog tag for marking zed events.
+#
+#ZED_SYSLOG_TAG="zed"
+
+##
+# Which set of event subclasses to log
+# By default, events from all subclasses are logged.
+# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses
+# matching the pattern are logged. Use the pipe symbol (|)
+# or shell wildcards (*, ?) to match multiple subclasses.
+# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the
+# matching subclasses are excluded from logging.
+#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*"
+#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event"
+
diff --git a/sys/contrib/openzfs/cmd/zed/zed.h b/sys/contrib/openzfs/cmd/zed/zed.h
new file mode 100644
index 000000000000..3ac0e63141e8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.h
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_H
+#define	ZED_H
+
+/*
+ * Absolute path for the default zed configuration file.
+ */
+#define	ZED_CONF_FILE		SYSCONFDIR "/zfs/zed.conf"
+
+/*
+ * Absolute path for the default zed pid file.
+ */
+#define	ZED_PID_FILE		RUNSTATEDIR "/zed.pid"
+
+/*
+ * Absolute path for the default zed state file.
+ */
+#define	ZED_STATE_FILE		RUNSTATEDIR "/zed.state"
+
+/*
+ * Absolute path for the default zed zedlet directory.
+ */
+#define	ZED_ZEDLET_DIR		SYSCONFDIR "/zfs/zed.d"
+
+/*
+ * Initial max_events of a new struct zed_conf; reserved for future use.
+ */
+#define	ZED_MAX_EVENTS		0
+
+/*
+ * Initial min_events of a new struct zed_conf; reserved for future use.
+ */
+#define	ZED_MIN_EVENTS		0
+
+/*
+ * String prefix for ZED variables passed via environment variables.
+ */
+#define	ZED_VAR_PREFIX		"ZED_"
+
+/*
+ * String prefix for ZFS event names passed via environment variables.
+ */
+#define	ZEVENT_VAR_PREFIX	"ZEVENT_"
+
+#endif	/* !ZED_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.c b/sys/contrib/openzfs/cmd/zed/zed_conf.c
new file mode 100644
index 000000000000..52370eb87b29
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_conf.c
@@ -0,0 +1,735 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+/*
+ * Return a new configuration with default values.
+ */
+struct zed_conf *
+zed_conf_create(void)
+{
+	struct zed_conf *conf = calloc(1, sizeof (*conf));
+
+	if (conf == NULL)
+		goto nomem;
+
+	conf->pid_fd = -1;
+	conf->state_fd = -1;		/* opened via zed_conf_open_state() */
+	conf->zevent_fd = -1;		/* opened via zed_event_init() */
+	conf->zfs_hdl = NULL;		/* opened via zed_event_init() */
+	conf->zedlets = NULL;		/* created via zed_conf_scan_dir() */
+	conf->syslog_facility = LOG_DAEMON;
+	conf->min_events = ZED_MIN_EVENTS;
+	conf->max_events = ZED_MAX_EVENTS;
+
+	/* Each default path is a heap copy owned by the configuration. */
+	conf->conf_file = strdup(ZED_CONF_FILE);
+	if (conf->conf_file == NULL)
+		goto nomem;
+	conf->pid_file = strdup(ZED_PID_FILE);
+	if (conf->pid_file == NULL)
+		goto nomem;
+	conf->zedlet_dir = strdup(ZED_ZEDLET_DIR);
+	if (conf->zedlet_dir == NULL)
+		goto nomem;
+	conf->state_file = strdup(ZED_STATE_FILE);
+	if (conf->state_file == NULL)
+		goto nomem;
+
+	return (conf);
+nomem:
+	zed_log_die("Failed to create conf: %s", strerror(errno));
+	return (NULL);
+}
+
+/*
+ * Destroy the configuration [zcp].
+ *
+ * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini().
+ */
+void
+zed_conf_destroy(struct zed_conf *zcp)
+{
+	if (!zcp)
+		return;
+
+	if (zcp->state_fd >= 0) {
+		if (close(zcp->state_fd) < 0)
+			zed_log_msg(LOG_WARNING,
+			    "Failed to close state file \"%s\": %s",
+			    zcp->state_file, strerror(errno));
+		zcp->state_fd = -1;
+	}
+	if (zcp->pid_file) {
+		if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT))
+			zed_log_msg(LOG_WARNING,
+			    "Failed to remove PID file \"%s\": %s",
+			    zcp->pid_file, strerror(errno));
+	}
+	if (zcp->pid_fd >= 0) {
+		if (close(zcp->pid_fd) < 0)
+			zed_log_msg(LOG_WARNING,
+			    "Failed to close PID file \"%s\": %s",
+			    zcp->pid_file, strerror(errno));
+		zcp->pid_fd = -1;
+	}
+	/*
+	 * Release every path string.  They are freed only after the log
+	 * messages above have used them, and free(NULL) is a no-op, so no
+	 * guards are needed.  zcp->path is allocated by the -P option and
+	 * was previously leaked here.
+	 */
+	free(zcp->conf_file);
+	zcp->conf_file = NULL;
+	free(zcp->pid_file);
+	zcp->pid_file = NULL;
+	free(zcp->zedlet_dir);
+	zcp->zedlet_dir = NULL;
+	free(zcp->state_file);
+	zcp->state_file = NULL;
+	free(zcp->path);
+	zcp->path = NULL;
+	if (zcp->zedlets) {
+		zed_strings_destroy(zcp->zedlets);
+		zcp->zedlets = NULL;
+	}
+	free(zcp);
+}
+
+/*
+ * Display command-line help and exit.
+ *
+ * If [got_err] is 0, output to stdout and exit normally;
+ * otherwise, output to stderr and exit with a failure status.
+ */
+static void
+_zed_conf_display_help(const char *prog, int got_err)
+{
+	FILE *fp = got_err ? stderr : stdout;
+	int w1 = 4;			/* width of leading whitespace */
+	int w2 = 8;			/* width of L-justified option field */
+	/* "%*c" indents by w1; "%*s" with a negative width left-justifies. */
+	fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed"));
+	fprintf(fp, "\n");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h",
+	    "Display help.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L",
+	    "Display license information.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V",
+	    "Display version information.");
+	fprintf(fp, "\n");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v",
+	    "Be verbose.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f",
+	    "Force daemon to run.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F",
+	    "Run daemon in the foreground.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I",
+	    "Idle daemon until kernel module is (re)loaded.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M",
+	    "Lock all pages in memory.");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P",
+	    "$PATH for ZED to use (only used by ZTS).");
+	fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z",
+	    "Zero state file.");
+	fprintf(fp, "\n");
+#if 0	/* -c is accepted, but config file parsing is not implemented */
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE",
+	    "Read configuration from FILE.", ZED_CONF_FILE);
+#endif
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR",
+	    "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR);
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE",
+	    "Write daemon's PID to FILE.", ZED_PID_FILE);
+	fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE",
+	    "Write daemon's state to FILE.", ZED_STATE_FILE);
+	fprintf(fp, "\n");
+	/* The process ends here; callers never regain control. */
+	exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+
+/*
+ * Display license information to stdout and exit.
+ */
+static void
+_zed_conf_display_license(void)
+{
+	/* License blurb, one array element per line of output. */
+	static const char * const license[] = {
+	    "The ZFS Event Daemon (ZED) is distributed under the terms of the",
+	    "  Common Development and Distribution License (CDDL-1.0)",
+	    "  <http://opensource.org/licenses/CDDL-1.0>.",
+	    "",
+	    "Developed at Lawrence Livermore National Laboratory"
+	    " (LLNL-CODE-403049).",
+	    "",
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof (license) / sizeof (license[0]); i++)
+		(void) puts(license[i]);
+
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Display version information to stdout and exit.
+ */
+static void
+_zed_conf_display_version(void)
+{
+	printf("%s-%s-%s\n",
+	    ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE);
+	/* NOTE(review): ZFS_META_* presumably come from the build config. */
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Copy the [path] string to the [resultp] ptr.
+ * If [path] is not an absolute path, prefix it with the current working dir.
+ * If [resultp] is non-null, free its existing string before assignment.
+ */
+static void
+_zed_conf_parse_path(char **resultp, const char *path)
+{
+	char buf[PATH_MAX];
+
+	assert(resultp != NULL);
+	assert(path != NULL);
+	/* Drop the old value; it is replaced unless zed_log_die() is hit. */
+	if (*resultp)
+		free(*resultp);
+	/* Relative paths become "<cwd>/<path>"; overflow is ENAMETOOLONG. */
+	if (path[0] == '/') {
+		*resultp = strdup(path);
+	} else if (!getcwd(buf, sizeof (buf))) {
+		zed_log_die("Failed to get current working dir: %s",
+		    strerror(errno));
+	} else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) {
+		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+	} else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) {
+		zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+	} else {
+		*resultp = strdup(buf);
+	}
+	if (!*resultp)
+		zed_log_die("Failed to copy path: %s", strerror(ENOMEM));
+}
+
+/*
+ * Parse the command-line options into the configuration [zcp].
+ */
+void
+zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
+{
+	const char * const opts = ":hLVc:d:p:P:s:vfFMZI";
+	int opt;
+	/* The leading ':' makes getopt() return ':' for a missing argument. */
+	if (!zcp || !argv || !argv[0])
+		zed_log_die("Failed to parse options: Internal error");
+
+	opterr = 0;			/* suppress default getopt err msgs */
+	while ((opt = getopt(argc, argv, opts)) != -1) {
+		switch (opt) {
+		case 'h':
+			_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+			break;
+		case 'L':
+			_zed_conf_display_license();
+			break;
+		case 'V':
+			_zed_conf_display_version();
+			break;
+		case 'c':
+			_zed_conf_parse_path(&zcp->conf_file, optarg);
+			break;
+		case 'd':
+			_zed_conf_parse_path(&zcp->zedlet_dir, optarg);
+			break;
+		case 'I':
+			zcp->do_idle = 1;
+			break;
+		case 'p':
+			_zed_conf_parse_path(&zcp->pid_file, optarg);
+			break;
+		case 'P':
+			_zed_conf_parse_path(&zcp->path, optarg);
+			break;
+		case 's':
+			_zed_conf_parse_path(&zcp->state_file, optarg);
+			break;
+		case 'v':
+			zcp->do_verbose = 1;
+			break;
+		case 'f':
+			zcp->do_force = 1;
+			break;
+		case 'F':
+			zcp->do_foreground = 1;
+			break;
+		case 'M':
+			zcp->do_memlock = 1;
+			break;
+		case 'Z':
+			zcp->do_zero = 1;
+			break;
+		case ':':
+		case '?':
+		default:
+			if (optopt == '?')
+				_zed_conf_display_help(argv[0], EXIT_SUCCESS);
+			fprintf(stderr, "%s: %s '-%c'\n\n", argv[0],
+			    (opt == ':') ? "Missing argument for option" :
+			    "Invalid option", optopt);
+			_zed_conf_display_help(argv[0], EXIT_FAILURE);
+			break;
+		}
+	}
+}
+
+/*
+ * Parse the configuration file into the configuration [zcp].
+ * For now this only rejects a NULL [zcp]; zcp->conf_file is never read.
+ * FIXME: Not yet implemented.
+ */
+void
+zed_conf_parse_file(struct zed_conf *zcp)
+{
+	if (!zcp)
+		zed_log_die("Failed to parse config: %s", strerror(EINVAL));
+}
+
+/*
+ * Scan the [zcp] zedlet_dir for files to exec based on the event class.
+ * Files must be executable by user, but not writable by group or other.
+ * Dotfiles are ignored.
+ *
+ * Return 0 on success with an updated set of zedlets,
+ * or -1 on error with errno set.
+ *
+ * FIXME: Check if zedlet_dir and all parent dirs are secure.
+ */
+int
+zed_conf_scan_dir(struct zed_conf *zcp)
+{
+	zed_strings_t *zedlets;
+	DIR *dirp;
+	struct dirent *direntp;
+	char pathname[PATH_MAX];
+	struct stat st;
+	int n;
+
+	if (!zcp) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	zedlets = zed_strings_create();
+	if (!zedlets) {
+		errno = ENOMEM;
+		zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		return (-1);
+	}
+	dirp = opendir(zcp->zedlet_dir);
+	if (!dirp) {
+		int errno_bak = errno;
+		zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		zed_strings_destroy(zedlets);
+		errno = errno_bak;
+		return (-1);
+	}
+	while ((direntp = readdir(dirp))) {
+		if (direntp->d_name[0] == '.')
+			continue;
+		/* Build "<zedlet_dir>/<name>" so the entry can be stat()ed. */
+		n = snprintf(pathname, sizeof (pathname),
+		    "%s/%s", zcp->zedlet_dir, direntp->d_name);
+		if ((n < 0) || (n >= sizeof (pathname))) {
+			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+			    direntp->d_name, strerror(ENAMETOOLONG));
+			continue;
+		}
+		if (stat(pathname, &st) < 0) {	/* stat() follows symlinks */
+			zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+			    pathname, strerror(errno));
+			continue;
+		}
+		if (!S_ISREG(st.st_mode)) {
+			zed_log_msg(LOG_INFO,
+			    "Ignoring \"%s\": not a regular file",
+			    direntp->d_name);
+			continue;
+		}
+		if ((st.st_uid != 0) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": not owned by root",
+			    direntp->d_name);
+			continue;
+		}
+		if (!(st.st_mode & S_IXUSR)) {	/* not waived by do_force */
+			zed_log_msg(LOG_INFO,
+			    "Ignoring \"%s\": not executable by user",
+			    direntp->d_name);
+			continue;
+		}
+		if ((st.st_mode & S_IWGRP) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": writable by group",
+			    direntp->d_name);
+			continue;
+		}
+		if ((st.st_mode & S_IWOTH) && !zcp->do_force) {
+			zed_log_msg(LOG_NOTICE,
+			    "Ignoring \"%s\": writable by other",
+			    direntp->d_name);
+			continue;
+		}
+		if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to register \"%s\": %s",
+			    direntp->d_name, strerror(errno));
+			continue;
+		}
+		if (zcp->do_verbose)
+			zed_log_msg(LOG_INFO,
+			    "Registered zedlet \"%s\"", direntp->d_name);
+	}
+	if (closedir(dirp) < 0) {
+		int errno_bak = errno;
+		zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s",
+		    zcp->zedlet_dir, strerror(errno));
+		zed_strings_destroy(zedlets);
+		errno = errno_bak;
+		return (-1);
+	}
+	if (zcp->zedlets)
+		zed_strings_destroy(zcp->zedlets);
+	/* Install the new set only now that the whole scan has succeeded. */
+	zcp->zedlets = zedlets;
+	return (0);
+}
+
+/*
+ * Create (if needed), lock, and write the PID file specified in [zcp].
+ * Return 0 on success with pid_fd left open, or -1 on error.
+ *
+ * This must be called after fork()ing to become a daemon (so the correct PID
+ * is recorded), but before daemonization is complete and the parent process
+ * exits (for synchronization with systemd).
+ */
+int
+zed_conf_write_pid(struct zed_conf *zcp)
+{
+	const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+	const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+	char buf[PATH_MAX];	/* first the directory path, later the PID text */
+	int n;
+	char *p;
+	mode_t mask;
+	int rv;
+	/* Argument errors return directly: [zcp] may be NULL, so skip "err". */
+	if (!zcp || !zcp->pid_file) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	assert(zcp->pid_fd == -1);	/* no PID file may be open yet */
+	/*
+	 * Create PID file directory if needed (pid_file minus its last part).
+	 */
+	n = strlcpy(buf, zcp->pid_file, sizeof (buf));
+	if (n >= sizeof (buf)) {
+		errno = ENAMETOOLONG;
+		zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+		    strerror(errno));
+		goto err;
+	}
+	p = strrchr(buf, '/');
+	if (p)
+		*p = '\0';
+	/* EEXIST is tolerated: the directory may already be there. */
+	if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) {
+		zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s",
+		    buf, strerror(errno));
+		goto err;
+	}
+	/*
+	 * Obtain PID file lock.  The umask is tightened by 022 around open().
+	 */
+	mask = umask(0);	/* umask() returns the previous mask */
+	umask(mask | 022);
+	zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode);
+	umask(mask);		/* restore the caller's mask */
+	if (zcp->pid_fd < 0) {
+		zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+		goto err;
+	}
+	rv = zed_file_lock(zcp->pid_fd);
+	if (rv < 0) {
+		zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+		goto err;
+	} else if (rv > 0) {	/* presumably: lock held elsewhere; confirm */
+		pid_t pid = zed_file_is_locked(zcp->pid_fd);
+		if (pid < 0) {
+			zed_log_msg(LOG_ERR,
+			    "Failed to test lock on PID file \"%s\"",
+			    zcp->pid_file);
+		} else if (pid > 0) {
+			zed_log_msg(LOG_ERR,
+			    "Found PID %d bound to PID file \"%s\"",
+			    pid, zcp->pid_file);
+		} else {
+			zed_log_msg(LOG_ERR,
+			    "Inconsistent lock state on PID file \"%s\"",
+			    zcp->pid_file);
+		}
+		goto err;
+	}
+	/*
+	 * Write PID file (buf is reused for the decimal PID plus a newline).
+	 */
+	n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid());
+	if ((n < 0) || (n >= sizeof (buf))) {
+		errno = ERANGE;
+		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) {
+		zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else if (fdatasync(zcp->pid_fd) < 0) {
+		zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s",
+		    zcp->pid_file, strerror(errno));
+	} else {
+		return (0);	/* success: pid_fd stays open */
+	}
+	/* Every failure after argument validation funnels through here. */
+err:
+	if (zcp->pid_fd >= 0) {
+		(void) close(zcp->pid_fd);
+		zcp->pid_fd = -1;
+	}
+	return (-1);
+}
+
+/*
+ * (Re)open and lock the [zcp] state_file, creating its directory if needed.
+ * A previously opened state_fd is closed first; with do_zero set the old
+ * file is unlinked before the open.  Return 0 on success, -1 on error.
+ * FIXME: Move state information into kernel.
+ */
+int
+zed_conf_open_state(struct zed_conf *zcp)
+{
+	const mode_t file_perms = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+	mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+	char statedir[PATH_MAX];
+	char *last_slash;
+	int copied;
+	int lock_rv;
+
+	if ((zcp == NULL) || (zcp->state_file == NULL)) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to open state file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	copied = strlcpy(statedir, zcp->state_file, sizeof (statedir));
+	if (copied >= sizeof (statedir)) {
+		errno = ENAMETOOLONG;
+		zed_log_msg(LOG_WARNING, "Failed to open state file: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	last_slash = strrchr(statedir, '/');
+	if (last_slash != NULL)
+		*last_slash = '\0';
+
+	if ((mkdirp(statedir, dirmode) < 0) && (errno != EEXIST)) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to create directory \"%s\": %s",
+		    statedir, strerror(errno));
+		return (-1);
+	}
+	/* Drop any descriptor left over from an earlier call. */
+	if ((zcp->state_fd >= 0) && (close(zcp->state_fd) < 0)) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to close state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	if (zcp->do_zero)
+		(void) unlink(zcp->state_file);
+
+	zcp->state_fd = open(zcp->state_file, O_RDWR | O_CREAT, file_perms);
+	if (zcp->state_fd < 0) {
+		zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	lock_rv = zed_file_lock(zcp->state_fd);
+	if (lock_rv == 0)
+		return (0);
+
+	if (lock_rv < 0) {
+		zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+	} else {
+		/* lock_rv > 0: query the lock to report who is bound to it */
+		pid_t holder = zed_file_is_locked(zcp->state_fd);
+		if (holder > 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Found PID %d bound to state file \"%s\"",
+			    holder, zcp->state_file);
+		} else if (holder < 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to test lock on state file \"%s\"",
+			    zcp->state_file);
+		} else {
+			zed_log_msg(LOG_WARNING,
+			    "Inconsistent lock state on state file \"%s\"",
+			    zcp->state_file);
+		}
+	}
+	return (-1);
+}
+
+/*
+ * Read the eid & etime of the last processed event from the start of the
+ * opened [zcp] state_file into [eidp] & [etime] (an array of size 2).  An
+ * empty file yields *eidp = 0 and leaves etime[] untouched; a short read
+ * fails with EIO.  Return 0 on success, -1 on error.
+ */
+int
+zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[])
+{
+	ssize_t len;
+	struct iovec iov[3];
+	ssize_t n;
+	/* All three arguments are required. */
+	if (!zcp || !eidp || !etime) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR,
+		    "Failed to read state file: %s", strerror(errno));
+		return (-1);
+	}
+	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to reposition state file offset: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	len = 0;
+	iov[0].iov_base = eidp;
+	len += iov[0].iov_len = sizeof (*eidp);
+	iov[1].iov_base = &etime[0];
+	len += iov[1].iov_len = sizeof (etime[0]);
+	iov[2].iov_base = &etime[1];
+	len += iov[2].iov_len = sizeof (etime[1]);
+	/* One readv() fills eid, etime[0], etime[1] in file order. */
+	n = readv(zcp->state_fd, iov, 3);
+	if (n == 0) {
+		*eidp = 0;	/* empty state file: no event processed yet */
+	} else if (n < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to read state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	} else if (n != len) {
+		errno = EIO;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to read state file \"%s\": Read %zd of %zd bytes",
+		    zcp->state_file, n, len);	/* ssize_t needs %zd, not %d */
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Write the [eid] & [etime] of the last processed event to the start of the
+ * opened [zcp] state_file and sync it.  Note that etime[] is an array of
+ * size 2.  Return 0 on success, -1 on error (a short write fails with EIO).
+ */
+int
+zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[])
+{
+	ssize_t len;
+	struct iovec iov[3];
+	ssize_t n;
+	/* [etime] is dereferenced below, so reject NULL like the reader does. */
+	if (!zcp || !etime) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR,
+		    "Failed to write state file: %s", strerror(errno));
+		return (-1);
+	}
+	if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to reposition state file offset: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	len = 0;
+	iov[0].iov_base = &eid;
+	len += iov[0].iov_len = sizeof (eid);
+	iov[1].iov_base = &etime[0];
+	len += iov[1].iov_len = sizeof (etime[0]);
+	iov[2].iov_base = &etime[1];
+	len += iov[2].iov_len = sizeof (etime[1]);
+	/* One writev() stores eid, etime[0], etime[1] back to back. */
+	n = writev(zcp->state_fd, iov, 3);
+	if (n < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to write state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	if (n != len) {
+		errno = EIO;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to write state file \"%s\": Wrote %zd of %zd bytes",
+		    zcp->state_file, n, len);	/* ssize_t needs %zd, not %d */
+		return (-1);
+	}
+	if (fdatasync(zcp->state_fd) < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to sync state file \"%s\": %s",
+		    zcp->state_file, strerror(errno));
+		return (-1);
+	}
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.h b/sys/contrib/openzfs/cmd/zed/zed_conf.h
new file mode 100644
index 000000000000..424cb2c01c8c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_conf.h
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_CONF_H
+#define	ZED_CONF_H
+
+#include <libzfs.h>
+#include <stdint.h>
+#include "zed_strings.h"
+
+struct zed_conf {
+	unsigned	do_force:1;		/* true if force enabled */
+	unsigned	do_foreground:1;	/* true if run in foreground */
+	unsigned	do_memlock:1;		/* true if locking memory */
+	unsigned	do_verbose:1;		/* true if verbosity enabled */
+	unsigned	do_zero:1;		/* true if zeroing state file */
+	unsigned	do_idle:1;		/* true if init may fail (-1) */
+	int		syslog_facility;	/* syslog facility value */
+	int		min_events;		/* RESERVED FOR FUTURE USE */
+	int		max_events;		/* RESERVED FOR FUTURE USE */
+	char		*conf_file;		/* abs path to config file */
+	char		*pid_file;		/* abs path to pid file */
+	int		pid_fd;			/* fd to locked pid file or -1 */
+	char		*zedlet_dir;		/* abs path to zedlet dir */
+	zed_strings_t	*zedlets;		/* names of enabled zedlets */
+	char		*state_file;		/* abs path to state file */
+	int		state_fd;		/* fd to state file, <0 if none */
+	libzfs_handle_t	*zfs_hdl;		/* libzfs handle, NULL if none */
+	int		zevent_fd;		/* fd to ZFS_DEV for zevents */
+	char		*path;		/* custom $PATH for zedlets to use */
+};
+
+struct zed_conf *zed_conf_create(void);
+
+void zed_conf_destroy(struct zed_conf *zcp);
+
+void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv);
+
+void zed_conf_parse_file(struct zed_conf *zcp);
+
+int zed_conf_scan_dir(struct zed_conf *zcp);
+
+int zed_conf_write_pid(struct zed_conf *zcp);
+
+int zed_conf_open_state(struct zed_conf *zcp);
+
+int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]);
+
+int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]);
+
+#endif	/* !ZED_CONF_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.c b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
new file mode 100644
index 000000000000..174d24523253
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ */
+
+#ifdef HAVE_LIBUDEV
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libudev.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+
+#include "zed_log.h"
+#include "zed_disk_event.h"
+#include "agents/zfs_agents.h"
+
+/*
+ * Portions of ZED need to see disk events for disks belonging to ZFS pools.
+ * A libudev monitor is established to monitor block device actions and pass
+ * them on to internal ZED logic modules.  Initially, zfs_mod.c is the only
+ * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
+ * module responsible for handling disk events for ZFS.
+ */
+
+pthread_t g_mon_tid;
+struct udev *g_udev;
+struct udev_monitor *g_mon;
+
+
+#define	DEV_BYID_PATH	"/dev/disk/by-id/"
+
+/* 64MB is minimum usable disk for ZFS */
+#define	MINIMUM_SECTORS		131072
+
+
+/*
+ * Log the contents of a disk event, then post it to the SLM module
+ * through zfs_agent_post_event().
+ * Occurs in the context of the monitor thread.
+ */
+static void
+zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	const char *str_names[] = {	/* string pairs, in logging order */
+		DEV_NAME, DEV_PATH, DEV_IDENTIFIER, DEV_PHYS_PATH
+	};
+	const char *num_names[] = {	/* uint64 pairs, in logging order */
+		DEV_SIZE, ZFS_EV_POOL_GUID, ZFS_EV_VDEV_GUID
+	};
+	char *str;
+	uint64_t num;
+	size_t i;
+
+	zed_log_msg(LOG_INFO, "zed_disk_event:");
+	zed_log_msg(LOG_INFO, "\tclass: %s", class);
+	zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
+	for (i = 0; i < sizeof (str_names) / sizeof (str_names[0]); i++) {
+		if (nvlist_lookup_string(nvl, str_names[i], &str) == 0)
+			zed_log_msg(LOG_INFO, "\t%s: %s", str_names[i], str);
+	}
+	for (i = 0; i < sizeof (num_names) / sizeof (num_names[0]); i++) {
+		if (nvlist_lookup_uint64(nvl, num_names[i], &num) == 0)
+			zed_log_msg(LOG_INFO, "\t%s: %llu", num_names[i], num);
+	}
+	(void) zfs_agent_post_event(class, subclass, nvl);
+}
+
+/*
+ * dev_event_nvlist: build the nv pair list describing a udev device event
+ *
+ * NAME			VALUE (example)
+ * --------------	--------------------------------------------------------
+ * DEV_NAME		/dev/sdl
+ * DEV_PATH		/devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
+ * DEV_IDENTIFIER	ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
+ * DEV_PHYS_PATH	pci-0000:04:00.0-sas-0x4433221101000000-lun-0
+ * DEV_IS_PART		---
+ * DEV_SIZE		500107862016
+ * ZFS_EV_POOL_GUID	17523635698032189180
+ * ZFS_EV_VDEV_GUID	14663607734290803088
+ */
+static nvlist_t *
+dev_event_nvlist(struct udev_device *dev)
+{
+	nvlist_t *event;
+	const char *prop;
+	char idbuf[128];
+	uint64_t guid;
+
+	if (nvlist_alloc(&event, NV_UNIQUE_NAME, 0) != 0)
+		return (NULL);
+
+	if (zfs_device_get_devid(dev, idbuf, sizeof (idbuf)) == 0)
+		(void) nvlist_add_string(event, DEV_IDENTIFIER, idbuf);
+	if (zfs_device_get_physical(dev, idbuf, sizeof (idbuf)) == 0)
+		(void) nvlist_add_string(event, DEV_PHYS_PATH, idbuf);
+	if ((prop = udev_device_get_devnode(dev)) != NULL)
+		(void) nvlist_add_string(event, DEV_NAME, prop);
+	if ((prop = udev_device_get_devpath(dev)) != NULL)
+		(void) nvlist_add_string(event, DEV_PATH, prop);
+
+	/* a partition by devtype, or anything carrying a partition number */
+	prop = udev_device_get_devtype(dev);
+	if ((prop == NULL) || (strcmp("partition", prop) != 0)) {
+		prop = udev_device_get_property_value(dev,
+		    "ID_PART_ENTRY_NUMBER");
+	}
+	if (prop != NULL)
+		(void) nvlist_add_boolean(event, DEV_IS_PART);
+
+	prop = udev_device_get_sysattr_value(dev, "size");
+	if (prop != NULL) {
+		uint64_t scaled = strtoull(prop, NULL, 10) * (uint64_t)DEV_BSIZE;
+		(void) nvlist_add_uint64(event, DEV_SIZE, scaled);
+	}
+
+	/*
+	 * Grab the pool and vdev guids from blkid cache
+	 */
+	prop = udev_device_get_property_value(dev, "ID_FS_UUID");
+	if ((prop != NULL) && ((guid = strtoull(prop, NULL, 10)) != 0))
+		(void) nvlist_add_uint64(event, ZFS_EV_POOL_GUID, guid);
+
+	prop = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
+	if ((prop != NULL) && ((guid = strtoull(prop, NULL, 10)) != 0))
+		(void) nvlist_add_uint64(event, ZFS_EV_VDEV_GUID, guid);
+
+	/* Either a vdev guid or a devid must be present for matching */
+	if (nvlist_exists(event, DEV_IDENTIFIER) ||
+	    nvlist_exists(event, ZFS_EV_VDEV_GUID))
+		return (event);
+
+	nvlist_free(event);
+	return (NULL);
+}
+
+/*
+ * Thread body: receive block device uevents, filter, and post to ZFS agents
+ */
+static void *
+zed_udev_monitor(void *arg)
+{
+	struct udev_monitor *mon = arg;
+	char *tmp, *tmp2;
+	/* NOTE: the loop below has no exit; see the cancel-state calls in it */
+	zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
+
+	while (1) {
+		struct udev_device *dev;
+		const char *action, *type, *part, *sectors;
+		const char *bus, *uuid;
+		const char *class, *subclass;
+		nvlist_t *nvl;
+		boolean_t is_zfs = B_FALSE;
+
+		/* allow a cancellation while blocked (recvmsg) */
+		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+
+		/* blocks at recvmsg until an event occurs */
+		if ((dev = udev_monitor_receive_device(mon)) == NULL) {
+			zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
+			    "device error %d", errno);
+			continue;
+		}
+
+		/* allow all steps to complete before a cancellation */
+		pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+
+		/*
+		 * Strongly typed device is the preferred filter
+		 */
+		type = udev_device_get_property_value(dev, "ID_FS_TYPE");
+		if (type != NULL && type[0] != '\0') {
+			if (strcmp(type, "zfs_member") == 0) {
+				is_zfs = B_TRUE;
+			} else {
+				/* not ours, so skip */
+				zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
+				    "%s (in use by %s)",
+				    udev_device_get_devnode(dev), type);
+				udev_device_unref(dev);
+				continue;
+			}
+		}
+
+		/*
+		 * if this is a disk and it is partitioned, then the
+		 * zfs label will reside in a DEVTYPE=partition and
+		 * we can skip passing this event
+		 */
+		type = udev_device_get_property_value(dev, "DEVTYPE");
+		part = udev_device_get_property_value(dev,
+		    "ID_PART_TABLE_TYPE");
+		if (type != NULL && type[0] != '\0' &&
+		    strcmp(type, "disk") == 0 &&
+		    part != NULL && part[0] != '\0') {
+			/* skip and wait for partition event */
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * ignore small partitions (fewer than MINIMUM_SECTORS sectors)
+		 */
+		sectors = udev_device_get_property_value(dev,
+		    "ID_PART_ENTRY_SIZE");
+		if (sectors == NULL)
+			sectors = udev_device_get_sysattr_value(dev, "size");
+		if (sectors != NULL &&
+		    strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * If the blkid probe didn't find ZFS, then a persistent
+		 * device id string is required in the message schema
+		 * for matching with vdevs. Preflight here for expected
+		 * udev information.
+		 */
+		bus = udev_device_get_property_value(dev, "ID_BUS");
+		uuid = udev_device_get_property_value(dev, "DM_UUID");
+		if (!is_zfs && (bus == NULL && uuid == NULL)) {
+			zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
+			    "source", udev_device_get_devnode(dev));
+			udev_device_unref(dev);
+			continue;
+		}
+		/* NOTE(review): action is not NULL-checked before strcmp() */
+		action = udev_device_get_action(dev);
+		if (strcmp(action, "add") == 0) {
+			class = EC_DEV_ADD;
+			subclass = ESC_DISK;
+		} else if (strcmp(action, "remove") == 0) {
+			class = EC_DEV_REMOVE;
+			subclass = ESC_DISK;
+		} else if (strcmp(action, "change") == 0) {
+			class = EC_DEV_STATUS;
+			subclass = ESC_DEV_DLE;
+		} else {
+			zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
+			    action);
+			udev_device_unref(dev);
+			continue;
+		}
+
+		/*
+		 * Special case an EC_DEV_ADD for multipath devices
+		 *
+		 * When a multipath device is created, udev reports the
+		 * following:
+		 *
+		 * 1.	"add" event of the dm device for the multipath device
+		 *	(like /dev/dm-3).
+		 * 2.	"change" event to create the actual multipath device
+		 *	symlink (like /dev/mapper/mpatha).  The event also
+		 *	passes back the relevant DM vars we care about, like
+		 *	DM_UUID.
+		 * 3.	Another "change" event identical to #2 (that we ignore).
+		 *
+		 * To get the behavior we want, we treat the "change" event
+		 * in #2 as a "add" event; as if "/dev/mapper/mpatha" was
+		 * a new disk being added.
+		 */
+		if (strcmp(class, EC_DEV_STATUS) == 0 &&
+		    udev_device_get_property_value(dev, "DM_UUID") &&
+		    udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
+			tmp = (char *)udev_device_get_devnode(dev);
+			tmp2 = zfs_get_underlying_path(tmp);
+			if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
+				/*
+				 * We have a real underlying device, which
+				 * means that this multipath "change" event is
+				 * an "add" event.
+				 *
+				 * If the multipath device and the underlying
+				 * dev are the same name (i.e. /dev/dm-5), then
+				 * there is no real underlying disk for this
+				 * multipath device, and so this "change" event
+				 * really is a multipath removal.
+				 */
+				class = EC_DEV_ADD;
+				subclass = ESC_DISK;
+			} else {
+				tmp = (char *)
+				    udev_device_get_property_value(dev,
+				    "DM_NR_VALID_PATHS");
+				/* treat as a multipath remove */
+				if (tmp != NULL && strcmp(tmp, "0") == 0) {
+					class = EC_DEV_REMOVE;
+					subclass = ESC_DISK;
+				}
+			}
+			free(tmp2);
+		}
+
+		/*
+		 * Special case an EC_DEV_ADD for scsi_debug devices
+		 *
+		 * These devices require a udevadm trigger command after
+		 * creation in order to register the vdev_id scsidebug alias
+		 * rule (adds a persistent path (phys_path) used for fault
+		 * management automated tests in the ZFS test suite).
+		 *
+		 * After udevadm trigger command, event registers as a "change"
+		 * event but needs to instead be handled as another "add" event
+		 * to allow for disk labeling and partitioning to occur.
+		 */
+		if (strcmp(class, EC_DEV_STATUS) == 0 &&
+		    udev_device_get_property_value(dev, "ID_VDEV") &&
+		    udev_device_get_property_value(dev, "ID_MODEL")) {
+			const char *id_model, *id_model_sd = "scsi_debug";
+
+			id_model = udev_device_get_property_value(dev,
+			    "ID_MODEL");
+			if (strcmp(id_model, id_model_sd) == 0) {
+				class = EC_DEV_ADD;
+				subclass = ESC_DISK;
+			}
+		}
+
+		if ((nvl = dev_event_nvlist(dev)) != NULL) {
+			zed_udev_event(class, subclass, nvl);
+			nvlist_free(nvl);
+		}
+
+		udev_device_unref(dev);
+	}
+
+	return (NULL);	/* not reached: the loop above never breaks */
+}
+
+int
+zed_disk_event_init(void)
+{
+	int fd, fflags;
+
+	/* Returns 0 once the monitor thread runs, or -1 with setup undone. */
+	if ((g_udev = udev_new()) == NULL) {
+		zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
+		return (-1);
+	}
+	/* Set up a udev monitor for block devices; it may fail with NULL */
+	if ((g_mon = udev_monitor_new_from_netlink(g_udev, "udev")) == NULL) {
+		zed_log_msg(LOG_WARNING, "udev_monitor_new_from_netlink failed");
+		udev_unref(g_udev);
+		return (-1);
+	}
+	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
+	udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
+	    "partition");
+	udev_monitor_enable_receiving(g_mon);
+	/* Make sure monitoring socket is blocking; skip if F_GETFL fails */
+	fd = udev_monitor_get_fd(g_mon);
+	if ((fflags = fcntl(fd, F_GETFL)) != -1 && (fflags & O_NONBLOCK))
+		(void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
+	/* spawn a thread to monitor events */
+	if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
+		udev_monitor_unref(g_mon);
+		udev_unref(g_udev);
+		zed_log_msg(LOG_WARNING, "pthread_create failed");
+		return (-1);
+	}
+	zed_log_msg(LOG_INFO, "zed_disk_event_init");
+	return (0);
+}
+
+void
+zed_disk_event_fini(void)
+{
+	/* stop the monitor thread: cancel it at recvmsg(), then reap it */
+	(void) pthread_cancel(g_mon_tid);
+	(void) pthread_join(g_mon_tid, NULL);
+
+	/* release the udev monitor, then the udev context */
+	udev_monitor_unref(g_mon);
+	udev_unref(g_udev);
+
+	zed_log_msg(LOG_INFO, "zed_disk_event_fini");
+}
+
+#else
+
+#include "zed_disk_event.h"
+
+int
+zed_disk_event_init(void)
+{
+	return (0);	/* built without libudev: nothing to start */
+}
+
+void
+zed_disk_event_fini(void)
+{	/* built without libudev: nothing to stop */
+}
+
+#endif /* HAVE_LIBUDEV */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.h b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h
new file mode 100644
index 000000000000..ea9813d0a595
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h
@@ -0,0 +1,31 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef	ZED_DISK_EVENT_H
+#define	ZED_DISK_EVENT_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern int zed_disk_event_init(void);
+extern void zed_disk_event_fini(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* !ZED_DISK_EVENT_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c
new file mode 100644
index 000000000000..1c5d00e297ff
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_event.c
@@ -0,0 +1,965 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libzfs.h>			/* FIXME: Replace with libzfs_core. */
+#include <paths.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/zfs_ioctl.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/fm/fs/zfs.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_disk_event.h"
+#include "zed_event.h"
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#include "agents/zfs_agents.h"
+
+#define	MAXBUF	4096
+
+/*
+ * Open libzfs and ZFS_DEV, then start the ZFS agents and disk event monitor.
+ * A failed step calls zed_log_die() unless do_idle is set (then returns -1).
+ */
+int
+zed_event_init(struct zed_conf *zcp)
+{
+	if (zcp == NULL)
+		zed_log_die("Failed zed_event_init: %s", strerror(EINVAL));
+
+	if ((zcp->zfs_hdl = libzfs_init()) == NULL) {
+		if (zcp->do_idle)
+			return (-1);
+		zed_log_die("Failed to initialize libzfs");
+	}
+
+	if ((zcp->zevent_fd = open(ZFS_DEV, O_RDWR)) < 0) {
+		if (zcp->do_idle)
+			return (-1);
+		zed_log_die("Failed to open \"%s\": %s",
+		    ZFS_DEV, strerror(errno));
+	}
+
+	/* the agents are handed the libzfs handle opened above */
+	zfs_agent_init(zcp->zfs_hdl);
+
+	if (zed_disk_event_init() != 0) {
+		if (zcp->do_idle)
+			return (-1);
+		zed_log_die("Failed to initialize disk events");
+	}
+
+	return (0);
+}
+
+/*
+ * Close the libzfs interface (stops the disk monitor and agents first).
+ */
+void
+zed_event_fini(struct zed_conf *zcp)
+{
+	if (zcp == NULL)
+		zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL));
+
+	zed_disk_event_fini();
+	zfs_agent_fini();
+
+	if (zcp->zevent_fd >= 0) {
+		if (close(zcp->zevent_fd) < 0) {
+			zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s",
+			    ZFS_DEV, strerror(errno));
+		}
+		zcp->zevent_fd = -1;
+	}
+	if (zcp->zfs_hdl != NULL) {
+		libzfs_fini(zcp->zfs_hdl);
+		zcp->zfs_hdl = NULL;
+	}
+}
+
+/*
+ * Seek to the event specified by [saved_eid] and [saved_etime].
+ * This protects against processing a given event more than once.
+ * Return 0 upon a successful seek to the specified event, or -1 otherwise.
+ *
+ * A zevent is considered to be uniquely specified by its (eid,time) tuple.
+ * The unsigned 64b eid is set to 1 when the kernel module is loaded, and
+ * incremented by 1 for each new event.  Since the state file can persist
+ * across a kernel module reload, the time must be checked to ensure a match.
+ */
+int
+zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[])
+{
+	uint64_t eid;
+	int found;
+	nvlist_t *nvl;
+	int n_dropped;
+	int64_t *etime;
+	uint_t nelem;
+	int rv;
+
+	if (!zcp) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to seek zevent: %s",
+		    strerror(errno));
+		return (-1);
+	}
+	eid = 0;
+	found = 0;
+	while ((eid < saved_eid) && !found) {
+		rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped,
+		    ZEVENT_NONBLOCK, zcp->zevent_fd);
+		/* stop on error or when no event was returned */
+		if ((rv != 0) || !nvl)
+			break;
+
+		if (n_dropped > 0) {
+			zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
+			/*
+			 * FIXME: Increase max size of event nvlist in
+			 *   /sys/module/zfs/parameters/zfs_zevent_len_max ?
+			 */
+		}
+		if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
+			zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
+		} else if (nvlist_lookup_int64_array(nvl, "time",
+		    &etime, &nelem) != 0) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to lookup zevent time (eid=%llu)", eid);
+		} else if (nelem != 2) {
+			zed_log_msg(LOG_WARNING,
+			    "Failed to lookup zevent time (eid=%llu, nelem=%u)",
+			    eid, nelem);
+		} else if ((eid != saved_eid) ||
+		    (etime[0] != saved_etime[0]) ||
+		    (etime[1] != saved_etime[1])) {
+			/* no-op */
+		} else {
+			found = 1;
+		}
+		nvlist_free(nvl);	/* free() would leak the list contents */
+	}
+	if (!found && (saved_eid > 0)) {	/* no match: rewind to start */
+		if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START,
+		    zcp->zevent_fd) < 0)
+			zed_log_msg(LOG_WARNING, "Failed to seek to eid=0");
+		else
+			eid = 0;
+	}
+	zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid);
+	return (found ? 0 : -1);
+}
+
+/*
+ * Return 1 if nvpair [name] ends in "_guid" or "_guids" (shown in hex); o/w 0.
+ */
+static int
+_zed_event_value_is_hex(const char *name)
+{
+	const char *hex_suffix[] = {
+		"_guid",
+		"_guids",
+		NULL
+	};
+	const char **pp;
+	size_t nlen, slen;
+
+	if (!name)
+		return (0);
+	nlen = strlen(name);
+	for (pp = hex_suffix; *pp; pp++) {
+		slen = strlen(*pp);	/* compare the tail, not first match */
+		if ((nlen >= slen) && (strcmp(name + nlen - slen, *pp) == 0))
+			return (1);
+	}
+	return (0);
+}
+
+/*
+ * Add an environment variable for [eid] to the container [zsp].
+ *
+ * The variable name is the concatenation of [prefix] and [name] converted to
+ * uppercase with non-alphanumeric characters converted to underscores;
+ * [prefix] is optional, and [name] must begin with an alphabetic character.
+ * If the converted variable name already exists within the container [zsp],
+ * its existing value will be replaced with the new value.
+ *
+ * The variable value is specified by the format string [fmt].
+ *
+ * Returns 0 on success, and -1 on error (with errno set).
+ *
+ * All environment variables in [zsp] should be added through this function.
+ */
+static int
+_zed_event_add_var(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, const char *name, const char *fmt, ...)
+{
+	char keybuf[MAXBUF];
+	char valbuf[MAXBUF];
+	char *dstp;
+	const unsigned char *srcp;	/* unsigned: required by <ctype.h> */
+	const char *lastp;
+	int n;
+	int buflen;
+	va_list vargs;
+
+	assert(zsp != NULL);
+	assert(fmt != NULL);
+
+	if (!name) {
+		errno = EINVAL;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to add variable for eid=%llu: Name is empty", eid);
+		return (-1);
+	} else if (!isalpha((unsigned char)name[0])) {
+		errno = EINVAL;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to add variable for eid=%llu: "
+		    "Name \"%s\" is invalid", eid, name);
+		return (-1);
+	}
+	/*
+	 * Construct the string key by converting PREFIX (if present) and NAME.
+	 */
+	dstp = keybuf;
+	lastp = keybuf + sizeof (keybuf);
+	if (prefix) {
+		srcp = (const unsigned char *)prefix;
+		for (; *srcp && (dstp < lastp); srcp++)
+			*dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';
+	}
+	srcp = (const unsigned char *)name;
+	for (; *srcp && (dstp < lastp); srcp++)
+		*dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';
+	if (dstp == lastp) {	/* no room left for the terminating NUL */
+		errno = ENAMETOOLONG;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to add variable for eid=%llu: Name too long", eid);
+		return (-1);
+	}
+	*dstp = '\0';
+	/*
+	 * Construct the string specified by "[PREFIX][NAME]=[FMT]".
+	 */
+	dstp = valbuf;
+	buflen = sizeof (valbuf);
+	n = strlcpy(dstp, keybuf, buflen);
+	if (n >= sizeof (valbuf)) {
+		errno = EMSGSIZE;
+		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+		    keybuf, eid, "Exceeded buffer size");
+		return (-1);
+	}
+	dstp += n;
+	buflen -= n;
+
+	*dstp++ = '=';
+	buflen--;
+	if (buflen <= 0) {
+		errno = EMSGSIZE;
+		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+		    keybuf, eid, "Exceeded buffer size");
+		return (-1);
+	}
+
+	va_start(vargs, fmt);
+	n = vsnprintf(dstp, buflen, fmt, vargs);
+	va_end(vargs);
+
+	if ((n < 0) || (n >= buflen)) {
+		errno = EMSGSIZE;
+		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+		    keybuf, eid, "Exceeded buffer size");
+		return (-1);
+	} else if (zed_strings_add(zsp, keybuf, valbuf) < 0) {
+		zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+		    keybuf, eid, strerror(errno));
+		return (-1);
+	}
+	return (0);
+}
+
+/* Report an array nvpair too large to stringify; always returns -1. */
+static int
+_zed_event_add_array_err(uint64_t eid, const char *name)
+{
+	errno = EMSGSIZE;
+	zed_log_msg(LOG_WARNING, "Failed to convert nvpair \"%s\" "
+	    "for eid=%llu: Exceeded buffer size", name, eid);
+	return (-1);
+}
+
+/* Add int8 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	int8_t *i8p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_int8_array(nvp, &i8p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%d ", i8p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add uint8 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	uint8_t *u8p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_uint8_array(nvp, &u8p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%u ", u8p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add int16 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	int16_t *i16p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_int16_array(nvp, &i16p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%d ", i16p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add uint16 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	uint16_t *u16p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_uint16_array(nvp, &u16p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%u ", u16p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add int32 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	int32_t *i32p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_int32_array(nvp, &i32p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%d ", i32p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add uint32 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	uint32_t *u32p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_uint32_array(nvp, &u32p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%u ", u32p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add int64 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	int64_t *i64p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_int64_array(nvp, &i64p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%lld ", (longlong_t)i64p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add uint64 array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	const char *fmt;
+	uint64_t *u64p;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY));
+
+	name = nvpair_name(nvp);
+	fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu ";
+	(void) nvpair_value_uint64_array(nvp, &u64p, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]);
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/* Add string array [nvp] as a space-separated list; 0 on success, else -1. */
+static int
+_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp,
+    const char *prefix, nvpair_t *nvp)
+{
+	char buf[MAXBUF] = "";	/* stays empty if the array has no elements */
+	int buflen = sizeof (buf);
+	const char *name;
+	char **strp;
+	uint_t nelem, i;
+	char *p;
+	int n;
+
+	assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY));
+
+	name = nvpair_name(nvp);
+	(void) nvpair_value_string_array(nvp, &strp, &nelem);
+	for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+		n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : "<NULL>");
+		if ((n < 0) || (n >= buflen))
+			return (_zed_event_add_array_err(eid, name));
+		p += n;
+		buflen -= n;
+	}
+	if (nelem > 0)
+		*--p = '\0';	/* overwrite the trailing space */
+
+	return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+/*
+ * Convert the nvpair [nvp] to a string which is added to the environment
+ * of the child process.  Failures are logged by the helpers; nothing is
+ * returned.  Signed values are read into unsigned scratch variables and
+ * cast back so that negative numbers are formatted correctly.
+ * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()?
+ */
+static void
+_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
+{
+	const char *name;
+	data_type_t type;
+	const char *prefix = ZEVENT_VAR_PREFIX;
+	boolean_t b;
+	double d;
+	uint8_t i8;
+	uint16_t i16;
+	uint32_t i32;
+	uint64_t i64;
+	char *str;
+
+	assert(zsp != NULL);
+	assert(nvp != NULL);
+
+	name = nvpair_name(nvp);
+	type = nvpair_type(nvp);
+
+	switch (type) {
+	case DATA_TYPE_BOOLEAN:
+		_zed_event_add_var(eid, zsp, prefix, name, "%s", "1");
+		break;
+	case DATA_TYPE_BOOLEAN_VALUE:
+		(void) nvpair_value_boolean_value(nvp, &b);
+		_zed_event_add_var(eid, zsp, prefix, name, "%s", b ? "1" : "0");
+		break;
+	case DATA_TYPE_BYTE:
+		(void) nvpair_value_byte(nvp, &i8);
+		_zed_event_add_var(eid, zsp, prefix, name, "%d", i8);
+		break;
+	case DATA_TYPE_INT8:
+		(void) nvpair_value_int8(nvp, (int8_t *)&i8);
+		_zed_event_add_var(eid, zsp, prefix, name, "%d", (int8_t)i8);
+		break;
+	case DATA_TYPE_UINT8:
+		(void) nvpair_value_uint8(nvp, &i8);
+		_zed_event_add_var(eid, zsp, prefix, name, "%u", i8);
+		break;
+	case DATA_TYPE_INT16:
+		(void) nvpair_value_int16(nvp, (int16_t *)&i16);
+		_zed_event_add_var(eid, zsp, prefix, name, "%d", (int16_t)i16);
+		break;
+	case DATA_TYPE_UINT16:
+		(void) nvpair_value_uint16(nvp, &i16);
+		_zed_event_add_var(eid, zsp, prefix, name, "%u", i16);
+		break;
+	case DATA_TYPE_INT32:
+		(void) nvpair_value_int32(nvp, (int32_t *)&i32);
+		_zed_event_add_var(eid, zsp, prefix, name, "%d", (int32_t)i32);
+		break;
+	case DATA_TYPE_UINT32:
+		(void) nvpair_value_uint32(nvp, &i32);
+		_zed_event_add_var(eid, zsp, prefix, name, "%u", i32);
+		break;
+	case DATA_TYPE_INT64:
+		(void) nvpair_value_int64(nvp, (int64_t *)&i64);
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%lld", (longlong_t)i64);
+		break;
+	case DATA_TYPE_UINT64:
+		(void) nvpair_value_uint64(nvp, &i64);
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"),
+		    (u_longlong_t)i64);
+		/*
+		 * shadow readable strings for vdev state pairs
+		 */
+		if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 ||
+		    strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) {
+			char alt[32];
+
+			(void) snprintf(alt, sizeof (alt), "%s_str", name);
+			_zed_event_add_var(eid, zsp, prefix, alt, "%s",
+			    zpool_state_to_name(i64, VDEV_AUX_NONE));
+		} else
+		/*
+		 * shadow readable strings for pool state
+		 */
+		if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) {
+			char alt[32];
+
+			(void) snprintf(alt, sizeof (alt), "%s_str", name);
+			_zed_event_add_var(eid, zsp, prefix, alt, "%s",
+			    zpool_pool_state_to_name(i64));
+		}
+		break;
+	case DATA_TYPE_DOUBLE:
+		(void) nvpair_value_double(nvp, &d);
+		_zed_event_add_var(eid, zsp, prefix, name, "%g", d);
+		break;
+	case DATA_TYPE_HRTIME:
+		(void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64);
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%llu", (u_longlong_t)i64);
+		break;
+	case DATA_TYPE_NVLIST:
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
+		break;
+	case DATA_TYPE_STRING:
+		(void) nvpair_value_string(nvp, &str);
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%s", (str ? str : "<NULL>"));
+		break;
+	case DATA_TYPE_BOOLEAN_ARRAY:
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
+		break;
+	case DATA_TYPE_BYTE_ARRAY:
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
+		break;
+	case DATA_TYPE_INT8_ARRAY:
+		_zed_event_add_int8_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_UINT8_ARRAY:
+		_zed_event_add_uint8_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_INT16_ARRAY:
+		_zed_event_add_int16_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_UINT16_ARRAY:
+		_zed_event_add_uint16_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_INT32_ARRAY:
+		_zed_event_add_int32_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_UINT32_ARRAY:
+		_zed_event_add_uint32_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_INT64_ARRAY:
+		_zed_event_add_int64_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_UINT64_ARRAY:
+		_zed_event_add_uint64_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_STRING_ARRAY:
+		_zed_event_add_string_array(eid, zsp, prefix, nvp);
+		break;
+	case DATA_TYPE_NVLIST_ARRAY:
+		_zed_event_add_var(eid, zsp, prefix, name,
+		    "%s", "_NOT_IMPLEMENTED_");			/* FIXME */
+		break;
+	default:
+		errno = EINVAL;
+		zed_log_msg(LOG_WARNING,
+		    "Failed to convert nvpair \"%s\" for eid=%llu: "
+		    "Unrecognized type=%u", name, eid, (unsigned int) type);
+		break;
+	}
+}
+
+/*
+ * Restrict various environment variables to safe and sane values
+ * when constructing the environment for the child process.  If a custom
+ * $PATH is given in [path] (as under the ZFS test suite), it is exported
+ * as-is and the ZFS commands are left to be resolved through it.
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp,
+    const char *path)
+{
+	const char *env_restrict[][2] = {
+		{ "IFS",		" \t\n" },
+		{ "PATH",		_PATH_STDPATH },
+		{ "ZDB",		SBINDIR "/zdb" },
+		{ "ZED",		SBINDIR "/zed" },
+		{ "ZFS",		SBINDIR "/zfs" },
+		{ "ZINJECT",		SBINDIR "/zinject" },
+		{ "ZPOOL",		SBINDIR "/zpool" },
+		{ "ZFS_ALIAS",		ZFS_META_ALIAS },
+		{ "ZFS_VERSION",	ZFS_META_VERSION },
+		{ "ZFS_RELEASE",	ZFS_META_RELEASE },
+		{ NULL,			NULL }
+	};
+
+	/*
+	 * With a custom $PATH, export bare command names (resolved through
+	 * that $PATH) instead of the hard-coded SBINDIR locations.
+	 */
+	const char *env_path[][2] = {
+		{ "IFS",		" \t\n" },
+		{ "PATH",		NULL }, /* $PATH copied in later on */
+		{ "ZDB",		"zdb" },
+		{ "ZED",		"zed" },
+		{ "ZFS",		"zfs" },
+		{ "ZINJECT",		"zinject" },
+		{ "ZPOOL",		"zpool" },
+		{ "ZFS_ALIAS",		ZFS_META_ALIAS },
+		{ "ZFS_VERSION",	ZFS_META_VERSION },
+		{ "ZFS_RELEASE",	ZFS_META_RELEASE },
+		{ NULL,			NULL }
+	};
+	const char *(*row)[2] = (path != NULL) ? env_path : env_restrict;
+
+	assert(zsp != NULL);
+
+	while ((*row)[0] != NULL) {
+		const char *key = (*row)[0];
+
+		/* Substitute the caller's $PATH for the placeholder entry */
+		if (path != NULL && strcmp(key, "PATH") == 0)
+			(*row)[1] = path;
+		_zed_event_add_var(eid, zsp, NULL, key, "%s", (*row)[1]);
+		row++;
+	}
+}
+
+/*
+ * Copy the whitelisted variables that are set in the parent environment
+ * (currently only TZ) into the environment being built for the child.
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp)
+{
+	const char *env_preserve[] = {
+		"TZ",
+		NULL
+	};
+	const char *key, *val;
+	int i;
+
+	assert(zsp != NULL);
+
+	for (i = 0; (key = env_preserve[i]) != NULL; i++) {
+		val = getenv(key);
+		if (val != NULL)
+			_zed_event_add_var(eid, zsp, NULL, key, "%s", val);
+	}
+}
+
+/*
+ * Compute the "subclass" by skipping the first 3 dot-separated components
+ * of [class] (which will always be of the form "*.fs.zfs").
+ *
+ * For example, "ereport.fs.zfs.checksum" yields "checksum".
+ *
+ * Return a pointer inside the string [class], or NULL if [class] is NULL
+ * or contains fewer than 3 '.' separators.
+ */
+static const char *
+_zed_event_get_subclass(const char *class)
+{
+	const char *cursor = class;
+	int remaining = 3;
+
+	/* Step past one '.'-terminated component per iteration. */
+	while ((cursor != NULL) && (remaining-- > 0)) {
+		cursor = strchr(cursor, '.');
+		if (cursor != NULL)
+			cursor++;
+	}
+	return (cursor);
+}
+
+/*
+ * Convert the zevent time, a 2-element array of 64b integers, into:
+ * - TIME_SECS is the second component of the time.
+ * - TIME_NSECS is the nanosecond component of the time.
+ * - TIME_STRING is an almost-RFC3339-compliant string representation.
+ */
+static void
+_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[])
+{
+	struct tm *stp;
+	time_t secs;
+	char buf[32];
+
+	assert(zsp != NULL);
+	assert(etime != NULL);
+
+	_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS",
+	    "%lld", (long long int) etime[0]);
+	_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS",
+	    "%lld", (long long int) etime[1]);
+	secs = (time_t)etime[0];	/* time_t may be narrower than 64b */
+	if (!(stp = localtime(&secs))) {
+		zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+		    ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error");
+	} else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) {
+		zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+		    ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error");
+	} else {
+		_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING",
+		    "%s", buf);
+	}
+}
+
+/*
+ * Block for the next zevent and service it.  Return 0 or an errno value.
+ */
+int
+zed_event_service(struct zed_conf *zcp)
+{
+	nvlist_t *nvl;
+	nvpair_t *nvp;
+	int n_dropped;
+	zed_strings_t *zsp;
+	uint64_t eid;
+	int64_t *etime;
+	uint_t nelem;
+	char *class;
+	const char *subclass;
+	int rv;
+
+	if (!zcp) {
+		errno = EINVAL;
+		zed_log_msg(LOG_ERR, "Failed to service zevent: %s",
+		    strerror(errno));
+		return (EINVAL);
+	}
+	rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE,
+	    zcp->zevent_fd);
+
+	if ((rv != 0) || !nvl)
+		return (errno);
+
+	if (n_dropped > 0) {
+		zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
+		/*
+		 * FIXME: Increase max size of event nvlist in
+		 * /sys/module/zfs/parameters/zfs_zevent_len_max ?
+		 */
+	}
+	if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
+		zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
+	} else if (nvlist_lookup_int64_array(
+	    nvl, "time", &etime, &nelem) != 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to lookup zevent time (eid=%llu)", eid);
+	} else if (nelem != 2) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to lookup zevent time (eid=%llu, nelem=%u)",
+		    eid, nelem);
+	} else if (nvlist_lookup_string(nvl, "class", &class) != 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to lookup zevent class (eid=%llu)", eid);
+	} else {
+		/* post to internal modules first, before any zedlet is run */
+		zfs_agent_post_event(class, NULL, nvl);
+
+		zsp = zed_strings_create();
+
+		nvp = NULL;
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)))
+			_zed_event_add_nvpair(eid, zsp, nvp);
+
+		_zed_event_add_env_restrict(eid, zsp, zcp->path);
+		_zed_event_add_env_preserve(eid, zsp);
+
+		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID",
+		    "%d", (int)getpid());
+		_zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR",
+		    "%s", zcp->zedlet_dir);
+		subclass = _zed_event_get_subclass(class);
+		_zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
+		    "%s", (subclass ? subclass : class));
+
+		_zed_event_add_time_strings(eid, zsp, etime);
+
+		zed_exec_process(eid, class, subclass,
+		    zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);
+
+		zed_conf_write_state(zcp, eid, etime);
+
+		zed_strings_destroy(zsp);
+	}
+	nvlist_free(nvl);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.h b/sys/contrib/openzfs/cmd/zed/zed_event.h
new file mode 100644
index 000000000000..c1455c3a0629
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_event.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_EVENT_H
+#define	ZED_EVENT_H
+
+#include <stdint.h>
+
+int zed_event_init(struct zed_conf *zcp);
+
+void zed_event_fini(struct zed_conf *zcp);
+
+int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid,
+    int64_t saved_etime[]);
+
+int zed_event_service(struct zed_conf *zcp);
+
+#endif	/* !ZED_EVENT_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.c b/sys/contrib/openzfs/cmd/zed/zed_exec.c
new file mode 100644
index 000000000000..08b7b5568362
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_exec.c
@@ -0,0 +1,232 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#define	ZEVENT_FILENO	3
+
+/*
+ * Build an environment array suitable for execve() from the NAME=VALUE
+ * strings in container [zsp].  The pointer table and the string data live
+ * in one allocation, so a single free() releases the whole environment.
+ * Return the newly-allocated environment, or NULL on error.
+ */
+static char **
+_zed_exec_create_env(zed_strings_t *zsp)
+{
+	const char *str;
+	char **envp;
+	char *block;
+	char *dst;
+	int table_len;
+	int total_len;
+	int slen;
+	int idx = 0;
+
+	/* The table holds one pointer per string plus the NULL terminator. */
+	table_len = (zed_strings_count(zsp) + 1) * sizeof (char *);
+	total_len = table_len;
+	for (str = zed_strings_first(zsp); str; str = zed_strings_next(zsp))
+		total_len += strlen(str) + 1;
+
+	block = calloc(1, total_len);
+	if (block == NULL)
+		return (NULL);
+
+	envp = (char **)block;
+	dst = block + table_len;
+	for (str = zed_strings_first(zsp); str; str = zed_strings_next(zsp)) {
+		slen = strlen(str) + 1;
+		memcpy(dst, str, slen);
+		envp[idx++] = dst;
+		dst += slen;
+	}
+	envp[idx] = NULL;
+	assert(block + total_len == dst);
+	return (envp);
+}
+
+/*
+ * Fork a child process to handle event [eid].  The program [prog]
+ * in directory [dir] is executed with the environment [env].
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ */
+static void
+_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
+    char *env[], int zfd)
+{
+	char path[PATH_MAX];
+	int n;
+	pid_t pid;
+	int fd;
+	pid_t wpid;
+	int status;
+
+	assert(dir != NULL);
+	assert(prog != NULL);
+	assert(env != NULL);
+	assert(zfd >= 0);
+
+	n = snprintf(path, sizeof (path), "%s/%s", dir, prog);
+	if ((n < 0) || (n >= sizeof (path))) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to fork \"%s\" for eid=%llu: %s",
+		    prog, eid, strerror(ENAMETOOLONG));
+		return;
+	}
+	pid = fork();
+	if (pid < 0) {
+		zed_log_msg(LOG_WARNING,
+		    "Failed to fork \"%s\" for eid=%llu: %s",
+		    prog, eid, strerror(errno));
+		return;
+	} else if (pid == 0) {
+		(void) umask(022);
+		if ((fd = open("/dev/null", O_RDWR)) != -1) {
+			(void) dup2(fd, STDIN_FILENO);
+			(void) dup2(fd, STDOUT_FILENO);
+			(void) dup2(fd, STDERR_FILENO);
+		}
+		(void) dup2(zfd, ZEVENT_FILENO);
+		zed_file_close_from(ZEVENT_FILENO + 1);
+		execle(path, prog, (char *)NULL, env);
+		_exit(127);	/* reached only if execle() failed */
+	}
+
+	/* parent process */
+
+	zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d",
+	    prog, eid, pid);
+
+	/* FIXME: Timeout rogue child processes with sigalarm? */
+
+	/*
+	 * Wait for child process using WNOHANG to limit
+	 * the time spent waiting to 10 seconds (10,000ms).
+	 */
+	for (n = 0; n < 1000; n++) {
+		wpid = waitpid(pid, &status, WNOHANG);
+		if (wpid == (pid_t)-1) {
+			if (errno == EINTR)
+				continue;
+			zed_log_msg(LOG_WARNING,
+			    "Failed to wait for \"%s\" eid=%llu pid=%d",
+			    prog, eid, pid);
+			break;
+		} else if (wpid == 0) {
+			struct timespec t;
+
+			/* child still running */
+			t.tv_sec = 0;
+			t.tv_nsec = 10000000;	/* 10ms */
+			(void) nanosleep(&t, NULL);
+			continue;
+		}
+
+		if (WIFEXITED(status)) {
+			zed_log_msg(LOG_INFO,
+			    "Finished \"%s\" eid=%llu pid=%d exit=%d",
+			    prog, eid, pid, WEXITSTATUS(status));
+		} else if (WIFSIGNALED(status)) {
+			zed_log_msg(LOG_INFO,
+			    "Finished \"%s\" eid=%llu pid=%d sig=%d/%s",
+			    prog, eid, pid, WTERMSIG(status),
+			    strsignal(WTERMSIG(status)));
+		} else {
+			zed_log_msg(LOG_INFO,
+			    "Finished \"%s\" eid=%llu pid=%d status=0x%X",
+			    prog, eid, pid, (unsigned int) status);
+		}
+		break;
+	}
+
+	/*
+	 * kill child process after 10 seconds (it is not reaped here)
+	 */
+	if (wpid == 0) {
+		zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d",
+		    prog, pid);
+		(void) kill(pid, SIGKILL);
+	}
+}
+
+/*
+ * Process the event [eid] by synchronously invoking all zedlets with a
+ * matching class prefix.
+ *
+ * Each executable in [zedlets] from the directory [dir] is matched against
+ * the event's [class], [subclass], and the "all" class (which matches
+ * all events).  Every zedlet with a matching class prefix is invoked.
+ * The NAME=VALUE strings in [envs] will be passed to the zedlet as
+ * environment variables.
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ *
+ * Return 0 on success, -1 on bad arguments or allocation failure.
+ */
+int
+zed_exec_process(uint64_t eid, const char *class, const char *subclass,
+    const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd)
+{
+	const char *class_strings[4];
+	const char **csp;
+	const char *z;
+	char **e;
+	int n;
+
+	if (!dir || !zedlets || !envs || zfd < 0)
+		return (-1);
+
+	csp = class_strings;
+	if (class)
+		*csp++ = class;
+	if (subclass)
+		*csp++ = subclass;
+	*csp++ = "all";
+	*csp = NULL;
+
+	e = _zed_exec_create_env(envs);
+	if (!e) {
+		zed_log_msg(LOG_WARNING, "Failed to create environment "
+		    "for eid=%llu: %s", eid, strerror(errno));
+		return (-1);
+	}
+
+	for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) {
+		for (csp = class_strings; *csp; csp++) {
+			n = strlen(*csp);
+			if ((strncmp(z, *csp, n) == 0) &&
+			    !isalpha((unsigned char)z[n]))
+				_zed_exec_fork_child(eid, dir, z, e, zfd);
+		}
+	}
+	free(e);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.h b/sys/contrib/openzfs/cmd/zed/zed_exec.h
new file mode 100644
index 000000000000..4153e5519a46
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_exec.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_EXEC_H
+#define	ZED_EXEC_H
+
+#include <stdint.h>
+#include "zed_strings.h"
+
+int zed_exec_process(uint64_t eid, const char *class, const char *subclass,
+    const char *dir, zed_strings_t *zedlets, zed_strings_t *envs,
+    int zevent_fd);
+
+#endif	/* !ZED_EXEC_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.c b/sys/contrib/openzfs/cmd/zed/zed_file.c
new file mode 100644
index 000000000000..c3cf3d421c6f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_file.c
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "zed_file.h"
+#include "zed_log.h"
+
+/*
+ * Read up to [n] bytes from [fd] into [buf], retrying reads that are
+ * interrupted by a signal (EINTR) and continuing after short reads.
+ *
+ * Return the number of bytes read (less than [n] only if EOF was reached,
+ * 0 if at EOF immediately), or -1 on error.
+ */
+ssize_t
+zed_file_read_n(int fd, void *buf, size_t n)
+{
+	unsigned char *dst = buf;
+	size_t done = 0;
+	ssize_t rc;
+
+	while (done < n) {
+		rc = read(fd, dst + done, n - done);
+		if (rc == 0)
+			break;		/* end of file */
+		if (rc > 0) {
+			done += rc;
+			continue;
+		}
+		if (errno != EINTR)
+			return (-1);
+		/* interrupted by a signal: retry the read */
+	}
+	return (done);
+}
+
+/*
+ * Write [n] bytes from [buf] out to [fd], resuming after short writes and
+ * retrying writes that are interrupted by a signal (EINTR).
+ *
+ * Return the number of bytes written, which is always [n] on success,
+ * or -1 on error (in which case some of the data may have been written).
+ */
+ssize_t
+zed_file_write_n(int fd, void *buf, size_t n)
+{
+	const unsigned char *src = buf;
+	size_t remaining;
+	ssize_t rc;
+
+	remaining = n;
+	while (remaining > 0) {
+		rc = write(fd, src, remaining);
+		if (rc >= 0) {
+			src += rc;
+			remaining -= rc;
+		} else if (errno != EINTR) {
+			return (-1);
+		}
+	}
+	return (n);
+}
+
+/*
+ * Take an exclusive (write) advisory lock covering the whole file that is
+ * open on descriptor [fd], without blocking.
+ * Return 0 on success, 1 if a conflicting lock is held by another process,
+ * or -1 on error (with errno set).
+ */
+int
+zed_file_lock(int fd)
+{
+	struct flock whole_file;
+
+	if (fd < 0) {
+		errno = EBADF;
+		return (-1);
+	}
+	/* l_start = 0 with l_len = 0 covers the file from start to EOF. */
+	whole_file.l_type = F_WRLCK;
+	whole_file.l_whence = SEEK_SET;
+	whole_file.l_start = 0;
+	whole_file.l_len = 0;
+
+	if (fcntl(fd, F_SETLK, &whole_file) >= 0)
+		return (0);
+
+	/* EACCES/EAGAIN mean another process holds a conflicting lock. */
+	return (((errno == EACCES) || (errno == EAGAIN)) ? 1 : -1);
+}
+
+/*
+ * Release any advisory lock this process holds on the file that is open
+ * on descriptor [fd].
+ * Return 0 on success, or -1 on error (with errno set).
+ */
+int
+zed_file_unlock(int fd)
+{
+	struct flock whole_file;
+
+	if (fd < 0) {
+		errno = EBADF;
+		return (-1);
+	}
+
+	/* Unlock the same whole-file range that zed_file_lock() locks. */
+	whole_file.l_type = F_UNLCK;
+	whole_file.l_whence = SEEK_SET;
+	whole_file.l_start = 0;
+	whole_file.l_len = 0;
+
+	return ((fcntl(fd, F_SETLK, &whole_file) < 0) ? -1 : 0);
+}
+
+/*
+ * Test, without acquiring anything, whether an exclusive advisory lock
+ * could be obtained for the open file descriptor [fd].
+ *
+ * Return 0 if the file is not locked, >0 for the PID of another process
+ * holding a conflicting lock, or -1 on error (with errno set).
+ */
+pid_t
+zed_file_is_locked(int fd)
+{
+	struct flock probe;
+
+	if (fd < 0) {
+		errno = EBADF;
+		return (-1);
+	}
+	/* Describe the lock zed_file_lock() would request. */
+	probe.l_type = F_WRLCK;
+	probe.l_whence = SEEK_SET;
+	probe.l_start = 0;
+	probe.l_len = 0;
+
+	if (fcntl(fd, F_GETLK, &probe) < 0)
+		return (-1);
+
+	/* F_GETLK rewrites l_type to F_UNLCK when nothing conflicts. */
+	return ((probe.l_type == F_UNLCK) ? 0 : probe.l_pid);
+}
+
+/*
+ * Close all open file descriptors greater than or equal to [lowfd].
+ *
+ * The scan stops at the hard RLIMIT_NOFILE limit; if that limit cannot be
+ * read or is unlimited, descriptors below 256 are closed instead.
+ *
+ * Any errors encountered while closing file descriptors are ignored,
+ * and errno is left as it was on entry.
+ */
+void
+zed_file_close_from(int lowfd)
+{
+	const int saved_errno = errno;
+	struct rlimit limit;
+	int end = 256;		/* fallback when no finite limit is known */
+	int fd;
+
+	if ((getrlimit(RLIMIT_NOFILE, &limit) == 0) &&
+	    (limit.rlim_max != RLIM_INFINITY))
+		end = limit.rlim_max;
+
+	for (fd = lowfd; fd < end; fd++)
+		(void) close(fd);
+
+	/* close() on unused descriptors fails with EBADF; hide that. */
+	errno = saved_errno;
+}
+
+/*
+ * Set the FD_CLOEXEC flag on file descriptor [fd] so that it is closed
+ * automatically upon successful execution of one of the exec functions.
+ *
+ * Return 0 on success, or -1 on error (with errno set).
+ *
+ * FIXME: No longer needed?
+ */
+int
+zed_file_close_on_exec(int fd)
+{
+	int fd_flags;
+
+	if (fd < 0) {
+		errno = EBADF;
+		return (-1);
+	}
+
+	/* Read-modify-write so that other descriptor flags are preserved. */
+	fd_flags = fcntl(fd, F_GETFD);
+	if (fd_flags == -1)
+		return (-1);
+
+	if (fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC) == -1)
+		return (-1);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.h b/sys/contrib/openzfs/cmd/zed/zed_file.h
new file mode 100644
index 000000000000..05f360d20efd
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_file.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_FILE_H
+#define	ZED_FILE_H
+
+#include <sys/types.h>
+#include <unistd.h>
+
+ssize_t zed_file_read_n(int fd, void *buf, size_t n);
+
+ssize_t zed_file_write_n(int fd, void *buf, size_t n);
+
+int zed_file_lock(int fd);
+
+int zed_file_unlock(int fd);
+
+pid_t zed_file_is_locked(int fd);
+
+void zed_file_close_from(int fd);
+
+int zed_file_close_on_exec(int fd);
+
+#endif	/* !ZED_FILE_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.c b/sys/contrib/openzfs/cmd/zed/zed_log.c
new file mode 100644
index 000000000000..5a3f2dbdb832
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_log.c
@@ -0,0 +1,256 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <unistd.h>
+#include "zed_log.h"
+
+#define	ZED_LOG_MAX_LOG_LEN	1024
+
+static struct {
+	unsigned do_stderr:1;	/* set by zed_log_stderr_open() */
+	unsigned do_syslog:1;	/* set by zed_log_syslog_open() */
+	const char *identity;	/* name handed to openlog(); not a copy */
+	int priority;		/* stderr threshold; see _zed_log_aux() */
+	int pipe_fd[2];		/* daemonize pipe: [0] read, [1] write */
+} _ctx;
+
+/*
+ * Initialize the logging subsystem; [identity] is stored, not copied.
+ */
+void
+zed_log_init(const char *identity)
+{
+	const char *sep = (identity != NULL) ? strrchr(identity, '/') : NULL;
+
+	/* Keep only the component after the last '/', if there is one. */
+	_ctx.identity = (sep != NULL) ? sep + 1 : identity;
+
+	/* -1 marks both ends of the daemonize pipe as not yet opened. */
+	_ctx.pipe_fd[1] = -1;
+	_ctx.pipe_fd[0] = -1;
+}
+
+/*
+ * Shut down the logging subsystem: stop both syslog and stderr output.
+ */
+void
+zed_log_fini(void)
+{
+	zed_log_syslog_close();
+	zed_log_stderr_close();
+}
+
+/*
+ * Create the daemonize pipe used to pass startup status between the parent
+ * and child across the double-fork(); dies if the pipe is already open.
+ */
+void
+zed_log_pipe_open(void)
+{
+	if (!((_ctx.pipe_fd[0] == -1) && (_ctx.pipe_fd[1] == -1))) {
+		zed_log_die("Invalid use of zed_log_pipe_open in PID %d",
+		    (int)getpid());
+	} else if (pipe(_ctx.pipe_fd) < 0) {
+		zed_log_die("Failed to create daemonize pipe in PID %d: %s",
+		    (int)getpid(), strerror(errno));
+	}
+}
+
+/*
+ * Close the read end of the daemonize pipe and mark it closed (-1).
+ *
+ * Intended for the child right after fork()ing from the parent, since the
+ * child never reads from this pipe.  Dies if it is not open or close() fails.
+ */
+void
+zed_log_pipe_close_reads(void)
+{
+	if (_ctx.pipe_fd[0] < 0) {
+		zed_log_die(
+		    "Invalid use of zed_log_pipe_close_reads in PID %d",
+		    (int)getpid());
+	} else if (close(_ctx.pipe_fd[0]) < 0) {
+		zed_log_die(
+		    "Failed to close reads on daemonize pipe in PID %d: %s",
+		    (int)getpid(), strerror(errno));
+	}
+	/* Record the read end as closed. */
+	_ctx.pipe_fd[0] = -1;
+}
+
+/*
+ * Close the write end of the daemonize pipe and mark it closed (-1).
+ *
+ * The parent calls this right after fork()ing its child, as the parent
+ * never writes to this pipe.
+ *
+ * The child calls this once its initialization is complete; the parent's
+ * zed_log_pipe_wait() then sees end-of-file and knows it can safely exit.
+ */
+void
+zed_log_pipe_close_writes(void)
+{
+	if (_ctx.pipe_fd[1] < 0) {
+		zed_log_die(
+		    "Invalid use of zed_log_pipe_close_writes in PID %d",
+		    (int)getpid());
+	} else if (close(_ctx.pipe_fd[1]) < 0) {
+		zed_log_die(
+		    "Failed to close writes on daemonize pipe in PID %d: %s",
+		    (int)getpid(), strerror(errno));
+	}
+	/* Record the write end as closed. */
+	_ctx.pipe_fd[1] = -1;
+}
+
+/*
+ * Block reading the daemonize pipe until the child signals, by calling
+ * zed_log_pipe_close_writes(), that its initialization is complete.
+ *
+ * Only the parent should call this, while it waits to exit after
+ * fork()ing the child.  Dies if the read end is closed or read() fails.
+ */
+void
+zed_log_pipe_wait(void)
+{
+	ssize_t nread;
+	char byte;
+
+	if (_ctx.pipe_fd[0] < 0)
+		zed_log_die("Invalid use of zed_log_pipe_wait in PID %d",
+		    (int)getpid());
+
+	/*
+	 * Any byte that arrives is simply discarded; only end-of-file
+	 * (read() returning 0) ends the wait.  A read interrupted by a
+	 * signal (EINTR) is retried.
+	 */
+	do {
+		nread = read(_ctx.pipe_fd[0], &byte, sizeof (byte));
+		if ((nread < 0) && (errno != EINTR)) {
+			zed_log_die(
+			    "Failed to read from daemonize pipe in PID %d: %s",
+			    (int)getpid(), strerror(errno));
+		}
+	} while (nread != 0);
+}
+
+/*
+ * Enable logging to stderr for messages whose syslog(3) priority value is
+ * numerically <= [priority] (i.e. at least that severe).
+ */
+void
+zed_log_stderr_open(int priority)
+{
+	_ctx.priority = priority;
+	_ctx.do_stderr = 1;
+}
+
+/*
+ * Disable logging to stderr; harmless when it is not enabled.
+ */
+void
+zed_log_stderr_close(void)
+{
+	/* Clearing an already-clear flag changes nothing, so no test. */
+	_ctx.do_stderr = 0;
+}
+
+/*
+ * Enable logging to syslog under [facility] (see syslog(3)), using the
+ * identity recorded by zed_log_init() and the LOG_NDELAY | LOG_PID options.
+ */
+void
+zed_log_syslog_open(int facility)
+{
+	openlog(_ctx.identity, LOG_PID | LOG_NDELAY, facility);
+	_ctx.do_syslog = 1;
+}
+
+/*
+ * Stop logging to syslog; does nothing unless syslog logging is enabled.
+ */
+void
+zed_log_syslog_close(void)
+{
+	if (!_ctx.do_syslog)
+		return;
+	closelog();
+	_ctx.do_syslog = 0;
+}
+
+/*
+ * Log a formatted message to syslog and/or stderr ('+' marks truncation).
+ */
+static void
+_zed_log_aux(int priority, const char *fmt, va_list vargs)
+{
+	char buf[ZED_LOG_MAX_LOG_LEN];
+	int n;
+
+	if (!fmt)
+		return;
+
+	n = vsnprintf(buf, sizeof (buf), fmt, vargs);
+	if (n < 0)	/* buf is indeterminate; fall back to raw [fmt] */
+		(void) snprintf(buf, sizeof (buf), "%s", fmt);
+	else if ((size_t)n >= sizeof (buf))
+		buf[sizeof (buf) - 2] = '+';	/* still NUL-terminated */
+
+	if (_ctx.do_syslog)
+		syslog(priority, "%s", buf);
+
+	if (_ctx.do_stderr && (priority <= _ctx.priority))
+		fprintf(stderr, "%s\n", buf);
+}
+
+/*
+ * Format the printf-style string [fmt] with the trailing arguments and log
+ * it at syslog level [priority].  A NULL [fmt] is silently ignored.
+ */
+void
+zed_log_msg(int priority, const char *fmt, ...)
+{
+	va_list ap;
+
+	if (fmt == NULL)
+		return;
+	va_start(ap, fmt);
+	_zed_log_aux(priority, fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Log the printf-style message [fmt] at LOG_ERR, then exit with failure.
+ */
+void
+zed_log_die(const char *fmt, ...)
+{
+	va_list ap;
+
+	/* _zed_log_aux() ignores a NULL [fmt], so no check is needed here. */
+	va_start(ap, fmt);
+	_zed_log_aux(LOG_ERR, fmt, ap);
+	va_end(ap);
+
+	exit(EXIT_FAILURE);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.h b/sys/contrib/openzfs/cmd/zed/zed_log.h
new file mode 100644
index 000000000000..a03a4f53967c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_log.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_LOG_H
+#define	ZED_LOG_H
+
+#include <syslog.h>
+
+void zed_log_init(const char *identity);
+
+void zed_log_fini(void);
+
+void zed_log_pipe_open(void);
+
+void zed_log_pipe_close_reads(void);
+
+void zed_log_pipe_close_writes(void);
+
+void zed_log_pipe_wait(void);
+
+void zed_log_stderr_open(int priority);
+
+void zed_log_stderr_close(void);
+
+void zed_log_syslog_open(int facility);
+
+void zed_log_syslog_close(void);
+
+void zed_log_msg(int priority, const char *fmt, ...);
+
+void zed_log_die(const char *fmt, ...);
+
+#endif	/* !ZED_LOG_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.c b/sys/contrib/openzfs/cmd/zed/zed_strings.c
new file mode 100644
index 000000000000..6b1c669d71f4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_strings.c
@@ -0,0 +1,247 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include "zed_strings.h"
+
+struct zed_strings {
+	avl_tree_t tree;		/* nodes ordered by key (strcmp) */
+	avl_node_t *iteratorp;		/* cursor for _first()/_next() */
+};
+
+struct zed_strings_node {
+	avl_node_t node;		/* AVL linkage */
+	char *key;			/* index string; may alias val */
+	char *val;			/* strdup()ed copy of the string */
+};
+
+typedef struct zed_strings_node zed_strings_node_t;
+
+/*
+ * AVL comparator: order zed_strings_node_t nodes [x1] and [x2] by key.
+ * Returns exactly -1, 0, or +1 for <, ==, or >, as the AVL tree requires.
+ */
+static int
+_zed_strings_node_compare(const void *x1, const void *x2)
+{
+	const zed_strings_node_t *lhs = x1;
+	const zed_strings_node_t *rhs = x2;
+	int order;
+
+	assert(x1 != NULL);
+	assert(x2 != NULL);
+
+	assert(lhs->key != NULL);
+	assert(rhs->key != NULL);
+
+	/*
+	 * strcmp() only promises the sign of its result, while the AVL
+	 * code wants exactly -1, 0, or +1, so normalize it here.
+	 */
+	order = strcmp(lhs->key, rhs->key);
+	if (order == 0)
+		return (0);
+
+	return ((order < 0) ? -1 : 1);
+}
+
+/*
+ * Allocate an empty string container; return NULL if allocation fails.
+ */
+zed_strings_t *
+zed_strings_create(void)
+{
+	zed_strings_t *container = calloc(1, sizeof (zed_strings_t));
+
+	if (container != NULL) {
+		/* Nodes embed their AVL linkage in the [node] member. */
+		avl_create(&container->tree, _zed_strings_node_compare,
+		    sizeof (zed_strings_node_t),
+		    offsetof(zed_strings_node_t, node));
+		container->iteratorp = NULL;
+	}
+
+	return (container);
+}
+
+/*
+ * Free node [np] and the strings it owns; a NULL [np] is a no-op.
+ */
+static void
+_zed_strings_node_destroy(zed_strings_node_t *np)
+{
+	if (np == NULL)
+		return;
+
+	/*
+	 * When no separate key was given, [key] merely aliases [val];
+	 * release the shared string only once.  free(NULL) is a no-op,
+	 * so partially constructed nodes are handled too.
+	 */
+	if (np->key != np->val)
+		free(np->key);
+	free(np->val);
+	np->key = np->val = NULL;
+	free(np);
+}
+
+/*
+ * Allocate a node holding a private copy of the string [val]; return NULL
+ * if memory runs out.  The node is indexed by a copy of [key] when one is
+ * given, and by the value string itself otherwise.
+ */
+static zed_strings_node_t *
+_zed_strings_node_create(const char *key, const char *val)
+{
+	zed_strings_node_t *node;
+
+	assert(val != NULL);
+
+	node = calloc(1, sizeof (zed_strings_node_t));
+	if (node == NULL)
+		return (NULL);
+
+	/*
+	 * Duplicate the value first.  Without an explicit key the node
+	 * is indexed by that same copy, so [key] merely aliases [val]
+	 * and must not be freed separately.
+	 */
+	node->val = strdup(val);
+	if (node->val != NULL)
+		node->key = (key != NULL) ? strdup(key) : node->val;
+
+	/* [key] is still NULL here iff one of the strdup() calls failed. */
+	if (node->key == NULL) {
+		_zed_strings_node_destroy(node);
+		return (NULL);
+	}
+
+	return (node);
+}
+
+/*
+ * Free container [zsp] and every node it holds; a NULL [zsp] is a no-op.
+ */
+void
+zed_strings_destroy(zed_strings_t *zsp)
+{
+	zed_strings_node_t *victim;
+	void *cookie = NULL;
+
+	if (zsp == NULL)
+		return;
+
+	/* Drain the tree node by node before tearing the tree down. */
+	for (victim = avl_destroy_nodes(&zsp->tree, &cookie); victim != NULL;
+	    victim = avl_destroy_nodes(&zsp->tree, &cookie))
+		_zed_strings_node_destroy(victim);
+	avl_destroy(&zsp->tree);
+	free(zsp);
+}
+
+/*
+ * Store a private copy of the string [s] in container [zsp] under [key].
+ * An entry already stored under the same key is removed and freed first,
+ * so the new string replaces it.
+ * A NULL [key] (or one identical to [s]) makes [s] serve as its own key.
+ * Return 0 on success, or -1 on error (EINVAL if [zsp] or [s] is NULL).
+ */
+int
+zed_strings_add(zed_strings_t *zsp, const char *key, const char *s)
+{
+	zed_strings_node_t *fresh, *stale;
+
+	if ((zsp == NULL) || (s == NULL)) {
+		errno = EINVAL;
+		return (-1);
+	}
+
+	/* A key that is the very same pointer as [s] means "no key". */
+	fresh = _zed_strings_node_create((key != s) ? key : NULL, s);
+	if (fresh == NULL)
+		return (-1);
+
+	/* Evict any node already stored under this key, then insert. */
+	stale = avl_find(&zsp->tree, fresh, NULL);
+	if (stale != NULL) {
+		avl_remove(&zsp->tree, stale);
+		_zed_strings_node_destroy(stale);
+	}
+	avl_add(&zsp->tree, fresh);
+	return (0);
+}
+
+/*
+ * Start (or restart) a traversal of container [zsp] and return its first
+ * string.
+ * Return NULL if the container is empty, or on error (errno set to EINVAL).
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_first(zed_strings_t *zsp)
+{
+	zed_strings_node_t *head;
+
+	if (zsp == NULL) {
+		errno = EINVAL;
+		return (NULL);
+	}
+	/* Park the cursor on the first node (NULL for an empty tree). */
+	head = avl_first(&zsp->tree);
+	zsp->iteratorp = (avl_node_t *)head;
+	return ((head != NULL) ? head->val : NULL);
+}
+
+/*
+ * Advance the traversal begun by zed_strings_first() and return the next
+ * string in container [zsp].
+ * Return NULL once the last string has been passed, or on error.
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_next(zed_strings_t *zsp)
+{
+	zed_strings_node_t *succ = NULL;
+
+	if (zsp == NULL) {
+		errno = EINVAL;
+		return (NULL);
+	}
+	/* A NULL cursor means no traversal is active or it already ended. */
+	if (zsp->iteratorp != NULL) {
+		succ = AVL_NEXT(&zsp->tree, zsp->iteratorp);
+		zsp->iteratorp = (avl_node_t *)succ;
+	}
+	return ((succ != NULL) ? succ->val : NULL);
+}
+
+/*
+ * Return how many strings container [zsp] holds, or -1 on error.
+ */
+int
+zed_strings_count(zed_strings_t *zsp)
+{
+	if (zsp != NULL)
+		return (avl_numnodes(&zsp->tree));
+	/* No container to count: report EINVAL. */
+	errno = EINVAL;
+	return (-1);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.h b/sys/contrib/openzfs/cmd/zed/zed_strings.h
new file mode 100644
index 000000000000..37a84cad7ffc
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_strings.h
@@ -0,0 +1,27 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef	ZED_STRINGS_H
+#define	ZED_STRINGS_H
+
+typedef struct zed_strings zed_strings_t;
+
+zed_strings_t *zed_strings_create(void);
+void zed_strings_destroy(zed_strings_t *zsp);
+int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s);
+const char *zed_strings_first(zed_strings_t *zsp);
+const char *zed_strings_next(zed_strings_t *zsp);
+int zed_strings_count(zed_strings_t *zsp);
+
+#endif	/* !ZED_STRINGS_H */
diff --git a/sys/contrib/openzfs/cmd/zfs/.gitignore b/sys/contrib/openzfs/cmd/zfs/.gitignore
new file mode 100644
index 000000000000..0fd9cc63af2a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/.gitignore
@@ -0,0 +1 @@
+/zfs
diff --git a/sys/contrib/openzfs/cmd/zfs/Makefile.am b/sys/contrib/openzfs/cmd/zfs/Makefile.am
new file mode 100644
index 000000000000..dec5920381d5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/Makefile.am
@@ -0,0 +1,23 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zfs
+
+zfs_SOURCES = \
+	zfs_iter.c \
+	zfs_iter.h \
+	zfs_main.c \
+	zfs_util.h \
+	zfs_project.c \
+	zfs_projectutil.h
+
+zfs_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+	$(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zfs_LDADD += $(LTLIBINTL)
+
+if BUILD_FREEBSD
+zfs_LDADD += -lgeom -ljail
+endif
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.c b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c
new file mode 100644
index 000000000000..f2359508c16d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c
@@ -0,0 +1,512 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zfs_util.h"
+#include "zfs_iter.h"
+
+/*
+ * This is a private interface used to gather up all the datasets specified on
+ * the command line so that we can iterate over them in order.
+ *
+ * First, we iterate over all filesystems, gathering them together into an
+ * AVL tree.  We report errors for any explicitly specified datasets
+ * that we couldn't open.
+ *
+ * When finished, we have an AVL tree of ZFS handles.  We go through and execute
+ * the provided callback for each one, passing whatever data the user supplied.
+ */
+
+typedef struct zfs_node {
+	zfs_handle_t	*zn_handle;	/* closed when the tree is torn down */
+	uu_avl_node_t	zn_avlnode;	/* linkage into the cb_avl tree */
+} zfs_node_t;
+
+typedef struct callback_data {
+	uu_avl_t		*cb_avl;	/* sorted zfs_node_t tree */
+	int			cb_flags;	/* ZFS_ITER_* flags */
+	zfs_type_t		cb_types;	/* dataset types to collect */
+	zfs_sort_column_t	*cb_sortcol;	/* sort columns (or NULL) */
+	zprop_list_t		**cb_proplist;	/* caller's property list */
+	int			cb_depth_limit;	/* ZFS_ITER_DEPTH_LIMIT cap */
+	int			cb_depth;	/* current recursion depth */
+	uint8_t			cb_props_table[ZFS_NUM_PROPS];
+} callback_data_t;
+
+uu_avl_pool_t *avl_pool;
+
+/*
+ * Report whether snapshots should be included: by the requested cb_types,
+ * or, under ZFS_ITER_PROP_LISTSNAPS, by the pool's "listsnapshots" property.
+ */
+static boolean_t
+zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb)
+{
+	/* With the flag set, the pool's "listsnapshots" property decides. */
+	if (cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) {
+		return (zpool_get_prop_int(zfs_get_pool_handle(zhp),
+		    ZPOOL_PROP_LISTSNAPS, NULL));
+	}
+
+	return (cb->cb_types & ZFS_TYPE_SNAPSHOT);
+}
+
+/*
+ * Called for each dataset; takes ownership of [zhp].  If the object is of an
+ * appropriate type, add it to the avl tree and recurse over any children.
+ */
+static int
+zfs_callback(zfs_handle_t *zhp, void *data)
+{
+	callback_data_t *cb = data;
+	boolean_t should_close = B_TRUE;
+	boolean_t include_snaps = zfs_include_snapshots(zhp, cb);
+	boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK);
+
+	if ((zfs_get_type(zhp) & cb->cb_types) ||
+	    ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) {
+		uu_avl_index_t idx;
+		zfs_node_t *node = safe_malloc(sizeof (zfs_node_t));
+
+		node->zn_handle = zhp;
+		uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
+		if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
+		    &idx) == NULL) {
+			if (cb->cb_proplist) {
+				if ((*cb->cb_proplist) &&
+				    !(*cb->cb_proplist)->pl_all)
+					zfs_prune_proplist(zhp,
+					    cb->cb_props_table);
+
+				if (zfs_expand_proplist(zhp, cb->cb_proplist,
+				    (cb->cb_flags & ZFS_ITER_RECVD_PROPS),
+				    (cb->cb_flags & ZFS_ITER_LITERAL_PROPS))
+				    != 0) {
+					/* Not in the tree: release both. */
+					free(node);
+					zfs_close(zhp);
+					return (-1);
+				}
+			}
+			uu_avl_insert(cb->cb_avl, node, idx);
+			should_close = B_FALSE;
+		} else {
+			free(node);
+		}
+	}
+
+	/* Recurse if necessary. */
+	if (cb->cb_flags & ZFS_ITER_RECURSE &&
+	    ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+	    cb->cb_depth < cb->cb_depth_limit)) {
+		cb->cb_depth++;
+
+		/*
+		 * If we are not looking for filesystems, we don't need to
+		 * recurse into filesystems when we are at our depth limit.
+		 */
+		if ((cb->cb_depth < cb->cb_depth_limit ||
+		    (cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+		    (cb->cb_types &
+		    (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) &&
+		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+			(void) zfs_iter_filesystems(zhp, zfs_callback, data);
+		}
+
+		if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+		    ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) {
+			(void) zfs_iter_snapshots(zhp,
+			    (cb->cb_flags & ZFS_ITER_SIMPLE) != 0,
+			    zfs_callback, data, 0, 0);
+		}
+
+		if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+		    ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) {
+			(void) zfs_iter_bookmarks(zhp, zfs_callback, data);
+		}
+
+		cb->cb_depth--;
+	}
+
+	if (should_close)	/* zhp was not kept in the tree */
+		zfs_close(zhp);
+
+	return (0);
+}
+
+/*
+ * Append a sort column for the property [name] to the list at [*sc].
+ * Return 0, or -1 if [name] is not a native or user property name.
+ */
+int
+zfs_add_sort_column(zfs_sort_column_t **sc, const char *name,
+    boolean_t reverse)
+{
+	zfs_prop_t prop = zfs_name_to_prop(name);
+	zfs_sort_column_t *head = *sc;
+	zfs_sort_column_t *entry;
+
+	if (prop == ZPROP_INVAL && !zfs_prop_user(name))
+		return (-1);
+
+	entry = safe_malloc(sizeof (zfs_sort_column_t));
+	entry->sc_reverse = reverse;
+	entry->sc_prop = prop;
+	if (prop == ZPROP_INVAL) {	/* user property: keep its name */
+		entry->sc_user_prop = safe_malloc(strlen(name) + 1);
+		(void) strcpy(entry->sc_user_prop, name);
+	}
+	/* Append; the head's sc_last caches the current tail. */
+	if (head != NULL)
+		head->sc_last->sc_next = entry;
+	else
+		*sc = head = entry;
+	head->sc_last = entry;
+	return (0);
+}
+
+/* Free every column in the list [sc], along with any user property name. */
+void
+zfs_free_sort_columns(zfs_sort_column_t *sc)
+{
+	zfs_sort_column_t *next;
+
+	for (; sc != NULL; sc = next) {
+		next = sc->sc_next;	/* fetch the link before freeing sc */
+		free(sc->sc_user_prop);
+		free(sc);
+	}
+}
+
+/* Return nonzero iff [sc] is a one-column list that sorts by name only. */
+int
+zfs_sort_only_by_name(const zfs_sort_column_t *sc)
+{
+	return (sc != NULL && !sc->sc_next && sc->sc_prop == ZFS_PROP_NAME);
+}
+
+/* ARGSUSED */
+static int
+zfs_compare(const void *larg, const void *rarg, void *unused)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	const char *lname = zfs_get_name(l);
+	const char *rname = zfs_get_name(r);
+	char *lat, *rat;		/* position of '@' in each name */
+	uint64_t lcreate, rcreate;
+	int ret;			/* <0, 0 or >0, like strcmp() */
+
+	lat = (char *)strchr(lname, '@');
+	rat = (char *)strchr(rname, '@');
+
+	if (lat != NULL)
+		*lat = '\0';	/* temporarily cut the name at '@' */
+	if (rat != NULL)
+		*rat = '\0';
+
+	ret = strcmp(lname, rname);	/* compares the part before '@' */
+	if (ret == 0 && (lat != NULL || rat != NULL)) {
+		/*
+		 * If we're comparing a dataset to one of its snapshots, we
+		 * always make the full dataset first.
+		 */
+		if (lat == NULL) {
+			ret = -1;
+		} else if (rat == NULL) {
+			ret = 1;
+		} else {
+			/*
+			 * If we have two snapshots from the same dataset, then
+			 * we want to sort them according to creation time.  We
+			 * use the hidden CREATETXG property to get an absolute
+			 * ordering of snapshots.
+			 */
+			lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+			rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+			/*
+			 * Both lcreate and rcreate being 0 means we don't have
+			 * properties and we should compare the snapshot names.
+			 */
+			if (lcreate == 0 && rcreate == 0)
+				ret = strcmp(lat + 1, rat + 1);
+			else if (lcreate < rcreate)
+				ret = -1;
+			else if (lcreate > rcreate)
+				ret = 1;
+		}
+	}
+
+	if (lat != NULL)
+		*lat = '@';	/* restore the names modified above */
+	if (rat != NULL)
+		*rat = '@';
+
+	return (ret);
+}
+
+/*
+ * Sort datasets by specified columns.
+ *
+ * o  Numeric types sort in ascending order.
+ * o  String types sort in alphabetical order.
+ * o  Types inappropriate for a row sort that row to the literal
+ *    bottom, regardless of the specified ordering.
+ *
+ * If no sort columns are specified, or two datasets compare equally
+ * across all specified columns, they are sorted alphabetically by name
+ * with snapshots grouped under their parents.
+ */
+static int
+zfs_sort(const void *larg, const void *rarg, void *data)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	zfs_sort_column_t *sc = (zfs_sort_column_t *)data;
+	zfs_sort_column_t *psc;
+
+	for (psc = sc; psc != NULL; psc = psc->sc_next) {
+		char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN];
+		char *lstr, *rstr;
+		uint64_t lnum, rnum;
+		boolean_t lvalid, rvalid;
+		int ret = 0;
+
+		/*
+		 * We group the checks below the generic code.  If 'lstr' and
+		 * 'rstr' are non-NULL, then we do a string based comparison.
+		 * Otherwise, we compare 'lnum' and 'rnum'.
+		 */
+		lstr = rstr = NULL;
+		if (psc->sc_prop == ZPROP_INVAL) {	/* user property */
+			nvlist_t *luser, *ruser;
+			nvlist_t *lval, *rval;
+
+			luser = zfs_get_user_props(l);
+			ruser = zfs_get_user_props(r);
+
+			lvalid = (nvlist_lookup_nvlist(luser,
+			    psc->sc_user_prop, &lval) == 0);
+			rvalid = (nvlist_lookup_nvlist(ruser,
+			    psc->sc_user_prop, &rval) == 0);
+
+			if (lvalid)
+				verify(nvlist_lookup_string(lval,
+				    ZPROP_VALUE, &lstr) == 0);
+			if (rvalid)
+				verify(nvlist_lookup_string(rval,
+				    ZPROP_VALUE, &rstr) == 0);
+		} else if (psc->sc_prop == ZFS_PROP_NAME) {
+			lvalid = rvalid = B_TRUE;
+
+			(void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf));
+			(void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf));
+
+			lstr = lbuf;
+			rstr = rbuf;
+		} else if (zfs_prop_is_string(psc->sc_prop)) {
+			lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf,
+			    sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0);
+			rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf,
+			    sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0);
+
+			lstr = lbuf;
+			rstr = rbuf;
+		} else {	/* remaining properties compare numerically */
+			lvalid = zfs_prop_valid_for_type(psc->sc_prop,
+			    zfs_get_type(l), B_FALSE);
+			rvalid = zfs_prop_valid_for_type(psc->sc_prop,
+			    zfs_get_type(r), B_FALSE);
+
+			if (lvalid)
+				(void) zfs_prop_get_numeric(l, psc->sc_prop,
+				    &lnum, NULL, NULL, 0);
+			if (rvalid)
+				(void) zfs_prop_get_numeric(r, psc->sc_prop,
+				    &rnum, NULL, NULL, 0);
+		}
+
+		if (!lvalid && !rvalid)
+			continue;	/* column applies to neither row */
+		else if (!lvalid)
+			return (1);	/* l lacks the value: l sorts last */
+		else if (!rvalid)
+			return (-1);	/* r lacks the value: r sorts last */
+
+		if (lstr)
+			ret = strcmp(lstr, rstr);
+		else if (lnum < rnum)
+			ret = -1;
+		else if (lnum > rnum)
+			ret = 1;
+
+		if (ret != 0) {
+			if (psc->sc_reverse == B_TRUE)
+				ret = (ret < 0) ? 1 : -1;
+			return (ret);
+		}
+	}
+
+	return (zfs_compare(larg, rarg, NULL));	/* columns all tie */
+}
+
+int
+zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
+    zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit,
+    zfs_iter_f callback, void *data)
+{
+	callback_data_t cb = {0};
+	int ret = 0;
+	zfs_node_t *node;
+	uu_avl_walk_t *walk;
+
+	avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
+	    offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
+
+	if (avl_pool == NULL)
+		nomem();
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_flags = flags;
+	cb.cb_proplist = proplist;
+	cb.cb_types = types;
+	cb.cb_depth_limit = limit;
+	/*
+	 * If cb_proplist is provided then in the zfs_handles created we
+	 * retain only those properties listed in cb_proplist and sortcol.
+	 * The rest are pruned, so the caller must make sure that no
+	 * properties other than those listed in cb_proplist/sortcol are
+	 * accessed.
+	 *
+	 * If cb_proplist is NULL then we retain all the properties.  We
+	 * always retain the zoned property, which some other properties
+	 * need (userquota & friends), and the createtxg property, which
+	 * we need to sort snapshots.
+	 */
+	if (cb.cb_proplist && *cb.cb_proplist) {
+		zprop_list_t *p = *cb.cb_proplist;
+
+		while (p) {
+			if (p->pl_prop >= ZFS_PROP_TYPE &&
+			    p->pl_prop < ZFS_NUM_PROPS) {
+				cb.cb_props_table[p->pl_prop] = B_TRUE;
+			}
+			p = p->pl_next;
+		}
+
+		while (sortcol) {	/* cb.cb_sortcol keeps the list head */
+			if (sortcol->sc_prop >= ZFS_PROP_TYPE &&
+			    sortcol->sc_prop < ZFS_NUM_PROPS) {
+				cb.cb_props_table[sortcol->sc_prop] = B_TRUE;
+			}
+			sortcol = sortcol->sc_next;
+		}
+
+		cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE;
+		cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE;
+	} else {
+		(void) memset(cb.cb_props_table, B_TRUE,
+		    sizeof (cb.cb_props_table));
+	}
+
+	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
+
+	if (argc == 0) {
+		/*
+		 * If given no arguments, iterate over all datasets.
+		 */
+		cb.cb_flags |= ZFS_ITER_RECURSE;
+		ret = zfs_iter_root(g_zfs, zfs_callback, &cb);
+	} else {
+		int i;
+		zfs_handle_t *zhp;
+		zfs_type_t argtype;
+
+		/*
+		 * If we're recursive, then we always allow filesystems as
+		 * arguments.  If we also are interested in snapshots or
+		 * bookmarks, then we can take volumes as well.
+		 */
+		argtype = types;
+		if (flags & ZFS_ITER_RECURSE) {
+			argtype |= ZFS_TYPE_FILESYSTEM;
+			if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK))
+				argtype |= ZFS_TYPE_VOLUME;
+		}
+
+		for (i = 0; i < argc; i++) {
+			if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) {
+				zhp = zfs_path_to_zhandle(g_zfs, argv[i],
+				    argtype);
+			} else {
+				zhp = zfs_open(g_zfs, argv[i], argtype);
+			}
+			if (zhp != NULL)
+				ret |= zfs_callback(zhp, &cb);
+			else
+				ret = 1;	/* argument could not be opened */
+		}
+	}
+
+	/*
+	 * At this point we've got our AVL tree full of zfs handles, so iterate
+	 * over each one and execute the real user callback.
+	 */
+	for (node = uu_avl_first(cb.cb_avl); node != NULL;
+	    node = uu_avl_next(cb.cb_avl, node))
+		ret |= callback(node->zn_handle, data);
+
+	/*
+	 * Finally, clean up the AVL tree.
+	 */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		zfs_close(node->zn_handle);	/* tree owned the handle */
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(cb.cb_avl);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.h b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h
new file mode 100644
index 000000000000..2697fbdca1df
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef	ZFS_ITER_H
+#define	ZFS_ITER_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_sort_column {
+	struct zfs_sort_column	*sc_next;	/* next column, or NULL */
+	struct zfs_sort_column	*sc_last;	/* tail; kept on the head */
+	zfs_prop_t		sc_prop;	/* ZPROP_INVAL: user prop */
+	char			*sc_user_prop;	/* user property name */
+	boolean_t		sc_reverse;	/* invert this column */
+} zfs_sort_column_t;
+
+#define	ZFS_ITER_RECURSE	   (1 << 0)	/* descend into children */
+#define	ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)	/* use zfs_path_to_zhandle() */
+#define	ZFS_ITER_PROP_LISTSNAPS    (1 << 2)	/* honor pool listsnapshots */
+#define	ZFS_ITER_DEPTH_LIMIT	   (1 << 3)	/* enforce the depth limit */
+#define	ZFS_ITER_RECVD_PROPS	   (1 << 4)	/* to zfs_expand_proplist() */
+#define	ZFS_ITER_LITERAL_PROPS	   (1 << 5)	/* to zfs_expand_proplist() */
+#define	ZFS_ITER_SIMPLE		   (1 << 6)	/* to zfs_iter_snapshots() */
+
+int zfs_for_each(int, char **, int options, zfs_type_t,
+    zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
+int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
+void zfs_free_sort_columns(zfs_sort_column_t *);
+int zfs_sort_only_by_name(const zfs_sort_column_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* ZFS_ITER_H */
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
new file mode 100644
index 000000000000..650b4fc9b74f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -0,0 +1,8620 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2012 Milan Jurik. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland.  All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <sys/debug.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <libnvpair.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <zone.h>
+#include <grp.h>
+#include <pwd.h>
+#include <signal.h>
+#include <sys/debug.h>
+#include <sys/list.h>
+#include <sys/mkdev.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/fs/zfs.h>
+#include <sys/systeminfo.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/zfs_project.h>
+
+#include <libzfs.h>
+#include <libzfs_core.h>
+#include <zfs_prop.h>
+#include <zfs_deleg.h>
+#include <libzutil.h>
+#include <libuutil.h>
+#ifdef HAVE_IDMAP
+#include <aclutils.h>
+#include <directory.h>
+#endif /* HAVE_IDMAP */
+
+#include "zfs_iter.h"
+#include "zfs_util.h"
+#include "zfs_comutil.h"
+#include "libzfs_impl.h"
+#include "zfs_projectutil.h"
+
+libzfs_handle_t *g_zfs;
+
+static FILE *mnttab_file;
+static char history_str[HIS_MAX_RECORD_LEN];
+static boolean_t log_history = B_TRUE;
+
+static int zfs_do_clone(int argc, char **argv);
+static int zfs_do_create(int argc, char **argv);
+static int zfs_do_destroy(int argc, char **argv);
+static int zfs_do_get(int argc, char **argv);
+static int zfs_do_inherit(int argc, char **argv);
+static int zfs_do_list(int argc, char **argv);
+static int zfs_do_mount(int argc, char **argv);
+static int zfs_do_rename(int argc, char **argv);
+static int zfs_do_rollback(int argc, char **argv);
+static int zfs_do_set(int argc, char **argv);
+static int zfs_do_upgrade(int argc, char **argv);
+static int zfs_do_snapshot(int argc, char **argv);
+static int zfs_do_unmount(int argc, char **argv);
+static int zfs_do_share(int argc, char **argv);
+static int zfs_do_unshare(int argc, char **argv);
+static int zfs_do_send(int argc, char **argv);
+static int zfs_do_receive(int argc, char **argv);
+static int zfs_do_promote(int argc, char **argv);
+static int zfs_do_userspace(int argc, char **argv);
+static int zfs_do_allow(int argc, char **argv);
+static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_holds(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
+static int zfs_do_bookmark(int argc, char **argv);
+static int zfs_do_channel_program(int argc, char **argv);
+static int zfs_do_load_key(int argc, char **argv);
+static int zfs_do_unload_key(int argc, char **argv);
+static int zfs_do_change_key(int argc, char **argv);
+static int zfs_do_project(int argc, char **argv);
+static int zfs_do_version(int argc, char **argv);
+static int zfs_do_redact(int argc, char **argv);
+static int zfs_do_wait(int argc, char **argv);
+
+#ifdef __FreeBSD__
+static int zfs_do_jail(int argc, char **argv);
+static int zfs_do_unjail(int argc, char **argv);
+#endif
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+typedef enum {
+	HELP_CLONE,
+	HELP_CREATE,
+	HELP_DESTROY,
+	HELP_GET,
+	HELP_INHERIT,
+	HELP_UPGRADE,
+	HELP_LIST,
+	HELP_MOUNT,
+	HELP_PROMOTE,
+	HELP_RECEIVE,
+	HELP_RENAME,
+	HELP_ROLLBACK,
+	HELP_SEND,
+	HELP_SET,
+	HELP_SHARE,
+	HELP_SNAPSHOT,
+	HELP_UNMOUNT,
+	HELP_UNSHARE,
+	HELP_ALLOW,
+	HELP_UNALLOW,
+	HELP_USERSPACE,
+	HELP_GROUPSPACE,
+	HELP_PROJECTSPACE,
+	HELP_PROJECT,
+	HELP_HOLD,
+	HELP_HOLDS,
+	HELP_RELEASE,
+	HELP_DIFF,
+	HELP_BOOKMARK,
+	HELP_CHANNEL_PROGRAM,
+	HELP_LOAD_KEY,
+	HELP_UNLOAD_KEY,
+	HELP_CHANGE_KEY,
+	HELP_VERSION,
+	HELP_REDACT,
+	HELP_JAIL,
+	HELP_UNJAIL,
+	HELP_WAIT,
+} zfs_help_t;
+
+typedef struct zfs_command {
+	const char	*name;
+	int		(*func)(int argc, char **argv);
+	zfs_help_t	usage;
+} zfs_command_t;
+
+/*
+ * Master command table.  Each ZFS command has a name, associated function, and
+ * usage message.  The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message.  An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zfs_command_t command_table[] = {
+	{ "version",	zfs_do_version, 	HELP_VERSION		},
+	{ NULL },
+	{ "create",	zfs_do_create,		HELP_CREATE		},
+	{ "destroy",	zfs_do_destroy,		HELP_DESTROY		},
+	{ NULL },
+	{ "snapshot",	zfs_do_snapshot,	HELP_SNAPSHOT		},
+	{ "rollback",	zfs_do_rollback,	HELP_ROLLBACK		},
+	{ "clone",	zfs_do_clone,		HELP_CLONE		},
+	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
+	{ "rename",	zfs_do_rename,		HELP_RENAME		},
+	{ "bookmark",	zfs_do_bookmark,	HELP_BOOKMARK		},
+	{ "program",    zfs_do_channel_program, HELP_CHANNEL_PROGRAM    },
+	{ NULL },
+	{ "list",	zfs_do_list,		HELP_LIST		},
+	{ NULL },
+	{ "set",	zfs_do_set,		HELP_SET		},
+	{ "get",	zfs_do_get,		HELP_GET		},
+	{ "inherit",	zfs_do_inherit,		HELP_INHERIT		},
+	{ "upgrade",	zfs_do_upgrade,		HELP_UPGRADE		},
+	{ NULL },
+	{ "userspace",	zfs_do_userspace,	HELP_USERSPACE		},
+	{ "groupspace",	zfs_do_userspace,	HELP_GROUPSPACE		},
+	{ "projectspace", zfs_do_userspace,	HELP_PROJECTSPACE	},
+	{ NULL },
+	{ "project",	zfs_do_project,		HELP_PROJECT		},
+	{ NULL },
+	{ "mount",	zfs_do_mount,		HELP_MOUNT		},
+	{ "unmount",	zfs_do_unmount,		HELP_UNMOUNT		},
+	{ "share",	zfs_do_share,		HELP_SHARE		},
+	{ "unshare",	zfs_do_unshare,		HELP_UNSHARE		},
+	{ NULL },
+	{ "send",	zfs_do_send,		HELP_SEND		},
+	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
+	{ NULL },
+	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
+	{ NULL },
+	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
+	{ NULL },
+	{ "hold",	zfs_do_hold,		HELP_HOLD		},
+	{ "holds",	zfs_do_holds,		HELP_HOLDS		},
+	{ "release",	zfs_do_release,		HELP_RELEASE		},
+	{ "diff",	zfs_do_diff,		HELP_DIFF		},
+	{ "load-key",	zfs_do_load_key,	HELP_LOAD_KEY		},
+	{ "unload-key",	zfs_do_unload_key,	HELP_UNLOAD_KEY		},
+	{ "change-key",	zfs_do_change_key,	HELP_CHANGE_KEY		},
+	{ "redact",	zfs_do_redact,		HELP_REDACT		},
+	{ "wait",	zfs_do_wait,		HELP_WAIT		},
+
+#ifdef __FreeBSD__
+	{ "jail",	zfs_do_jail,		HELP_JAIL		},
+	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
+#endif
+};
+
+#define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
+
+zfs_command_t *current_command;
+
+static const char *
+get_usage(zfs_help_t idx)
+{
+	switch (idx) {
+	case HELP_CLONE:
+		return (gettext("\tclone [-p] [-o property=value] ... "
+		    "<snapshot> <filesystem|volume>\n"));
+	case HELP_CREATE:
+		return (gettext("\tcreate [-Pnpv] [-o property=value] ... "
+		    "<filesystem>\n"
+		    "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... "
+		    "-V <size> <volume>\n"));
+	case HELP_DESTROY:
+		return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
+		    "\tdestroy [-dnpRrv] "
+		    "<filesystem|volume>@<snap>[%<snap>][,...]\n"
+		    "\tdestroy <filesystem|volume>#<bookmark>\n"));
+	case HELP_GET:
+		return (gettext("\tget [-rHp] [-d max] "
+		    "[-o \"all\" | field[,...]]\n"
+		    "\t    [-t type[,...]] [-s source[,...]]\n"
+		    "\t    <\"all\" | property[,...]> "
+		    "[filesystem|volume|snapshot|bookmark] ...\n"));
+	case HELP_INHERIT:
+		return (gettext("\tinherit [-rS] <property> "
+		    "<filesystem|volume|snapshot> ...\n"));
+	case HELP_UPGRADE:
+		return (gettext("\tupgrade [-v]\n"
+		    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
+	case HELP_LIST:
+		return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
+		    "[-s property]...\n\t    [-S property]... [-t type[,...]] "
+		    "[filesystem|volume|snapshot] ...\n"));
+	case HELP_MOUNT:
+		return (gettext("\tmount\n"
+		    "\tmount [-flvO] [-o opts] <-a | filesystem>\n"));
+	case HELP_PROMOTE:
+		return (gettext("\tpromote <clone-filesystem>\n"));
+	case HELP_RECEIVE:
+		return (gettext("\treceive [-vMnsFhu] "
+		    "[-o <property>=<value>] ... [-x <property>] ...\n"
+		    "\t    <filesystem|volume|snapshot>\n"
+		    "\treceive [-vMnsFhu] [-o <property>=<value>] ... "
+		    "[-x <property>] ... \n"
+		    "\t    [-d | -e] <filesystem>\n"
+		    "\treceive -A <filesystem|volume>\n"));
+	case HELP_RENAME:
+		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
+		    "<filesystem|volume|snapshot>\n"
+		    "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
+		    "\trename -r <snapshot> <snapshot>\n"));
+	case HELP_ROLLBACK:
+		return (gettext("\trollback [-rRf] <snapshot>\n"));
+	case HELP_SEND:
+		return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] "
+		    "<snapshot>\n"
+		    "\tsend [-nvPLecw] [-i snapshot|bookmark] "
+		    "<filesystem|volume|snapshot>\n"
+		    "\tsend [-DnPpvLec] [-i bookmark|snapshot] "
+		    "--redact <bookmark> <snapshot>\n"
+		    "\tsend [-nvPe] -t <receive_resume_token>\n"
+		    "\tsend [-Pnv] --saved filesystem\n"));
+	case HELP_SET:
+		return (gettext("\tset <property=value> ... "
+		    "<filesystem|volume|snapshot> ...\n"));
+	case HELP_SHARE:
+		return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n"));
+	case HELP_SNAPSHOT:
+		return (gettext("\tsnapshot [-r] [-o property=value] ... "
+		    "<filesystem|volume>@<snap> ...\n"));
+	case HELP_UNMOUNT:
+		return (gettext("\tunmount [-fu] "
+		    "<-a | filesystem|mountpoint>\n"));
+	case HELP_UNSHARE:
+		return (gettext("\tunshare "
+		    "<-a [nfs|smb] | filesystem|mountpoint>\n"));
+	case HELP_ALLOW:
+		return (gettext("\tallow <filesystem|volume>\n"
+		    "\tallow [-ldug] "
+		    "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
+		    "\t    <filesystem|volume>\n"
+		    "\tallow [-ld] -e <perm|@setname>[,...] "
+		    "<filesystem|volume>\n"
+		    "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
+		    "\tallow -s @setname <perm|@setname>[,...] "
+		    "<filesystem|volume>\n"));
+	case HELP_UNALLOW:
+		return (gettext("\tunallow [-rldug] "
+		    "<\"everyone\"|user|group>[,...]\n"
+		    "\t    [<perm|@setname>[,...]] <filesystem|volume>\n"
+		    "\tunallow [-rld] -e [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"
+		    "\tunallow [-r] -c [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"
+		    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
+		    "<filesystem|volume>\n"));
+	case HELP_USERSPACE:
+		return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
+	case HELP_GROUPSPACE:
+		return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
+	case HELP_PROJECTSPACE:
+		return (gettext("\tprojectspace [-Hp] [-o field[,...]] "
+		    "[-s field] ... \n"
+		    "\t    [-S field] ... <filesystem|snapshot>\n"));
+	case HELP_PROJECT:
+		return (gettext("\tproject [-d|-r] <directory|file ...>\n"
+		    "\tproject -c [-0] [-d|-r] [-p id] <directory|file ...>\n"
+		    "\tproject -C [-k] [-r] <directory ...>\n"
+		    "\tproject [-p id] [-r] [-s] <directory ...>\n"));
+	case HELP_HOLD:
+		return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+	case HELP_HOLDS:
+		return (gettext("\tholds [-rH] <snapshot> ...\n"));
+	case HELP_RELEASE:
+		return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+	case HELP_DIFF:
+		return (gettext("\tdiff [-FHt] <snapshot> "
+		    "[snapshot|filesystem]\n"));
+	case HELP_BOOKMARK:
+		return (gettext("\tbookmark <snapshot|bookmark> "
+		    "<newbookmark>\n"));
+	case HELP_CHANNEL_PROGRAM:
+		return (gettext("\tprogram [-jn] [-t <instruction limit>] "
+		    "[-m <memory limit (b)>]\n"
+		    "\t    <pool> <program file> [lua args...]\n"));
+	case HELP_LOAD_KEY:
+		return (gettext("\tload-key [-rn] [-L <keylocation>] "
+		    "<-a | filesystem|volume>\n"));
+	case HELP_UNLOAD_KEY:
+		return (gettext("\tunload-key [-r] "
+		    "<-a | filesystem|volume>\n"));
+	case HELP_CHANGE_KEY:
+		return (gettext("\tchange-key [-l] [-o keyformat=<value>]\n"
+		    "\t    [-o keylocation=<value>] [-o pbkdf2iters=<value>]\n"
+		    "\t    <filesystem|volume>\n"
+		    "\tchange-key -i [-l] <filesystem|volume>\n"));
+	case HELP_VERSION:
+		return (gettext("\tversion\n"));
+	case HELP_REDACT:
+		return (gettext("\tredact <snapshot> <bookmark> "
+		    "<redaction_snapshot> ...\n"));
+	case HELP_JAIL:
+		return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
+	case HELP_UNJAIL:
+		return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
+	case HELP_WAIT:
+		return (gettext("\twait [-t <activity>] <filesystem>\n"));
+	}
+	/* Every zfs_help_t value returns above; getting here is a bug. */
+	abort();
+	/* NOTREACHED */
+}
+
+void
+nomem(void)
+{
+	(void) fprintf(stderr, gettext("internal error: out of memory\n"));
+	exit(1);
+}
+
+/*
+ * Allocate 'size' zero-filled bytes with calloc(); never returns NULL because
+ * allocation failure ends the process through nomem().
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *data;
+
+	if ((data = calloc(1, size)) == NULL)
+		nomem();
+
+	return (data);
+}
+
+static void *
+safe_realloc(void *data, size_t size)
+{
+	void *newp;
+	if ((newp = realloc(data, size)) == NULL) {
+		free(data);	/* release the old block before bailing out */
+		nomem();	/* prints an error and exits */
+	}
+
+	return (newp);
+}
+
+static char *
+safe_strdup(char *str)
+{
+	char *dupstr = strdup(str);
+
+	if (dupstr == NULL)
+		nomem();
+
+	return (dupstr);
+}
+
+/*
+ * zprop_iter() callback: print one row of the property table (name, EDIT,
+ * INHERIT, VALUES) for 'prop' to the FILE handed in through 'cb'.
+ */
+static int
+usage_prop_cb(int prop, void *cb)
+{
+	FILE *fp = cb;
+
+	(void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop));
+
+	if (zfs_prop_readonly(prop))
+		(void) fprintf(fp, " NO    ");
+	else
+		(void) fprintf(fp, "YES    ");
+
+	if (zfs_prop_inheritable(prop))
+		(void) fprintf(fp, "  YES   ");
+	else
+		(void) fprintf(fp, "   NO   ");
+
+	if (zfs_prop_values(prop) == NULL)
+		(void) fprintf(fp, "-\n");
+	else
+		(void) fprintf(fp, "%s\n", zfs_prop_values(prop));
+
+	return (ZPROP_CONT);
+}
+
+/*
+ * Display usage message.  If we're inside a command, display only the usage for
+ * that command.  Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ */
+static void
+usage(boolean_t requested)
+{
+	int i;
+	boolean_t show_properties = B_FALSE;
+	FILE *fp = requested ? stdout : stderr;
+
+	if (current_command == NULL) {
+
+		(void) fprintf(fp, gettext("usage: zfs command args ...\n"));
+		(void) fprintf(fp,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		for (i = 0; i < NCOMMAND; i++) {
+			if (command_table[i].name == NULL)
+				(void) fprintf(fp, "\n");
+			else
+				(void) fprintf(fp, "%s",
+				    get_usage(command_table[i].usage));
+		}
+
+		(void) fprintf(fp, gettext("\nEach dataset is of the form: "
+		    "pool/[dataset/]*dataset[@name]\n"));
+	} else {
+		(void) fprintf(fp, gettext("usage:\n"));
+		(void) fprintf(fp, "%s", get_usage(current_command->usage));
+	}
+
+	if (current_command != NULL &&
+	    (strcmp(current_command->name, "set") == 0 ||
+	    strcmp(current_command->name, "get") == 0 ||
+	    strcmp(current_command->name, "inherit") == 0 ||
+	    strcmp(current_command->name, "list") == 0))
+		show_properties = B_TRUE;
+
+	if (show_properties) {
+		(void) fprintf(fp,
+		    gettext("\nThe following properties are supported:\n"));
+
+		(void) fprintf(fp, "\n\t%-14s %s  %s   %s\n\n",
+		    "PROPERTY", "EDIT", "INHERIT", "VALUES");
+
+		/* Iterate over all properties */
+		(void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
+		    ZFS_TYPE_DATASET);
+
+		(void) fprintf(fp, "\t%-15s ", "userused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "groupused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "projectused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "userobjused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "groupobjused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "projectobjused@...");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "userquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "groupquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "projectquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "userobjquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "groupobjquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "projectobjquota@...");
+		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "written@<snap>");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+		(void) fprintf(fp, "\t%-15s ", "written#<bookmark>");
+		(void) fprintf(fp, " NO       NO   <size>\n");
+
+		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
+		    "with standard units such as K, M, G, etc.\n"));
+		(void) fprintf(fp, gettext("\nUser-defined properties can "
+		    "be specified by using a name containing a colon (:).\n"));
+		(void) fprintf(fp, gettext("\nThe {user|group|project}"
+		    "[obj]{used|quota}@ properties must be appended with\n"
+		    "a user|group|project specifier of one of these forms:\n"
+		    "    POSIX name      (eg: \"matt\")\n"
+		    "    POSIX id        (eg: \"126829\")\n"
+		    "    SMB name@domain (eg: \"matt@sun\")\n"
+		    "    SMB SID         (eg: \"S-1-234-567-89\")\n"));
+	} else {
+		(void) fprintf(fp,
+		    gettext("\nFor the property list, run: %s\n"),
+		    "zfs set|get");
+		(void) fprintf(fp,
+		    gettext("\nFor the delegated permission list, run: %s\n"),
+		    "zfs allow|unallow");
+	}
+
+	/*
+	 * See comments at end of main().
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+/*
+ * Take a property=value argument string and add it to the given nvlist.
+ * Modifies the argument in place: the first '=' is overwritten with '\0'.
+ */
+static boolean_t
+parseprop(nvlist_t *props, char *propname)
+{
+	char *propval;
+
+	if ((propval = strchr(propname, '=')) == NULL) {
+		(void) fprintf(stderr, gettext("missing "
+		    "'=' for property=value argument\n"));
+		return (B_FALSE);
+	}
+	*propval = '\0';	/* propname now ends where the '=' was */
+	propval++;		/* the value starts right after it */
+	if (nvlist_exists(props, propname)) {
+		(void) fprintf(stderr, gettext("property '%s' "
+		    "specified multiple times\n"), propname);
+		return (B_FALSE);
+	}
+	if (nvlist_add_string(props, propname, propval) != 0)
+		nomem();
+	return (B_TRUE);
+}
+
+/*
+ * Take a property name argument and add it to the given nvlist as a
+ * value-less boolean entry.  The argument itself is not modified.
+ */
+static boolean_t
+parsepropname(nvlist_t *props, char *propname)
+{
+	if (strchr(propname, '=') != NULL) {
+		(void) fprintf(stderr, gettext("invalid character "
+		    "'=' in property argument\n"));
+		return (B_FALSE);
+	}
+	if (nvlist_exists(props, propname)) {
+		(void) fprintf(stderr, gettext("property '%s' "
+		    "specified multiple times\n"), propname);
+		return (B_FALSE);
+	}
+	if (nvlist_add_boolean(props, propname) != 0)
+		nomem();
+	return (B_TRUE);
+}
+
+static int
+parse_depth(char *opt, int *flags)
+{
+	char *tmp;
+	int depth;
+
+	depth = (int)strtol(opt, &tmp, 0);	/* base 0: dec/oct/hex */
+	if (tmp == opt || *tmp != '\0') {	/* empty or trailing junk */
+		(void) fprintf(stderr,
+		    gettext("%s is not an integer\n"), opt);
+		usage(B_FALSE);
+	}
+	if (depth < 0) {
+		(void) fprintf(stderr,
+		    gettext("Depth can not be negative.\n"));
+		usage(B_FALSE);
+	}
+	*flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
+	return (depth);
+}
+
+#define	PROGRESS_DELAY 2		/* seconds */
+
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+	pt_begin = time(NULL) + PROGRESS_DELAY;
+	pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+	assert(pt_header == NULL);
+	pt_header = safe_strdup(header);
+	if (pt_shown) {
+		(void) printf("%s: ", header);
+		(void) fflush(stdout);
+	}
+}
+
+static void
+update_progress(char *update)
+{
+	if (!pt_shown && time(NULL) > pt_begin) {	/* delay has passed */
+		int len = strlen(update);
+		/* The '\b' run moves the cursor back over 'update'. */
+		(void) printf("%s: %s%*.*s", pt_header, update, len, len,
+		    pt_reverse);
+		(void) fflush(stdout);
+		pt_shown = B_TRUE;
+	} else if (pt_shown) {
+		int len = strlen(update);
+
+		(void) printf("%s%*.*s", update, len, len, pt_reverse);
+		(void) fflush(stdout);
+	}
+}
+
+static void
+finish_progress(char *done)
+{
+	if (pt_shown) {
+		(void) printf("%s\n", done);
+		(void) fflush(stdout);
+	}
+	free(pt_header);
+	pt_header = NULL;
+}
+
+static int
+zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type)
+{
+	zfs_handle_t *zhp = NULL;
+	int ret = 0;
+
+	zhp = zfs_open(hdl, dataset, type);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * Volumes may neither be mounted nor shared.  Potentially in the
+	 * future filesystems detected on these volumes could be mounted.
+	 */
+	if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * Mount and/or share the new filesystem as appropriate.  We provide a
+	 * verbose error message to let the user know that their filesystem was
+	 * in fact created, even if we failed to mount or share it.
+	 *
+	 * If the user doesn't want the dataset automatically mounted, then
+	 * skip the mount/share step.
+	 */
+	if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) {
+		if (zfs_mount_delegation_check()) {
+			(void) fprintf(stderr, gettext("filesystem "
+			    "successfully created, but it may only be "
+			    "mounted by root\n"));
+			ret = 1;
+		} else if (zfs_mount(zhp, NULL, 0) != 0) {
+			(void) fprintf(stderr, gettext("filesystem "
+			    "successfully created, but not mounted\n"));
+			ret = 1;
+		} else if (zfs_share(zhp) != 0) {
+			(void) fprintf(stderr, gettext("filesystem "
+			    "successfully created, but not shared\n"));
+			ret = 1;
+		}
+		zfs_commit_all_shares();
+	}
+
+	zfs_close(zhp);
+
+	return (ret);	/* 1 if the mount or share step reported a problem */
+}
+
+/*
+ * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
+ *
+ * Given an existing dataset, create a writable copy whose initial contents
+ * are the same as the source.  The newly created dataset maintains a
+ * dependency on the original; the original cannot be destroyed so long as
+ * the clone exists.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+static int
+zfs_do_clone(int argc, char **argv)
+{
+	zfs_handle_t *zhp = NULL;
+	boolean_t parents = B_FALSE;
+	nvlist_t *props;
+	int ret = 0;
+	int c;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, "o:p")) != -1) {
+		switch (c) {
+		case 'o':
+			if (!parseprop(props, optarg)) {
+				nvlist_free(props);
+				return (1);
+			}
+			break;
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		goto usage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		goto usage;
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		goto usage;
+	}
+
+	/* open the source dataset */
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) {
+		/*
+		 * Now create the ancestors of the target dataset.  If the
+		 * target already exists and '-p' option was used we should not
+		 * complain.
+		 */
+		if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
+		    ZFS_TYPE_VOLUME)) {
+			zfs_close(zhp);
+			nvlist_free(props);
+			return (0);
+		}
+		if (zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+			zfs_close(zhp);
+			nvlist_free(props);
+			return (1);
+		}
+	}
+
+	/* pass to libzfs */
+	ret = zfs_clone(zhp, argv[1], props);
+
+	/* create the mountpoint if necessary */
+	if (ret == 0) {
+		if (log_history) {
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
+		}
+
+		ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
+	}
+
+	zfs_close(zhp);
+	nvlist_free(props);
+
+	return (!!ret);
+
+usage:
+	ASSERT3P(zhp, ==, NULL);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (-1);
+}
+
+/*
+ * zfs create [-Pnpv] [-o prop=value] ... fs
+ * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
+ *
+ * Create a new dataset.  This command can be used to create filesystems
+ * and volumes.  Snapshot creation is handled by 'zfs snapshot'.
+ * For volumes, the user must specify a size to be used.
+ *
+ * The '-s' flag applies only to volumes, and indicates that we should not try
+ * to set the reservation for this volume.  By default we set a reservation
+ * equal to the size for any volume.  For pools with SPA_VERSION >=
+ * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ *
+ * The '-n' flag is no-op (dry run) mode.  This will perform a user-space sanity
+ * check of arguments and properties, but does not check for permissions,
+ * available space, etc.
+ *
+ * The '-v' flag is for verbose output.
+ *
+ * The '-P' flag is used for parseable output.  It implies '-v'.
+ */
+static int
+zfs_do_create(int argc, char **argv)
+{
+	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
+	zpool_handle_t *zpool_handle = NULL;
+	nvlist_t *real_props = NULL;
+	uint64_t volsize = 0;
+	int c;
+	boolean_t noreserve = B_FALSE;
+	boolean_t bflag = B_FALSE;
+	boolean_t parents = B_FALSE;
+	boolean_t dryrun = B_FALSE;
+	boolean_t verbose = B_FALSE;
+	boolean_t parseable = B_FALSE;
+	int ret = 1;
+	nvlist_t *props;
+	uint64_t intval;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) {
+		switch (c) {
+		case 'V':
+			type = ZFS_TYPE_VOLUME;
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+				nomem();
+			volsize = intval;
+			break;
+		case 'P':
+			verbose = B_TRUE;
+			parseable = B_TRUE;
+			break;
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case 'b':
+			bflag = B_TRUE;
+			if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+				(void) fprintf(stderr, gettext("bad volume "
+				    "block size '%s': %s\n"), optarg,
+				    libzfs_error_description(g_zfs));
+				goto error;
+			}
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+			    intval) != 0)
+				nomem();
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'o':
+			if (!parseprop(props, optarg))
+				goto error;
+			break;
+		case 's':
+			noreserve = B_TRUE;
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing size "
+			    "argument\n"));
+			goto badusage;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto badusage;
+		}
+	}
+
+	if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
+		(void) fprintf(stderr, gettext("'-s' and '-b' can only be "
+		    "used when creating a volume\n"));
+		goto badusage;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing %s argument\n"),
+		    zfs_type_to_name(type));
+		goto badusage;
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		goto badusage;
+	}
+
+	if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
+		char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
+		char *p;
+
+		if ((p = strchr(argv[0], '/')) != NULL)
+			*p = '\0';
+		zpool_handle = zpool_open(g_zfs, argv[0]);
+		if (p != NULL)
+			*p = '/';
+		if (zpool_handle == NULL)
+			goto error;
+
+		(void) snprintf(msg, sizeof (msg),
+		    dryrun ? gettext("cannot verify '%s'") :
+		    gettext("cannot create '%s'"), argv[0]);
+		if (props && (real_props = zfs_valid_proplist(g_zfs, type,
+		    props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) {
+			zpool_close(zpool_handle);
+			goto error;
+		}
+	}
+
+	/*
+	 * Round volsize up to the next multiple of volblocksize.  A zero
+	 * volblocksize ('-b 0') is skipped rather than used as a divisor.
+	 */
+	if (type == ZFS_TYPE_VOLUME) {
+		uint64_t volblocksize;
+
+		if (nvlist_lookup_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+		    &volblocksize) != 0)
+			volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+
+		if (volblocksize != 0 && volsize % volblocksize) {
+			volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
+			    uint64_t);
+
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) {
+				nvlist_free(props);
+				nomem();
+			}
+		}
+	}
+
+
+	if (type == ZFS_TYPE_VOLUME && !noreserve) {
+		uint64_t spa_version;
+		zfs_prop_t resv_prop;
+		char *strval;
+
+		spa_version = zpool_get_prop_int(zpool_handle,
+		    ZPOOL_PROP_VERSION, NULL);
+		if (spa_version >= SPA_VERSION_REFRESERVATION)
+			resv_prop = ZFS_PROP_REFRESERVATION;
+		else
+			resv_prop = ZFS_PROP_RESERVATION;
+
+		volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
+		    real_props);
+
+		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
+		    &strval) != 0) {
+			if (nvlist_add_uint64(props,
+			    zfs_prop_to_name(resv_prop), volsize) != 0) {
+				nvlist_free(props);
+				nomem();
+			}
+		}
+	}
+	if (zpool_handle != NULL) {
+		zpool_close(zpool_handle);
+		nvlist_free(real_props);
+	}
+
+	if (parents && zfs_name_valid(argv[0], type)) {
+		/*
+		 * Now create the ancestors of target dataset.  If the target
+		 * already exists and '-p' option was used we should not
+		 * complain.
+		 */
+		if (zfs_dataset_exists(g_zfs, argv[0], type)) {
+			ret = 0;
+			goto error;
+		}
+		if (verbose) {
+			(void) printf(parseable ? "create_ancestors\t%s\n" :
+			    dryrun ?  "would create ancestors of %s\n" :
+			    "create ancestors of %s\n", argv[0]);
+		}
+		if (!dryrun) {
+			if (zfs_create_ancestors(g_zfs, argv[0]) != 0) {
+				goto error;
+			}
+		}
+	}
+
+	if (verbose) {
+		nvpair_t *nvp = NULL;
+		(void) printf(parseable ? "create\t%s\n" :
+		    dryrun ? "would create %s\n" : "create %s\n", argv[0]);
+		while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) {
+			uint64_t uval;
+			char *sval;
+
+			switch (nvpair_type(nvp)) {
+			case DATA_TYPE_UINT64:
+				VERIFY0(nvpair_value_uint64(nvp, &uval));
+				(void) printf(parseable ?
+				    "property\t%s\t%llu\n" : "\t%s=%llu\n",
+				    nvpair_name(nvp), (u_longlong_t)uval);
+				break;
+			case DATA_TYPE_STRING:
+				VERIFY0(nvpair_value_string(nvp, &sval));
+				(void) printf(parseable ?
+				    "property\t%s\t%s\n" : "\t%s=%s\n",
+				    nvpair_name(nvp), sval);
+				break;
+			default:
+				(void) fprintf(stderr, "property '%s' "
+				    "has illegal type %d\n",
+				    nvpair_name(nvp), nvpair_type(nvp));
+				abort();
+			}
+		}
+	}
+	if (dryrun) {
+		ret = 0;
+		goto error;
+	}
+
+	/* pass to libzfs */
+	if (zfs_create(g_zfs, argv[0], type, props) != 0)
+		goto error;
+
+	if (log_history) {
+		(void) zpool_log_history(g_zfs, history_str);
+		log_history = B_FALSE;
+	}
+
+	ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
+error:
+	nvlist_free(props);
+	return (ret);
+badusage:
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (2);
+}
+
+/*
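+ * zfs destroy [-rRf] [-nvp] <fs, vol>
+ * zfs destroy [-rRd] [-nvp] <snap>
+ * zfs destroy <fs, vol>#<bookmark>
+ *
+ *	-r	Recursively destroy all children
+ *	-R	Recursively destroy all dependents, including clones
+ *	-f	Force unmounting of any dependents
+ *	-d	If we can't destroy now, mark for deferred destruction
+ *	-n	Dry run; do not actually destroy anything
+ *	-v	Print each dataset as it is processed
+ *	-p	Like -v, but print machine-parsable output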
+ *
+ * Destroys the given dataset.  By default, it will unmount any filesystems,
+ * and refuse to destroy a dataset that has any dependents.  A dependent can
+ * either be a child, or a clone of a child.
+ */
+/*
+ * State shared between zfs_do_destroy() and its iteration callbacks.
+ */
+typedef struct destroy_cbdata {
+	boolean_t	cb_first;	/* next report must print its header */
+	boolean_t	cb_force;	/* -f: unmount with MS_FORCE */
+	boolean_t	cb_recurse;	/* -r or -R */
+	boolean_t	cb_error;	/* a check or a destroy has failed */
+	boolean_t	cb_doclones;	/* -R: destroy clones as well */
+	zfs_handle_t	*cb_target;	/* dataset whose dependents are checked */
+	boolean_t	cb_defer_destroy;	/* -d */
+	boolean_t	cb_verbose;	/* -v or -p */
+	boolean_t	cb_parsable;	/* -p */
+	boolean_t	cb_dryrun;	/* -n */
+	nvlist_t	*cb_nvl;	/* snapshots matched by the snapspec */
+	nvlist_t	*cb_batchedsnaps;	/* snapshots queued for one ioctl */
+
+	/* first snap in contiguous run */
+	char		*cb_firstsnap;
+	/* previous snap in contiguous run */
+	char		*cb_prevsnap;
+	/* space total accumulated from lzc_snaprange_space() */
+	int64_t		cb_snapused;
+	/* text following the '@' of the command line argument */
+	char		*cb_snapspec;
+	/* NOTE(review): not referenced by the destroy code visible here */
+	char		*cb_bookmark;
+	/* number of snapshots queued so far by destroy_callback() */
+	uint64_t	cb_snap_count;
+} destroy_cbdata_t;
+
+/*
+ * zfs_iter_dependents() callback run before anything is destroyed: decides
+ * whether the dependent 'zhp' blocks the destruction of cbp->cb_target under
+ * the '-r' / '-R' flags that were given.
+ *
+ * A blocking dependent is listed on stderr, preceded the first time by an
+ * explanatory header, and cbp->cb_error is set.  The handle is always closed
+ * and 0 is always returned; problems are reported through cbp->cb_error.
+ */
+static int
+destroy_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cbp = data;
+	const char *target = zfs_get_name(cbp->cb_target);
+	const char *dep = zfs_get_name(zhp);
+	size_t tlen = strlen(target);
+	boolean_t is_descendant;
+	boolean_t blocking;
+
+	/*
+	 * "<target>/..." and "<target>@..." are descendants of the target;
+	 * any other dependent is a clone living elsewhere in the hierarchy.
+	 */
+	is_descendant = (strncmp(target, dep, tlen) == 0 &&
+	    (dep[tlen] == '/' || dep[tlen] == '@'));
+
+	/*
+	 * Descendants block the destroy unless '-r' was given.  Clones are
+	 * reported when '-r' was given or when the target is a snapshot.
+	 */
+	if (is_descendant)
+		blocking = !cbp->cb_recurse;
+	else
+		blocking = cbp->cb_recurse ||
+		    zfs_get_type(cbp->cb_target) == ZFS_TYPE_SNAPSHOT;
+
+	if (!blocking) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/* The header is printed once per target, ahead of the first entry. */
+	if (cbp->cb_first) {
+		if (is_descendant) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has children\n"),
+			    target,
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-r' to destroy "
+			    "the following datasets:\n"));
+		} else {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "%s has dependent clones\n"),
+			    target,
+			    zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+			(void) fprintf(stderr, gettext("use '-R' to destroy "
+			    "the following datasets:\n"));
+			/* Dependent clones also switch the run to dry-run. */
+			cbp->cb_dryrun = B_TRUE;
+		}
+		cbp->cb_first = B_FALSE;
+		cbp->cb_error = B_TRUE;
+	}
+
+	(void) fprintf(stderr, "%s\n", dep);
+
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Destroy every snapshot queued in cb->cb_batchedsnaps with a single
+ * zfs_destroy_snaps_nvl() call (never deferred) and replace the queue with a
+ * fresh, empty one whether or not the call succeeded.  Returns the
+ * zfs_destroy_snaps_nvl() result.
+ */
+static int
+destroy_batched(destroy_cbdata_t *cb)
+{
+	int rc;
+
+	rc = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE);
+
+	fnvlist_free(cb->cb_batchedsnaps);
+	cb->cb_batchedsnaps = fnvlist_alloc();
+
+	return (rc);
+}
+
+/*
+ * Destroy a single dataset.  Used as the zfs_iter_dependents() callback and
+ * also called directly on the target dataset by zfs_do_destroy().
+ *
+ * Snapshots are not destroyed right away; they are queued in
+ * cb->cb_batchedsnaps.  Any other dataset first flushes the queued snapshots
+ * and is then unmounted and destroyed.
+ *
+ * The handle is always closed.  Returns 0 on success, for a dry run, for a
+ * pool, and for a non-snapshot failure during a recursive destroy (which is
+ * only recorded in cb->cb_error); returns -1 on any other failure.
+ */
+static int
+destroy_callback(zfs_handle_t *zhp, void *data)
+{
+	destroy_cbdata_t *cb = data;
+	const char *name = zfs_get_name(zhp);
+	int error = 0;
+
+	if (cb->cb_verbose) {
+		if (cb->cb_parsable) {
+			(void) printf("destroy\t%s\n", name);
+		} else if (cb->cb_dryrun) {
+			(void) printf(gettext("would destroy %s\n"),
+			    name);
+		} else {
+			(void) printf(gettext("will destroy %s\n"),
+			    name);
+		}
+	}
+
+	/*
+	 * Ignore pools (which we've already flagged as an error before getting
+	 * here).
+	 */
+	if (strchr(zfs_get_name(zhp), '/') == NULL &&
+	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+		zfs_close(zhp);
+		return (0);
+	}
+	if (cb->cb_dryrun) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	/*
+	 * We batch up all contiguous snapshots (even of different
+	 * filesystems) and destroy them with one ioctl.  We can't
+	 * simply do all snap deletions and then all fs deletions,
+	 * because we must delete a clone before its origin.
+	 */
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		cb->cb_snap_count++;
+		fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+		/*
+		 * With deferred destroy the queue is flushed after every
+		 * tenth snapshot.  A failed flush must be reported: the
+		 * queue has already been reset by destroy_batched(), so the
+		 * error would otherwise be lost.
+		 */
+		if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) {
+			error = destroy_batched(cb);
+			if (error != 0) {
+				zfs_close(zhp);
+				return (-1);
+			}
+		}
+	} else {
+		error = destroy_batched(cb);
+		if (error != 0 ||
+		    zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+		    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
+			zfs_close(zhp);
+			/*
+			 * When performing a recursive destroy we ignore errors
+			 * so that the recursive destroy could continue
+			 * destroying past problem datasets
+			 */
+			if (cb->cb_recurse) {
+				cb->cb_error = B_TRUE;
+				return (0);
+			}
+			return (-1);
+		}
+	}
+
+	zfs_close(zhp);
+	return (0);
+}
+
+/*
+ * Snapshot iteration callback used by destroy_print_snapshots().
+ *
+ * Tracks runs of consecutive snapshots that are present in cb->cb_nvl.
+ * While a run is open, the names of its first and most recent snapshot are
+ * kept in cb->cb_firstsnap and cb->cb_prevsnap.  A snapshot that is not in
+ * cb->cb_nvl ends the run: its space is obtained from lzc_snaprange_space()
+ * and added to cb->cb_snapused.
+ *
+ * The handle is always closed.  Returns 0, or the lzc_snaprange_space()
+ * result when a run was ended.
+ */
+static int
+destroy_print_cb(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	const char *snapname = zfs_get_name(zhp);
+	int rc = 0;
+
+	if (!nvlist_exists(cb->cb_nvl, snapname)) {
+		if (cb->cb_firstsnap != NULL) {
+			/* This snapshot ends the run that was open. */
+			uint64_t space = 0;
+
+			rc = lzc_snaprange_space(cb->cb_firstsnap,
+			    cb->cb_prevsnap, &space);
+			cb->cb_snapused += space;
+			free(cb->cb_firstsnap);
+			free(cb->cb_prevsnap);
+			cb->cb_firstsnap = NULL;
+			cb->cb_prevsnap = NULL;
+		}
+		zfs_close(zhp);
+		return (rc);
+	}
+
+	/* The snapshot opens a new run or extends the current one. */
+	if (cb->cb_firstsnap == NULL)
+		cb->cb_firstsnap = strdup(snapname);
+	free(cb->cb_prevsnap);		/* free(NULL) is a no-op */
+	cb->cb_prevsnap = strdup(snapname);
+	if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
+		nomem();
+
+	if (cb->cb_verbose) {
+		if (cb->cb_parsable)
+			(void) printf("destroy\t%s\n", snapname);
+		else if (cb->cb_dryrun)
+			(void) printf(gettext("would destroy %s\n"), snapname);
+		else
+			(void) printf(gettext("will destroy %s\n"), snapname);
+	}
+
+	zfs_close(zhp);
+	return (rc);
+}
+
+/*
+ * Run destroy_print_cb() over the sorted snapshots of 'fs_zhp' and close out
+ * the run of snapshots, if any, that was still open when the iteration
+ * ended.  Expects no run to be open on entry.  Returns the first error seen,
+ * or 0.
+ */
+static int
+destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
+{
+	uint64_t space = 0;
+	int rc;
+
+	assert(cb->cb_firstsnap == NULL);
+	assert(cb->cb_prevsnap == NULL);
+
+	rc = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0);
+	if (cb->cb_firstsnap == NULL)
+		return (rc);
+
+	/* Only query the space when the iteration itself succeeded. */
+	if (rc == 0) {
+		rc = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap,
+		    &space);
+	}
+	cb->cb_snapused += space;
+
+	free(cb->cb_firstsnap);
+	free(cb->cb_prevsnap);
+	cb->cb_firstsnap = NULL;
+	cb->cb_prevsnap = NULL;
+
+	return (rc);
+}
+
+/*
+ * zfs_iter_snapspec() callback: record the snapshot 'zhp' in cb->cb_nvl.
+ *
+ * Unless clones are being destroyed too (-R) or the destroy is deferred
+ * (-d), the snapshot's dependents are checked first with
+ * destroy_check_dependent(); the snapshot is only recorded when that
+ * iteration returns 0.  The handle is always closed.
+ */
+static int
+snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int rc = 0;
+
+	if (!(cb->cb_doclones || cb->cb_defer_destroy)) {
+		/* Each snapshot is checked as its own target. */
+		cb->cb_target = zhp;
+		cb->cb_first = B_TRUE;
+		rc = zfs_iter_dependents(zhp, B_TRUE,
+		    destroy_check_dependent, cb);
+	}
+
+	if (rc == 0 &&
+	    nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)) != 0)
+		nomem();
+
+	zfs_close(zhp);
+	return (rc);
+}
+
+/*
+ * Collect into cb->cb_nvl the snapshots of 'zhp' that match
+ * cb->cb_snapspec, print them when verbose output was requested and, with
+ * cb->cb_recurse, repeat for every child filesystem (this function is its own
+ * zfs_iter_filesystems() callback).  ENOENT from zfs_iter_snapspec() is not
+ * treated as an error.  The handle is always closed.
+ */
+static int
+gather_snapshots(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int rc;
+
+	rc = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
+	if (rc == ENOENT)
+		rc = 0;
+
+	/* Each later step only runs while everything so far has succeeded. */
+	if (rc == 0 && cb->cb_verbose)
+		rc = destroy_print_snapshots(zhp, cb);
+
+	if (rc == 0 && cb->cb_recurse)
+		rc = zfs_iter_filesystems(zhp, gather_snapshots, cb);
+
+	zfs_close(zhp);
+	return (rc);
+}
+
+/*
+ * Run destroy_callback() on the dependents of every snapshot named in
+ * cb->cb_nvl.  Snapshots that cannot be opened are skipped.  Stops at, and
+ * returns, the first non-zero zfs_iter_dependents() result; returns 0
+ * otherwise.
+ */
+static int
+destroy_clones(destroy_cbdata_t *cb)
+{
+	nvpair_t *pair = NULL;
+
+	while ((pair = nvlist_next_nvpair(cb->cb_nvl, pair)) != NULL) {
+		zfs_handle_t *zhp;
+		boolean_t saved_defer;
+		int rc;
+
+		zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT);
+		if (zhp == NULL)
+			continue;
+
+		/*
+		 * Non-snapshots cannot be destroyed in deferred mode, so the
+		 * flag is switched off for the duration of the iteration and
+		 * restored afterwards.
+		 */
+		saved_defer = cb->cb_defer_destroy;
+		cb->cb_defer_destroy = B_FALSE;
+		rc = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb);
+		cb->cb_defer_destroy = saved_defer;
+
+		zfs_close(zhp);
+		if (rc != 0)
+			return (rc);
+	}
+
+	return (0);
+}
+
+/*
+ * Implements 'zfs destroy'.  After option parsing exactly one operand is
+ * accepted, and its form selects one of three paths:
+ *
+ *   - "<fs|vol>@<snapspec>": the matching snapshots are gathered into
+ *     cb.cb_nvl and destroyed; with -R their clones are destroyed first.
+ *   - a name containing '#': a single bookmark is destroyed; -n, -d, -r
+ *     and -R are rejected.
+ *   - anything else: blocking dependents are checked for, then the
+ *     dependents and finally the dataset itself are destroyed.
+ *
+ * Returns 0 on success and 1 on failure, except on the bookmark path, which
+ * returns -1 for a rejected option and otherwise the lzc_destroy_bookmarks()
+ * result.
+ */
+static int
+zfs_do_destroy(int argc, char **argv)
+{
+	destroy_cbdata_t cb = { 0 };
+	int rv = 0;
+	int err = 0;
+	int c;
+	zfs_handle_t *zhp = NULL;
+	char *at, *pound;
+	zfs_type_t type = ZFS_TYPE_DATASET;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
+		switch (c) {
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
+		case 'p':
+			cb.cb_verbose = B_TRUE;
+			cb.cb_parsable = B_TRUE;
+			break;
+		case 'n':
+			cb.cb_dryrun = B_TRUE;
+			break;
+		case 'd':
+			/* deferred destroy restricts the operand to snapshots */
+			cb.cb_defer_destroy = B_TRUE;
+			type = ZFS_TYPE_SNAPSHOT;
+			break;
+		case 'f':
+			cb.cb_force = B_TRUE;
+			break;
+		case 'r':
+			cb.cb_recurse = B_TRUE;
+			break;
+		case 'R':
+			cb.cb_recurse = B_TRUE;
+			cb.cb_doclones = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * check number of arguments
+	 * NOTE(review): the code below dereferences argv[0], so usage() is
+	 * presumably expected not to return -- confirm.
+	 */
+	if (argc == 0) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	at = strchr(argv[0], '@');
+	pound = strchr(argv[0], '#');
+	if (at != NULL) {
+
+		/* Build the list of snaps to destroy in cb_nvl. */
+		cb.cb_nvl = fnvlist_alloc();
+
+		/* Split "fs@spec" in place; argv[0] now names the fs/vol. */
+		*at = '\0';
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			nvlist_free(cb.cb_nvl);
+			return (1);
+		}
+
+		cb.cb_snapspec = at + 1;
+		/*
+		 * gather_snapshots() closes the handle it is given, so it
+		 * gets a duplicate and zhp stays valid for the out: path.
+		 */
+		if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
+		    cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+
+		if (nvlist_empty(cb.cb_nvl)) {
+			(void) fprintf(stderr, gettext("could not find any "
+			    "snapshots to destroy; check snapshot names.\n"));
+			rv = 1;
+			goto out;
+		}
+
+		if (cb.cb_verbose) {
+			char buf[16];
+			zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf));
+			if (cb.cb_parsable) {
+				(void) printf("reclaim\t%llu\n",
+				    (u_longlong_t)cb.cb_snapused);
+			} else if (cb.cb_dryrun) {
+				(void) printf(gettext("would reclaim %s\n"),
+				    buf);
+			} else {
+				(void) printf(gettext("will reclaim %s\n"),
+				    buf);
+			}
+		}
+
+		if (!cb.cb_dryrun) {
+			if (cb.cb_doclones) {
+				/* destroy clones, then flush queued snaps */
+				cb.cb_batchedsnaps = fnvlist_alloc();
+				err = destroy_clones(&cb);
+				if (err == 0) {
+					err = zfs_destroy_snaps_nvl(g_zfs,
+					    cb.cb_batchedsnaps, B_FALSE);
+				}
+				if (err != 0) {
+					rv = 1;
+					goto out;
+				}
+			}
+			/*
+			 * err is necessarily 0 here: a failure while
+			 * destroying clones has already jumped to out.
+			 */
+			if (err == 0) {
+				err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
+				    cb.cb_defer_destroy);
+			}
+		}
+
+		if (err != 0)
+			rv = 1;
+	} else if (pound != NULL) {
+		/* this err shadows the function-level err */
+		int err;
+		nvlist_t *nvl;
+
+		if (cb.cb_dryrun) {
+			(void) fprintf(stderr,
+			    "dryrun is not supported with bookmark\n");
+			return (-1);
+		}
+
+		if (cb.cb_defer_destroy) {
+			(void) fprintf(stderr,
+			    "defer destroy is not supported with bookmark\n");
+			return (-1);
+		}
+
+		if (cb.cb_recurse) {
+			(void) fprintf(stderr,
+			    "recursive is not supported with bookmark\n");
+			return (-1);
+		}
+
+		/*
+		 * Unfortunately, zfs_bookmark() doesn't honor the
+		 * casesensitivity setting.  However, we can't simply
+		 * remove this check, because lzc_destroy_bookmarks()
+		 * ignores non-existent bookmarks, so this is necessary
+		 * to get a proper error message.
+		 */
+		if (!zfs_bookmark_exists(argv[0])) {
+			(void) fprintf(stderr, gettext("bookmark '%s' "
+			    "does not exist.\n"), argv[0]);
+			return (1);
+		}
+
+		nvl = fnvlist_alloc();
+		fnvlist_add_boolean(nvl, argv[0]);
+
+		err = lzc_destroy_bookmarks(nvl, NULL);
+		if (err != 0) {
+			(void) zfs_standard_error(g_zfs, err,
+			    "cannot destroy bookmark");
+		}
+
+		nvlist_free(nvl);
+
+		/* the lzc_destroy_bookmarks() result is returned unmapped */
+		return (err);
+	} else {
+		/* Open the given dataset */
+		if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
+			return (1);
+
+		cb.cb_target = zhp;
+
+		/*
+		 * Perform an explicit check for pools before going any further.
+		 */
+		if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "operation does not apply to pools\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zfs destroy -r "
+			    "%s' to destroy all datasets in the pool\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+			    "to destroy the pool itself\n"), zfs_get_name(zhp));
+			rv = 1;
+			goto out;
+		}
+
+		/*
+		 * Check for any dependents and/or clones.
+		 */
+		cb.cb_first = B_TRUE;
+		if (!cb.cb_doclones &&
+		    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
+
+		/* set by destroy_check_dependent() for blocking dependents */
+		if (cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+		cb.cb_batchedsnaps = fnvlist_alloc();
+		if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
+
+		/*
+		 * Do the real thing.  The callback will close the
+		 * handle regardless of whether it succeeds or not.
+		 */
+		err = destroy_callback(zhp, &cb);
+		zhp = NULL;
+		if (err == 0) {
+			/* flush the snapshots still queued by the callback */
+			err = zfs_destroy_snaps_nvl(g_zfs,
+			    cb.cb_batchedsnaps, cb.cb_defer_destroy);
+		}
+		if (err != 0 || cb.cb_error == B_TRUE)
+			rv = 1;
+	}
+
+out:
+	/* either list may still be NULL when this label is reached */
+	fnvlist_free(cb.cb_batchedsnaps);
+	fnvlist_free(cb.cb_nvl);
+	if (zhp != NULL)
+		zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * Report whether the "received" column (GET_COL_RECVD) is among the columns
+ * selected in cbp->cb_columns.  The column array is scanned up to the first
+ * GET_COL_NONE entry or ZFS_GET_NCOLS entries, whichever comes first.
+ */
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+	int idx = 0;
+
+	while (idx < ZFS_GET_NCOLS) {
+		zfs_get_column_t col = cbp->cb_columns[idx];
+
+		if (col == GET_COL_NONE)
+			break;
+		if (col == GET_COL_RECVD)
+			return (B_TRUE);
+		idx++;
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * zfs get [-rHp] [-d max] [-o all | field[,field]...]
+ *	[-s source[,source]...] [-t type[,type]...]
+ *	< all | property[,property]... > < fs | snap | vol > ...
+ *
+ *	-r	recurse over any child datasets
+ *	-d	depth limit for the iteration over child datasets
+ *	-H	scripted mode.  Headers are stripped, and fields are separated
+ *		by tabs instead of spaces.
+ *	-o	Set of fields to display.  Any of "name,property,value,
+ *		received,source". Default is "name,property,value,source".
+ *		"all" is an alias for all five.
+ *	-s	Set of sources to allow.  Any of
+ *		"local,default,inherited,received,temporary,none".  Default is
+ *		all six.
+ *	-t	Set of dataset types to allow.  Any of
+ *		"filesystem,volume,snapshot,snap,bookmark,all".  Default is
+ *		"all".
+ *	-p	Display values in parsable (literal) format.
+ *
+ *  Prints properties for the given datasets.  The user can control which
+ *  columns to display as well as which property types to allow.
+ */
+
+/*
+ * Invoked to display the properties for a single dataset.
+ *
+ * Walks cbp->cb_proplist and prints one line per entry through
+ * zprop_print_one_property().  Four kinds of entries are handled, in this
+ * order: native properties (pl_prop != ZPROP_INVAL), names accepted by
+ * zfs_prop_userquota(), names accepted by zfs_prop_written() and, as the
+ * fallback, user properties looked up in the nvlist returned by
+ * zfs_get_user_props().  A value that cannot be retrieved is printed as "-"
+ * with source ZPROP_SRC_NONE.  Always returns 0.
+ */
+static int
+get_callback(zfs_handle_t *zhp, void *data)
+{
+	char buf[ZFS_MAXPROPLEN];
+	char rbuf[ZFS_MAXPROPLEN];
+	zprop_source_t sourcetype;
+	char source[ZFS_MAX_DATASET_NAME_LEN];
+	zprop_get_cbdata_t *cbp = data;
+	nvlist_t *user_props = zfs_get_user_props(zhp);
+	zprop_list_t *pl = cbp->cb_proplist;
+	nvlist_t *propval;
+	char *strval;
+	char *sourceval;
+	boolean_t received = is_recvd_column(cbp);
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		/* stays NULL unless a received value is found below */
+		char *recvdval = NULL;
+		/*
+		 * Skip the special fake placeholder.  This will also skip over
+		 * the name property when 'all' is specified.
+		 */
+		if (pl->pl_prop == ZFS_PROP_NAME &&
+		    pl == cbp->cb_proplist)
+			continue;
+
+		if (pl->pl_prop != ZPROP_INVAL) {
+			if (zfs_prop_get(zhp, pl->pl_prop, buf,
+			    sizeof (buf), &sourcetype, source,
+			    sizeof (source),
+			    cbp->cb_literal) != 0) {
+				/*
+				 * pl_all entries are skipped silently
+				 * (presumably those expanded from "all" --
+				 * confirm).
+				 */
+				if (pl->pl_all)
+					continue;
+				if (!zfs_prop_valid_for_type(pl->pl_prop,
+				    ZFS_TYPE_DATASET, B_FALSE)) {
+					(void) fprintf(stderr,
+					    gettext("No such property '%s'\n"),
+					    zfs_prop_to_name(pl->pl_prop));
+					continue;
+				}
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			/* only looked up when the received column is shown */
+			if (received && (zfs_prop_get_recvd(zhp,
+			    zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
+			    cbp->cb_literal) == 0))
+				recvdval = rbuf;
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    zfs_prop_to_name(pl->pl_prop),
+			    buf, sourcetype, source, recvdval);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			/*
+			 * NOTE(review): 'source' is not written on this path;
+			 * it is passed with a LOCAL or NONE source type.
+			 */
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source, NULL);
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			/*
+			 * NOTE(review): 'source' is not written on this path
+			 * either.
+			 */
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source, NULL);
+		} else {
+			/* user property: the value comes from user_props */
+			if (nvlist_lookup_nvlist(user_props,
+			    pl->pl_user_prop, &propval) != 0) {
+				if (pl->pl_all)
+					continue;
+				sourcetype = ZPROP_SRC_NONE;
+				strval = "-";
+			} else {
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_VALUE, &strval) == 0);
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_SOURCE, &sourceval) == 0);
+
+				/*
+				 * The source string is this dataset's own
+				 * name (local), the received marker, or
+				 * otherwise the origin of an inherited value.
+				 */
+				if (strcmp(sourceval,
+				    zfs_get_name(zhp)) == 0) {
+					sourcetype = ZPROP_SRC_LOCAL;
+				} else if (strcmp(sourceval,
+				    ZPROP_SOURCE_VAL_RECVD) == 0) {
+					sourcetype = ZPROP_SRC_RECEIVED;
+				} else {
+					sourcetype = ZPROP_SRC_INHERITED;
+					(void) strlcpy(source,
+					    sourceval, sizeof (source));
+				}
+			}
+
+			if (received && (zfs_prop_get_recvd(zhp,
+			    pl->pl_user_prop, rbuf, sizeof (rbuf),
+			    cbp->cb_literal) == 0))
+				recvdval = rbuf;
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, strval, sourcetype,
+			    source, recvdval);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Implements 'zfs get'.  Parses the options, builds the property list from
+ * the first operand and runs get_callback() through zfs_for_each() on the
+ * datasets named by the remaining operands.  Returns the zfs_for_each()
+ * result.
+ */
+static int
+zfs_do_get(int argc, char **argv)
+{
+	zprop_get_cbdata_t cb = { 0 };
+	int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+	int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK;
+	char *value, *fields;
+	int ret = 0;
+	int limit = 0;
+	zprop_list_t fake_name = { 0 };
+
+	/*
+	 * Set up default columns and sources.
+	 */
+	cb.cb_sources = ZPROP_SRC_ALL;
+	cb.cb_columns[0] = GET_COL_NAME;
+	cb.cb_columns[1] = GET_COL_PROPERTY;
+	cb.cb_columns[2] = GET_COL_VALUE;
+	cb.cb_columns[3] = GET_COL_SOURCE;
+	cb.cb_type = ZFS_TYPE_DATASET;
+
+	/*
+	 * check options
+	 * The leading ':' in the option string makes getopt() return ':'
+	 * for a missing option argument, which is handled below.
+	 */
+	while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
+		switch (c) {
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case 'o':
+			/*
+			 * Process the set of columns to display.  We zero out
+			 * the structure to give us a blank slate.
+			 */
+			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+			i = 0;
+			while (*optarg != '\0') {
+				/* indices match the case labels below */
+				static char *col_subopts[] =
+				    { "name", "property", "value", "received",
+				    "source", "all", NULL };
+
+				if (i == ZFS_GET_NCOLS) {
+					(void) fprintf(stderr, gettext("too "
+					    "many fields given to -o "
+					    "option\n"));
+					usage(B_FALSE);
+				}
+
+				switch (getsubopt(&optarg, col_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_columns[i++] = GET_COL_NAME;
+					break;
+				case 1:
+					cb.cb_columns[i++] = GET_COL_PROPERTY;
+					break;
+				case 2:
+					cb.cb_columns[i++] = GET_COL_VALUE;
+					break;
+				case 3:
+					cb.cb_columns[i++] = GET_COL_RECVD;
+					flags |= ZFS_ITER_RECVD_PROPS;
+					break;
+				case 4:
+					cb.cb_columns[i++] = GET_COL_SOURCE;
+					break;
+				case 5:
+					/* "all" must be the only field */
+					if (i > 0) {
+						(void) fprintf(stderr,
+						    gettext("\"all\" conflicts "
+						    "with specific fields "
+						    "given to -o option\n"));
+						usage(B_FALSE);
+					}
+					cb.cb_columns[0] = GET_COL_NAME;
+					cb.cb_columns[1] = GET_COL_PROPERTY;
+					cb.cb_columns[2] = GET_COL_VALUE;
+					cb.cb_columns[3] = GET_COL_RECVD;
+					cb.cb_columns[4] = GET_COL_SOURCE;
+					flags |= ZFS_ITER_RECVD_PROPS;
+					/* any further field is "too many" */
+					i = ZFS_GET_NCOLS;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid column name "
+					    "'%s'\n"), value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case 's':
+			/* an explicit -s replaces the default source mask */
+			cb.cb_sources = 0;
+			while (*optarg != '\0') {
+				static char *source_subopts[] = {
+					"local", "default", "inherited",
+					"received", "temporary", "none",
+					NULL };
+
+				switch (getsubopt(&optarg, source_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_sources |= ZPROP_SRC_LOCAL;
+					break;
+				case 1:
+					cb.cb_sources |= ZPROP_SRC_DEFAULT;
+					break;
+				case 2:
+					cb.cb_sources |= ZPROP_SRC_INHERITED;
+					break;
+				case 3:
+					cb.cb_sources |= ZPROP_SRC_RECEIVED;
+					break;
+				case 4:
+					cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+					break;
+				case 5:
+					cb.cb_sources |= ZPROP_SRC_NONE;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid source "
+					    "'%s'\n"), value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case 't':
+			/* an explicit -t replaces the default type mask */
+			types = 0;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "snap", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:
+				case 3:
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 4:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 5:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* the first operand is the property list */
+	fields = argv[0];
+
+	/*
+	 * Handle users who want to get all snapshots or bookmarks
+	 * of a dataset (ex. 'zfs get -t snapshot refer <dataset>').
+	 */
+	if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) &&
+	    argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) {
+		flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE);
+		limit = 1;
+	}
+
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	/* the remaining operands name the datasets */
+	argc--;
+	argv++;
+
+	/*
+	 * As part of zfs_expand_proplist(), we keep track of the maximum column
+	 * width for each property.  For the 'NAME' (and 'SOURCE') columns, we
+	 * need to know the maximum name length.  However, the user likely did
+	 * not specify 'name' as one of the properties to fetch, so we need to
+	 * make sure we always include at least this property for
+	 * print_get_headers() to work properly.
+	 */
+	if (cb.cb_proplist != NULL) {
+		fake_name.pl_prop = ZFS_PROP_NAME;
+		fake_name.pl_width = strlen(gettext("NAME"));
+		fake_name.pl_next = cb.cb_proplist;
+		cb.cb_proplist = &fake_name;
+	}
+
+	cb.cb_first = B_TRUE;
+
+	/* run for each object */
+	ret = zfs_for_each(argc, argv, flags, types, NULL,
+	    &cb.cb_proplist, limit, get_callback, &cb);
+
+	/* fake_name is on the stack; only the list behind it is freed */
+	if (cb.cb_proplist == &fake_name)
+		zprop_free_list(fake_name.pl_next);
+	else
+		zprop_free_list(cb.cb_proplist);
+
+	return (ret);
+}
+
+/*
+ * inherit [-rS] <property> <fs|vol> ...
+ *
+ *	-r	Recurse over all children
+ *	-S	Revert to received value, if any
+ *
+ * For each dataset specified on the command line, inherit the given property
+ * from its parent.  Inheriting a property at the pool level will cause it to
+ * use the default value.  The '-r' flag will recurse over all children, and is
+ * useful for setting a property on a hierarchy-wide basis, regardless of any
+ * local modifications for each dataset.
+ */
+
+/*
+ * Arguments handed to the 'zfs inherit' callbacks.
+ */
+typedef struct inherit_cbdata {
+	const char *cb_propname;	/* property named on the command line */
+	boolean_t cb_received;		/* -S: revert to the received value */
+} inherit_cbdata_t;
+
+/*
+ * Callback used by 'zfs inherit -r'.  A native property that is not valid
+ * for the type of 'zhp' is silently skipped (returns 0); otherwise the
+ * property is inherited.  Returns 1 when zfs_prop_inherit() fails, else 0.
+ */
+static int
+inherit_recurse_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+	zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
+	boolean_t applicable;
+
+	/* ZPROP_INVAL means the name is not a native property. */
+	applicable = (prop == ZPROP_INVAL ||
+	    zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE));
+	if (!applicable)
+		return (0);
+
+	if (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0)
+		return (1);
+	return (0);
+}
+
+/*
+ * Callback used by non-recursive 'zfs inherit': inherit the property on
+ * 'zhp'.  Returns 1 when zfs_prop_inherit() fails, else 0.
+ */
+static int
+inherit_cb(zfs_handle_t *zhp, void *data)
+{
+	inherit_cbdata_t *cb = data;
+	int rc;
+
+	rc = zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received);
+	return (rc == 0 ? 0 : 1);
+}
+
+/*
+ * Implements 'zfs inherit'.  Validates the property named by the first
+ * operand and then applies zfs_prop_inherit() to every dataset named by the
+ * remaining operands through zfs_for_each().  Returns 1 when the property is
+ * rejected, otherwise the zfs_for_each() result.
+ */
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+	int c;
+	zfs_prop_t prop;
+	inherit_cbdata_t cb = { 0 };
+	char *propname;
+	int ret = 0;
+	int flags = 0;
+	boolean_t received = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rS")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'S':
+			received = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* first operand is the property; the rest are datasets */
+	propname = argv[0];
+	argc--;
+	argv++;
+
+	if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+		/* native property: reject what cannot be inherited */
+		if (zfs_prop_readonly(prop)) {
+			(void) fprintf(stderr, gettext(
+			    "%s property is read-only\n"),
+			    propname);
+			return (1);
+		}
+		/* -S bypasses the inheritable check */
+		if (!zfs_prop_inheritable(prop) && !received) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be inherited\n"), propname);
+			/* these four get a hint on how to clear them */
+			if (prop == ZFS_PROP_QUOTA ||
+			    prop == ZFS_PROP_RESERVATION ||
+			    prop == ZFS_PROP_REFQUOTA ||
+			    prop == ZFS_PROP_REFRESERVATION) {
+				(void) fprintf(stderr, gettext("use 'zfs set "
+				    "%s=none' to clear\n"), propname);
+				(void) fprintf(stderr, gettext("use 'zfs "
+				    "inherit -S %s' to revert to received "
+				    "value\n"), propname);
+			}
+			return (1);
+		}
+		if (received && (prop == ZFS_PROP_VOLSIZE ||
+		    prop == ZFS_PROP_VERSION)) {
+			(void) fprintf(stderr, gettext("'%s' property cannot "
+			    "be reverted to a received value\n"), propname);
+			return (1);
+		}
+	} else if (!zfs_prop_user(propname)) {
+		/* neither a native property nor a user property name */
+		(void) fprintf(stderr, gettext("invalid property '%s'\n"),
+		    propname);
+		usage(B_FALSE);
+	}
+
+	cb.cb_propname = propname;
+	cb.cb_received = received;
+
+	/*
+	 * The recursive callback additionally skips datasets whose type the
+	 * property is not valid for.
+	 */
+	if (flags & ZFS_ITER_RECURSE) {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_recurse_cb, &cb);
+	} else {
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+		    NULL, NULL, 0, inherit_cb, &cb);
+	}
+
+	return (ret);
+}
+
+/*
+ * State shared by the 'zfs upgrade' callbacks.
+ */
+typedef struct upgrade_cbdata {
+	uint64_t cb_numupgraded;	/* "version" was set successfully */
+	uint64_t cb_numsamegraded;	/* already at the requested version */
+	uint64_t cb_numfailed;		/* could not be up- or downgraded */
+	uint64_t cb_version;		/* requested version (-V) */
+	boolean_t cb_newer;		/* list newer, not older, versions */
+	boolean_t cb_foundone;		/* listing header already printed */
+	/* name of the last filesystem an upgrade was attempted on */
+	char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
+} upgrade_cbdata_t;
+
+/*
+ * Return non-zero when 'name' and the dataset behind 'zhp' share the same
+ * leading component, i.e. the text up to the first '/' or '@' is identical
+ * in both names.
+ */
+static int
+same_pool(zfs_handle_t *zhp, const char *name)
+{
+	const char *zhname = zfs_get_name(zhp);
+	int name_len = strcspn(name, "/@");
+	int zh_len = strcspn(zhname, "/@");
+
+	return (name_len == zh_len && strncmp(name, zhname, name_len) == 0);
+}
+
+/*
+ * Dataset callback that lists filesystems by version.  A filesystem is
+ * printed when its version is older than ZPL_VERSION or, if cb->cb_newer is
+ * set, newer than ZPL_VERSION.  An explanatory banner and the column headers
+ * are printed once, before the first filesystem listed.  Always returns 0.
+ */
+static int
+upgrade_list_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+	char *banner;
+
+	/* Filesystems on the uninteresting side of ZPL_VERSION are skipped. */
+	if (cb->cb_newer ? !(version > ZPL_VERSION) :
+	    !(version < ZPL_VERSION))
+		return (0);
+
+	banner = cb->cb_newer ?
+	    gettext("The following filesystems are "
+	    "formatted using a newer software version and\n"
+	    "cannot be accessed on the current system.\n\n") :
+	    gettext("The following filesystems are "
+	    "out of date, and can be upgraded.  After being\n"
+	    "upgraded, these filesystems (and any 'zfs send' "
+	    "streams generated from\n"
+	    "subsequent snapshots) will no longer be "
+	    "accessible by older software versions.\n\n");
+
+	if (!cb->cb_foundone) {
+		(void) puts(banner);
+		(void) printf(gettext("VER  FILESYSTEM\n"));
+		(void) printf(gettext("---  ------------\n"));
+		cb->cb_foundone = B_TRUE;
+	}
+
+	(void) printf("%2u   %s\n", version, zfs_get_name(zhp));
+
+	return (0);
+}
+
+/*
+ * Dataset callback that moves one filesystem to cb->cb_version.
+ *
+ * The filesystem is upgraded by setting its "version" property when its
+ * current version is lower than the requested one and the pool version is
+ * at least the one zfs_spa_version_map() reports as required.  The outcome
+ * is tallied in cb->cb_numupgraded, cb->cb_numsamegraded or
+ * cb->cb_numfailed.
+ *
+ * Returns -1 when the pool version or the required pool version cannot be
+ * determined, and 0 otherwise (including for filesystems that could not be
+ * upgraded).
+ */
+static int
+upgrade_set_callback(zfs_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cb = data;
+	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+	int needed_spa_version;
+	int spa_version;
+
+	if (zfs_spa_version(zhp, &spa_version) < 0)
+		return (-1);
+
+	needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+	if (needed_spa_version < 0)
+		return (-1);
+
+	if (spa_version < needed_spa_version) {
+		/* can't upgrade */
+		(void) printf(gettext("%s: can not be "
+		    "upgraded; the pool version needs to first "
+		    "be upgraded\nto version %d\n\n"),
+		    zfs_get_name(zhp), needed_spa_version);
+		cb->cb_numfailed++;
+		return (0);
+	}
+
+	/* upgrade */
+	if (version < cb->cb_version) {
+		char verstr[16];
+		(void) snprintf(verstr, sizeof (verstr),
+		    "%llu", (u_longlong_t)cb->cb_version);
+		if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
+			/*
+			 * If they did "zfs upgrade -a", then we could
+			 * be doing ioctls to different pools.  We need
+			 * to log this history once to each pool, and bypass
+			 * the normal history logging that happens in main().
+			 */
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
+		}
+		if (zfs_prop_set(zhp, "version", verstr) == 0)
+			cb->cb_numupgraded++;
+		else
+			cb->cb_numfailed++;
+		/*
+		 * Remember the filesystem for the same_pool() check above.
+		 * The copy is bounded so that it can never overrun the
+		 * fixed-size cb_lastfs buffer.
+		 */
+		(void) strlcpy(cb->cb_lastfs, zfs_get_name(zhp),
+		    sizeof (cb->cb_lastfs));
+	} else if (version > cb->cb_version) {
+		/* can't downgrade */
+		(void) printf(gettext("%s: can not be downgraded; "
+		    "it is already at version %u\n"),
+		    zfs_get_name(zhp), version);
+		cb->cb_numfailed++;
+	} else {
+		cb->cb_numsamegraded++;
+	}
+	return (0);
+}
+
+/*
+ * zfs upgrade
+ * zfs upgrade -v
+ * zfs upgrade [-r] [-V <version>] <-a | filesystem>
+ *
+ * With no arguments, lists the filesystems that are not at the current
+ * version.  Returns 0 on success and a non-zero value otherwise.
+ */
+static int
+zfs_do_upgrade(int argc, char **argv)
+{
+	boolean_t all = B_FALSE, showversions = B_FALSE;
+	int c, ret = 0;
+	upgrade_cbdata_t cb = { 0 };
+	int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	while ((c = getopt(argc, argv, "rvV:a")) != -1) {
+		switch (c) {
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'v':
+			showversions = B_TRUE;
+			break;
+		case 'V':
+			if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
+			    optarg, &cb.cb_version) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid version %s\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'a':
+			all = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) || cb.cb_version))
+		usage(B_FALSE);
+	if (showversions && (flags & ZFS_ITER_RECURSE || all ||
+	    cb.cb_version || argc))
+		usage(B_FALSE);
+	if (all && argc)
+		usage(B_FALSE);
+
+	if (showversions) {
+		/* Show info on available versions. */
+		(void) printf(gettext("The following filesystem versions are "
+		    "supported:\n\n"));
+		(void) printf(gettext("VER  DESCRIPTION\n"));
+		(void) printf("---  -----------------------------------------"
+		    "---------------\n");
+		(void) printf(gettext(" 1   Initial ZFS filesystem version\n"));
+		(void) printf(gettext(" 2   Enhanced directory entries\n"));
+		(void) printf(gettext(" 3   Case insensitive and filesystem "
+		    "user identifier (FUID)\n"));
+		(void) printf(gettext(" 4   userquota, groupquota "
+		    "properties\n"));
+		(void) printf(gettext(" 5   System attributes\n"));
+		(void) printf(gettext("\nFor more information on a particular "
+		    "version, including supported releases,\n"));
+		(void) printf("see the ZFS Administration Guide.\n\n");
+	} else if (argc || all) {
+		/* Upgrade filesystems */
+		if (cb.cb_version == 0)
+			cb.cb_version = ZPL_VERSION;
+		ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_set_callback, &cb);
+		(void) printf(gettext("%llu filesystems upgraded\n"),
+		    (u_longlong_t)cb.cb_numupgraded);
+		if (cb.cb_numsamegraded) {
+			(void) printf(gettext("%llu filesystems already at "
+			    "this version\n"),
+			    (u_longlong_t)cb.cb_numsamegraded);
+		}
+		if (cb.cb_numfailed != 0)
+			ret = 1;
+	} else {
+		/* List old-version filesystems */
+		boolean_t found;
+		int rc;
+		(void) printf(gettext("This system is currently running "
+		    "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
+
+		flags |= ZFS_ITER_RECURSE;
+		ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+		found = cb.cb_foundone;
+		cb.cb_foundone = B_FALSE;
+		cb.cb_newer = B_TRUE;
+
+		/* Second (newer-version) pass; keep a first-pass failure */
+		rc = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+		    NULL, NULL, 0, upgrade_list_callback, &cb);
+		if (ret == 0)
+			ret = rc;
+
+		if (!cb.cb_foundone && !found) {
+			(void) printf(gettext("All filesystems are "
+			    "formatted with the current version.\n"));
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *               [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *                [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...]
+ *                [-S field [-S field]...] filesystem | snapshot
+ *
+ *	-H      Scripted mode; elide headers and separate columns by tabs.
+ *	-i	Translate SID to POSIX ID.
+ *	-n	Print numeric ID instead of user/group name.
+ *	-o      Control which fields to display.
+ *	-p	Use exact (parsable) numeric output.
+ *	-s      Specify sort columns, ascending order.
+ *	-S      Specify sort columns, descending order.
+ *	-t      Control which object types to display.
+ *
+ *	Displays space consumed by, and quotas on, each user in the specified
+ *	filesystem or snapshot.
+ */
+
+/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
+enum us_field_types {
+	USFIELD_TYPE,		/* index 0 in us_field_hdr/us_field_names */
+	USFIELD_NAME,
+	USFIELD_USED,
+	USFIELD_QUOTA,
+	USFIELD_OBJUSED,
+	USFIELD_OBJQUOTA
+};
+static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA",
+				    "OBJUSED", "OBJQUOTA" }; /* header text */
+static char *us_field_names[] = { "type", "name", "used", "quota",
+				    "objused", "objquota" }; /* -o names */
+#define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (char *))
+
+#define	USTYPE_PSX_GRP	(1 << 0)	/* POSIX group */
+#define	USTYPE_PSX_USR	(1 << 1)	/* POSIX user */
+#define	USTYPE_SMB_GRP	(1 << 2)	/* SMB group */
+#define	USTYPE_SMB_USR	(1 << 3)	/* SMB user */
+#define	USTYPE_PROJ	(1 << 4)	/* project */
+#define	USTYPE_ALL	\
+	(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \
+	    USTYPE_PROJ)
+
+static int us_type_bits[] = {	/* parallel to us_type_names[] */
+	USTYPE_PSX_GRP,
+	USTYPE_PSX_USR,
+	USTYPE_SMB_GRP,
+	USTYPE_SMB_USR,
+	USTYPE_ALL
+};
+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
+	"smbuser", "all" };	/* values accepted by -t */
+
+typedef struct us_node {
+	nvlist_t	*usn_nvl;	/* "type", "name" and value pairs */
+	uu_avl_node_t	usn_avlnode;	/* linkage in the sorted AVL tree */
+	uu_list_node_t	usn_listnode;	/* linkage in the temporary list */
+} us_node_t;
+
+typedef struct us_cbdata {
+	nvlist_t	**cb_nvlp;
+	uu_avl_pool_t	*cb_avl_pool;	/* pool new nodes are initialised in */
+	uu_avl_t	*cb_avl;	/* tree of us_node_t entries */
+	boolean_t	cb_numname;	/* -n: numeric IDs instead of names */
+	boolean_t	cb_nicenum;	/* human-readable sizes (no -p) */
+	boolean_t	cb_sid2posix;	/* -i: translate SID to POSIX ID */
+	zfs_userquota_prop_t cb_prop;	/* property being walked */
+	zfs_sort_column_t *cb_sortcol;	/* sort columns, in order */
+	size_t		cb_width[USFIELD_LAST];	/* widest text per column */
+} us_cbdata_t;
+
+static boolean_t us_populated = B_FALSE;	/* set once all data is in */
+
+typedef struct {
+	zfs_sort_column_t *si_sortcol;	/* sort columns for us_compare() */
+	boolean_t	si_numname;	/* compare names numerically */
+} us_sort_info_t;
+
+static int
+us_field_index(char *field)
+{
+	int idx = 0;
+
+	/* Return the position of field in us_field_names[], or -1 if absent */
+	while (idx < USFIELD_LAST && strcmp(field, us_field_names[idx]) != 0)
+		idx++;
+
+	/* idx ran off the end of the table when nothing matched */
+	return (idx < USFIELD_LAST ? idx : -1);
+}
+
+static int	/* <0, 0 or >0: ordering of two us_node_t entries */
+us_compare(const void *larg, const void *rarg, void *unused)
+{
+	const us_node_t *l = larg;
+	const us_node_t *r = rarg;
+	us_sort_info_t *si = (us_sort_info_t *)unused;	/* really used */
+	zfs_sort_column_t *sortcol = si->si_sortcol;
+	boolean_t numname = si->si_numname;
+	nvlist_t *lnvl = l->usn_nvl;
+	nvlist_t *rnvl = r->usn_nvl;
+	int rc = 0;
+	boolean_t lvb, rvb;
+
+	for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+		char *lvstr = "";
+		char *rvstr = "";
+		uint32_t lv32 = 0;
+		uint32_t rv32 = 0;
+		uint64_t lv64 = 0;
+		uint64_t rv64 = 0;
+		zfs_prop_t prop = sortcol->sc_prop;
+		const char *propname = NULL;
+		boolean_t reverse = sortcol->sc_reverse;
+
+		switch (prop) {
+		case ZFS_PROP_TYPE:
+			propname = "type";
+			(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+			(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+			if (rv32 != lv32)
+				rc = (rv32 < lv32) ? 1 : -1;	/* ascending */
+			break;
+		case ZFS_PROP_NAME:
+			propname = "name";
+			if (numname) {
+compare_nums:	/* also entered by goto when a name is not a string */
+				(void) nvlist_lookup_uint64(lnvl, propname,
+				    &lv64);
+				(void) nvlist_lookup_uint64(rnvl, propname,
+				    &rv64);
+				if (rv64 != lv64)
+					rc = (rv64 < lv64) ? 1 : -1;
+			} else {
+				if ((nvlist_lookup_string(lnvl, propname,
+				    &lvstr) == ENOENT) ||
+				    (nvlist_lookup_string(rnvl, propname,
+				    &rvstr) == ENOENT)) {
+					goto compare_nums; /* not a string */
+				}
+				rc = strcmp(lvstr, rvstr);
+			}
+			break;
+		case ZFS_PROP_USED:
+		case ZFS_PROP_QUOTA:
+			if (!us_populated)	/* tree still being filled */
+				break;
+			if (prop == ZFS_PROP_USED)
+				propname = "used";
+			else
+				propname = "quota";
+			(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+			(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+			if (rv64 != lv64)
+				rc = (rv64 < lv64) ? 1 : -1;
+			break;
+
+		default:	/* column has no effect on ordering */
+			break;
+		}
+
+		if (rc != 0) {	/* decided; apply sort direction */
+			if (rc < 0)
+				return (reverse ? 1 : -1);
+			else
+				return (reverse ? -1 : 1);
+		}
+	}
+
+	/*
+	 * If entries still seem to be the same, check if they are of the same
+	 * type (smbentity is added only if we are doing SID to POSIX ID
+	 * translation where we can have duplicate type/name combinations).
+	 */
+	if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
+	    nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
+	    lvb != rvb)
+		return (lvb < rvb ? -1 : 1);
+
+	return (0);	/* equal under every sort column */
+}
+
+static boolean_t
+zfs_prop_is_user(unsigned p)
+{	/* true for the four per-user accounting properties */
+	return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USEROBJUSED ||
+	    p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJQUOTA);
+}
+
+static boolean_t
+zfs_prop_is_group(unsigned p)
+{	/* true for the four per-group accounting properties */
+	return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPOBJUSED ||
+	    p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJQUOTA);
+}
+
+static boolean_t
+zfs_prop_is_project(unsigned p)
+{	/* true for the four per-project accounting properties */
+	return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTOBJUSED ||
+	    p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJQUOTA);
+}
+
+static inline const char *
+us_type2str(unsigned field_type)
+{
+	/*
+	 * Translate a single USTYPE_* bit into its display name.
+	 */
+	if (field_type == USTYPE_PSX_USR)
+		return ("POSIX User");
+	if (field_type == USTYPE_PSX_GRP)
+		return ("POSIX Group");
+	if (field_type == USTYPE_SMB_USR)
+		return ("SMB User");
+	if (field_type == USTYPE_SMB_GRP)
+		return ("SMB Group");
+	if (field_type == USTYPE_PROJ)
+		return ("Project");
+	return ("Undefined");	/* zero or a combination of bits */
+}
+
+/*
+ * zfs_userspace() callback: record one (domain, rid, space) sample for the
+ * property in cb->cb_prop.  Samples for the same type/name pair share one
+ * AVL node, and cb->cb_width is grown to fit the text that will be printed.
+ * Returns 0 on success and -1 if the entry cannot be handled.
+ */
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+	us_cbdata_t *cb = (us_cbdata_t *)arg;
+	zfs_userquota_prop_t prop = cb->cb_prop;
+	char *name = NULL, *propname;
+	char sizebuf[32];
+#ifdef HAVE_IDMAP
+	char sid[MAXNAMELEN + 32];	/* function scope: name may alias it */
+#endif
+	us_node_t *node, *n;
+	uu_avl_t *avl = cb->cb_avl;
+	uu_avl_index_t idx;
+	nvlist_t *props;
+	unsigned type = 0;
+	size_t namelen, typelen, sizelen;
+	int typeidx, nameidx, sizeidx;
+	us_sort_info_t sortinfo = { cb->cb_sortcol, cb->cb_numname };
+	boolean_t smbentity = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	node = safe_malloc(sizeof (us_node_t));
+	uu_avl_node_init(node, &node->usn_avlnode, cb->cb_avl_pool);
+	node->usn_nvl = props;
+
+	if (domain != NULL && domain[0] != '\0') {
+#ifdef HAVE_IDMAP
+		/* SMB */
+		uid_t id;
+		uint64_t classes;
+		int err;
+		directory_error_t e;
+
+		smbentity = B_TRUE;
+
+		(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+
+		/* All group props, including the object-count ones */
+		if (zfs_prop_is_group(prop)) {
+			type = USTYPE_SMB_GRP;
+			err = sid_to_id(sid, B_FALSE, &id);
+		} else {
+			type = USTYPE_SMB_USR;
+			err = sid_to_id(sid, B_TRUE, &id);
+		}
+
+		if (err == 0) {
+			rid = id;
+			if (!cb->cb_sid2posix) {
+				e = directory_name_from_sid(NULL, sid, &name,
+				    &classes);
+				if (e != NULL)
+					directory_error_free(e);
+				if (name == NULL)
+					name = sid;
+			}
+		}
+#else
+		nvlist_free(props);
+		free(node);
+		return (-1);
+#endif /* HAVE_IDMAP */
+	}
+
+	if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
+		/* POSIX or -i */
+		if (zfs_prop_is_group(prop)) {
+			type = USTYPE_PSX_GRP;
+			if (!cb->cb_numname) {
+				struct group *g;
+
+				if ((g = getgrgid(rid)) != NULL)
+					name = g->gr_name;
+			}
+		} else if (zfs_prop_is_user(prop)) {
+			type = USTYPE_PSX_USR;
+			if (!cb->cb_numname) {
+				struct passwd *p;
+
+				if ((p = getpwuid(rid)) != NULL)
+					name = p->pw_name;
+			}
+		} else {
+			type = USTYPE_PROJ;
+		}
+	}
+
+	/*
+	 * Make sure that the type/name combination is unique when doing
+	 * SID to POSIX ID translation (hence changing the type from SMB to
+	 * POSIX).
+	 */
+	if (cb->cb_sid2posix &&
+	    nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
+		nomem();
+
+	/* Calculate/update width of TYPE field */
+	typelen = strlen(gettext(us_type2str(type)));
+	typeidx = us_field_index("type");
+	if (typeidx >= 0 && typelen > cb->cb_width[typeidx])
+		cb->cb_width[typeidx] = typelen;
+	if (nvlist_add_uint32(props, "type", type) != 0)
+		nomem();
+
+	/* Calculate/update width of NAME field */
+	if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
+		if (nvlist_add_uint64(props, "name", rid) != 0)
+			nomem();
+		namelen = snprintf(NULL, 0, "%u", rid);
+	} else {
+		if (nvlist_add_string(props, "name", name) != 0)
+			nomem();
+		namelen = strlen(name);
+	}
+	nameidx = us_field_index("name");
+	if (nameidx >= 0 && namelen > cb->cb_width[nameidx])
+		cb->cb_width[nameidx] = namelen;
+
+	/*
+	 * Check if this type/name combination is in the list and update it;
+	 * otherwise add new node to the list.
+	 */
+	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
+		uu_avl_insert(avl, node, idx);
+	} else {
+		nvlist_free(props);
+		free(node);
+		node = n;
+		props = node->usn_nvl;
+	}
+
+	/* Calculate/update width of USED/QUOTA fields */
+	if (cb->cb_nicenum) {
+		if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+		    prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+		    prop == ZFS_PROP_PROJECTUSED ||
+		    prop == ZFS_PROP_PROJECTQUOTA) {
+			zfs_nicebytes(space, sizebuf, sizeof (sizebuf));
+		} else {
+			zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+		}
+	} else {
+		(void) snprintf(sizebuf, sizeof (sizebuf), "%llu",
+		    (u_longlong_t)space);
+	}
+	sizelen = strlen(sizebuf);
+	if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+	    prop == ZFS_PROP_PROJECTUSED) {
+		propname = "used";
+		if (!nvlist_exists(props, "quota"))
+			(void) nvlist_add_uint64(props, "quota", 0);
+	} else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+	    prop == ZFS_PROP_PROJECTQUOTA) {
+		propname = "quota";
+		if (!nvlist_exists(props, "used"))
+			(void) nvlist_add_uint64(props, "used", 0);
+	} else if (prop == ZFS_PROP_USEROBJUSED ||
+	    prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) {
+		propname = "objused";
+		if (!nvlist_exists(props, "objquota"))
+			(void) nvlist_add_uint64(props, "objquota", 0);
+	} else if (prop == ZFS_PROP_USEROBJQUOTA ||
+	    prop == ZFS_PROP_GROUPOBJQUOTA ||
+	    prop == ZFS_PROP_PROJECTOBJQUOTA) {
+		propname = "objquota";
+		if (!nvlist_exists(props, "objused"))
+			(void) nvlist_add_uint64(props, "objused", 0);
+	} else {
+		return (-1);
+	}
+	sizeidx = us_field_index(propname);
+	if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx])
+		cb->cb_width[sizeidx] = sizelen;
+
+	if (nvlist_add_uint64(props, propname, space) != 0)
+		nomem();
+
+	return (0);
+}
+
+/*
+ * Print one row of userspace/groupspace/projectspace output for 'node'.
+ * Rows whose entity type is not selected by the 'types' mask are skipped.
+ * 'fields' is the USFIELD_LAST-terminated list of columns to print and
+ * 'width' the per-column width, indexed by USFIELD_* value.
+ */
+static void
+print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, us_node_t *node)
+{
+	nvlist_t *nvl = node->usn_nvl;
+	char valstr[MAXNAMELEN];
+	boolean_t first = B_TRUE;
+	int cfield = 0;
+	int field;
+	uint32_t ustype = 0;	/* stays 0 (nothing selected) if lookup fails */
+
+	/* Check type */
+	(void) nvlist_lookup_uint32(nvl, "type", &ustype);
+	if (!(ustype & types))
+		return;
+
+	while ((field = fields[cfield]) != USFIELD_LAST) {
+		nvpair_t *nvp = NULL;
+		data_type_t type;
+		uint32_t val32;
+		uint64_t val64;
+		char *strval = "-";
+
+		/* Locate the nvpair backing this column, if any */
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+			if (strcmp(nvpair_name(nvp),
+			    us_field_names[field]) == 0)
+				break;
+		}
+
+		type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp);
+		switch (type) {
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(nvp, &val32);
+			break;
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &val64);
+			break;
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &strval);
+			break;
+		case DATA_TYPE_UNKNOWN:
+			break;
+		default:
+			(void) fprintf(stderr, "invalid data type\n");
+		}
+
+		/* Render the value as text; strval stays "-" when absent */
+		switch (field) {
+		case USFIELD_TYPE:
+			if (type == DATA_TYPE_UINT32)
+				strval = (char *)us_type2str(val32);
+			break;
+		case USFIELD_NAME:
+			if (type == DATA_TYPE_UINT64) {
+				(void) snprintf(valstr, sizeof (valstr),
+				    "%llu", (u_longlong_t)val64);
+				strval = valstr;
+			}
+			break;
+		case USFIELD_USED:
+		case USFIELD_QUOTA:
+		case USFIELD_OBJUSED:
+		case USFIELD_OBJQUOTA:
+			if (type != DATA_TYPE_UINT64)
+				break;
+			if (parsable) {
+				/* exact value */
+				(void) snprintf(valstr, sizeof (valstr),
+				    "%llu", (u_longlong_t)val64);
+				strval = valstr;
+			} else if (val64 == 0 && (field == USFIELD_QUOTA ||
+			    field == USFIELD_OBJQUOTA)) {
+				/* a zero quota is shown as "none" */
+				strval = "none";
+			} else if (field == USFIELD_USED ||
+			    field == USFIELD_QUOTA) {
+				/* byte counts */
+				zfs_nicebytes(val64, valstr, sizeof (valstr));
+				strval = valstr;
+			} else {
+				/* object counts */
+				zfs_nicenum(val64, valstr, sizeof (valstr));
+				strval = valstr;
+			}
+			break;
+		}
+
+		if (!first) {
+			if (scripted)
+				(void) printf("\t");
+			else
+				(void) printf("  ");
+		}
+		if (scripted)
+			(void) printf("%s", strval);
+		else if (field == USFIELD_TYPE || field == USFIELD_NAME)
+			(void) printf("%-*s", (int)width[field], strval);
+		else
+			(void) printf("%*s", (int)width[field], strval);
+
+		first = B_FALSE;
+		cfield++;
+	}
+
+	(void) printf("\n");
+}
+
+static void
+print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, boolean_t rmnode, uu_avl_t *avl)
+{
+	us_node_t *cur;
+	int i;
+
+	/* Column headers, omitted in scripted mode */
+	for (i = 0; !scripted && fields[i] != USFIELD_LAST; i++) {
+		int field = fields[i];
+		const char *col = gettext(us_field_hdr[field]);
+
+		/* TYPE and NAME are left-justified, others right-justified */
+		if (field == USFIELD_TYPE || field == USFIELD_NAME) {
+			(void) printf(i == 0 ? "%-*s" : "  %-*s",
+			    (int)width[field], col);
+		} else {
+			(void) printf(i == 0 ? "%*s" : "  %*s",
+			    (int)width[field], col);
+		}
+	}
+	if (!scripted)
+		(void) printf("\n");
+
+	/* One row per node; rmnode frees each node's nvlist afterwards */
+	cur = uu_avl_first(avl);
+	while (cur != NULL) {
+		print_us_node(scripted, parsable, fields, types, width, cur);
+		if (rmnode)
+			nvlist_free(cur->usn_nvl);
+		cur = uu_avl_next(avl, cur);
+	}
+}
+
+static int	/* zfs userspace | groupspace | projectspace */
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+	uu_avl_pool_t *avl_pool;
+	uu_avl_t *avl_tree;
+	uu_avl_walk_t *walk;
+	char *delim;
+	char deffields[] = "type,name,used,quota,objused,objquota";
+	char *ofield = NULL, *tfield = NULL;
+	int fields[256];	/* selected columns, USFIELD_LAST-terminated */
+	int i, cfield = 0;
+	boolean_t scripted = B_FALSE, prtnum = B_FALSE;
+	boolean_t parsable = B_FALSE, sid2posix = B_FALSE;
+	int c, ret = 0;
+	zfs_sort_column_t *sortcol = NULL;
+	int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+	us_cbdata_t cb;
+	us_node_t *node, *rmnode;
+	uu_list_pool_t *listpool;
+	uu_list_t *list;
+	uu_avl_index_t idx = 0;
+	uu_list_index_t idx2 = 0;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	if (strcmp(argv[0], "groupspace") == 0) {
+		/* Toggle default group types */
+		types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+	} else if (strcmp(argv[0], "projectspace") == 0) {
+		types = USTYPE_PROJ;
+		prtnum = B_TRUE;
+	}
+
+	while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+		switch (c) {
+		case 'n':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 'n'\n"));
+				usage(B_FALSE);
+			}
+			prtnum = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 'o':
+			ofield = optarg;
+			break;
+		case 's':
+		case 'S':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    c == 's' ? B_FALSE : B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid field '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 't'\n"));
+				usage(B_FALSE);
+			}
+			tfield = optarg;
+			break;
+		case 'i':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 'i'\n"));
+				usage(B_FALSE);
+			}
+			sid2posix = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Use default output fields if not specified using -o */
+	if (ofield == NULL)
+		ofield = deffields;
+	do {
+		if ((delim = strchr(ofield, ',')) != NULL)
+			*delim = '\0';
+		/* Reject lists that would overrun fields[] or its terminator */
+		if (cfield + 1 >= (int)(sizeof (fields) / sizeof (int))) {
+			(void) fprintf(stderr, gettext("too many fields "
+			    "for -o option\n"));
+			return (-1);
+		}
+		if ((fields[cfield++] = us_field_index(ofield)) == -1) {
+			(void) fprintf(stderr, gettext("invalid type '%s' "
+			    "for -o option\n"), ofield);
+			return (-1);
+		}
+		if (delim != NULL)
+			ofield = delim + 1;
+	} while (delim != NULL);
+	fields[cfield] = USFIELD_LAST;
+
+	/* Override output types (-t option) */
+	if (tfield != NULL) {
+		types = 0;
+
+		do {
+			boolean_t found = B_FALSE;
+
+			if ((delim = strchr(tfield, ',')) != NULL)
+				*delim = '\0';
+			for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
+			    i++) {
+				if (strcmp(tfield, us_type_names[i]) == 0) {
+					found = B_TRUE;
+					types |= us_type_bits[i];
+					break;
+				}
+			}
+			if (!found) {
+				(void) fprintf(stderr, gettext("invalid type "
+				    "'%s' for -t option\n"), tfield);
+				return (-1);
+			}
+			if (delim != NULL)
+				tfield = delim + 1;
+		} while (delim != NULL);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+	if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) {
+		(void) fprintf(stderr, gettext("operation is only applicable "
+		    "to filesystems and their snapshots\n"));
+		zfs_close(zhp);
+		return (1);
+	}
+
+	if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
+
+	/* Always add default sorting columns */
+	(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
+	(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_numname = prtnum;
+	cb.cb_nicenum = !parsable;
+	cb.cb_avl_pool = avl_pool;
+	cb.cb_avl = avl_tree;
+	cb.cb_sid2posix = sid2posix;
+
+	for (i = 0; i < USFIELD_LAST; i++)
+		cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		if ((zfs_prop_is_user(p) &&
+		    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
+		    (zfs_prop_is_group(p) &&
+		    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) ||
+		    (zfs_prop_is_project(p) && types != USTYPE_PROJ))
+			continue;
+
+		cb.cb_prop = p;
+		if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) {
+			zfs_close(zhp);
+			return (ret);
+		}
+	}
+	zfs_close(zhp);
+
+	/* Sort the list */
+	if ((node = uu_avl_first(avl_tree)) == NULL)
+		return (0);
+
+	us_populated = B_TRUE;
+
+	listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
+	list = uu_list_create(listpool, NULL, UU_DEFAULT);
+	uu_list_node_init(node, &node->usn_listnode, listpool);
+
+	while (node != NULL) {
+		rmnode = node;
+		node = uu_avl_next(avl_tree, node);
+		uu_avl_remove(avl_tree, rmnode);
+		if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
+			uu_list_insert(list, rmnode, idx2);
+	}
+
+	for (node = uu_list_first(list); node != NULL;
+	    node = uu_list_next(list, node)) {
+		us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+
+		if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
+			uu_avl_insert(avl_tree, node, idx);
+	}
+
+	uu_list_destroy(list);
+	uu_list_pool_destroy(listpool);
+
+	/* Print and free node nvlist memory */
+	print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
+	    cb.cb_avl);
+
+	zfs_free_sort_columns(sortcol);
+
+	/* Clean up the AVL tree */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(avl_tree);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
+
+/*
+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property]
+ *      [-t type[,...]] [filesystem|volume|snapshot] ...
+ *
+ *	-H	Scripted mode; elide headers and separate columns by tabs
+ *	-p	Display values in parsable (literal) format.
+ *	-r	Recurse over all children
+ *	-d	Limit recursion by depth.
+ *	-o	Control which fields to display.
+ *	-s	Specify sort columns, ascending order.
+ *	-S	Specify sort columns, descending order.
+ *	-t	Control which object types to display.
+ *
+ * When given no arguments, list all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+	boolean_t	cb_first;	/* nothing printed yet */
+	boolean_t	cb_literal;	/* -p: literal (parsable) values */
+	boolean_t	cb_scripted;	/* -H: no header, tab-separated */
+	zprop_list_t	*cb_proplist;	/* columns to display, in order */
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	size_t i;
+	boolean_t right_justify, first = B_TRUE;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first)
+			(void) printf("  ");
+		first = B_FALSE;
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			/* Upper-case, bounded; toupper() needs unsigned char */
+			for (i = 0; pl->pl_user_prop[i] != '\0' &&
+			    i < sizeof (headerbuf) - 1; i++)
+				headerbuf[i] = toupper(
+				    (unsigned char)pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, header);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ *
+ * Each column is rendered into a scratch buffer (or taken from the user
+ * property nvlist); a value that cannot be retrieved is shown as "-".  Native
+ * properties follow zfs_prop_align_right(), userquota-style and written-style
+ * values are right-justified, and other user properties are left-justified.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
+{
+	zprop_list_t *col;
+	char property[ZFS_MAXPROPLEN];
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	nvlist_t *propval;
+	char *text;
+	boolean_t ralign;
+
+	for (col = cb->cb_proplist; col != NULL; col = col->pl_next) {
+		/* Column separator: tab when scripted, two spaces otherwise */
+		if (col != cb->cb_proplist)
+			(void) printf(cb->cb_scripted ? "\t" : "  ");
+
+		/* Default to the scratch buffer; failures below show "-" */
+		text = property;
+		if (col->pl_prop == ZFS_PROP_NAME) {
+			/* dataset name */
+			(void) strlcpy(property, zfs_get_name(zhp),
+			    sizeof (property));
+			ralign = zfs_prop_align_right(col->pl_prop);
+		} else if (col->pl_prop != ZPROP_INVAL) {
+			/* native property */
+			if (zfs_prop_get(zhp, col->pl_prop, property,
+			    sizeof (property), NULL, NULL, 0,
+			    cb->cb_literal) != 0)
+				text = "-";
+			ralign = zfs_prop_align_right(col->pl_prop);
+		} else if (zfs_prop_userquota(col->pl_user_prop)) {
+			/* userquota-style column */
+			if (zfs_prop_get_userquota(zhp, col->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				text = "-";
+			ralign = B_TRUE;
+		} else if (zfs_prop_written(col->pl_user_prop)) {
+			/* written-style column */
+			if (zfs_prop_get_written(zhp, col->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				text = "-";
+			ralign = B_TRUE;
+		} else {
+			/* user property: the value lives in a nested nvlist */
+			if (nvlist_lookup_nvlist(userprops,
+			    col->pl_user_prop, &propval) != 0)
+				text = "-";
+			else
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_VALUE, &text) == 0);
+			ralign = B_FALSE;
+		}
+
+		/*
+		 * Scripted output and a left-justified final column are
+		 * printed without padding; everything else is padded to the
+		 * column width.
+		 */
+		if (cb->cb_scripted || (col->pl_next == NULL && !ralign))
+			(void) printf("%s", text);
+		else if (!ralign)
+			(void) printf("%-*s", (int)col->pl_width, text);
+		else
+			(void) printf("%*s", (int)col->pl_width, text);
+	}
+
+	/* terminate the row */
+	(void) printf("\n");
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot: prints the column
+ * headers ahead of the first row (unless scripted), then the row itself.
+ * Always returns 0.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *state = data;
+	boolean_t want_header = state->cb_first && !state->cb_scripted;
+
+	state->cb_first = B_FALSE;
+	if (want_header)
+		print_header(state);
+	print_dataset(zhp, state);
+
+	return (0);
+}
+
+static int	/* 'zfs list'; returns the zfs_for_each() result */
+zfs_do_list(int argc, char **argv)
+{
+	int c;
+	static char default_fields[] =
+	    "name,used,available,referenced,mountpoint";
+	int types = ZFS_TYPE_DATASET;
+	boolean_t types_specified = B_FALSE;
+	char *fields = NULL;
+	list_cbdata_t cb = { 0 };
+	char *value;
+	int limit = 0;	/* depth limit from -d */
+	int ret = 0;
+	zfs_sort_column_t *sortcol = NULL;
+	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
+		switch (c) {
+		case 'o':
+			fields = optarg;
+			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			flags |= ZFS_ITER_LITERAL_PROPS;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 's':	/* sort column, not reversed */
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_FALSE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'S':	/* sort column, reversed */
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			types = 0;
+			types_specified = B_TRUE;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "snap", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:	/* filesystem */
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:	/* volume */
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:	/* snapshot */
+				case 3:	/* snap */
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 4:	/* bookmark */
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 5:	/* all: replaces anything given so far */
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (fields == NULL)
+		fields = default_fields;
+
+	/*
+	 * If we are only going to list snapshot names and sort by name,
+	 * then we can use faster version.
+	 */
+	if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
+		flags |= ZFS_ITER_SIMPLE;
+
+	/*
+	 * If "-o space" and no types were specified, don't display snapshots.
+	 */
+	if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
+		types &= ~ZFS_TYPE_SNAPSHOT;
+
+	/*
+	 * Handle users who want to list all snapshots or bookmarks
+	 * of the current dataset (ex. 'zfs list -t snapshot <dataset>').
+	 */
+	if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) &&
+	    argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) {
+		flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE);
+		limit = 1;
+	}
+
+	/*
+	 * If the user specifies '-o all', the zprop_get_list() doesn't
+	 * normally include the name of the dataset.  For 'zfs list', we always
+	 * want this property to be first.
+	 */
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	cb.cb_first = B_TRUE;
+
+	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
+	    limit, list_callback, &cb);
+
+	zprop_free_list(cb.cb_proplist);
+	zfs_free_sort_columns(sortcol);
+
+	/* cb_first still set means list_callback() never ran */
+	if (ret == 0 && cb.cb_first && !cb.cb_scripted)
+		(void) fprintf(stderr, gettext("no datasets available\n"));
+
+	return (ret);
+}
+
+/*
+ * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-f] -p <fs | vol> <fs | vol>
+ * zfs rename -r <snap> <snap>
+ *
+ * Rename a dataset; the new name has to be of the same type as the old one.
+ *
+ *	-f	Handed to zfs_rename() as its force-unmount argument
+ *	-p	Create the non-existing ancestors of the target first
+ *	-r	Recursive rename; the source has to be a snapshot name
+ *
+ * '-p' and '-r' cannot be combined.  Returns 0 on success and 1 on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	boolean_t want_force = B_FALSE;
+	boolean_t want_parents = B_FALSE;
+	boolean_t want_recurse = B_FALSE;
+	zfs_handle_t *src;
+	int open_types;
+	int failed;
+	int opt;
+
+	/* parse the option letters */
+	while ((opt = getopt(argc, argv, "prf")) != -1) {
+		if (opt == 'f') {
+			want_force = B_TRUE;
+		} else if (opt == 'r') {
+			want_recurse = B_TRUE;
+		} else if (opt == 'p') {
+			want_parents = B_TRUE;
+		} else {
+			/* '?' or anything else getopt() hands back */
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* exactly two operands are expected: the source and the target */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (want_recurse && want_parents) {
+		(void) fprintf(stderr, gettext("-p and -r options are mutually "
+		    "exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (want_recurse && strchr(argv[0], '@') == NULL) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	/* with '-p' only filesystems and volumes are accepted as source */
+	if (want_parents)
+		open_types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+	else
+		open_types = ZFS_TYPE_DATASET;
+
+	src = zfs_open(g_zfs, argv[0], open_types);
+	if (src == NULL)
+		return (1);
+
+	/*
+	 * Ancestors are only created when '-p' was given and the target name
+	 * is valid for the type of the source dataset.
+	 */
+	if (want_parents && zfs_name_valid(argv[1], zfs_get_type(src))) {
+		if (zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+			zfs_close(src);
+			return (1);
+		}
+	}
+
+	failed = (zfs_rename(src, argv[1], want_recurse, want_force) != 0);
+	zfs_close(src);
+
+	return (failed);
+}
+
+/*
+ * zfs promote <fs>
+ *
+ * Open the filesystem or volume named by the single operand and hand it to
+ * zfs_promote().  No options are accepted.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+	zfs_handle_t *clone;
+	int failed;
+
+	/* this subcommand has no options; reject anything starting with '-' */
+	if (argc > 1 && *argv[1] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* exactly one operand is required */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing clone filesystem"
+		    " argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (clone == NULL)
+		return (1);
+
+	failed = (zfs_promote(clone) != 0);
+	zfs_close(clone);
+
+	return (failed);
+}
+
+/*
+ * zfs redact <snapshot> <bookmark> <redaction_snapshot> ...
+ *
+ * Collect the redaction snapshot names into an nvlist and call lzc_redact()
+ * to create the redaction bookmark <bookmark> for <snapshot>.
+ *
+ * At least three operands are required.  A diagnostic is printed for every
+ * failure.  The return value is the error code produced by lzc_redact()
+ * (0 on success).
+ */
+static int
+zfs_do_redact(int argc, char **argv)
+{
+	char *snap = NULL;
+	char *bookname = NULL;
+	char **rsnaps = NULL;
+	int numrsnaps = 0;
+
+	/* skip the subcommand name; no options are parsed here */
+	argv++;
+	argc--;
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("too few arguments\n"));
+		usage(B_FALSE);
+	}
+
+	snap = argv[0];
+	bookname = argv[1];
+	rsnaps = argv + 2;
+	numrsnaps = argc - 2;
+
+	/* every remaining operand is a redaction snapshot */
+	nvlist_t *rsnapnv = fnvlist_alloc();
+
+	for (int i = 0; i < numrsnaps; i++) {
+		fnvlist_add_boolean(rsnapnv, rsnaps[i]);
+	}
+
+	int err = lzc_redact(snap, bookname, rsnapnv);
+	fnvlist_free(rsnapnv);
+
+	switch (err) {
+	case 0:
+		break;
+	case ENOENT:
+		(void) fprintf(stderr,
+		    gettext("provided snapshot %s does not exist\n"), snap);
+		break;
+	case EEXIST:
+		(void) fprintf(stderr, gettext("specified redaction bookmark "
+		    "(%s) provided already exists\n"), bookname);
+		break;
+	case ENAMETOOLONG:
+		(void) fprintf(stderr, gettext("provided bookmark name cannot "
+		    "be used, final name would be too long\n"));
+		break;
+	case E2BIG:
+		(void) fprintf(stderr, gettext("too many redaction snapshots "
+		    "specified\n"));
+		break;
+	case EINVAL:
+		/* EINVAL is ambiguous; use the bookmark name to pick a hint */
+		if (strchr(bookname, '#') != NULL)
+			(void) fprintf(stderr, gettext(
+			    "redaction bookmark name must not contain '#'\n"));
+		else
+			(void) fprintf(stderr, gettext(
+			    "redaction snapshot must be descendent of "
+			    "snapshot being redacted\n"));
+		break;
+	case EALREADY:
+		(void) fprintf(stderr, gettext("attempted to redact redacted "
+		    "dataset or with respect to redacted dataset\n"));
+		break;
+	case ENOTSUP:
+		(void) fprintf(stderr, gettext("redaction bookmarks feature "
+		    "not enabled\n"));
+		break;
+	case EXDEV:
+		(void) fprintf(stderr, gettext("potentially invalid redaction "
+		    "snapshot; full dataset names required\n"));
+		break;
+	default:
+		/*
+		 * Report the code lzc_redact() returned.  The global errno is
+		 * not reliable at this point: calls made since lzc_redact()
+		 * (e.g. fnvlist_free()) may have changed it.
+		 */
+		(void) fprintf(stderr, gettext("internal error: %s\n"),
+		    strerror(err));
+	}
+
+	return (err);
+}
+
+/*
+ * zfs rollback [-rRf] <snapshot>
+ *
+ *	-r	Delete any intervening snapshots before doing rollback
+ *	-R	Delete any snapshots and their clones
+ *	-f	ignored for backwards compatibility
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset.  If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+/*
+ * State shared between zfs_do_rollback() and its iteration callbacks
+ * rollback_check() and rollback_check_dependent().
+ */
+typedef struct rollback_cbdata {
+	/* ZFS_PROP_CREATETXG of the rollback target snapshot */
+	uint64_t	cb_create;
+	/* number of names rollback_check() printed without '-r' */
+	uint8_t		cb_younger_ds_printed;
+	/* true until the explanatory header has been printed */
+	boolean_t	cb_first;
+	/* set by '-R'; rollback_check() then reports nothing */
+	int		cb_doclones;
+	/* snapshot name from the command line, used in messages */
+	char		*cb_target;
+	/* set to 1 once a blocking snapshot/bookmark/clone is reported */
+	int		cb_error;
+	/* set by '-r' and '-R' */
+	boolean_t	cb_recurse;
+} rollback_cbdata_t;
+
+/*
+ * zfs_iter_dependents() callback used by rollback_check() when '-r' was
+ * given.  Prints the name of the dependent to stderr, preceded (once) by a
+ * message telling the user that '-R' is needed, and records the error in the
+ * callback data.  The handle is always closed; the return value is always 0.
+ */
+static int
+rollback_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *state = data;
+
+	/* the explanation is only printed ahead of the first dependent */
+	if (state->cb_first && state->cb_recurse) {
+		(void) fprintf(stderr, gettext("cannot rollback to "
+		    "'%s': clones of previous snapshots exist\n"),
+		    state->cb_target);
+		(void) fprintf(stderr, gettext("use '-R' to "
+		    "force deletion of the following clones and "
+		    "dependents:\n"));
+		state->cb_error = 1;
+		state->cb_first = B_FALSE;
+	}
+
+	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+	zfs_close(zhp);
+
+	return (0);
+}
+
+
+/*
+ * Iteration callback for the snapshots and bookmarks of the dataset being
+ * rolled back.
+ *
+ * With '-R' (cb_doclones) nothing is checked.  Otherwise every object whose
+ * creation txg is greater than that of the rollback target is handled:
+ *   - without '-r' its name is reported (after a one-time header) and the
+ *     iteration is stopped once a fixed number of names has been printed;
+ *   - with '-r' its dependents are reported by rollback_check_dependent().
+ *
+ * The handle is always closed.  Returns 0 to continue and -1 to stop.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+	/*
+	 * Max number of younger snapshots and/or bookmarks to display before
+	 * we stop the iteration.
+	 */
+	const uint8_t max_younger = 32;
+	rollback_cbdata_t *state = data;
+	boolean_t is_younger;
+
+	if (state->cb_doclones) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	is_younger =
+	    (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > state->cb_create);
+
+	if (is_younger && state->cb_recurse) {
+		/* '-r': only clones/dependents can still block the rollback */
+		if (zfs_iter_dependents(zhp, B_TRUE,
+		    rollback_check_dependent, state) != 0) {
+			zfs_close(zhp);
+			return (-1);
+		}
+	} else if (is_younger) {
+		/* no '-r': the younger object itself blocks the rollback */
+		if (state->cb_first) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "rollback to '%s': more recent snapshots "
+			    "or bookmarks exist\n"),
+			    state->cb_target);
+			(void) fprintf(stderr, gettext("use '-r' to "
+			    "force deletion of the following "
+			    "snapshots and bookmarks:\n"));
+			state->cb_error = 1;
+			state->cb_first = B_FALSE;
+		}
+
+		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+		state->cb_younger_ds_printed++;
+	}
+	zfs_close(zhp);
+
+	if (state->cb_younger_ds_printed != max_younger)
+		return (0);
+
+	/*
+	 * This non-recursive rollback is going to fail because objects
+	 * younger than the rollback target exist.  Enough of them have been
+	 * printed, so stop the iteration here to fail fast instead of walking
+	 * the remaining younger objects.
+	 */
+	(void) fprintf(stderr, gettext("Output limited to %d "
+	    "snapshots/bookmarks\n"), max_younger);
+	return (-1);
+}
+
+/*
+ * Entry point for 'zfs rollback [-rRf] <snapshot>'.
+ *
+ * Opens the snapshot and its parent dataset, runs rollback_check() over the
+ * parent's snapshots and bookmarks to find anything that blocks the rollback
+ * and, when nothing was flagged, calls zfs_rollback().  The '-f' flag is
+ * handed to zfs_rollback() as its force argument.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	char parentname[ZFS_MAX_DATASET_NAME_LEN];
+	rollback_cbdata_t cb = { 0 };
+	zfs_handle_t *parent, *snap;
+	boolean_t force = B_FALSE;
+	uint64_t min_txg;
+	char *delim;
+	int opt;
+	int ret;
+
+	/* parse the option letters */
+	while ((opt = getopt(argc, argv, "rRf")) != -1) {
+		switch (opt) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'R':
+			/* '-R' is '-r' plus destruction of clones */
+			cb.cb_doclones = 1;
+			/* FALLTHROUGH */
+		case 'r':
+			cb.cb_recurse = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* exactly one operand: the snapshot to roll back to */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT);
+	if (snap == NULL)
+		return (1);
+
+	/* the parent dataset name is everything in front of the last '@' */
+	(void) strlcpy(parentname, argv[0], sizeof (parentname));
+	verify((delim = strrchr(parentname, '@')) != NULL);
+	*delim = '\0';
+	parent = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET);
+	if (parent == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/*
+	 * Look for more recent snapshots and/or clones; what counts as a
+	 * blocker depends on '-r' and '-R' (see rollback_check()).
+	 */
+	cb.cb_target = argv[0];
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+	cb.cb_first = B_TRUE;
+	cb.cb_error = 0;
+
+	/* snapshots created before the target need not be visited */
+	min_txg = (cb.cb_create > 0) ? cb.cb_create : 0;
+
+	/* each step only runs while everything before it succeeded */
+	ret = zfs_iter_snapshots(parent, B_FALSE, rollback_check, &cb,
+	    min_txg, 0);
+	if (ret == 0)
+		ret = zfs_iter_bookmarks(parent, rollback_check, &cb);
+	if (ret == 0)
+		ret = cb.cb_error;
+	if (ret == 0)
+		ret = zfs_rollback(parent, snap, force);
+
+	zfs_close(snap);
+	zfs_close(parent);
+
+	return (ret == 0 ? 0 : 1);
+}
+
+/*
+ * zfs set property=value ... { fs | snap | vol } ...
+ *
+ * Sets the given properties for all datasets specified on the command line.
+ */
+
+/*
+ * zfs_for_each() callback for 'zfs set': apply the property list passed in
+ * 'data' to one dataset.  Returns 0 on success and 1 if zfs_prop_set_list()
+ * failed; for mount and share failures an extra hint is printed because the
+ * property itself may have been set.
+ */
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	nvlist_t *props = data;
+	int zerr;
+
+	if (zfs_prop_set_list(zhp, props) == 0)
+		return (0);
+
+	zerr = libzfs_errno(g_zfs);
+	if (zerr == EZFS_MOUNTFAILED) {
+		(void) fprintf(stderr, gettext("property may be set "
+		    "but unable to remount filesystem\n"));
+	} else if (zerr == EZFS_SHARENFSFAILED) {
+		(void) fprintf(stderr, gettext("property may be set "
+		    "but unable to reshare filesystem\n"));
+	}
+
+	return (1);
+}
+
+/*
+ * Entry point for 'zfs set'.
+ *
+ * The operands are one or more property=value pairs followed by one or more
+ * dataset names; a property=value pair after the first dataset name is an
+ * error.  All pairs are collected into one nvlist which set_callback() then
+ * applies to every named dataset through zfs_for_each().
+ *
+ * Returns the zfs_for_each() result, or -1 if a pair could not be parsed.
+ */
+static int
+zfs_do_set(int argc, char **argv)
+{
+	nvlist_t *props = NULL;
+	int first_ds = -1;	/* argv index of the first dataset operand */
+	int idx;
+	int ret;
+
+	/* this subcommand has no options; reject anything starting with '-' */
+	if (argc > 1 && *argv[1] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* at least one property=value pair and one dataset are needed */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing arguments\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		if (strchr(argv[1], '=') != NULL) {
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "name(s)\n"));
+		} else {
+			(void) fprintf(stderr, gettext("missing property=value "
+			    "argument(s)\n"));
+		}
+		usage(B_FALSE);
+	}
+
+	/* validate argument order:  prop=val args followed by dataset args */
+	for (idx = 1; idx < argc; idx++) {
+		if (strchr(argv[idx], '=') == NULL) {
+			/* remember only the first dataset operand */
+			if (first_ds < 0)
+				first_ds = idx;
+			continue;
+		}
+		if (first_ds > 0) {
+			/* out-of-order prop=val argument */
+			(void) fprintf(stderr, gettext("invalid "
+			    "argument order\n"));
+			usage(B_FALSE);
+		}
+	}
+	if (first_ds < 0) {
+		(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
+		usage(B_FALSE);
+	}
+
+	/* gather the property settings in front of the first dataset */
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	for (idx = 1; idx < first_ds; idx++) {
+		if (!parseprop(props, argv[idx])) {
+			nvlist_free(props);
+			return (-1);
+		}
+	}
+
+	ret = zfs_for_each(argc - first_ds, argv + first_ds, 0,
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
+
+	nvlist_free(props);
+	return (ret);
+}
+
+/*
+ * State passed from zfs_do_snapshot() to zfs_snapshot_cb().
+ */
+typedef struct snap_cbdata {
+	/* "<dataset>@<snapname>" entries, later given to zfs_snapshot_nvl() */
+	nvlist_t *sd_nvl;
+	/* set by '-r': also visit child filesystems of each dataset */
+	boolean_t sd_recursive;
+	/* snapshot name, i.e. the operand text after the '@' */
+	const char *sd_snapname;
+} snap_cbdata_t;
+
+/*
+ * Add "<dataset>@<snapname>" for the given dataset to the snapshot list in
+ * the callback data and, for a recursive snapshot, do the same for its child
+ * filesystems via zfs_iter_filesystems().  In recursive mode datasets marked
+ * inconsistent are skipped.
+ *
+ * The handle is always closed.  Returns 0, or the result of the recursive
+ * iteration.
+ */
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+	snap_cbdata_t *sd = arg;
+	char *fullname;
+	int rv;
+
+	if (sd->sd_recursive &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (asprintf(&fullname, "%s@%s", zfs_get_name(zhp),
+	    sd->sd_snapname) == -1)
+		nomem();
+	fnvlist_add_boolean(sd->sd_nvl, fullname);
+	free(fullname);
+
+	rv = sd->sd_recursive ?
+	    zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd) : 0;
+
+	zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
+ *
+ * Create the named snapshot(s).  Although this is functionally equivalent to
+ * 'zfs create', it is kept as a separate command to differentiate intent.
+ *
+ *	-r	Also snapshot the descendant filesystems
+ *	-o	Property to set on the new snapshot(s)
+ *
+ * All requested snapshots are collected first and then created with a single
+ * zfs_snapshot_nvl() call.  Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+	boolean_t multiple_snaps = B_FALSE;
+	snap_cbdata_t sd = { 0 };
+	nvlist_t *props;
+	int opt;
+	int ret;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* parse the option letters */
+	while ((opt = getopt(argc, argv, "ro:")) != -1) {
+		if (opt == 'r') {
+			sd.sd_recursive = B_TRUE;
+			multiple_snaps = B_TRUE;
+		} else if (opt == 'o') {
+			if (!parseprop(props, optarg)) {
+				nvlist_free(sd.sd_nvl);
+				nvlist_free(props);
+				return (1);
+			}
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto bad_usage;
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* at least one fs@snap operand is required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		goto bad_usage;
+	}
+
+	if (argc > 1)
+		multiple_snaps = B_TRUE;
+
+	while (argc > 0) {
+		zfs_handle_t *zhp;
+		char *at = strchr(argv[0], '@');
+
+		if (at == NULL)
+			goto bad_usage;
+
+		/* split the operand in place into dataset and snapshot name */
+		*at = '\0';
+		sd.sd_snapname = at + 1;
+
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		/* zfs_snapshot_cb() closes the handle */
+		if (zhp == NULL || zfs_snapshot_cb(zhp, &sd) != 0)
+			goto bad_usage;
+
+		argc--;
+		argv++;
+	}
+
+	ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	if (ret != 0 && multiple_snaps)
+		(void) fprintf(stderr, gettext("no snapshots were created\n"));
+	return (ret != 0);
+
+bad_usage:
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (-1);
+}
+
+
+/*
+ * Send a backup stream to stdout.
+ *
+ * Three mutually exclusive modes are handled after option parsing:
+ *   - '-S' (saved):  zfs_send_saved() on the dataset operand;
+ *   - '-t <token>':  zfs_send_resume(), no operand allowed;
+ *   - otherwise:     zfs_send_one() for a single stream, or zfs_send() when
+ *                    '-R' or '-I' asks for multiple snapshots.
+ *
+ * Unless '-n' (dry run) is given, the stream is refused when stdout is a
+ * terminal.  Returns 0 on success and non-zero on failure.
+ */
+static int
+zfs_do_send(int argc, char **argv)
+{
+	char *fromname = NULL;
+	char *toname = NULL;
+	char *resume_token = NULL;
+	char *cp;
+	zfs_handle_t *zhp;
+	sendflags_t flags = { 0 };
+	int c, err;
+	nvlist_t *dbgnv = NULL;
+	char *redactbook = NULL;
+
+	/* long spellings; each maps onto the short option letter */
+	struct option long_options[] = {
+		{"replicate",	no_argument,		NULL, 'R'},
+		{"redact",	required_argument,	NULL, 'd'},
+		{"props",	no_argument,		NULL, 'p'},
+		{"parsable",	no_argument,		NULL, 'P'},
+		{"dedup",	no_argument,		NULL, 'D'},
+		{"verbose",	no_argument,		NULL, 'v'},
+		{"dryrun",	no_argument,		NULL, 'n'},
+		{"large-block",	no_argument,		NULL, 'L'},
+		{"embed",	no_argument,		NULL, 'e'},
+		{"resume",	required_argument,	NULL, 't'},
+		{"compressed",	no_argument,		NULL, 'c'},
+		{"raw",		no_argument,		NULL, 'w'},
+		{"backup",	no_argument,		NULL, 'b'},
+		{"holds",	no_argument,		NULL, 'h'},
+		{"saved",	no_argument,		NULL, 'S'},
+		{0, 0, 0, 0}
+	};
+
+	/* check options */
+	while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S",
+	    long_options, NULL)) != -1) {
+		switch (c) {
+		case 'i':
+			/* only one incremental source may be given */
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			break;
+		case 'I':
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			flags.doall = B_TRUE;
+			break;
+		case 'R':
+			flags.replicate = B_TRUE;
+			break;
+		case 'd':
+			redactbook = optarg;
+			break;
+		case 'p':
+			flags.props = B_TRUE;
+			break;
+		case 'b':
+			flags.backup = B_TRUE;
+			break;
+		case 'h':
+			flags.holds = B_TRUE;
+			break;
+		case 'P':
+			flags.parsable = B_TRUE;
+			break;
+		case 'v':
+			flags.verbosity++;
+			flags.progress = B_TRUE;
+			break;
+		case 'D':
+			/* accepted for compatibility; only warns */
+			(void) fprintf(stderr,
+			    gettext("WARNING: deduplicated send is no "
+			    "longer supported.  A regular,\n"
+			    "non-deduplicated stream will be generated.\n\n"));
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
+		case 'e':
+			flags.embed_data = B_TRUE;
+			break;
+		case 't':
+			resume_token = optarg;
+			break;
+		case 'c':
+			flags.compress = B_TRUE;
+			break;
+		case 'w':
+			/* a raw send also turns on -c, -e and -L */
+			flags.raw = B_TRUE;
+			flags.compress = B_TRUE;
+			flags.embed_data = B_TRUE;
+			flags.largeblock = B_TRUE;
+			break;
+		case 'S':
+			flags.saved = B_TRUE;
+			break;
+		case ':':
+			/*
+			 * If a parameter was not passed, optopt contains the
+			 * value that would normally lead us into the
+			 * appropriate case statement.  If it's > 256, then this
+			 * must be a longopt and we should look at argv to get
+			 * the string.  Otherwise it's just the character, so we
+			 * should use it directly.
+			 */
+			if (optopt <= UINT8_MAX) {
+				(void) fprintf(stderr,
+				    gettext("missing argument for '%c' "
+				    "option\n"), optopt);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("missing argument for '%s' "
+				    "option\n"), argv[optind - 1]);
+			}
+			usage(B_FALSE);
+			break;
+		case '?':
+			/*FALLTHROUGH*/
+		default:
+			/*
+			 * If an invalid flag was passed, optopt contains the
+			 * character if it was a short flag, or 0 if it was a
+			 * longopt.
+			 */
+			if (optopt != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%c'\n"), optopt);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%s'\n"),
+				    argv[optind - 1]);
+
+			}
+			usage(B_FALSE);
+		}
+	}
+
+	/* '-P' without '-v' still needs verbosity 1 */
+	if (flags.parsable && flags.verbosity == 0)
+		flags.verbosity = 1;
+
+	argc -= optind;
+	argv += optind;
+
+	/* a resume token replaces the operand; otherwise exactly one operand */
+	if (resume_token != NULL) {
+		if (fromname != NULL || flags.replicate || flags.props ||
+		    flags.backup || flags.holds ||
+		    flags.saved || redactbook != NULL) {
+			(void) fprintf(stderr,
+			    gettext("invalid flags combined with -t\n"));
+			usage(B_FALSE);
+		}
+		if (argc > 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	} else {
+		if (argc < 1) {
+			(void) fprintf(stderr,
+			    gettext("missing snapshot argument\n"));
+			usage(B_FALSE);
+		}
+		if (argc > 1) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	/* '-S' takes a dataset (not a snapshot) and few other flags */
+	if (flags.saved) {
+		if (fromname != NULL || flags.replicate || flags.props ||
+		    flags.doall || flags.backup ||
+		    flags.holds || flags.largeblock || flags.embed_data ||
+		    flags.compress || flags.raw || redactbook != NULL) {
+			(void) fprintf(stderr, gettext("incompatible flags "
+			    "combined with saved send flag\n"));
+			usage(B_FALSE);
+		}
+		if (strchr(argv[0], '@') != NULL) {
+			(void) fprintf(stderr, gettext("saved send must "
+			    "specify the dataset with partially-received "
+			    "state\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	if (flags.raw && redactbook != NULL) {
+		(void) fprintf(stderr,
+		    gettext("Error: raw sends may not be redacted.\n"));
+		return (1);
+	}
+
+	if (!flags.dryrun && isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Stream can not be written to a terminal.\n"
+		    "You must redirect standard output.\n"));
+		return (1);
+	}
+
+	if (flags.saved) {
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+
+		err = zfs_send_saved(zhp, &flags, STDOUT_FILENO,
+		    resume_token);
+		zfs_close(zhp);
+		return (err != 0);
+	} else if (resume_token != NULL) {
+		return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+		    resume_token));
+	}
+
+	/*
+	 * For everything except -R and -I, use the new, cleaner code path.
+	 */
+	if (!(flags.replicate || flags.doall)) {
+		char frombuf[ZFS_MAX_DATASET_NAME_LEN];
+
+		if (fromname != NULL && (strchr(fromname, '#') == NULL &&
+		    strchr(fromname, '@') == NULL)) {
+			/*
+			 * Neither bookmark or snapshot was specified.  Print a
+			 * warning, and assume snapshot.
+			 */
+			(void) fprintf(stderr, "Warning: incremental source "
+			    "didn't specify type, assuming snapshot. Use '@' "
+			    "or '#' prefix to avoid ambiguity.\n");
+			(void) snprintf(frombuf, sizeof (frombuf), "@%s",
+			    fromname);
+			fromname = frombuf;
+		}
+		if (fromname != NULL &&
+		    (fromname[0] == '#' || fromname[0] == '@')) {
+			/*
+			 * Incremental source name begins with # or @.
+			 * Default to same fs as target.
+			 */
+			char tmpbuf[ZFS_MAX_DATASET_NAME_LEN];
+			/* fromname may alias frombuf, so save it first */
+			(void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf));
+			(void) strlcpy(frombuf, argv[0], sizeof (frombuf));
+			cp = strchr(frombuf, '@');
+			if (cp != NULL)
+				*cp = '\0';
+			(void) strlcat(frombuf, tmpbuf, sizeof (frombuf));
+			fromname = frombuf;
+		}
+
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags,
+		    redactbook);
+		zfs_close(zhp);
+		return (err != 0);
+	}
+
+	/* from here on: '-R' and/or '-I', i.e. multiple snapshots */
+	if (fromname != NULL && strchr(fromname, '#')) {
+		(void) fprintf(stderr,
+		    gettext("Error: multiple snapshots cannot be "
+		    "sent from a bookmark.\n"));
+		return (1);
+	}
+
+	if (redactbook != NULL) {
+		(void) fprintf(stderr, gettext("Error: multiple snapshots "
+		    "cannot be sent redacted.\n"));
+		return (1);
+	}
+
+	if ((cp = strchr(argv[0], '@')) == NULL) {
+		(void) fprintf(stderr, gettext("Error: "
+		    "Unsupported flag with filesystem or bookmark.\n"));
+		return (1);
+	}
+	/* split the operand in place into dataset and snapshot name */
+	*cp = '\0';
+	toname = cp + 1;
+	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * If they specified the full path to the snapshot, chop off
+	 * everything except the short name of the snapshot, but special
+	 * case if they specify the origin.
+	 */
+	if (fromname && (cp = strchr(fromname, '@')) != NULL) {
+		char origin[ZFS_MAX_DATASET_NAME_LEN];
+		zprop_source_t src;
+
+		/*
+		 * If the origin cannot be read, compare against an empty
+		 * string instead of whatever the stack buffer happens to
+		 * contain.
+		 */
+		if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+		    origin, sizeof (origin), &src, NULL, 0, B_FALSE) != 0)
+			origin[0] = '\0';
+
+		if (strcmp(origin, fromname) == 0) {
+			fromname = NULL;
+			flags.fromorigin = B_TRUE;
+		} else {
+			*cp = '\0';
+			if (cp != fromname && strcmp(argv[0], fromname)) {
+				(void) fprintf(stderr,
+				    gettext("incremental source must be "
+				    "in same filesystem\n"));
+				usage(B_FALSE);
+			}
+			fromname = cp + 1;
+			if (strchr(fromname, '@') || strchr(fromname, '/')) {
+				(void) fprintf(stderr,
+				    gettext("invalid incremental source\n"));
+				usage(B_FALSE);
+			}
+		}
+	}
+
+	/* '-R' without an incremental source sends all snapshots */
+	if (flags.replicate && fromname == NULL)
+		flags.doall = B_TRUE;
+
+	err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
+	    flags.verbosity >= 3 ? &dbgnv : NULL);
+
+	if (flags.verbosity >= 3 && dbgnv != NULL) {
+		/*
+		 * dump_nvlist prints to stdout, but that's been
+		 * redirected to a file.  Make it print to stderr
+		 * instead.
+		 */
+		(void) dup2(STDERR_FILENO, STDOUT_FILENO);
+		dump_nvlist(dbgnv, 0);
+		nvlist_free(dbgnv);
+	}
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+/*
+ * Restore a backup stream from stdin.
+ *
+ * With '-A' no stream is read; instead the partially received state of the
+ * named dataset is destroyed.  Otherwise the stream on stdin (which must not
+ * be a terminal) is handed to zfs_receive() together with the properties
+ * collected from '-o' and '-x'.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_receive(int argc, char **argv)
+{
+	boolean_t abort_resumable = B_FALSE;
+	recvflags_t flags = { 0 };
+	nvlist_t *props;
+	int rc = 0;
+	int opt;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* parse the option letters */
+	while ((opt = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) {
+		switch (opt) {
+		/* simple on/off switches */
+		case 'A':
+			abort_resumable = B_TRUE;
+			break;
+		case 'F':
+			flags.force = B_TRUE;
+			break;
+		case 'M':
+			flags.forceunmount = B_TRUE;
+			break;
+		case 'h':
+			flags.skipholds = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 's':
+			flags.resumable = B_TRUE;
+			break;
+		case 'u':
+			flags.nomount = B_TRUE;
+			break;
+		case 'v':
+			flags.verbose = B_TRUE;
+			break;
+
+		/* '-d' and '-e' exclude each other */
+		case 'd':
+			if (flags.istail) {
+				(void) fprintf(stderr, gettext("invalid option "
+				    "combination: -d and -e are mutually "
+				    "exclusive\n"));
+				usage(B_FALSE);
+			}
+			flags.isprefix = B_TRUE;
+			break;
+		case 'e':
+			if (flags.isprefix) {
+				(void) fprintf(stderr, gettext("invalid option "
+				    "combination: -d and -e are mutually "
+				    "exclusive\n"));
+				usage(B_FALSE);
+			}
+			flags.istail = B_TRUE;
+			break;
+
+		/* property handling */
+		case 'o':
+			if (!parseprop(props, optarg)) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+		case 'x':
+			if (!parsepropname(props, optarg)) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+
+		/* getopt() error reports */
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+			break;
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */
+	if (flags.istail)
+		flags.isprefix = B_TRUE;
+
+	/* exactly one operand is required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (abort_resumable) {
+		const int fsvol = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+		char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+		zfs_handle_t *victim;
+
+		if (flags.isprefix || flags.istail || flags.dryrun ||
+		    flags.resumable || flags.nomount) {
+			(void) fprintf(stderr, gettext("invalid option\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * Prefer the "<dataset>/%recv" child if it exists; otherwise
+		 * the dataset itself must carry resumable receive state.
+		 */
+		(void) snprintf(namebuf, sizeof (namebuf),
+		    "%s/%%recv", argv[0]);
+
+		if (zfs_dataset_exists(g_zfs, namebuf, fsvol)) {
+			victim = zfs_open(g_zfs, namebuf, fsvol);
+			if (victim == NULL) {
+				nvlist_free(props);
+				return (1);
+			}
+		} else {
+			victim = zfs_open(g_zfs, argv[0], fsvol);
+			if (victim == NULL)
+				usage(B_FALSE);
+			if (!zfs_prop_get_int(victim, ZFS_PROP_INCONSISTENT) ||
+			    zfs_prop_get(victim, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+			    NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
+				(void) fprintf(stderr,
+				    gettext("'%s' does not have any "
+				    "resumable receive state to abort\n"),
+				    argv[0]);
+				nvlist_free(props);
+				zfs_close(victim);
+				return (1);
+			}
+		}
+
+		rc = zfs_destroy(victim, B_FALSE);
+		zfs_close(victim);
+		nvlist_free(props);
+		return (rc != 0);
+	}
+
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be read "
+		    "from a terminal.\n"
+		    "You must redirect standard input.\n"));
+		nvlist_free(props);
+		return (1);
+	}
+
+	rc = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
+	nvlist_free(props);
+
+	return (rc != 0);
+}
+
+/*
+ * allow/unallow stuff
+ *
+ * Names of the delegated permissions.  These strings are paired with
+ * ZFS_DELEG_NOTE_* values in zfs_deleg_perm_tbl below.
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define	ZFS_DELEG_PERM_CREATE		"create"
+#define	ZFS_DELEG_PERM_DESTROY		"destroy"
+#define	ZFS_DELEG_PERM_SNAPSHOT		"snapshot"
+#define	ZFS_DELEG_PERM_ROLLBACK		"rollback"
+#define	ZFS_DELEG_PERM_CLONE		"clone"
+#define	ZFS_DELEG_PERM_PROMOTE		"promote"
+#define	ZFS_DELEG_PERM_RENAME		"rename"
+#define	ZFS_DELEG_PERM_MOUNT		"mount"
+#define	ZFS_DELEG_PERM_SHARE		"share"
+#define	ZFS_DELEG_PERM_SEND		"send"
+#define	ZFS_DELEG_PERM_RECEIVE		"receive"
+#define	ZFS_DELEG_PERM_ALLOW		"allow"
+#define	ZFS_DELEG_PERM_USERPROP		"userprop"
+#define	ZFS_DELEG_PERM_VSCAN		"vscan" /* ??? no entry in the table */
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED		"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
+#define	ZFS_DELEG_PERM_USEROBJQUOTA	"userobjquota"
+#define	ZFS_DELEG_PERM_GROUPOBJQUOTA	"groupobjquota"
+#define	ZFS_DELEG_PERM_USEROBJUSED	"userobjused"
+#define	ZFS_DELEG_PERM_GROUPOBJUSED	"groupobjused"
+
+#define	ZFS_DELEG_PERM_HOLD		"hold"
+#define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
+#define	ZFS_DELEG_PERM_LOAD_KEY		"load-key"
+#define	ZFS_DELEG_PERM_CHANGE_KEY	"change-key"
+
+#define	ZFS_DELEG_PERM_PROJECTUSED	"projectused"
+#define	ZFS_DELEG_PERM_PROJECTQUOTA	"projectquota"
+#define	ZFS_DELEG_PERM_PROJECTOBJUSED	"projectobjused"
+#define	ZFS_DELEG_PERM_PROJECTOBJQUOTA	"projectobjquota"
+
+/*
+ * NOTE(review): presumably ZFS_DELEG_NOTE_NONE is the last enumerator and so
+ * doubles as the number of notes - confirm against zfs_deleg_note_t.
+ */
+#define	ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+/*
+ * Table pairing every delegated permission name with its ZFS_DELEG_NOTE_*
+ * value.  The first group holds the permissions deleg_perm_type() reports as
+ * "subcommand", the second group the ones it reports as "other".  The table
+ * is terminated by a { NULL, ZFS_DELEG_NOTE_NONE } entry.
+ */
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+	{ ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY },
+	{ ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY },
+
+	/* user/group/project property and quota permissions ("other") */
+	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+	{ ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA },
+	{ ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED },
+	{ ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA },
+	{ ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED },
+	{ ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED },
+	{ ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA },
+	{ ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED },
+	{ ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA },
+	{ NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/*
+ * permission structure
+ *
+ * NOTE(review): the field notes below are inferred from the field names
+ * only; confirm against the code that fills this structure in.
+ */
+typedef struct deleg_perm {
+	zfs_deleg_who_type_t	dp_who_type;	/* who-type of the grantee */
+	const char		*dp_name;	/* permission name */
+	boolean_t		dp_local;	/* presumably: local grant */
+	boolean_t		dp_descend;	/* presumably: descendent grant */
+} deleg_perm_t;
+
+/*
+ * A deleg_perm_t together with a uu_avl_node_t, presumably so that the
+ * permission can be linked into a uu_avl tree - confirm at the use sites.
+ */
+typedef struct deleg_perm_node {
+	deleg_perm_t		dpn_perm;
+
+	uu_avl_node_t		dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/*
+ * permissions set
+ *
+ * All permissions recorded for one "who" (named set, create-time entry,
+ * user, group or everyone) on a single file system.
+ */
+typedef struct who_perm {
+	zfs_deleg_who_type_t	who_type;
+	const char		*who_name;		/* id */
+	char			who_ug_name[256];	/* user/group name */
+	fs_perm_t		*who_fsperm;		/* uplink */
+
+	uu_avl_t		*who_deleg_perm_avl;	/* permissions */
+} who_perm_t;
+
+/*
+ * AVL wrapper around a who_perm_t.  Nodes of this type populate the
+ * fsp_sc_avl and fsp_uge_avl trees of an fs_perm_t and are ordered by
+ * who_perm_compare().
+ */
+typedef struct who_perm_node {
+	who_perm_t	who_perm;
+	uu_avl_node_t	who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/*
+ * fs permissions
+ *
+ * Everything delegated on one file system, split into two trees by the
+ * kind of "who" the entry is stored for.
+ */
+struct fs_perm {
+	const char		*fsp_name;	/* name; pointer is not copied */
+
+	uu_avl_t		*fsp_sc_avl;	/* sets,create */
+	uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */
+
+	fs_perm_set_t		*fsp_set;	/* uplink */
+};
+
+/*
+ * List wrapper around an fs_perm_t; these nodes make up fsps_list in
+ * fs_perm_set.
+ */
+typedef struct fs_perm_node {
+	fs_perm_t	fspn_fsperm;
+	/* NOTE(review): fspn_avl is not referenced by the nearby code */
+	uu_avl_t	*fspn_avl;
+
+	uu_list_node_t	fspn_list_node;
+} fs_perm_node_t;
+
+/*
+ * top level structure
+ *
+ * Owns the list of per-file-system permission records together with the
+ * libuutil pools every nested list and AVL tree is created from.  Set up
+ * by fs_perm_set_init() and torn down by fs_perm_set_fini().
+ */
+struct fs_perm_set {
+	uu_list_pool_t	*fsps_list_pool;
+	uu_list_t	*fsps_list; /* list of fs_perms */
+
+	uu_avl_pool_t	*fsps_named_set_avl_pool;	/* for fsp_sc_avl */
+	uu_avl_pool_t	*fsps_who_perm_avl_pool;	/* for fsp_uge_avl */
+	uu_avl_pool_t	*fsps_deleg_perm_avl_pool;	/* for who_deleg_perm_avl */
+};
+
+/*
+ * Classify a delegation note for the TYPE column printed by allow_usage().
+ * The quota, space-accounting and user-property notes are reported as
+ * "other"; every remaining note is reported as "subcommand".
+ */
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+	boolean_t is_other = B_FALSE;
+
+	switch (note) {
+	case ZFS_DELEG_NOTE_USERPROP:
+	case ZFS_DELEG_NOTE_USERQUOTA:
+	case ZFS_DELEG_NOTE_USERUSED:
+	case ZFS_DELEG_NOTE_USEROBJQUOTA:
+	case ZFS_DELEG_NOTE_USEROBJUSED:
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+	case ZFS_DELEG_NOTE_GROUPUSED:
+	case ZFS_DELEG_NOTE_GROUPOBJQUOTA:
+	case ZFS_DELEG_NOTE_GROUPOBJUSED:
+	case ZFS_DELEG_NOTE_PROJECTQUOTA:
+	case ZFS_DELEG_NOTE_PROJECTUSED:
+	case ZFS_DELEG_NOTE_PROJECTOBJQUOTA:
+	case ZFS_DELEG_NOTE_PROJECTOBJUSED:
+		is_other = B_TRUE;
+		break;
+	default:
+		break;
+	}
+
+	if (is_other)
+		return (gettext("other"));
+
+	return (gettext("subcommand"));
+}
+
+/*
+ * Map a who type onto its sort rank: named sets (0), create-time (1),
+ * users (2), groups (3), everyone (4).  The "_SETS" variant of each type
+ * shares the rank of its base type.  Unknown types yield -1.
+ */
+static int
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+	switch (who_type) {
+	case ZFS_DELEG_NAMED_SET:
+	case ZFS_DELEG_NAMED_SET_SETS:
+		return (0);
+	case ZFS_DELEG_CREATE:
+	case ZFS_DELEG_CREATE_SETS:
+		return (1);
+	case ZFS_DELEG_USER:
+	case ZFS_DELEG_USER_SETS:
+		return (2);
+	case ZFS_DELEG_GROUP:
+	case ZFS_DELEG_GROUP_SETS:
+		return (3);
+	case ZFS_DELEG_EVERYONE:
+	case ZFS_DELEG_EVERYONE_SETS:
+		return (4);
+	default:
+		return (-1);
+	}
+}
+
+/*
+ * AVL comparator for who_perm_node_t: order by who-type rank first and by
+ * who name second.  The result is normalized to -1, 0 or 1.
+ */
+/* ARGSUSED */
+static int
+who_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const who_perm_t *lhs = &((const who_perm_node_t *)larg)->who_perm;
+	const who_perm_t *rhs = &((const who_perm_node_t *)rarg)->who_perm;
+	int order;
+
+	order = who_type2weight(lhs->who_type) -
+	    who_type2weight(rhs->who_type);
+	if (order == 0) {
+		order = strncmp(lhs->who_name, rhs->who_name,
+		    ZFS_MAX_DELEG_NAME-1);
+	}
+
+	return ((order > 0) - (order < 0));
+}
+
+/*
+ * AVL comparator for deleg_perm_node_t: order by permission name.  The
+ * result is normalized to -1, 0 or 1.
+ */
+/* ARGSUSED */
+static int
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const deleg_perm_t *lhs = &((const deleg_perm_node_t *)larg)->dpn_perm;
+	const deleg_perm_t *rhs = &((const deleg_perm_node_t *)rarg)->dpn_perm;
+	int order = strncmp(lhs->dp_name, rhs->dp_name, ZFS_MAX_DELEG_NAME-1);
+
+	return ((order > 0) - (order < 0));
+}
+
+/*
+ * Zero *fspset and create the list pool, the list and the three AVL pools
+ * it owns.  Any allocation failure ends in nomem().
+ */
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
+{
+	bzero(fspset, sizeof (*fspset));
+
+	/* list of per-file-system records */
+	fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+	    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+	    NULL, UU_DEFAULT);
+	if (fspset->fsps_list_pool == NULL)
+		nomem();
+
+	fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+	    UU_DEFAULT);
+	if (fspset->fsps_list == NULL)
+		nomem();
+
+	/* both "who" pools hold who_perm_node_t and share one comparator */
+	fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+	    "named_set_avl_pool", sizeof (who_perm_node_t),
+	    offsetof(who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT);
+	if (fspset->fsps_named_set_avl_pool == NULL)
+		nomem();
+
+	fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+	    "who_perm_avl_pool", sizeof (who_perm_node_t),
+	    offsetof(who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT);
+	if (fspset->fsps_who_perm_avl_pool == NULL)
+		nomem();
+
+	/* individual permissions, ordered by name */
+	fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+	    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t),
+	    offsetof(deleg_perm_node_t, dpn_avl_node), deleg_perm_compare,
+	    UU_DEFAULT);
+	if (fspset->fsps_deleg_perm_avl_pool == NULL)
+		nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+/*
+ * Tear down everything fs_perm_set_init() and parse_fs_perm_set() built:
+ * finalize and free every per-file-system node, then release the list, the
+ * list pool and the three AVL pools.
+ */
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+	while (node != NULL) {
+		/* fetch the successor before the node is unlinked */
+		fs_perm_node_t *next_node =
+		    uu_list_next(fspset->fsps_list, node);
+		fs_perm_t *fsperm = &node->fspn_fsperm;
+		fs_perm_fini(fsperm);
+		uu_list_remove(fspset->fsps_list, node);
+		free(node);
+		node = next_node;
+	}
+
+	/*
+	 * The list is empty now.  Release it and its pool as well; they
+	 * were created by fs_perm_set_init() and were previously leaked.
+	 */
+	uu_list_destroy(fspset->fsps_list);
+	uu_list_pool_destroy(fspset->fsps_list_pool);
+
+	uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+/*
+ * Fill in the identity of a permission record.  The name pointer is stored
+ * as given, without copying; the locality flags are left untouched.
+ */
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+    const char *name)
+{
+	deleg_perm->dp_name = name;
+	deleg_perm->dp_who_type = type;
+}
+
+/*
+ * Zero *who_perm, give it an empty permission tree drawn from the owning
+ * set's deleg-perm pool, and record its type, name and uplink.  The name
+ * pointer is stored as given, without copying.
+ */
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+    zfs_deleg_who_type_t type, const char *name)
+{
+	uu_avl_pool_t *perm_pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+	bzero(who_perm, sizeof (*who_perm));
+
+	who_perm->who_deleg_perm_avl = uu_avl_create(perm_pool, NULL,
+	    UU_DEFAULT);
+	if (who_perm->who_deleg_perm_avl == NULL)
+		nomem();
+
+	who_perm->who_fsperm = fsperm;
+	who_perm->who_name = name;
+	who_perm->who_type = type;
+}
+
+/*
+ * Unlink and free every permission node of *who_perm, then destroy the
+ * now-empty tree.  The who_perm_t itself is not freed.
+ */
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+	uu_avl_t *perms = who_perm->who_deleg_perm_avl;
+	deleg_perm_node_t *cur;
+	deleg_perm_node_t *next;
+
+	for (cur = uu_avl_first(perms); cur != NULL; cur = next) {
+		/* look up the successor while cur is still linked */
+		next = uu_avl_next(perms, cur);
+
+		uu_avl_remove(perms, cur);
+		free(cur);
+	}
+
+	uu_avl_destroy(perms);
+}
+
+/*
+ * Zero *fsperm and create its two empty "who" trees from the pools owned
+ * by fspset.  The file system name pointer is stored as given, without
+ * copying.
+ */
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+	uu_avl_pool_t *sc_pool = fspset->fsps_named_set_avl_pool;
+	uu_avl_pool_t *uge_pool = fspset->fsps_who_perm_avl_pool;
+
+	bzero(fsperm, sizeof (*fsperm));
+
+	fsperm->fsp_sc_avl = uu_avl_create(sc_pool, NULL, UU_DEFAULT);
+	if (fsperm->fsp_sc_avl == NULL)
+		nomem();
+
+	fsperm->fsp_uge_avl = uu_avl_create(uge_pool, NULL, UU_DEFAULT);
+	if (fsperm->fsp_uge_avl == NULL)
+		nomem();
+
+	fsperm->fsp_name = fsname;
+	fsperm->fsp_set = fspset;
+}
+
+/*
+ * Finalize, unlink and free every who_perm_node_t held in who_avl.  The
+ * tree itself is left in place (empty) for the caller to destroy.
+ */
+static inline void
+fs_perm_drain_who_avl(uu_avl_t *who_avl)
+{
+	who_perm_node_t *cur;
+	who_perm_node_t *next;
+
+	for (cur = uu_avl_first(who_avl); cur != NULL; cur = next) {
+		/* look up the successor while cur is still linked */
+		next = uu_avl_next(who_avl, cur);
+
+		who_perm_fini(&cur->who_perm);
+		uu_avl_remove(who_avl, cur);
+		free(cur);
+	}
+}
+
+/*
+ * Release everything hanging off *fsperm: first the contents of the
+ * sets/create tree, then the contents of the user/group/everyone tree,
+ * and finally both trees.  The fs_perm_t itself is not freed.
+ */
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+	fs_perm_drain_who_avl(fsperm->fsp_sc_avl);
+	fs_perm_drain_who_avl(fsperm->fsp_uge_avl);
+
+	uu_avl_destroy(fsperm->fsp_sc_avl);
+	uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+/*
+ * Record permission `name' in `avl' and flag it with `locality'.
+ *
+ * `node' is a caller-allocated, initialized but unlinked node.  If no equal
+ * permission is present yet, the node is inserted and the flag is set on
+ * it.  Otherwise the flag is set on the entry already in the tree and
+ * `node' is left unlinked.
+ *
+ * Returns B_TRUE when `node' was inserted (the tree now references it) and
+ * B_FALSE when it was not, in which case the caller still owns it.
+ */
+static boolean_t
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+    zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+	uu_avl_index_t idx = 0;
+	boolean_t inserted = B_TRUE;
+
+	deleg_perm_node_t *found_node = NULL;
+	deleg_perm_t	*deleg_perm = &node->dpn_perm;
+
+	/* the node must carry its name before it can be used as a key */
+	deleg_perm_init(deleg_perm, who_type, name);
+
+	if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+	    == NULL) {
+		uu_avl_insert(avl, node, idx);
+	} else {
+		deleg_perm = &found_node->dpn_perm;
+		inserted = B_FALSE;
+	}
+
+	switch (locality) {
+	case ZFS_DELEG_LOCAL:
+		deleg_perm->dp_local = B_TRUE;
+		break;
+	case ZFS_DELEG_DESCENDENT:
+		deleg_perm->dp_descend = B_TRUE;
+		break;
+	case ZFS_DELEG_NA:
+		break;
+	default:
+		assert(B_FALSE); /* invalid locality */
+	}
+
+	return (inserted);
+}
+
+/*
+ * Merge every permission named in `nvl' into who_perm's permission tree,
+ * flagging each with `locality'.  Every pair in `nvl' must be of type
+ * DATA_TYPE_BOOLEAN.  Always returns 0.
+ */
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+	uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+	zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *name = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+		deleg_perm_node_t *node =
+		    safe_malloc(sizeof (deleg_perm_node_t));
+
+		VERIFY(type == DATA_TYPE_BOOLEAN);
+
+		uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+
+		/*
+		 * The same permission commonly shows up under both the
+		 * local and the descendent key; the second time round the
+		 * existing entry is updated and this node is not needed.
+		 */
+		if (!set_deleg_perm_node(avl, node, who_type, name, locality))
+			free(node);
+	}
+
+	return (0);
+}
+
+/*
+ * Populate *fsperm from one file system's fsacl nvlist.
+ *
+ * Every pair name has the form "<who type><locality>$<who>": byte 0 is the
+ * zfs_deleg_who_type_t, byte 1 the locality, byte 2 a literal '$', and the
+ * rest identifies the who.  The pair's value is a nested nvlist of
+ * permission names that is handed to parse_who_perm().
+ *
+ * Named-set and create-time entries go into fsp_sc_avl; user, group and
+ * everyone entries go into fsp_uge_avl, with who_ug_name resolved through
+ * the passwd/group databases when possible.
+ *
+ * Returns 0 on success and -1 if a pair's value is not an nvlist.
+ */
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = fsperm->fsp_set;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *name = nvpair_name(nvp);
+		uu_avl_t *avl = NULL;
+		uu_avl_pool_t *avl_pool = NULL;
+		zfs_deleg_who_type_t perm_type = name[0];
+		char perm_locality = name[1];
+		const char *perm_name = name + 3;
+		who_perm_t *who_perm = NULL;
+
+		assert('$' == name[2]);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+			return (-1);
+
+		switch (perm_type) {
+		case ZFS_DELEG_CREATE:
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_NAMED_SET:
+		case ZFS_DELEG_NAMED_SET_SETS:
+			avl_pool = fspset->fsps_named_set_avl_pool;
+			avl = fsperm->fsp_sc_avl;
+			break;
+		case ZFS_DELEG_USER:
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_GROUP:
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_EVERYONE:
+		case ZFS_DELEG_EVERYONE_SETS:
+			avl_pool = fspset->fsps_who_perm_avl_pool;
+			avl = fsperm->fsp_uge_avl;
+			break;
+
+		default:
+			assert(!"unhandled zfs_deleg_who_type_t");
+		}
+
+		/* scratch node; doubles as the lookup key */
+		who_perm_node_t *found_node = NULL;
+		who_perm_node_t *node = safe_malloc(
+		    sizeof (who_perm_node_t));
+		who_perm = &node->who_perm;
+		uu_avl_index_t idx = 0;
+
+		uu_avl_node_init(node, &node->who_avl_node, avl_pool);
+		who_perm_init(who_perm, fsperm, perm_type, perm_name);
+
+		if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+		    == NULL) {
+			if (avl == fsperm->fsp_uge_avl) {
+				uid_t rid = 0;
+				struct passwd *p = NULL;
+				struct group *g = NULL;
+				const char *nice_name = NULL;
+
+				switch (perm_type) {
+				case ZFS_DELEG_USER_SETS:
+				case ZFS_DELEG_USER:
+					rid = atoi(perm_name);
+					p = getpwuid(rid);
+					if (p)
+						nice_name = p->pw_name;
+					break;
+				case ZFS_DELEG_GROUP_SETS:
+				case ZFS_DELEG_GROUP:
+					rid = atoi(perm_name);
+					g = getgrgid(rid);
+					if (g)
+						nice_name = g->gr_name;
+					break;
+
+				default:
+					break;
+				}
+
+				if (nice_name != NULL) {
+					(void) strlcpy(
+					    node->who_perm.who_ug_name,
+					    nice_name,
+					    sizeof (node->who_perm.who_ug_name));
+				} else {
+					/* User or group unknown */
+					(void) snprintf(
+					    node->who_perm.who_ug_name,
+					    sizeof (node->who_perm.who_ug_name),
+					    "(unknown: %d)", rid);
+				}
+			}
+
+			uu_avl_insert(avl, node, idx);
+		} else {
+			/*
+			 * This who is already present (e.g. the local key was
+			 * seen and this is the descendent key).  Release the
+			 * scratch node and the tree who_perm_init() created
+			 * for it instead of leaking them.
+			 */
+			who_perm_fini(who_perm);
+			free(node);
+
+			node = found_node;
+			who_perm = &node->who_perm;
+		}
+
+		assert(who_perm != NULL);
+		(void) parse_who_perm(who_perm, nvl2, perm_locality);
+	}
+
+	return (0);
+}
+
+/*
+ * Build the per-file-system permission list of *fspset from the nvlist
+ * obtained from zfs_get_fsacl().  Each pair is named after a file system
+ * and carries that file system's fsacl as a nested nvlist; records are
+ * appended in nvlist order.
+ *
+ * Returns 0 on success and -1 if a pair's value cannot be read as an
+ * nvlist.
+ */
+static inline int
+parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *fsname = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		fs_perm_t *fsperm = NULL;
+		fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
+		if (node == NULL)
+			nomem();
+
+		fsperm = &node->fspn_fsperm;
+
+		VERIFY(DATA_TYPE_NVLIST == type);
+
+		uu_list_node_init(node, &node->fspn_list_node,
+		    fspset->fsps_list_pool);
+
+		fs_perm_init(fsperm, fspset, fsname);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0) {
+			/* not linked into the list yet: release it here */
+			fs_perm_fini(fsperm);
+			free(node);
+			return (-1);
+		}
+
+		(void) parse_fs_perm(fsperm, nvl2);
+
+		/*
+		 * Append at the tail.  uu_list_insert() expects an opaque
+		 * index obtained from uu_list_find(); a plain node count is
+		 * not such an index, so use the positional insert instead
+		 * (a NULL target means "end of list").
+		 */
+		(void) uu_list_insert_before(fspset->fsps_list, NULL, node);
+	}
+
+	return (0);
+}
+
+/*
+ * Return the (translated) text shown in the NOTES column of the
+ * permission table printed by allow_usage().  Notes without an
+ * explanation yield the empty string.
+ *
+ * Notes that have nothing to say return a literal "" rather than
+ * gettext(""): the empty msgid is reserved by GNU gettext for the
+ * catalog header, so translating it would print that header whenever a
+ * message catalog is loaded.
+ */
+static inline const char *
+deleg_perm_comment(zfs_deleg_note_t note)
+{
+	const char *str = "";
+
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+	case ZFS_DELEG_NOTE_ALLOW:
+		str = gettext("Must also have the permission that is being"
+		    "\n\t\t\t\tallowed");
+		break;
+	case ZFS_DELEG_NOTE_CLONE:
+		str = gettext("Must also have the 'create' ability and 'mount'"
+		    "\n\t\t\t\tability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_CREATE:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DESTROY:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DIFF:
+		str = gettext("Allows lookup of paths within a dataset;"
+		    "\n\t\t\t\tgiven an object number. Ordinary users need this"
+		    "\n\t\t\t\tin order to use zfs diff");
+		break;
+	case ZFS_DELEG_NOTE_HOLD:
+		str = gettext("Allows adding a user hold to a snapshot");
+		break;
+	case ZFS_DELEG_NOTE_MOUNT:
+		str = gettext("Allows mount/umount of ZFS datasets");
+		break;
+	case ZFS_DELEG_NOTE_PROMOTE:
+		str = gettext("Must also have the 'mount'\n\t\t\t\tand"
+		    " 'promote' ability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_RECEIVE:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    " ability");
+		break;
+	case ZFS_DELEG_NOTE_RELEASE:
+		str = gettext("Allows releasing a user hold which\n\t\t\t\t"
+		    "might destroy the snapshot");
+		break;
+	case ZFS_DELEG_NOTE_RENAME:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    "\n\t\t\t\tability in the new parent");
+		break;
+	case ZFS_DELEG_NOTE_ROLLBACK:
+	case ZFS_DELEG_NOTE_SEND:
+	case ZFS_DELEG_NOTE_SNAPSHOT:
+		/* nothing to add; see the header comment about gettext("") */
+		str = "";
+		break;
+	case ZFS_DELEG_NOTE_SHARE:
+		str = gettext("Allows sharing file systems over NFS or SMB"
+		    "\n\t\t\t\tprotocols");
+		break;
+	case ZFS_DELEG_NOTE_LOAD_KEY:
+		str = gettext("Allows loading or unloading an encryption key");
+		break;
+	case ZFS_DELEG_NOTE_CHANGE_KEY:
+		str = gettext("Allows changing or adding an encryption key");
+		break;
+/*
+ *	case ZFS_DELEG_NOTE_VSCAN:
+ *		str = "";
+ *		break;
+ */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+		str = gettext("Allows accessing any groupquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_GROUPUSED:
+		str = gettext("Allows reading any groupused@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERPROP:
+		str = gettext("Allows changing any user property");
+		break;
+	case ZFS_DELEG_NOTE_USERQUOTA:
+		str = gettext("Allows accessing any userquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERUSED:
+		str = gettext("Allows reading any userused@... property");
+		break;
+	case ZFS_DELEG_NOTE_USEROBJQUOTA:
+		str = gettext("Allows accessing any userobjquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_GROUPOBJQUOTA:
+		str = gettext("Allows accessing any \n\t\t\t\t"
+		    "groupobjquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_GROUPOBJUSED:
+		str = gettext("Allows reading any groupobjused@... property");
+		break;
+	case ZFS_DELEG_NOTE_USEROBJUSED:
+		str = gettext("Allows reading any userobjused@... property");
+		break;
+	case ZFS_DELEG_NOTE_PROJECTQUOTA:
+		str = gettext("Allows accessing any projectquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_PROJECTOBJQUOTA:
+		str = gettext("Allows accessing any \n\t\t\t\t"
+		    "projectobjquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_PROJECTUSED:
+		str = gettext("Allows reading any projectused@... property");
+		break;
+	case ZFS_DELEG_NOTE_PROJECTOBJUSED:
+		str = gettext("Allows accessing any \n\t\t\t\t"
+		    "projectobjused@... property");
+		break;
+		/* other */
+	default:
+		str = "";
+	}
+
+	return (str);
+}
+
+/*
+ * Parsed command line of "zfs allow" / "zfs unallow".  The option flags
+ * are set while scanning options in zfs_do_allow_unallow_impl(); the
+ * remaining fields are filled in by parse_allow_args().
+ */
+struct allow_opts {
+	boolean_t local;	/* -l; also forced on when neither -l nor -d */
+	boolean_t descend;	/* -d; also forced on when neither -l nor -d */
+	boolean_t user;		/* -u */
+	boolean_t group;	/* -g */
+	boolean_t everyone;	/* -e, or a literal "everyone" argument */
+	boolean_t create;	/* -c */
+	boolean_t set;		/* -s */
+	boolean_t recursive; /* unallow only */
+	boolean_t prt_usage;	/* -h */
+
+	boolean_t prt_perms;	/* only a dataset given: print permissions */
+	char *who;		/* who argument (points into argv) */
+	char *perms;		/* permission list argument, or NULL */
+	const char *dataset;	/* dataset argument (last in argv) */
+};
+
+/*
+ * qsort() comparator for an array of C strings: each argument points at a
+ * `const char *' element and the strings are compared with strcmp().
+ */
+static inline int
+prop_cmp(const void *a, const void *b)
+{
+	const char *const *lhs = a;
+	const char *const *rhs = b;
+
+	return (strcmp(*lhs, *rhs));
+}
+
+/*
+ * Print the usage of "zfs allow" (or "zfs unallow" when `un' is set),
+ * followed by the table of delegable permissions and the sorted list of
+ * delegable properties, then exit.
+ *
+ * When `requested' is set the text goes to stdout and the exit status is
+ * 0; otherwise it goes to stderr and the exit status is 2.  A non-NULL
+ * `msg' is appended as an error line.  This function does not return.
+ */
+static void
+allow_usage(boolean_t un, boolean_t requested, const char *msg)
+{
+	const char *opt_desc[] = {
+		"-h", gettext("show this help message and exit"),
+		"-l", gettext("set permission locally"),
+		"-d", gettext("set permission for descents"),
+		"-u", gettext("set permission for user"),
+		"-g", gettext("set permission for group"),
+		"-e", gettext("set permission for everyone"),
+		"-c", gettext("set create time permission"),
+		"-s", gettext("define permission set"),
+		/* unallow only */
+		"-r", gettext("remove permissions recursively"),
+	};
+	size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
+	/* "allow" does not know the trailing -r option/description pair */
+	size_t allow_size = unallow_size - 2;
+	size_t opt_size = un ? unallow_size : allow_size;
+	const char *props[ZFS_NUM_PROPS];
+	int i;
+	size_t n;
+	size_t count = 0;
+	FILE *fp = requested ? stdout : stderr;
+	zprop_desc_t *pdtbl = zfs_prop_get_table();
+	const char *fmt = gettext("%-16s %-14s\t%s\n");
+
+	(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
+	    HELP_ALLOW));
+	(void) fprintf(fp, gettext("Options:\n"));
+	for (n = 0; n < opt_size; n += 2) {
+		const char *opt = opt_desc[n];
+		const char *optdsc = opt_desc[n + 1];
+		(void) fprintf(fp, gettext("  %-10s  %s\n"), opt, optdsc);
+	}
+
+	(void) fprintf(fp, gettext("\nThe following permissions are "
+	    "supported:\n\n"));
+	(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
+	    gettext("NOTES"));
+	for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
+		const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
+		zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
+		const char *perm_type = deleg_perm_type(perm_note);
+		const char *perm_comment = deleg_perm_comment(perm_note);
+		(void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
+	}
+
+	/* collect the visible, writable properties */
+	for (i = 0; i < ZFS_NUM_PROPS; i++) {
+		zprop_desc_t *pd = &pdtbl[i];
+		if (pd->pd_visible != B_TRUE)
+			continue;
+
+		if (pd->pd_attr == PROP_READONLY)
+			continue;
+
+		props[count++] = pd->pd_name;
+	}
+
+	/*
+	 * No NULL terminator is stored: only the first `count' entries are
+	 * ever read, and writing props[count] would run past the array if
+	 * every property qualified.
+	 */
+	qsort(props, count, sizeof (char *), prop_cmp);
+
+	for (n = 0; n < count; n++)
+		(void) fprintf(fp, fmt, props[n], gettext("property"), "");
+
+	if (msg != NULL)
+		(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
+
+	exit(requested ? 0 : 2);
+}
+
+/*
+ * Pick the permission list and the dataset out of the trailing
+ * arguments.  With exactly `expected_argc' arguments the second to last
+ * one is stored in *permsp; for unallow, one argument fewer is accepted
+ * too and *permsp is set to NULL.  Any other count is reported through
+ * allow_usage().  Returns the last argument (the dataset).
+ */
+static inline const char *
+munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
+    char **permsp)
+{
+	boolean_t perms_omitted = (un && argc == expected_argc - 1);
+
+	if (perms_omitted) {
+		*permsp = NULL;
+	} else if (argc == expected_argc) {
+		*permsp = argv[argc - 2];
+	} else {
+		allow_usage(un, B_FALSE,
+		    gettext("wrong number of parameters\n"));
+	}
+
+	return (argv[argc - 1]);
+}
+
+/*
+ * Validate the option combination and interpret the non-option arguments
+ * of allow/unallow, filling in opts->who, opts->perms, opts->dataset and
+ * opts->prt_perms.  Invalid combinations are reported through
+ * allow_usage() or usage().  When neither -l nor -d was given, both are
+ * turned on.
+ */
+static void
+parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
+{
+	/* how many options of each group were given */
+	int uge_cnt = opts->user + opts->group + opts->everyone;
+	int csuge_cnt = opts->create + opts->set + uge_cnt;
+	int ldcsuge_cnt = csuge_cnt + opts->local + opts->descend;
+	int all_cnt = ldcsuge_cnt;
+
+	if (un)
+		all_cnt += opts->recursive;
+
+	if (uge_cnt > 1) {
+		allow_usage(un, B_FALSE,
+		    gettext("-u, -g, and -e are mutually exclusive\n"));
+	}
+
+	if (opts->prt_usage) {
+		/* -h is only honoured when it stands alone */
+		if (argc == 0 && all_cnt == 0)
+			allow_usage(un, B_TRUE, NULL);
+		else
+			usage(B_FALSE);
+	}
+
+	if (opts->set) {
+		if (csuge_cnt > 1) {
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -s\n"));
+		}
+
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		if (argv[0][0] != '@') {
+			allow_usage(un, B_FALSE,
+			    gettext("invalid set name: missing '@' prefix\n"));
+		}
+		opts->who = argv[0];
+	} else if (opts->create) {
+		if (ldcsuge_cnt > 1) {
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -c\n"));
+		}
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (opts->everyone) {
+		if (csuge_cnt > 1) {
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -e\n"));
+		}
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (uge_cnt == 0 && argc > 0 &&
+	    strcmp(argv[0], "everyone") == 0) {
+		/* a literal "everyone" who behaves like -e */
+		opts->everyone = B_TRUE;
+		argc--;
+		argv++;
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (argc == 1 && !un) {
+		/* "zfs allow <dataset>": just print the permissions */
+		opts->prt_perms = B_TRUE;
+		opts->dataset = argv[argc-1];
+	} else {
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		opts->who = argv[0];
+	}
+
+	if (!(opts->local || opts->descend)) {
+		opts->local = B_TRUE;
+		opts->descend = B_TRUE;
+	}
+}
+
+/*
+ * Format the fsacl key "<type><locality>$<who>" into buf.  A NULL `who'
+ * produces a key that ends right after the '$'.
+ */
+static void
+store_allow_perm_key(char *buf, size_t buflen, char type, char locality,
+    const char *who)
+{
+	(void) snprintf(buf, buflen, "%c%c$%s", type, locality,
+	    who != NULL ? who : "");
+}
+
+/*
+ * Add the entries describing one who/permission-list combination to
+ * top_nvl.
+ *
+ * `perms' is a comma separated list; names starting with '@' are
+ * permission sets and are stored under the "_SETS" variant of `type',
+ * the others under the base type.  For user, group and everyone types one
+ * key is emitted per requested locality (`local', `descend'); named-set
+ * and create-time types always use ZFS_DELEG_NA.  With a NULL `perms'
+ * (unallow without a permission list) the keys are stored as plain
+ * booleans for both the base and the set type.
+ *
+ * `perms' is modified temporarily while it is tokenized but is restored
+ * before returning.
+ */
+static void
+store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
+    const char *who, char *perms, nvlist_t *top_nvl)
+{
+	int i;
+	char ld[2] = { '\0', '\0' };
+	char who_buf[MAXNAMELEN + 32];
+	char base_type = '\0';
+	char set_type = '\0';
+	nvlist_t *base_nvl = NULL;
+	nvlist_t *set_nvl = NULL;
+	nvlist_t *nvl;
+
+	if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) !=  0)
+		nomem();
+
+	switch (type) {
+	case ZFS_DELEG_NAMED_SET_SETS:
+	case ZFS_DELEG_NAMED_SET:
+		set_type = ZFS_DELEG_NAMED_SET_SETS;
+		base_type = ZFS_DELEG_NAMED_SET;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_CREATE_SETS:
+	case ZFS_DELEG_CREATE:
+		set_type = ZFS_DELEG_CREATE_SETS;
+		base_type = ZFS_DELEG_CREATE;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_USER_SETS:
+	case ZFS_DELEG_USER:
+		set_type = ZFS_DELEG_USER_SETS;
+		base_type = ZFS_DELEG_USER;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_GROUP_SETS:
+	case ZFS_DELEG_GROUP:
+		set_type = ZFS_DELEG_GROUP_SETS;
+		base_type = ZFS_DELEG_GROUP;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_EVERYONE_SETS:
+	case ZFS_DELEG_EVERYONE:
+		set_type = ZFS_DELEG_EVERYONE_SETS;
+		base_type = ZFS_DELEG_EVERYONE;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+
+	default:
+		assert(set_type != '\0' && base_type != '\0');
+	}
+
+	if (perms != NULL) {
+		char *curr = perms;
+		char *end = curr + strlen(perms);
+
+		/* sort the listed names into plain permissions and sets */
+		while (curr < end) {
+			char *delim = strchr(curr, ',');
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			if (curr[0] == '@')
+				nvl = set_nvl;
+			else
+				nvl = base_nvl;
+
+			(void) nvlist_add_boolean(nvl, curr);
+			/* put the separator back so perms can be reused */
+			if (delim != end)
+				*delim = ',';
+			curr = delim + 1;
+		}
+
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			if (!nvlist_empty(base_nvl)) {
+				store_allow_perm_key(who_buf,
+				    sizeof (who_buf), base_type, locality,
+				    who);
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    base_nvl);
+			}
+
+			if (!nvlist_empty(set_nvl)) {
+				store_allow_perm_key(who_buf,
+				    sizeof (who_buf), set_type, locality,
+				    who);
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    set_nvl);
+			}
+		}
+	} else {
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			store_allow_perm_key(who_buf, sizeof (who_buf),
+			    base_type, locality, who);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+
+			store_allow_perm_key(who_buf, sizeof (who_buf),
+			    set_type, locality, who);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+		}
+	}
+
+	/*
+	 * nvlist_add_nvlist() stores a copy, so the scratch lists are
+	 * still ours to release.
+	 */
+	nvlist_free(base_nvl);
+	nvlist_free(set_nvl);
+}
+
+/*
+ * Build in *nvlp the nvlist that describes the requested change for
+ * zfs_set_fsacl().
+ *
+ * For -s, -c and -e a single store_allow_perm() call is made.  Otherwise
+ * opts->who is a comma separated list of users and/or groups; each entry
+ * may be a name or a numeric id and is stored under its numeric id.  With
+ * -u or -g the entry is looked up as a user or group respectively; an
+ * all-numeric entry without a matching database record is accepted as a
+ * raw id.  Without -u/-g the entry is tried as a user first and as a
+ * group second and must resolve.
+ *
+ * An entry that cannot be resolved is reported through allow_usage() as
+ * an error (stderr, non-zero exit status).  Always returns 0.
+ *
+ * Separators in opts->who are overwritten with NULs while parsing.
+ */
+static int
+construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
+{
+	if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	if (opts->set) {
+		store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
+		    opts->descend, opts->who, opts->perms, *nvlp);
+	} else if (opts->create) {
+		store_allow_perm(ZFS_DELEG_CREATE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else if (opts->everyone) {
+		store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else {
+		char *curr = opts->who;
+		char *end = curr + strlen(curr);
+
+		while (curr < end) {
+			const char *who;
+			zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
+			char *endch;
+			char *delim = strchr(curr, ',');
+			char errbuf[256];
+			char id[64];
+			struct passwd *p = NULL;
+			struct group *g = NULL;
+
+			uid_t rid;
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			/* *endch != '\0' means the entry is not a pure number */
+			rid = (uid_t)strtol(curr, &endch, 0);
+			if (opts->user) {
+				who_type = ZFS_DELEG_USER;
+				if (*endch != '\0')
+					p = getpwnam(curr);
+				else
+					p = getpwuid(rid);
+
+				if (p != NULL)
+					rid = p->pw_uid;
+				else if (*endch != '\0') {
+					(void) snprintf(errbuf,
+					    sizeof (errbuf), gettext(
+					    "invalid user %s\n"), curr);
+					allow_usage(un, B_FALSE, errbuf);
+				}
+			} else if (opts->group) {
+				who_type = ZFS_DELEG_GROUP;
+				if (*endch != '\0')
+					g = getgrnam(curr);
+				else
+					g = getgrgid(rid);
+
+				if (g != NULL)
+					rid = g->gr_gid;
+				else if (*endch != '\0') {
+					(void) snprintf(errbuf,
+					    sizeof (errbuf), gettext(
+					    "invalid group %s\n"),  curr);
+					allow_usage(un, B_FALSE, errbuf);
+				}
+			} else {
+				if (*endch != '\0') {
+					p = getpwnam(curr);
+				} else {
+					p = getpwuid(rid);
+				}
+
+				if (p == NULL) {
+					if (*endch != '\0') {
+						g = getgrnam(curr);
+					} else {
+						g = getgrgid(rid);
+					}
+				}
+
+				if (p != NULL) {
+					who_type = ZFS_DELEG_USER;
+					rid = p->pw_uid;
+				} else if (g != NULL) {
+					who_type = ZFS_DELEG_GROUP;
+					rid = g->gr_gid;
+				} else {
+					(void) snprintf(errbuf,
+					    sizeof (errbuf), gettext(
+					    "invalid user/group %s\n"), curr);
+					allow_usage(un, B_FALSE, errbuf);
+				}
+			}
+
+			(void) snprintf(id, sizeof (id), "%u", rid);
+			who = id;
+
+			store_allow_perm(who_type, opts->local,
+			    opts->descend, who, opts->perms, *nvlp);
+			curr = delim + 1;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print the permission sets and create-time permissions held in who_avl.
+ * A section title is emitted whenever the who-type rank changes from one
+ * entry to the next; each entry is printed on its own line as a tab, the
+ * who name (if any) and the comma separated permission names.
+ */
+static void
+print_set_creat_perms(uu_avl_t *who_avl)
+{
+	const char *sc_title[] = {
+		gettext("Permission sets:\n"),
+		gettext("Create time permissions:\n"),
+		NULL
+	};
+	who_perm_node_t *entry;
+	int last_weight = -1;
+
+	entry = uu_avl_first(who_avl);
+	while (entry != NULL) {
+		const who_perm_t *wp = &entry->who_perm;
+		uu_avl_t *perm_avl = wp->who_deleg_perm_avl;
+		int weight = who_type2weight(wp->who_type);
+		const char *sep = "";
+		deleg_perm_node_t *perm;
+
+		if (weight != last_weight) {
+			(void) printf("%s", sc_title[weight]);
+			last_weight = weight;
+		}
+
+		if (wp->who_name != NULL && strnlen(wp->who_name, 1) != 0)
+			(void) printf("\t%s ", wp->who_name);
+		else
+			(void) printf("\t");
+
+		for (perm = uu_avl_first(perm_avl); perm != NULL;
+		    perm = uu_avl_next(perm_avl, perm)) {
+			(void) printf("%s%s", sep, perm->dpn_perm.dp_name);
+			sep = ",";
+		}
+
+		(void) printf("\n");
+
+		entry = uu_avl_next(who_avl, entry);
+	}
+}
+
+/*
+ * Print the user/group/everyone permissions in who_avl whose locality
+ * flags match `local' and `descend' exactly.  `title' is printed once,
+ * before the first matching permission; nothing at all is printed when
+ * there is no match.  Each who with at least one match gets one line:
+ * a tab, the who kind and name, then the comma separated permissions.
+ */
+static void
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
+    const char *title)
+{
+	who_perm_node_t *entry = NULL;
+	boolean_t title_pending = B_TRUE;
+	uu_avl_walk_t *walk;
+
+	walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST);
+	if (walk == NULL)
+		nomem();
+
+	while ((entry = uu_avl_walk_next(walk)) != NULL) {
+		const char *who_name = entry->who_perm.who_name;
+		const char *nice_who_name = entry->who_perm.who_ug_name;
+		uu_avl_t *perm_avl = entry->who_perm.who_deleg_perm_avl;
+		zfs_deleg_who_type_t who_type = entry->who_perm.who_type;
+		char sep = ' ';
+		deleg_perm_node_t *perm;
+		boolean_t who_printed = B_FALSE;
+
+		for (perm = uu_avl_first(perm_avl); perm != NULL;
+		    perm = uu_avl_next(perm_avl, perm)) {
+			if (local != perm->dpn_perm.dp_local ||
+			    descend != perm->dpn_perm.dp_descend)
+				continue;
+
+			/* first match for this who: emit the line prefix */
+			if (!who_printed) {
+				const char *who = NULL;
+
+				if (title_pending) {
+					title_pending = B_FALSE;
+					(void) printf("%s", title);
+				}
+
+				switch (who_type) {
+				case ZFS_DELEG_USER_SETS:
+				case ZFS_DELEG_USER:
+					who = gettext("user");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_GROUP_SETS:
+				case ZFS_DELEG_GROUP:
+					who = gettext("group");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_EVERYONE_SETS:
+				case ZFS_DELEG_EVERYONE:
+					who = gettext("everyone");
+					who_name = NULL;
+					break;
+
+				default:
+					assert(who != NULL);
+				}
+
+				who_printed = B_TRUE;
+				if (who_name != NULL)
+					(void) printf("\t%s %s", who, who_name);
+				else
+					(void) printf("\t%s", who);
+			}
+
+			(void) printf("%c%s", sep, perm->dpn_perm.dp_name);
+			sep = ',';
+		}
+
+		if (who_printed)
+			(void) printf("\n");
+	}
+
+	uu_avl_walk_end(walk);
+}
+
+/*
+ * Print every file system record of fspset: a banner padded with '-' up
+ * to 70 columns, the permission sets and create-time permissions, and
+ * then the local, descendent and local+descendent permissions.
+ */
+static void
+print_fs_perms(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *fsnode;
+	char banner[MAXNAMELEN + 32];
+
+	fsnode = uu_list_first(fspset->fsps_list);
+	while (fsnode != NULL) {
+		fs_perm_t *fsperm = &fsnode->fspn_fsperm;
+		int pad;
+
+		(void) snprintf(banner, sizeof (banner),
+		    gettext("---- Permissions on %s "), fsperm->fsp_name);
+		(void) printf("%s", banner);
+
+		/* fill the rest of the line with dashes */
+		for (pad = 70 - strlen(banner); pad > 0; pad--)
+			(void) printf("-");
+		(void) printf("\n");
+
+		print_set_creat_perms(fsperm->fsp_sc_avl);
+		print_uge_deleg_perms(fsperm->fsp_uge_avl, B_TRUE, B_FALSE,
+		    gettext("Local permissions:\n"));
+		print_uge_deleg_perms(fsperm->fsp_uge_avl, B_FALSE, B_TRUE,
+		    gettext("Descendent permissions:\n"));
+		print_uge_deleg_perms(fsperm->fsp_uge_avl, B_TRUE, B_TRUE,
+		    gettext("Local+Descendent permissions:\n"));
+
+		fsnode = uu_list_next(fspset->fsps_list, fsnode);
+	}
+}
+
+static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
+
+/* argument bundle handed to the set_deleg_perms() iterator callback */
+struct deleg_perms {
+	boolean_t un;	/* passed through to zfs_set_fsacl() */
+	nvlist_t *nvl;	/* permission change to apply */
+};
+
+/*
+ * Iterator callback: apply the permission change described by `data' (a
+ * struct deleg_perms) to zhp.  Handles that are neither a file system nor
+ * a volume are skipped with a return value of 0; otherwise the result of
+ * zfs_set_fsacl() is returned.
+ */
+static int
+set_deleg_perms(zfs_handle_t *zhp, void *data)
+{
+	struct deleg_perms *change = data;
+
+	switch (zfs_get_type(zhp)) {
+	case ZFS_TYPE_FILESYSTEM:
+	case ZFS_TYPE_VOLUME:
+		return (zfs_set_fsacl(zhp, change->un, change->nvl));
+	default:
+		return (0);
+	}
+}
+
+/*
+ * Common implementation of "zfs allow" and "zfs unallow" (`un' set).
+ *
+ * Parses options and arguments, opens the dataset and reads its current
+ * fsacl.  With only a dataset argument the permissions are printed;
+ * otherwise the requested change is built and applied with
+ * zfs_set_fsacl(), and for "unallow -r" also applied to the dataset's
+ * descendent file systems via zfs_iter_filesystems().
+ *
+ * Returns 0 on success, -1 if the dataset cannot be opened and 1 on any
+ * other failure.
+ */
+static int
+zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
+{
+	zfs_handle_t *zhp;
+	nvlist_t *perm_nvl = NULL;
+	nvlist_t *update_perm_nvl = NULL;
+	int error = 1;
+	int c;
+	struct allow_opts opts = { 0 };
+
+	/* -r is accepted by unallow only */
+	const char *optstr = un ? "ldugecsrh" : "ldugecsh";
+
+	/* check opts */
+	while ((c = getopt(argc, argv, optstr)) != -1) {
+		switch (c) {
+		case 'l':
+			opts.local = B_TRUE;
+			break;
+		case 'd':
+			opts.descend = B_TRUE;
+			break;
+		case 'u':
+			opts.user = B_TRUE;
+			break;
+		case 'g':
+			opts.group = B_TRUE;
+			break;
+		case 'e':
+			opts.everyone = B_TRUE;
+			break;
+		case 's':
+			opts.set = B_TRUE;
+			break;
+		case 'c':
+			opts.create = B_TRUE;
+			break;
+		case 'r':
+			opts.recursive = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case 'h':
+			opts.prt_usage = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	parse_allow_args(argc, argv, un, &opts);
+
+	/* try to open the dataset */
+	if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) == NULL) {
+		(void) fprintf(stderr, "Failed to open dataset: %s\n",
+		    opts.dataset);
+		return (-1);
+	}
+
+	if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
+		goto cleanup2;
+
+	fs_perm_set_init(&fs_perm_set);
+	if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
+		(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
+		goto cleanup1;
+	}
+
+	if (opts.prt_perms)
+		print_fs_perms(&fs_perm_set);
+	else {
+		(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
+		if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
+			goto cleanup0;
+
+		if (un && opts.recursive) {
+			struct deleg_perms data = { un, update_perm_nvl };
+			if (zfs_iter_filesystems(zhp, set_deleg_perms,
+			    &data) != 0)
+				goto cleanup0;
+		}
+	}
+
+	error = 0;
+
+cleanup0:
+	nvlist_free(update_perm_nvl);
+cleanup1:
+	fs_perm_set_fini(&fs_perm_set);
+	/*
+	 * Released here rather than under cleanup0 so that a parse failure
+	 * does not leak it, and only after the parsed records (which hold
+	 * pointers to names inside it) are gone.
+	 */
+	nvlist_free(perm_nvl);
+cleanup2:
+	zfs_close(zhp);
+
+	return (error);
+}
+
+/*
+ * Handler for the allow subcommand: runs the shared allow/unallow
+ * implementation with 'un' cleared and returns its result unchanged.
+ */
+static int
+zfs_do_allow(int argc, char **argv)
+{
+	int rc = zfs_do_allow_unallow_impl(argc, argv, B_FALSE);
+
+	return (rc);
+}
+
+/*
+ * Handler for the unallow subcommand: runs the shared allow/unallow
+ * implementation with 'un' set and returns its result unchanged.
+ */
+static int
+zfs_do_unallow(int argc, char **argv)
+{
+	int rc = zfs_do_allow_unallow_impl(argc, argv, B_TRUE);
+
+	return (rc);
+}
+
+/*
+ * Shared implementation of 'zfs hold' and 'zfs release'.
+ *
+ * After the options, argv holds a tag followed by one or more snapshot
+ * names of the form <dataset>@<snapshot>.  For each name the part before
+ * '@' is opened as a filesystem or volume and the part after it is passed
+ * to zfs_hold() (holding == B_TRUE) or zfs_release() (holding == B_FALSE).
+ *
+ * The option string for 'hold' also contains 't', but no case handles
+ * it, so that flag is accepted and has no effect.
+ *
+ * Returns 0 if every snapshot was processed successfully, 1 otherwise.
+ */
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+	int errors = 0;
+	int i;
+	const char *tag;
+	boolean_t recursive = B_FALSE;
+	const char *opts = holding ? "rt" : "r";
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 2)
+		usage(B_FALSE);
+
+	tag = argv[0];
+	--argc;
+	++argv;
+
+	if (holding && tag[0] == '.') {
+		/* tags starting with '.' are reserved for libzfs */
+		(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+		usage(B_FALSE);
+	}
+
+	for (i = 0; i < argc; ++i) {
+		zfs_handle_t *zhp;
+		char parent[ZFS_MAX_DATASET_NAME_LEN];
+		const char *delim;
+		char *path = argv[i];
+		size_t parent_len;
+
+		delim = strchr(path, '@');
+		if (delim == NULL) {
+			(void) fprintf(stderr,
+			    gettext("'%s' is not a snapshot\n"), path);
+			++errors;
+			continue;
+		}
+
+		/*
+		 * The name comes straight from the command line, so make
+		 * sure the dataset part and its terminator fit in 'parent'
+		 * before copying; otherwise the copy would run past the end
+		 * of the stack buffer.
+		 */
+		parent_len = (size_t)(delim - path);
+		if (parent_len >= sizeof (parent)) {
+			(void) fprintf(stderr,
+			    gettext("'%s': dataset name is too long\n"), path);
+			++errors;
+			continue;
+		}
+		(void) strncpy(parent, path, parent_len);
+		parent[parent_len] = '\0';
+
+		zhp = zfs_open(g_zfs, parent,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			++errors;
+			continue;
+		}
+		if (holding) {
+			if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
+				++errors;
+		} else {
+			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+				++errors;
+		}
+		zfs_close(zhp);
+	}
+
+	return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] [-t] <tag> <snap> ...
+ *
+ *	-r	Recursively hold
+ *
+ * Place a user hold named <tag> on each listed snapshot.  The work is
+ * done by zfs_do_hold_rele_impl() with 'holding' set.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+	int rc = zfs_do_hold_rele_impl(argc, argv, B_TRUE);
+
+	return (rc);
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ *	-r	Recursively release
+ *
+ * Remove the user hold named <tag> from each listed snapshot.  The work
+ * is done by zfs_do_hold_rele_impl() with 'holding' cleared.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+	int rc = zfs_do_hold_rele_impl(argc, argv, B_FALSE);
+
+	return (rc);
+}
+
+/*
+ * State shared between zfs_do_holds() and holds_callback().
+ *
+ *	cb_recursive	when set, the callback only accepts datasets whose
+ *			name contains '@' followed by exactly cb_snapname
+ *	cb_snapname	snapshot component of the argument being processed
+ *	cb_nvlp		points at the nvlist that accumulates one entry per
+ *			dataset, keyed by dataset name
+ *	cb_max_namelen	longest dataset name seen so far (column width)
+ *	cb_max_taglen	longest hold tag seen so far (column width)
+ */
+typedef struct holds_cbdata {
+	boolean_t	cb_recursive;
+	const char	*cb_snapname;
+	nvlist_t	**cb_nvlp;
+	size_t		cb_max_namelen;
+	size_t		cb_max_taglen;
+} holds_cbdata_t;
+
+#define	STRFTIME_FMT_STR "%a %b %e %H:%M %Y"
+#define	DATETIME_BUF_LEN (32)
+/*
+ * Print the holds collected in 'nvl'.
+ *
+ * 'nvl' is walked as a list of (dataset name -> nvlist) pairs, and each
+ * inner nvlist as a list of (tag -> uint64) pairs.  The uint64 is treated
+ * as a time_t and rendered in local time using STRFTIME_FMT_STR.
+ *
+ * Unless 'scripted' is set, a NAME/TAG/TIMESTAMP header is printed first
+ * and the name and tag columns are left-justified to 'nwidth' and
+ * 'tagwidth' characters.  In scripted mode there is no header and the
+ * columns are separated by single tabs.
+ */
+static void
+print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl)
+{
+	nvpair_t *ds_pair;
+
+	if (!scripted) {
+		(void) printf("%-*s  ", nwidth, gettext("NAME"));
+		(void) printf("%-*s  ", tagwidth, gettext("TAG"));
+		(void) printf("%s\n", gettext("TIMESTAMP"));
+	}
+
+	for (ds_pair = nvlist_next_nvpair(nvl, NULL); ds_pair != NULL;
+	    ds_pair = nvlist_next_nvpair(nvl, ds_pair)) {
+		char *zname = nvpair_name(ds_pair);
+		nvlist_t *holds;
+		nvpair_t *hold;
+
+		(void) nvpair_value_nvlist(ds_pair, &holds);
+
+		for (hold = nvlist_next_nvpair(holds, NULL); hold != NULL;
+		    hold = nvlist_next_nvpair(holds, hold)) {
+			char tsbuf[DATETIME_BUF_LEN];
+			char *tagname = nvpair_name(hold);
+			uint64_t val = 0;
+			time_t when;
+			struct tm when_tm;
+
+			/* convert the stored value into a local timestamp */
+			(void) nvpair_value_uint64(hold, &val);
+			when = (time_t)val;
+			(void) localtime_r(&when, &when_tm);
+			(void) strftime(tsbuf, DATETIME_BUF_LEN,
+			    gettext(STRFTIME_FMT_STR), &when_tm);
+
+			if (!scripted) {
+				(void) printf("%-*s  %-*s  %s\n", nwidth,
+				    zname, tagwidth, tagname, tsbuf);
+				continue;
+			}
+
+			(void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf);
+		}
+	}
+}
+
+/*
+ * zfs_for_each() callback used by zfs_do_holds().
+ *
+ * Fetches the holds of 'zhp' with zfs_get_holds(), widens the name and
+ * tag column widths recorded in 'data' (a holds_cbdata_t) as needed, and
+ * stores the holds in the accumulating nvlist under the dataset's name.
+ *
+ * In recursive mode only datasets whose name has the form
+ * <something>@<cb_snapname> are considered; everything else is skipped
+ * with a return value of 0.
+ *
+ * Returns 0 on success or skip, -1 if the holds cannot be read, or the
+ * error returned by nvlist_add_nvlist().
+ */
+static int
+holds_callback(zfs_handle_t *zhp, void *data)
+{
+	holds_cbdata_t *cbp = data;
+	nvlist_t *top_nvl = *cbp->cb_nvlp;
+	nvlist_t *nvl = NULL;
+	nvpair_t *nvp = NULL;
+	const char *zname = zfs_get_name(zhp);
+	size_t znamelen = strlen(zname);
+	int ret;
+
+	if (cbp->cb_recursive) {
+		const char *snapname;
+		const char *delim = strchr(zname, '@');
+		if (delim == NULL)
+			return (0);
+
+		snapname = delim + 1;
+		if (strcmp(cbp->cb_snapname, snapname))
+			return (0);
+	}
+
+	if (zfs_get_holds(zhp, &nvl) != 0)
+		return (-1);
+
+	if (znamelen > cbp->cb_max_namelen)
+		cbp->cb_max_namelen  = znamelen;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *tag = nvpair_name(nvp);
+		size_t taglen = strlen(tag);
+		if (taglen > cbp->cb_max_taglen)
+			cbp->cb_max_taglen  = taglen;
+	}
+
+	/*
+	 * nvlist_add_nvlist() stores its own copy of 'nvl', so the list
+	 * obtained from zfs_get_holds() must be released here; otherwise
+	 * one list is leaked per visited dataset.
+	 */
+	ret = nvlist_add_nvlist(top_nvl, zname, nvl);
+	nvlist_free(nvl);
+
+	return (ret);
+}
+
+/*
+ * zfs holds [-rH] <snap> ...
+ *
+ *	-r	Lists holds that are set on the named snapshots recursively.
+ *	-H	Scripted mode; elide headers and separate columns by tabs.
+ *
+ * Every argument must contain '@'.  The holds of all arguments are first
+ * collected into one nvlist by holds_callback() and then printed in a
+ * single pass by print_holds(), so that the column widths cover every
+ * row.  Returns 0 if no argument failed, 1 otherwise.
+ */
+static int
+zfs_do_holds(int argc, char **argv)
+{
+	int errors = 0;
+	int c;
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t recursive = B_FALSE;
+	const char *opts = "rH";
+	nvlist_t *nvl;
+
+	int types = ZFS_TYPE_SNAPSHOT;
+	holds_cbdata_t cb = { 0 };
+
+	int limit = 0;
+	int ret = 0;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (recursive) {
+		types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+		flags |= ZFS_ITER_RECURSE;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1)
+		usage(B_FALSE);
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	for (i = 0; i < argc; ++i) {
+		char *snapshot = argv[i];
+		const char *delim;
+		const char *snapname;
+
+		delim = strchr(snapshot, '@');
+		if (delim == NULL) {
+			(void) fprintf(stderr,
+			    gettext("'%s' is not a snapshot\n"), snapshot);
+			++errors;
+			continue;
+		}
+		snapname = delim + 1;
+		/*
+		 * In recursive mode the argument is cut at '@' so that the
+		 * walk starts at the dataset; the callback then selects the
+		 * snapshots named 'snapname' underneath it.
+		 */
+		if (recursive)
+			snapshot[delim - snapshot] = '\0';
+
+		cb.cb_recursive = recursive;
+		cb.cb_snapname = snapname;
+		cb.cb_nvlp = &nvl;
+
+		/*
+		 *  1. collect holds data, set format options
+		 *
+		 * Only this argument is walked: cb_snapname belongs to
+		 * argv[i] alone, and arguments rejected above must not be
+		 * visited at all.  Walking the whole argument vector here
+		 * would visit every argument once per iteration under
+		 * another argument's snapshot name.
+		 */
+		ret = zfs_for_each(1, &argv[i], flags, types, NULL, NULL,
+		    limit, holds_callback, &cb);
+		if (ret != 0)
+			++errors;
+	}
+
+	/*
+	 *  2. print holds data
+	 */
+	print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl);
+
+	if (nvlist_empty(nvl))
+		(void) fprintf(stderr, gettext("no datasets available\n"));
+
+	nvlist_free(nvl);
+
+	return (0 != errors);
+}
+
+/*
+ * Progress tuning.  get_one_dataset() reloads its call counter with
+ * CHECK_SPINNER after each clock check and advances the spinner only when
+ * more than SPINNER_TIME seconds have passed since the last update.
+ * report_mount_progress() suppresses intermediate reports issued within
+ * MOUNT_TIME seconds of the previous one.
+ */
+#define	CHECK_SPINNER 30
+#define	SPINNER_TIME 3		/* seconds */
+#define	MOUNT_TIME 1		/* seconds */
+
+/*
+ * State passed to get_one_dataset() through the iterator's opaque pointer.
+ *
+ *	ga_verbose	when set, a progress spinner is updated while walking
+ *	ga_cbp		collection that receives the filesystem handles
+ */
+typedef struct get_all_state {
+	boolean_t	ga_verbose;
+	get_all_cb_t	*ga_cbp;
+} get_all_state_t;
+
+/*
+ * Iterator callback that collects filesystem handles.
+ *
+ * Recurses into the children of 'zhp' first.  If that walk fails the
+ * handle is closed and 1 is returned.  Handles whose type does not include
+ * ZFS_TYPE_FILESYSTEM are closed and skipped; all others are appended to
+ * the collection in 'data' (a get_all_state_t) and stay open.
+ *
+ * In verbose mode a spinner is advanced, rate-limited by CHECK_SPINNER
+ * and SPINNER_TIME.  The spinner state is static, i.e. shared by all calls.
+ */
+static int
+get_one_dataset(zfs_handle_t *zhp, void *data)
+{
+	static char *spin[] = { "-", "\\", "|", "/" };
+	static int spinval = 0;
+	static int spincheck = 0;
+	static time_t last_spin_time = (time_t)0;
+	get_all_state_t *state = data;
+	zfs_type_t type = zfs_get_type(zhp);
+	int rc;
+
+	/* only look at the clock once the call counter has run out */
+	if (state->ga_verbose && --spincheck < 0) {
+		time_t now = time(NULL);
+
+		if (now > last_spin_time + SPINNER_TIME) {
+			update_progress(spin[spinval % 4]);
+			spinval++;
+			last_spin_time = now;
+		}
+		spincheck = CHECK_SPINNER;
+	}
+
+	if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+		/* the walk over nested datasets failed */
+		rc = 1;
+	} else if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
+		/* not a filesystem: nothing to collect */
+		rc = 0;
+	} else {
+		libzfs_add_handle(state->ga_cbp, zhp);
+		assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
+		return (0);
+	}
+
+	zfs_close(zhp);
+	return (rc);
+}
+
+/*
+ * Fill 'cbp' with handles by running get_one_dataset() from the root via
+ * zfs_iter_root().  When 'verbose' is set the walk is bracketed by a
+ * "Reading ZFS config" progress header and a "done." message.  The result
+ * of the walk itself is ignored.
+ */
+static void
+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
+{
+	get_all_state_t state;
+
+	state.ga_verbose = verbose;
+	state.ga_cbp = cbp;
+
+	if (verbose)
+		set_progress_header(gettext("Reading ZFS config"));
+
+	(void) zfs_iter_root(g_zfs, get_one_dataset, &state);
+
+	if (verbose)
+		finish_progress(gettext("done."));
+}
+
+/*
+ * Generic callback for sharing or mounting filesystems.  Because the code is so
+ * similar, we have a common function with an extra parameter to determine which
+ * mode we are using.
+ */
+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
+
+/*
+ * Per-run state handed to share_mount_one_cb().  The first five members
+ * are set once by share_mount() before the callbacks run; the members
+ * after sm_lock are updated by the callback while holding that lock.
+ */
+typedef struct share_mount_state {
+	share_mount_op_t	sm_op;
+	boolean_t	sm_verbose;
+	int	sm_flags;
+	char	*sm_options;
+	char	*sm_proto; /* only valid for OP_SHARE */
+	pthread_mutex_t	sm_lock; /* protects the remaining fields */
+	uint_t	sm_total; /* number of filesystems to process */
+	uint_t	sm_done; /* number of filesystems processed */
+	int	sm_status; /* nonzero if any share/mount operation failed */
+} share_mount_state_t;
+
+/*
+ * Share or mount a dataset.
+ *
+ *	zhp		filesystem handle (asserted to be a filesystem)
+ *	op		OP_SHARE or OP_MOUNT
+ *	flags		MS_* bits: MS_CRYPT skips the "key not loaded" check,
+ *			MS_FORCE allows redacted datasets; the value is also
+ *			passed on to zfs_mount()
+ *	protocol	for OP_SHARE: NULL shares everything via
+ *			zfs_shareall(), otherwise "nfs" or "smb"
+ *	explicit	B_TRUE when the user named this dataset; unsuitable
+ *			datasets then produce an error message and return 1,
+ *			whereas with B_FALSE they are skipped and return 0
+ *	options		mount options for OP_MOUNT, or NULL
+ *
+ * Returns 0 on success or skip and 1 on failure.
+ */
+static int
+share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
+    boolean_t explicit, const char *options)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char shareopts[ZFS_MAXPROPLEN];
+	char smbshareopts[ZFS_MAXPROPLEN];
+	const char *cmdname = op == OP_SHARE ? "share" : "mount";
+	struct mnttab mnt;
+	uint64_t zoned, canmount;
+	boolean_t shared_nfs, shared_smb;
+
+	assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
+
+	/*
+	 * Check to make sure we can mount/share this dataset.  If we
+	 * are in the global zone and the filesystem is exported to a
+	 * local zone, or if we are in a local zone and the
+	 * filesystem is not exported, then it is an error.
+	 */
+	zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+	if (zoned && getzoneid() == GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "dataset is exported to a local zone\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+
+	} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "permission denied\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * Ignore any filesystems which don't apply to us. This
+	 * includes those with a legacy mountpoint, or those with
+	 * legacy share options.
+	 */
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+	    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+	/* sharing requires at least one of sharenfs/sharesmb to be enabled */
+	if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+	    strcmp(smbshareopts, "off") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot share '%s': "
+		    "legacy share\n"), zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use share(1M) to "
+		    "share this filesystem, or set "
+		    "sharenfs property on\n"));
+		return (1);
+	}
+
+	/*
+	 * We cannot share or mount legacy filesystems. If the
+	 * shareopts is non-legacy but the mountpoint is legacy, we
+	 * treat it as a legacy share.
+	 */
+	if (strcmp(mountpoint, "legacy") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use %s(1M) to "
+		    "%s this filesystem\n"), cmdname, cmdname);
+		return (1);
+	}
+
+	if (strcmp(mountpoint, "none") == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': no "
+		    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * canmount	explicit	outcome
+	 * on		no		pass through
+	 * on		yes		pass through
+	 * off		no		return 0
+	 * off		yes		display error, return 1
+	 * noauto	no		return 0
+	 * noauto	yes		pass through
+	 */
+	canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+	if (canmount == ZFS_CANMOUNT_OFF) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "'canmount' property is set to 'off'\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+		/*
+		 * When performing a 'zfs mount -a', we skip any mounts for
+		 * datasets that have 'noauto' set. Sharing a dataset with
+		 * 'noauto' set is only allowed if it's mounted.
+		 */
+		if (op == OP_MOUNT)
+			return (0);
+		if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) {
+			/* also purge it from existing exports */
+			zfs_unshareall_bypath(zhp, mountpoint);
+			return (0);
+		}
+	}
+
+	/*
+	 * If this filesystem is encrypted and does not have
+	 * a loaded key, we can not mount it.
+	 */
+	if ((flags & MS_CRYPT) == 0 &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) ==
+	    ZFS_KEYSTATUS_UNAVAILABLE) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "encryption key not loaded\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * If this filesystem is inconsistent and has a receive resume
+	 * token, we can not mount it.
+	 */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+	    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "Contains partially-completed state from "
+		    "\"zfs receive -s\", which can be resumed with "
+		    "\"zfs send -t\"\n"),
+		    cmdname, zfs_get_name(zhp));
+		return (1);
+	}
+
+	/* redacted datasets are refused unless MS_FORCE was requested */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) {
+		if (!explicit)
+			return (0);
+
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "Dataset is not complete, was created by receiving "
+		    "a redacted zfs send stream.\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	}
+
+	/*
+	 * At this point, we have verified that the mountpoint and/or
+	 * shareopts are appropriate for auto management. If the
+	 * filesystem is already mounted or shared, return (failing
+	 * for explicit requests); otherwise mount or share the
+	 * filesystem.
+	 */
+	switch (op) {
+	case OP_SHARE:
+
+		shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+		shared_smb = zfs_is_shared_smb(zhp, NULL);
+
+		/*
+		 * "Already shared" means shared over both protocols, or
+		 * shared over the one protocol that is "on" while the other
+		 * is "off".
+		 */
+		if ((shared_nfs && shared_smb) ||
+		    (shared_nfs && strcmp(shareopts, "on") == 0 &&
+		    strcmp(smbshareopts, "off") == 0) ||
+		    (shared_smb && strcmp(smbshareopts, "on") == 0 &&
+		    strcmp(shareopts, "off") == 0)) {
+			if (!explicit)
+				return (0);
+
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': filesystem already shared\n"),
+			    zfs_get_name(zhp));
+			return (1);
+		}
+
+		/* a filesystem has to be mounted before it can be shared */
+		if (!zfs_is_mounted(zhp, NULL) &&
+		    zfs_mount(zhp, NULL, flags) != 0)
+			return (1);
+
+		if (protocol == NULL) {
+			if (zfs_shareall(zhp) != 0)
+				return (1);
+		} else if (strcmp(protocol, "nfs") == 0) {
+			if (zfs_share_nfs(zhp))
+				return (1);
+		} else if (strcmp(protocol, "smb") == 0) {
+			if (zfs_share_smb(zhp))
+				return (1);
+		} else {
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': invalid share type '%s' "
+			    "specified\n"),
+			    zfs_get_name(zhp), protocol);
+			return (1);
+		}
+
+		break;
+
+	case OP_MOUNT:
+		/*
+		 * Only mnt_mntopts is filled in; 'mnt' exists solely so that
+		 * hasmntopt() can look for the remount option below.
+		 */
+		if (options == NULL)
+			mnt.mnt_mntopts = "";
+		else
+			mnt.mnt_mntopts = (char *)options;
+
+		if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+		    zfs_is_mounted(zhp, NULL)) {
+			if (!explicit)
+				return (0);
+
+			(void) fprintf(stderr, gettext("cannot mount "
+			    "'%s': filesystem already mounted\n"),
+			    zfs_get_name(zhp));
+			return (1);
+		}
+
+		if (zfs_mount(zhp, options, flags) != 0)
+			return (1);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Emit a "(current/total)" progress update, counting from 1.
+ *
+ * The first report installs the progress header and the last one finishes
+ * the progress line.  Reports in between are dropped when the previous
+ * one was issued no more than MOUNT_TIME seconds ago.  The time of the
+ * previous report is kept in a static variable, so this is not
+ * thread-safe.
+ */
+static void
+report_mount_progress(int current, int total)
+{
+	static time_t last_progress_time = 0;
+	time_t now = time(NULL);
+	char info[32];
+	int ordinal = current + 1;	/* 1..n instead of 0..n-1 */
+	int is_final = (ordinal == total);
+
+	if (ordinal == 1) {
+		/* first call: show the header */
+		set_progress_header(gettext("Mounting ZFS filesystems"));
+	} else if (!is_final && now <= last_progress_time + MOUNT_TIME) {
+		/* too soon to report again */
+		return;
+	}
+
+	last_progress_time = now;
+
+	(void) sprintf(info, "(%d/%d)", ordinal, total);
+
+	if (is_final) {
+		finish_progress(info);
+		return;
+	}
+
+	update_progress(info);
+}
+
+/*
+ * zfs_foreach_mountpoint() callback: share or mount one filesystem as a
+ * non-explicit request, then, under sm_lock, bump the completion counter,
+ * record a failure in sm_status and refresh the progress display when
+ * running verbosely.  Returns the result of share_mount_one().
+ */
+static int
+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
+{
+	share_mount_state_t *state = arg;
+	int rc = share_mount_one(zhp, state->sm_op, state->sm_flags,
+	    state->sm_proto, B_FALSE, state->sm_options);
+
+	pthread_mutex_lock(&state->sm_lock);
+	state->sm_done++;
+	if (rc != 0)
+		state->sm_status = rc;
+	if (state->sm_verbose)
+		report_mount_progress(state->sm_done, state->sm_total);
+	pthread_mutex_unlock(&state->sm_lock);
+
+	return (rc);
+}
+
+/*
+ * Append 'newopts' to the option string in 'mntopts', inserting a comma
+ * when 'mntopts' is not empty.  If the combined length (including the
+ * comma) would reach MNT_LINE_MAX, an error is printed and usage() is
+ * invoked instead.
+ */
+static void
+append_options(char *mntopts, char *newopts)
+{
+	int used = strlen(mntopts);
+	/* current length, one separator, and the text being added */
+	size_t needed = used + 1 + strlen(newopts);
+
+	if (needed >= MNT_LINE_MAX) {
+		(void) fprintf(stderr, gettext("the opts argument for "
+		    "'%s' option is too long (more than %d chars)\n"),
+		    "-o", MNT_LINE_MAX);
+		usage(B_FALSE);
+	}
+
+	if (used != 0)
+		mntopts[used++] = ',';
+
+	(void) strcpy(mntopts + used, newopts);
+}
+
+/*
+ * Common implementation of the mount and share subcommands.
+ *
+ * 'op' selects OP_MOUNT or OP_SHARE.  Three modes are handled:
+ *
+ *   -a given		collect every filesystem with get_all_datasets() and
+ *			run share_mount_one_cb() on each of them; for
+ *			OP_SHARE an optional "nfs" or "smb" argument limits
+ *			the protocol
+ *   no arguments	(mount only) print the ZFS entries of the mount
+ *			table, leaving out snapshots
+ *   one argument	open that filesystem and share or mount it as an
+ *			explicit request
+ *
+ * Returns 0 on success, ENOENT if the mount table cannot be reopened, and
+ * otherwise the status of the share/mount operations.
+ */
+static int
+share_mount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	boolean_t verbose = B_FALSE;
+	int c, ret = 0;
+	char *options = NULL;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al"))
+	    != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case 'l':
+			flags |= MS_CRYPT;
+			break;
+		case 'o':
+			if (*optarg == '\0') {
+				(void) fprintf(stderr, gettext("empty mount "
+				    "options (-o) specified\n"));
+				usage(B_FALSE);
+			}
+
+			/* allocated on first use; repeated -o are joined */
+			if (options == NULL)
+				options = safe_malloc(MNT_LINE_MAX + 1);
+
+			/* option validation is done later */
+			append_options(options, optarg);
+			break;
+		case 'O':
+			flags |= MS_OVERLAY;
+			break;
+		case 'f':
+			flags |= MS_FORCE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (do_all) {
+		char *protocol = NULL;
+
+		/* only 'share -a' accepts a trailing protocol argument */
+		if (op == OP_SHARE && argc > 0) {
+			if (strcmp(argv[0], "nfs") != 0 &&
+			    strcmp(argv[0], "smb") != 0) {
+				(void) fprintf(stderr, gettext("share type "
+				    "must be 'nfs' or 'smb'\n"));
+				usage(B_FALSE);
+			}
+			protocol = argv[0];
+			argc--;
+			argv++;
+		}
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		start_progress_timer();
+		get_all_cb_t cb = { 0 };
+		get_all_datasets(&cb, verbose);
+
+		/* nothing was collected, so there is nothing to do */
+		if (cb.cb_used == 0) {
+			if (options != NULL)
+				free(options);
+			return (0);
+		}
+
+		share_mount_state_t share_mount_state = { 0 };
+		share_mount_state.sm_op = op;
+		share_mount_state.sm_verbose = verbose;
+		share_mount_state.sm_flags = flags;
+		share_mount_state.sm_options = options;
+		share_mount_state.sm_proto = protocol;
+		share_mount_state.sm_total = cb.cb_used;
+		pthread_mutex_init(&share_mount_state.sm_lock, NULL);
+
+		/*
+		 * libshare isn't mt-safe, so only do the operation in parallel
+		 * if we're mounting. Additionally, the key-loading option must
+		 * be serialized so that we can prompt the user for their keys
+		 * in a consistent manner.
+		 */
+		zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
+		    share_mount_one_cb, &share_mount_state,
+		    op == OP_MOUNT && !(flags & MS_CRYPT));
+		zfs_commit_all_shares();
+
+		ret = share_mount_state.sm_status;
+
+		/* release the handles collected by get_all_datasets() */
+		for (int i = 0; i < cb.cb_used; i++)
+			zfs_close(cb.cb_handles[i]);
+		free(cb.cb_handles);
+	} else if (argc == 0) {
+		struct mnttab entry;
+
+		/* listing mounts takes neither -o nor OP_SHARE */
+		if ((op == OP_SHARE) || (options != NULL)) {
+			(void) fprintf(stderr, gettext("missing filesystem "
+			    "argument (specify -a for all)\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * When mount is given no arguments, go through
+		 * /proc/self/mounts and display any active ZFS mounts.
+		 * We hide any snapshots, since they are controlled
+		 * automatically.
+		 */
+
+		/* Reopen MNTTAB to prevent reading stale data from open file */
+		if (freopen(MNTTAB, "r", mnttab_file) == NULL) {
+			if (options != NULL)
+				free(options);
+			return (ENOENT);
+		}
+
+		while (getmntent(mnttab_file, &entry) == 0) {
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
+			    strchr(entry.mnt_special, '@') != NULL)
+				continue;
+
+			(void) printf("%-30s  %s\n", entry.mnt_special,
+			    entry.mnt_mountp);
+		}
+
+	} else {
+		zfs_handle_t *zhp;
+
+		if (argc > 1) {
+			(void) fprintf(stderr,
+			    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL) {
+			ret = 1;
+		} else {
+			/* explicit request: no protocol filter is applied */
+			ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
+			    options);
+			zfs_commit_all_shares();
+			zfs_close(zhp);
+		}
+	}
+
+	if (options != NULL)
+		free(options);
+
+	return (ret);
+}
+
+/*
+ * zfs mount
+ * zfs mount -a
+ * zfs mount filesystem
+ *
+ * Without arguments, list the mounted ZFS filesystems; otherwise mount
+ * all filesystems (-a) or the given one.  Unlike 'zfs share -a', no
+ * protocol argument is accepted after -a.  The work is done by
+ * share_mount() with OP_MOUNT.
+ */
+static int
+zfs_do_mount(int argc, char **argv)
+{
+	int rc = share_mount(OP_MOUNT, argc, argv);
+
+	return (rc);
+}
+
+/*
+ * zfs share -a [nfs | smb]
+ * zfs share filesystem
+ *
+ * Share every filesystem, optionally restricted to one protocol, or share
+ * the given filesystem.  The work is done by share_mount() with OP_SHARE.
+ */
+static int
+zfs_do_share(int argc, char **argv)
+{
+	int rc = share_mount(OP_SHARE, argc, argv);
+
+	return (rc);
+}
+
+/*
+ * One entry of the AVL tree built by unshare_unmount() when -a is given.
+ *
+ *	un_zhp		open handle of the dataset
+ *	un_mountp	heap copy of the mount point; this is the sort key
+ *			used by unshare_unmount_compare()
+ *	un_avlnode	linkage into the uu_avl tree
+ */
+typedef struct unshare_unmount_node {
+	zfs_handle_t	*un_zhp;
+	char		*un_mountp;
+	uu_avl_node_t	un_avlnode;
+} unshare_unmount_node_t;
+
+/*
+ * AVL comparator for unshare_unmount_node_t: orders two nodes by
+ * strcmp() of their mount points.  The third argument is not used.
+ */
+/* ARGSUSED */
+static int
+unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
+{
+	const char *lpath = ((const unshare_unmount_node_t *)larg)->un_mountp;
+	const char *rpath = ((const unshare_unmount_node_t *)rarg)->un_mountp;
+
+	return (strcmp(lpath, rpath));
+}
+
+/*
+ * Convenience routine used by zfs_do_umount() and manual_unmount().  Given an
+ * absolute path, find the entry in /proc/self/mounts, verify that it's a
+ * ZFS filesystem, and unshare or unmount it appropriately.
+ *
+ *	op		OP_SHARE to unshare, anything else to unmount
+ *	path		path to look up in the mount table
+ *	flags		passed to umount2(), zfs_unmount() or zfs_unmountall()
+ *	is_manual	when set, unmount with zfs_unmount() without checking
+ *			for a legacy mountpoint
+ *
+ * Returns 0 on success, ENOENT if the mount table cannot be reopened and
+ * 1 on any other failure.
+ */
+static int
+unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
+{
+	zfs_handle_t *zhp;
+	int ret = 0;
+	struct stat64 statbuf;
+	struct extmnttab entry;
+	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
+	ino_t path_inode;
+
+	/*
+	 * Search for the given (major,minor) pair in the mount table.
+	 */
+
+	/* Reopen MNTTAB to prevent reading stale data from open file */
+	if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+		return (ENOENT);
+
+	if (getextmntent(path, &entry, &statbuf) != 0) {
+		if (op == OP_SHARE) {
+			(void) fprintf(stderr, gettext("cannot %s '%s': not "
+			    "currently mounted\n"), cmdname, path);
+			return (1);
+		}
+		/* not in the mount table: fall back to a plain umount2() */
+		(void) fprintf(stderr, gettext("warning: %s not in "
+		    "/proc/self/mounts\n"), path);
+		if ((ret = umount2(path, flags)) != 0)
+			(void) fprintf(stderr, gettext("%s: %s\n"), path,
+			    strerror(errno));
+		return (ret != 0);
+	}
+	path_inode = statbuf.st_ino;
+
+	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
+		    "filesystem\n"), cmdname, path);
+		return (1);
+	}
+
+	if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+	    ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	/* assume failure; only the unshare/unmount calls below clear it */
+	ret = 1;
+	if (stat64(entry.mnt_mountp, &statbuf) != 0) {
+		(void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
+		    cmdname, path, strerror(errno));
+		goto out;
+	} else if (statbuf.st_ino != path_inode) {
+		/* 'path' lies inside the filesystem but is not its root */
+		(void) fprintf(stderr, gettext("cannot "
+		    "%s '%s': not a mountpoint\n"), cmdname, path);
+		goto out;
+	}
+
+	if (op == OP_SHARE) {
+		char nfs_mnt_prop[ZFS_MAXPROPLEN];
+		char smbshare_prop[ZFS_MAXPROPLEN];
+
+		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
+		    sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
+		verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
+		    sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (strcmp(nfs_mnt_prop, "off") == 0 &&
+		    strcmp(smbshare_prop, "off") == 0) {
+			(void) fprintf(stderr, gettext("cannot unshare "
+			    "'%s': legacy share\n"), path);
+			(void) fprintf(stderr, gettext("use exportfs(8) "
+			    "or smbcontrol(1) to unshare this filesystem\n"));
+		} else if (!zfs_is_shared(zhp)) {
+			(void) fprintf(stderr, gettext("cannot unshare '%s': "
+			    "not currently shared\n"), path);
+		} else {
+			ret = zfs_unshareall_bypath(zhp, path);
+			zfs_commit_all_shares();
+		}
+	} else {
+		char mtpt_prop[ZFS_MAXPROPLEN];
+
+		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
+		    sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+		if (is_manual) {
+			ret = zfs_unmount(zhp, NULL, flags);
+		} else if (strcmp(mtpt_prop, "legacy") == 0) {
+			(void) fprintf(stderr, gettext("cannot unmount "
+			    "'%s': legacy mountpoint\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use umount(8) "
+			    "to unmount this filesystem\n"));
+		} else {
+			ret = zfs_unmountall(zhp, flags);
+		}
+	}
+
+out:
+	zfs_close(zhp);
+
+	return (ret != 0);
+}
+
+/*
+ * Generic callback for unsharing or unmounting a filesystem.
+ */
+static int
+unshare_unmount(int op, int argc, char **argv)
+{
+	int do_all = 0;
+	int flags = 0;
+	int ret = 0;
+	int c;
+	zfs_handle_t *zhp;
+	char nfs_mnt_prop[ZFS_MAXPROPLEN];
+	char sharesmb[ZFS_MAXPROPLEN];
+
+	/* check options */
+	while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = 1;
+			break;
+		case 'f':
+			flags |= MS_FORCE;
+			break;
+		case 'u':
+			flags |= MS_CRYPT;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (do_all) {
+		/*
+		 * We could make use of zfs_for_each() to walk all datasets in
+		 * the system, but this would be very inefficient, especially
+		 * since we would have to linearly search /proc/self/mounts for
+		 * each one. Instead, do one pass through /proc/self/mounts
+		 * looking for zfs entries and call zfs_unmount() for each one.
+		 *
+		 * Things get a little tricky if the administrator has created
+		 * mountpoints beneath other ZFS filesystems.  In this case, we
+		 * have to unmount the deepest filesystems first.  To accomplish
+		 * this, we place all the mountpoints in an AVL tree sorted by
+		 * the special type (dataset name), and walk the result in
+		 * reverse to make sure to get any snapshots first.
+		 */
+		struct mnttab entry;
+		uu_avl_pool_t *pool;
+		uu_avl_t *tree = NULL;
+		unshare_unmount_node_t *node;
+		uu_avl_index_t idx;
+		uu_avl_walk_t *walk;
+		char *protocol = NULL;
+
+		if (op == OP_SHARE && argc > 0) {
+			if (strcmp(argv[0], "nfs") != 0 &&
+			    strcmp(argv[0], "smb") != 0) {
+				(void) fprintf(stderr, gettext("share type "
+				    "must be 'nfs' or 'smb'\n"));
+				usage(B_FALSE);
+			}
+			protocol = argv[0];
+			argc--;
+			argv++;
+		}
+
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		if (((pool = uu_avl_pool_create("unmount_pool",
+		    sizeof (unshare_unmount_node_t),
+		    offsetof(unshare_unmount_node_t, un_avlnode),
+		    unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+		    ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+			nomem();
+
+		/* Reopen MNTTAB to prevent reading stale data from open file */
+		if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+			return (ENOENT);
+
+		while (getmntent(mnttab_file, &entry) == 0) {
+
+			/* ignore non-ZFS entries */
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
+				continue;
+
+			/* ignore snapshots */
+			if (strchr(entry.mnt_special, '@') != NULL)
+				continue;
+
+			if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+			    ZFS_TYPE_FILESYSTEM)) == NULL) {
+				ret = 1;
+				continue;
+			}
+
+			/*
+			 * Ignore datasets that are excluded/restricted by
+			 * parent pool name.
+			 */
+			if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
+				zfs_close(zhp);
+				continue;
+			}
+
+			switch (op) {
+			case OP_SHARE:
+				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "off") != 0)
+					break;
+				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "off") == 0)
+					continue;
+				break;
+			case OP_MOUNT:
+				/* Ignore legacy mounts */
+				verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
+				    NULL, NULL, 0, B_FALSE) == 0);
+				if (strcmp(nfs_mnt_prop, "legacy") == 0)
+					continue;
+				/* Ignore canmount=noauto mounts */
+				if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
+				    ZFS_CANMOUNT_NOAUTO)
+					continue;
+			default:
+				break;
+			}
+
+			node = safe_malloc(sizeof (unshare_unmount_node_t));
+			node->un_zhp = zhp;
+			node->un_mountp = safe_strdup(entry.mnt_mountp);
+
+			uu_avl_node_init(node, &node->un_avlnode, pool);
+
+			if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
+				uu_avl_insert(tree, node, idx);
+			} else {
+				zfs_close(node->un_zhp);
+				free(node->un_mountp);
+				free(node);
+			}
+		}
+
+		/*
+		 * Walk the AVL tree in reverse, unmounting each filesystem and
+		 * removing it from the AVL tree in the process.
+		 */
+		if ((walk = uu_avl_walk_start(tree,
+		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+			nomem();
+
+		while ((node = uu_avl_walk_next(walk)) != NULL) {
+			const char *mntarg = NULL;
+
+			uu_avl_remove(tree, node);
+			switch (op) {
+			case OP_SHARE:
+				if (zfs_unshareall_bytype(node->un_zhp,
+				    node->un_mountp, protocol) != 0)
+					ret = 1;
+				break;
+
+			case OP_MOUNT:
+				if (zfs_unmount(node->un_zhp,
+				    mntarg, flags) != 0)
+					ret = 1;
+				break;
+			}
+
+			zfs_close(node->un_zhp);
+			free(node->un_mountp);
+			free(node);
+		}
+
+		if (op == OP_SHARE)
+			zfs_commit_shares(protocol);
+
+		uu_avl_walk_end(walk);
+		uu_avl_destroy(tree);
+		uu_avl_pool_destroy(pool);
+
+	} else {
+		if (argc != 1) {
+			if (argc == 0)
+				(void) fprintf(stderr,
+				    gettext("missing filesystem argument\n"));
+			else
+				(void) fprintf(stderr,
+				    gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+
+		/*
+		 * We have an argument, but it may be a full path or a ZFS
+		 * filesystem.  Pass full paths off to unmount_path() (shared by
+		 * manual_unmount), otherwise open the filesystem and pass to
+		 * zfs_unmount().
+		 */
+		if (argv[0][0] == '/')
+			return (unshare_unmount_path(op, argv[0],
+			    flags, B_FALSE));
+
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL)
+			return (1);
+
+		verify(zfs_prop_get(zhp, op == OP_SHARE ?
+		    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+		    nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+		    NULL, 0, B_FALSE) == 0);
+
+		switch (op) {
+		case OP_SHARE:
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+			    nfs_mnt_prop,
+			    sizeof (nfs_mnt_prop),
+			    NULL, NULL, 0, B_FALSE) == 0);
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+			    sharesmb, sizeof (sharesmb), NULL, NULL,
+			    0, B_FALSE) == 0);
+
+			if (strcmp(nfs_mnt_prop, "off") == 0 &&
+			    strcmp(sharesmb, "off") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': legacy share\n"),
+				    zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use "
+				    "unshare(1M) to unshare this "
+				    "filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_shared(zhp)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': not currently "
+				    "shared\n"), zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unshareall(zhp) != 0) {
+				ret = 1;
+			}
+			break;
+
+		case OP_MOUNT:
+			if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unmount '%s': legacy "
+				    "mountpoint\n"), zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use "
+				    "umount(1M) to unmount this "
+				    "filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_mounted(zhp, NULL)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unmount '%s': not currently "
+				    "mounted\n"),
+				    zfs_get_name(zhp));
+				ret = 1;
+			} else if (zfs_unmountall(zhp, flags) != 0) {
+				ret = 1;
+			}
+			break;
+		}
+
+		zfs_close(zhp);
+	}
+
+	return (ret);
+}
+
+/*
+ * zfs unmount [-fu] -a
+ * zfs unmount [-fu] filesystem
+ *
+ * Entry point for unmounting all filesystems or one specific ZFS filesystem.
+ * The work is done by unshare_unmount() in OP_MOUNT mode; its result is
+ * handed back to the caller unchanged.
+ */
+static int
+zfs_do_unmount(int argc, char **argv)
+{
+	int rc;
+
+	rc = unshare_unmount(OP_MOUNT, argc, argv);
+	return (rc);
+}
+
+/*
+ * zfs unshare -a
+ * zfs unshare filesystem
+ *
+ * Entry point for unsharing all filesystems or one specific ZFS filesystem.
+ * The work is done by unshare_unmount() in OP_SHARE mode; its result is
+ * handed back to the caller unchanged.
+ */
+static int
+zfs_do_unshare(int argc, char **argv)
+{
+	int rc;
+
+	rc = unshare_unmount(OP_SHARE, argc, argv);
+	return (rc);
+}
+
+/*
+ * Search command_table for an entry named 'command'.
+ *
+ * On a match the entry's index is stored in '*idx' and 0 is returned.  If no
+ * entry matches, 1 is returned and '*idx' is left untouched.  Table slots
+ * whose name is NULL are skipped.
+ */
+static int
+find_command_idx(char *command, int *idx)
+{
+	int pos = 0;
+
+	while (pos < NCOMMAND) {
+		if (command_table[pos].name != NULL &&
+		    strcmp(command, command_table[pos].name) == 0) {
+			*idx = pos;
+			return (0);
+		}
+		pos++;
+	}
+
+	return (1);
+}
+
+/*
+ * Show the differences between a snapshot and a later snapshot or the
+ * current filesystem state, via zfs_show_diffs() writing to standard output.
+ *
+ * Options: -F sets ZFS_DIFF_CLASSIFY, -H sets ZFS_DIFF_PARSEABLE and -t sets
+ * ZFS_DIFF_TIMESTAMP.  One or two positional arguments are accepted: the
+ * "from" name and an optional "to" name.
+ *
+ * The filesystem that gets opened is whatever precedes the '@' in the first
+ * argument, or in the second argument when the first one starts with '@'.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_diff(int argc, char **argv)
+{
+	struct sigaction ignore_pipe;
+	zfs_handle_t *zhp;
+	char *from, *to, *base, *fsname, *at;
+	int diff_flags = 0;
+	int rc = 0;
+	int opt;
+
+	while ((opt = getopt(argc, argv, "FHt")) != -1) {
+		if (opt == 'F') {
+			diff_flags |= ZFS_DIFF_CLASSIFY;
+		} else if (opt == 'H') {
+			diff_flags |= ZFS_DIFF_PARSEABLE;
+		} else if (opt == 't') {
+			diff_flags |= ZFS_DIFF_TIMESTAMP;
+		} else {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr,
+		    gettext("must provide at least one snapshot name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	from = argv[0];
+	to = (argc == 2) ? argv[1] : NULL;
+
+	/* pick whichever argument carries the filesystem name */
+	base = (*from != '@') ? from : to;
+	fsname = (base != NULL) ? strdup(base) : NULL;
+	if (fsname == NULL)
+		usage(B_FALSE);
+
+	/* cut off "@snapshot" so that only the filesystem name remains */
+	at = strchr(fsname, '@');
+	if (at != NULL)
+		*at = '\0';
+
+	zhp = zfs_open(g_zfs, fsname, ZFS_TYPE_FILESYSTEM);
+	free(fsname);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * Ignore SIGPIPE so that the library can give us
+	 * information on any failure
+	 */
+	ignore_pipe.sa_flags = 0;
+	ignore_pipe.sa_handler = SIG_IGN;
+	if (sigemptyset(&ignore_pipe.sa_mask) == -1 ||
+	    sigaction(SIGPIPE, &ignore_pipe, NULL) == -1)
+		rc = errno;
+	else
+		rc = zfs_show_diffs(zhp, STDOUT_FILENO, from, to, diff_flags);
+
+	zfs_close(zhp);
+
+	return (rc != 0);
+}
+
+/*
+ * zfs bookmark <fs@source>|<fs#source> <fs#bookmark>
+ *
+ * Creates a bookmark with the given name from the source snapshot
+ * or creates a copy of an existing source bookmark.
+ *
+ * One of the two arguments (but not both) may be given in short form, i.e.
+ * starting with '@' or '#'; its dataset part is then taken from the other
+ * argument.
+ *
+ * Returns 0 on success and 1 on failure.  Argument errors, and a source that
+ * cannot be opened, are reported through usage().
+ */
+static int
+zfs_do_bookmark(int argc, char **argv)
+{
+	char *source, *bookname;
+	char expbuf[ZFS_MAX_DATASET_NAME_LEN];
+	int source_type;
+	nvlist_t *nvl;
+	int ret = 0;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source argument\n"));
+		goto usage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing bookmark argument\n"));
+		goto usage;
+	}
+
+	source = argv[0];
+	bookname = argv[1];
+
+	if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) {
+		(void) fprintf(stderr,
+		    gettext("invalid source name '%s': "
+		    "must contain a '@' or '#'\n"), source);
+		goto usage;
+	}
+	if (strchr(bookname, '#') == NULL) {
+		(void) fprintf(stderr,
+		    gettext("invalid bookmark name '%s': "
+		    "must contain a '#'\n"), bookname);
+		goto usage;
+	}
+
+	/*
+	 * expand source or bookname to full path:
+	 * one of them may be specified as short name
+	 */
+	{
+		char **expand;
+		char *source_short, *bookname_short;
+		source_short = strpbrk(source, "@#");
+		bookname_short = strpbrk(bookname, "#");
+		if (source_short == source &&
+		    bookname_short == bookname) {
+			(void) fprintf(stderr, gettext(
+			    "either source or bookmark must be specified as "
+			    "full dataset paths\n"));
+			goto usage;
+		} else if (source_short != source &&
+		    bookname_short != bookname) {
+			expand = NULL;
+		} else if (source_short != source) {
+			strlcpy(expbuf, source, sizeof (expbuf));
+			expand = &bookname;
+		} else if (bookname_short != bookname) {
+			strlcpy(expbuf, bookname, sizeof (expbuf));
+			expand = &source;
+		} else {
+			abort();
+		}
+		if (expand != NULL) {
+			char *sep = strpbrk(expbuf, "@#");
+
+			/*
+			 * strlcpy() truncates silently, so a full name longer
+			 * than expbuf may have lost its '@'/'#' separator.
+			 */
+			if (sep == NULL)
+				goto toolong;
+			*sep = '\0';	/* dataset name in buf */
+
+			/* refuse to go on with a silently shortened name */
+			if (strlcat(expbuf, *expand, sizeof (expbuf)) >=
+			    sizeof (expbuf))
+				goto toolong;
+			*expand = expbuf;
+		}
+	}
+
+	/* determine source type */
+	switch (*strpbrk(source, "@#")) {
+		case '@': source_type = ZFS_TYPE_SNAPSHOT; break;
+		case '#': source_type = ZFS_TYPE_BOOKMARK; break;
+		default: abort();
+	}
+
+	/* test the source exists */
+	zfs_handle_t *zhp;
+	zhp = zfs_open(g_zfs, source, source_type);
+	if (zhp == NULL)
+		goto usage;
+	zfs_close(zhp);
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, bookname, source);
+	ret = lzc_bookmark(nvl, NULL);
+	fnvlist_free(nvl);
+
+	if (ret != 0) {
+		const char *err_msg = NULL;
+		char errbuf[1024];
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot create bookmark '%s'"), bookname);
+
+		switch (ret) {
+		case EXDEV:
+			err_msg = "bookmark is in a different pool";
+			break;
+		case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR:
+			err_msg = "source is not an ancestor of the "
+			    "new bookmark's dataset";
+			break;
+		case EEXIST:
+			err_msg = "bookmark exists";
+			break;
+		case EINVAL:
+			err_msg = "invalid argument";
+			break;
+		case ENOTSUP:
+			err_msg = "bookmark feature not enabled";
+			break;
+		case ENOSPC:
+			err_msg = "out of space";
+			break;
+		case ENOENT:
+			err_msg = "dataset does not exist";
+			break;
+		default:
+			(void) zfs_standard_error(g_zfs, ret, errbuf);
+			break;
+		}
+		if (err_msg != NULL) {
+			(void) fprintf(stderr, "%s: %s\n", errbuf,
+			    dgettext(TEXT_DOMAIN, err_msg));
+		}
+	}
+
+	return (ret != 0);
+
+toolong:
+	(void) fprintf(stderr, gettext("cannot expand short name: "
+	    "resulting name is too long\n"));
+	return (1);
+
+usage:
+	usage(B_FALSE);
+	return (-1);
+}
+
+/*
+ * Run a channel program against a pool.
+ *
+ * Options:
+ *   -t <n>  instruction limit (defaults to ZCP_DEFAULT_INSTRLIMIT)
+ *   -m <n>  memory limit (defaults to ZCP_DEFAULT_MEMLIMIT)
+ *   -n      run through lzc_channel_program_nosync() instead of
+ *           lzc_channel_program()
+ *   -j      print the returned nvlist as JSON
+ *
+ * Positional arguments: the pool name, the program file ("-" reads the
+ * program from standard input), then any arguments for the program itself.
+ *
+ * Returns 0 on success and 1 on failure.  Argument errors are reported
+ * through usage().
+ */
+static int
+zfs_do_channel_program(int argc, char **argv)
+{
+	int ret, fd, c;
+	int read_errno;
+	ssize_t nread;
+	char *progbuf, *filename, *poolname;
+	size_t progsize, progread;
+	nvlist_t *outnvl = NULL;
+	uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
+	uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
+	boolean_t sync_flag = B_TRUE, json_output = B_FALSE;
+	zpool_handle_t *zhp;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "nt:m:j")) != -1) {
+		switch (c) {
+		case 't':
+		case 'm': {
+			uint64_t arg;
+			char *endp;
+
+			errno = 0;
+			arg = strtoull(optarg, &endp, 0);
+			if (errno != 0 || *endp != '\0') {
+				(void) fprintf(stderr, gettext(
+				    "invalid argument "
+				    "'%s': expected integer\n"), optarg);
+				goto usage;
+			}
+
+			if (c == 't') {
+				instrlimit = arg;
+			} else {
+				ASSERT3U(c, ==, 'm');
+				memlimit = arg;
+			}
+			break;
+		}
+		case 'n': {
+			sync_flag = B_FALSE;
+			break;
+		}
+		case 'j': {
+			json_output = B_TRUE;
+			break;
+		}
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("invalid number of arguments\n"));
+		goto usage;
+	}
+
+	poolname = argv[0];
+	filename = argv[1];
+	if (strcmp(filename, "-") == 0) {
+		fd = 0;
+		filename = "standard input";
+	} else if ((fd = open(filename, O_RDONLY)) < 0) {
+		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+		    filename, strerror(errno));
+		return (1);
+	}
+
+	/* only checks that the pool can be opened; the handle is not kept */
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		(void) fprintf(stderr, gettext("cannot open pool '%s'\n"),
+		    poolname);
+		if (fd != 0)
+			(void) close(fd);
+		return (1);
+	}
+	zpool_close(zhp);
+
+	/*
+	 * Read in the channel program, expanding the program buffer as
+	 * necessary.  The buffer is doubled as soon as it is full, so one
+	 * byte always remains for the terminating NUL.
+	 */
+	progread = 0;
+	progsize = 1024;
+	progbuf = safe_malloc(progsize);
+	do {
+		nread = read(fd, progbuf + progread, progsize - progread);
+		/* only count bytes actually read; nread is -1 on error */
+		if (nread > 0) {
+			progread += nread;
+			if (progread == progsize) {
+				progsize *= 2;
+				progbuf = safe_realloc(progbuf, progsize);
+			}
+		}
+	} while (nread > 0);
+
+	/* close() may overwrite errno, so capture read()'s value first */
+	read_errno = errno;
+	if (fd != 0)
+		(void) close(fd);
+	if (nread < 0) {
+		free(progbuf);
+		(void) fprintf(stderr,
+		    gettext("cannot read '%s': %s\n"),
+		    filename, strerror(read_errno));
+		return (1);
+	}
+	progbuf[progread] = '\0';
+
+	/*
+	 * Any remaining arguments are passed as arguments to the lua script as
+	 * a string array:
+	 * {
+	 *	"argv" -> [ "arg 1", ... "arg n" ],
+	 * }
+	 */
+	nvlist_t *argnvl = fnvlist_alloc();
+	fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);
+
+	if (sync_flag) {
+		ret = lzc_channel_program(poolname, progbuf,
+		    instrlimit, memlimit, argnvl, &outnvl);
+	} else {
+		ret = lzc_channel_program_nosync(poolname, progbuf,
+		    instrlimit, memlimit, argnvl, &outnvl);
+	}
+
+	if (ret != 0) {
+		/*
+		 * On error, report the error message handed back by lua if one
+		 * exists.  Otherwise, generate an appropriate error message,
+		 * falling back on strerror() for an unexpected return code.
+		 */
+		char *errstring = NULL;
+		const char *msg = gettext("Channel program execution failed");
+		uint64_t instructions = 0;
+		if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) {
+			(void) nvlist_lookup_string(outnvl,
+			    ZCP_RET_ERROR, &errstring);
+			if (errstring == NULL)
+				errstring = strerror(ret);
+			if (ret == ETIME) {
+				(void) nvlist_lookup_uint64(outnvl,
+				    ZCP_ARG_INSTRLIMIT, &instructions);
+			}
+		} else {
+			switch (ret) {
+			case EINVAL:
+				errstring =
+				    "Invalid instruction or memory limit.";
+				break;
+			case ENOMEM:
+				errstring = "Return value too large.";
+				break;
+			case ENOSPC:
+				errstring = "Memory limit exhausted.";
+				break;
+			case ETIME:
+				errstring = "Timed out.";
+				break;
+			case EPERM:
+				errstring = "Permission denied. Channel "
+				    "programs must be run as root.";
+				break;
+			default:
+				(void) zfs_standard_error(g_zfs, ret, msg);
+			}
+		}
+		if (errstring != NULL)
+			(void) fprintf(stderr, "%s:\n%s\n", msg, errstring);
+
+		if (ret == ETIME && instructions != 0)
+			(void) fprintf(stderr,
+			    gettext("%llu Lua instructions\n"),
+			    (u_longlong_t)instructions);
+	} else {
+		if (json_output) {
+			(void) nvlist_print_json(stdout, outnvl);
+		} else if (nvlist_empty(outnvl)) {
+			(void) fprintf(stdout, gettext("Channel program fully "
+			    "executed and did not produce output.\n"));
+		} else {
+			(void) fprintf(stdout, gettext("Channel program fully "
+			    "executed and produced output:\n"));
+			dump_nvlist(outnvl, 4);
+		}
+	}
+
+	free(progbuf);
+	fnvlist_free(outnvl);
+	fnvlist_free(argnvl);
+	return (ret != 0);
+
+usage:
+	usage(B_FALSE);
+	return (-1);
+}
+
+
+/*
+ * State shared between load_unload_keys() and load_key_callback().
+ */
+typedef struct loadkey_cbdata {
+	boolean_t cb_loadkey;		/* B_TRUE: load keys, B_FALSE: unload */
+	boolean_t cb_recursive;		/* set when -r or -a was given */
+	boolean_t cb_noop;		/* -n; passed to zfs_crypto_load_key() */
+	char *cb_keylocation;		/* -L argument, NULL when not given */
+	uint64_t cb_numfailed;		/* load/unload calls that failed */
+	uint64_t cb_numattempted;	/* load/unload calls that were made */
+} loadkey_cbdata_t;
+
+/*
+ * Per-dataset callback handed to zfs_for_each() by load_unload_keys().
+ * 'data' is the loadkey_cbdata_t describing the request.
+ *
+ * Loads (cb_loadkey set) or unloads the key of 'zhp' and keeps the attempt
+ * and failure counters in the callback data up to date.  Returns 0 on
+ * success or when the dataset is skipped, otherwise the error returned by
+ * the libzfs call that failed.
+ */
+static int
+load_key_callback(zfs_handle_t *zhp, void *data)
+{
+	loadkey_cbdata_t *cb = data;
+	uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS);
+	int ret;
+
+	/*
+	 * In recursive mode only encryption roots are of interest, and only
+	 * those whose key is not yet in the state we are asked to reach.
+	 */
+	if (cb->cb_recursive) {
+		boolean_t is_encroot;
+		uint64_t wanted;
+
+		ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL);
+		if (ret != 0)
+			return (ret);
+		if (!is_encroot)
+			return (0);
+
+		wanted = cb->cb_loadkey ? ZFS_KEYSTATUS_AVAILABLE :
+		    ZFS_KEYSTATUS_UNAVAILABLE;
+		if (keystatus == wanted)
+			return (0);
+	}
+
+	cb->cb_numattempted++;
+
+	ret = cb->cb_loadkey ?
+	    zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation) :
+	    zfs_crypto_unload_key(zhp);
+	if (ret != 0)
+		cb->cb_numfailed++;
+
+	return (ret);
+}
+
+/*
+ * Shared implementation of zfs_do_load_key() and zfs_do_unload_key().
+ * 'loadkey' selects the direction: B_TRUE loads keys, B_FALSE unloads them.
+ *
+ * Options:
+ *   -a          operate on all datasets (no dataset argument allowed)
+ *   -r          recurse below the given dataset(s)
+ *   -n          (load only) passed to zfs_crypto_load_key() as its noop flag
+ *   -L <loc>    (load only) alternate keylocation; with -r or -a it may only
+ *               be "prompt"
+ *
+ * Returns the zfs_for_each() result, or -1 if any key operation failed.
+ */
+static int
+load_unload_keys(int argc, char **argv, boolean_t loadkey)
+{
+	int c, ret = 0, flags = 0;
+	boolean_t do_all = B_FALSE;
+	loadkey_cbdata_t cb = { 0 };
+
+	cb.cb_loadkey = loadkey;
+
+	while ((c = getopt(argc, argv, "anrL:")) != -1) {
+		/* noop and alternate keylocations only apply to zfs load-key */
+		if (loadkey) {
+			switch (c) {
+			case 'n':
+				cb.cb_noop = B_TRUE;
+				continue;
+			case 'L':
+				cb.cb_keylocation = optarg;
+				continue;
+			default:
+				break;
+			}
+		}
+
+		switch (c) {
+		case 'a':
+			do_all = B_TRUE;
+			cb.cb_recursive = B_TRUE;
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			cb.cb_recursive = B_TRUE;
+			break;
+		default:
+			/*
+			 * getopt() is only required to set optopt for options
+			 * it rejects itself (c == '?').  When unloading, -n
+			 * and -L are accepted by getopt() but rejected here,
+			 * so the offending character is 'c' in that case.
+			 */
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"),
+			    (c == '?') ? optopt : c);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (!do_all && argc == 0) {
+		(void) fprintf(stderr,
+		    gettext("Missing dataset argument or -a option\n"));
+		usage(B_FALSE);
+	}
+
+	if (do_all && argc != 0) {
+		(void) fprintf(stderr,
+		    gettext("Cannot specify dataset with -a option\n"));
+		usage(B_FALSE);
+	}
+
+	if (cb.cb_recursive && cb.cb_keylocation != NULL &&
+	    strcmp(cb.cb_keylocation, "prompt") != 0) {
+		(void) fprintf(stderr, gettext("alternate keylocation may only "
+		    "be 'prompt' with -r or -a\n"));
+		usage(B_FALSE);
+	}
+
+	ret = zfs_for_each(argc, argv, flags,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0,
+	    load_key_callback, &cb);
+
+	/* summary line for dry runs and for recursive runs that did work */
+	if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) {
+		(void) printf(gettext("%llu / %llu key(s) successfully %s\n"),
+		    (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed),
+		    (u_longlong_t)cb.cb_numattempted,
+		    loadkey ? (cb.cb_noop ? "verified" : "loaded") :
+		    "unloaded");
+	}
+
+	if (cb.cb_numfailed != 0)
+		ret = -1;
+
+	return (ret);
+}
+
+/*
+ * Command entry point for loading keys: runs load_unload_keys() with
+ * 'loadkey' set to B_TRUE and returns its result.
+ */
+static int
+zfs_do_load_key(int argc, char **argv)
+{
+	int rc;
+
+	rc = load_unload_keys(argc, argv, B_TRUE);
+	return (rc);
+}
+
+
+/*
+ * Command entry point for unloading keys: runs load_unload_keys() with
+ * 'loadkey' set to B_FALSE and returns its result.
+ */
+static int
+zfs_do_unload_key(int argc, char **argv)
+{
+	int rc;
+
+	rc = load_unload_keys(argc, argv, B_FALSE);
+	return (rc);
+}
+
+/*
+ * Change the wrapping key of a single filesystem or volume through
+ * zfs_crypto_rewrap().
+ *
+ * Options:
+ *   -l          load the dataset's key first if it is not already available
+ *   -i          passed to zfs_crypto_rewrap() as its inherit flag; may not
+ *               be combined with -o
+ *   -o <prop>   property parsed by parseprop() into the list handed to
+ *               zfs_crypto_rewrap()
+ *
+ * Returns 0 on success, 1 if a -o property cannot be parsed and -1 if
+ * loading or rewrapping the key fails.
+ */
+static int
+zfs_do_change_key(int argc, char **argv)
+{
+	nvlist_t *props = fnvlist_alloc();
+	zfs_handle_t *zhp = NULL;
+	boolean_t loadkey = B_FALSE, inheritkey = B_FALSE;
+	int rv = -1;
+	int c;
+
+	while ((c = getopt(argc, argv, "lio:")) != -1) {
+		switch (c) {
+		case 'l':
+			loadkey = B_TRUE;
+			break;
+		case 'i':
+			inheritkey = B_TRUE;
+			break;
+		case 'o':
+			if (!parseprop(props, optarg)) {
+				rv = 1;
+				goto out;
+			}
+			break;
+		default:
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (inheritkey && !nvlist_empty(props)) {
+		(void) fprintf(stderr,
+		    gettext("Properties not allowed for inheriting\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("Missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("Too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zfs_open(g_zfs, argv[argc - 1],
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		usage(B_FALSE);
+
+	if (loadkey) {
+		uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS);
+
+		if (keystatus != ZFS_KEYSTATUS_AVAILABLE &&
+		    zfs_crypto_load_key(zhp, B_FALSE, NULL) != 0)
+			goto out;
+
+		/* refresh the properties so the new keystatus is visible */
+		zfs_refresh_properties(zhp);
+	}
+
+	if (zfs_crypto_rewrap(zhp, props, inheritkey) == 0)
+		rv = 0;
+
+out:
+	/* single exit path: release the property list and the handle */
+	nvlist_free(props);
+	if (zhp != NULL)
+		zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * 1) zfs project [-d|-r] <file|directory ...>
+ *    List project ID and inherit flag of file(s) or directories.
+ *    -d: List the directory itself, not its children.
+ *    -r: List subdirectories recursively.
+ *
+ * 2) zfs project -C [-k] [-r] <file|directory ...>
+ *    Clear project inherit flag and/or ID on the file(s) or directories.
+ *    -k: Keep the project ID unchanged. If not specified, the project ID
+ *	  will be reset to zero.
+ *    -r: Clear on subdirectories recursively.
+ *
+ * 3) zfs project -c [-0] [-d|-r] [-p id] <file|directory ...>
+ *    Check project ID and inherit flag on the file(s) or directories,
+ *    report the outliers.
+ *    -0: Print file name followed by a NUL instead of newline.
+ *    -d: Check the directory itself, not its children.
+ *    -p: Specify the referenced ID for comparing with the target file(s)
+ *	  or directories' project IDs. If not specified, the target (top)
+ *	  directory's project ID will be used as the referenced one.
+ *    -r: Check subdirectories recursively.
+ *
+ * 4) zfs project [-p id] [-r] [-s] <file|directory ...>
+ *    Set project ID and/or inherit flag on the file(s) or directories.
+ *    -p: Set the project ID as the given id.
+ *    -r: Set on subdirectories recursively. If the "-p" option is not
+ *	  specified, it will use top-level directory's project ID as the
+ *	  given id, then set both project ID and inherit flag on all
+ *	  descendants of the top-level directory.
+ *    -s: Set project inherit flag.
+ *
+ * Returns 0 if every target was handled successfully, otherwise the first
+ * non-zero value returned by zfs_project_handle().
+ */
+static int
+zfs_do_project(int argc, char **argv)
+{
+	zfs_project_control_t zpc = {
+		.zpc_expected_projid = ZFS_INVALID_PROJID,
+		.zpc_op = ZFS_PROJECT_OP_DEFAULT,
+		.zpc_dironly = B_FALSE,
+		.zpc_keep_projid = B_FALSE,
+		.zpc_newline = B_TRUE,
+		.zpc_recursive = B_FALSE,
+		.zpc_set_flag = B_FALSE,
+	};
+	int ret = 0, c;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) {
+		switch (c) {
+		case '0':
+			zpc.zpc_newline = B_FALSE;
+			break;
+		case 'C':
+			if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "specify '-C' '-c' '-s' together\n"));
+				usage(B_FALSE);
+			}
+
+			zpc.zpc_op = ZFS_PROJECT_OP_CLEAR;
+			break;
+		case 'c':
+			if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "specify '-C' '-c' '-s' together\n"));
+				usage(B_FALSE);
+			}
+
+			zpc.zpc_op = ZFS_PROJECT_OP_CHECK;
+			break;
+		case 'd':
+			zpc.zpc_dironly = B_TRUE;
+			/* overwrite "-r" option */
+			zpc.zpc_recursive = B_FALSE;
+			break;
+		case 'k':
+			zpc.zpc_keep_projid = B_TRUE;
+			break;
+		case 'p': {
+			char *endptr;
+
+			errno = 0;
+			zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0);
+			/* not a number at all: empty, or trailing junk */
+			if (endptr == optarg || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid project ID\n"));
+				usage(B_FALSE);
+			}
+			/* a number, but outside the accepted range */
+			if (errno != 0 ||
+			    zpc.zpc_expected_projid >= UINT32_MAX) {
+				(void) fprintf(stderr,
+				    gettext("project ID must be less than "
+				    "%u\n"), UINT32_MAX);
+				usage(B_FALSE);
+			}
+			break;
+		}
+		case 'r':
+			zpc.zpc_recursive = B_TRUE;
+			/* overwrite "-d" option */
+			zpc.zpc_dironly = B_FALSE;
+			break;
+		case 's':
+			if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "specify '-C' '-c' '-s' together\n"));
+				usage(B_FALSE);
+			}
+
+			zpc.zpc_set_flag = B_TRUE;
+			zpc.zpc_op = ZFS_PROJECT_OP_SET;
+			break;
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	/* no explicit -C/-c/-s: "-p id" alone means set, otherwise list */
+	if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) {
+		if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID)
+			zpc.zpc_op = ZFS_PROJECT_OP_SET;
+		else
+			zpc.zpc_op = ZFS_PROJECT_OP_LIST;
+	}
+
+	/* reject options that make no sense for the selected operation */
+	switch (zpc.zpc_op) {
+	case ZFS_PROJECT_OP_LIST:
+		if (zpc.zpc_keep_projid) {
+			(void) fprintf(stderr,
+			    gettext("'-k' is only valid together with '-C'\n"));
+			usage(B_FALSE);
+		}
+		if (!zpc.zpc_newline) {
+			(void) fprintf(stderr,
+			    gettext("'-0' is only valid together with '-c'\n"));
+			usage(B_FALSE);
+		}
+		break;
+	case ZFS_PROJECT_OP_CHECK:
+		if (zpc.zpc_keep_projid) {
+			(void) fprintf(stderr,
+			    gettext("'-k' is only valid together with '-C'\n"));
+			usage(B_FALSE);
+		}
+		break;
+	case ZFS_PROJECT_OP_CLEAR:
+		if (zpc.zpc_dironly) {
+			(void) fprintf(stderr,
+			    gettext("'-d' is useless together with '-C'\n"));
+			usage(B_FALSE);
+		}
+		if (!zpc.zpc_newline) {
+			(void) fprintf(stderr,
+			    gettext("'-0' is only valid together with '-c'\n"));
+			usage(B_FALSE);
+		}
+		if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) {
+			(void) fprintf(stderr,
+			    gettext("'-p' is useless together with '-C'\n"));
+			usage(B_FALSE);
+		}
+		break;
+	case ZFS_PROJECT_OP_SET:
+		if (zpc.zpc_dironly) {
+			(void) fprintf(stderr,
+			    gettext("'-d' is useless for set project ID and/or "
+			    "inherit flag\n"));
+			usage(B_FALSE);
+		}
+		if (zpc.zpc_keep_projid) {
+			(void) fprintf(stderr,
+			    gettext("'-k' is only valid together with '-C'\n"));
+			usage(B_FALSE);
+		}
+		if (!zpc.zpc_newline) {
+			(void) fprintf(stderr,
+			    gettext("'-0' is only valid together with '-c'\n"));
+			usage(B_FALSE);
+		}
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	argv += optind;
+	argc -= optind;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    gettext("missing file or directory target(s)\n"));
+		usage(B_FALSE);
+	}
+
+	/* handle every target, remembering only the first failure */
+	for (int i = 0; i < argc; i++) {
+		int err;
+
+		err = zfs_project_handle(argv[i], &zpc);
+		if (err && !ret)
+			ret = err;
+	}
+
+	return (ret);
+}
+
+/*
+ * Wait for background activity on a filesystem to finish.
+ *
+ * By default every activity type is waited for; "-t <list>" restricts the
+ * wait to the listed activities.  Exactly one filesystem argument is
+ * required.
+ *
+ * Returns 1 if the filesystem cannot be opened, otherwise the last value
+ * returned by zfs_wait_status() (0 if it was never called).
+ */
+static int
+zfs_do_wait(int argc, char **argv)
+{
+	boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES];
+	/* stays 0 if "-t" ends up enabling no activity at all */
+	int error = 0, i;
+	/* int, not char: getopt() returns -1, which an unsigned char loses */
+	int c;
+
+	/* By default, wait for all types of activity. */
+	for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++)
+		enabled[i] = B_TRUE;
+
+	while ((c = getopt(argc, argv, "t:")) != -1) {
+		switch (c) {
+		case 't':
+		{
+			static char *col_subopts[] = { "deleteq", NULL };
+			char *value;
+
+			/* Reset activities array */
+			bzero(&enabled, sizeof (enabled));
+			while (*optarg != '\0') {
+				int activity = getsubopt(&optarg, col_subopts,
+				    &value);
+
+				if (activity < 0) {
+					(void) fprintf(stderr,
+					    gettext("invalid activity '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+
+				enabled[activity] = B_TRUE;
+			}
+			break;
+		}
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing 'filesystem' "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * Keep making passes over the enabled activities until one pass
+	 * completes without having had to wait for any of them.
+	 */
+	for (;;) {
+		boolean_t missing = B_FALSE;
+		boolean_t any_waited = B_FALSE;
+
+		for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) {
+			boolean_t waited;
+
+			if (!enabled[i])
+				continue;
+
+			error = zfs_wait_status(zhp, i, &missing, &waited);
+			if (error != 0 || missing)
+				break;
+
+			any_waited = (any_waited || waited);
+		}
+
+		if (error != 0 || missing || !any_waited)
+			break;
+	}
+
+	zfs_close(zhp);
+
+	return (error);
+}
+
+/*
+ * Display version message.
+ *
+ * Returns 1 when zfs_version_print() reports failure (-1) and 0 otherwise.
+ * Neither argument is examined.
+ */
+static int
+zfs_do_version(int argc, char **argv)
+{
+	return ((zfs_version_print() == -1) ? 1 : 0);
+}
+
+/*
+ * Program entry point: resolve the subcommand named by argv[1] (including
+ * the 'umount', 'recv' and 'snap' aliases and the "prop=value" shorthand for
+ * 'set'), run its handler from command_table and return the handler's
+ * result.
+ */
+int
+main(int argc, char **argv)
+{
+	int ret = 0;
+	int i = 0;
+	char *cmdname;
+	char **newargv;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+
+	/* option errors are reported by the individual commands */
+	opterr = 0;
+
+	/*
+	 * Make sure the user has specified some command.
+	 */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing command\n"));
+		usage(B_FALSE);
+	}
+
+	cmdname = argv[1];
+
+	/*
+	 * The 'umount' command is an alias for 'unmount'
+	 */
+	if (strcmp(cmdname, "umount") == 0)
+		cmdname = "unmount";
+
+	/*
+	 * The 'recv' command is an alias for 'receive'
+	 */
+	if (strcmp(cmdname, "recv") == 0)
+		cmdname = "receive";
+
+	/*
+	 * The 'snap' command is an alias for 'snapshot'
+	 */
+	if (strcmp(cmdname, "snap") == 0)
+		cmdname = "snapshot";
+
+	/*
+	 * Special case '-?'
+	 */
+	if ((strcmp(cmdname, "-?") == 0) ||
+	    (strcmp(cmdname, "--help") == 0))
+		usage(B_TRUE);
+
+	/*
+	 * Special case '-V|--version'
+	 */
+	if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0))
+		return (zfs_do_version(argc, argv));
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (1);
+	}
+
+	mnttab_file = g_zfs->libzfs_mnttab;
+
+	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	/*
+	 * Many commands modify input strings for string parsing reasons.
+	 * We create a copy to protect the original argv.
+	 *
+	 * The safe_*() allocators used elsewhere in this file are used here
+	 * so that an allocation failure is not silently dereferenced
+	 * (NOTE(review): they are expected never to return NULL -- confirm
+	 * against zfs_util).
+	 */
+	newargv = safe_malloc((argc + 1) * sizeof (newargv[0]));
+	for (i = 0; i < argc; i++)
+		newargv[i] = safe_strdup(argv[i]);
+	newargv[argc] = NULL;
+
+	/*
+	 * Run the appropriate command.
+	 */
+	libzfs_mnttab_cache(g_zfs, B_TRUE);
+	if (find_command_idx(cmdname, &i) == 0) {
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc - 1, newargv + 1);
+	} else if (strchr(cmdname, '=') != NULL) {
+		/* "zfs prop=value ..." is shorthand for "zfs set ..." */
+		verify(find_command_idx("set", &i) == 0);
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc, newargv);
+	} else {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(B_FALSE);
+		ret = 1;
+	}
+
+	for (i = 0; i < argc; i++)
+		free(newargv[i]);
+	free(newargv);
+
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
+
+#ifdef __FreeBSD__
+#include <sys/jail.h>
+#include <jail.h>
+/*
+ * Common code for zfs_do_jail() and zfs_do_unjail(): attach ('attach' is
+ * B_TRUE) or detach the dataset named by argv[2] to/from the jail identified
+ * by argv[1].  Exactly three argv entries are required.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+zfs_do_jail_impl(int argc, char **argv, boolean_t attach)
+{
+	zfs_handle_t *zhp;
+	int jid;
+	int failed;
+
+	/* check number of arguments */
+	if (argc != 3) {
+		(void) fprintf(stderr, "%s", (argc < 3) ?
+		    gettext("missing argument(s)\n") :
+		    gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if ((jid = jail_getid(argv[1])) < 0) {
+		(void) fprintf(stderr, gettext("invalid jail id or name\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	failed = (zfs_jail(zhp, jid, attach) != 0) ? 1 : 0;
+	zfs_close(zhp);
+
+	return (failed);
+}
+
+/*
+ * zfs jail jailid filesystem
+ *
+ * Attach the given dataset to the given jail by running zfs_do_jail_impl()
+ * with 'attach' set to B_TRUE.
+ */
+static int
+zfs_do_jail(int argc, char **argv)
+{
+	int rc;
+
+	rc = zfs_do_jail_impl(argc, argv, B_TRUE);
+	return (rc);
+}
+
+/*
+ * zfs unjail jailid filesystem
+ *
+ * Detach the given dataset from the given jail by running
+ * zfs_do_jail_impl() with 'attach' set to B_FALSE.
+ */
+static int
+zfs_do_unjail(int argc, char **argv)
+{
+	int rc;
+
+	rc = zfs_do_jail_impl(argc, argv, B_FALSE);
+	return (rc);
+}
+#endif
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_project.c b/sys/contrib/openzfs/cmd/zfs/zfs_project.c
new file mode 100644
index 000000000000..341cc005de48
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_project.c
@@ -0,0 +1,295 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ */
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <libintl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/list.h>
+#include <sys/zfs_project.h>
+
+#include "zfs_util.h"
+#include "zfs_projectutil.h"
+
+typedef struct zfs_project_item {
+	list_node_t	zpi_list;	/* linkage in the pending-dir list */
+	char		zpi_name[0];	/* path, stored inline after the node */
+} zfs_project_item_t;
+
+/* Queue a copy of 'name' at the tail of 'head' (node and path in one block). */
+static void
+zfs_project_item_alloc(list_t *head, const char *name)
+{
+	size_t namesz = strlen(name) + 1;	/* include the NUL */
+	zfs_project_item_t *item = safe_malloc(sizeof (*item) + namesz);
+	(void) memcpy(item->zpi_name, name, namesz);
+	list_insert_tail(head, item);
+}
+
+/*
+ * stat() 'name' into 'st' and reject targets the requested options cannot
+ * apply to.  Returns 0 if usable, non-zero after printing the reason.
+ */
+static int
+zfs_project_sanity_check(const char *name, zfs_project_control_t *zpc,
+    struct stat *st)
+{
+	int rc = stat(name, st);
+
+	if (rc != 0) {
+		(void) fprintf(stderr, gettext("failed to stat %s: %s\n"),
+		    name, strerror(errno));
+		return (rc);
+	}
+	if (S_ISDIR(st->st_mode))
+		return (0);		/* directories accept every option */
+	if (!S_ISREG(st->st_mode)) {
+		(void) fprintf(stderr, gettext("only support project quota on "
+		    "regular file or directory\n"));
+		return (-1);
+	}
+	/* regular file: '-d' and '-r' are rejected on non-directories */
+	if (zpc->zpc_dironly) {
+		(void) fprintf(stderr, gettext(
+		    "'-d' option on non-dir target %s\n"), name);
+		return (-1);
+	}
+	if (zpc->zpc_recursive) {
+		(void) fprintf(stderr, gettext(
+		    "'-r' option on non-dir target %s\n"), name);
+		return (-1);
+	}
+	return (0);
+}
+
+/* Use the project ID currently set on 'name' as the expected project ID. */
+static int
+zfs_project_load_projid(const char *name, zfs_project_control_t *zpc)
+{
+	zfsxattr_t attrs;
+	int rc, fd = open(name, O_RDONLY | O_NOCTTY);
+
+	if (fd < 0) {
+		(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+		    name, strerror(errno));
+		return (fd);
+	}
+	/* on success remember the ID, on failure only report the error */
+	rc = ioctl(fd, ZFS_IOC_FSGETXATTR, &attrs);
+	if (rc == 0) {
+		zpc->zpc_expected_projid = attrs.fsx_projid;
+	} else {
+		(void) fprintf(stderr,
+		    gettext("failed to get xattr for %s: %s\n"),
+		    name, strerror(errno));
+	}
+	close(fd);
+	return (rc);
+}
+
+/*
+ * Apply the operation selected by zpc->zpc_op to the single object 'name'.
+ * LIST and CHECK only print; CLEAR and SET write the attributes back when
+ * something has to change.  Returns 0 on success, non-zero otherwise.
+ */
+static int
+zfs_project_handle_one(const char *name, zfs_project_control_t *zpc)
+{
+	zfsxattr_t fsx;
+	int ret, fd;
+
+	fd = open(name, O_RDONLY | O_NOCTTY);
+	if (fd < 0) {
+		if (errno == ENOENT && zpc->zpc_ignore_noent)
+			return (0);
+		(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+		    name, strerror(errno));
+		return (fd);
+	}
+
+	ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx);
+	if (ret) {
+		(void) fprintf(stderr,
+		    gettext("failed to get xattr for %s: %s\n"),
+		    name, strerror(errno));
+		goto out;
+	}
+
+	switch (zpc->zpc_op) {
+	case ZFS_PROJECT_OP_LIST:
+		(void) printf("%5u %c %s\n", fsx.fsx_projid,
+		    (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) ? 'P' : '-', name);
+		goto out;
+	case ZFS_PROJECT_OP_CHECK:
+		if (fsx.fsx_projid == zpc->zpc_expected_projid &&
+		    fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
+			goto out;
+		/* mismatch, terse mode: print only the name followed by NUL */
+		if (!zpc->zpc_newline) {
+			char c = '\0';
+			(void) printf("%s%c", name, c);
+			goto out;
+		}
+		/* mismatch, verbose mode: say what is wrong, one line each */
+		if (fsx.fsx_projid != zpc->zpc_expected_projid)
+			(void) printf("%s - project ID is not set properly "
+			    "(%u/%u)\n", name, fsx.fsx_projid,
+			    (uint32_t)zpc->zpc_expected_projid);
+		if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+			(void) printf("%s - project inherit flag is not set\n",
+			    name);
+		goto out;
+	case ZFS_PROJECT_OP_CLEAR:
+		if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL) &&
+		    (zpc->zpc_keep_projid ||
+		    fsx.fsx_projid == ZFS_DEFAULT_PROJID))
+			goto out;
+		/* drop the inherit flag and, unless kept, reset the ID */
+		fsx.fsx_xflags &= ~ZFS_PROJINHERIT_FL;
+		if (!zpc->zpc_keep_projid)
+			fsx.fsx_projid = ZFS_DEFAULT_PROJID;
+		break;
+	case ZFS_PROJECT_OP_SET:
+		if (fsx.fsx_projid == zpc->zpc_expected_projid &&
+		    (!zpc->zpc_set_flag || fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+			goto out;
+		/* stamp the expected ID and, if asked, the inherit flag */
+		fsx.fsx_projid = zpc->zpc_expected_projid;
+		if (zpc->zpc_set_flag)
+			fsx.fsx_xflags |= ZFS_PROJINHERIT_FL;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+	/* CLEAR and SET (and the asserted default) fall through to here */
+	ret = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx);
+	if (ret)
+		(void) fprintf(stderr,
+		    gettext("failed to set xattr for %s: %s\n"),
+		    name, strerror(errno));
+out:
+	close(fd);
+	return (ret);
+}
+
+/* Handle each entry of directory 'name'; queue sub-dirs on 'head' for -r. */
+static int
+zfs_project_handle_dir(const char *name, zfs_project_control_t *zpc,
+    list_t *head)
+{
+	char fullname[PATH_MAX];
+	struct dirent *ent;
+	DIR *dir;
+	int ret = 0;
+
+	dir = opendir(name);
+	if (dir == NULL) {
+		if (errno == ENOENT && zpc->zpc_ignore_noent)
+			return (0);
+		ret = -errno;
+		(void) fprintf(stderr, gettext("failed to opendir %s: %s\n"),
+		    name, strerror(errno));
+		return (ret);
+	}
+
+	/* Non-top item, ignore the case of being removed or renamed by race. */
+	zpc->zpc_ignore_noent = B_TRUE;
+	for (;;) {
+		/* reset per call: NULL with errno == 0 is end-of-directory */
+		errno = 0;
+		if (ret != 0 || (ent = readdir(dir)) == NULL)
+			break;
+		if (strcmp(ent->d_name, ".") == 0 ||
+		    strcmp(ent->d_name, "..") == 0)
+			continue;
+		/* fullname must hold name + '/' + d_name + NUL */
+		if (strlen(name) + strlen(ent->d_name) + 2 >
+		    sizeof (fullname)) {
+			errno = ENAMETOOLONG;
+			break;
+		}
+		(void) snprintf(fullname, sizeof (fullname), "%s/%s",
+		    name, ent->d_name);
+		ret = zfs_project_handle_one(fullname, zpc);
+		if (!ret && zpc->zpc_recursive && ent->d_type == DT_DIR)
+			zfs_project_item_alloc(head, fullname);
+	}
+	if (errno && !ret) {
+		ret = -errno;
+		(void) fprintf(stderr, gettext("failed to readdir %s: %s\n"),
+		    name, strerror(errno));
+	}
+	closedir(dir);
+	return (ret);
+}
+
+/*
+ * Entry point: validate 'name', apply the operation to it and, for a
+ * directory that is to be descended, to its contents in FIFO order.
+ */
+int
+zfs_project_handle(const char *name, zfs_project_control_t *zpc)
+{
+	zfs_project_item_t *item;
+	struct stat sb;
+	list_t pending;
+	int rc, by_id, descend;
+
+	if ((rc = zfs_project_sanity_check(name, zpc, &sb)) != 0)
+		return (rc);
+	/* SET/CHECK with ZFS_INVALID_PROJID take the ID from the target */
+	by_id = (zpc->zpc_op == ZFS_PROJECT_OP_SET ||
+	    zpc->zpc_op == ZFS_PROJECT_OP_CHECK);
+	if (by_id && zpc->zpc_expected_projid == ZFS_INVALID_PROJID &&
+	    (rc = zfs_project_load_projid(name, zpc)) != 0)
+		return (rc);
+
+	zpc->zpc_ignore_noent = B_FALSE;
+	rc = zfs_project_handle_one(name, zpc);
+	/* LIST and CHECK look one level down even without '-r' */
+	descend = zpc->zpc_recursive || zpc->zpc_op == ZFS_PROJECT_OP_LIST ||
+	    zpc->zpc_op == ZFS_PROJECT_OP_CHECK;
+	if (rc != 0 || !S_ISDIR(sb.st_mode) || zpc->zpc_dironly || !descend)
+		return (rc);
+
+	list_create(&pending, sizeof (zfs_project_item_t),
+	    offsetof(zfs_project_item_t, zpi_list));
+	for (zfs_project_item_alloc(&pending, name);
+	    (item = list_remove_head(&pending)) != NULL; free(item)) {
+		if (rc == 0)
+			rc = zfs_project_handle_dir(item->zpi_name, zpc,
+			    &pending);
+	}
+	return (rc);
+}
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h
new file mode 100644
index 000000000000..1792a3383a03
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ */
+
+#ifndef	_ZFS_PROJECTUTIL_H
+#define	_ZFS_PROJECTUTIL_H
+
+typedef enum {
+	ZFS_PROJECT_OP_DEFAULT	= 0,	/* presumably "none chosen" - confirm */
+	ZFS_PROJECT_OP_LIST	= 1,	/* print project ID and inherit flag */
+	ZFS_PROJECT_OP_CHECK	= 2,	/* report ID/flag mismatches */
+	ZFS_PROJECT_OP_CLEAR	= 3,	/* clear inherit flag, reset ID */
+	ZFS_PROJECT_OP_SET	= 4,	/* set ID and optionally the flag */
+} zfs_project_ops_t;
+
+typedef struct zfs_project_control {
+	uint64_t		zpc_expected_projid;	/* ID to set/check */
+	zfs_project_ops_t	zpc_op;		/* requested operation */
+	boolean_t		zpc_dironly;	/* '-d': do not descend */
+	boolean_t		zpc_ignore_noent; /* tolerate ENOENT on open */
+	boolean_t		zpc_keep_projid; /* CLEAR keeps the ID */
+	boolean_t		zpc_newline;	/* CHECK: explanatory lines */
+	boolean_t		zpc_recursive;	/* '-r': descend */
+	boolean_t		zpc_set_flag;	/* SET also sets inherit flag */
+} zfs_project_control_t;
+/* Apply zpc->zpc_op to 'name' (and, for directories, its contents). */
+int zfs_project_handle(const char *name, zfs_project_control_t *zpc);
+
+#endif	/* _ZFS_PROJECTUTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_util.h b/sys/contrib/openzfs/cmd/zfs/zfs_util.h
new file mode 100644
index 000000000000..a56af59adb15
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_util.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_ZFS_UTIL_H
+#define	_ZFS_UTIL_H
+
+#include <libzfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+void * safe_malloc(size_t size);	/* presumably never returns NULL - confirm */
+void nomem(void);		/* presumably the out-of-memory exit path */
+extern libzfs_handle_t *g_zfs;	/* process-wide libzfs handle */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFS_UTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore
new file mode 100644
index 000000000000..f95f853e48c2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore
@@ -0,0 +1 @@
+zfs_ids_to_path
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am
new file mode 100644
index 000000000000..176eeb3c72c5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am
@@ -0,0 +1,9 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zfs_ids_to_path
+
+zfs_ids_to_path_SOURCES = \
+	zfs_ids_to_path.c
+
+zfs_ids_to_path_LDADD = \
+        $(abs_top_builddir)/lib/libzfs/libzfs.la
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c
new file mode 100644
index 000000000000..6cfaa6f41fa5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+#include <libintl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <libzfs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+libzfs_handle_t *g_zfs;
+
+static void
+usage(int err)
+{
+	(void) fprintf(stderr, "Usage: zfs_ids_to_path [-v] <pool> "
+	    "<objset id> <object id>\n");	/* synopsis, on stderr */
+	exit(err);				/* 'err' is the exit status */
+}
+
+/* Resolve <pool> <objset id> <object id> to a path and print it on stdout. */
+int
+main(int argc, char **argv)
+{
+	boolean_t verbose = B_FALSE;
+	int c;	/* int, not char: getopt() returns -1 when done */
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+	/* from here on: argv[0] = pool, argv[1] = objset, argv[2] = object */
+	if (argc != 3) {
+		(void) fprintf(stderr, "Incorrect number of arguments: %d\n",
+		    argc);
+		usage(1);
+	}
+	uint64_t objset, object;
+	if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) {
+		(void) fprintf(stderr, "Invalid objset id: %s\n", argv[1]);
+		usage(2);
+	}
+	if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) {
+		(void) fprintf(stderr, "Invalid object id: %s\n", argv[2]);
+		usage(3);
+	}
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (4);
+	}
+	zpool_handle_t *pool = zpool_open(g_zfs, argv[0]);
+	if (pool == NULL) {
+		fprintf(stderr, "Could not open pool %s\n", argv[0]);
+		libzfs_fini(g_zfs);
+		return (5);
+	}
+
+	char pathname[PATH_MAX * 2];
+	if (verbose) {
+		zpool_obj_to_path_ds(pool, objset, object, pathname,
+		    sizeof (pathname));
+	} else {
+		zpool_obj_to_path(pool, objset, object, pathname,
+		    sizeof (pathname));
+	}
+	printf("%s\n", pathname);
+	zpool_close(pool);
+	libzfs_fini(g_zfs);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am
new file mode 100644
index 000000000000..69c99ca9d828
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zgenhostid
diff --git a/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
new file mode 100755
index 000000000000..8b468740c72b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Emulate genhostid(1) available on RHEL/CENTOS, for use on distros
+# which do not provide that utility.
+#
+# Usage:
+#    zgenhostid
+#    zgenhostid <value>
+#
+# If /etc/hostid already exists and is size > 0, the script exits immediately
+# and changes nothing.  Unlike genhostid, this generates an error message.
+#
+# The first form generates a random hostid and stores it in /etc/hostid.
+# The second form checks that the provided value is between 0x1 and 0xFFFFFFFF
+# and if so, stores it in /etc/hostid.  This form is not supported by
+# genhostid(1).
+
+hostid_file=/etc/hostid
+
+function usage {
+	echo "$0 [value]" >&2
+	echo "If $hostid_file is not present, store a hostid in it." >&2
+	echo "The optional value must be an 8-digit hex number between" >&2
+	echo "1 and 2^32-1.  If no value is provided, a random one will" >&2
+	echo "be generated.  The value must be unique among your systems." >&2
+}
+
+# hostid(1) ignores contents of /etc/hostid if size < 4 bytes.  It would
+# be better if this checked size >= 4 bytes but the method must be
+# widely portable.
+if [ -s "$hostid_file" ]; then
+	echo "$hostid_file already exists.  No change made." >&2
+	exit 1
+fi
+
+if [ -n "$1" ]; then
+	host_id=$1
+else
+	# $RANDOM goes from 0..32k-1
+	number=$((((RANDOM % 4) * 32768 + RANDOM) * 32768 + RANDOM))
+	host_id=$(printf "%08x" "$number")
+fi
+
+if grep -Eq '^0{8}$' <<< "$host_id" 2>/dev/null; then
+	usage
+	exit 2	# an all-zero hostid is rejected
+fi
+
+if ! grep -Eq '^[a-fA-F0-9]{8}$' <<< "$host_id" 2>/dev/null; then
+	usage
+	exit 3	# not exactly eight hex digits
+fi
+
+a=${host_id:6:2}
+b=${host_id:4:2}
+c=${host_id:2:2}
+d=${host_id:0:2}
+# the four bytes are written least-significant first
+echo -ne "\\x$a\\x$b\\x$c\\x$d" > "$hostid_file"
+
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zhack/.gitignore b/sys/contrib/openzfs/cmd/zhack/.gitignore
new file mode 100644
index 000000000000..763a18898b88
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/.gitignore
@@ -0,0 +1 @@
+/zhack
diff --git a/sys/contrib/openzfs/cmd/zhack/Makefile.am b/sys/contrib/openzfs/cmd/zhack/Makefile.am
new file mode 100644
index 000000000000..5cddac32b5ac
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/Makefile.am
@@ -0,0 +1,14 @@
+include $(top_srcdir)/config/Rules.am
+
+# Unconditionally enable debugging for zhack
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = zhack
+
+zhack_SOURCES = \
+	zhack.c
+
+zhack_LDADD = \
+	$(abs_top_builddir)/lib/libzpool/libzpool.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zhack/zhack.c b/sys/contrib/openzfs/cmd/zhack/zhack.c
new file mode 100644
index 000000000000..4d958fe4365a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/zhack.c
@@ -0,0 +1,532 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+/*
+ * zhack is a debugging tool that can write changes to ZFS pool using libzpool
+ * for testing purposes. Altering pools with zhack is unsupported and may
+ * result in corrupted pools.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_synctask.h>
+#include <sys/vdev.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
+#include <libzutil.h>
+
+extern boolean_t zfeature_checks_disable;
+
+const char cmdname[] = "zhack";
+static importargs_t g_importargs;
+static char *g_pool;
+static boolean_t g_readonly;
+
+/* Print the zhack synopsis on stderr and exit with status 1. */
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
+	    "where <subcommand> <args> is one of the following:\n"
+	    "\n"
+	    "    feature stat <pool>\n"
+	    "        print information about enabled features\n"
+	    "    feature enable [-r] [-d desc] <pool> <feature>\n"
+	    "        add a new enabled feature to the pool\n"
+	    "        -d <desc> sets the feature's description\n"
+	    "        -r set read-only compatible flag for feature\n"
+	    "    feature ref [-md] <pool> <feature>\n"
+	    "        change the refcount on the given feature\n"
+	    "        -d decrease instead of increase the refcount\n"
+	    "        -m add the feature to the label if increasing refcount\n"
+	    "\n"
+	    "    <feature> : should be a feature guid\n",
+	    cmdname);
+	exit(1);
+}
+
+
+/* Close/export 'spa' if non-NULL, print "<cmdname>: <message>", exit(1). */
+static void
+fatal(spa_t *spa, void *tag, const char *fmt, ...)
+{
+	va_list args;
+
+	if (spa != NULL) {
+		spa_close(spa, tag);
+		(void) spa_export(g_pool, NULL, B_TRUE, B_FALSE);
+	}
+
+	(void) fprintf(stderr, "%s: ", cmdname);
+	va_start(args, fmt);
+	(void) vfprintf(stderr, fmt, args);
+	va_end(args);
+	(void) fputc('\n', stderr);
+	exit(1);
+}
+
+/*
+ * User-accounting callback: abort if a znode/SA object would be modified,
+ * otherwise return ENOENT (not a type of object to track).
+ */
+static int
+space_delta_cb(dmu_object_type_t bonustype, const void *data,
+    zfs_file_info_t *zoi)
+{
+	if (bonustype == DMU_OT_ZNODE || bonustype == DMU_OT_SA) {
+		(void) fprintf(stderr,
+		    "modifying object that needs user accounting");
+		abort();
+	}
+	return (ENOENT);
+}
+
+/*
+ * Target is the dataset whose pool we want to open.  Brings up the userland
+ * kernel emulation, locates the pool config and imports it; errors are fatal.
+ */
+static void
+zhack_import(char *target, boolean_t readonly)
+{
+	nvlist_t *config;
+	nvlist_t *props = NULL;
+	int error;
+
+	if (readonly)
+		kernel_init(SPA_MODE_READ);
+	else
+		kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+	dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
+
+	/* remembered globally: fatal() and main() export g_pool later */
+	g_readonly = readonly;
+	g_importargs.can_be_active = readonly;
+	g_pool = strdup(target);
+
+	error = zpool_find_config(NULL, target, &config, &g_importargs,
+	    &libzpool_config_ops);
+	if (error != 0)
+		fatal(NULL, FTAG, "cannot import '%s'", target);
+
+	if (readonly) {
+		VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+		VERIFY(nvlist_add_uint64(props,
+		    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
+	}
+	/* feature checks are off only for the duration of the import */
+	zfeature_checks_disable = B_TRUE;
+	error = spa_import(target, config, props,
+	    (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
+	zfeature_checks_disable = B_FALSE;
+	/* EEXIST is not treated as a failure */
+	if (error != 0 && error != EEXIST)
+		fatal(NULL, FTAG, "can't import '%s': %s", target,
+		    strerror(error));
+}
+
+/* Import 'target', then open its spa with hold 'tag' (features required). */
+static void
+zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa)
+{
+	int err;
+
+	zhack_import(target, readonly);
+	zfeature_checks_disable = B_TRUE;
+	err = spa_open(target, spa, tag);
+	zfeature_checks_disable = B_FALSE;
+	/* on failure no hold is assumed to exist, so nothing is closed */
+	if (err != 0)
+		fatal(NULL, FTAG, "cannot open '%s': %s", target,
+		    strerror(err));
+	/* fatal() closes the spa: pass the tag the hold was taken with */
+	if (spa_version(*spa) < SPA_VERSION_FEATURES)
+		fatal(*spa, tag, "'%s' has version %d, features not enabled",
+		    target, (int)spa_version(*spa));
+}
+
+/* Print every entry of ZAP object 'obj' under the heading "<name>_obj:". */
+static void
+dump_obj(objset_t *os, uint64_t obj, const char *name)
+{
+	zap_cursor_t cursor;
+	zap_attribute_t attr;
+	char str[1024];
+
+	(void) printf("%s_obj:\n", name);
+	zap_cursor_init(&cursor, os, obj);
+	while (zap_cursor_retrieve(&cursor, &attr) == 0) {
+		if (attr.za_integer_length == 8) {
+			ASSERT(attr.za_num_integers == 1);
+			(void) printf("\t%s = %llu\n", attr.za_name,
+			    (u_longlong_t)attr.za_first_integer);
+		} else {
+			ASSERT(attr.za_integer_length == 1);
+			VERIFY(zap_lookup(os, obj, attr.za_name,
+			    1, sizeof (str), str) == 0);
+			(void) printf("\t%s = %s\n", attr.za_name, str);
+		}
+		zap_cursor_advance(&cursor);
+	}
+	zap_cursor_fini(&cursor);
+}
+
+/*
+ * Print the names stored in spa->spa_label_features under "label config:".
+ */
+static void
+dump_mos(spa_t *spa)
+{
+	nvlist_t *features = spa->spa_label_features;
+	nvpair_t *elem = NULL;
+
+	(void) printf("label config:\n");
+	while ((elem = nvlist_next_nvpair(features, elem)) != NULL)
+		(void) printf("\t%s\n", nvpair_name(elem));
+}
+
+/*
+ * 'zhack feature stat <pool>': open the pool read-only and dump its feature
+ * ZAP objects and label features.  argv[0] is the subcommand name.
+ */
+static void
+zhack_do_feature_stat(int argc, char **argv)
+{
+	spa_t *spa;
+	objset_t *mos;
+
+	/* the pool name is the first word after "stat" */
+	if (argc < 2) {
+		(void) fprintf(stderr, "error: missing pool name\n");
+		usage();
+	}
+
+	zhack_spa_open(argv[1], B_TRUE, FTAG, &spa);
+	mos = spa->spa_meta_objset;
+
+	dump_obj(mos, spa->spa_feat_for_read_obj, "for_read");
+	dump_obj(mos, spa->spa_feat_for_write_obj, "for_write");
+	dump_obj(mos, spa->spa_feat_desc_obj, "descriptions");
+	/* dumped only while the enabled_txg feature is active */
+	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG))
+		dump_obj(mos, spa->spa_feat_enabled_txg_obj, "enabled_txg");
+	dump_mos(spa);
+
+	spa_close(spa, FTAG);
+}
+
+/* Sync task: enable the feature passed in 'arg' and log it in pool history. */
+static void
+zhack_feature_enable_sync(void *arg, dmu_tx_t *tx)
+{
+	zfeature_info_t *fi = arg;
+	spa_t *pool = dmu_tx_pool(tx)->dp_spa;
+
+	feature_enable_sync(pool, fi, tx);
+
+	spa_history_log_internal(pool, "zhack enable feature", tx,
+	    "name=%s flags=%u", fi->fi_guid, fi->fi_flags);
+}
+
+/*
+ * 'zhack feature enable [-r] [-d desc] <pool> <feature>': add a made-up
+ * feature guid to the pool; real or already-enabled features are refused.
+ */
+static void
+zhack_do_feature_enable(int argc, char **argv)
+{
+	int c;
+	char *desc, *target;
+	spa_t *spa;
+	objset_t *mos;
+	zfeature_info_t feature;
+	spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+	/*
+	 * Features are not added to the pool's label until their refcounts
+	 * are incremented, so fi_mos can just be left as false for now.
+	 */
+	desc = NULL;
+	feature.fi_uname = "zhack";
+	feature.fi_flags = 0;
+	feature.fi_depends = nodeps;
+	feature.fi_feature = SPA_FEATURE_NONE;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "+rd:")) != -1) {
+		switch (c) {
+		case 'r':
+			feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+			break;
+		case 'd':
+			desc = strdup(optarg);
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	if (desc == NULL)
+		desc = strdup("zhack injected");
+	feature.fi_desc = desc;
+	argc -= optind;
+	argv += optind;
+	if (argc < 2) {
+		(void) fprintf(stderr, "error: missing feature or pool name\n");
+		usage();
+	}
+	target = argv[0];
+	feature.fi_guid = argv[1];
+
+	if (!zfeature_is_valid_guid(feature.fi_guid))
+		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	mos = spa->spa_meta_objset;
+
+	if (zfeature_is_supported(feature.fi_guid))
+		fatal(spa, FTAG, "'%s' is a real feature, will not enable",
+		    feature.fi_guid);
+	if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
+		fatal(spa, FTAG, "feature already enabled: %s",
+		    feature.fi_guid);
+
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL));
+	spa_close(spa, FTAG);
+	free(desc);
+}
+
+/* Sync task: raise the on-disk refcount of the feature in 'arg' by one. */
+static void
+feature_incr_sync(void *arg, dmu_tx_t *tx)
+{
+	zfeature_info_t *fi = arg;
+	spa_t *pool = dmu_tx_pool(tx)->dp_spa;
+	uint64_t count;
+	VERIFY0(feature_get_refcount_from_disk(pool, fi, &count));
+	feature_sync(pool, fi, count + 1, tx);
+	spa_history_log_internal(pool, "zhack feature incr", tx,
+	    "name=%s", fi->fi_guid);
+}
+
+/* Sync task: lower the on-disk refcount of the feature in 'arg' by one. */
+static void
+feature_decr_sync(void *arg, dmu_tx_t *tx)
+{
+	zfeature_info_t *fi = arg;
+	spa_t *pool = dmu_tx_pool(tx)->dp_spa;
+	uint64_t count;
+	VERIFY0(feature_get_refcount_from_disk(pool, fi, &count));
+	feature_sync(pool, fi, count - 1, tx);
+	spa_history_log_internal(pool, "zhack feature decr", tx,
+	    "name=%s", fi->fi_guid);
+}
+
+/*
+ * 'zhack feature ref [-md] <pool> <feature>': raise (or, with -d, lower) the
+ * refcount of an enabled, zhack-injected feature; real features are refused.
+ */
+static void
+zhack_do_feature_ref(int argc, char **argv)
+{
+	int c;
+	char *target;
+	boolean_t decr = B_FALSE;
+	spa_t *spa;
+	objset_t *mos;
+	zfeature_info_t feature;
+	spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+	/*
+	 * fi_desc does not matter here because it was written to disk
+	 * when the feature was enabled, but we need to properly set the
+	 * feature for read or write based on the information we read off
+	 * disk later.
+	 */
+	feature.fi_uname = "zhack";
+	feature.fi_flags = 0;
+	feature.fi_desc = NULL;
+	feature.fi_depends = nodeps;
+	feature.fi_feature = SPA_FEATURE_NONE;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "+md")) != -1) {
+		switch (c) {
+		case 'm':
+			feature.fi_flags |= ZFEATURE_FLAG_MOS;
+			break;
+		case 'd':
+			decr = B_TRUE;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+	if (argc < 2) {
+		(void) fprintf(stderr, "error: missing feature or pool name\n");
+		usage();
+	}
+	target = argv[0];
+	feature.fi_guid = argv[1];
+
+	if (!zfeature_is_valid_guid(feature.fi_guid))
+		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	mos = spa->spa_meta_objset;
+	if (zfeature_is_supported(feature.fi_guid))
+		fatal(spa, FTAG,
+		    "'%s' is a real feature, will not change refcount",
+		    feature.fi_guid);
+
+	if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
+	    feature.fi_guid)) {
+		feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT;
+	} else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
+	    feature.fi_guid)) {
+		feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+	} else {
+		fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid);
+	}
+
+	if (decr) {
+		uint64_t count;
+		if (feature_get_refcount_from_disk(spa, &feature,
+		    &count) == 0 && count == 0) {
+			fatal(spa, FTAG, "feature refcount already 0: %s",
+			    feature.fi_guid);
+		}
+	}
+
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    decr ? feature_decr_sync : feature_incr_sync, &feature,
+	    5, ZFS_SPACE_CHECK_NORMAL));
+	spa_close(spa, FTAG);
+}
+
+/* Dispatch 'zhack feature <op> ...' to the matching handler; returns 0. */
+static int
+zhack_do_feature(int argc, char **argv)
+{
+	static const struct {
+		const char *name;
+		void (*handler)(int, char **);
+	} ops[] = {
+		{ "stat", zhack_do_feature_stat },
+		{ "enable", zhack_do_feature_enable },
+		{ "ref", zhack_do_feature_ref },
+	};
+	if (--argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no feature operation specified\n");
+		usage();
+	}
+	argv++;
+	for (size_t i = 0; i < sizeof (ops) / sizeof (ops[0]); i++) {
+		if (strcmp(argv[0], ops[i].name) == 0) {
+			ops[i].handler(argc, argv);
+			return (0);
+		}
+	}
+	(void) fprintf(stderr, "error: unknown subcommand: %s\n", argv[0]);
+	usage();
+	return (0);
+}
+
+#define	MAX_NUM_PATHS 1024
+
+/* zhack entry point: parse global options, then run the subcommand. */
+int
+main(int argc, char **argv)
+{
+	extern void zfs_prop_init(void);
+
+	char *path[MAX_NUM_PATHS];
+	const char *subcommand;
+	int rv = 0;
+	int c;
+
+	g_importargs.path = path;
+
+	dprintf_setup(&argc, argv);
+	zfs_prop_init();
+
+	while ((c = getopt(argc, argv, "+c:d:")) != -1) {
+		switch (c) {
+		case 'c':
+			g_importargs.cachefile = optarg;
+			break;
+		case 'd':
+			assert(g_importargs.paths < MAX_NUM_PATHS);
+			g_importargs.path[g_importargs.paths++] = optarg;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	/* step past the global options so argv[0] is the subcommand */
+	argc -= optind;
+	argv += optind;
+	optind = 1;
+
+	if (argc == 0) {
+		(void) fprintf(stderr, "error: no command specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+	if (strcmp(subcommand, "feature") == 0) {
+		rv = zhack_do_feature(argc, argv);
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+	/* pools imported writable are exported before exiting */
+	if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) {
+		fatal(NULL, FTAG, "pool export failed; "
+		    "changes may not be committed to disk\n");
+	}
+
+	kernel_fini();
+
+	return (rv);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/.gitignore b/sys/contrib/openzfs/cmd/zinject/.gitignore
new file mode 100644
index 000000000000..bded8400996c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/.gitignore
@@ -0,0 +1 @@
+/zinject
diff --git a/sys/contrib/openzfs/cmd/zinject/Makefile.am b/sys/contrib/openzfs/cmd/zinject/Makefile.am
new file mode 100644
index 000000000000..091d92cd6026
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/Makefile.am
@@ -0,0 +1,13 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zinject
+
+zinject_SOURCES = \
+	translate.c \
+	zinject.c \
+	zinject.h
+
+zinject_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zinject/translate.c b/sys/contrib/openzfs/cmd/zinject/translate.c
new file mode 100644
index 000000000000..4939c0b85b5f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/translate.c
@@ -0,0 +1,397 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
+
+#include <libzfs.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/file.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/vdev_impl.h>
+
+#include <sys/mkdev.h>
+
+#include "zinject.h"
+
+static int debug;	/* nonzero enables ziprintf() tracing */
+
+/* printf()-style trace output to stdout, emitted only when 'debug' is set. */
+static void
+ziprintf(const char *fmt, ...)
+{
+	va_list args;
+
+	if (debug) {
+		va_start(args, fmt);
+		(void) vprintf(fmt, args);
+		va_end(args);
+	}
+}
+
+/* Copy 'src' to 'dest', collapsing each run of '/' into a single '/'. */
+static void
+compress_slashes(const char *src, char *dest)
+{
+	for (; *src != '\0'; src++) {
+		/* drop a slash that is directly followed by another */
+		if (src[0] != '/' || src[1] != '/')
+			*dest++ = *src;
+	}
+	*dest = '\0';
+}
+
+/*
+ * Given a full path to a file, translate into a dataset name and a relative
+ * path within the dataset.  'dataset' must hold at least MAXNAMELEN bytes and
+ * 'relpath' at least MAXPATHLEN bytes.  'statbuf' is passed to getextmntent()
+ * so the caller can take the object ID from it.  Returns 0, or -1 on error.
+ */
+static int
+parse_pathname(const char *inpath, char *dataset, char *relpath,
+    struct stat64 *statbuf)
+{
+	struct extmnttab mp;
+	const char *rel;
+	char fullpath[MAXPATHLEN];
+
+	compress_slashes(inpath, fullpath);
+
+	if (fullpath[0] != '/') {
+		(void) fprintf(stderr, "invalid object '%s': must be full "
+		    "path\n", fullpath);
+		usage();
+		return (-1);
+	}
+
+	if (getextmntent(fullpath, &mp, statbuf) != 0) {
+		(void) fprintf(stderr, "cannot find mountpoint for '%s'\n",
+		    fullpath);
+		return (-1);
+	}
+
+	if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) {
+		(void) fprintf(stderr, "invalid path '%s': not a ZFS "
+		    "filesystem\n", fullpath);
+		return (-1);
+	}
+
+	if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) {
+		(void) fprintf(stderr, "invalid path '%s': mountpoint "
+		    "doesn't match path\n", fullpath);
+		return (-1);
+	}
+
+	(void) strlcpy(dataset, mp.mnt_special, MAXNAMELEN);
+
+	rel = fullpath + strlen(mp.mnt_mountp);
+	if (rel[0] == '/')
+		rel++;
+	(void) strlcpy(relpath, rel, MAXPATHLEN);
+
+	return (0);
+}
+
+/*
+ * Record the objset ID of 'dataset' and the given object number in 'record'.
+ * Returns -1 if the dataset cannot be opened, otherwise 0.
+ */
+static int
+object_from_path(const char *dataset, uint64_t object, zinject_record_t *record)
+{
+	zfs_handle_t *ds_hdl = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET);
+
+	if (ds_hdl == NULL)
+		return (-1);
+
+	/* The caller supplies the inode number as the object number. */
+	record->zi_object = object;
+	record->zi_objset = zfs_prop_get_int(ds_hdl, ZFS_PROP_OBJSETID);
+
+	zfs_close(ds_hdl);
+	return (0);
+}
+
+/*
+ * Fill in the range and level of 'record'.
+ *
+ *	type	injection type; only TYPE_DNODE is treated specially here
+ *	level	stored in record->zi_level on success
+ *	range	decimal "start[,end]" string, or NULL for the whole object
+ *
+ * Returns 0 on success, or -1 (after printing a message to stderr) if the
+ * range is malformed or is given together with TYPE_DNODE.
+ */
+static int
+initialize_range(err_type_t type, int level, char *range,
+    zinject_record_t *record)
+{
+	if (range != NULL) {
+		char *rest;
+
+		/*
+		 * Parse the decimal "start[,end]" string.  A lone start value
+		 * is given an end of start + 1.
+		 */
+		/* XXX add support for suffixes */
+		record->zi_start = strtoull(range, &rest, 10);
+
+		if (*rest == ',')
+			record->zi_end = strtoull(rest + 1, &rest, 10);
+		else if (*rest == '\0')
+			record->zi_end = record->zi_start + 1;
+
+		if (*rest != '\0') {
+			(void) fprintf(stderr, "invalid range '%s': must be "
+			    "a numeric range of the form 'start[,end]'\n",
+			    range);
+			return (-1);
+		}
+	} else {
+		/*
+		 * If range is unspecified, set the range to [0,-1], which
+		 * indicates that the whole object should be treated as an
+		 * error.
+		 */
+		record->zi_start = 0;
+		record->zi_end = -1ULL;
+	}
+
+	if (type == TYPE_DNODE) {
+		/*
+		 * If this is a request to inject faults into the dnode, then we
+		 * must translate the current (objset,object) pair into an
+		 * offset within the metadnode for the objset.  Specifying any
+		 * kind of range with type 'dnode' is illegal.
+		 */
+		if (range != NULL) {
+			(void) fprintf(stderr, "range cannot be specified when "
+			    "type is 'dnode'\n");
+			return (-1);
+		}
+
+		record->zi_start = record->zi_object * sizeof (dnode_phys_t);
+		record->zi_end = record->zi_start + sizeof (dnode_phys_t);
+		record->zi_object = 0;
+	}
+
+	record->zi_level = level;
+
+	return (0);
+}
+
+/*
+ * Translate a user-visible (type, object, range, level) description into
+ * 'record'.  For MOS types 'object' is the pool name and only zi_type is set;
+ * otherwise 'object' is a full path that is resolved to a dataset and object
+ * number, and 'range' and 'level' are applied by initialize_range().  The pool
+ * name is stored in 'poolname' and the dataset name (empty for MOS types) in
+ * 'dataset'; both buffers must hold at least MAXNAMELEN bytes.  Returns 0 on
+ * success and -1 on failure.
+ */
+int
+translate_record(err_type_t type, const char *object, const char *range,
+    int level, zinject_record_t *record, char *poolname, char *dataset)
+{
+	char path[MAXPATHLEN];
+	char *slash;
+	struct stat64 statbuf;
+	int ret = -1;
+
+	debug = (getenv("ZINJECT_DEBUG") != NULL);
+
+	ziprintf("translating: %s\n", object);
+
+	if (MOS_TYPE(type)) {
+		/* MOS objects are treated specially. */
+		switch (type) {
+		default:
+			break;
+		case TYPE_MOS:
+			record->zi_type = 0;
+			break;
+		case TYPE_MOSDIR:
+			record->zi_type = DMU_OT_OBJECT_DIRECTORY;
+			break;
+		case TYPE_METASLAB:
+			record->zi_type = DMU_OT_OBJECT_ARRAY;
+			break;
+		case TYPE_CONFIG:
+			record->zi_type = DMU_OT_PACKED_NVLIST;
+			break;
+		case TYPE_BPOBJ:
+			record->zi_type = DMU_OT_BPOBJ;
+			break;
+		case TYPE_SPACEMAP:
+			record->zi_type = DMU_OT_SPACE_MAP;
+			break;
+		case TYPE_ERRLOG:
+			record->zi_type = DMU_OT_ERROR_LOG;
+			break;
+		}
+
+		dataset[0] = '\0';
+		/* 'object' is caller-supplied text, so bound the copy */
+		(void) strlcpy(poolname, object, MAXNAMELEN);
+		return (0);
+	}
+
+	/* Convert a full path into a (dataset, file) pair. */
+	if (parse_pathname(object, dataset, path, &statbuf) != 0)
+		goto err;
+
+	ziprintf("   dataset: %s\n", dataset);
+	ziprintf("      path: %s\n", path);
+
+	/* Convert (dataset, file) into (objset, object) */
+	if (object_from_path(dataset, statbuf.st_ino, record) != 0)
+		goto err;
+
+	ziprintf("raw objset: %llu\n", (u_longlong_t)record->zi_objset);
+	ziprintf("raw object: %llu\n", (u_longlong_t)record->zi_object);
+
+	/* For the given object, initialize the range in bytes */
+	if (initialize_range(type, level, (char *)range, record) != 0)
+		goto err;
+
+	ziprintf("    objset: %llu\n", (u_longlong_t)record->zi_objset);
+	ziprintf("    object: %llu\n", (u_longlong_t)record->zi_object);
+	if (record->zi_start == 0 && record->zi_end == -1ULL)
+		ziprintf("     range: all\n");
+	else
+		ziprintf("     range: [%llu, %llu]\n",
+		    (u_longlong_t)record->zi_start,
+		    (u_longlong_t)record->zi_end);
+
+	/* Copy the pool name: the dataset name up to its first '/' */
+	(void) strlcpy(poolname, dataset, MAXNAMELEN);
+	if ((slash = strchr(poolname, '/')) != NULL)
+		*slash = '\0';
+
+	ret = 0;
+
+err:
+	return (ret);
+}
+
+/*
+ * Parse a raw 'objset:object:level:blkid' bookmark (all hex) into 'record'.
+ */
+int
+translate_raw(const char *str, zinject_record_t *record)
+{
+	int nfields = sscanf(str, "%llx:%llx:%x:%llx",
+	    (u_longlong_t *)&record->zi_objset,
+	    (u_longlong_t *)&record->zi_object, &record->zi_level,
+	    (u_longlong_t *)&record->zi_start);
+
+	if (nfields == 4) {
+		record->zi_end = record->zi_start;
+		return (0);
+	}
+	(void) fprintf(stderr, "bad raw spec '%s': must be of the form "
+	    "'objset:object:level:blkid'\n", str);
+	return (-1);
+}
+
+/*
+ * Build a device injection record: set record->zi_guid from a vdev name or
+ * GUID in 'pool', choose zi_cmd, and for label faults set the byte range.
+ * Returns 0, or -1 if the pool cannot be opened or the device is not found.
+ */
+int
+translate_device(const char *pool, const char *device, err_type_t label_type,
+    zinject_record_t *record)
+{
+	zpool_handle_t *hdl = zpool_open(g_zfs, pool);
+	nvlist_t *nv;
+	boolean_t isspare, iscache;
+	char *tail;
+
+	if (hdl == NULL)
+		return (-1);
+
+	/* Anything that is not a complete, nonzero number is a vdev name. */
+	record->zi_guid = strtoull(device, &tail, 0);
+	if (*tail != '\0' || record->zi_guid == 0) {
+		nv = zpool_find_vdev(hdl, device, &isspare, &iscache, NULL);
+		if (nv == NULL) {
+			(void) fprintf(stderr, "cannot find device '%s' in "
+			    "pool '%s'\n", device, pool);
+			zpool_close(hdl);
+			return (-1);
+		}
+
+		verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+		    &record->zi_guid) == 0);
+	}
+
+	/*
+	 * Device faults can take on three different forms:
+	 * 1). delayed or hanging I/O
+	 * 2). zfs label faults
+	 * 3). generic disk faults
+	 */
+	if (record->zi_timer != 0)
+		record->zi_cmd = ZINJECT_DELAY_IO;
+	else if (label_type == TYPE_INVAL)
+		record->zi_cmd = ZINJECT_DEVICE_FAULT;
+	else
+		record->zi_cmd = ZINJECT_LABEL_FAULT;
+
+	switch (label_type) {
+	default:
+		break;
+	case TYPE_LABEL_UBERBLOCK:
+		record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
+		record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1;
+		break;
+	case TYPE_LABEL_NVLIST:
+		record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
+		record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
+		break;
+	case TYPE_LABEL_PAD1:
+		record->zi_start = offsetof(vdev_label_t, vl_pad1);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
+	case TYPE_LABEL_PAD2:
+		record->zi_start = offsetof(vdev_label_t, vl_be);
+		record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+		break;
+	}
+	zpool_close(hdl);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c
new file mode 100644
index 000000000000..bf97b0d68713
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.c
@@ -0,0 +1,1287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * ZFS Fault Injector
+ *
+ * This userland component takes a set of options and uses libzpool to translate
+ * from a user-visible object type and name to an internal representation.
+ * There are two basic types of faults: device faults and data faults.
+ *
+ *
+ * DEVICE FAULTS
+ *
+ * Errors can be injected into a particular vdev using the '-d' option.  This
+ * option takes a path or vdev GUID to uniquely identify the device within a
+ * pool.  There are four types of errors that can be injected, EIO, ENXIO,
+ * ECHILD, and EILSEQ.  These can be controlled through the '-e' option and the
+ * default is ENXIO.  For EIO failures, any attempt to read data from the device
+ * will return EIO, but a subsequent attempt to reopen the device will succeed.
+ * For ENXIO failures, any attempt to read from the device will return EIO, and
+ * any attempt to reopen the device will also return ENXIO.  The EILSEQ failures
+ * only apply to read operations (-T read) and will flip a bit after the device
+ * has read the original data.
+ *
+ * For label faults, the -L option must be specified. This allows faults
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
+ *
+ * This form of the command looks like:
+ *
+ * 	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
+ *
+ *
+ * DATA FAULTS
+ *
+ * We begin with a tuple of the form:
+ *
+ * 	<type,level,range,object>
+ *
+ * 	type	A string describing the type of data to target.  Each type
+ * 		implicitly describes how to interpret 'object'. Currently,
+ * 		the following values are supported:
+ *
+ * 		data		User data for a file
+ * 		dnode		Dnode for a file or directory
+ *
+ *		The following MOS objects are special.  Instead of injecting
+ *		errors on a particular object or blkid, we inject errors across
+ *		all objects of the given type.
+ *
+ * 		mos		Any data in the MOS
+ * 		mosdir		object directory
+ * 		config		pool configuration
+ * 		bpobj		blkptr list
+ * 		spacemap	spacemap
+ * 		metaslab	metaslab
+ * 		errlog		persistent error log
+ *
+ * 	level	Object level.  Defaults to '0', not applicable to all types.  If
+ * 		a range is given, this corresponds to the indirect block
+ * 		corresponding to the specific range.
+ *
+ *	range	A numerical range [start,end) within the object.  Defaults to
+ *		the full size of the file.
+ *
+ * 	object	A string describing the logical location of the object.  For
+ * 		files and directories (currently the only supported types),
+ * 		this is the path of the object on disk.
+ *
+ * This is translated, via libzpool, into the following internal representation:
+ *
+ * 	<type,objset,object,level,range>
+ *
+ * These types should be self-explanatory.  This tuple is then passed to the
+ * kernel via a special ioctl() to initiate fault injection for the given
+ * object.  Note that 'type' is not strictly necessary for fault injection, but
+ * is used when translating existing faults into a human-readable string.
+ *
+ *
+ * The command itself takes one of the forms:
+ *
+ * 	zinject
+ * 	zinject <-a | -u pool>
+ * 	zinject -c <id|all>
+ * 	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ *	    [-r range] <object>
+ * 	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:blkid pool
+ *
+ * With no arguments, the command prints all currently registered injection
+ * handlers, with their numeric identifiers.
+ *
+ * The '-c' option will clear the given handler, or all handlers if 'all' is
+ * specified.
+ *
+ * The '-e' option takes a string describing the errno to simulate.  This must
+ * be one of 'io', 'checksum', 'decompress', or 'decrypt'.  In most cases this
+ * will result in the same behavior, but RAID-Z will produce a different set of
+ * ereports for this situation.
+ *
+ * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
+ * specified, then the ARC cache is flushed appropriately.  If '-u' is
+ * specified, then the underlying SPA is unloaded.  Either of these flags can be
+ * specified independently of any other handlers.  The '-m' flag automatically
+ * does an unmount and remount of the underlying dataset to aid in flushing the
+ * cache.
+ *
+ * The '-f' flag controls the frequency of errors injected, expressed as a
+ * real number percentage between 0.0001 and 100.  The default is 100.
+ *
+ * The fourth form is responsible for actually injecting the handler into the
+ * framework.  It takes the arguments described above, translates them to the
+ * internal tuple using libzpool, and then issues an ioctl() to register the
+ * handler.
+ *
+ * The final form can target a specific bookmark, regardless of whether a
+ * human-readable interface has been designed.  It allows developers to specify
+ * a particular block by number.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/fs/zfs.h>
+#include <sys/mount.h>
+
+#include <libzfs.h>
+
+#undef verify	/* both libzfs.h and zfs_context.h want to define this */
+
+#include "zinject.h"
+
+libzfs_handle_t *g_zfs;	/* libzfs handle, set up by libzfs_init() in main() */
+int zfs_fd;		/* descriptor for ZFS_DEV, opened O_RDWR in main() */
+
+static const char *errtable[TYPE_INVAL] = {	/* indexed by err_type_t */
+	"data",
+	"dnode",
+	"mos",
+	"mosdir",
+	"metaslab",
+	"config",
+	"bpobj",
+	"spacemap",
+	"errlog",
+	"uber",
+	"nvlist",
+	"pad1",
+	"pad2"
+};
+
+/* Map a type name to its err_type_t; TYPE_INVAL if it is not in errtable. */
+static err_type_t
+name_to_type(const char *arg)
+{
+	int idx = 0;
+
+	while (idx < TYPE_INVAL && strcmp(errtable[idx], arg) != 0)
+		idx++;
+	return (idx);
+}
+
+/* Return the injection type name for a DMU object type, or "-" if none. */
+static const char *
+type_to_name(uint64_t type)
+{
+	const char *name = "-";
+
+	if (type == DMU_OT_OBJECT_DIRECTORY)
+		name = "mosdir";
+	else if (type == DMU_OT_OBJECT_ARRAY)
+		name = "metaslab";
+	else if (type == DMU_OT_PACKED_NVLIST)
+		name = "config";
+	else if (type == DMU_OT_BPOBJ)
+		name = "bpobj";
+	else if (type == DMU_OT_SPACE_MAP)
+		name = "spacemap";
+	else if (type == DMU_OT_ERROR_LOG)
+		name = "errlog";
+	return (name);
+}
+
+
+/*
+ * Print the usage message to stdout.
+ */
+void
+usage(void)
+{
+	(void) printf(
+	    "usage:\n"
+	    "\n"
+	    "\tzinject\n"
+	    "\n"
+	    "\t\tList all active injection records.\n"
+	    "\n"
+	    "\tzinject -c <id|all>\n"
+	    "\n"
+	    "\t\tClear the particular record (if given a numeric ID), or\n"
+	    "\t\tall records if 'all' is specified.\n"
+	    "\n"
+	    "\tzinject -p <function name> pool\n"
+	    "\t\tInject a panic fault at the specified function. Only \n"
+	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+	    "\t\tspa_vdev_exit() will trigger a panic.\n"
+	    "\n"
+	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+	    "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
+	    "\t\tInject a fault into a particular device or the device's\n"
+	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
+	    "\t\t'pad1', or 'pad2'.\n"
+	    "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
+	    "\t\t'corrupt' (bit flip).\n"
+	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
+	    "\t\tdevice error injection to a percentage of the IOs.\n"
+	    "\n"
+	    "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
+	    "\t\tPerform a specific action on a particular device.\n"
+	    "\n"
+	    "\tzinject -d device -D latency:lanes pool\n"
+	    "\n"
+	    "\t\tAdd an artificial delay to IO requests on a particular\n"
+	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
+	    "\t\tmilliseconds to complete. Each delay has an associated\n"
+	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
+	    "\t\tIO requests that can be processed.\n"
+	    "\n"
+	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
+	    "\t\tthe device will only be able to service a single IO request\n"
+	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
+	    "\t\tif only a single request is submitted every 10 ms, the\n"
+	    "\t\taverage latency will be 10 ms; but if more than one request\n"
+	    "\t\tis submitted every 10 ms, the average latency will be more\n"
+	    "\t\tthan 10 ms.\n"
+	    "\n"
+	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
+	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
+	    "\t\ttwo requests at a time, each with a minimum latency of\n"
+	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
+	    "\t\tthe average latency will be 10 ms; but if more than two\n"
+	    "\t\trequests are submitted every 10 ms, the average latency\n"
+	    "\t\twill be more than 10 ms.\n"
+	    "\n"
+	    "\t\tAlso note, these delays are additive. So two invocations\n"
+	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
+	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
+	    "\t\tlanes with differing target latencies. For example, an\n"
+	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
+	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
+	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
+	    "\n"
+	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+	    "\t\tCause the pool to stop writing blocks yet not\n"
+	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
+	    "\t\tthat fails to honor cache flush requests.\n"
+	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
+	    "\t\tat the end of the duration.\n"
+	    "\n"
+	    "\tzinject -b objset:object:level:blkid pool\n"
+	    "\n"
+	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
+	    "\t\tspecified by the remaining tuple.  Each number is in\n"
+	    "\t\thexadecimal, and only one block can be specified.\n"
+	    "\n"
+	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
+	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
+	    "\n"
+	    "\t\tInject an error into the object specified by the '-t' option\n"
+	    "\t\tand the object descriptor.  The 'object' parameter is\n"
+	    "\t\tinterpreted depending on the '-t' option.\n"
+	    "\n"
+	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
+	    "\t\t-e\tInject a specific error.  Must be one of 'io',\n"
+	    "\t\t\t'checksum', 'decompress', or 'decrypt'.  Default is 'io'.\n"
+	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
+	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
+	    "\t\t\tseparated by commas (ex. '0,2').\n"
+	    "\t\t-l\tInject error at a particular block level. Default is "
+	    "0.\n"
+	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
+	    "\t\t-r\tInject error over a particular logical range of an\n"
+	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
+	    "\t\t\trange according to the object's properties.\n"
+	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
+	    "\t\t\tassociated object.\n"
+	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
+	    "\t\t\ta pool object.\n"
+	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
+	    "\t\t\ta percentage between 0.0001 and 100.\n"
+	    "\n"
+	    "\t-t data\t\tInject an error into the plain file contents of a\n"
+	    "\t\t\tfile.  The object must be specified as a complete path\n"
+	    "\t\t\tto a file on a ZFS filesystem.\n"
+	    "\n"
+	    "\t-t dnode\tInject an error into the metadnode in the block\n"
+	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
+	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
+	    "\t\t\tis specified as a complete path to a file or directory\n"
+	    "\t\t\ton a ZFS filesystem.\n"
+	    "\n"
+	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
+	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
+	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
+	    "\t\t\tthe poolname.\n");
+}
+
+/* Call 'func' on each injection handler; stop early on a nonzero return. */
+static int
+iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
+    void *data)
+{
+	zfs_cmd_t zc = {"\0"};
+	int rv;
+
+	while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) {
+		rv = func((int)zc.zc_guid, zc.zc_name, &zc.zc_inject_record,
+		    data);
+		if (rv != 0)
+			return (rv);
+	}
+	if (errno == ENOENT)
+		return (0);	/* ENOENT is not treated as an error */
+	(void) fprintf(stderr, "Unable to list handlers: %s\n",
+	    strerror(errno));
+	return (-1);
+}
+
+/*
+ * iter_handlers() callback: print one row for a data fault handler and bump
+ * the int counter at 'data'.  Records with a vdev GUID or a panic function
+ * name are skipped.
+ */
+static int
+print_data_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *nprinted = data;
+
+	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
+		return (0);
+
+	if ((*nprinted)++ == 0) {
+		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  "
+		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
+		    "LVL", "DVAs", "RANGE");
+		(void) printf("---  ---------------  ------  "
+		    "------  --------  ---  ----  ---------------\n");
+	}
+
+	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
+	    id, pool, (u_longlong_t)record->zi_objset,
+	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
+	    record->zi_level, record->zi_dvas);
+
+	if (record->zi_start != 0 || record->zi_end != -1ULL)
+		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
+		    (u_longlong_t)record->zi_end);
+	else
+		(void) printf("all\n");
+	return (0);
+}
+
+/*
+ * iter_handlers() callback: print a row for each device handler that is not
+ * a ZINJECT_DELAY_IO handler, and bump the int counter at 'data'.
+ */
+static int
+print_device_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *nprinted = data;
+
+	if (record->zi_guid == 0 || record->zi_func[0] != '\0' ||
+	    record->zi_cmd == ZINJECT_DELAY_IO)
+		return (0);
+
+	if ((*nprinted)++ == 0) {
+		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
+		(void) printf("---  ---------------  ----------------\n");
+	}
+
+	(void) printf("%3d  %-15s  %llx\n", id, pool,
+	    (u_longlong_t)record->zi_guid);
+
+	return (0);
+}
+
+/*
+ * iter_handlers() callback: print a row for each ZINJECT_DELAY_IO device
+ * handler and bump the int counter that 'data' points to.
+ */
+static int
+print_delay_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *nprinted = data;
+
+	if (record->zi_guid == 0 || record->zi_func[0] != '\0' ||
+	    record->zi_cmd != ZINJECT_DELAY_IO)
+		return (0);
+
+	if ((*nprinted)++ == 0) {
+		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
+		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
+		(void) printf("---  ---------------  ---------------  "
+		    "---------------  ----------------\n");
+	}
+
+	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
+	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
+	    (u_longlong_t)record->zi_nlanes,
+	    (u_longlong_t)record->zi_guid);
+
+	return (0);
+}
+
+/* iter_handlers() callback: print a row for each panic handler. */
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	int *nprinted = data;
+
+	/* Records with no panic function name are skipped. */
+	if (record->zi_func[0] == '\0')
+		return (0);
+
+	if ((*nprinted)++ == 0) {
+		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
+		(void) printf("---  ---------------  ----------------\n");
+	}
+
+	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
+
+	return (0);
+}
+
+/*
+ * Print all registered error handlers, one table per kind of handler, in
+ * the order device, delay, data and panic.  A blank line is printed after
+ * each of the first three tables that is not empty.  Returns the total
+ * number of handlers printed.
+ */
+static int
+print_all_handlers(void)
+{
+	/* One iter_handlers() callback per table, in output order. */
+	int (*const printers[])(int, const char *, zinject_record_t *,
+	    void *) = {
+		print_device_handler,
+		print_delay_handler,
+		print_data_handler,
+		print_panic_handler
+	};
+	const int nprinters = sizeof (printers) / sizeof (printers[0]);
+	int total = 0;
+	int i;
+
+	for (i = 0; i < nprinters; i++) {
+		int count = 0;
+
+		(void) iter_handlers(printers[i], &count);
+		total += count;
+
+		/* No separator is printed after the final (panic) table. */
+		if (count > 0 && i < nprinters - 1)
+			(void) printf("\n");
+	}
+
+	return (total);
+}
+
+/* ARGSUSED */
+static int
+cancel_one_handler(int id, const char *pool, zinject_record_t *record,
+    void *data)
+{
+	zfs_cmd_t zc = {"\0"};
+
+	/* iter_handlers() callback: clear handler 'id'; 1 on failure */
+	zc.zc_guid = (uint64_t)id;
+
+	if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) == 0)
+		return (0);
+
+	(void) fprintf(stderr, "failed to remove handler %d: %s\n",
+	    id, strerror(errno));
+	return (1);
+}
+
+/*
+ * Remove all fault injection handlers; says so on stdout when it succeeds.
+ */
+static int
+cancel_all_handlers(void)
+{
+	int rv = iter_handlers(cancel_one_handler, NULL);
+
+	if (rv != 0)
+		return (rv);
+	(void) printf("removed all registered handlers\n");
+	return (0);
+}
+
+/*
+ * Remove a specific fault injection handler.  Returns 0 on success, or 1 if
+ * the ZFS_IOC_CLEAR_FAULT ioctl fails.
+ */
+static int
+cancel_handler(int id)
+{
+	zfs_cmd_t zc = {"\0"};
+	int failed;
+
+	zc.zc_guid = (uint64_t)id;
+	failed = (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0);
+	if (failed)
+		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
+		    id, strerror(errno));
+	else
+		(void) printf("removed handler %d\n", id);
+
+	return (failed);
+}
+
+/*
+ * Register a new fault injection handler.  Unless ZINJECT_NULL is set, print
+ * its ID and (if not 'quiet') its properties.  Returns 0, or 1 on failure.
+ */
+static int
+register_handler(const char *pool, int flags, zinject_record_t *record,
+    int quiet)
+{
+	zfs_cmd_t zc = {"\0"};
+
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+	zc.zc_inject_record = *record;
+	zc.zc_guid = flags;
+
+	if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
+		(void) fprintf(stderr, "failed to add handler: %s\n",
+		    errno == EDOM ? "block level exceeds max level of object" :
+		    strerror(errno));
+		return (1);
+	}
+
+	if (flags & ZINJECT_NULL)
+		return (0);
+
+	if (quiet) {
+		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
+	} else {
+		(void) printf("Added handler %llu with the following "
+		    "properties:\n", (u_longlong_t)zc.zc_guid);
+		(void) printf("  pool: %s\n", pool);
+		if (record->zi_guid) {
+			(void) printf("  vdev: %llx\n",
+			    (u_longlong_t)record->zi_guid);
+		} else if (record->zi_func[0] != '\0') {
+			(void) printf("  panic function: %s\n",
+			    record->zi_func);
+		} else if (record->zi_duration > 0) {
+			(void) printf(" time: %lld seconds\n",
+			    (long long)record->zi_duration);
+		} else if (record->zi_duration < 0) {
+			(void) printf(" txgs: %lld \n",
+			    (long long)-record->zi_duration);
+		} else {
+			(void) printf("objset: %llu\n",
+			    (u_longlong_t)record->zi_objset);
+			(void) printf("object: %llu\n",
+			    (u_longlong_t)record->zi_object);
+			(void) printf("  type: %llu\n",
+			    (u_longlong_t)record->zi_type);
+			(void) printf(" level: %d\n", record->zi_level);
+			if (record->zi_start == 0 && record->zi_end == -1ULL)
+				(void) printf(" range: all\n");
+			else
+				(void) printf(" range: [%llu, %llu)\n",
+				    (u_longlong_t)record->zi_start,
+				    (u_longlong_t)record->zi_end);
+			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Issue ZFS_IOC_VDEV_SET_STATE for record->zi_guid; returns 1 on failure.
+ */
+static int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+	zfs_cmd_t zc = {"\0"};
+
+	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+	zc.zc_cookie = cmd;
+	zc.zc_guid = record->zi_guid;
+	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+
+	return (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0 ? 0 : 1);
+}
+
+/*
+ * Parse a "latency:lanes" pair ('latency' in milliseconds) into '*delay', in
+ * nanoseconds, and '*nlanes'.  Returns 0 on success and 1 on bad input.
+ */
+static int
+parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
+{
+	unsigned long msecs;
+	unsigned long lanes;
+
+	/*
+	 * A delay of zero is explicitly disallowed, because
+	 * translate_device() keys off this value being non-zero to
+	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
+	 */
+	if (sscanf(str, "%lu:%lu", &msecs, &lanes) != 2 || msecs == 0)
+		return (1);
+
+	/*
+	 * The CLI takes the delay in milliseconds, but the kernel
+	 * interprets the value it is handed as nanoseconds, so the
+	 * value is scaled here before it is passed on.
+	 */
+	*nlanes = lanes;
+	*delay = MSEC2NSEC(msecs);
+
+	return (0);
+}
+
+/*
+ * Parse a percentage in [0.0001, 100.0] into '*percent', scaled for the
+ * kernel.  Returns EINVAL if 'str' is not a number, ERANGE if out of range.
+ */
+static int
+parse_frequency(const char *str, uint32_t *percent)
+{
+	char *post;
+	double val = strtod(str, &post);
+
+	if (post == str || *post != '\0')	/* empty or trailing junk */
+		return (EINVAL);
+
+	/* a negated in-range test, so that NaN is rejected as well */
+	val /= 100.0f;
+	if (!(val >= 0.000001f && val <= 1.0f))
+		return (ERANGE);
+	*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
+	return (0);
+}
+
+/*
+ * Convert a comma-separated list of 0-indexed DVAs (each of 0, 1 or 2, at
+ * most once) into a bit mask in '*dvas_out'.  Returns 0, or EINVAL if the
+ * list is malformed.  For example:
+ *	"1"	-> 0b0010  (0x2)
+ *	"0,1"	-> 0b0011  (0x3)
+ *	"0,1,2"	-> 0b0111  (0x7)
+ */
+static int
+parse_dvas(const char *str, uint32_t *dvas_out)
+{
+	const char *c = str;
+	uint32_t mask = 0;
+	boolean_t need_delim = B_FALSE;
+
+	/* max string length is 5 ("0,1,2") */
+	if (strlen(str) > 5 || strlen(str) == 0)
+		return (EINVAL);
+
+	while (*c != '\0') {
+		switch (*c) {
+		case '0':
+		case '1':
+		case '2':
+			/* need a delimiter first; each DVA only once */
+			if (need_delim || (mask & (1 << ((*c) - '0'))))
+				return (EINVAL);
+
+			mask |= (1 << ((*c) - '0'));
+			need_delim = B_TRUE;
+			break;
+		case ',':
+			/* reject a leading or doubled comma */
+			if (!need_delim)
+				return (EINVAL);
+
+			need_delim = B_FALSE;
+			break;
+		default:
+			/* check for invalid character */
+			return (EINVAL);
+		}
+		c++;
+	}
+
+	/* check for dangling delimiter */
+	if (!need_delim)
+		return (EINVAL);
+
+	*dvas_out = mask;
+	return (0);
+}
+
+/*
+ * Parse the options, validate their combination for the selected injection
+ * mode, then register or cancel a handler.  Usage errors return 1 or 2.
+ */
+int
+main(int argc, char **argv)
+{
+	int c;
+	char *range = NULL;
+	char *cancel = NULL;
+	char *end;
+	char *raw = NULL;
+	char *device = NULL;
+	int level = 0;
+	int quiet = 0;
+	int error = 0;
+	int domount = 0;
+	int io_type = ZIO_TYPES;
+	int action = VDEV_STATE_UNKNOWN;
+	err_type_t type = TYPE_INVAL;
+	err_type_t label = TYPE_INVAL;
+	zinject_record_t record = { 0 };
+	char pool[MAXNAMELEN] = "";
+	char dataset[MAXNAMELEN] = "";
+	zfs_handle_t *zhp = NULL;
+	int nowrites = 0;
+	int dur_txg = 0;
+	int dur_secs = 0;
+	int ret;
+	int flags = 0;
+	uint32_t dvas = 0;
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (1);
+	}
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
+		(void) fprintf(stderr, "failed to open ZFS device\n");
+		libzfs_fini(g_zfs);
+		return (1);
+	}
+
+	if (argc == 1) {
+		/*
+		 * No arguments.  Print the available handlers, or direct the
+		 * user to '-h' for help information if there are none.
+		 */
+		if (print_all_handlers() == 0) {
+			(void) printf("No handlers registered.\n");
+			(void) printf("Run 'zinject -h' for usage "
+			    "information.\n");
+		}
+		libzfs_fini(g_zfs);
+		return (0);
+	}
+
+	while ((c = getopt(argc, argv,
+	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+		switch (c) {
+		case 'a':
+			flags |= ZINJECT_FLUSH_ARC;
+			break;
+		case 'A':
+			if (strcasecmp(optarg, "degrade") == 0) {
+				action = VDEV_STATE_DEGRADED;
+			} else if (strcasecmp(optarg, "fault") == 0) {
+				action = VDEV_STATE_FAULTED;
+			} else {
+				(void) fprintf(stderr, "invalid action '%s': "
+				    "must be 'degrade' or 'fault'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'b':
+			raw = optarg;
+			break;
+		case 'c':
+			cancel = optarg;
+			break;
+		case 'C':
+			ret = parse_dvas(optarg, &dvas);
+			if (ret != 0) {
+				(void) fprintf(stderr, "invalid DVA list '%s': "
+				    "DVAs should be 0 indexed and separated by "
+				    "commas.\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'd':
+			device = optarg;
+			break;
+		case 'D':
+			errno = 0;
+			ret = parse_delay(optarg, &record.zi_timer,
+			    &record.zi_nlanes);
+			if (ret != 0) {
+				(void) fprintf(stderr, "invalid i/o delay "
+				    "value: '%s'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'e':
+			if (strcasecmp(optarg, "io") == 0) {
+				error = EIO;
+			} else if (strcasecmp(optarg, "checksum") == 0) {
+				error = ECKSUM;
+			} else if (strcasecmp(optarg, "decompress") == 0) {
+				error = EINVAL;
+			} else if (strcasecmp(optarg, "decrypt") == 0) {
+				error = EACCES;
+			} else if (strcasecmp(optarg, "nxio") == 0) {
+				error = ENXIO;
+			} else if (strcasecmp(optarg, "dtl") == 0) {
+				error = ECHILD;
+			} else if (strcasecmp(optarg, "corrupt") == 0) {
+				error = EILSEQ;
+			} else {
+				(void) fprintf(stderr, "invalid error type "
+				    "'%s': must be 'io', 'checksum', "
+				    "'decompress', 'decrypt', 'nxio', 'dtl' "
+				    "or 'corrupt'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'f':
+			ret = parse_frequency(optarg, &record.zi_freq);
+			if (ret != 0) {
+				(void) fprintf(stderr, "%sfrequency value must "
+				    "be in the range [0.0001, 100.0]\n",
+				    ret == EINVAL ? "invalid value: " :
+				    ret == ERANGE ? "out of range: " : "");
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'F':
+			record.zi_failfast = B_TRUE;
+			break;
+		case 'g':
+			dur_txg = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			/* store duration of txgs as its negative */
+			record.zi_duration *= -1;
+			break;
+		case 'h':
+			usage();
+			libzfs_fini(g_zfs);
+			return (0);
+		case 'I':
+			/* default duration, if one hasn't yet been defined */
+			nowrites = 1;
+			if (dur_secs == 0 && dur_txg == 0)
+				record.zi_duration = 30;
+			break;
+		case 'l':
+			level = (int)strtol(optarg, &end, 10);
+			if (*end != '\0') {
+				(void) fprintf(stderr, "invalid level '%s': "
+				    "must be an integer\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'm':
+			domount = 1;
+			break;
+		case 'p':
+			(void) strlcpy(record.zi_func, optarg,
+			    sizeof (record.zi_func));
+			record.zi_cmd = ZINJECT_PANIC;
+			break;
+		case 'q':
+			quiet = 1;
+			break;
+		case 'r':
+			range = optarg;
+			flags |= ZINJECT_CALC_RANGE;
+			break;
+		case 's':
+			dur_secs = 1;
+			record.zi_duration = (int)strtol(optarg, &end, 10);
+			if (record.zi_duration <= 0 || *end != '\0') {
+				(void) fprintf(stderr, "invalid duration '%s': "
+				    "must be a positive integer\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'T':
+			if (strcasecmp(optarg, "read") == 0) {
+				io_type = ZIO_TYPE_READ;
+			} else if (strcasecmp(optarg, "write") == 0) {
+				io_type = ZIO_TYPE_WRITE;
+			} else if (strcasecmp(optarg, "free") == 0) {
+				io_type = ZIO_TYPE_FREE;
+			} else if (strcasecmp(optarg, "claim") == 0) {
+				io_type = ZIO_TYPE_CLAIM;
+			} else if (strcasecmp(optarg, "all") == 0) {
+				io_type = ZIO_TYPES;
+			} else {
+				(void) fprintf(stderr, "invalid I/O type "
+				    "'%s': must be 'read', 'write', 'free', "
+				    "'claim' or 'all'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 't':
+			if ((type = name_to_type(optarg)) == TYPE_INVAL) {
+				(void) fprintf(stderr, "invalid type '%s'\n",
+				    optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case 'u':
+			flags |= ZINJECT_UNLOAD_SPA;
+			break;
+		case 'L':
+			if ((label = name_to_type(optarg)) == TYPE_INVAL ||
+			    !LABEL_TYPE(label)) {
+				(void) fprintf(stderr, "invalid label type "
+				    "'%s'\n", optarg);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, "option -%c requires an "
+			    "operand\n", optopt);
+			usage();
+			libzfs_fini(g_zfs);
+			return (1);
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (record.zi_duration != 0)
+		record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
+	if (cancel != NULL) {
+		/* '-c' is invalid with any other options. */
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+		    record.zi_freq > 0 || dvas != 0) {
+			(void) fprintf(stderr, "cancel (-c) incompatible with "
+			    "any other options\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+		if (argc != 0) {
+			(void) fprintf(stderr, "extraneous argument to '-c'\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (strcmp(cancel, "all") == 0) {
+			return (cancel_all_handlers());
+		} else {
+			int id = (int)strtol(cancel, &end, 10);
+			if (*end != '\0') {
+				(void) fprintf(stderr, "invalid handle id '%s':"
+				    " must be an integer or 'all'\n", cancel);
+				usage();
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+			return (cancel_handler(id));
+		}
+	}
+
+	if (device != NULL) {
+		/*
+		 * Device (-d) injection uses a completely different mechanism
+		 * for doing injection, so handle it separately here.
+		 */
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+		    dvas != 0) {
+			(void) fprintf(stderr, "device (-d) incompatible with "
+			    "data error injection\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (argc != 1) {
+			(void) fprintf(stderr, "device (-d) injection requires "
+			    "a single pool name\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		(void) strlcpy(pool, argv[0], sizeof (pool));
+		dataset[0] = '\0';
+
+		if (error == ECKSUM) {
+			(void) fprintf(stderr, "device error type must be "
+			    "'io', 'nxio' or 'corrupt'\n");
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+
+		if (error == EILSEQ &&
+		    (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) {
+			(void) fprintf(stderr, "device corrupt errors require "
+			    "io type read and a frequency value\n");
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+
+		record.zi_iotype = io_type;
+		if (translate_device(pool, device, label, &record) != 0) {
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+		if (!error)
+			error = ENXIO;
+
+		if (action != VDEV_STATE_UNKNOWN)
+			return (perform_action(pool, &record, action));
+
+	} else if (raw != NULL) {
+		if (range != NULL || type != TYPE_INVAL || level != 0 ||
+		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
+		    record.zi_freq > 0 || dvas != 0) {
+			(void) fprintf(stderr, "raw (-b) format incompatible "
+			    "with any other options\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (argc != 1) {
+			(void) fprintf(stderr, "raw (-b) format expects a "
+			    "single pool name\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		(void) strlcpy(pool, argv[0], sizeof (pool));
+		dataset[0] = '\0';
+
+		if (error == ENXIO) {
+			(void) fprintf(stderr, "data error type must be "
+			    "'checksum' or 'io'\n");
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+
+		record.zi_cmd = ZINJECT_DATA_FAULT;
+		if (translate_raw(raw, &record) != 0) {
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+		if (!error)
+			error = EIO;
+	} else if (record.zi_cmd == ZINJECT_PANIC) {
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || device != NULL || record.zi_freq > 0 ||
+		    dvas != 0) {
+			(void) fprintf(stderr, "panic (-p) incompatible with "
+			    "other options\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (argc < 1 || argc > 2) {
+			(void) fprintf(stderr, "panic (-p) injection requires "
+			    "a single pool name and an optional id\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		(void) strlcpy(pool, argv[0], sizeof (pool));
+		if (argv[1] != NULL)
+			record.zi_type = atoi(argv[1]);
+		dataset[0] = '\0';
+	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
+		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+		    level != 0 || record.zi_freq > 0 || dvas != 0) {
+			(void) fprintf(stderr, "hardware failure (-I) "
+			    "incompatible with other options\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (nowrites == 0) {
+			(void) fprintf(stderr, "-s or -g meaningless "
+			    "without -I (ignore writes)\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		} else if (dur_secs && dur_txg) {
+			(void) fprintf(stderr, "choose a duration either "
+			    "in seconds (-s) or a number of txgs (-g) "
+			    "but not both\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		} else if (argc != 1) {
+			(void) fprintf(stderr, "ignore writes (-I) "
+			    "injection requires a single pool name\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		(void) strlcpy(pool, argv[0], sizeof (pool));
+		dataset[0] = '\0';
+	} else if (type == TYPE_INVAL) {
+		if (flags == 0) {
+			(void) fprintf(stderr, "at least one of '-b', '-d', "
+			    "'-t', '-a', '-p', '-I' or '-u' "
+			    "must be specified\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
+			(void) strlcpy(pool, argv[0], sizeof (pool));
+			dataset[0] = '\0';
+		} else if (argc != 0) {
+			(void) fprintf(stderr, "extraneous argument for "
+			    "'-f'\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		flags |= ZINJECT_NULL;
+	} else {
+		if (argc != 1) {
+			(void) fprintf(stderr, "missing object\n");
+			usage();
+			libzfs_fini(g_zfs);
+			return (2);
+		}
+
+		if (error == ENXIO || error == EILSEQ) {
+			(void) fprintf(stderr, "data error type must be "
+			    "'checksum' or 'io'\n");
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+
+		if (dvas != 0) {
+			if (error == EACCES || error == EINVAL) {
+				(void) fprintf(stderr, "the '-C' option may "
+				    "not be used with logical data errors "
+				    "'decrypt' and 'decompress'\n");
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+
+			record.zi_dvas = dvas;
+		}
+
+		if (error == EACCES) {
+			if (type != TYPE_DATA) {
+				(void) fprintf(stderr, "decryption errors "
+				    "may only be injected for 'data' types\n");
+				libzfs_fini(g_zfs);
+				return (1);
+			}
+
+			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
+			/*
+			 * Internally, ZFS actually uses ECKSUM for decryption
+			 * errors since EACCES is used to indicate the key was
+			 * not found.
+			 */
+			error = ECKSUM;
+		} else {
+			record.zi_cmd = ZINJECT_DATA_FAULT;
+		}
+
+		if (translate_record(type, argv[0], range, level, &record, pool,
+		    dataset) != 0) {
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+		if (!error)
+			error = EIO;
+	}
+
+	/*
+	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
+	 * unload the pool, so that we trigger spa-wide reopen of metadata next
+	 * time we access the pool.
+	 */
+	if (dataset[0] != '\0' && domount) {
+		if ((zhp = zfs_open(g_zfs, dataset,
+		    ZFS_TYPE_DATASET)) == NULL) {
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+		if (zfs_unmount(zhp, NULL, 0) != 0) {
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+	}
+
+	record.zi_error = error;
+
+	ret = register_handler(pool, flags, &record, quiet);
+
+	if (dataset[0] != '\0' && domount && zfs_mount(zhp, NULL, 0) != 0)
+		ret = 1;	/* else keep register_handler()'s result */
+
+	libzfs_fini(g_zfs);
+
+	return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.h b/sys/contrib/openzfs/cmd/zinject/zinject.h
new file mode 100644
index 000000000000..46fdcad8b31f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.h
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_ZINJECT_H
+#define	_ZINJECT_H
+
+#include <sys/zfs_ioctl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	TYPE_DATA,		/* plain file contents		*/
+	TYPE_DNODE,		/* metadnode contents		*/
+	TYPE_MOS,		/* all MOS data			*/
+	TYPE_MOSDIR,		/* MOS object directory		*/
+	TYPE_METASLAB,		/* metaslab objects		*/
+	TYPE_CONFIG,		/* MOS config			*/
+	TYPE_BPOBJ,		/* block pointer list		*/
+	TYPE_SPACEMAP,		/* space map objects		*/
+	TYPE_ERRLOG,		/* persistent error log		*/
+	TYPE_LABEL_UBERBLOCK,	/* label specific uberblock	*/
+	TYPE_LABEL_NVLIST,	/* label specific nvlist	*/
+	TYPE_LABEL_PAD1,	/* label specific 8K pad1 area	*/
+	TYPE_LABEL_PAD2,	/* label specific 8K pad2 area	*/
+	TYPE_INVAL		/* sentinel: not a valid type	*/
+} err_type_t;
+/* Range checks; both depend on the declaration order of err_type_t. */
+#define	MOS_TYPE(t)	\
+	((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK)
+
+#define	LABEL_TYPE(t)	\
+	((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL)
+
+int translate_record(err_type_t type, const char *object, const char *range,
+    int level, zinject_record_t *record, char *poolname, char *dataset);
+int translate_raw(const char *raw, zinject_record_t *record);
+int translate_device(const char *pool, const char *device,
+    err_type_t label_type, zinject_record_t *record);
+void usage(void);
+
+extern libzfs_handle_t *g_zfs;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZINJECT_H */
diff --git a/sys/contrib/openzfs/cmd/zpool/.gitignore b/sys/contrib/openzfs/cmd/zpool/.gitignore
new file mode 100644
index 000000000000..8ea518af78e5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/.gitignore
@@ -0,0 +1 @@
+/zpool
diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am
new file mode 100644
index 000000000000..c0378b136901
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am
@@ -0,0 +1,136 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS)
+
+DEFAULT_INCLUDES += -I$(srcdir)
+
+sbin_PROGRAMS = zpool
+
+zpool_SOURCES = \
+	zpool_iter.c \
+	zpool_main.c \
+	zpool_util.c \
+	zpool_util.h \
+	zpool_vdev.c
+
+if BUILD_FREEBSD
+zpool_SOURCES += os/freebsd/zpool_vdev_os.c
+endif
+
+if BUILD_LINUX
+zpool_SOURCES += os/linux/zpool_vdev_os.c
+endif
+
+zpool_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+	$(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zpool_LDADD += $(LTLIBINTL)
+
+if BUILD_FREEBSD
+zpool_LDADD += -lgeom
+endif
+zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS)
+
+zpoolconfdir = $(sysconfdir)/zfs/zpool.d
+zpoolexecdir = $(zfsexecdir)/zpool.d
+
+EXTRA_DIST = zpool.d/README
+
+dist_zpoolexec_SCRIPTS = \
+	zpool.d/dm-deps \
+	zpool.d/enc \
+	zpool.d/encdev \
+	zpool.d/fault_led \
+	zpool.d/iostat \
+	zpool.d/iostat-1s \
+	zpool.d/iostat-10s \
+	zpool.d/label \
+	zpool.d/locate_led \
+	zpool.d/lsblk \
+	zpool.d/media \
+	zpool.d/model \
+	zpool.d/serial \
+	zpool.d/ses \
+	zpool.d/size \
+	zpool.d/slot \
+	zpool.d/smart \
+	zpool.d/smartx \
+	zpool.d/temp \
+	zpool.d/health \
+	zpool.d/r_proc \
+	zpool.d/w_proc \
+	zpool.d/r_ucor \
+	zpool.d/w_ucor \
+	zpool.d/nonmed \
+	zpool.d/defect \
+	zpool.d/hours_on \
+	zpool.d/realloc \
+	zpool.d/rep_ucor \
+	zpool.d/cmd_to \
+	zpool.d/pend_sec \
+	zpool.d/off_ucor \
+	zpool.d/ata_err \
+	zpool.d/nvme_err \
+	zpool.d/pwr_cyc \
+	zpool.d/upath \
+	zpool.d/vendor \
+	zpool.d/smart_test \
+	zpool.d/test_type \
+	zpool.d/test_status \
+	zpool.d/test_progress \
+	zpool.d/test_ended
+
+zpoolconfdefaults = \
+	dm-deps \
+	enc \
+	encdev \
+	fault_led \
+	iostat \
+	iostat-1s \
+	iostat-10s \
+	label \
+	locate_led \
+	lsblk \
+	media \
+	model \
+	serial \
+	ses \
+	size \
+	slot \
+	smart \
+	smartx \
+	temp \
+	health \
+	r_proc \
+	w_proc \
+	r_ucor \
+	w_ucor \
+	nonmed \
+	defect \
+	hours_on \
+	realloc \
+	rep_ucor \
+	cmd_to \
+	pend_sec \
+	off_ucor \
+	ata_err \
+	nvme_err \
+	pwr_cyc \
+	upath \
+	vendor \
+	smart_test \
+	test_type \
+	test_status \
+	test_progress \
+	test_ended
+# Link each default script into $(zpoolconfdir) unless one already exists.
+install-data-hook:
+	$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
+	for f in $(zpoolconfdefaults); do \
+	  test -f "$(DESTDIR)$(zpoolconfdir)/$${f}" -o \
+	       -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \
+	    ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \
+	done
diff --git a/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c
new file mode 100644
index 000000000000..7d48f61a0ee7
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration.  Each entry in the list can be one of:
+ *
+ * 	Device vdevs
+ * 		disk=(path=..., devid=...)
+ * 		file=(path=...)
+ *
+ * 	Group vdevs
+ * 		raidz[1|2]=(...)
+ * 		mirror=(...)
+ *
+ * 	Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs.  All userland verification of devices is contained within
+ * this file.  If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'.  The
+ * function performs several passes:
+ *
+ * 	1. Construct the vdev specification.  Performs syntax validation and
+ *         makes sure each device is valid.
+ * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
+ *         devices are also in use.  Some can be overridden using the 'force'
+ *         flag, others cannot.
+ * 	3. Check for replication errors if the 'force' flag is not specified.
+ *         validates that the replication level is consistent across the
+ *         entire pool.
+ * 	4. Call libzfs to label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <paths.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+#include <sys/mntent.h>
+#include <libgeom.h>
+
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+
+int
+check_device(const char *name, boolean_t force, boolean_t isspare,
+    boolean_t iswholedisk)
+{
+	char path[MAXPATHLEN];
+
+	/* Prefix bare names with _PATH_DEV; iswholedisk is not used here. */
+	if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) == 0)
+		(void) strlcpy(path, name, sizeof (path));
+	else
+		(void) snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name);
+	return (check_file(path, force, isspare));
+}
+
+boolean_t
+check_sector_size_database(char *path, int *sector_size)
+{
+	return (B_FALSE);	/* stub: no device is ever matched here */
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
new file mode 100644
index 000000000000..d087c4c14dac
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -0,0 +1,410 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration.  Each entry in the list can be one of:
+ *
+ * 	Device vdevs
+ * 		disk=(path=..., devid=...)
+ * 		file=(path=...)
+ *
+ * 	Group vdevs
+ * 		raidz[1|2]=(...)
+ * 		mirror=(...)
+ *
+ * 	Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs.  All userland verification of devices is contained within
+ * this file.  If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'.  The
+ * function performs several passes:
+ *
+ * 	1. Construct the vdev specification.  Performs syntax validation and
+ *         makes sure each device is valid.
+ * 	2. Check for devices in use.  Using libblkid to make sure that no
+ *         devices are also in use.  Some can be overridden using the 'force'
+ *         flag, others cannot.
+ * 	3. Check for replication errors if the 'force' flag is not specified.
+ *         validates that the replication level is consistent across the
+ *         entire pool.
+ * 	4. Call libzfs to label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+#include <sys/stat.h>
+#include <sys/vtoc.h>
+#include <sys/mntent.h>
+#include <uuid/uuid.h>
+#include <blkid/blkid.h>
+
+typedef struct vdev_disk_db_entry
+{
+	char id[24];		/* 24-byte INQUIRY id, no NUL */
+	int sector_size;	/* sector size reported on match */
+} vdev_disk_db_entry_t;
+
+/*
+ * Database of block devices that lie about physical sector sizes.  Each id
+ * is compared with memcmp() against 24 bytes of the INQUIRY reply, so it
+ * must be exactly 24 characters (space padded) or it will never match.
+ */
+static vdev_disk_db_entry_t vdev_disk_database[] = {
+	{"ATA     ADATA SSD S396 3", 8192},
+	{"ATA     APPLE SSD SM128E", 8192},
+	{"ATA     APPLE SSD SM256E", 8192},
+	{"ATA     APPLE SSD SM512E", 8192},
+	{"ATA     APPLE SSD SM768E", 8192},
+	{"ATA     C400-MTFDDAC064M", 8192},
+	{"ATA     C400-MTFDDAC128M", 8192},
+	{"ATA     C400-MTFDDAC256M", 8192},
+	{"ATA     C400-MTFDDAC512M", 8192},
+	{"ATA     Corsair Force 3 ", 8192},
+	{"ATA     Corsair Force GS", 8192},
+	{"ATA     INTEL SSDSA2CT04", 8192},
+	{"ATA     INTEL SSDSA2BZ10", 8192},
+	{"ATA     INTEL SSDSA2BZ20", 8192},
+	{"ATA     INTEL SSDSA2BZ30", 8192},
+	{"ATA     INTEL SSDSA2CW04", 8192},
+	{"ATA     INTEL SSDSA2CW08", 8192},
+	{"ATA     INTEL SSDSA2CW12", 8192},
+	{"ATA     INTEL SSDSA2CW16", 8192},
+	{"ATA     INTEL SSDSA2CW30", 8192},
+	{"ATA     INTEL SSDSA2CW60", 8192},
+	{"ATA     INTEL SSDSC2CT06", 8192},
+	{"ATA     INTEL SSDSC2CT12", 8192},
+	{"ATA     INTEL SSDSC2CT18", 8192},
+	{"ATA     INTEL SSDSC2CT24", 8192},
+	{"ATA     INTEL SSDSC2CW06", 8192},
+	{"ATA     INTEL SSDSC2CW12", 8192},
+	{"ATA     INTEL SSDSC2CW18", 8192},
+	{"ATA     INTEL SSDSC2CW24", 8192},
+	{"ATA     INTEL SSDSC2CW48", 8192},
+	{"ATA     KINGSTON SH100S3", 8192},
+	{"ATA     KINGSTON SH103S3", 8192},
+	{"ATA     M4-CT064M4SSD2  ", 8192},
+	{"ATA     M4-CT128M4SSD2  ", 8192},
+	{"ATA     M4-CT256M4SSD2  ", 8192},
+	{"ATA     M4-CT512M4SSD2  ", 8192},
+	{"ATA     OCZ-AGILITY2    ", 8192},
+	{"ATA     OCZ-AGILITY3    ", 8192},
+	{"ATA     OCZ-VERTEX2 3.5 ", 8192},
+	{"ATA     OCZ-VERTEX3     ", 8192},
+	{"ATA     OCZ-VERTEX3 LT  ", 8192},
+	{"ATA     OCZ-VERTEX3 MI  ", 8192},
+	{"ATA     OCZ-VERTEX4     ", 8192},
+	{"ATA     SAMSUNG MZ7WD120", 8192},
+	{"ATA     SAMSUNG MZ7WD240", 8192},
+	{"ATA     SAMSUNG MZ7WD480", 8192},
+	{"ATA     SAMSUNG MZ7WD960", 8192},
+	{"ATA     SAMSUNG SSD 830 ", 8192},
+	{"ATA     Samsung SSD 840 ", 8192},
+	{"ATA     SanDisk SSD U100", 8192},
+	{"ATA     TOSHIBA THNSNH06", 8192},
+	{"ATA     TOSHIBA THNSNH12", 8192},
+	{"ATA     TOSHIBA THNSNH25", 8192},
+	{"ATA     TOSHIBA THNSNH51", 8192},
+	{"ATA     APPLE SSD TS064C", 4096},
+	{"ATA     APPLE SSD TS128C", 4096},
+	{"ATA     APPLE SSD TS256C", 4096},
+	{"ATA     APPLE SSD TS512C", 4096},
+	{"ATA     INTEL SSDSA2M040", 4096},
+	{"ATA     INTEL SSDSA2M080", 4096},
+	{"ATA     INTEL SSDSA2M160", 4096},
+	{"ATA     INTEL SSDSC2MH12", 4096},
+	{"ATA     INTEL SSDSC2MH25", 4096},
+	{"ATA     OCZ CORE_SSD    ", 4096},
+	{"ATA     OCZ-VERTEX      ", 4096},
+	{"ATA     SAMSUNG MCCOE32G", 4096},
+	{"ATA     SAMSUNG MCCOE64G", 4096},
+	{"ATA     SAMSUNG SSD PM80", 4096},
+	/* Flash drives optimized for 4KB IOs on larger pages */
+	{"ATA     INTEL SSDSC2BA10", 4096},
+	{"ATA     INTEL SSDSC2BA20", 4096},
+	{"ATA     INTEL SSDSC2BA40", 4096},
+	{"ATA     INTEL SSDSC2BA80", 4096},
+	{"ATA     INTEL SSDSC2BB08", 4096},
+	{"ATA     INTEL SSDSC2BB12", 4096},
+	{"ATA     INTEL SSDSC2BB16", 4096},
+	{"ATA     INTEL SSDSC2BB24", 4096},
+	{"ATA     INTEL SSDSC2BB30", 4096},
+	{"ATA     INTEL SSDSC2BB40", 4096},
+	{"ATA     INTEL SSDSC2BB48", 4096},
+	{"ATA     INTEL SSDSC2BB60", 4096},
+	{"ATA     INTEL SSDSC2BB80", 4096},
+	{"ATA     INTEL SSDSC2BW24", 4096},
+	{"ATA     INTEL SSDSC2BW48", 4096},
+	{"ATA     INTEL SSDSC2BP24", 4096},
+	{"ATA     INTEL SSDSC2BP48", 4096},
+	{"NA      SmrtStorSDLKAE9W", 4096},
+	{"NVMe    Amazon EC2 NVMe ", 4096},
+	/* Imported from Open Solaris */
+	{"ATA     MARVELL SD88SA02", 4096},
+	/* Advanced format Hard drives */
+	{"ATA     Hitachi HDS5C303", 4096},
+	{"ATA     SAMSUNG HD204UI ", 4096},
+	{"ATA     ST2000DL004 HD20", 4096},
+	{"ATA     WDC WD10EARS-00M", 4096},
+	{"ATA     WDC WD10EARS-00S", 4096},
+	{"ATA     WDC WD10EARS-00Z", 4096},
+	{"ATA     WDC WD15EARS-00M", 4096},
+	{"ATA     WDC WD15EARS-00S", 4096},
+	{"ATA     WDC WD15EARS-00Z", 4096},
+	{"ATA     WDC WD20EARS-00M", 4096},
+	{"ATA     WDC WD20EARS-00S", 4096},
+	{"ATA     WDC WD20EARS-00Z", 4096},
+	{"ATA     WDC WD1600BEVT-0", 4096},
+	{"ATA     WDC WD2500BEVT-0", 4096},
+	{"ATA     WDC WD3200BEVT-0", 4096},
+	{"ATA     WDC WD5000BEVT-0", 4096},
+};
+
+
+#define	INQ_REPLY_LEN	96	/* bytes of INQUIRY data requested */
+#define	INQ_CMD_LEN	6	/* size of the INQUIRY command block */
+
+static const int vdev_disk_database_size =
+	sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
+
+boolean_t
+check_sector_size_database(char *path, int *sector_size)
+{
+	unsigned char inq_cmd[INQ_CMD_LEN] =
+	    {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
+	unsigned char reply[INQ_REPLY_LEN];
+	unsigned char sense[32];
+	sg_io_hdr_t hdr;
+	int fd, rc, idx;
+
+	/*
+	 * Issue an INQUIRY through SG_IO and compare 24 bytes of the reply
+	 * (from offset 8) with every vdev_disk_database id.  On a match the
+	 * entry's sector size is stored in *sector_size and B_TRUE is
+	 * returned; an open/ioctl failure or no match returns B_FALSE.
+	 */
+	fd = open(path, O_RDONLY|O_DIRECT);
+	if (fd < 0)
+		return (B_FALSE);
+
+	/* Describe the INQUIRY request and its reply/sense buffers */
+	memset(&hdr, 0, sizeof (hdr));
+	hdr.interface_id = 'S';
+	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	hdr.cmdp = inq_cmd;
+	hdr.cmd_len = sizeof (inq_cmd);
+	hdr.dxferp = reply;
+	hdr.dxfer_len = INQ_REPLY_LEN;
+	hdr.sbp = sense;
+	hdr.mx_sb_len = sizeof (sense);
+	hdr.timeout = 10;		/* 10 milliseconds is ample time */
+
+	rc = ioctl(fd, SG_IO, (unsigned long) &hdr);
+	(void) close(fd);
+
+	if (rc < 0 || (hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
+		return (B_FALSE);
+
+	for (idx = 0; idx < vdev_disk_database_size; idx++) {
+		if (memcmp(reply + 8, vdev_disk_database[idx].id, 24) == 0) {
+			*sector_size = vdev_disk_database[idx].sector_size;
+			return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Check a single device node using its libblkid "TYPE" tag.  Returns 0
+ * when no tag is found or 'force' overrides a foreign filesystem, the
+ * check_file() result for a "zfs_member", and -1 otherwise.
+ */
+static int
+check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
+{
+	char *fstype = blkid_get_tag_value(cache, "TYPE", path);
+	int rc = 0;
+
+	/* With no TYPE tag there is nothing to conflict with */
+	if (fstype == NULL)
+		return (0);
+
+	if (strcmp(fstype, "zfs_member") == 0) {
+		/*
+		 * ZFS label found: defer to check_file() to decide.  The
+		 * one safe case is a spare shared between multiple pools.
+		 */
+		rc = check_file(path, force, isspare);
+	} else if (!force) {
+		/* Some other signature; only 'force' may override it */
+		rc = -1;
+		vdev_error(gettext("%s contains a filesystem of "
+		    "type '%s'\n"), path, fstype);
+	}
+
+	free(fstype);
+
+	return (rc);
+}
+
+/*
+ * Validate that a disk, including all of its partitions, is safe to use.
+ * Returns 0 when the device may be used and non-zero otherwise.
+ *
+ * For EFI labeled disks this can be done relatively easily with libefi:
+ * the partition numbers are extracted from the label and used to generate
+ * the expected /dev/ paths, and each partition is checked for conflicts.
+ *
+ * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
+ * but due to the lack of readily available libraries this scanning is
+ * not implemented.  Instead only the device path as given is checked.
+ */
+static int
+check_disk(const char *path, blkid_cache cache, int force,
+    boolean_t isspare, boolean_t iswholedisk)
+{
+	struct dk_gpt *vtoc;
+	char slice_path[MAXPATHLEN];
+	int err = 0;
+	int fd, i;
+	int flags = O_RDONLY|O_DIRECT;
+
+	if (!iswholedisk)
+		return (check_slice(path, cache, force, isspare));
+
+	/* only spares can be shared, other devices require exclusive access */
+	if (!isspare)
+		flags |= O_EXCL;
+
+	if ((fd = open(path, flags)) < 0) {
+		char *value = blkid_get_tag_value(cache, "TYPE", path);
+		(void) fprintf(stderr, gettext("%s is in use and contains "
+		    "a %s filesystem.\n"), path, value ? value : "unknown");
+		free(value);
+		return (-1);
+	}
+
+	/*
+	 * Expected to fail for non-EFI labeled disks.  Just check the device
+	 * as given and do not attempt to detect and scan partitions.
+	 */
+	err = efi_alloc_and_read(fd, &vtoc);
+	if (err) {
+		(void) close(fd);
+		return (check_slice(path, cache, force, isspare));
+	}
+
+	/*
+	 * The primary efi partition label is damaged however the secondary
+	 * label at the end of the device is intact.  Rather than use this
+	 * label we should play it safe and treat this as a non efi device.
+	 */
+	if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
+		efi_free(vtoc);
+		(void) close(fd);
+		if (force) {
+			/* Partitions will now be created using the backup */
+			return (0);
+		} else {
+			vdev_error(gettext("%s contains a corrupt primary "
+			    "EFI label.\n"), path);
+			return (-1);
+		}
+	}
+
+	for (i = 0; i < vtoc->efi_nparts; i++) {
+		if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
+		    uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
+			continue;
+
+		/* NB: isdigit() requires a value representable as uchar */
+		if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
+			(void) snprintf(slice_path, sizeof (slice_path),
+			    "%s%s%d", path, "-part", i+1);
+		else
+			(void) snprintf(slice_path, sizeof (slice_path),
+			    "%s%s%d", path,
+			    isdigit((uchar_t)path[strlen(path)-1]) ?
+			    "p" : "", i+1);
+
+		err = check_slice(slice_path, cache, force, isspare);
+		if (err)
+			break;
+	}
+
+	efi_free(vtoc);
+	(void) close(fd);
+
+	return (err);
+}
+
+/* Run check_disk() on 'path' with a blkid cache held for the call only. */
+int
+check_device(const char *path, boolean_t force,
+    boolean_t isspare, boolean_t iswholedisk)
+{
+	blkid_cache blkid;
+	int rc;
+
+	if (blkid_get_cache(&blkid, NULL) != 0) {
+		(void) fprintf(stderr, gettext("unable to access the blkid "
+		    "cache.\n"));
+		return (-1);
+	}
+
+	rc = check_disk(path, blkid, force, isspare, iswholedisk);
+	blkid_put_cache(blkid);
+
+	return (rc);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/README b/sys/contrib/openzfs/cmd/zpool/zpool.d/README
new file mode 100644
index 000000000000..033b7c363f5a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/README
@@ -0,0 +1,9 @@
+This directory contains scripts that can be run with the zpool status/iostat
+-c option:
+
+	zpool status -c script1,script2, ...
+
+	zpool iostat -vc script1,script2, ...
+
+Some scripts output different values depending on the symlink name that is
+used to run them.  See the zpool(8) man page for more details.
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/defect b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps
new file mode 100755
index 000000000000..ee39514e4d92
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps
@@ -0,0 +1,29 @@
+#!/bin/sh
+#
+# Show device mapper dependent / underlying devices.  This is useful for
+# looking up the /dev/sd* devices associated with a dm or multipath device. 
+#
+
+if [ "$1" = "-h" ] ; then
+	echo "Show device mapper dependent (underlying) devices."
+	exit
+fi
+
+dev="$VDEV_PATH"
+
+# If the VDEV path is a symlink, resolve it to a real device
+if [ -L "$dev" ] ; then
+	dev=$(readlink "$dev")
+fi
+
+dev=$(basename "$dev")
+val=""
+if [ -d "/sys/class/block/$dev/slaves" ] ; then
+	# ls -C: output in columns, no newlines
+	val=$(ls -C "/sys/class/block/$dev/slaves")
+
+	# ls -C will print two spaces between files; change to one space.
+	val=$(echo "$val" | sed -r 's/[[:blank:]]+/ /g')
+fi
+
+echo "dm-deps=$val"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/enc b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc
@@ -0,0 +1 @@
+ses
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev
@@ -0,0 +1 @@
+ses
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led
@@ -0,0 +1 @@
+ses
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/health b/sys/contrib/openzfs/cmd/zpool/zpool.d/health
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/health
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat
new file mode 100755
index 000000000000..41a3acfae7a4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat
@@ -0,0 +1,77 @@
+#!/bin/sh
+#
+# Display most relevant iostat bandwidth/latency numbers.  The output is
+# dependent on the name of the script/symlink used to call it.
+#
+
+helpstr="
+iostat:		Show iostat values since boot (summary page).
+iostat-1s:	Do a single 1-second iostat sample and show values.
+iostat-10s:	Do a single 10-second iostat sample and show values."
+
+script=$(basename "$0")
+if [ "$1" = "-h" ] ; then
+	echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+	exit
+fi
+
+if [ "$script" = "iostat-1s" ] ; then
+	# Do a single one-second sample
+	interval=1
+	# Don't show summary stats
+	brief="yes"
+elif [ "$script" = "iostat-10s" ] ; then
+	# Do a single ten-second sample
+	interval=10
+	# Don't show summary stats
+	brief="yes"
+fi
+
+if [ -f "$VDEV_UPATH" ] ; then
+	# We're a file-based vdev, iostat doesn't work on us.  Do nothing.
+	exit
+fi
+
+if [ "$(uname)" = "FreeBSD" ]; then
+	out=$(iostat -dKx \
+		${interval:+"-w $interval"} \
+		${interval:+"-c 1"} \
+		"$VDEV_UPATH" | tail -n 2)
+else
+	out=$(iostat -kx \
+		${brief:+"-y"} \
+		${interval:+"$interval"} \
+		${interval:+"1"} \
+		"$VDEV_UPATH" | awk NF | tail -n 2)
+fi
+
+
+# Sample output (we want the last two lines):
+#
+# Linux 2.6.32-642.13.1.el6.x86_64 (centos68) 	03/09/2017 	_x86_64_	(6 CPU)
+#
+# avg-cpu:  %user   %nice %system %iowait  %steal   %idle
+#           0.00    0.00    0.00    0.00    0.00  100.00
+#
+# Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
+# sdb               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00
+#
+
+# $out holds two lines; the first one carries the column names.
+cols=$(echo "$out" | head -n 1)
+
+# The last line carries the values; turn each run of blanks into one tab
+vals=$(echo "$out" | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g')
+
+i=0
+for col in $cols ; do
+	i=$((i+1))
+	# Skip the first column since it's just the device name
+	if [ $i -eq 1 ]; then
+		continue
+	fi
+
+	# Pair the i'th column name with the i'th tab-separated value
+	val=$(echo "$vals" | cut -f "$i")
+	echo "$col=$val"
+done
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s
new file mode 120000
index 000000000000..084278d99f0f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s
@@ -0,0 +1 @@
+iostat
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s
new file mode 120000
index 000000000000..084278d99f0f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s
@@ -0,0 +1 @@
+iostat
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/label b/sys/contrib/openzfs/cmd/zpool/zpool.d/label
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/label
@@ -0,0 +1 @@
+lsblk
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led
@@ -0,0 +1 @@
+ses
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk
new file mode 100755
index 000000000000..1cdef40494fe
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk
@@ -0,0 +1,83 @@
+#!/bin/sh
+#
+# Print some common lsblk values
+#
+# Any (lowercased) name symlinked to the lsblk script will be passed to lsblk
+# as one of its --output names.  Here's a partial list of --output names
+# from the lsblk binary:
+#
+# Available columns (for --output):
+#        NAME  device name
+#       KNAME  internal kernel device name
+#     MAJ:MIN  major:minor device number
+#      FSTYPE  filesystem type
+#  MOUNTPOINT  where the device is mounted
+#       LABEL  filesystem LABEL
+#        UUID  filesystem UUID
+#          RA  read-ahead of the device
+#          RO  read-only device
+#          RM  removable device
+#       MODEL  device identifier
+#        SIZE  size of the device
+#       STATE  state of the device
+#       OWNER  user name
+#       GROUP  group name
+#        MODE  device node permissions
+#   ALIGNMENT  alignment offset
+#      MIN-IO  minimum I/O size
+#      OPT-IO  optimal I/O size
+#     PHY-SEC  physical sector size
+#     LOG-SEC  logical sector size
+#        ROTA  rotational device
+#       SCHED  I/O scheduler name
+#     RQ-SIZE  request queue size
+#        TYPE  device type
+#    DISC-ALN  discard alignment offset
+#   DISC-GRAN  discard granularity
+#    DISC-MAX  discard max bytes
+#   DISC-ZERO  discard zeroes data
+#
+# If the script is run as just 'lsblk' then print out disk size, vendor,
+# and model number.
+
+
+helpstr="
+label:	Show filesystem label.
+model:	Show disk model number.
+size:	Show the disk capacity.
+vendor:	Show the disk vendor.
+lsblk:	Show the disk size, vendor, and model number."
+
+script=$(basename "$0")
+
+if [ "$1" = "-h" ] ; then
+        echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+        exit
+fi
+
+if [ "$script" = "lsblk" ] ; then
+	list="size vendor model"
+else
+	list=$(echo "$script" | tr '[:upper:]' '[:lower:]')
+fi
+
+# Older versions of lsblk don't support all these values (like SERIAL).
+for i in $list ; do
+
+	# Special case: Looking up the size of a file-based vdev can't
+	# be done with lsblk.
+	if [ "$i" = "size" ] && [ -f "$VDEV_UPATH" ] ; then
+		size=$(du -h --apparent-size "$VDEV_UPATH" | cut -f 1)
+		echo "size=$size"
+		continue
+	fi
+
+	# Run lsblk directly with quoted arguments rather than through eval,
+	# so whitespace or shell metacharacters in the path stay literal.
+	if val=$(lsblk -dl -n -o "$i" "$VDEV_UPATH" 2>/dev/null) ; then
+		# Remove leading/trailing whitespace from value
+		val=$(echo "$val" | sed -e 's/^[[:space:]]*//' \
+		     -e 's/[[:space:]]*$//')
+	fi
+	echo "$i=$val"
+done
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/media b/sys/contrib/openzfs/cmd/zpool/zpool.d/media
new file mode 100755
index 000000000000..05bc15918bc9
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/media
@@ -0,0 +1,27 @@
+#!/bin/sh
+#
+# Print out the type of device
+#
+
+if [ "$1" = "-h" ] ; then
+	echo "Show whether a vdev is a file, hdd, or ssd."
+	exit
+fi
+
+# Start from an empty value so a MEDIA variable inherited from the
+# environment can never leak into the output.
+MEDIA=""
+if [ -b "$VDEV_UPATH" ]; then
+	# sysfs queue/rotational: "0" is reported as ssd, "1" as hdd
+	device=$(basename "$VDEV_UPATH")
+	val=$(cat "/sys/block/$device/queue/rotational" 2>/dev/null)
+	case "$val" in
+	0) MEDIA="ssd" ;;
+	1) MEDIA="hdd" ;;
+	esac
+elif [ -f "$VDEV_UPATH" ]; then
+	MEDIA="file"
+fi
+
+# Anything else (or an unreadable sysfs entry) is reported as "media=".
+echo "media=$MEDIA"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/model b/sys/contrib/openzfs/cmd/zpool/zpool.d/model
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/model
@@ -0,0 +1 @@
+lsblk
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/serial b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ses b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses
new file mode 100755
index 000000000000..f6b7520dfb6c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses
@@ -0,0 +1,52 @@
+#!/bin/sh
+#
+# Print SCSI Enclosure Services (SES) info. The output is dependent on the name
+# of the script/symlink used to call it.
+#
+helpstr="
+enc:		Show disk enclosure w:x:y:z value.
+slot:		Show disk slot number as reported by the enclosure.
+encdev:		Show /dev/sg* device associated with the enclosure disk slot.
+fault_led:	Show value of the disk enclosure slot fault LED.
+locate_led:	Show value of the disk enclosure slot locate LED.
+ses:		Show disk's enc, enc device, slot, and fault/locate LED values."
+
+script=$(basename "$0")
+if [ "$1" = "-h" ] ; then
+	echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+	exit
+fi
+
+if [ "$script" = "ses" ] ; then
+	scripts='enc encdev slot fault_led locate_led'
+else
+	scripts="$script"
+fi
+
+for i in $scripts ; do
+	if [ -z "$VDEV_ENC_SYSFS_PATH" ] ; then	# no enclosure path: empty value
+		echo "$i="
+		continue
+	fi
+
+	val=""	# stays empty for an unknown name or a failed read
+	case $i in
+	enc)		# listing of the directory two levels above the slot path
+		val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null)
+		;;
+	slot)		# contents of the "slot" attribute
+		val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null)
+		;;
+	encdev)		# entry name under ../device/scsi_generic
+		val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null)
+		;;
+	fault_led)	# contents of the "fault" attribute
+		val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null)
+		;;
+	locate_led)	# contents of the "locate" attribute
+		val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null)
+		;;
+	esac
+	echo "$i=$val"
+done
+
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/size b/sys/contrib/openzfs/cmd/zpool/zpool.d/size
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/size
@@ -0,0 +1 @@
+lsblk
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/slot b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot
@@ -0,0 +1 @@
+ses
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart
new file mode 100755
index 000000000000..f8854b75227c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart
@@ -0,0 +1,243 @@
+#!/bin/sh
+#
+# Show SMART stats
+#
+
+helpstr="
+smart:		Show SMART temperature and error stats (specific to drive type)
+smartx:		Show SMART extended drive stats (specific to drive type).
+temp:		Show SMART drive temperature in celsius (all drives).
+health:		Show reported SMART status (all drives).
+r_proc:		Show SMART read GBytes processed over drive lifetime (SAS).
+w_proc:		Show SMART write GBytes processed over drive lifetime (SAS).
+r_ucor:		Show SMART read uncorrectable errors (SAS).
+w_ucor:		Show SMART write uncorrectable errors (SAS).
+nonmed:		Show SMART non-medium errors (SAS).
+defect:		Show SMART grown defect list (SAS).
+hours_on:	Show number of hours drive powered on (all drives).
+realloc:	Show SMART reallocated sectors count (ATA).
+rep_ucor:	Show SMART reported uncorrectable count (ATA).
+cmd_to:		Show SMART command timeout count (ATA).
+pend_sec:	Show SMART current pending sector count (ATA).
+off_ucor:	Show SMART offline uncorrectable errors (ATA).
+ata_err:	Show SMART ATA errors (ATA).
+pwr_cyc:	Show SMART power cycle count (ATA).
+serial:		Show disk serial number.
+nvme_err:	Show SMART NVMe errors (NVMe).
+smart_test:	Show SMART self-test results summary.
+test_type:	Show SMART self-test type (short, long... ).
+test_status:	Show SMART self-test status.
+test_progress:	Show SMART self-test percentage done.
+test_ended:	Show when the last SMART self-test ended (if supported).
+"
+
+# Hack for developer testing
+#
+# If you set $samples to a directory containing smartctl output text files,
+# we will use them instead of running smartctl on the vdevs.  This can be
+# useful if you want to test a bunch of different smartctl outputs.  Also, if
+# $samples is set, an additional 'file' column is added to the zpool output
+# showing the filename.
+samples=
+
+# get_filename_from_dir DIR
+#
+# Print the name of one regular file directly inside DIR, chosen by our PID
+# modulo the file count, so that successive invocations (one per vdev) tend
+# to pick different files without keeping state.  The count and the listing
+# use the same -maxdepth; an empty DIR prints nothing and returns non-zero.
+get_filename_from_dir()
+{
+	dir=$1
+	num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
+	[ "$((num_files))" -gt 0 ] || return 1
+	mod=$(($$ % num_files))
+	i=0
+	find "$dir" -maxdepth 1 -type f -printf "%f\n" | while read -r file ; do
+		if [ "$mod" = "$i" ] ; then
+			echo "$file"
+			break
+		fi
+		i=$((i+1))
+	done
+}
+
+script=$(basename "$0")
+
+if [ "$1" = "-h" ] ; then
+        echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+        exit
+fi
+
+smartctl_path=$(command -v smartctl)
+
+# shellcheck disable=SC2015
+if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
+	if [ -n "$samples" ] ; then
+		# Developer testing: cat a saved smartctl output instead.
+		file=$(get_filename_from_dir "$samples")
+		echo "file=$file"
+		raw_out=$(cat "$samples/$file")
+	else
+		# Quoted, not eval'd: the path reaches smartctl verbatim
+		raw_out=$(sudo "$smartctl_path" -a "$VDEV_UPATH")
+	fi
+
+	# What kind of drive are we?  Look for the right line in smartctl:
+	#
+	# SAS:
+	#	Transport protocol:   SAS
+	#
+	# SATA:
+	#	ATA Version is:   8
+	#
+	# NVMe:
+	#       SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
+	#
+	out=$(echo "$raw_out" | awk '
+# SAS specific
+/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
+/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
+/Non-medium error count/{print "nonmed="$4}
+/Elements in grown defect list/{print "defect="$6}
+
+# SAS common
+/SAS/{type="sas"}
+/Drive Temperature:/{print "temp="$4}
+# Status can be a long string, substitute spaces for '_'
+/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
+/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
+/Serial number:/{print "serial="$3}
+
+# SATA specific
+/Reallocated_Sector_Ct/{print "realloc="$10}
+/Reported_Uncorrect/{print "rep_ucor="$10}
+/Command_Timeout/{print "cmd_to="$10}
+/Current_Pending_Sector/{print "pend_sec="$10}
+/Offline_Uncorrectable/{print "off_ucor="$10}
+/ATA Error Count:/{print "ata_err="$4}
+/Power_Cycle_Count/{print "pwr_cyc="$10}
+
+# SATA common
+/SATA/{type="sata"}
+/Temperature_Celsius/{print "temp="$10}
+/Airflow_Temperature_Cel/{print "temp="$10}
+/Current Temperature:/{print "temp="$3}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
+/Serial Number:/{print "serial="$3}
+
+# NVMe common
+/NVMe/{type="nvme"}
+/Temperature:/{print "temp="$2}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
+/Serial Number:/{print "serial="$3}
+/Power Cycles:/{print "pwr_cyc="$3}
+
+# NVMe specific
+/Media and Data Integrity Errors:/{print "nvme_err="$6}
+
+# SMART self-test info
+/Self-test execution status:/{progress=tolower($4)} # SAS
+/SMART Self-test log/{test_seen=1} # SAS
+/SMART Extended Self-test Log/{test_seen=1} # SATA
+/# 1/{
+	test_type=tolower($3"_"$4);
+	# Status could be one word ("Completed") or multiple ("Completed: read
+	# failure").  Look for the ":" to see if we need to grab more words.
+
+	if ($5 ~ ":")
+		status=tolower($5""$6"_"$7)
+	else
+		status=tolower($5)
+	if (status=="self")
+		status="running";
+
+	if (type == "sas") {
+		hours=int($(NF-4))
+	} else {
+		hours=int($(NF-1))
+		# SATA reports percent remaining, rather than percent done
+		# Convert it to percent done.
+		progress=(100-int($(NF-2)))"%"
+	}
+	# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
+	# 0.  In those cases, set it to hours_on, so they will cancel out in
+	# the "hours_ago" calculation later on.
+	if (hours == 0)
+		hours=hours_on
+
+	if (test_seen) {
+		print "test="hours_on
+		print "test_type="test_type
+		print "test_status="status
+		print "test_progress="progress
+	}
+	# Not all drives report hours_on
+	if (hours_on && hours) {
+		total_hours_ago=(hours_on-hours)
+		days_ago=int(total_hours_ago/24)
+		hours_ago=(total_hours_ago % 24)
+		if (days_ago != 0)
+			ago_str=days_ago"d"
+		if (hours_ago !=0)
+			ago_str=ago_str""hours_ago"h"
+		print "test_ended="ago_str
+	}
+}
+
+END {print "type="type; ORS="\n"; print ""}
+');
+fi
+type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
+
+# If type is not set by now, either we don't have a block device
+# or smartctl failed. Either way, default to ATA and set $out to
+# nothing.
+if [ -z "$type" ]; then
+	type="sata"
+	out=
+fi
+
+case $script in
+smart)
+	# Print temperature plus common predictors of drive failure
+	if [ "$type" = "sas" ] ; then
+		scripts="temp|health|r_ucor|w_ucor"
+	elif [ "$type" = "sata" ] ; then
+		scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
+	elif [ "$type" = "nvme" ] ; then
+		scripts="temp|health|nvme_err"
+	fi
+	;;
+smartx)
+	# Print some other interesting stats
+	if [ "$type" = "sas" ] ; then
+		scripts="hours_on|defect|nonmed|r_proc|w_proc"
+	elif [ "$type" = "sata" ] ; then
+		scripts="hours_on|pwr_cyc"
+	elif [ "$type" = "nvme" ] ; then
+		scripts="hours_on|pwr_cyc"
+	fi
+	;;
+smart_test)
+	scripts="test_type|test_status|test_progress|test_ended"
+	;;
+*)
+	scripts="$script"
+esac
+
+with_vals=$(echo "$out" | grep -E "$scripts")
+if [ -n "$with_vals" ]; then
+	echo "$with_vals"
+	without_vals=$(echo "$scripts" | tr "|" "\n" |
+		grep -v -E "$(echo "$with_vals" |
+		awk -F "=" '{print $1}')" | awk '{print $0"="}')
+else
+	without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}')
+fi
+
+if [ -n "$without_vals" ]; then
+	echo "$without_vals"
+fi
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/temp b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/upath b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath
new file mode 100755
index 000000000000..16a4327d4850
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath
@@ -0,0 +1,7 @@
+#!/bin/sh
+if [ "$1" = "-h" ] ; then
+	echo "Show the underlying path for a device."
+	exit
+fi
+
+echo upath="$VDEV_UPATH"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor
@@ -0,0 +1 @@
+lsblk
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor
@@ -0,0 +1 @@
+smart
+\ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
new file mode 100644
index 000000000000..5f3153bca2c2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
@@ -0,0 +1,757 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <thread_pool.h>
+
+#include <libzfs.h>
+#include <libzutil.h>
+#include <sys/zfs_context.h>
+#include <sys/wait.h>
+
+#include "zpool_util.h"
+
+/*
+ * Private interface for iterating over pools specified on the command line.
+ * Most consumers will call for_each_pool, but in order to support iostat, we
+ * allow fine-grained control through the zpool_list_t interface.
+ */
+
+typedef struct zpool_node {
+	zpool_handle_t	*zn_handle;	/* pool handle; closed on removal */
+	uu_avl_node_t	zn_avlnode;	/* linkage into the list's zl_avl */
+	int		zn_mark;	/* not referenced in this file */
+} zpool_node_t;
+
+struct zpool_list {
+	boolean_t	zl_findall;	/* B_TRUE when no pools were named */
+	uu_avl_t	*zl_avl;	/* zpool_node_t's ordered by name */
+	uu_avl_pool_t	*zl_pool;	/* uu_avl pool that zl_avl uses */
+	zprop_list_t	**zl_proplist;	/* may be NULL; expanded per pool */
+};
+
+/*
+ * AVL comparator for zpool_node_t entries: orders two nodes by the names of
+ * the pools their handles refer to.  The third argument is ignored.
+ */
+/* ARGSUSED */
+static int
+zpool_compare(const void *larg, const void *rarg, void *unused)
+{
+	const zpool_node_t *lnode = larg;
+	const zpool_node_t *rnode = rarg;
+	const char *lname = zpool_get_name(lnode->zn_handle);
+	const char *rname = zpool_get_name(rnode->zn_handle);
+
+	return (strcmp(lname, rname));
+}
+
+/*
+ * Callback used by pool_list_get() and pool_list_update().  Inserts the pool
+ * into the list's AVL tree, taking ownership of the handle.
+ *
+ * Returns 0 once the pool has been inserted.  Returns -1, after closing the
+ * handle, if a pool with the same name is already in the tree or if the
+ * list's property list could not be expanded for this pool.
+ */
+static int
+add_pool(zpool_handle_t *zhp, void *data)
+{
+	zpool_list_t *list = data;
+	zpool_node_t *entry = safe_malloc(sizeof (zpool_node_t));
+	uu_avl_index_t where;
+
+	entry->zn_handle = zhp;
+	uu_avl_node_init(entry, &entry->zn_avlnode, list->zl_pool);
+
+	/*
+	 * Reject duplicates first; the property list is only expanded for
+	 * pools that are not yet in the tree.
+	 */
+	if (uu_avl_find(list->zl_avl, entry, NULL, &where) != NULL ||
+	    (list->zl_proplist &&
+	    zpool_expand_proplist(zhp, list->zl_proplist) != 0)) {
+		zpool_close(zhp);
+		free(entry);
+		return (-1);
+	}
+
+	uu_avl_insert(list->zl_avl, entry, where);
+	return (0);
+}
+
+/*
+ * Build a pool list from the command line arguments.  With no arguments,
+ * every pool found by zpool_iter() is added and the list is marked so that
+ * pool_list_update() will look for new pools later.  Otherwise only those
+ * pools explicitly named in argv are opened and added.
+ *
+ * *err is set to B_TRUE if a named pool cannot be opened or added; it is
+ * never cleared here, so the caller must initialize it.
+ */
+zpool_list_t *
+pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err)
+{
+	zpool_list_t *zlp = safe_malloc(sizeof (zpool_list_t));
+	int i;
+
+	zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t),
+	    offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT);
+	if (zlp->zl_pool == NULL)
+		zpool_no_memory();
+
+	zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, UU_DEFAULT);
+	if (zlp->zl_avl == NULL)
+		zpool_no_memory();
+
+	zlp->zl_proplist = proplist;
+
+	if (argc == 0) {
+		/* Nothing named: take every pool and remember that we did. */
+		(void) zpool_iter(g_zfs, add_pool, zlp);
+		zlp->zl_findall = B_TRUE;
+		return (zlp);
+	}
+
+	for (i = 0; i < argc; i++) {
+		zpool_handle_t *zhp = zpool_open_canfail(g_zfs, argv[i]);
+
+		/* add_pool() is only attempted when the open succeeded */
+		if (zhp == NULL || add_pool(zhp, zlp) != 0)
+			*err = B_TRUE;
+	}
+
+	return (zlp);
+}
+
+/*
+ * Look for pools that are not yet in the list and add them.  This is only
+ * done when the list was built without naming any pools; a list built from
+ * explicit pool names stays fixed.
+ */
+void
+pool_list_update(zpool_list_t *zlp)
+{
+	if (!zlp->zl_findall)
+		return;
+
+	(void) zpool_iter(g_zfs, add_pool, zlp);
+}
+
+/*
+ * Call func on every pool in the list.  Pools in POOL_STATE_UNAVAIL are
+ * skipped unless 'unavail' is non-zero.  The return value is the bitwise OR
+ * of all the callback results.
+ */
+int
+pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
+    void *data)
+{
+	zpool_node_t *cur, *following;
+	int status = 0;
+
+	cur = uu_avl_first(zlp->zl_avl);
+	while (cur != NULL) {
+		/*
+		 * Fetch the successor before running the callback
+		 * (presumably so that the callback may remove 'cur').
+		 */
+		following = uu_avl_next(zlp->zl_avl, cur);
+
+		if (zpool_get_state(cur->zn_handle) != POOL_STATE_UNAVAIL ||
+		    unavail)
+			status |= func(cur->zn_handle, data);
+
+		cur = following;
+	}
+
+	return (status);
+}
+
+/*
+ * Drop the given pool from the list, closing its handle.  Used by iostat to
+ * forget pools that no longer exist.  Does nothing if the pool is not in the
+ * list.
+ */
+void
+pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
+{
+	zpool_node_t key, *found;
+
+	/* Only zn_handle is consulted by the comparator */
+	key.zn_handle = zhp;
+
+	found = uu_avl_find(zlp->zl_avl, &key, NULL, NULL);
+	if (found == NULL)
+		return;
+
+	uu_avl_remove(zlp->zl_avl, found);
+	zpool_close(found->zn_handle);
+	free(found);
+}
+
+/*
+ * Free all the handles associated with this list, then the AVL tree, its
+ * uu_avl pool, and the list itself.  Exits with status 1 if the walk over
+ * the tree cannot be started.
+ */
+void
+pool_list_free(zpool_list_t *zlp)
+{
+	uu_avl_walk_t *walk;
+	zpool_node_t *node;
+
+	if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) {
+		/* Terminate the message so it does not run into the prompt */
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		exit(1);
+	}
+
+	/* A robust walk lets us remove each node as we visit it */
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(zlp->zl_avl, node);
+		zpool_close(node->zn_handle);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(zlp->zl_avl);
+	uu_avl_pool_destroy(zlp->zl_pool);
+
+	free(zlp);
+}
+
+/*
+ * Report how many pools are currently held in the list.
+ */
+int
+pool_list_count(zpool_list_t *zlp)
+{
+	int npools = uu_avl_numnodes(zlp->zl_avl);
+
+	return (npools);
+}
+
+/*
+ * Convenience wrapper around the pool_list_* interfaces: builds the list of
+ * pools named on the command line (or all pools if none were named), runs
+ * func on each of them, and frees the list again.
+ *
+ * Returns non-zero if a pool could not be opened or added, or if any
+ * callback returned non-zero.
+ */
+int
+for_each_pool(int argc, char **argv, boolean_t unavail,
+    zprop_list_t **proplist, zpool_iter_f func, void *data)
+{
+	zpool_list_t *pools;
+	int err = 0;
+
+	pools = pool_list_get(argc, argv, proplist, &err);
+	if (pools == NULL)
+		return (1);
+
+	if (pool_list_iter(pools, unavail, func, data) != 0)
+		err = 1;
+
+	pool_list_free(pools);
+
+	return (err);
+}
+
+/*
+ * Recursive worker for for_each_vdev().  Descends into the spares, l2cache
+ * and children arrays of 'nv' (in that order), skipping holes, and then calls
+ * func on 'nv' itself unless it has no type or is the root vdev.  The return
+ * value is the bitwise OR of all the func() results.
+ */
+static int
+for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func,
+    void *data)
+{
+	const char *arrays[] = {
+	    ZPOOL_CONFIG_SPARES,
+	    ZPOOL_CONFIG_L2CACHE,
+	    ZPOOL_CONFIG_CHILDREN
+	};
+	nvlist_t **kids;
+	uint_t k, nkids;
+	char *type;
+	int status = 0;
+	int a;
+
+	for (a = 0; a < ARRAY_SIZE(arrays); a++) {
+		if (nvlist_lookup_nvlist_array(nv, arrays[a], &kids,
+		    &nkids) != 0)
+			continue;
+
+		for (k = 0; k < nkids; k++) {
+			uint64_t ishole = 0;
+
+			(void) nvlist_lookup_uint64(kids[k],
+			    ZPOOL_CONFIG_IS_HOLE, &ishole);
+
+			if (!ishole)
+				status |= for_each_vdev_cb(zhp, kids[k], func,
+				    data);
+		}
+	}
+
+	/* Don't run our function on untyped or root vdevs */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0 &&
+	    strcmp(type, VDEV_TYPE_ROOT) != 0)
+		status |= func(zhp, nv, data);
+
+	return (status);
+}
+
+/*
+ * This is the equivalent of for_each_pool() for vdevs.  It iterates through
+ * all vdevs in the pool, ignoring root vdevs and holes, calling func() on
+ * each one.
+ *
+ * @zhp:	Zpool handle
+ * @func:	Function to call on each vdev
+ * @data:	Custom data to pass to the function
+ *
+ * Returns the bitwise OR of the values returned by func().
+ */
+int
+for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data)
+{
+	nvlist_t *nvroot = NULL;
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+
+	/* Without a config the walk starts from a NULL vdev tree */
+	if (config != NULL) {
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+	}
+
+	return (for_each_vdev_cb(zhp, nvroot, func, data));
+}
+
+/*
+ * Process the vcdl->data[] array to figure out all the unique column
+ * names and their widths.  When this function is done, vcdl->uniq_cols,
+ * vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be filled in.
+ *
+ * vcdl->uniq_cols[] borrows the name strings owned by the per-vdev cols[]
+ * arrays; the strings are not copied.  Entries that are NULL (a failed
+ * strdup() while the command output was collected) are ignored.
+ */
+static void
+process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl)
+{
+	char **uniq_cols = NULL, **tmp = NULL;
+	int *uniq_cols_width;
+	vdev_cmd_data_t *data;
+	int cnt = 0;
+	int k;
+
+	/* For each vdev */
+	for (int i = 0; i < vcdl->count; i++) {
+		data = &vcdl->data[i];
+		/* For each column the vdev reported */
+		for (int j = 0; j < data->cols_cnt; j++) {
+			/* Skip names that could not be allocated */
+			if (data->cols[j] == NULL)
+				continue;
+
+			/* Is this column in our list of unique column names? */
+			for (k = 0; k < cnt; k++) {
+				if (strcmp(data->cols[j], uniq_cols[k]) == 0)
+					break; /* yes it is */
+			}
+			if (k == cnt) {
+				/* No entry for column, add to list */
+				tmp = realloc(uniq_cols, sizeof (*uniq_cols) *
+				    (cnt + 1));
+				if (tmp == NULL)
+					break; /* Nothing we can do... */
+				uniq_cols = tmp;
+				uniq_cols[cnt] = data->cols[j];
+				cnt++;
+			}
+		}
+	}
+
+	/*
+	 * We now have a list of all the unique column names.  Figure out the
+	 * max width of each column by looking at the column name and all its
+	 * values.
+	 */
+	uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt);
+	for (int i = 0; i < cnt; i++) {
+		/* Start off with the column title's width */
+		uniq_cols_width[i] = (int)strlen(uniq_cols[i]);
+		/* For each vdev */
+		for (int j = 0; j < vcdl->count; j++) {
+			/* For each of the vdev's values in a column */
+			data = &vcdl->data[j];
+			for (k = 0; k < data->cols_cnt; k++) {
+				/* Does this vdev have a value for this col? */
+				if (data->cols[k] == NULL ||
+				    strcmp(data->cols[k], uniq_cols[i]) != 0)
+					continue;
+
+				/* No usable value stored for this column */
+				if (k >= data->lines_cnt ||
+				    data->lines[k] == NULL)
+					continue;
+
+				/* Is the value width larger? */
+				uniq_cols_width[i] =
+				    MAX(uniq_cols_width[i],
+				    (int)strlen(data->lines[k]));
+			}
+		}
+	}
+
+	vcdl->uniq_cols = uniq_cols;
+	vcdl->uniq_cols_cnt = cnt;
+	vcdl->uniq_cols_width = uniq_cols_width;
+}
+
+
+/*
+ * Process a line of command output
+ *
+ * When running 'zpool iostat|status -c' the lines of output can either be
+ * in the form of:
+ *
+ *	column_name=value
+ *
+ * Or just:
+ *
+ *	value
+ *
+ * Process the column_name (if any) and value.  A 'column_name=value' line is
+ * split in place by overwriting the '=' in 'line' with '\0'.  Copies of the
+ * value and of the column name (if any) are appended to data->lines[] and
+ * data->cols[]; a column name that is already present is skipped.  On an
+ * allocation failure neither array gains an entry.
+ *
+ * Returns 0 if the line was processed and more lines can still be
+ * processed.
+ *
+ * Returns 1 if this was the last line to process (a bare value, or a NULL
+ * line), or on error.
+ */
+static int
+vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)
+{
+	char *col = NULL;
+	char *val;
+	char *equals;
+	char *valdup = NULL;
+	char *coldup = NULL;
+	char **tmp;
+
+	if (line == NULL)
+		return (1);
+
+	equals = strchr(line, '=');
+	if (equals != NULL) {
+		/*
+		 * We have a 'column=value' type line.  Split it into the
+		 * column and value strings by turning the '=' into a '\0'.
+		 */
+		*equals = '\0';
+		col = line;
+		val = equals + 1;
+	} else {
+		val = line;
+	}
+
+	/* Do we already have a column by this name?  If so, skip it. */
+	if (col != NULL) {
+		for (int i = 0; i < data->cols_cnt; i++) {
+			if (strcmp(col, data->cols[i]) == 0)
+				return (0); /* Duplicate, skip */
+		}
+	}
+
+	/*
+	 * Make the copies before touching the arrays, so that a failure
+	 * cannot leave a NULL entry behind or add a value without its column.
+	 */
+	valdup = strdup(val);
+	if (valdup == NULL)
+		goto fail;
+
+	if (col != NULL) {
+		coldup = strdup(col);
+		if (coldup == NULL)
+			goto fail;
+	}
+
+	tmp = realloc(data->lines,
+	    (data->lines_cnt + 1) * sizeof (*data->lines));
+	if (tmp == NULL)
+		goto fail;
+	data->lines = tmp;
+
+	if (col != NULL) {
+		tmp = realloc(data->cols,
+		    (data->cols_cnt + 1) * sizeof (*data->cols));
+		if (tmp == NULL)
+			goto fail;
+		data->cols = tmp;
+
+		data->cols[data->cols_cnt] = coldup;
+		data->cols_cnt++;
+	}
+
+	data->lines[data->lines_cnt] = valdup;
+	data->lines_cnt++;
+
+	/* A bare value (no column name) ends the processing */
+	if (col == NULL)
+		return (1);
+
+	return (0);
+
+fail:
+	free(valdup);
+	free(coldup);
+	return (1);
+}
+
+/*
+ * Run the cmd and store results in *data.
+ *
+ * The command is run with a fixed PATH plus VDEV_PATH, VDEV_UPATH and
+ * VDEV_ENC_SYSFS_PATH taken from *data (empty strings when unset).  Each
+ * output line is passed to vdev_process_cmd_output() until that function
+ * reports that it is done.  Failures are silent: *data simply gains no
+ * further lines.
+ */
+static void
+vdev_run_cmd(vdev_cmd_data_t *data, char *cmd)
+{
+	int rc;
+	char *argv[2] = {cmd, NULL};
+	char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL,
+	    NULL};
+	char **lines = NULL;
+	int lines_cnt = 0;
+	int i;
+
+	/*
+	 * Setup our custom environment variables.  The pointer is not
+	 * guaranteed to be usable after a failed asprintf(), so reset it to
+	 * NULL before the cleanup code below gets to free it.
+	 */
+	rc = asprintf(&env[1], "VDEV_PATH=%s",
+	    data->path ? data->path : "");
+	if (rc == -1) {
+		env[1] = NULL;
+		goto out;
+	}
+
+	rc = asprintf(&env[2], "VDEV_UPATH=%s",
+	    data->upath ? data->upath : "");
+	if (rc == -1) {
+		env[2] = NULL;
+		goto out;
+	}
+
+	rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s",
+	    data->vdev_enc_sysfs_path ?
+	    data->vdev_enc_sysfs_path : "");
+	if (rc == -1) {
+		env[3] = NULL;
+		goto out;
+	}
+
+	/* Run the command */
+	rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines,
+	    &lines_cnt);
+	if (rc != 0)
+		goto out;
+
+	/* Process the output we got */
+	for (i = 0; i < lines_cnt; i++)
+		if (vdev_process_cmd_output(data, lines[i]) != 0)
+			break;
+
+out:
+	if (lines != NULL)
+		libzfs_free_str_array(lines, lines_cnt);
+
+	/* Start with i = 1 since env[0] was statically allocated */
+	for (i = 1; i < ARRAY_SIZE(env); i++)
+		free(env[i]);
+}
+
+/*
+ * Build the colon-separated directory list that is searched for
+ * 'zpool iostat/status -c' scripts:
+ *
+ *	1. $ZPOOL_SCRIPTS_PATH, verbatim, if it is set; otherwise
+ *	2. $HOME/.zpool.d followed by ZPOOL_SCRIPTS_DIR, if $HOME is set;
+ *	3. ZPOOL_SCRIPTS_DIR alone.
+ *
+ * The string returned must be freed.  Returns NULL if it could not be
+ * allocated.
+ */
+char *
+zpool_get_cmd_search_path(void)
+{
+	const char *override = getenv("ZPOOL_SCRIPTS_PATH");
+	const char *home;
+	char *path = NULL;
+
+	if (override != NULL)
+		return (strdup(override));
+
+	home = getenv("HOME");
+	if (home != NULL &&
+	    asprintf(&path, "%s/.zpool.d:%s", home, ZPOOL_SCRIPTS_DIR) != -1)
+		return (path);
+
+	/* No $HOME, or the combined path could not be built */
+	if (asprintf(&path, "%s", ZPOOL_SCRIPTS_DIR) == -1)
+		return (NULL);
+
+	return (path);
+}
+
+/*
+ * Thread function run for each vdev.
+ *
+ * data->cmd is a comma-separated list of script names.  Each name is looked
+ * up in the directories returned by zpool_get_cmd_search_path() and the first
+ * executable match is run with vdev_run_cmd().  Names containing a '/' are
+ * ignored, so only scripts from the search path can be run.
+ */
+static void
+vdev_run_cmd_thread(void *cb_cmd_data)
+{
+	vdev_cmd_data_t *data = cb_cmd_data;
+	char *cmd = NULL, *cmddup, *cmdrest;
+
+	/* strtok_r() modifies its input, so work on a private copy */
+	cmddup = strdup(data->cmd);
+	if (cmddup == NULL)
+		return;
+
+	cmdrest = cmddup;
+	while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) {
+		char *dir = NULL, *sp, *sprest;
+		char fullpath[MAXPATHLEN];
+
+		if (strchr(cmd, '/') != NULL)
+			continue;
+
+		sp = zpool_get_cmd_search_path();
+		if (sp == NULL)
+			continue;
+
+		sprest = sp;
+		while ((dir = strtok_r(sprest, ":", &sprest))) {
+			int len = snprintf(fullpath, sizeof (fullpath),
+			    "%s/%s", dir, cmd);
+
+			/*
+			 * Skip on error and also on truncation: a truncated
+			 * path could name a different file than intended.
+			 */
+			if (len < 0 || (size_t)len >= sizeof (fullpath))
+				continue;
+
+			if (access(fullpath, X_OK) == 0) {
+				vdev_run_cmd(data, fullpath);
+				break;
+			}
+		}
+		free(sp);
+	}
+	free(cmddup);
+}
+
+/*
+ * for_each_vdev() callback that records a vdev in the vcdl->data[] array so
+ * that a command can be run on it later.  Nothing is run here.
+ *
+ * Returns 1 if the vdev has no path, ENOMEM if memory could not be
+ * allocated (vcdl is left unchanged in that case), and 0 otherwise,
+ * including when the vdev is skipped because it is already recorded or was
+ * not selected by name.
+ */
+static int
+for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl)
+{
+	vdev_cmd_data_list_t *vcdl = cb_vcdl;
+	vdev_cmd_data_t *data, *newdata;
+	char *path = NULL;
+	char *vname = NULL;
+	char *vdev_enc_sysfs_path = NULL;
+	int i, match = 0;
+
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		return (1);
+
+	/* Optional: vdev_enc_sysfs_path stays NULL if it is not present */
+	(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+	    &vdev_enc_sysfs_path);
+
+	/* Spares show more than once if they're in use, so skip if exists */
+	for (i = 0; i < vcdl->count; i++) {
+		if ((strcmp(vcdl->data[i].path, path) == 0) &&
+		    (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) {
+			/* vdev already exists, skip it */
+			return (0);
+		}
+	}
+
+	/* Check for selected vdevs here, if any */
+	if (vcdl->vdev_names_count > 0) {
+		/* The name does not change per iteration; look it up once */
+		vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags);
+		for (i = 0; i < vcdl->vdev_names_count; i++) {
+			if (strcmp(vcdl->vdev_names[i], vname) == 0) {
+				match = 1;
+				break; /* match */
+			}
+		}
+		free(vname);
+	}
+
+	/* If we selected vdevs, and this isn't one of them, then bail out */
+	if (!match && vcdl->vdev_names_count)
+		return (0);
+
+	/*
+	 * Resize our array and add in the new element.  Go through a
+	 * temporary so that the existing array survives a failed realloc().
+	 */
+	newdata = realloc(vcdl->data,
+	    sizeof (*vcdl->data) * (vcdl->count + 1));
+	if (newdata == NULL)
+		return (ENOMEM);	/* couldn't realloc */
+	vcdl->data = newdata;
+
+	data = &vcdl->data[vcdl->count];
+
+	/* pool and path are compared with strcmp() above; never store NULL */
+	data->pool = strdup(zpool_get_name(zhp));
+	data->path = strdup(path);
+	if (data->pool == NULL || data->path == NULL) {
+		free(data->pool);
+		free(data->path);
+		return (ENOMEM);
+	}
+
+	data->upath = zfs_get_underlying_path(path);
+	data->cmd = vcdl->cmd;
+	data->lines = data->cols = NULL;
+	data->lines_cnt = data->cols_cnt = 0;
+	if (vdev_enc_sysfs_path)
+		data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path);
+	else
+		data->vdev_enc_sysfs_path = NULL;
+
+	vcdl->count++;
+
+	return (0);
+}
+
+/*
+ * for_each_pool() callback: gather the vdevs of one pool into the
+ * vdev_cmd_data_list_t passed as cb_vcdl.
+ */
+static int
+all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl)
+{
+	vdev_cmd_data_list_t *vcdl = cb_vcdl;
+
+	return (for_each_vdev(zhp, for_each_vdev_run_cb, vcdl));
+}
+
+/*
+ * With vcdl holding the complete list of vdevs, dispatch the command for
+ * each of them to a thread pool and wait until they have all finished.  The
+ * pool may grow to five threads per online processor.  Nothing is run if
+ * the thread pool cannot be created.
+ */
+static void
+all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl)
+{
+	tpool_t *workers;
+	int v;
+
+	workers = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
+	if (workers == NULL)
+		return;
+
+	/* One job per vdev */
+	for (v = 0; v < vcdl->count; v++) {
+		vdev_cmd_data_t *entry = &vcdl->data[v];
+
+		(void) tpool_dispatch(workers, vdev_run_cmd_thread, entry);
+	}
+
+	/* Block until every job is done, then tear the pool down */
+	tpool_wait(workers);
+	tpool_destroy(workers);
+}
+
+/*
+ * Run command 'cmd' on all vdevs in all pools in argv.  The output of the
+ * command is saved per vdev in vcdl->data[].lines and vcdl->data[].cols.  If
+ * you want to run the command on only certain vdevs, fill in g_zfs,
+ * vdev_names, vdev_names_count, and cb_name_flags.  Otherwise leave them as
+ * zero.
+ *
+ * Returns a vdev_cmd_data_list_t that must be freed with
+ * free_vdev_cmd_data_list();
+ */
+vdev_cmd_data_list_t *
+all_pools_for_each_vdev_run(int argc, char **argv, char *cmd,
+    libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
+    int cb_name_flags)
+{
+	vdev_cmd_data_list_t *vcdl = safe_malloc(sizeof (*vcdl));
+
+	vcdl->cmd = cmd;
+	vcdl->g_zfs = g_zfs;
+	vcdl->vdev_names = vdev_names;
+	vcdl->vdev_names_count = vdev_names_count;
+	vcdl->cb_name_flags = cb_name_flags;
+
+	/* Step 1: collect every vdev of every requested pool */
+	for_each_pool(argc, argv, B_TRUE, NULL,
+	    all_pools_for_each_vdev_gather_cb, vcdl);
+
+	/* Step 2: run the command on each collected vdev */
+	all_pools_for_each_vdev_run_vcdl(vcdl);
+
+	/*
+	 * Step 3: vcdl->data[] now holds the column names and values of
+	 * every vdev.  Reduce that to the list of unique column names and
+	 * work out how wide each column has to be.
+	 */
+	process_unique_cmd_columns(vcdl);
+
+	return (vcdl);
+}
+
+/*
+ * Release a vdev_cmd_data_list_t obtained from all_pools_for_each_vdev_run(),
+ * including every string stored for each vdev.  The strings that
+ * vcdl->uniq_cols[] points to belong to the per-vdev cols[] arrays, so only
+ * the uniq_cols array itself is freed.
+ */
+void
+free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl)
+{
+	int v, n;
+
+	free(vcdl->uniq_cols);
+	free(vcdl->uniq_cols_width);
+
+	for (v = 0; v < vcdl->count; v++) {
+		vdev_cmd_data_t *entry = &vcdl->data[v];
+
+		free(entry->path);
+		free(entry->pool);
+		free(entry->upath);
+
+		for (n = 0; n < entry->lines_cnt; n++)
+			free(entry->lines[n]);
+		free(entry->lines);
+
+		for (n = 0; n < entry->cols_cnt; n++)
+			free(entry->cols[n]);
+		free(entry->cols);
+
+		free(entry->vdev_enc_sysfs_path);
+	}
+
+	free(vcdl->data);
+	free(vcdl);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
new file mode 100644
index 000000000000..f3756a5d9547
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -0,0 +1,10326 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
+ * Copyright (c) 2012 by Cyril Plisko. All rights reserved.
+ * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <time.h>
+#include <unistd.h>
+#include <pwd.h>
+#include <zone.h>
+#include <sys/wait.h>
+#include <zfs_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/stat.h>
+#include <sys/systeminfo.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+
+#include <math.h>
+
+#include <libzfs.h>
+#include <libzutil.h>
+
+#include "zpool_util.h"
+#include "zfs_comutil.h"
+#include "zfeature_common.h"
+
+#include "statcommon.h"
+
+libzfs_handle_t *g_zfs;
+
+static int zpool_do_create(int, char **);
+static int zpool_do_destroy(int, char **);
+
+static int zpool_do_add(int, char **);
+static int zpool_do_remove(int, char **);
+static int zpool_do_labelclear(int, char **);
+
+static int zpool_do_checkpoint(int, char **);
+
+static int zpool_do_list(int, char **);
+static int zpool_do_iostat(int, char **);
+static int zpool_do_status(int, char **);
+
+static int zpool_do_online(int, char **);
+static int zpool_do_offline(int, char **);
+static int zpool_do_clear(int, char **);
+static int zpool_do_reopen(int, char **);
+
+static int zpool_do_reguid(int, char **);
+
+static int zpool_do_attach(int, char **);
+static int zpool_do_detach(int, char **);
+static int zpool_do_replace(int, char **);
+static int zpool_do_split(int, char **);
+
+static int zpool_do_initialize(int, char **);
+static int zpool_do_scrub(int, char **);
+static int zpool_do_resilver(int, char **);
+static int zpool_do_trim(int, char **);
+
+static int zpool_do_import(int, char **);
+static int zpool_do_export(int, char **);
+
+static int zpool_do_upgrade(int, char **);
+
+static int zpool_do_history(int, char **);
+static int zpool_do_events(int, char **);
+
+static int zpool_do_get(int, char **);
+static int zpool_do_set(int, char **);
+
+static int zpool_do_sync(int, char **);
+
+static int zpool_do_version(int, char **);
+
+static int zpool_do_wait(int, char **);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.  They are only compiled when DEBUG is defined.  Each
+ * hook returns the string that would otherwise be supplied through the
+ * environment variable named in the comment next to it.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+/*
+ * Identifies the usage message of each zpool subcommand.  Every entry in
+ * command_table carries one of these values, and get_usage() maps each value
+ * to its (translated) usage text.
+ */
+typedef enum {
+	HELP_ADD,
+	HELP_ATTACH,
+	HELP_CLEAR,
+	HELP_CREATE,
+	HELP_CHECKPOINT,
+	HELP_DESTROY,
+	HELP_DETACH,
+	HELP_EXPORT,
+	HELP_HISTORY,
+	HELP_IMPORT,
+	HELP_IOSTAT,
+	HELP_LABELCLEAR,
+	HELP_LIST,
+	HELP_OFFLINE,
+	HELP_ONLINE,
+	HELP_REPLACE,
+	HELP_REMOVE,
+	HELP_INITIALIZE,
+	HELP_SCRUB,
+	HELP_RESILVER,
+	HELP_TRIM,
+	HELP_STATUS,
+	HELP_UPGRADE,
+	HELP_EVENTS,
+	HELP_GET,
+	HELP_SET,
+	HELP_SPLIT,
+	HELP_SYNC,
+	HELP_REGUID,
+	HELP_REOPEN,
+	HELP_VERSION,
+	HELP_WAIT
+} zpool_help_t;
+
+
+/*
+ * Flags for stats to display with "zpool iostat".  Each value is used as a
+ * bit position by the IOS_*_M masks below and as the first index into
+ * vsx_type_to_nvlist[]; IOS_COUNT gives the number of types.
+ */
+enum iostat_type {
+	IOS_DEFAULT = 0,
+	IOS_LATENCY = 1,
+	IOS_QUEUES = 2,
+	IOS_L_HISTO = 3,
+	IOS_RQ_HISTO = 4,
+	IOS_COUNT,	/* always last element */
+};
+
+/* iostat_type entries as bitmasks */
+#define	IOS_DEFAULT_M	(1ULL << IOS_DEFAULT)
+#define	IOS_LATENCY_M	(1ULL << IOS_LATENCY)
+#define	IOS_QUEUES_M	(1ULL << IOS_QUEUES)
+#define	IOS_L_HISTO_M	(1ULL << IOS_L_HISTO)
+#define	IOS_RQ_HISTO_M	(1ULL << IOS_RQ_HISTO)
+
+/* Mask of all the histo bits */
+#define	IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M)
+
+/*
+ * Lookup table for iostat flags to nvlist names.  Basically a list
+ * of all the nvlists a flag requires.  Also specifies the order in
+ * which data gets printed in zpool iostat.
+ *
+ * Rows are indexed by enum iostat_type and each row is NULL-terminated.  The
+ * row length of 13 fits the longest list (the 12 IOS_RQ_HISTO names) plus
+ * its terminator.  IOS_DEFAULT has no initializer, so its row is all NULL.
+ */
+static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
+	[IOS_L_HISTO] = {
+	    ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+	    NULL},
+	[IOS_LATENCY] = {
+	    ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+	    ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+	    NULL},
+	[IOS_QUEUES] = {
+	    ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+	    ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+	    NULL},
+	[IOS_RQ_HISTO] = {
+	    ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+	    ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+	    ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+	    ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+	    ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+	    ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+	    ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+	    NULL},
+};
+
+
+/*
+ * Given a cb->cb_flags with a histogram bit set, return the iostat_type.
+ * Right now, only one histo bit is ever set at one time, so we can
+ * just do a highbit64(a)
+ */
+#define	IOS_HISTO_IDX(a)	(highbit64(a & IOS_ANYHISTO_M) - 1)
+
+typedef struct zpool_command {
+	const char	*name;		/* subcommand name; NULL = blank line */
+	int		(*func)(int, char **);	/* handler: (argc, argv) */
+	zpool_help_t	usage;		/* selects the text in get_usage() */
+} zpool_command_t;
+
+/*
+ * Master command table.  Each ZFS command has a name, associated function, and
+ * usage message.  The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index
+ * (see get_usage()).
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message.  An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zpool_command_t command_table[] = {
+	{ "version",	zpool_do_version,	HELP_VERSION		},
+	{ NULL },
+	{ "create",	zpool_do_create,	HELP_CREATE		},
+	{ "destroy",	zpool_do_destroy,	HELP_DESTROY		},
+	{ NULL },
+	{ "add",	zpool_do_add,		HELP_ADD		},
+	{ "remove",	zpool_do_remove,	HELP_REMOVE		},
+	{ NULL },
+	{ "labelclear",	zpool_do_labelclear,	HELP_LABELCLEAR		},
+	{ NULL },
+	{ "checkpoint",	zpool_do_checkpoint,	HELP_CHECKPOINT		},
+	{ NULL },
+	{ "list",	zpool_do_list,		HELP_LIST		},
+	{ "iostat",	zpool_do_iostat,	HELP_IOSTAT		},
+	{ "status",	zpool_do_status,	HELP_STATUS		},
+	{ NULL },
+	{ "online",	zpool_do_online,	HELP_ONLINE		},
+	{ "offline",	zpool_do_offline,	HELP_OFFLINE		},
+	{ "clear",	zpool_do_clear,		HELP_CLEAR		},
+	{ "reopen",	zpool_do_reopen,	HELP_REOPEN		},
+	{ NULL },
+	{ "attach",	zpool_do_attach,	HELP_ATTACH		},
+	{ "detach",	zpool_do_detach,	HELP_DETACH		},
+	{ "replace",	zpool_do_replace,	HELP_REPLACE		},
+	{ "split",	zpool_do_split,		HELP_SPLIT		},
+	{ NULL },
+	{ "initialize",	zpool_do_initialize,	HELP_INITIALIZE		},
+	{ "resilver",	zpool_do_resilver,	HELP_RESILVER		},
+	{ "scrub",	zpool_do_scrub,		HELP_SCRUB		},
+	{ "trim",	zpool_do_trim,		HELP_TRIM		},
+	{ NULL },
+	{ "import",	zpool_do_import,	HELP_IMPORT		},
+	{ "export",	zpool_do_export,	HELP_EXPORT		},
+	{ "upgrade",	zpool_do_upgrade,	HELP_UPGRADE		},
+	{ "reguid",	zpool_do_reguid,	HELP_REGUID		},
+	{ NULL },
+	{ "history",	zpool_do_history,	HELP_HISTORY		},
+	{ "events",	zpool_do_events,	HELP_EVENTS		},
+	{ NULL },
+	{ "get",	zpool_do_get,		HELP_GET		},
+	{ "set",	zpool_do_set,		HELP_SET		},
+	{ "sync",	zpool_do_sync,		HELP_SYNC		},
+	{ NULL },
+	{ "wait",	zpool_do_wait,		HELP_WAIT		},
+};
+
+#define	NCOMMAND	(ARRAY_SIZE(command_table))
+
+#define	VDEV_ALLOC_CLASS_LOGS	"logs"
+
+static zpool_command_t *current_command;
+static char history_str[HIS_MAX_RECORD_LEN];
+static boolean_t log_history = B_TRUE;
+static uint_t timestamp_fmt = NODATE;
+
+/*
+ * Return the translated usage text for the given subcommand.  Every
+ * zpool_help_t value has a case below; falling out of the switch means idx
+ * was not a valid zpool_help_t and aborts the program.
+ */
+static const char *
+get_usage(zpool_help_t idx)
+{
+	switch (idx) {
+	case HELP_ADD:
+		return (gettext("\tadd [-fgLnP] [-o property=value] "
+		    "<pool> <vdev> ...\n"));
+	case HELP_ATTACH:
+		return (gettext("\tattach [-fsw] [-o property=value] "
+		    "<pool> <device> <new-device>\n"));
+	case HELP_CLEAR:
+		return (gettext("\tclear [-nF] <pool> [device]\n"));
+	case HELP_CREATE:
+		return (gettext("\tcreate [-fnd] [-o property=value] ... \n"
+		    "\t    [-O file-system-property=value] ... \n"
+		    "\t    [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
+	case HELP_CHECKPOINT:
+		return (gettext("\tcheckpoint [-d [-w]] <pool> ...\n"));
+	case HELP_DESTROY:
+		return (gettext("\tdestroy [-f] <pool>\n"));
+	case HELP_DETACH:
+		return (gettext("\tdetach <pool> <device>\n"));
+	case HELP_EXPORT:
+		return (gettext("\texport [-af] <pool> ...\n"));
+	case HELP_HISTORY:
+		return (gettext("\thistory [-il] [<pool>] ...\n"));
+	case HELP_IMPORT:
+		return (gettext("\timport [-d dir] [-D]\n"
+		    "\timport [-o mntopts] [-o property=value] ... \n"
+		    "\t    [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
+		    "[-R root] [-F [-n]] -a\n"
+		    "\timport [-o mntopts] [-o property=value] ... \n"
+		    "\t    [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
+		    "[-R root] [-F [-n]]\n"
+		    "\t    [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
+	case HELP_IOSTAT:
+		return (gettext("\tiostat [[[-c [script1,script2,...]"
+		    "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n"
+		    "\t    [[pool ...]|[pool vdev ...]|[vdev ...]]"
+		    " [[-n] interval [count]]\n"));
+	case HELP_LABELCLEAR:
+		return (gettext("\tlabelclear [-f] <vdev>\n"));
+	case HELP_LIST:
+		return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
+		    "[-T d|u] [pool] ... \n"
+		    "\t    [interval [count]]\n"));
+	case HELP_OFFLINE:
+		return (gettext("\toffline [-f] [-t] <pool> <device> ...\n"));
+	case HELP_ONLINE:
+		return (gettext("\tonline [-e] <pool> <device> ...\n"));
+	case HELP_REPLACE:
+		return (gettext("\treplace [-fsw] [-o property=value] "
+		    "<pool> <device> [new-device]\n"));
+	case HELP_REMOVE:
+		return (gettext("\tremove [-npsw] <pool> <device> ...\n"));
+	case HELP_REOPEN:
+		return (gettext("\treopen [-n] <pool>\n"));
+	case HELP_INITIALIZE:
+		return (gettext("\tinitialize [-c | -s] [-w] <pool> "
+		    "[<device> ...]\n"));
+	case HELP_SCRUB:
+		return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
+	case HELP_RESILVER:
+		return (gettext("\tresilver <pool> ...\n"));
+	case HELP_TRIM:
+		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> "
+		    "[<device> ...]\n"));
+	case HELP_STATUS:
+		return (gettext("\tstatus [-c [script1,script2,...]] "
+		    "[-igLpPstvxD]  [-T d|u] [pool] ... \n"
+		    "\t    [interval [count]]\n"));
+	case HELP_UPGRADE:
+		return (gettext("\tupgrade\n"
+		    "\tupgrade -v\n"
+		    "\tupgrade [-V version] <-a | pool ...>\n"));
+	case HELP_EVENTS:
+		return (gettext("\tevents [-vHf [pool] | -c]\n"));
+	case HELP_GET:
+		return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] "
+		    "<\"all\" | property[,...]> <pool> ...\n"));
+	case HELP_SET:
+		return (gettext("\tset <property=value> <pool> \n"));
+	case HELP_SPLIT:
+		return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n"
+		    "\t    [-o property=value] <pool> <newpool> "
+		    "[<device> ...]\n"));
+	case HELP_REGUID:
+		return (gettext("\treguid <pool>\n"));
+	case HELP_SYNC:
+		return (gettext("\tsync [pool] ...\n"));
+	case HELP_VERSION:
+		return (gettext("\tversion\n"));
+	case HELP_WAIT:
+		return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
+		    "<pool> [interval]\n"));
+	}
+
+	abort();
+	/* NOTREACHED */
+}
+
+/*
+ * Walk the vdev tree rooted at 'nvroot' and add the name of every leaf vdev
+ * to 'res' as a boolean entry.  A leaf whose name equals VDEV_TYPE_INDIRECT
+ * or VDEV_TYPE_HOLE is not recorded.
+ */
+static void
+zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
+{
+	nvlist_t **kids;
+	uint_t nkids = 0;
+	char *leaf;
+
+	/*
+	 * nkids starts at zero so that a node for which this lookup yields
+	 * no children is handled as a leaf below.
+	 */
+	(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids);
+
+	if (nkids != 0) {
+		/* Interior node: descend into each child. */
+		for (uint_t k = 0; k < nkids; k++)
+			zpool_collect_leaves(zhp, kids[k], res);
+		return;
+	}
+
+	/* Leaf node: the name returned by zpool_vdev_name() is ours to free. */
+	leaf = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH);
+
+	if (!(strcmp(leaf, VDEV_TYPE_INDIRECT) == 0 ||
+	    strcmp(leaf, VDEV_TYPE_HOLE) == 0))
+		fnvlist_add_boolean(res, leaf);
+
+	free(leaf);
+}
+
+/*
+ * zprop_iter() callback used by usage(): write one row of the property
+ * table for 'prop' to the FILE passed in 'cb'.  The row holds the property
+ * name, whether it may be edited, and its accepted values ("-" when
+ * zpool_prop_values() has none).  Always returns ZPROP_CONT.
+ */
+static int
+print_prop_cb(int prop, void *cb)
+{
+	FILE *out = cb;
+	const char *values;
+
+	(void) fprintf(out, "\t%-19s  ", zpool_prop_to_name(prop));
+
+	/* Read-only properties are shown as not editable. */
+	(void) fprintf(out, "%s",
+	    zpool_prop_readonly(prop) ? "  NO   " : " YES   ");
+
+	values = zpool_prop_values(prop);
+	if (values != NULL)
+		(void) fprintf(out, "%s\n", values);
+	else
+		(void) fprintf(out, "-\n");
+
+	return (ZPROP_CONT);
+}
+
+/*
+ * Print a usage message and terminate the process.
+ *
+ * When a command is being processed (current_command != NULL) only that
+ * command's usage is shown; otherwise the usage of every command_table entry
+ * is listed.  For the 'set', 'get' and 'list' commands the table of pool
+ * properties is appended.
+ *
+ * 'requested' selects the destination and exit status: stdout and 0 when
+ * the user asked for help, stderr and 2 otherwise.
+ */
+static void
+usage(boolean_t requested)
+{
+	FILE *out = requested ? stdout : stderr;
+	const char *cmdname = NULL;
+
+	if (current_command != NULL) {
+		cmdname = current_command->name;
+
+		(void) fprintf(out, gettext("usage:\n"));
+		(void) fprintf(out, "%s", get_usage(current_command->usage));
+	} else {
+		int idx;
+
+		(void) fprintf(out, gettext("usage: zpool command args ...\n"));
+		(void) fprintf(out,
+		    gettext("where 'command' is one of the following:\n\n"));
+
+		/* A table entry without a name produces a blank line. */
+		for (idx = 0; idx < NCOMMAND; idx++) {
+			if (command_table[idx].name != NULL) {
+				(void) fprintf(out, "%s",
+				    get_usage(command_table[idx].usage));
+			} else {
+				(void) fprintf(out, "\n");
+			}
+		}
+	}
+
+	if (cmdname != NULL &&
+	    (strcmp(cmdname, "set") == 0 ||
+	    strcmp(cmdname, "get") == 0 ||
+	    strcmp(cmdname, "list") == 0)) {
+
+		(void) fprintf(out,
+		    gettext("\nthe following properties are supported:\n"));
+
+		(void) fprintf(out, "\n\t%-19s  %s   %s\n\n",
+		    "PROPERTY", "EDIT", "VALUES");
+
+		/* One row per pool property, printed by print_prop_cb(). */
+		(void) zprop_iter(print_prop_cb, out, B_FALSE, B_TRUE,
+		    ZFS_TYPE_POOL);
+
+		(void) fprintf(out, "\t%-19s   ", "feature@...");
+		(void) fprintf(out, "YES   disabled | enabled | active\n");
+
+		(void) fprintf(out, gettext("\nThe feature@ properties must be "
+		    "appended with a feature name.\nSee zpool-features(5).\n"));
+	}
+
+	/*
+	 * Setting ZFS_ABORT in the environment turns the exit into an
+	 * abort().  See comments at end of main().
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	exit(requested ? 0 : 2);
+}
+
+/*
+ * zpool initialize [-c | -s] [-w] <pool> [<vdev> ...]
+ * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
+ * if none specified.
+ *
+ *	-c	Cancel. Ends active initializing.
+ *	-s	Suspend. Initializing can then be restarted with no flags.
+ *	-w	Wait. Blocks until initializing has completed.
+ *
+ * -c and -s are mutually exclusive, and -w may be combined with neither.
+ * Returns -1 if the pool cannot be opened, otherwise the result of
+ * zpool_initialize() or zpool_initialize_wait().
+ */
+int
+zpool_do_initialize(int argc, char **argv)
+{
+	struct option long_options[] = {
+		{"cancel",	no_argument,		NULL, 'c'},
+		{"suspend",	no_argument,		NULL, 's'},
+		{"wait",	no_argument,		NULL, 'w'},
+		{0, 0, 0, 0}
+	};
+	pool_initialize_func_t cmd_type = POOL_INITIALIZE_START;
+	boolean_t wait = B_FALSE;
+	zpool_handle_t *zhp;
+	nvlist_t *vdevs;
+	char *poolname;
+	int opt;
+	int rc = 0;
+
+	while ((opt = getopt_long(argc, argv, "csw", long_options,
+	    NULL)) != -1) {
+		switch (opt) {
+		case 'c':
+			/* Repeating -c is fine; mixing it with -s is not. */
+			if (!(cmd_type == POOL_INITIALIZE_START ||
+			    cmd_type == POOL_INITIALIZE_CANCEL)) {
+				(void) fprintf(stderr, gettext("-c cannot be "
+				    "combined with other options\n"));
+				usage(B_FALSE);
+			}
+			cmd_type = POOL_INITIALIZE_CANCEL;
+			break;
+		case 's':
+			/* Repeating -s is fine; mixing it with -c is not. */
+			if (!(cmd_type == POOL_INITIALIZE_START ||
+			    cmd_type == POOL_INITIALIZE_SUSPEND)) {
+				(void) fprintf(stderr, gettext("-s cannot be "
+				    "combined with other options\n"));
+				usage(B_FALSE);
+			}
+			cmd_type = POOL_INITIALIZE_SUSPEND;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			/*
+			 * optopt is zero when the offending argument was a
+			 * long option, so echo the argument itself.
+			 */
+			if (optopt == 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%s'\n"),
+				    argv[optind - 1]);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%c'\n"), optopt);
+			}
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+		return (-1);
+	}
+
+	if (cmd_type != POOL_INITIALIZE_START && wait) {
+		(void) fprintf(stderr, gettext("-w cannot be used with -c or "
+		    "-s\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (-1);
+
+	/* Build the set of vdevs to act on, keyed by vdev name. */
+	vdevs = fnvlist_alloc();
+	if (argc != 1) {
+		for (int idx = 1; idx < argc; idx++)
+			fnvlist_add_boolean(vdevs, argv[idx]);
+	} else {
+		/* no individual leaf vdevs specified, so add them all */
+		nvlist_t *config = zpool_get_config(zhp, NULL);
+		nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+		    ZPOOL_CONFIG_VDEV_TREE);
+
+		zpool_collect_leaves(zhp, nvroot, vdevs);
+	}
+
+	rc = wait ? zpool_initialize_wait(zhp, cmd_type, vdevs) :
+	    zpool_initialize(zhp, cmd_type, vdevs);
+
+	fnvlist_free(vdevs);
+	zpool_close(zhp);
+
+	return (rc);
+}
+
+/*
+ * Print a pool vdev config for dry runs.
+ *
+ * Of the children of 'nv', only those whose allocation class string equals
+ * 'match' are shown; a child with neither a log flag nor an allocation bias
+ * has the class "".  When 'name' is non-NULL it is printed once, at 'indent',
+ * ahead of the first child shown.  Each shown child is printed recursively,
+ * two columns deeper and with a 'match' of "".  A node without a children
+ * array just prints 'name'.
+ */
+static void
+print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
+    const char *match, int name_flags)
+{
+	nvlist_t **kids;
+	uint_t nkids;
+	boolean_t heading_done = B_FALSE;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0) {
+		if (name != NULL)
+			(void) printf("\t%*s%s\n", indent, "", name);
+		return;
+	}
+
+	for (uint_t k = 0; k < nkids; k++) {
+		nvlist_t *kid = kids[k];
+		uint64_t is_log = B_FALSE;
+		char *bias = "";
+		char *kidname;
+
+		/*
+		 * Work out the child's class: "log" for log devices, then
+		 * whatever allocation bias the config carries, if any.
+		 */
+		(void) nvlist_lookup_uint64(kid, ZPOOL_CONFIG_IS_LOG, &is_log);
+		if (is_log)
+			bias = VDEV_ALLOC_BIAS_LOG;
+		(void) nvlist_lookup_string(kid, ZPOOL_CONFIG_ALLOCATION_BIAS,
+		    &bias);
+
+		if (strcmp(match, bias) == 0) {
+			if (name != NULL && !heading_done) {
+				(void) printf("\t%*s%s\n", indent, "", name);
+				heading_done = B_TRUE;
+			}
+
+			kidname = zpool_vdev_name(g_zfs, zhp, kid, name_flags);
+			print_vdev_tree(zhp, kidname, kid, indent + 2, "",
+			    name_flags);
+			free(kidname);
+		}
+	}
+}
+
+/*
+ * Return B_TRUE if at least one pair in 'proplist' has a name that
+ * zpool_prop_feature() accepts, B_FALSE otherwise.
+ */
+static boolean_t
+prop_list_contains_feature(nvlist_t *proplist)
+{
+	nvpair_t *pair = NULL;
+
+	/* Passing NULL to nvlist_next_nvpair() starts at the first pair. */
+	while ((pair = nvlist_next_nvpair(proplist, pair)) != NULL) {
+		if (zpool_prop_feature(nvpair_name(pair)))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Add a property pair (name, string-value) into a property nvlist.
+ *
+ * '*props' is allocated on first use.  With 'poolprop' set, 'propname' must
+ * be a pool property or a feature@ property, and feature@ properties may not
+ * be mixed with 'version'.  Otherwise 'propname' must be a property valid
+ * for filesystems, a user property or a userquota property.  Apart from the
+ * pool 'cachefile' property, a name may be given only once.
+ *
+ * Returns 0 on success, 1 when memory cannot be allocated, and 2 when the
+ * property is invalid, conflicting or repeated.
+ */
+static int
+add_prop_list(const char *propname, char *propval, nvlist_t **props,
+    boolean_t poolprop)
+{
+	zpool_prop_t prop = ZPOOL_PROP_INVAL;
+	nvlist_t *list;
+	const char *key;
+	char *existing;
+
+	if (*props == NULL &&
+	    nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		return (1);
+	}
+
+	list = *props;
+
+	if (!poolprop) {
+		zfs_prop_t fsprop = zfs_name_to_prop(propname);
+
+		if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM,
+		    B_FALSE)) {
+			key = zfs_prop_to_name(fsprop);
+		} else if (zfs_prop_user(propname) ||
+		    zfs_prop_userquota(propname)) {
+			key = propname;
+		} else {
+			(void) fprintf(stderr, gettext("property '%s' is "
+			    "not a valid filesystem property\n"), propname);
+			return (2);
+		}
+	} else {
+		const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
+		boolean_t is_feature = zpool_prop_feature(propname);
+
+		prop = zpool_name_to_prop(propname);
+		if (prop == ZPOOL_PROP_INVAL && !is_feature) {
+			(void) fprintf(stderr, gettext("property '%s' is "
+			    "not a valid pool property\n"), propname);
+			return (2);
+		}
+
+		/*
+		 * feature@ properties and version should not be specified
+		 * at the same time.
+		 */
+		if ((prop == ZPOOL_PROP_INVAL && is_feature &&
+		    nvlist_exists(list, vname)) ||
+		    (prop == ZPOOL_PROP_VERSION &&
+		    prop_list_contains_feature(list))) {
+			(void) fprintf(stderr, gettext("'feature@' and "
+			    "'version' properties cannot be specified "
+			    "together\n"));
+			return (2);
+		}
+
+		/* Feature names are stored as given, others normalized. */
+		key = is_feature ? propname : zpool_prop_to_name(prop);
+	}
+
+	/* Only 'cachefile' may overwrite a value that is already present. */
+	if (prop != ZPOOL_PROP_CACHEFILE &&
+	    nvlist_lookup_string(list, key, &existing) == 0) {
+		(void) fprintf(stderr, gettext("property '%s' "
+		    "specified multiple times\n"), propname);
+		return (2);
+	}
+
+	if (nvlist_add_string(list, key, propval) != 0) {
+		(void) fprintf(stderr, gettext("internal "
+		    "error: out of memory\n"));
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * Set a default property pair (name, string-value) in a property nvlist.
+ *
+ * If '*props' already holds a string value under 'propname' the list is left
+ * untouched and 0 is returned.  Otherwise the pair is added through
+ * add_prop_list(), whose return value is passed back.  'poolprop' is
+ * forwarded so the name is validated as the caller asked, rather than
+ * always as a pool property.
+ */
+static int
+add_prop_list_default(const char *propname, char *propval, nvlist_t **props,
+    boolean_t poolprop)
+{
+	char *pval;
+
+	if (nvlist_lookup_string(*props, propname, &pval) == 0)
+		return (0);
+
+	return (add_prop_list(propname, propval, props, poolprop));
+}
+
+/*
+ * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
+ *
+ *	-f	Force addition of devices, even if they appear in use
+ *	-g	Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-n	Do not add the devices, but display the resulting layout if
+ *		they were to be added.
+ *	-o	Set property=value.  Only the ZPOOL_CONFIG_ASHIFT property is
+ *		accepted here.
+ *	-P	Display full path for vdev name.
+ *
+ * Adds the given vdevs to 'pool'.  As with create, the bulk of this work is
+ * handled by make_root_vdev(), which constructs the nvlist needed to pass to
+ * libzfs.
+ *
+ * Returns 0 on success or after a dry run, and 1 on failure.  The property
+ * list built from -o is released on every return path.
+ */
+int
+zpool_do_add(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	boolean_t dryrun = B_FALSE;
+	int name_flags = 0;
+	int c;
+	nvlist_t *nvroot;
+	char *poolname;
+	int ret;
+	zpool_handle_t *zhp;
+	nvlist_t *config;
+	nvlist_t *props = NULL;
+	char *propval;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "fgLno:P")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'g':
+			name_flags |= VDEV_NAME_GUID;
+			break;
+		case 'L':
+			name_flags |= VDEV_NAME_FOLLOW_LINKS;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -o option\n"));
+				usage(B_FALSE);
+			}
+			/* Split "name=value" in place. */
+			*propval = '\0';
+			propval++;
+
+			if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
+			    (add_prop_list(optarg, propval, &props, B_TRUE)))
+				usage(B_FALSE);
+			break;
+		case 'P':
+			name_flags |= VDEV_NAME_PATH;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing vdev specification\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	argc--;
+	argv++;
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+		    poolname);
+		nvlist_free(props);
+		zpool_close(zhp);
+		return (1);
+	}
+
+	/* unless manually specified use "ashift" pool property (if set) */
+	if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) {
+		int intval;
+		zprop_source_t src;
+		char strval[ZPOOL_MAXPROPLEN];
+
+		intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src);
+		if (src != ZPROP_SRC_DEFAULT) {
+			(void) snprintf(strval, sizeof (strval), "%" PRId32,
+			    intval);
+			verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval,
+			    &props, B_TRUE) == 0);
+		}
+	}
+
+	/* pass off to make_root_vdev for processing */
+	nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
+	    argc, argv);
+	if (nvroot == NULL) {
+		nvlist_free(props);
+		zpool_close(zhp);
+		return (1);
+	}
+
+	if (dryrun) {
+		nvlist_t *poolnvroot;
+		nvlist_t **l2child;
+		uint_t l2children, c;
+		char *vname;
+		boolean_t hadcache = B_FALSE;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &poolnvroot) == 0);
+
+		(void) printf(gettext("would update '%s' to the following "
+		    "configuration:\n"), zpool_get_name(zhp));
+
+		/* print original main pool and new tree */
+		print_vdev_tree(zhp, poolname, poolnvroot, 0, "",
+		    name_flags | VDEV_NAME_TYPE_ID);
+		print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags);
+
+		/*
+		 * print other classes: 'dedup', 'special', and 'log'.  The
+		 * heading comes from the existing pool when it already has
+		 * the class, otherwise from the new vdevs.
+		 */
+		if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) {
+			print_vdev_tree(zhp, "dedup", poolnvroot, 0,
+			    VDEV_ALLOC_BIAS_DEDUP, name_flags);
+			print_vdev_tree(zhp, NULL, nvroot, 0,
+			    VDEV_ALLOC_BIAS_DEDUP, name_flags);
+		} else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) {
+			print_vdev_tree(zhp, "dedup", nvroot, 0,
+			    VDEV_ALLOC_BIAS_DEDUP, name_flags);
+		}
+
+		if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) {
+			print_vdev_tree(zhp, "special", poolnvroot, 0,
+			    VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+			print_vdev_tree(zhp, NULL, nvroot, 0,
+			    VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+		} else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) {
+			print_vdev_tree(zhp, "special", nvroot, 0,
+			    VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+		}
+
+		if (num_logs(poolnvroot) > 0) {
+			print_vdev_tree(zhp, "logs", poolnvroot, 0,
+			    VDEV_ALLOC_BIAS_LOG, name_flags);
+			print_vdev_tree(zhp, NULL, nvroot, 0,
+			    VDEV_ALLOC_BIAS_LOG, name_flags);
+		} else if (num_logs(nvroot) > 0) {
+			print_vdev_tree(zhp, "logs", nvroot, 0,
+			    VDEV_ALLOC_BIAS_LOG, name_flags);
+		}
+
+		/* Do the same for the caches */
+		if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE,
+		    &l2child, &l2children) == 0 && l2children) {
+			hadcache = B_TRUE;
+			(void) printf(gettext("\tcache\n"));
+			for (c = 0; c < l2children; c++) {
+				vname = zpool_vdev_name(g_zfs, NULL,
+				    l2child[c], name_flags);
+				(void) printf("\t  %s\n", vname);
+				free(vname);
+			}
+		}
+		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    &l2child, &l2children) == 0 && l2children) {
+			/* Avoid a second heading if the pool had caches. */
+			if (!hadcache)
+				(void) printf(gettext("\tcache\n"));
+			for (c = 0; c < l2children; c++) {
+				vname = zpool_vdev_name(g_zfs, NULL,
+				    l2child[c], name_flags);
+				(void) printf("\t  %s\n", vname);
+				free(vname);
+			}
+		}
+
+		ret = 0;
+	} else {
+		ret = (zpool_add(zhp, nvroot) != 0);
+	}
+
+	nvlist_free(props);
+	nvlist_free(nvroot);
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool remove [-npsw] <pool> <vdev> ...
+ *
+ *	-n	Do not remove anything; for each device print the memory that
+ *		would be used after removing it.
+ *	-p	With -n, print the size as an exact number.
+ *	-s	Stop (cancel) the removal in progress; takes no devices.
+ *	-w	Wait for the removal to complete before returning.  Cannot be
+ *		combined with -s.
+ *
+ * Removes the given vdev from the pool.
+ *
+ * Returns 0 on success and non-zero on failure.  The pool handle is closed
+ * on every path that returns to the caller.
+ */
+int
+zpool_do_remove(int argc, char **argv)
+{
+	char *poolname;
+	int i, ret = 0;
+	zpool_handle_t *zhp = NULL;
+	boolean_t stop = B_FALSE;
+	int c;
+	boolean_t noop = B_FALSE;
+	boolean_t parsable = B_FALSE;
+	boolean_t wait = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "npsw")) != -1) {
+		switch (c) {
+		case 'n':
+			noop = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 's':
+			stop = B_TRUE;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	if (stop && noop) {
+		(void) fprintf(stderr, gettext("stop request ignored\n"));
+		zpool_close(zhp);
+		return (0);
+	}
+
+	if (stop) {
+		if (argc > 1) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+		/*
+		 * Reject the invalid -s/-w combination before acting, so a
+		 * usage error never cancels a removal as a side effect.
+		 */
+		if (wait) {
+			(void) fprintf(stderr, gettext("invalid option "
+			    "combination: -w cannot be used with -s\n"));
+			usage(B_FALSE);
+		}
+		if (zpool_vdev_remove_cancel(zhp) != 0)
+			ret = 1;
+	} else {
+		if (argc < 2) {
+			(void) fprintf(stderr, gettext("missing device\n"));
+			usage(B_FALSE);
+		}
+
+		for (i = 1; i < argc; i++) {
+			if (noop) {
+				uint64_t size;
+
+				/* The first failing device ends the dry run. */
+				if (zpool_vdev_indirect_size(zhp, argv[i],
+				    &size) != 0) {
+					ret = 1;
+					break;
+				}
+				if (parsable) {
+					(void) printf("%s %llu\n",
+					    argv[i], (unsigned long long)size);
+				} else {
+					char valstr[32];
+					zfs_nicenum(size, valstr,
+					    sizeof (valstr));
+					(void) printf("Memory that will be "
+					    "used after removing %s: %s\n",
+					    argv[i], valstr);
+				}
+			} else {
+				/* Keep going so every device is attempted. */
+				if (zpool_vdev_remove(zhp, argv[i]) != 0)
+					ret = 1;
+			}
+		}
+
+		if (ret == 0 && wait)
+			ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE);
+	}
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool labelclear [-f] <vdev>
+ *
+ *	-f	Force clearing the label for the vdevs which are members of
+ *		the exported or foreign pools.
+ *
+ * Verifies that the vdev is not active and zeros out the label information
+ * on the device.
+ *
+ * Returns 0 when the label was cleared and non-zero otherwise.
+ */
+int
+zpool_do_labelclear(int argc, char **argv)
+{
+	char vdev[MAXPATHLEN];
+	struct stat statbuf;
+	nvlist_t *config;
+	pool_state_t state;
+	char *poolname = NULL;
+	boolean_t inuse = B_FALSE;
+	boolean_t force = B_FALSE;
+	int fd = -1;
+	int ret = 0;
+	int opt;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "f")) != -1) {
+		switch (opt) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* get vdev name */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing vdev name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/*
+	 * Check if we were given absolute path and use it as is.
+	 * Otherwise if the provided vdev name doesn't point to a file,
+	 * try prepending expected disk paths and partition numbers.
+	 */
+	(void) strlcpy(vdev, argv[0], sizeof (vdev));
+	if (vdev[0] != '/' && stat(vdev, &statbuf) != 0) {
+		int rc = zfs_resolve_shortname(argv[0], vdev, sizeof (vdev));
+
+		if (rc == 0 && zfs_dev_is_whole_disk(vdev) &&
+		    zfs_append_partition(vdev, sizeof (vdev)) == -1)
+			rc = ENOENT;
+
+		if (rc || (stat(vdev, &statbuf) != 0)) {
+			(void) fprintf(stderr, gettext(
+			    "failed to find device %s, try specifying absolute "
+			    "path instead\n"), argv[0]);
+			return (1);
+		}
+	}
+
+	fd = open(vdev, O_RDWR);
+	if (fd < 0) {
+		(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+		    vdev, strerror(errno));
+		return (1);
+	}
+
+	/*
+	 * Flush all dirty pages for the block device.  This should not be
+	 * fatal when the device does not support BLKFLSBUF as would be the
+	 * case for a file vdev.
+	 */
+	if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) {
+		(void) fprintf(stderr, gettext("failed to invalidate "
+		    "cache for %s: %s\n"), vdev, strerror(errno));
+	}
+
+	/* The label contents are not needed, only that they can be read. */
+	if (zpool_read_label(fd, &config, NULL) != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to read label from %s\n"), vdev);
+		ret = 1;
+		goto errout;
+	}
+	nvlist_free(config);
+
+	ret = zpool_in_use(g_zfs, fd, &state, &poolname, &inuse);
+	if (ret != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to check state for %s\n"), vdev);
+		ret = 1;
+		goto errout;
+	}
+
+	/*
+	 * A device that is in use is only wiped when it belongs to an
+	 * exported or potentially active pool and -f was given.
+	 */
+	if (inuse) {
+		switch (state) {
+		default:
+		case POOL_STATE_ACTIVE:
+		case POOL_STATE_SPARE:
+		case POOL_STATE_L2CACHE:
+			(void) fprintf(stderr, gettext(
+			    "%s is a member (%s) of pool \"%s\"\n"),
+			    vdev, zpool_pool_state_to_name(state), poolname);
+			ret = 1;
+			goto errout;
+
+		case POOL_STATE_EXPORTED:
+			if (!force) {
+				(void) fprintf(stderr, gettext(
+				    "use '-f' to override the following error:\n"
+				    "%s is a member of exported pool \"%s\"\n"),
+				    vdev, poolname);
+				ret = 1;
+				goto errout;
+			}
+			break;
+
+		case POOL_STATE_POTENTIALLY_ACTIVE:
+			if (!force) {
+				(void) fprintf(stderr, gettext(
+				    "use '-f' to override the following error:\n"
+				    "%s is a member of potentially active pool \"%s\"\n"),
+				    vdev, poolname);
+				ret = 1;
+				goto errout;
+			}
+			break;
+
+		case POOL_STATE_DESTROYED:
+			/* inuse should never be set for a destroyed pool */
+			assert(0);
+			break;
+		}
+	}
+
+	ret = zpool_clear_label(fd);
+	if (ret != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to clear label for %s\n"), vdev);
+	}
+
+errout:
+	free(poolname);
+	(void) close(fd);
+
+	return (ret);
+}
+
+/*
+ * zpool create [-fnd] [-o property=value] ...
+ *		[-O file-system-property=value] ...
+ *		[-R root] [-m mountpoint] [-t tname] <pool> <dev> ...
+ *
+ *	-f	Force creation, even if devices appear in use
+ *	-n	Do not create the pool, but display the resulting layout if it
+ *		were to be created.
+ *	-R	Create a pool under an alternate root
+ *	-m	Set default mountpoint for the root dataset.  By default it's
+ *		'/<pool>'
+ *	-o	Set property=value.
+ *	-o	Set feature@feature=enabled|disabled.
+ *	-d	Don't automatically enable all supported pool features
+ *		(individual features can be enabled with -o).
+ *	-O	Set fsproperty=value in the pool's root file system
+ *	-t	Set the pool's ZPOOL_PROP_TNAME property to the given name,
+ *		which may not contain '/'.  Like -R, this defaults the
+ *		cachefile property to "none" unless it is already set.
+ *
+ * Creates the named pool according to the given vdev specification.  The
+ * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c.
+ * Once we get the nvlist back from make_root_vdev(), we either print out the
+ * contents (if '-n' was specified), or pass it to libzfs to do the creation.
+ *
+ * The return value is 0 only after a dry run, or when the pool was created,
+ * its root dataset mounted and zfs_shareall() returned 0.  Usage errors go
+ * through usage(), which exits.
+ */
+int
+zpool_do_create(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	boolean_t dryrun = B_FALSE;
+	boolean_t enable_all_pool_feat = B_TRUE;
+	int c;
+	nvlist_t *nvroot = NULL;
+	char *poolname;
+	char *tname = NULL;
+	int ret = 1;
+	char *altroot = NULL;
+	char *mountpoint = NULL;
+	nvlist_t *fsprops = NULL;
+	nvlist_t *props = NULL;
+	char *propval;
+
+	/*
+	 * check options; the leading ':' in the option string makes getopt()
+	 * report a missing option argument as ':' rather than '?'.
+	 */
+	while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'd':
+			enable_all_pool_feat = B_FALSE;
+			break;
+		case 'R':
+			/* altroot is remembered for the mountpoint check. */
+			altroot = optarg;
+			if (add_prop_list(zpool_prop_to_name(
+			    ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
+				goto errout;
+			if (add_prop_list_default(zpool_prop_to_name(
+			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+				goto errout;
+			break;
+		case 'm':
+			/* Equivalent to -O mountpoint=optarg */
+			mountpoint = optarg;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -o option\n"));
+				goto errout;
+			}
+			/* Split "name=value" in place. */
+			*propval = '\0';
+			propval++;
+
+			if (add_prop_list(optarg, propval, &props, B_TRUE))
+				goto errout;
+
+			/*
+			 * If the user is creating a pool that doesn't support
+			 * feature flags, don't enable any features.
+			 */
+			if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
+				char *end;
+				u_longlong_t ver;
+
+				ver = strtoull(propval, &end, 10);
+				if (*end == '\0' &&
+				    ver < SPA_VERSION_FEATURES) {
+					enable_all_pool_feat = B_FALSE;
+				}
+			}
+			/* "-o altroot=..." is tracked the same way as -R. */
+			if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
+				altroot = propval;
+			break;
+		case 'O':
+			if ((propval = strchr(optarg, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -O option\n"));
+				goto errout;
+			}
+			*propval = '\0';
+			propval++;
+
+			/*
+			 * Mountpoints are checked and then added later.
+			 * Uniquely among properties, they can be specified
+			 * more than once, to avoid conflict with -m.
+			 */
+			if (0 == strcmp(optarg,
+			    zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
+				mountpoint = propval;
+			} else if (add_prop_list(optarg, propval, &fsprops,
+			    B_FALSE)) {
+				goto errout;
+			}
+			break;
+		case 't':
+			/*
+			 * Sanity check temporary pool name.
+			 */
+			if (strchr(optarg, '/') != NULL) {
+				(void) fprintf(stderr, gettext("cannot create "
+				    "'%s': invalid character '/' in temporary "
+				    "name\n"), optarg);
+				(void) fprintf(stderr, gettext("use 'zfs "
+				    "create' to create a dataset\n"));
+				goto errout;
+			}
+
+			if (add_prop_list(zpool_prop_to_name(
+			    ZPOOL_PROP_TNAME), optarg, &props, B_TRUE))
+				goto errout;
+			if (add_prop_list_default(zpool_prop_to_name(
+			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+				goto errout;
+			tname = optarg;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			goto badusage;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto badusage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		goto badusage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing vdev specification\n"));
+		goto badusage;
+	}
+
+	poolname = argv[0];
+
+	/*
+	 * As a special case, check for use of '/' in the name, and direct the
+	 * user to use 'zfs create' instead.
+	 */
+	if (strchr(poolname, '/') != NULL) {
+		(void) fprintf(stderr, gettext("cannot create '%s': invalid "
+		    "character '/' in pool name\n"), poolname);
+		(void) fprintf(stderr, gettext("use 'zfs create' to "
+		    "create a dataset\n"));
+		goto errout;
+	}
+
+	/* pass off to make_root_vdev for bulk processing */
+	nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun,
+	    argc - 1, argv + 1);
+	if (nvroot == NULL)
+		goto errout;
+
+	/* make_root_vdev() allows 0 toplevel children if there are spares */
+	if (!zfs_allocatable_devs(nvroot)) {
+		(void) fprintf(stderr, gettext("invalid vdev "
+		    "specification: at least one toplevel vdev must be "
+		    "specified\n"));
+		goto errout;
+	}
+
+	if (altroot != NULL && altroot[0] != '/') {
+		(void) fprintf(stderr, gettext("invalid alternate root '%s': "
+		    "must be an absolute path\n"), altroot);
+		goto errout;
+	}
+
+	/*
+	 * Check the validity of the mountpoint and direct the user to use the
+	 * '-m' mountpoint option if it looks like it's in use.  The check is
+	 * skipped for the 'legacy' and 'none' mountpoints.
+	 */
+	if (mountpoint == NULL ||
+	    (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) {
+		char buf[MAXPATHLEN];
+		DIR *dirp;
+
+		if (mountpoint && mountpoint[0] != '/') {
+			(void) fprintf(stderr, gettext("invalid mountpoint "
+			    "'%s': must be an absolute path, 'legacy', or "
+			    "'none'\n"), mountpoint);
+			goto errout;
+		}
+
+		/*
+		 * Build the directory to examine: the explicit mountpoint or
+		 * '/<pool>', placed under the alternate root when one is set.
+		 */
+		if (mountpoint == NULL) {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s/%s",
+				    altroot, poolname);
+			else
+				(void) snprintf(buf, sizeof (buf), "/%s",
+				    poolname);
+		} else {
+			if (altroot != NULL)
+				(void) snprintf(buf, sizeof (buf), "%s%s",
+				    altroot, mountpoint);
+			else
+				(void) snprintf(buf, sizeof (buf), "%s",
+				    mountpoint);
+		}
+
+		/* A directory that does not exist yet (ENOENT) is fine. */
+		if ((dirp = opendir(buf)) == NULL && errno != ENOENT) {
+			(void) fprintf(stderr, gettext("mountpoint '%s' : "
+			    "%s\n"), buf, strerror(errno));
+			(void) fprintf(stderr, gettext("use '-m' "
+			    "option to provide a different default\n"));
+			goto errout;
+		} else if (dirp) {
+			int count = 0;
+
+			/*
+			 * More than two entries is treated as non-empty;
+			 * presumably the two allowed are "." and "..".
+			 */
+			while (count < 3 && readdir(dirp) != NULL)
+				count++;
+			(void) closedir(dirp);
+
+			if (count > 2) {
+				(void) fprintf(stderr, gettext("mountpoint "
+				    "'%s' exists and is not empty\n"), buf);
+				(void) fprintf(stderr, gettext("use '-m' "
+				    "option to provide a "
+				    "different default\n"));
+				goto errout;
+			}
+		}
+	}
+
+	/*
+	 * Now that the mountpoint's validity has been checked, ensure that
+	 * the property is set appropriately prior to creating the pool.
+	 */
+	if (mountpoint != NULL) {
+		ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
+		    mountpoint, &fsprops, B_FALSE);
+		if (ret != 0)
+			goto errout;
+	}
+
+	/* Back to "failed" until one of the branches below succeeds. */
+	ret = 1;
+	if (dryrun) {
+		/*
+		 * For a dry run invocation, print out a basic message and run
+		 * through all the vdevs in the list and print out in an
+		 * appropriate hierarchy.
+		 */
+		(void) printf(gettext("would create '%s' with the "
+		    "following layout:\n\n"), poolname);
+
+		print_vdev_tree(NULL, poolname, nvroot, 0, "", 0);
+		print_vdev_tree(NULL, "dedup", nvroot, 0,
+		    VDEV_ALLOC_BIAS_DEDUP, 0);
+		print_vdev_tree(NULL, "special", nvroot, 0,
+		    VDEV_ALLOC_BIAS_SPECIAL, 0);
+		print_vdev_tree(NULL, "logs", nvroot, 0,
+		    VDEV_ALLOC_BIAS_LOG, 0);
+
+		ret = 0;
+	} else {
+		/*
+		 * Hand off to libzfs.
+		 */
+		spa_feature_t i;
+		for (i = 0; i < SPA_FEATURES; i++) {
+			char propname[MAXPATHLEN];
+			/* NOTE: shadows the function-level propval. */
+			char *propval;
+			zfeature_info_t *feat = &spa_feature_table[i];
+
+			(void) snprintf(propname, sizeof (propname),
+			    "feature@%s", feat->fi_uname);
+
+			/*
+			 * Only features contained in props will be enabled:
+			 * remove from the nvlist every ZFS_FEATURE_DISABLED
+			 * value and add every missing ZFS_FEATURE_ENABLED if
+			 * enable_all_pool_feat is set.
+			 */
+			if (!nvlist_lookup_string(props, propname, &propval)) {
+				if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0)
+					(void) nvlist_remove_all(props,
+					    propname);
+			} else if (enable_all_pool_feat) {
+				ret = add_prop_list(propname,
+				    ZFS_FEATURE_ENABLED, &props, B_TRUE);
+				if (ret != 0)
+					goto errout;
+			}
+		}
+
+		ret = 1;
+		if (zpool_create(g_zfs, poolname,
+		    nvroot, props, fsprops) == 0) {
+			/* Open the root dataset under tname when -t was used. */
+			zfs_handle_t *pool = zfs_open(g_zfs,
+			    tname ? tname : poolname, ZFS_TYPE_FILESYSTEM);
+			if (pool != NULL) {
+				if (zfs_mount(pool, NULL, 0) == 0) {
+					ret = zfs_shareall(pool);
+					zfs_commit_all_shares();
+				}
+				zfs_close(pool);
+			}
+		} else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
+			(void) fprintf(stderr, gettext("pool name may have "
+			    "been omitted\n"));
+		}
+	}
+
+	/* Common exit: release everything and report ret. */
+errout:
+	nvlist_free(nvroot);
+	nvlist_free(fsprops);
+	nvlist_free(props);
+	return (ret);
+	/* Usage errors: free the property lists, then let usage() exit. */
+badusage:
+	nvlist_free(fsprops);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (2);
+}
+
+/*
+ * zpool destroy [-f] <pool>
+ *
+ *	-f	Forcefully unmount any datasets
+ *
+ * Destroy the given pool.  Automatically unmounts any datasets in the pool.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int
+zpool_do_destroy(int argc, char **argv)
+{
+	zpool_handle_t *zhp;
+	boolean_t force = B_FALSE;
+	char *name;
+	int opt;
+	int failed;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "f")) != -1) {
+		switch (opt) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* check arguments: exactly one pool name is expected */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	} else if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	name = argv[0];
+
+	zhp = zpool_open_canfail(g_zfs, name);
+	if (zhp == NULL) {
+		/*
+		 * As a special case, check for use of '/' in the name, and
+		 * direct the user to use 'zfs destroy' instead.
+		 */
+		if (strchr(name, '/') != NULL) {
+			(void) fprintf(stderr, gettext("use 'zfs destroy' to "
+			    "destroy a dataset\n"));
+		}
+		return (1);
+	}
+
+	if (zpool_disable_datasets(zhp, force) != 0) {
+		(void) fprintf(stderr, gettext("could not destroy '%s': "
+		    "could not unmount datasets\n"), zpool_get_name(zhp));
+		zpool_close(zhp);
+		return (1);
+	}
+
+	/*
+	 * history_str is handed to zpool_destroy() below, so clear
+	 * log_history first (presumably so the command is not logged a
+	 * second time elsewhere -- confirm against main()).
+	 */
+	log_history = B_FALSE;
+
+	failed = (zpool_destroy(zhp, history_str) != 0);
+
+	zpool_close(zhp);
+
+	return (failed);
+}
+
+/* Per-invocation options handed to zpool_export_one() for each pool. */
+typedef struct export_cbdata {
+	/* passed to zpool_disable_datasets() and zpool_export() */
+	boolean_t force;
+	/* when set, zpool_export_force() is used instead of zpool_export() */
+	boolean_t hardforce;
+} export_cbdata_t;
+
+/*
+ * Export a single pool.  'data' points to an export_cbdata_t carrying the
+ * force options.  Returns 0 on success and 1 if the datasets could not be
+ * disabled or the export itself failed.
+ */
+static int
+zpool_export_one(zpool_handle_t *zhp, void *data)
+{
+	export_cbdata_t *opts = data;
+	int err;
+
+	if (zpool_disable_datasets(zhp, opts->force) != 0)
+		return (1);
+
+	/* The history must be logged as part of the export */
+	log_history = B_FALSE;
+
+	err = opts->hardforce ?
+	    zpool_export_force(zhp, history_str) :
+	    zpool_export(zhp, opts->force, history_str);
+
+	return (err != 0 ? 1 : 0);
+}
+
+/*
+ * zpool export [-afF] <pool> ...
+ *
+ *	-a	Export all pools
+ *	-f	Forcefully unmount datasets
+ *	-F	Export via zpool_export_force() instead of zpool_export()
+ *
+ * Export the given pools.  By default, the command will attempt to cleanly
+ * unmount any active datasets within the pool.  If the '-f' flag is specified,
+ * then the datasets will be forcefully unmounted.  With '-a' no pool names may
+ * be given; otherwise at least one is required.
+ */
+int
+zpool_do_export(int argc, char **argv)
+{
+	export_cbdata_t cb;
+	boolean_t do_all = B_FALSE;
+	int c;
+
+	cb.force = B_FALSE;
+	cb.hardforce = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "afF")) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = B_TRUE;
+			break;
+		case 'f':
+			cb.force = B_TRUE;
+			break;
+		case 'F':
+			cb.hardforce = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (do_all) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	} else if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* Both modes iterate with the same callback and options. */
+	return (for_each_pool(argc, argv, B_TRUE, NULL,
+	    zpool_export_one, &cb));
+}
+
+/*
+ * Walk a vdev configuration and return the widest "name length + indent"
+ * found, starting from the caller-supplied 'max'.  Spares, l2cache devices
+ * and regular children are each visited two columns deeper than 'nv'.
+ */
+static int
+max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max,
+    int name_flags)
+{
+	/* nvlist arrays to descend into, in the order they are visited */
+	const char *groups[] = {
+		ZPOOL_CONFIG_SPARES,
+		ZPOOL_CONFIG_L2CACHE,
+		ZPOOL_CONFIG_CHILDREN
+	};
+	nvlist_t **child;
+	uint_t c, children;
+	size_t g, width;
+	char *name;
+	int sub;
+
+	name = zpool_vdev_name(g_zfs, zhp, nv, name_flags);
+	width = strlen(name) + depth;
+	if (width > max)
+		max = width;
+	free(name);
+
+	for (g = 0; g < sizeof (groups) / sizeof (groups[0]); g++) {
+		if (nvlist_lookup_nvlist_array(nv, groups[g],
+		    &child, &children) != 0)
+			continue;
+
+		for (c = 0; c < children; c++) {
+			sub = max_width(zhp, child[c], depth + 2, max,
+			    name_flags);
+			if (sub > max)
+				max = sub;
+		}
+	}
+
+	return (max);
+}
+
+/* Search state shared between a zpool_iter() caller and find_spare(). */
+typedef struct spare_cbdata {
+	/* in: vdev guid that find_spare() looks for in each pool */
+	uint64_t	cb_guid;
+	/* out: handle of the pool containing cb_guid, set by find_spare() */
+	zpool_handle_t	*cb_zhp;
+} spare_cbdata_t;
+
+/*
+ * Return B_TRUE if 'nv' or any vdev reachable through its
+ * ZPOOL_CONFIG_CHILDREN arrays has a guid equal to 'search'.
+ */
+static boolean_t
+find_vdev(nvlist_t *nv, uint64_t search)
+{
+	uint64_t guid;
+	nvlist_t **kids;
+	uint_t nkids, i;
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+	    guid == search)
+		return (B_TRUE);
+
+	/* No children array means this subtree has been fully checked. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return (B_FALSE);
+
+	for (i = 0; i < nkids; i++) {
+		if (find_vdev(kids[i], search))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Pool iteration callback: check whether this pool's vdev tree contains the
+ * guid in the spare_cbdata_t passed as 'data'.  On a match the handle is
+ * stored in cb_zhp (left open for the caller) and 1 is returned; otherwise
+ * the handle is closed and 0 is returned.
+ */
+static int
+find_spare(zpool_handle_t *zhp, void *data)
+{
+	spare_cbdata_t *search = data;
+	nvlist_t *nvroot;
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	if (!find_vdev(nvroot, search->cb_guid)) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	search->cb_zhp = zhp;
+	return (1);
+}
+
+/*
+ * Options and state shared by the status/import configuration printers
+ * (print_status_config(), print_import_config(), print_class_vdevs()).
+ */
+typedef struct status_cbdata {
+	int		cb_count;
+	/* flags OR-ed into the zpool_vdev_name() calls */
+	int		cb_name_flags;
+	/* width of the vdev name column, including indentation */
+	int		cb_namewidth;
+	boolean_t	cb_allpools;
+	boolean_t	cb_verbose;
+	/* print raw error counters instead of zfs_nicenum() output */
+	boolean_t	cb_literal;
+	boolean_t	cb_explain;
+	boolean_t	cb_first;
+	boolean_t	cb_dedup_stats;
+	/* print_class_vdevs(): use print_status_config() if set */
+	boolean_t	cb_print_status;
+	/* add the slow I/O column to each vdev row */
+	boolean_t	cb_print_slow_ios;
+	/* passed as 'verbose' to print_status_initialize() */
+	boolean_t	cb_print_vdev_init;
+	/* passed as 'verbose' to print_status_trim() */
+	boolean_t	cb_print_vdev_trim;
+	/* per-vdev command output for zpool_print_cmd(), or NULL */
+	vdev_cmd_data_list_t	*vcdl;
+} status_cbdata_t;
+
+/*
+ * Return 1 if string is NULL, empty, or made up solely of characters for
+ * which isblank() is true; return 0 otherwise.
+ */
+static int
+is_blank_str(char *str)
+{
+	if (str == NULL)
+		return (1);
+
+	for (; *str != '\0'; str++) {
+		/*
+		 * The <ctype.h> functions require a value representable as
+		 * unsigned char (or EOF); a plain char may be negative.
+		 */
+		if (!isblank((unsigned char)*str))
+			return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Print the command output recorded in 'vcdl' for the vdev at 'path' in
+ * 'pool'.  One value is printed per unique column (a dash when the vdev has
+ * no value for it), followed by any output lines beyond the vdev's columns.
+ * Nothing is printed when no entry matches; only the first match is used.
+ */
+static void
+zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path)
+{
+	vdev_cmd_data_t *data = NULL;
+	int i, col, k;
+	char *val;
+
+	/* Locate the entry for this vdev */
+	for (i = 0; i < vcdl->count; i++) {
+		if (strcmp(vcdl->data[i].path, path) == 0 &&
+		    strcmp(vcdl->data[i].pool, pool) == 0) {
+			data = &vcdl->data[i];
+			break;
+		}
+	}
+
+	if (data == NULL)
+		return;
+
+	/* Print out all the output values for this vdev */
+	for (col = 0; col < vcdl->uniq_cols_cnt; col++) {
+		/* Does this vdev have a value for this column? */
+		val = NULL;
+		for (k = 0; k < data->cols_cnt; k++) {
+			if (strcmp(data->cols[k], vcdl->uniq_cols[col]) == 0) {
+				val = data->lines[k];
+				break;
+			}
+		}
+
+		/* Mark empty values with dashes to make output awk-able. */
+		if (is_blank_str(val))
+			val = "-";
+
+		printf("%*s", vcdl->uniq_cols_width[col], val);
+		if (col < vcdl->uniq_cols_cnt - 1)
+			printf("  ");
+	}
+
+	/* Print out any values that aren't in a column at the end */
+	for (k = data->cols_cnt; k < data->lines_cnt; k++) {
+		/* Did we have any columns?  If so print a spacer. */
+		if (vcdl->uniq_cols_cnt > 0)
+			printf("  ");
+
+		val = data->lines[k];
+		printf("%s", val ? val : "");
+	}
+}
+
+/*
+ * Print the initialization status of a leaf vdev.
+ *
+ * Without 'verbose' only an active initialization is reported.  With
+ * 'verbose' a vdev that is actively, suspended or completely initialized
+ * (and is not flagged vs_scan_removing) gets a percentage plus the time of
+ * the last action; anything else is reported as uninitialized.
+ */
+static void
+print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
+{
+	char zbuf[1024];
+	char tbuf[256];
+	struct tm zaction_ts;
+	const char *label;
+	time_t t;
+	int initialize_pct;
+
+	if (!verbose) {
+		if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE)
+			(void) printf(gettext("  (initializing)"));
+		return;
+	}
+
+	if (vs->vs_scan_removing ||
+	    (vs->vs_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+	    vs->vs_initialize_state != VDEV_INITIALIZE_SUSPENDED &&
+	    vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE)) {
+		(void) printf(gettext("  (uninitialized)"));
+		return;
+	}
+
+	/* The "+ 1" keeps the division safe when the estimate is zero. */
+	if (vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) {
+		initialize_pct = 100;
+	} else {
+		initialize_pct = (vs->vs_initialize_bytes_done *
+		    100 / (vs->vs_initialize_bytes_est + 1));
+	}
+
+	t = vs->vs_initialize_action_time;
+	(void) localtime_r(&t, &zaction_ts);
+	(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+	/* Only the three states accepted above can reach this point. */
+	if (vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED)
+		label = gettext("suspended, started at");
+	else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE)
+		label = gettext("started at");
+	else
+		label = gettext("completed at");
+
+	(void) snprintf(zbuf, sizeof (zbuf), ", %s %s", label, tbuf);
+
+	(void) printf(gettext("  (%d%% initialized%s)"),
+	    initialize_pct, zbuf);
+}
+
+/*
+ * Print the TRIM status of a leaf vdev.
+ *
+ * Without 'verbose' only an active TRIM is reported.  With 'verbose' a vdev
+ * whose TRIM is active, suspended or complete (and is not flagged
+ * vs_scan_removing) gets a percentage plus the time of the last action;
+ * otherwise it is reported as unsupported or untrimmed.
+ */
+static void
+print_status_trim(vdev_stat_t *vs, boolean_t verbose)
+{
+	char zbuf[1024];
+	char tbuf[256];
+	struct tm zaction_ts;
+	const char *label;
+	time_t t;
+	int trim_pct;
+
+	if (!verbose) {
+		if (vs->vs_trim_state == VDEV_TRIM_ACTIVE)
+			(void) printf(gettext("  (trimming)"));
+		return;
+	}
+
+	if (vs->vs_scan_removing ||
+	    (vs->vs_trim_state != VDEV_TRIM_ACTIVE &&
+	    vs->vs_trim_state != VDEV_TRIM_SUSPENDED &&
+	    vs->vs_trim_state != VDEV_TRIM_COMPLETE)) {
+		if (vs->vs_trim_notsup)
+			(void) printf(gettext("  (trim unsupported)"));
+		else
+			(void) printf(gettext("  (untrimmed)"));
+		return;
+	}
+
+	/* The "+ 1" keeps the division safe when the estimate is zero. */
+	if (vs->vs_trim_state == VDEV_TRIM_COMPLETE) {
+		trim_pct = 100;
+	} else {
+		trim_pct = (vs->vs_trim_bytes_done *
+		    100 / (vs->vs_trim_bytes_est + 1));
+	}
+
+	t = vs->vs_trim_action_time;
+	(void) localtime_r(&t, &zaction_ts);
+	(void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+	/* Only the three states accepted above can reach this point. */
+	if (vs->vs_trim_state == VDEV_TRIM_SUSPENDED)
+		label = gettext("suspended, started at");
+	else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE)
+		label = gettext("started at");
+	else
+		label = gettext("completed at");
+
+	(void) snprintf(zbuf, sizeof (zbuf), ", %s %s", label, tbuf);
+
+	(void) printf(gettext("  (%d%% trimmed%s)"),
+	    trim_pct, zbuf);
+}
+
+/*
+ * Map a (translated) health string to the color it should be printed in:
+ * red for FAULTED, SUSPENDED and UNAVAIL, yellow for OFFLINE, DEGRADED and
+ * REMOVED.  Any other string yields NULL, meaning no color change.
+ */
+static char *
+health_str_to_color(const char *health)
+{
+	char *color = NULL;
+
+	if (strcmp(health, gettext("FAULTED")) == 0 ||
+	    strcmp(health, gettext("SUSPENDED")) == 0 ||
+	    strcmp(health, gettext("UNAVAIL")) == 0) {
+		color = ANSI_RED;
+	} else if (strcmp(health, gettext("OFFLINE")) == 0 ||
+	    strcmp(health, gettext("DEGRADED")) == 0 ||
+	    strcmp(health, gettext("REMOVED")) == 0) {
+		color = ANSI_YELLOW;
+	}
+
+	return (color);
+}
+
+/*
+ * Print out configuration state as requested by status_callback.
+ *
+ * Emits one row for the vdev 'nv' (indented by 'depth' columns and labelled
+ * 'name'): its state, the read/write/checksum error counters (omitted when
+ * 'isspare' is set), the optional slow I/O column, a note derived from the
+ * aux state, scan/rebuild notes, any external command output and the
+ * initialize/TRIM status.  It then recurses into the children, skipping
+ * logs, holes and vdevs carrying an allocation bias.
+ *
+ * 'vrs' carries the rebuild stats for the children; when NULL they are
+ * looked up on 'nv' before descending.
+ */
+static void
+print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
+    nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
+{
+	nvlist_t **child, *root;
+	uint_t c, i, vsc, children;
+	pool_scan_stat_t *ps = NULL;
+	vdev_stat_t *vs;
+	char rbuf[6], wbuf[6], cbuf[6];
+	char *vname;
+	uint64_t notpresent;
+	spare_cbdata_t spare_cb;
+	const char *state;
+	char *type;
+	char *path = NULL;
+	char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
+
+	/* A vdev without a children array is treated as a leaf. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		children = 0;
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	/* Indirect vdevs are not shown at all. */
+	if (strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+		return;
+
+	state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+
+	if (isspare) {
+		/*
+		 * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for
+		 * online drives.
+		 */
+		if (vs->vs_aux == VDEV_AUX_SPARED)
+			state = gettext("INUSE");
+		else if (vs->vs_state == VDEV_STATE_HEALTHY)
+			state = gettext("AVAIL");
+	}
+
+	printf_color(health_str_to_color(state),
+	    "\t%*s%-*s  %-8s", depth, "", cb->cb_namewidth - depth,
+	    name, state);
+
+	/* Error counters are only printed for non-spare vdevs. */
+	if (!isspare) {
+		/* Any non-zero counter is highlighted in red. */
+		if (vs->vs_read_errors)
+			rcolor = ANSI_RED;
+
+		if (vs->vs_write_errors)
+			wcolor = ANSI_RED;
+
+		if (vs->vs_checksum_errors)
+			ccolor = ANSI_RED;
+
+		if (cb->cb_literal) {
+			printf(" ");
+			printf_color(rcolor, "%5llu",
+			    (u_longlong_t)vs->vs_read_errors);
+			printf(" ");
+			printf_color(wcolor, "%5llu",
+			    (u_longlong_t)vs->vs_write_errors);
+			printf(" ");
+			printf_color(ccolor, "%5llu",
+			    (u_longlong_t)vs->vs_checksum_errors);
+		} else {
+			zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+			zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+			zfs_nicenum(vs->vs_checksum_errors, cbuf,
+			    sizeof (cbuf));
+			printf(" ");
+			printf_color(rcolor, "%5s", rbuf);
+			printf(" ");
+			printf_color(wcolor, "%5s", wbuf);
+			printf(" ");
+			printf_color(ccolor, "%5s", cbuf);
+		}
+		if (cb->cb_print_slow_ios) {
+			if (children == 0)  {
+				/* Only leafs vdevs have slow IOs */
+				zfs_nicenum(vs->vs_slow_ios, rbuf,
+				    sizeof (rbuf));
+			} else {
+				snprintf(rbuf, sizeof (rbuf), "-");
+			}
+
+			/*
+			 * NOTE(review): in literal mode the raw counter is
+			 * printed even for non-leaf vdevs, where the
+			 * non-literal path prints "-".
+			 */
+			if (cb->cb_literal)
+				printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+			else
+				printf(" %5s", rbuf);
+		}
+	}
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &notpresent) == 0) {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+		(void) printf("  %s %s", gettext("was"), path);
+	} else if (vs->vs_aux != 0) {
+		/* Explain the aux state in red after the counters. */
+		(void) printf("  ");
+		color_start(ANSI_RED);
+		switch (vs->vs_aux) {
+		case VDEV_AUX_OPEN_FAILED:
+			(void) printf(gettext("cannot open"));
+			break;
+
+		case VDEV_AUX_BAD_GUID_SUM:
+			(void) printf(gettext("missing device"));
+			break;
+
+		case VDEV_AUX_NO_REPLICAS:
+			(void) printf(gettext("insufficient replicas"));
+			break;
+
+		case VDEV_AUX_VERSION_NEWER:
+			(void) printf(gettext("newer version"));
+			break;
+
+		case VDEV_AUX_UNSUP_FEAT:
+			(void) printf(gettext("unsupported feature(s)"));
+			break;
+
+		case VDEV_AUX_ASHIFT_TOO_BIG:
+			(void) printf(gettext("unsupported minimum blocksize"));
+			break;
+
+		case VDEV_AUX_SPARED:
+			/*
+			 * Search all pools for the one using this spare.  On
+			 * a match find_spare() leaves that pool's handle open
+			 * in spare_cb.cb_zhp, so it is closed here.
+			 */
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &spare_cb.cb_guid) == 0);
+			if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) {
+				if (strcmp(zpool_get_name(spare_cb.cb_zhp),
+				    zpool_get_name(zhp)) == 0)
+					(void) printf(gettext("currently in "
+					    "use"));
+				else
+					(void) printf(gettext("in use by "
+					    "pool '%s'"),
+					    zpool_get_name(spare_cb.cb_zhp));
+				zpool_close(spare_cb.cb_zhp);
+			} else {
+				(void) printf(gettext("currently in use"));
+			}
+			break;
+
+		case VDEV_AUX_ERR_EXCEEDED:
+			(void) printf(gettext("too many errors"));
+			break;
+
+		case VDEV_AUX_IO_FAILURE:
+			(void) printf(gettext("experienced I/O failures"));
+			break;
+
+		case VDEV_AUX_BAD_LOG:
+			(void) printf(gettext("bad intent log"));
+			break;
+
+		case VDEV_AUX_EXTERNAL:
+			(void) printf(gettext("external device fault"));
+			break;
+
+		case VDEV_AUX_SPLIT_POOL:
+			(void) printf(gettext("split into new pool"));
+			break;
+
+		case VDEV_AUX_ACTIVE:
+			(void) printf(gettext("currently in use"));
+			break;
+
+		case VDEV_AUX_CHILDREN_OFFLINE:
+			(void) printf(gettext("all children offline"));
+			break;
+
+		default:
+			(void) printf(gettext("corrupted data"));
+			break;
+		}
+		color_end();
+	}
+
+	/* The root vdev has the scrub/resilver stats */
+	root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE);
+	/* ps stays NULL when the root vdev carries no scan stats. */
+	(void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &c);
+
+	if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) {
+		if (vs->vs_scan_processed != 0) {
+			(void) printf(gettext("  (%s)"),
+			    (ps->pss_func == POOL_SCAN_RESILVER) ?
+			    "resilvering" : "repairing");
+		} else if (vs->vs_resilver_deferred) {
+			(void) printf(gettext("  (awaiting resilver)"));
+		}
+	}
+
+	/* The top-level vdevs have the rebuild stats */
+	if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE &&
+	    children == 0) {
+		if (vs->vs_rebuild_processed != 0) {
+			(void) printf(gettext("  (resilvering)"));
+		}
+	}
+
+	/* Append external command output for vdevs that have a path. */
+	if (cb->vcdl != NULL) {
+		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+			printf("  ");
+			zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path);
+		}
+	}
+
+	/* Display vdev initialization and trim status for leaves */
+	if (children == 0) {
+		print_status_initialize(vs, cb->cb_print_vdev_init);
+		print_status_trim(vs, cb->cb_print_vdev_trim);
+	}
+
+	(void) printf("\n");
+
+	for (c = 0; c < children; c++) {
+		uint64_t islog = B_FALSE, ishole = B_FALSE;
+
+		/* Don't print logs or holes here */
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &islog);
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+		    &ishole);
+		if (islog || ishole)
+			continue;
+		/* Only print normal classes here */
+		if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			continue;
+
+		/* Provide vdev_rebuild_stats to children if available */
+		if (vrs == NULL) {
+			(void) nvlist_lookup_uint64_array(nv,
+			    ZPOOL_CONFIG_REBUILD_STATS,
+			    (uint64_t **)&vrs, &i);
+		}
+
+		/* zpool_vdev_name() returns an allocated string; free it. */
+		vname = zpool_vdev_name(g_zfs, zhp, child[c],
+		    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+		print_status_config(zhp, cb, vname, child[c], depth + 2,
+		    isspare, vrs);
+		free(vname);
+	}
+}
+
+/*
+ * Print the configuration of an exported pool.  Iterate over all vdevs in the
+ * pool, printing out the name and status for each one.
+ *
+ * Missing and hole vdevs are skipped.  Children that are logs or carry an
+ * allocation bias are not printed here.  After the children, any l2cache
+ * and spare devices found on 'nv' are listed by name under "cache" and
+ * "spares" headings.
+ */
+static void
+print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
+    int depth)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	vdev_stat_t *vs;
+	char *type, *vname;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+	if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
+	    strcmp(type, VDEV_TYPE_HOLE) == 0)
+		return;
+
+	/* 'c' only receives the stats element count here; it is reused below. */
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+
+	(void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name);
+	(void) printf("  %s", zpool_state_to_name(vs->vs_state, vs->vs_aux));
+
+	/* Append a short explanation of the aux state, if there is one. */
+	if (vs->vs_aux != 0) {
+		(void) printf("  ");
+
+		switch (vs->vs_aux) {
+		case VDEV_AUX_OPEN_FAILED:
+			(void) printf(gettext("cannot open"));
+			break;
+
+		case VDEV_AUX_BAD_GUID_SUM:
+			(void) printf(gettext("missing device"));
+			break;
+
+		case VDEV_AUX_NO_REPLICAS:
+			(void) printf(gettext("insufficient replicas"));
+			break;
+
+		case VDEV_AUX_VERSION_NEWER:
+			(void) printf(gettext("newer version"));
+			break;
+
+		case VDEV_AUX_UNSUP_FEAT:
+			(void) printf(gettext("unsupported feature(s)"));
+			break;
+
+		case VDEV_AUX_ERR_EXCEEDED:
+			(void) printf(gettext("too many errors"));
+			break;
+
+		case VDEV_AUX_ACTIVE:
+			(void) printf(gettext("currently in use"));
+			break;
+
+		case VDEV_AUX_CHILDREN_OFFLINE:
+			(void) printf(gettext("all children offline"));
+			break;
+
+		default:
+			(void) printf(gettext("corrupted data"));
+			break;
+		}
+	}
+	(void) printf("\n");
+
+	/* Without a children array nothing further is printed for 'nv'. */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE;
+
+		/* Skip log vdevs and vdevs with an allocation bias. */
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		if (is_log)
+			continue;
+		if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			continue;
+
+		vname = zpool_vdev_name(g_zfs, NULL, child[c],
+		    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+		print_import_config(cb, vname, child[c], depth + 2);
+		free(vname);
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		(void) printf(gettext("\tcache\n"));
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, NULL, child[c],
+			    cb->cb_name_flags);
+			(void) printf("\t  %s\n", vname);
+			free(vname);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		(void) printf(gettext("\tspares\n"));
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, NULL, child[c],
+			    cb->cb_name_flags);
+			(void) printf("\t  %s\n", vname);
+			free(vname);
+		}
+	}
+}
+
+/*
+ * Print specialized class vdevs.
+ *
+ * These are recorded as top level vdevs in the main pool child array
+ * but with "is_log" set to 1 or an "alloc_bias" string. We use either
+ * print_status_config() or print_import_config() to print the top level
+ * class vdevs then any of their children (eg mirrored slogs) are printed
+ * recursively - which works because only the top level vdev is marked.
+ *
+ * 'class' selects which class to print.  A heading with the class name is
+ * printed before the first matching vdev; nothing is printed when no child
+ * of 'nv' matches.  Indirect vdevs are skipped.
+ */
+static void
+print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
+    const char *class)
+{
+	uint_t c, children;
+	nvlist_t **child;
+	boolean_t printed = B_FALSE;
+
+	assert(zhp != NULL || !cb->cb_verbose);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		uint64_t is_log = B_FALSE;
+		char *bias = NULL;
+		char *type = NULL;
+
+		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+
+		/* Log vdevs are flagged with is_log rather than a bias. */
+		if (is_log) {
+			bias = VDEV_ALLOC_CLASS_LOGS;
+		} else {
+			(void) nvlist_lookup_string(child[c],
+			    ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+			(void) nvlist_lookup_string(child[c],
+			    ZPOOL_CONFIG_TYPE, &type);
+		}
+
+		if (bias == NULL || strcmp(bias, class) != 0)
+			continue;
+		/*
+		 * The type lookup above is best-effort, so 'type' is still
+		 * NULL if the key was absent; don't hand that to strcmp().
+		 */
+		if (!is_log && type != NULL &&
+		    strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+			continue;
+
+		if (!printed) {
+			(void) printf("\t%s\t\n", gettext(class));
+			printed = B_TRUE;
+		}
+
+		char *name = zpool_vdev_name(g_zfs, zhp, child[c],
+		    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+		if (cb->cb_print_status)
+			print_status_config(zhp, cb, name, child[c], 2,
+			    B_FALSE, NULL);
+		else
+			print_import_config(cb, name, child[c], 2);
+		free(name);
+	}
+}
+
+/*
+ * Display the status for the given pool.
+ *
+ * 'config' describes a pool that is a candidate for import.  The output
+ * consists of the pool name, guid and state, a "status" line derived from
+ * zpool_import_status(), an "action" line chosen from the root vdev state
+ * and that status, the pool comment (if any), a message URL (if a message
+ * id was returned) and finally the vdev configuration, including the dedup,
+ * special and log class vdevs.
+ */
+static void
+show_import(nvlist_t *config)
+{
+	uint64_t pool_state;
+	vdev_stat_t *vs;
+	char *name;
+	uint64_t guid;
+	uint64_t hostid = 0;
+	char *msgid;
+	char *hostname = "unknown";
+	nvlist_t *nvroot, *nvinfo;
+	zpool_status_t reason;
+	zpool_errata_t errata;
+	const char *health;
+	uint_t vsc;
+	char *comment;
+	status_cbdata_t cb = { 0 };
+
+	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+	    &name) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &guid) == 0);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &pool_state) == 0);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	/* The pool's health is taken from the root vdev's stats. */
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+	health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+
+	reason = zpool_import_status(config, &msgid, &errata);
+
+	(void) printf(gettext("   pool: %s\n"), name);
+	(void) printf(gettext("     id: %llu\n"), (u_longlong_t)guid);
+	(void) printf(gettext("  state: %s"), health);
+	if (pool_state == POOL_STATE_DESTROYED)
+		(void) printf(gettext(" (DESTROYED)"));
+	(void) printf("\n");
+
+	/* Print a status line describing why the pool is in this state. */
+	switch (reason) {
+	case ZPOOL_STATUS_MISSING_DEV_R:
+	case ZPOOL_STATUS_MISSING_DEV_NR:
+	case ZPOOL_STATUS_BAD_GUID_SUM:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "missing from the system.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_R:
+	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices contains"
+		    " corrupted data.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_DATA:
+		(void) printf(
+		    gettext(" status: The pool data is corrupted.\n"));
+		break;
+
+	case ZPOOL_STATUS_OFFLINE_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices "
+		    "are offlined.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_POOL:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool metadata is "
+		    "corrupted.\n"));
+		break;
+
+	case ZPOOL_STATUS_VERSION_OLDER:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+		    "a legacy on-disk version.\n"));
+		break;
+
+	case ZPOOL_STATUS_VERSION_NEWER:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+		    "an incompatible version.\n"));
+		break;
+
+	case ZPOOL_STATUS_FEAT_DISABLED:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Some supported features are "
+		    "not enabled on the pool.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_READ:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool uses the following "
+		    "feature(s) not supported on this system:\n"));
+		color_start(ANSI_YELLOW);
+		zpool_print_unsup_feat(config);
+		color_end();
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool can only be "
+		    "accessed in read-only mode on this system. It\n\tcannot be"
+		    " accessed in read-write mode because it uses the "
+		    "following\n\tfeature(s) not supported on this system:\n"));
+		color_start(ANSI_YELLOW);
+		zpool_print_unsup_feat(config);
+		color_end();
+		break;
+
+	case ZPOOL_STATUS_HOSTID_ACTIVE:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is currently "
+		    "imported by another system.\n"));
+		break;
+
+	case ZPOOL_STATUS_HOSTID_REQUIRED:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool has the "
+		    "multihost property on.  It cannot\n\tbe safely imported "
+		    "when the system hostid is not set.\n"));
+		break;
+
+	case ZPOOL_STATUS_HOSTID_MISMATCH:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool was last accessed "
+		    "by another system.\n"));
+		break;
+
+	case ZPOOL_STATUS_FAULTED_DEV_R:
+	case ZPOOL_STATUS_FAULTED_DEV_NR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "faulted.\n"));
+		break;
+
+	case ZPOOL_STATUS_BAD_LOG:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("An intent log record cannot "
+		    "be read.\n"));
+		break;
+
+	case ZPOOL_STATUS_RESILVERING:
+	case ZPOOL_STATUS_REBUILDING:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices were "
+		    "being resilvered.\n"));
+		break;
+
+	case ZPOOL_STATUS_ERRATA:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"),
+		    errata);
+		break;
+
+	default:
+		/*
+		 * No other status can be seen when importing pools.
+		 */
+		assert(reason == ZPOOL_STATUS_OK);
+	}
+
+	/*
+	 * Print out an action according to the overall state of the pool.
+	 */
+	if (vs->vs_state == VDEV_STATE_HEALTHY) {
+		if (reason == ZPOOL_STATUS_VERSION_OLDER ||
+		    reason == ZPOOL_STATUS_FEAT_DISABLED) {
+			(void) printf(gettext(" action: The pool can be "
+			    "imported using its name or numeric identifier, "
+			    "though\n\tsome features will not be available "
+			    "without an explicit 'zpool upgrade'.\n"));
+		} else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
+			(void) printf(gettext(" action: The pool can be "
+			    "imported using its name or numeric "
+			    "identifier and\n\tthe '-f' flag.\n"));
+		} else if (reason == ZPOOL_STATUS_ERRATA) {
+			switch (errata) {
+			case ZPOOL_ERRATA_NONE:
+				break;
+
+			case ZPOOL_ERRATA_ZOL_2094_SCRUB:
+				(void) printf(gettext(" action: The pool can "
+				    "be imported using its name or numeric "
+				    "identifier,\n\thowever there is a compat"
+				    "ibility issue which should be corrected"
+				    "\n\tby running 'zpool scrub'\n"));
+				break;
+
+			case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY:
+				(void) printf(gettext(" action: The pool can"
+				    "not be imported with this version of ZFS "
+				    "due to\n\tan active asynchronous destroy. "
+				    "Revert to an earlier version\n\tand "
+				    "allow the destroy to complete before "
+				    "updating.\n"));
+				break;
+
+			case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION:
+				(void) printf(gettext(" action: Existing "
+				    "encrypted datasets contain an on-disk "
+				    "incompatibility, which\n\tneeds to be "
+				    "corrected. Backup these datasets to new "
+				    "encrypted datasets\n\tand destroy the "
+				    "old ones.\n"));
+				break;
+
+			case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION:
+				(void) printf(gettext(" action: Existing "
+				    "encrypted snapshots and bookmarks contain "
+				    "an on-disk\n\tincompatibility. This may "
+				    "cause on-disk corruption if they are used"
+				    "\n\twith 'zfs recv'. To correct the "
+				    "issue, enable the bookmark_v2 feature.\n\t"
+				    "No additional action is needed if there "
+				    "are no encrypted snapshots or\n\t"
+				    "bookmarks. If preserving the encrypted "
+				    "snapshots and bookmarks is\n\trequired, "
+				    "use a non-raw send to backup and restore "
+				    "them. Alternately,\n\tthey may be removed"
+				    " to resolve the incompatibility.\n"));
+				break;
+			default:
+				/*
+				 * All errata must contain an action message.
+				 */
+				assert(0);
+			}
+		} else {
+			(void) printf(gettext(" action: The pool can be "
+			    "imported using its name or numeric "
+			    "identifier.\n"));
+		}
+	} else if (vs->vs_state == VDEV_STATE_DEGRADED) {
+		(void) printf(gettext(" action: The pool can be imported "
+		    "despite missing or damaged devices.  The\n\tfault "
+		    "tolerance of the pool may be compromised if imported.\n"));
+	} else {
+		/* Neither healthy nor degraded: explain why import fails. */
+		switch (reason) {
+		case ZPOOL_STATUS_VERSION_NEWER:
+			(void) printf(gettext(" action: The pool cannot be "
+			    "imported.  Access the pool on a system running "
+			    "newer\n\tsoftware, or recreate the pool from "
+			    "backup.\n"));
+			break;
+		case ZPOOL_STATUS_UNSUP_FEAT_READ:
+			printf_color(ANSI_BOLD, gettext("action: "));
+			printf_color(ANSI_YELLOW, gettext("The pool cannot be "
+			    "imported. Access the pool on a system that "
+			    "supports\n\tthe required feature(s), or recreate "
+			    "the pool from backup.\n"));
+			break;
+		case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+			printf_color(ANSI_BOLD, gettext("action: "));
+			printf_color(ANSI_YELLOW, gettext("The pool cannot be "
+			    "imported in read-write mode. Import the pool "
+			    "with\n"
+			    "\t\"-o readonly=on\", access the pool on a system "
+			    "that supports the\n\trequired feature(s), or "
+			    "recreate the pool from backup.\n"));
+			break;
+		case ZPOOL_STATUS_MISSING_DEV_R:
+		case ZPOOL_STATUS_MISSING_DEV_NR:
+		case ZPOOL_STATUS_BAD_GUID_SUM:
+			(void) printf(gettext(" action: The pool cannot be "
+			    "imported. Attach the missing\n\tdevices and try "
+			    "again.\n"));
+			break;
+		case ZPOOL_STATUS_HOSTID_ACTIVE:
+			/*
+			 * Name the host holding the pool; "unknown" and 0 are
+			 * used when the load info lacks these entries.
+			 */
+			VERIFY0(nvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_LOAD_INFO, &nvinfo));
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
+				hostname = fnvlist_lookup_string(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTNAME);
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
+				hostid = fnvlist_lookup_uint64(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTID);
+
+			(void) printf(gettext(" action: The pool must be "
+			    "exported from %s (hostid=%lx)\n\tbefore it "
+			    "can be safely imported.\n"), hostname,
+			    (unsigned long) hostid);
+			break;
+		case ZPOOL_STATUS_HOSTID_REQUIRED:
+			(void) printf(gettext(" action: Set a unique system "
+			    "hostid with the zgenhostid(8) command.\n"));
+			break;
+		default:
+			(void) printf(gettext(" action: The pool cannot be "
+			    "imported due to damaged devices or data.\n"));
+		}
+	}
+
+	/* Print the comment attached to the pool. */
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+		(void) printf(gettext("comment: %s\n"), comment);
+
+	/*
+	 * If the state is "closed" or "can't open", and the aux state
+	 * is "corrupt data":
+	 */
+	if (((vs->vs_state == VDEV_STATE_CLOSED) ||
+	    (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
+	    (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
+		if (pool_state == POOL_STATE_DESTROYED)
+			(void) printf(gettext("\tThe pool was destroyed, "
+			    "but can be imported using the '-Df' flags.\n"));
+		else if (pool_state != POOL_STATE_EXPORTED)
+			(void) printf(gettext("\tThe pool may be active on "
+			    "another system, but can be imported using\n\t"
+			    "the '-f' flag.\n"));
+	}
+
+	if (msgid != NULL) {
+		(void) printf(gettext(
+		    "   see: https://zfsonlinux.org/msg/%s\n"), msgid);
+	}
+
+	(void) printf(gettext(" config:\n\n"));
+
+	/* Size the name column to the widest entry, but at least 10. */
+	cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name),
+	    VDEV_NAME_TYPE_ID);
+	if (cb.cb_namewidth < 10)
+		cb.cb_namewidth = 10;
+
+	print_import_config(&cb, name, nvroot, 0);
+
+	/* Class vdevs are skipped above and listed separately here. */
+	print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+	print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+	print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS);
+
+	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
+		(void) printf(gettext("\n\tAdditional devices are known to "
+		    "be part of this pool, though their\n\texact "
+		    "configuration cannot be determined.\n"));
+	}
+}
+
+/*
+ * Decide whether the pool described by 'config' may only be imported with
+ * the force flag.
+ *
+ * Returns B_TRUE when either
+ *   - the pool state is not POOL_STATE_EXPORTED and the hostid recorded in
+ *     the config differs from get_system_hostid(), or
+ *   - the load info carries an MMP state other than MMP_STATE_INACTIVE.
+ * Returns B_FALSE otherwise.
+ */
+static boolean_t
+zfs_force_import_required(nvlist_t *config)
+{
+	uint64_t pool_hostid = 0;
+	uint64_t pool_state;
+	nvlist_t *load_info;
+	mmp_state_t mmp;
+
+	pool_state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
+
+	/* The hostid entry is optional; if absent it stays zero. */
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
+	    &pool_hostid);
+
+	if (pool_state != POOL_STATE_EXPORTED &&
+	    pool_hostid != get_system_hostid())
+		return (B_TRUE);
+
+	load_info = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+	if (!nvlist_exists(load_info, ZPOOL_CONFIG_MMP_STATE))
+		return (B_FALSE);
+
+	mmp = fnvlist_lookup_uint64(load_info, ZPOOL_CONFIG_MMP_STATE);
+	if (mmp == MMP_STATE_INACTIVE)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Perform the import for the given configuration.  This passes the heavy
+ * lifting off to zpool_import_props(), and then mounts the datasets contained
+ * within the pool.
+ *
+ *	config	 pool configuration nvlist to import
+ *	newname	 name to open the pool under after import, or NULL to use
+ *		 the name stored in the config
+ *	mntopts	 mount options handed to zpool_enable_datasets()
+ *	props	 property nvlist handed to zpool_import_props()
+ *	flags	 ZFS_IMPORT_* flags
+ *
+ * Returns 0 on success and 1 on failure.  A failure to load encryption keys
+ * (ZFS_IMPORT_LOAD_KEYS) does not stop the import but is still reported
+ * through the return value.
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+    nvlist_t *props, int flags)
+{
+	int ret = 0;
+	zpool_handle_t *zhp;
+	char *name;
+	uint64_t version;
+
+	name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
+	version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION);
+
+	if (!SPA_VERSION_IS_SUPPORTED(version)) {
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "is formatted using an unsupported ZFS version\n"), name);
+		return (1);
+	} else if (zfs_force_import_required(config) &&
+	    !(flags & ZFS_IMPORT_ANY_HOST)) {
+		/*
+		 * A forced import is needed but -f was not given.  Explain
+		 * why, based on the MMP state found in the load info.
+		 */
+		mmp_state_t mmp_state = MMP_STATE_INACTIVE;
+		nvlist_t *nvinfo;
+
+		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE))
+			mmp_state = fnvlist_lookup_uint64(nvinfo,
+			    ZPOOL_CONFIG_MMP_STATE);
+
+		if (mmp_state == MMP_STATE_ACTIVE) {
+			char *hostname = "<unknown>";
+			uint64_t hostid = 0;
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
+				hostname = fnvlist_lookup_string(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTNAME);
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
+				hostid = fnvlist_lookup_uint64(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTID);
+
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "pool is imported on %s (hostid: "
+			    "0x%lx)\nExport the pool on the other system, "
+			    "then run 'zpool import'.\n"),
+			    name, hostname, (unsigned long) hostid);
+		} else if (mmp_state == MMP_STATE_NO_HOSTID) {
+			(void) fprintf(stderr, gettext("Cannot import '%s': "
+			    "pool has the multihost property on and the\n"
+			    "system's hostid is not set. Set a unique hostid "
+			    "with the zgenhostid(8) command.\n"), name);
+		} else {
+			char *hostname = "<unknown>";
+			const char *when;
+			uint64_t timestamp = 0;
+			uint64_t hostid = 0;
+			time_t last_access;
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
+				hostname = fnvlist_lookup_string(config,
+				    ZPOOL_CONFIG_HOSTNAME);
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP))
+				timestamp = fnvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_TIMESTAMP);
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
+				hostid = fnvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_HOSTID);
+
+			/*
+			 * Convert by value rather than casting the pointer:
+			 * time_t is not guaranteed to be 64 bits wide, so
+			 * reading the uint64_t through a time_t pointer
+			 * would pick up the wrong bytes on some platforms.
+			 */
+			last_access = (time_t)timestamp;
+
+			/*
+			 * ctime() may return NULL for a value it cannot
+			 * represent.  The message relies on the trailing
+			 * newline ctime() normally supplies.
+			 */
+			when = ctime(&last_access);
+			if (when == NULL)
+				when = "<unknown>\n";
+
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "pool was previously in use from another system.\n"
+			    "Last accessed by %s (hostid=%lx) at %s"
+			    "The pool can be imported, use 'zpool import -f' "
+			    "to import the pool.\n"), name, hostname,
+			    (unsigned long)hostid, when);
+		}
+
+		return (1);
+	}
+
+	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
+		return (1);
+
+	/* From here on the pool is known by its new name, if one was given. */
+	if (newname != NULL)
+		name = (char *)newname;
+
+	if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+		return (1);
+
+	/*
+	 * Loading keys is best effort. We don't want to return immediately
+	 * if it fails but we do want to give the error to the caller.
+	 */
+	if (flags & ZFS_IMPORT_LOAD_KEYS) {
+		ret = zfs_crypto_attempt_load_keys(g_zfs, name);
+		if (ret != 0)
+			ret = 1;
+	}
+
+	/* Mount the datasets unless the pool is unavailable or -N was used. */
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    !(flags & ZFS_IMPORT_ONLY) &&
+	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+/*
+ * Search key handed to name_or_guid_exists().  When poolname is non-NULL the
+ * match is done on the pool name; otherwise it is done on poolguid.
+ */
+typedef struct target_exists_args {
+	const char	*poolname;	/* name to match, or NULL */
+	uint64_t	poolguid;	/* guid to match when poolname is NULL */
+} target_exists_args_t;
+
+/*
+ * zpool_iter() callback: report whether the pool behind 'zhp' matches the
+ * search key in 'data' (a target_exists_args_t).  The pool name is compared
+ * when args->poolname is set, the pool guid otherwise.
+ *
+ * Returns 1 on a match and 0 otherwise.  The handle is closed on every path,
+ * including the one where the pool has no config.
+ */
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+	target_exists_args_t *args = data;
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+	int found = 0;
+
+	if (config == NULL) {
+		/* Nothing to compare against; still release the handle. */
+		zpool_close(zhp);
+		return (0);
+	}
+
+	if (args->poolname != NULL) {
+		char *pool_name;
+
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &pool_name) == 0);
+		if (strcmp(pool_name, args->poolname) == 0)
+			found = 1;
+	} else {
+		uint64_t pool_guid;
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &pool_guid) == 0);
+		if (pool_guid == args->poolguid)
+			found = 1;
+	}
+	zpool_close(zhp);
+
+	return (found);
+}
+/*
+ * zpool checkpoint <pool>
+ *       checkpoint --discard <pool>
+ *
+ *       -d         Discard the checkpoint from a checkpointed
+ *       --discard  pool.
+ *
+ *       -w         Wait for discarding a checkpoint to complete.
+ *       --wait     Only accepted together with --discard.
+ *
+ * Checkpoints the specified pool, by taking a "snapshot" of its
+ * current state. A pool can only have one checkpoint at a time.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int
+zpool_do_checkpoint(int argc, char **argv)
+{
+	boolean_t do_discard = B_FALSE;
+	boolean_t do_wait = B_FALSE;
+	zpool_handle_t *zhp;
+	char *poolname;
+	int opt, err;
+
+	struct option long_options[] = {
+		{"discard", no_argument, NULL, 'd'},
+		{"wait", no_argument, NULL, 'w'},
+		{0, 0, 0, 0}
+	};
+
+	for (;;) {
+		opt = getopt_long(argc, argv, ":dw", long_options, NULL);
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 'd':
+			do_discard = B_TRUE;
+			break;
+		case 'w':
+			do_wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	/* Waiting only makes sense for a discard. */
+	if (!do_discard && do_wait) {
+		(void) fprintf(stderr, gettext("--wait only valid when "
+		    "--discard also specified\n"));
+		usage(B_FALSE);
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* Exactly one positional argument, the pool, is expected. */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	zhp = zpool_open(g_zfs, poolname);
+	if (zhp == NULL) {
+		/* As a special case, check for use of '/' in the name */
+		if (strchr(poolname, '/') != NULL)
+			(void) fprintf(stderr, gettext("'zpool checkpoint' "
+			    "doesn't work on datasets. To save the state "
+			    "of a dataset from a specific point in time "
+			    "please use 'zfs snapshot'\n"));
+		return (1);
+	}
+
+	if (!do_discard) {
+		err = (zpool_checkpoint(zhp) != 0);
+	} else {
+		err = (zpool_discard_checkpoint(zhp) != 0);
+		/* Only wait if the discard itself was started successfully. */
+		if (do_wait && err == 0)
+			err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD);
+	}
+
+	zpool_close(zhp);
+
+	return (err);
+}
+
+/* getopt_long() value for --rewind-to-checkpoint, which has no short form */
+#define	CHECKPOINT_OPT	1024
+
+/*
+ * zpool import [-d dir] [-D]
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ *              [-d dir | -c cachefile] [-f] -a
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ *              [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
+ *
+ *       -c	Read pool information from a cachefile instead of searching
+ *		devices.
+ *
+ *       -d	Scan in a specific directory, other than /dev/.  More than
+ *		one directory can be specified using multiple '-d' options.
+ *
+ *       -D	Scan for previously destroyed pools or import all or only
+ *		specified destroyed pools.
+ *
+ *       -R	Temporarily import the pool, with all mountpoints relative to
+ *		the given root.  The pool will remain exported when the machine
+ *		is rebooted.
+ *
+ *       -V	Import even in the presence of faulted vdevs.  This is an
+ *		intentionally undocumented option for testing purposes, and
+ *		treats the pool configuration as complete, leaving any bad
+ *		vdevs in the FAULTED state. In other words, it does verbatim
+ *		import.
+ *
+ *       -f	Force import, even if it appears that the pool is active.
+ *
+ *       -F	Attempt rewind if necessary.
+ *
+ *       -n	See if rewind would work, but don't actually rewind.
+ *		Only accepted together with -F.
+ *
+ *       -X	Add ZPOOL_EXTREME_REWIND to the rewind policy.  Only accepted
+ *		together with -F.
+ *
+ *       -N	Import the pool but don't mount datasets.
+ *
+ *       -T	Specify a starting txg to use for import. This is an
+ *		intentionally undocumented option for testing purposes.
+ *
+ *       -a	Import all pools found.
+ *
+ *       -l	Load encryption keys while importing.
+ *
+ *       -m	Set ZFS_IMPORT_MISSING_LOG for the import.
+ *
+ *       -o	Set property=value and/or temporary mount options (without '=').
+ *
+ *       -s	Scan using the default search path, the libblkid cache will
+ *		not be consulted.
+ *
+ *       -t	Set ZFS_IMPORT_TEMP_NAME for the import; the cachefile
+ *		property defaults to "none".
+ *
+ *       --rewind-to-checkpoint
+ *		Import the pool and revert back to the checkpoint.
+ *
+ * The import command scans for pools to import, and import pools based on pool
+ * name and GUID.  The pool can also be renamed as part of the import process.
+ *
+ * Returns 0 on success and 1 on failure.  Every failure path, including a
+ * rejected -o/-R/-t property and a failure to build the load policy, yields
+ * a non-zero status.
+ */
+int
+zpool_do_import(int argc, char **argv)
+{
+	char **searchdirs = NULL;
+	char *env, *envdup = NULL;
+	int nsearch = 0;
+	int c;
+	int err = 0;
+	nvlist_t *pools = NULL;
+	boolean_t do_all = B_FALSE;
+	boolean_t do_destroyed = B_FALSE;
+	char *mntopts = NULL;
+	nvpair_t *elem;
+	nvlist_t *config;
+	uint64_t searchguid = 0;
+	char *searchname = NULL;
+	char *propval;
+	nvlist_t *found_config;
+	nvlist_t *policy = NULL;
+	nvlist_t *props = NULL;
+	boolean_t first;
+	int flags = ZFS_IMPORT_NORMAL;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
+	boolean_t do_scan = B_FALSE;
+	boolean_t pool_exists = B_FALSE;
+	uint64_t pool_state, txg = -1ULL;
+	char *cachefile = NULL;
+	importargs_t idata = { 0 };
+	char *endptr;
+
+	struct option long_options[] = {
+		{"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
+		{0, 0, 0, 0}
+	};
+
+	/* check options */
+	while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX",
+	    long_options, NULL)) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = B_TRUE;
+			break;
+		case 'c':
+			cachefile = optarg;
+			break;
+		case 'd':
+			/* Grow the search directory array by one entry. */
+			if (searchdirs == NULL) {
+				searchdirs = safe_malloc(sizeof (char *));
+			} else {
+				char **tmp = safe_malloc((nsearch + 1) *
+				    sizeof (char *));
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				free(searchdirs);
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = optarg;
+			break;
+		case 'D':
+			do_destroyed = B_TRUE;
+			break;
+		case 'f':
+			flags |= ZFS_IMPORT_ANY_HOST;
+			break;
+		case 'F':
+			do_rewind = B_TRUE;
+			break;
+		case 'l':
+			flags |= ZFS_IMPORT_LOAD_KEYS;
+			break;
+		case 'm':
+			flags |= ZFS_IMPORT_MISSING_LOG;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'N':
+			flags |= ZFS_IMPORT_ONLY;
+			break;
+		case 'o':
+			/* "prop=value" is a pool property, else mount opts. */
+			if ((propval = strchr(optarg, '=')) != NULL) {
+				*propval = '\0';
+				propval++;
+				if (add_prop_list(optarg, propval,
+				    &props, B_TRUE)) {
+					err = 1;
+					goto error;
+				}
+			} else {
+				mntopts = optarg;
+			}
+			break;
+		case 'R':
+			if (add_prop_list(zpool_prop_to_name(
+			    ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) {
+				err = 1;
+				goto error;
+			}
+			if (add_prop_list_default(zpool_prop_to_name(
+			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) {
+				err = 1;
+				goto error;
+			}
+			break;
+		case 's':
+			do_scan = B_TRUE;
+			break;
+		case 't':
+			flags |= ZFS_IMPORT_TEMP_NAME;
+			if (add_prop_list_default(zpool_prop_to_name(
+			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) {
+				err = 1;
+				goto error;
+			}
+			break;
+
+		case 'T':
+			errno = 0;
+			txg = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid txg value\n"));
+				usage(B_FALSE);
+			}
+			rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
+			break;
+		case 'V':
+			flags |= ZFS_IMPORT_VERBATIM;
+			break;
+		case 'X':
+			xtreme_rewind = B_TRUE;
+			break;
+		case CHECKPOINT_OPT:
+			flags |= ZFS_IMPORT_CHECKPOINT;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (cachefile && nsearch != 0) {
+		(void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
+		usage(B_FALSE);
+	}
+
+	if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) {
+		(void) fprintf(stderr, gettext("-l is incompatible with -N\n"));
+		usage(B_FALSE);
+	}
+
+	if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) {
+		(void) fprintf(stderr, gettext("-l is only meaningful during "
+		    "an import\n"));
+		usage(B_FALSE);
+	}
+
+	if ((dryrun || xtreme_rewind) && !do_rewind) {
+		(void) fprintf(stderr,
+		    gettext("-n or -X only meaningful with -F\n"));
+		usage(B_FALSE);
+	}
+	if (dryrun)
+		rewind_policy = ZPOOL_TRY_REWIND;
+	else if (do_rewind)
+		rewind_policy = ZPOOL_DO_REWIND;
+	if (xtreme_rewind)
+		rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+	/* In the future, we can capture further policy and include it here */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+	    rewind_policy) != 0) {
+		err = 1;
+		goto error;
+	}
+
+	/* check argument count */
+	if (do_all) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	} else {
+		if (argc > 2) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	/*
+	 * Check for the effective uid.  We do this explicitly here because
+	 * otherwise any attempt to discover pools will silently fail.
+	 */
+	if (argc == 0 && geteuid() != 0) {
+		(void) fprintf(stderr, gettext("cannot "
+		    "discover pools: permission denied\n"));
+		if (searchdirs != NULL)
+			free(searchdirs);
+
+		nvlist_free(props);
+		nvlist_free(policy);
+		return (1);
+	}
+
+	/*
+	 * Depending on the arguments given, we do one of the following:
+	 *
+	 *	<none>	Iterate through all pools and display information about
+	 *		each one.
+	 *
+	 *	-a	Iterate through all pools and try to import each one.
+	 *
+	 *	<id>	Find the pool that corresponds to the given GUID/pool
+	 *		name and import that one.
+	 *
+	 *	-D	Above options applies only to destroyed pools.
+	 */
+	if (argc != 0) {
+		/* A fully numeric argument is a guid, anything else a name. */
+		errno = 0;
+		searchguid = strtoull(argv[0], &endptr, 10);
+		if (errno != 0 || *endptr != '\0') {
+			searchname = argv[0];
+			searchguid = 0;
+		}
+		found_config = NULL;
+
+		/*
+		 * User specified a name or guid.  Ensure it's unique.
+		 */
+		target_exists_args_t search = {searchname, searchguid};
+		pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search);
+	}
+
+	/*
+	 * Check the environment for the preferred search path.
+	 */
+	if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) {
+		char *dir;
+
+		/* strtok() modifies its input, so work on a private copy. */
+		envdup = strdup(env);
+		if (envdup == NULL) {
+			(void) fprintf(stderr,
+			    gettext("internal error: out of memory\n"));
+			err = 1;
+			goto error;
+		}
+
+		dir = strtok(envdup, ":");
+		while (dir != NULL) {
+			if (searchdirs == NULL) {
+				searchdirs = safe_malloc(sizeof (char *));
+			} else {
+				char **tmp = safe_malloc((nsearch + 1) *
+				    sizeof (char *));
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				free(searchdirs);
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = dir;
+			dir = strtok(NULL, ":");
+		}
+	}
+
+	idata.path = searchdirs;
+	idata.paths = nsearch;
+	idata.poolname = searchname;
+	idata.guid = searchguid;
+	idata.cachefile = cachefile;
+	idata.scan = do_scan;
+	idata.policy = policy;
+
+	pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);
+
+	/* pool_exists is only ever set when argc != 0, so argv[0] is valid. */
+	if (pools != NULL && pool_exists &&
+	    (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
+		(void) fprintf(stderr, gettext("cannot import '%s': "
+		    "a pool with that name already exists\n"),
+		    argv[0]);
+		(void) fprintf(stderr, gettext("use the form '%s "
+		    "<pool | id> <newpool>' to give it a new name\n"),
+		    "zpool import");
+		err = 1;
+	} else if (pools == NULL && pool_exists) {
+		(void) fprintf(stderr, gettext("cannot import '%s': "
+		    "a pool with that name is already created/imported,\n"),
+		    argv[0]);
+		(void) fprintf(stderr, gettext("and no additional pools "
+		    "with that name were found\n"));
+		err = 1;
+	} else if (pools == NULL) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "no such pool available\n"), argv[0]);
+		}
+		err = 1;
+	}
+
+	if (err == 1) {
+		if (searchdirs != NULL)
+			free(searchdirs);
+		if (envdup != NULL)
+			free(envdup);
+		nvlist_free(policy);
+		nvlist_free(pools);
+		nvlist_free(props);
+		return (1);
+	}
+
+	/*
+	 * At this point we have a list of import candidate configs. Even if
+	 * we were searching by pool name or guid, we still need to
+	 * post-process the list to deal with pool state and possible
+	 * duplicate names.
+	 */
+	err = 0;
+	elem = NULL;
+	first = B_TRUE;
+	while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+
+		verify(nvpair_value_nvlist(elem, &config) == 0);
+
+		/* -D selects destroyed pools only; without it they are skipped. */
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &pool_state) == 0);
+		if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+			continue;
+		if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+			continue;
+
+		verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
+		    policy) == 0);
+
+		if (argc == 0) {
+			if (first)
+				first = B_FALSE;
+			else if (!do_all)
+				(void) printf("\n");
+
+			if (do_all) {
+				err |= do_import(config, NULL, mntopts,
+				    props, flags);
+			} else {
+				show_import(config);
+			}
+		} else if (searchname != NULL) {
+			char *name;
+
+			/*
+			 * We are searching for a pool based on name.
+			 */
+			verify(nvlist_lookup_string(config,
+			    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+
+			if (strcmp(name, searchname) == 0) {
+				if (found_config != NULL) {
+					(void) fprintf(stderr, gettext(
+					    "cannot import '%s': more than "
+					    "one matching pool\n"), searchname);
+					(void) fprintf(stderr, gettext(
+					    "import by numeric ID instead\n"));
+					err = B_TRUE;
+				}
+				found_config = config;
+			}
+		} else {
+			uint64_t guid;
+
+			/*
+			 * Search for a pool by guid.
+			 */
+			verify(nvlist_lookup_uint64(config,
+			    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+
+			if (guid == searchguid)
+				found_config = config;
+		}
+	}
+
+	/*
+	 * If we were searching for a specific pool, verify that we found a
+	 * pool, and then do the import.
+	 */
+	if (argc != 0 && err == 0) {
+		if (found_config == NULL) {
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "no such pool available\n"), argv[0]);
+			err = B_TRUE;
+		} else {
+			err |= do_import(found_config, argc == 1 ? NULL :
+			    argv[1], mntopts, props, flags);
+		}
+	}
+
+	/*
+	 * If we were just looking for pools, report an error if none were
+	 * found.
+	 */
+	if (argc == 0 && first)
+		(void) fprintf(stderr,
+		    gettext("no pools available to import\n"));
+
+error:
+	/* Common cleanup; callers jumping here must have set err first. */
+	nvlist_free(props);
+	nvlist_free(pools);
+	nvlist_free(policy);
+	if (searchdirs != NULL)
+		free(searchdirs);
+	if (envdup != NULL)
+		free(envdup);
+
+	return (err ? 1 : 0);
+}
+
+/*
+ * zpool sync [-f] [pool] ...
+ *
+ * -f (undocumented) force uberblock (and config including zpool cache file)
+ *    update.
+ *
+ * Sync the specified pool(s).
+ * Without arguments "zpool sync" will sync all pools.
+ * This command initiates TXG sync(s) and will return after the TXG(s) commit.
+ *
+ * Returns whatever for_each_pool() returns for the zpool_sync_one callback.
+ */
+static int
+zpool_do_sync(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	int opt;
+
+	/* check options */
+	for (opt = getopt(argc, argv, "f"); opt != -1;
+	    opt = getopt(argc, argv, "f")) {
+		switch (opt) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	/* if argc == 0 we will execute zpool_sync_one on all pools */
+	return (for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one,
+	    &force));
+}
+
+/*
+ * State shared by the iostat printing helpers.
+ */
+typedef struct iostat_cbdata {
+	uint64_t cb_flags;	/* bitmask; each set bit indexes a label row */
+	int cb_name_flags;
+	int cb_namewidth;	/* minimum width of the leading name column */
+	int cb_iteration;
+	char **cb_vdev_names; /* Only show these vdevs */
+	unsigned int cb_vdev_names_count; /* non-zero titles column "vdev" */
+	boolean_t cb_verbose;
+	boolean_t cb_literal;	/* use the wider raw-number column widths */
+	boolean_t cb_scripted;
+	zpool_list_t *cb_list;
+	vdev_cmd_data_list_t *vcdl; /* custom columns, or NULL for none */
+} iostat_cbdata_t;
+
+/*
+ * iostat labels: one header cell.  Arrays of these are terminated by an
+ * entry whose name is NULL.
+ */
+typedef struct name_and_columns {
+	const char *name;	/* Column name */
+	unsigned int columns;	/* Center name to this number of columns */
+} name_and_columns_t;
+
+/*
+ * Row capacity of the label tables.  The longest rows below hold 12 labels
+ * plus the terminating {NULL} entry, which fills all 13 slots.
+ */
+#define	IOSTAT_MAX_LABELS	13	/* Max number of labels on one line */
+
+/*
+ * First header line, one row per IOS_* index.  The "columns" value is how
+ * many columns of the second header line the label is spread across.
+ */
+static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
+{
+	[IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
+	    {NULL}},
+	[IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+	    {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}},
+	[IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
+	    {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
+	    {"trimq_write", 2}, {NULL}},
+	[IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+	    {"asyncq_wait", 2}, {NULL}},
+	[IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2},
+	    {"async_read", 2}, {"async_write", 2}, {"scrub", 2},
+	    {"trim", 2}, {NULL}},
+};
+
+/*
+ * Second header line, one row per IOS_* index and one entry per printed
+ * column.
+ *
+ * Shorthand - if "columns" field not set, default to 1 column
+ */
+static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
+{
+	[IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {NULL}},
+	[IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}},
+	[IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
+	    {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"},
+	    {"pend"}, {"activ"}, {NULL}},
+	[IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+	    {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}},
+	[IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
+	    {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}},
+};
+
+/*
+ * Title of the leading name column when a histogram is being printed,
+ * indexed by the histogram's IOS_* index.  Only the two histogram entries
+ * are initialized.
+ */
+static const char *histo_to_title[] = {
+	[IOS_L_HISTO] = "latency",
+	[IOS_RQ_HISTO] = "req_size",
+};
+
+/*
+ * Count the entries of a name_and_columns_t array that precede the first
+ * entry with a NULL name.  The terminating entry itself is not counted.
+ */
+static unsigned int
+label_array_len(const name_and_columns_t *labels)
+{
+	int count;
+
+	for (count = 0; labels[count].name; count++)
+		continue;
+
+	return (count);
+}
+
+/*
+ * Count the strings in a NULL-terminated array of string pointers; the
+ * terminating NULL is not counted.  For example:
+ *
+ *     const char *foo[] = {"bar", "baz", NULL};
+ *
+ * yields 2.
+ */
+static uint64_t
+str_array_len(const char *array[])
+{
+	uint64_t count;
+
+	for (count = 0; array[count]; count++)
+		continue;
+
+	return (count);
+}
+
+
+/*
+ * Return a default column width for default/latency/queue columns. This does
+ * not include histograms, which have their columns autosized.
+ *
+ * Without cb_literal the width is 5, sized for the abbreviated number
+ * format; with cb_literal the per-type width from the table below is used.
+ */
+static unsigned int
+default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
+{
+	/*
+	 * Choose some sane default column sizes for printing the
+	 * raw numbers.
+	 */
+	static unsigned long widths[] = {
+		[IOS_DEFAULT] = 15, /* 1PB capacity */
+		[IOS_LATENCY] = 10, /* 1B ns = 10sec */
+		[IOS_QUEUES] = 6,   /* 1M queue entries */
+		[IOS_L_HISTO] = 10, /* 1B ns = 10sec */
+		[IOS_RQ_HISTO] = 6, /* 1M queue entries */
+	};
+	unsigned long width;
+
+	if (!cb->cb_literal)
+		width = 5; /* Normal niceprint */
+	else
+		width = widths[type];
+
+	return (width);
+}
+
+/*
+ * Print the column labels, i.e:
+ *
+ *   capacity     operations     bandwidth
+ * alloc   free   read  write   read  write  ...
+ *
+ * One row of 'labels' is printed for every bit set in cb->cb_flags, lowest
+ * bit first.  No trailing newline is written.
+ *
+ * If force_column_width is set, use it for the column width.  If not set, use
+ * the default column width.
+ */
+static void
+print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
+    const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
+{
+	unsigned int column_width = force_column_width;
+	uint64_t left;
+	int bit, col;
+
+	/* For each bit set in flags */
+	for (left = cb->cb_flags; left; left &= ~(1ULL << bit)) {
+		bit = lowbit64(left) - 1;
+		if (!force_column_width)
+			column_width = default_column_width(cb, bit);
+
+		/* Print our top labels centered over "read  write" label. */
+		for (col = 0; col < label_array_len(labels[bit]); col++) {
+			const char *name = labels[bit][col].name;
+			/*
+			 * We treat labels[][].columns == 0 as shorthand
+			 * for one column.  It makes writing out the label
+			 * tables more concise.
+			 */
+			unsigned int columns = MAX(1, labels[bit][col].columns);
+			unsigned int slen = strlen(name);
+			int rw_column_width, text_start, spaces_to_end;
+
+			/* Spanned columns plus the gaps between them. */
+			rw_column_width = (column_width * columns) +
+			    (2 * (columns - 1));
+
+			/* Padding ahead of the label, never negative. */
+			text_start = (int)((rw_column_width) / columns -
+			    slen / columns);
+			if (text_start < 0)
+				text_start = 0;
+
+			/* Padding after the label, never negative. */
+			spaces_to_end = rw_column_width - text_start - slen;
+			if (spaces_to_end < 0)
+				spaces_to_end = 0;
+
+			/*
+			 * Two spaces between columns, then the label with
+			 * its leading and trailing padding.
+			 */
+			printf("  %*s%s%*s", text_start, "", name,
+			    spaces_to_end, "");
+		}
+	}
+}
+
+
+/*
+ * print_cmd_columns - Print custom column titles from -c
+ *
+ * If the user specified the "zpool status|iostat -c" then print their custom
+ * column titles in the header.  For example, print_cmd_columns() would print
+ * the "  col1  col2" part of this:
+ *
+ * $ zpool iostat -vc 'echo col1=val1; echo col2=val2'
+ * ...
+ *	      capacity     operations     bandwidth
+ * pool        alloc   free   read  write   read  write  col1  col2
+ * ----------  -----  -----  -----  -----  -----  -----  ----  ----
+ * mypool       269K  1008M      0      0    107    946
+ *   mirror     269K  1008M      0      0    107    946
+ *     sdb         -      -      0      0    102    473  val1  val2
+ *     sdc         -      -      0      0      5    473  val1  val2
+ * ----------  -----  -----  -----  -----  -----  -----  ----  ----
+ *
+ * With use_dashes set, a run of dashes as wide as each column is printed
+ * instead of the column title.  Nothing is printed when the list is empty.
+ */
+static void
+print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes)
+{
+	int col, n;
+
+	if (vcdl->count == 0 || &vcdl->data[0] == NULL)
+		return;
+
+	/*
+	 * Each vdev cmd should have the same column names unless the user did
+	 * something weird with their cmd.  The list's unique column names
+	 * and widths are used for every vdev.
+	 */
+	for (col = 0; col < vcdl->uniq_cols_cnt; col++) {
+		printf("  ");
+
+		if (!use_dashes) {
+			printf_color(ANSI_BOLD, "%*s",
+			    vcdl->uniq_cols_width[col], vcdl->uniq_cols[col]);
+			continue;
+		}
+
+		for (n = 0; n < vcdl->uniq_cols_width[col]; n++)
+			printf("-");
+	}
+}
+
+
+/*
+ * Utility function to print out a line of dashes like:
+ *
+ * 	--------------------------------  -----  -----  -----  -----  -----
+ *
+ * ...or a dashed named-row line like:
+ *
+ * 	logs                                  -      -      -      -      -
+ *
+ * @cb:				iostat data
+ *
+ * @force_column_width		If non-zero, use the value as the column width.
+ * 				Otherwise use the default column widths.
+ *
+ * @name:			Print a dashed named-row line starting
+ * 				with @name.  Otherwise, print a regular
+ * 				dashed line.
+ *
+ * No trailing newline is written.
+ */
+static void
+print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width,
+    const char *name)
+{
+	const name_and_columns_t *labels;
+	const char *title;
+	unsigned int namewidth;
+	uint64_t left;
+	int bit;
+	int i;
+
+	/* Same title choice as the header, so the widths line up. */
+	if (!(cb->cb_flags & IOS_ANYHISTO_M))
+		title = cb->cb_vdev_names_count ? "vdev" : "pool";
+	else
+		title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+
+	namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+	    name ? strlen(name) : 0);
+
+	/* Leading cell: the row name, or dashes across the name column. */
+	if (!name) {
+		for (i = 0; i < namewidth; i++)
+			(void) printf("-");
+	} else {
+		printf("%-*s", namewidth, name);
+	}
+
+	/* For each bit in flags */
+	for (left = cb->cb_flags; left; left &= ~(1ULL << bit)) {
+		unsigned int column_width;
+
+		bit = lowbit64(left) - 1;
+		column_width = force_column_width ? force_column_width :
+		    default_column_width(cb, bit);
+
+		/* One cell per label on the second header line. */
+		labels = iostat_bottom_labels[bit];
+		for (i = 0; i < label_array_len(labels); i++) {
+			if (!name)
+				printf("  %.*s", column_width,
+				    "--------------------");
+			else
+				printf("  %*s-", column_width - 1, " ");
+		}
+	}
+}
+
+
+/*
+ * Print a dashed separator line (no row name) using the given column width;
+ * a force_column_width of zero selects the default widths.
+ */
+static void
+print_iostat_separator_impl(iostat_cbdata_t *cb,
+    unsigned int force_column_width)
+{
+	print_iostat_dashes(cb, force_column_width, NULL);
+}
+
+/*
+ * Print a dashed separator line using the default column widths.
+ */
+static void
+print_iostat_separator(iostat_cbdata_t *cb)
+{
+	print_iostat_separator_impl(cb, 0);
+}
+
+/*
+ * Print the three-line iostat header: the top labels, the bottom labels
+ * preceded by the name-column title, and a dashed separator.  When cb->vcdl
+ * is set, the custom command columns are appended to the last two lines.
+ *
+ * @force_column_width		If non-zero, use the value as the column width.
+ *				Otherwise use the default column widths.
+ *
+ * @histo_vdev_name		If non-NULL, printed in the name column of
+ *				the first line, which is otherwise blank.
+ */
+static void
+print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width,
+    const char *histo_vdev_name)
+{
+	const char *title;
+	unsigned int namewidth;
+
+	if (!(cb->cb_flags & IOS_ANYHISTO_M))
+		title = cb->cb_vdev_names_count ? "vdev" : "pool";
+	else
+		title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+
+	namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+	    histo_vdev_name ? strlen(histo_vdev_name) : 0);
+
+	/* Line 1: optional vdev name, then the top labels. */
+	if (!histo_vdev_name)
+		printf("%*s", namewidth, "");
+	else
+		printf("%-*s", namewidth, histo_vdev_name);
+
+	print_iostat_labels(cb, force_column_width, iostat_top_labels);
+	printf("\n");
+
+	/* Line 2: name-column title, bottom labels, custom columns. */
+	printf("%-*s", namewidth, title);
+	print_iostat_labels(cb, force_column_width, iostat_bottom_labels);
+	if (cb->vcdl != NULL)
+		print_cmd_columns(cb->vcdl, 0);
+	printf("\n");
+
+	/* Line 3: dashes under every column. */
+	print_iostat_separator_impl(cb, force_column_width);
+	if (cb->vcdl != NULL)
+		print_cmd_columns(cb->vcdl, 1);
+	printf("\n");
+}
+
+/*
+ * Print the iostat header with default column widths and no vdev name on
+ * the first line.
+ */
+static void
+print_iostat_header(iostat_cbdata_t *cb)
+{
+	print_iostat_header_impl(cb, 0, NULL);
+}
+
+
+/*
+ * Display a single statistic.
+ *
+ * The value is rendered by zfs_nicenum_format() in the requested format.
+ * In scripted mode it is preceded by a tab; otherwise it is preceded by two
+ * spaces and right-aligned in a field of column_size characters.
+ */
+static void
+print_one_stat(uint64_t value, enum zfs_nicenum_format format,
+    unsigned int column_size, boolean_t scripted)
+{
+	char text[64];
+
+	zfs_nicenum_format(value, text, sizeof (text), format);
+
+	if (!scripted) {
+		printf("  %*s", column_size, text);
+		return;
+	}
+
+	printf("\t%s", text);
+}
+
+/*
+ * Calculate the default vdev stats
+ *
+ * calcvs starts as a copy of newvs; its vs_ops[] and vs_bytes[] counters
+ * are then replaced by the difference between newvs and oldvs.  All other
+ * fields keep the newvs value.
+ */
+static void
+calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs,
+    vdev_stat_t *calcvs)
+{
+	int n;
+
+	memcpy(calcvs, newvs, sizeof (*calcvs));
+
+	n = 0;
+	while (n < ARRAY_SIZE(calcvs->vs_ops)) {
+		calcvs->vs_ops[n] = (newvs->vs_ops[n] - oldvs->vs_ops[n]);
+		n++;
+	}
+
+	n = 0;
+	while (n < ARRAY_SIZE(calcvs->vs_bytes)) {
+		calcvs->vs_bytes[n] =
+		    (newvs->vs_bytes[n] - oldvs->vs_bytes[n]);
+		n++;
+	}
+}
+
+/*
+ * Internal representation of the extended iostats data.
+ *
+ * The extended iostat stats are exported in nvlists as either uint64_t arrays
+ * or single uint64_t's.  We make both look like arrays to make them easier
+ * to process.  In order to make single uint64_t's look like arrays, we set
+ * __data to the stat data, and then set data = &__data with count = 1.  Then,
+ * consumers can just use data[] and count.
+ *
+ * Note that in the single-value case "data" points into the structure itself,
+ * so copying the structure by value leaves "data" pointing at the original.
+ */
+struct stat_array {
+	uint64_t *data;	/* Points at the values (or at __data, see above) */
+	uint_t count;	/* Number of entries in data[] */
+	uint64_t __data; /* Only used when data is a single uint64_t */
+};
+
+/*
+ * Return the largest value found in any of the "len" stat arrays in nva[],
+ * or 0 if len is 0.
+ */
+static uint64_t
+stat_histo_max(struct stat_array *nva, unsigned int len)
+{
+	uint64_t largest = 0;
+	unsigned int idx;
+
+	for (idx = 0; idx < len; idx++) {
+		uint64_t cur = array64_max(nva[idx].data, nva[idx].count);
+
+		if (cur > largest)
+			largest = cur;
+	}
+
+	return (largest);
+}
+
+/*
+ * Look up the nvpair called "name" in nvl and expose its value through *nva.
+ *
+ * A uint64_t array is referenced directly.  A single uint64_t is stored in
+ * nva->__data and presented as a one-element array.  Any other nvpair type
+ * yields EINVAL.  The lookup itself is verify()'d, so the pair must exist.
+ *
+ * Returns 0 on success or an errno value on failure.
+ */
+static int
+nvpair64_to_stat_array(nvlist_t *nvl, const char *name,
+    struct stat_array *nva)
+{
+	nvpair_t *pair;
+	int err;
+
+	verify(nvlist_lookup_nvpair(nvl, name, &pair) == 0);
+
+	switch (nvpair_type(pair)) {
+	case DATA_TYPE_UINT64_ARRAY:
+		return (nvpair_value_uint64_array(pair, &nva->data,
+		    &nva->count));
+	case DATA_TYPE_UINT64:
+		/* Wrap the scalar so callers can treat it as data[0] */
+		err = nvpair_value_uint64(pair, &nva->__data);
+		nva->data = &nva->__data;
+		nva->count = 1;
+		return (err);
+	default:
+		/* Not a uint64_t */
+		return (EINVAL);
+	}
+}
+
+/*
+ * Given a list of nvlist names, look up the extended stats in newnv and oldnv,
+ * subtract them, and return the results in a newly allocated stat_array.
+ * You must free the returned array after you are done with it with
+ * free_calc_stats().
+ *
+ * Additionally, you can set "oldnv" to NULL if you simply want the newnv
+ * values.
+ *
+ * names:  array of "len" nvpair names to look up in the extended stats
+ * len:    number of entries in names[] (and in the returned array)
+ * oldnv:  previous vdev nvlist, or NULL
+ * newnv:  current vdev nvlist
+ *
+ * Every lookup is verify()'d, so all names must exist in the nvlists given.
+ */
+static struct stat_array *
+calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv,
+    nvlist_t *newnv)
+{
+	nvlist_t *oldnvx = NULL, *newnvx;
+	struct stat_array *calcnva;
+	unsigned int j;
+
+	/* Extract our extended stats nvlist from the main list */
+	verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+	    &newnvx) == 0);
+	if (oldnv) {
+		verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+		    &oldnvx) == 0);
+	}
+
+	calcnva = safe_malloc(sizeof (struct stat_array) * len);
+
+	for (j = 0; j < len; j++) {
+		/*
+		 * The looked-up views are only needed while processing this
+		 * name, so keep them on the stack instead of allocating
+		 * whole scratch arrays.
+		 */
+		struct stat_array newsa, oldsa;
+		size_t nbytes;
+		uint_t i, common;
+
+		verify(nvpair64_to_stat_array(newnvx, names[j],
+		    &newsa) == 0);
+		calcnva[j].count = newsa.count;
+		nbytes = (size_t)newsa.count * sizeof (calcnva[j].data[0]);
+		calcnva[j].data = safe_malloc(nbytes);
+		memcpy(calcnva[j].data, newsa.data, nbytes);
+
+		if (oldnvx == NULL)
+			continue;
+
+		verify(nvpair64_to_stat_array(oldnvx, names[j],
+		    &oldsa) == 0);
+
+		/*
+		 * The result buffer is sized by the new count.  Never index
+		 * past it even if the old snapshot carries more entries.
+		 */
+		common = MIN(oldsa.count, newsa.count);
+		for (i = 0; i < common; i++)
+			calcnva[j].data[i] -= oldsa.data[i];
+	}
+
+	return (calcnva);
+}
+
+/*
+ * Release an array returned by calc_and_alloc_stats_ex(): first each
+ * element's data buffer, then the array itself.
+ */
+static void
+free_calc_stats(struct stat_array *nva, unsigned int len)
+{
+	unsigned int idx = 0;
+
+	while (idx < len) {
+		free(nva[idx].data);
+		idx++;
+	}
+
+	free(nva);
+}
+
+/*
+ * Print the body of a histogram: one output line per bucket, each holding the
+ * bucket label followed by that bucket's value from every array in nva[].
+ *
+ * nva:           "len" stat arrays, all assumed to have nva[0].count buckets
+ * cb:            iostat callback state (flags, literal and scripted modes)
+ * column_width:  width of each value column in non-scripted mode
+ * namewidth:     width of the bucket label column in non-scripted mode
+ * scale:         factor applied to every value before printing
+ */
+static void
+print_iostat_histo(struct stat_array *nva, unsigned int len,
+    iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth,
+    double scale)
+{
+	unsigned int i, j;
+	char buf[6];
+	uint64_t val;
+	enum zfs_nicenum_format format;
+	unsigned int buckets;
+	unsigned int start_bucket;
+
+	if (cb->cb_literal)
+		format = ZFS_NICENUM_RAW;
+	else
+		format = ZFS_NICENUM_1024;
+
+	/* All these histos are the same size, so just use nva[0].count */
+	buckets = nva[0].count;
+
+	if (cb->cb_flags & IOS_RQ_HISTO_M) {
+		/* Start at 512 - req size should never be lower than this */
+		start_bucket = 9;
+	} else {
+		start_bucket = 0;
+	}
+
+	for (j = start_bucket; j < buckets; j++) {
+		/*
+		 * Print histogram bucket label.  The shifts use a 64-bit
+		 * constant: "1UL" is only 32 bits wide on ILP32 platforms, and
+		 * shifting it by 32 or more is undefined.
+		 */
+		if (cb->cb_flags & IOS_L_HISTO_M) {
+			/* Ending range of this bucket */
+			val = (1ULL << (j + 1)) - 1;
+			zfs_nicetime(val, buf, sizeof (buf));
+		} else {
+			/* Request size (starting range of bucket) */
+			val = (1ULL << j);
+			zfs_nicenum(val, buf, sizeof (buf));
+		}
+
+		if (cb->cb_scripted)
+			printf("%llu", (u_longlong_t)val);
+		else
+			printf("%-*s", namewidth, buf);
+
+		/* Print the values on the line */
+		for (i = 0; i < len; i++) {
+			print_one_stat(nva[i].data[j] * scale, format,
+			    column_width, cb->cb_scripted);
+		}
+		printf("\n");
+	}
+}
+
+/*
+ * Print a line consisting of "length" dashes followed by a newline.
+ */
+static void
+print_solid_separator(unsigned int length)
+{
+	unsigned int col;
+
+	for (col = 0; col < length; col++)
+		printf("-");
+
+	printf("\n");
+}
+
+/*
+ * Print the histogram selected by cb->cb_flags for a single vdev: a header
+ * (or just the vdev name in scripted mode), the bucket rows, and a closing
+ * solid separator (non-scripted mode only).
+ *
+ * oldnv may be NULL, in which case the newnv values are printed as-is rather
+ * than as a delta.  "scale" is applied to every value printed.
+ */
+static void
+print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv, double scale, const char *name)
+{
+	unsigned int column_width;
+	unsigned int namewidth;
+	unsigned int entire_width;
+	enum iostat_type type;
+	struct stat_array *nva;
+	const char **names;
+	unsigned int names_len;
+
+	/* What type of histo are we? */
+	type = IOS_HISTO_IDX(cb->cb_flags);
+
+	/* Get NULL-terminated array of nvlist names for our histo */
+	names = vsx_type_to_nvlist[type];
+	names_len = str_array_len(names); /* num of names */
+
+	nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv);
+
+	column_width = 5;
+	if (cb->cb_literal) {
+		uint64_t histo_max = stat_histo_max(nva, names_len);
+
+		/*
+		 * Widen the columns to fit the largest raw value.  Skip the
+		 * digit count when the maximum is 0: log10(0) is -infinity,
+		 * and converting that to an unsigned int is undefined.
+		 */
+		if (histo_max != 0) {
+			unsigned int digits =
+			    (unsigned int) log10(histo_max) + 1;
+
+			column_width = MAX(column_width, digits);
+		}
+	}
+
+	namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[type]));
+
+	/*
+	 * Calculate the entire line width of what we're printing.  The
+	 * +2 is for the two spaces between columns:
+	 */
+	/*	 read  write				*/
+	/*	-----  -----				*/
+	/*	|___|  <---------- column_width		*/
+	/*						*/
+	/*	|__________|  <--- entire_width		*/
+	/*						*/
+	entire_width = namewidth + (column_width + 2) *
+	    label_array_len(iostat_bottom_labels[type]);
+
+	if (cb->cb_scripted)
+		printf("%s\n", name);
+	else
+		print_iostat_header_impl(cb, column_width, name);
+
+	print_iostat_histo(nva, names_len, cb, column_width,
+	    namewidth, scale);
+
+	free_calc_stats(nva, names_len);
+	if (!cb->cb_scripted)
+		print_solid_separator(entire_width);
+}
+
+/*
+ * Calculate the average latency of a power-of-two latency histogram
+ *
+ * histo:    histo[i] is the number of samples that fell into bucket i
+ * buckets:  number of entries in histo[]
+ *
+ * Returns the count-weighted mean of the bucket midpoints (integer division),
+ * or 0 if the histogram holds no samples.
+ */
+static uint64_t
+single_histo_average(uint64_t *histo, unsigned int buckets)
+{
+	unsigned int i;
+	uint64_t count = 0, total = 0;
+
+	for (i = 0; i < buckets; i++) {
+		uint64_t start, midpoint;
+
+		if (histo[i] == 0)
+			continue;
+
+		/*
+		 * Our buckets are power-of-two latency ranges.  Use the
+		 * midpoint latency of each bucket to calculate the average.
+		 * For example:
+		 *
+		 * Bucket          Midpoint
+		 * 8ns-15ns:       12ns
+		 * 16ns-31ns:      24ns
+		 * ...
+		 *
+		 * The shift uses a 64-bit constant: "1UL" is only 32 bits
+		 * wide on ILP32 platforms, and shifting it by 32 or more is
+		 * undefined.
+		 */
+		start = 1ULL << i;
+		midpoint = start + start / 2;
+
+		total += histo[i] * midpoint;
+		count += histo[i];
+	}
+
+	/* Prevent divide by zero */
+	return (count == 0 ? 0 : total / count);
+}
+
+/*
+ * Print the queue columns for one vdev: element 0 of each pending/active
+ * queue stat listed below, taken from newnv.  These are printed as-is and
+ * not as a delta, so oldnv is not consulted.
+ */
+static void
+print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv)
+{
+	const char *names[] = {
+		ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+		ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+		ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+		ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+		ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+		ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+		ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+	};
+	const unsigned int nqueues = ARRAY_SIZE(names);
+	unsigned int width = default_column_width(cb, IOS_QUEUES);
+	enum zfs_nicenum_format fmt;
+	struct stat_array *depths;
+	unsigned int q;
+
+	/* Passing NULL for the old nvlist yields the raw current values */
+	depths = calc_and_alloc_stats_ex(names, nqueues, NULL, newnv);
+
+	fmt = cb->cb_literal ? ZFS_NICENUM_RAW : ZFS_NICENUM_1024;
+
+	for (q = 0; q < nqueues; q++) {
+		uint64_t depth = depths[q].data[0];
+
+		print_one_stat(depth, fmt, width, cb->cb_scripted);
+	}
+
+	free_calc_stats(depths, nqueues);
+}
+
+/*
+ * Print the average-latency columns for one vdev.  Each latency histogram
+ * listed below is turned into a delta between oldnv and newnv (or used as-is
+ * when oldnv is NULL) and reduced to one value by single_histo_average().
+ */
+static void
+print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
+    nvlist_t *newnv)
+{
+	const char *names[] = {
+		ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+		ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+	};
+	const unsigned int nhistos = ARRAY_SIZE(names);
+	unsigned int width = default_column_width(cb, IOS_LATENCY);
+	enum zfs_nicenum_format fmt;
+	struct stat_array *histos;
+	unsigned int h;
+
+	histos = calc_and_alloc_stats_ex(names, nhistos, oldnv, newnv);
+
+	fmt = cb->cb_literal ? ZFS_NICENUM_RAWTIME : ZFS_NICENUM_TIME;
+
+	/* Print our avg latencies on the line */
+	for (h = 0; h < nhistos; h++) {
+		uint64_t avg = single_histo_average(histos[h].data,
+		    histos[h].count);
+
+		print_one_stat(avg, fmt, width, cb->cb_scripted);
+	}
+
+	free_calc_stats(histos, nhistos);
+}
+
+/*
+ * Print default statistics (capacity/operations/bandwidth)
+ *
+ * The two capacity columns (allocated, free) are printed only when
+ * vs->vs_space is non-zero; otherwise a placeholder is shown ('0' in literal
+ * mode, '-' otherwise).  The read/write operation and byte counters are
+ * multiplied by "scale" before being printed.
+ */
+static void
+print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale)
+{
+	unsigned int width = default_column_width(cb, IOS_DEFAULT);
+	enum zfs_nicenum_format fmt =
+	    cb->cb_literal ? ZFS_NICENUM_RAW : ZFS_NICENUM_1024;
+	/* char to print for "not applicable" values */
+	char na = cb->cb_literal ? '0' : '-';
+	const int io_types[] = { ZIO_TYPE_READ, ZIO_TYPE_WRITE };
+	unsigned int t;
+
+	/* only toplevel vdevs have capacity stats */
+	if (vs->vs_space != 0) {
+		print_one_stat(vs->vs_alloc, fmt, width, cb->cb_scripted);
+		print_one_stat(vs->vs_space - vs->vs_alloc, fmt, width,
+		    cb->cb_scripted);
+	} else if (cb->cb_scripted) {
+		printf("\t%c\t%c", na, na);
+	} else {
+		printf("  %*c  %*c", width, na, width, na);
+	}
+
+	/* operations: read, then write */
+	for (t = 0; t < 2; t++) {
+		print_one_stat((uint64_t)(vs->vs_ops[io_types[t]] * scale),
+		    fmt, width, cb->cb_scripted);
+	}
+
+	/* bandwidth: read, then write */
+	for (t = 0; t < 2; t++) {
+		print_one_stat((uint64_t)(vs->vs_bytes[io_types[t]] * scale),
+		    fmt, width, cb->cb_scripted);
+	}
+}
+
+/*
+ * Names of the vdev groups that print_vdev_stats() lists after the normal
+ * top-level vdevs, in the order they are printed.  Each name is both matched
+ * against a child's allocation bias (log devices are matched as
+ * VDEV_ALLOC_CLASS_LOGS) and used as the label on that group's dashed row.
+ */
+static const char *class_name[] = {
+	VDEV_ALLOC_BIAS_DEDUP,
+	VDEV_ALLOC_BIAS_SPECIAL,
+	VDEV_ALLOC_CLASS_LOGS
+};
+
+/*
+ * Print out all the statistics for the given vdev.  This can either be the
+ * toplevel configuration, or called recursively.
+ *
+ * zhp:    pool the vdev belongs to
+ * name:   display name of this vdev.  NOTE(review): an earlier version of
+ *         this comment said 'name' may be NULL, but the code below passes it
+ *         to strcmp() unconditionally -- callers must pass a valid string.
+ * oldnv:  this vdev's nvlist from the previous sample, or NULL for none
+ * newnv:  this vdev's nvlist from the current sample
+ * cb:     iostat callback state (flags, widths, vdev-name filter, ...)
+ * depth:  indentation, in characters, applied to the printed name
+ *
+ * When cb->cb_verbose is set, the children are visited recursively in this
+ * order: normal top-level vdevs, then each class in class_name[], then the
+ * L2ARC (cache) devices.
+ *
+ * Returns the number of stat lines printed.
+ */
+static unsigned int
+print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
+    nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
+{
+	nvlist_t **oldchild, **newchild;
+	uint_t c, children, oldchildren;
+	vdev_stat_t *oldvs, *newvs, *calcvs;
+	vdev_stat_t zerovs = { 0 };
+	char *vname;
+	int i;
+	int ret = 0;
+	uint64_t tdelta;
+	double scale;
+
+	/* Indirect vdevs are never displayed (and neither are children) */
+	if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+		return (ret);
+
+	calcvs = safe_malloc(sizeof (*calcvs));
+
+	/* With no previous sample, diff against an all-zero baseline */
+	if (oldnv != NULL) {
+		verify(nvlist_lookup_uint64_array(oldnv,
+		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
+	} else {
+		oldvs = &zerovs;
+	}
+
+	/* Do we only want to see a specific vdev? */
+	for (i = 0; i < cb->cb_vdev_names_count; i++) {
+		/* Yes we do.  Is this the vdev? */
+		if (strcmp(name, cb->cb_vdev_names[i]) == 0) {
+			/*
+			 * This is our vdev.  Since it is the only vdev we
+			 * will be displaying, make depth = 0 so that it
+			 * doesn't get indented.
+			 */
+			depth = 0;
+			break;
+		}
+	}
+
+	/* The loop ran to completion only if no requested name matched */
+	if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) {
+		/* Couldn't match the name */
+		goto children;
+	}
+
+
+	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&newvs, &c) == 0);
+
+	/*
+	 * Print the vdev name unless it is a histogram.  Histograms
+	 * display the vdev name in the header itself.
+	 */
+	if (!(cb->cb_flags & IOS_ANYHISTO_M)) {
+		if (cb->cb_scripted) {
+			printf("%s", name);
+		} else {
+			/* Names too long for the column are not padded */
+			if (strlen(name) + depth > cb->cb_namewidth)
+				(void) printf("%*s%s", depth, "", name);
+			else
+				(void) printf("%*s%s%*s", depth, "", name,
+				    (int)(cb->cb_namewidth - strlen(name) -
+				    depth), "");
+		}
+	}
+
+	/* Calculate our scaling factor */
+	tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
+	if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) {
+		/*
+		 * If we specify printing histograms with no time interval, then
+		 * print the histogram numbers over the entire lifetime of the
+		 * vdev.
+		 */
+		scale = 1;
+	} else {
+		/* Avoid dividing by zero when both samples share a timestamp */
+		if (tdelta == 0)
+			scale = 1.0;
+		else
+			scale = (double)NANOSEC / tdelta;
+	}
+
+	if (cb->cb_flags & IOS_DEFAULT_M) {
+		calc_default_iostats(oldvs, newvs, calcvs);
+		print_iostat_default(calcvs, cb, scale);
+	}
+	if (cb->cb_flags & IOS_LATENCY_M)
+		print_iostat_latency(cb, oldnv, newnv);
+	if (cb->cb_flags & IOS_QUEUES_M)
+		print_iostat_queues(cb, oldnv, newnv);
+	if (cb->cb_flags & IOS_ANYHISTO_M) {
+		printf("\n");
+		print_iostat_histos(cb, oldnv, newnv, scale, name);
+	}
+
+	/* Append the -c script output for vdevs that have a path */
+	if (cb->vcdl != NULL) {
+		char *path;
+		if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH,
+		    &path) == 0) {
+			printf("  ");
+			zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path);
+		}
+	}
+
+	if (!(cb->cb_flags & IOS_ANYHISTO_M))
+		printf("\n");
+
+	ret++;
+
+children:
+
+	free(calcvs);
+
+	/* Children are only listed in verbose mode */
+	if (!cb->cb_verbose)
+		return (ret);
+
+	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
+	    &newchild, &children) != 0)
+		return (ret);
+
+	if (oldnv) {
+		if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
+		    &oldchild, &oldchildren) != 0)
+			return (ret);
+
+		/*
+		 * oldchild[] and newchild[] are indexed together below, so
+		 * only walk the indices that exist in both.
+		 */
+		children = MIN(oldchildren, children);
+	}
+
+	/*
+	 * print normal top-level devices
+	 */
+	for (c = 0; c < children; c++) {
+		uint64_t ishole = B_FALSE, islog = B_FALSE;
+
+		(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
+		    &ishole);
+
+		(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
+		    &islog);
+
+		if (ishole || islog)
+			continue;
+
+		/* Children with an allocation bias are printed further down */
+		if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			continue;
+
+		vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+		    cb->cb_name_flags);
+		ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
+		    newchild[c], cb, depth + 2);
+		free(vname);
+	}
+
+	/*
+	 * print all other top-level devices
+	 *
+	 * n indexes class_name[]; the bound of 3 must match the number of
+	 * entries in that table.
+	 */
+	for (uint_t n = 0; n < 3; n++) {
+		boolean_t printed = B_FALSE;
+
+		for (c = 0; c < children; c++) {
+			uint64_t islog = B_FALSE;
+			char *bias = NULL;
+			char *type = NULL;
+
+			(void) nvlist_lookup_uint64(newchild[c],
+			    ZPOOL_CONFIG_IS_LOG, &islog);
+			if (islog) {
+				bias = VDEV_ALLOC_CLASS_LOGS;
+			} else {
+				(void) nvlist_lookup_string(newchild[c],
+				    ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+				(void) nvlist_lookup_string(newchild[c],
+				    ZPOOL_CONFIG_TYPE, &type);
+			}
+			if (bias == NULL || strcmp(bias, class_name[n]) != 0)
+				continue;
+			/*
+			 * NOTE(review): "type" stays NULL if the
+			 * ZPOOL_CONFIG_TYPE lookup above fails (its result is
+			 * ignored) -- confirm every child carries a type.
+			 */
+			if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+				continue;
+
+			/* Emit the class heading before its first member */
+			if (!printed) {
+				/*
+				 * NOTE(review): this tests the cb_vdev_names
+				 * pointer, while the rest of this function
+				 * tests cb_vdev_names_count -- confirm the
+				 * two are always in agreement.
+				 */
+				if ((!(cb->cb_flags & IOS_ANYHISTO_M)) &&
+				    !cb->cb_scripted && !cb->cb_vdev_names) {
+					print_iostat_dashes(cb, 0,
+					    class_name[n]);
+				}
+				printf("\n");
+				printed = B_TRUE;
+			}
+
+			vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+			    cb->cb_name_flags);
+			ret += print_vdev_stats(zhp, vname, oldnv ?
+			    oldchild[c] : NULL, newchild[c], cb, depth + 2);
+			free(vname);
+		}
+	}
+
+	/*
+	 * Include level 2 ARC devices in iostat output
+	 */
+	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
+	    &newchild, &children) != 0)
+		return (ret);
+
+	if (oldnv) {
+		if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
+		    &oldchild, &oldchildren) != 0)
+			return (ret);
+
+		/* As above: only walk indices present in both samples */
+		children = MIN(oldchildren, children);
+	}
+
+	if (children > 0) {
+		if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted &&
+		    !cb->cb_vdev_names) {
+			print_iostat_dashes(cb, 0, "cache");
+		}
+		printf("\n");
+
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+			    cb->cb_name_flags);
+			ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c]
+			    : NULL, newchild[c], cb, depth + 2);
+			free(vname);
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * Pool iterator callback: refresh the stats for one pool.
+ *
+ * Returns -1 if zpool_refresh_stats() fails, 0 otherwise.
+ */
+static int
+refresh_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cbdata = data;
+	boolean_t gone;
+
+	if (zpool_refresh_stats(zhp, &gone) != 0)
+		return (-1);
+
+	/*
+	 * If the pool has disappeared, remove it from the list and continue.
+	 */
+	if (gone)
+		pool_list_remove(cbdata->cb_list, zhp);
+
+	return (0);
+}
+
+/*
+ * Callback to print out the iostats for the given pool.
+ *
+ * On the first iteration (cb_iteration == 1) the previous config is ignored,
+ * so the stats are printed without a baseline.  Returns the number of stat
+ * lines printed by print_vdev_stats().
+ */
+static int
+print_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cb = data;
+	nvlist_t *prev_config, *cur_config;
+	nvlist_t *prev_root = NULL, *cur_root;
+	int lines;
+
+	cur_config = zpool_get_config(zhp, &prev_config);
+
+	if (cb->cb_iteration == 1)
+		prev_config = NULL;
+
+	verify(nvlist_lookup_nvlist(cur_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &cur_root) == 0);
+
+	if (prev_config != NULL) {
+		verify(nvlist_lookup_nvlist(prev_config, ZPOOL_CONFIG_VDEV_TREE,
+		    &prev_root) == 0);
+	}
+
+	lines = print_vdev_stats(zhp, zpool_get_name(zhp), prev_root, cur_root,
+	    cb, 0);
+
+	/*
+	 * A trailing separator is only printed for plain verbose output that
+	 * actually produced lines: no histograms, no scripted mode, and no
+	 * vdev-name filter.
+	 */
+	if (lines == 0 || (cb->cb_flags & IOS_ANYHISTO_M) ||
+	    cb->cb_scripted || !cb->cb_verbose || cb->cb_vdev_names_count)
+		return (lines);
+
+	print_iostat_separator(cb);
+	if (cb->vcdl != NULL)
+		print_cmd_columns(cb->vcdl, 1);
+	printf("\n");
+
+	return (lines);
+}
+
+/*
+ * Return the output width in columns: 999 when stdout is not a terminal,
+ * the terminal's reported column count when it can be queried, and 80
+ * otherwise.
+ */
+static int
+get_columns(void)
+{
+	struct winsize size;
+
+	if (!isatty(STDOUT_FILENO))
+		return (999);
+
+	if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &size) == 0)
+		return (size.ws_col);
+
+	return (80);
+}
+
+/*
+ * Return the required length of the pool/vdev name column.  The minimum
+ * allowed width and output formatting flags must be provided.
+ *
+ * If the pool has no config, min_width is returned.  Otherwise the result is
+ * the larger of the pool name length and either min_width (non-verbose) or
+ * the widest vdev name as computed by max_width() (verbose).
+ */
+static int
+get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose)
+{
+	nvlist_t *config, *nvroot;
+	unsigned int poolname_len;
+	int other_width;
+
+	config = zpool_get_config(zhp, NULL);
+	if (config == NULL)
+		return (min_width);
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	poolname_len = strlen(zpool_get_name(zhp));
+
+	if (verbose == B_FALSE)
+		other_width = min_width;
+	else
+		other_width = max_width(zhp, nvroot, 0, min_width, flags);
+
+	return (MAX(poolname_len, other_width));
+}
+
+/*
+ * Parse the input string, get the 'interval' and 'count' value if there is one.
+ *
+ * Up to two trailing numeric arguments are consumed from argv.  With one,
+ * it is the interval.  With two, the last is the count and the one before it
+ * is the interval.  *argcp is reduced by the number of arguments consumed.
+ * A zero interval is reported as a usage error.
+ */
+static void
+get_interval_count(int *argcp, char **argv, float *iv,
+    unsigned long *cnt)
+{
+	float ival = 0;
+	unsigned long reps = 0;
+	int remaining = *argcp;
+	char *tail;
+
+	/*
+	 * Determine if the last argument is an integer or a pool name
+	 */
+	if (remaining > 0 && zfs_isnumber(argv[remaining - 1])) {
+		errno = 0;
+		ival = strtof(argv[remaining - 1], &tail);
+
+		if (*tail != '\0' || errno != 0) {
+			/*
+			 * If this is not a valid number, just plow on.  The
+			 * user will get a more informative error message later
+			 * on.
+			 */
+			ival = 0;
+		} else {
+			if (ival == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(B_FALSE);
+			}
+			/*
+			 * Ignore the last parameter
+			 */
+			remaining--;
+		}
+	}
+
+	/*
+	 * If the last argument is also an integer, then we have both a count
+	 * and an interval.
+	 */
+	if (remaining > 0 && zfs_isnumber(argv[remaining - 1])) {
+		/* What was parsed above was really the count */
+		reps = ival;
+
+		errno = 0;
+		ival = strtof(argv[remaining - 1], &tail);
+
+		if (*tail != '\0' || errno != 0) {
+			ival = 0;
+		} else {
+			if (ival == 0) {
+				(void) fprintf(stderr, gettext("interval "
+				    "cannot be zero\n"));
+				usage(B_FALSE);
+			}
+
+			/*
+			 * Ignore the last parameter
+			 */
+			remaining--;
+		}
+	}
+
+	*iv = ival;
+	*cnt = reps;
+	*argcp = remaining;
+}
+
+/*
+ * Set the global timestamp format from a -T option character: 'u' selects
+ * UDATE and 'd' selects DDATE.  Anything else is a usage error.
+ */
+static void
+get_timestamp_arg(char c)
+{
+	switch (c) {
+	case 'u':
+		timestamp_fmt = UDATE;
+		break;
+	case 'd':
+		timestamp_fmt = DDATE;
+		break;
+	default:
+		usage(B_FALSE);
+		break;
+	}
+}
+
+/*
+ * Return stat flags that are supported by all pools by both the module and
+ * zpool iostat.  "*data" should be initialized to all 0xFFs before running.
+ * It will get ANDed down until only the flags that are supported on all pools
+ * remain.
+ *
+ * An extended stat type counts as supported only when every nvpair name
+ * listed for it in vsx_type_to_nvlist[] exists in the pool's extended stats.
+ * Always returns 0.
+ */
+static int
+get_stat_flags_cb(zpool_handle_t *zhp, void *data)
+{
+	uint64_t *mask = data;
+	nvlist_t *config, *nvroot, *nvx;
+	uint64_t supported = 0;
+	int type, n;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+
+	/* Default stats are always supported, but for completeness.. */
+	if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS))
+		supported |= IOS_DEFAULT_M;
+
+	/*
+	 * Get our extended stats nvlist from the main list.  If there are no
+	 * extended stats they're probably running an older module.  No big
+	 * deal, we support that too: only the default flag can survive.
+	 */
+	if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX,
+	    &nvx) == 0) {
+		/* For each extended stat, check all its nvpairs exist */
+		for (type = 0; type < ARRAY_SIZE(vsx_type_to_nvlist); type++) {
+			boolean_t all_present = B_TRUE;
+
+			if (!vsx_type_to_nvlist[type][0])
+				continue;
+
+			for (n = 0; vsx_type_to_nvlist[type][n]; n++) {
+				if (!nvlist_exists(nvx,
+				    vsx_type_to_nvlist[type][n])) {
+					all_present = B_FALSE;
+					break;
+				}
+			}
+
+			if (all_present)
+				supported |= (1ULL << type);
+			else
+				supported &= ~(1ULL << type);
+		}
+	}
+
+	*mask &= supported;
+	return (0);
+}
+
+/*
+ * Return a bitmask of stats that are supported on all pools by both the module
+ * and zpool iostat.
+ */
+static uint64_t
+get_stat_flags(zpool_list_t *list)
+{
+	/* Start with every bit set ... */
+	uint64_t supported = ~(uint64_t)0;
+
+	/*
+	 * ... and let get_stat_flags_cb() lop off bits until only the flags
+	 * that are supported on all pools remain.
+	 */
+	pool_list_iter(list, B_FALSE, get_stat_flags_cb, &supported);
+
+	return (supported);
+}
+
+/*
+ * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise.
+ * The vdev's name is generated using cb_data->cb_name_flags.
+ */
+static int
+is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data)
+{
+	iostat_cbdata_t *cb = cb_data;
+	char *vdev_name;
+	int matched;
+
+	vdev_name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags);
+	matched = (strcmp(vdev_name, cb->cb_vdev_names[0]) == 0) ? 1 : 0;
+	free(vdev_name);
+
+	return (matched);
+}
+
+/*
+ * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise.
+ *
+ * Runs is_vdev_cb() over the vdevs of pool "zhp" via for_each_vdev() and
+ * returns its result.  The (zhp, cb_data) signature lets this be passed to
+ * for_each_pool() as its callback.
+ */
+static int
+is_vdev(zpool_handle_t *zhp, void *cb_data)
+{
+	return (for_each_vdev(zhp, is_vdev_cb, cb_data));
+}
+
+/*
+ * Check if vdevs are in a pool
+ *
+ * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise
+ * return 0.  If pool_name is NULL, then search all pools.
+ *
+ * cb->cb_vdev_names is borrowed while searching and restored before
+ * returning.
+ */
+static int
+are_vdevs_in_pool(int argc, char **argv, char *pool_name,
+    iostat_cbdata_t *cb)
+{
+	char **saved_names;
+	int pool_count = (pool_name != NULL) ? 1 : 0;
+	int found = 0;
+	int idx = 0;
+
+	if (argc == 0 || *argv == NULL)
+		return (0);
+
+	/* Temporarily hijack cb_vdev_names for a second... */
+	saved_names = cb->cb_vdev_names;
+
+	/* Go through our list of prospective vdev names */
+	while (idx < argc) {
+		cb->cb_vdev_names = &argv[idx];
+		idx++;
+
+		/* Is this name a vdev in our pools? */
+		found = for_each_pool(pool_count, &pool_name, B_TRUE, NULL,
+		    is_vdev, cb);
+
+		/* Stop at the first name that does not match */
+		if (found == 0)
+			break;
+	}
+
+	cb->cb_vdev_names = saved_names;
+
+	return (found);
+}
+
+/*
+ * Pool iterator callback: return 1 if the pool's name equals the string
+ * passed in "data", 0 otherwise.
+ */
+static int
+is_pool_cb(zpool_handle_t *zhp, void *data)
+{
+	const char *wanted = data;
+
+	return (strcmp(wanted, zpool_get_name(zhp)) == 0 ? 1 : 0);
+}
+
+/*
+ * Do we have a pool named *name?  If so, return 1, otherwise 0.
+ *
+ * Passes a pool count of 0 and a NULL pool list to for_each_pool(), with
+ * is_pool_cb() as the per-pool callback.
+ */
+static int
+is_pool(char *name)
+{
+	return (for_each_pool(0, NULL, B_TRUE, NULL,  is_pool_cb, name));
+}
+
+/*
+ * Are all our argv[] strings pool names?  If so return 1, 0 otherwise.
+ * An empty argument list yields 0.
+ */
+static int
+are_all_pools(int argc, char **argv)
+{
+	int idx;
+
+	if (argc == 0 || *argv == NULL)
+		return (0);
+
+	/* Check from the last argument back to the first */
+	for (idx = argc - 1; idx >= 0; idx--) {
+		if (!is_pool(argv[idx]))
+			return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Helper function to print out vdev/pool names we can't resolve.  Used for an
+ * error message.
+ *
+ * Each argv[] entry is written to stderr on its own line together with a
+ * classification: a pool, a vdev in "pool_name", a vdev in another pool,
+ * or unknown.
+ */
+static void
+error_list_unresolved_vdevs(int argc, char **argv, char *pool_name,
+    iostat_cbdata_t *cb)
+{
+	int idx;
+
+	for (idx = 0; idx < argc; idx++) {
+		char *candidate = argv[idx];
+		char *kind;
+
+		/* The first classification that applies wins */
+		if (is_pool(candidate)) {
+			kind = gettext("pool");
+		} else if (are_vdevs_in_pool(1, &candidate, pool_name, cb)) {
+			kind = gettext("vdev in this pool");
+		} else if (are_vdevs_in_pool(1, &candidate, NULL, cb)) {
+			kind = gettext("vdev in another pool");
+		} else {
+			kind = gettext("unknown");
+		}
+
+		fprintf(stderr, "\t%s (%s)\n", candidate, kind);
+	}
+}
+
+/*
+ * Same as get_interval_count(), but with additional checks to not misinterpret
+ * guids as interval/count values.  Assumes VDEV_NAME_GUID is set in
+ * cb.cb_name_flags.
+ *
+ * Up to two trailing arguments that are not vdev names in any pool are
+ * offered to get_interval_count() as interval/count candidates.  On return
+ * *argc has been reduced only by the number of arguments that were actually
+ * consumed as interval/count.
+ */
+static void
+get_interval_count_filter_guids(int *argc, char **argv, float *interval,
+    unsigned long *count, iostat_cbdata_t *cb)
+{
+	char **tmpargv = argv;
+	int argc_for_interval = 0;
+
+	/* Is the last arg an interval value?  Or a guid? */
+	if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) {
+		/*
+		 * The last arg is not a guid, so it's probably an
+		 * interval value.
+		 */
+		argc_for_interval++;
+
+		if (*argc >= 2 &&
+		    !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) {
+			/*
+			 * The 2nd to last arg is not a guid, so it's probably
+			 * an interval value.
+			 */
+			argc_for_interval++;
+		}
+	}
+
+	/* Point to our list of possible intervals */
+	tmpargv = &argv[*argc - argc_for_interval];
+
+	*argc = *argc - argc_for_interval;
+	get_interval_count(&argc_for_interval, tmpargv,
+	    interval, count);
+
+	/*
+	 * get_interval_count() only consumes trailing numeric candidates and
+	 * leaves the number of untouched ones in argc_for_interval.  Those are
+	 * ordinary arguments (e.g. a pool name), so give them back to the
+	 * caller instead of silently dropping them.
+	 */
+	*argc += argc_for_interval;
+}
+
+/*
+ * Floating point sleep().  Allows you to pass in a floating point value for
+ * seconds.
+ *
+ * The value is split into whole seconds and a nanosecond remainder and handed
+ * to nanosleep(), whose return value is ignored.
+ */
+static void
+fsleep(float sec)
+{
+	struct timespec delay;
+
+	delay.tv_sec = floor(sec);
+	/* Whatever is left after the whole seconds, in nanoseconds */
+	delay.tv_nsec = (sec - (float)delay.tv_sec) * NANOSEC;
+
+	nanosleep(&delay, NULL);
+}
+
+/*
+ * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or
+ * if we were unable to determine its size.
+ */
+static int
+terminal_height(void)
+{
+	struct winsize size;
+
+	if (!isatty(STDOUT_FILENO))
+		return (-1);
+
+	/* A failed query or a reported height of zero both count as unknown */
+	if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &size) == -1 || size.ws_row == 0)
+		return (-1);
+
+	return (size.ws_row);
+}
+
+/*
+ * Run one of the zpool status/iostat -c scripts with the help (-h) option and
+ * print the result.
+ *
+ * name:	Short name of the script ('iostat').
+ * path:	Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat');
+ *
+ * Every non-blank line of the script's output is printed prefixed with the
+ * script name.  Nothing is printed if the script fails or produces no output.
+ */
+static void
+print_zpool_script_help(char *name, char *path)
+{
+	char *argv[] = {path, "-h", NULL};
+	char **output = NULL;
+	int nlines = 0;
+	int rc;
+
+	rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &output,
+	    &nlines);
+
+	if (rc == 0 && output != NULL && nlines > 0) {
+		for (int i = 0; i < nlines; i++) {
+			if (is_blank_str(output[i]))
+				continue;
+			printf("  %-14s  %s\n", name, output[i]);
+		}
+	}
+
+	/* The line array is released on the success and failure paths alike */
+	if (output != NULL)
+		libzfs_free_str_array(output, nlines);
+}
+
+/*
+ * Go through the zpool status/iostat -c scripts in the given directory, run
+ * their help option (-h), and print out the results.
+ *
+ * Only regular files with the owner-execute bit set are treated as scripts.
+ * Directories that cannot be opened, entries that cannot be stat()ed, and
+ * entries whose full path would not fit in MAXPATHLEN are silently skipped.
+ */
+static void
+print_zpool_dir_scripts(char *dirpath)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char fullpath[MAXPATHLEN];
+	struct stat dir_stat;
+
+	if ((dir = opendir(dirpath)) == NULL)
+		return;
+
+	/* examine all the files and directories within directory */
+	while ((ent = readdir(dir)) != NULL) {
+		int len;
+
+		/*
+		 * Build the path with a bounded write: dirpath plus d_name
+		 * can exceed MAXPATHLEN.  A truncated path would name the
+		 * wrong file, so skip the entry instead of using it.
+		 */
+		len = snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath,
+		    ent->d_name);
+		if (len < 0 || (size_t)len >= sizeof (fullpath))
+			continue;
+
+		if (stat(fullpath, &dir_stat) != 0)
+			continue;
+
+		/* Print the scripts */
+		if ((dir_stat.st_mode & S_IXUSR) && S_ISREG(dir_stat.st_mode))
+			print_zpool_script_help(ent->d_name, fullpath);
+	}
+	closedir(dir);
+}
+
+/*
+ * Print out help text for all zpool status/iostat -c scripts.
+ *
+ * subcommand:	Name shown in the heading ("status" or "iostat").
+ *
+ * The heading is always printed; the scripts are then listed for every
+ * directory in the ':'-separated search path, if one is available.
+ */
+static void
+print_zpool_script_list(char *subcommand)
+{
+	char *search_path, *dir;
+
+	printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand);
+
+	search_path = zpool_get_cmd_search_path();
+	if (search_path == NULL)
+		return;
+
+	/* strtok() splits the string in place */
+	for (dir = strtok(search_path, ":"); dir != NULL;
+	    dir = strtok(NULL, ":"))
+		print_zpool_dir_scripts(dir);
+
+	free(search_path);
+}
+
+/*
+ * Set the minimum pool/vdev name column width.  The width must be at least 10,
+ * but may be as large as the column width - 42 so it still fits on one line.
+ * NOTE: 42 is the width of the default capacity/operations/bandwidth output
+ *
+ * Pool iterator callback: updates cb->cb_namewidth and always returns 0.
+ */
+static int
+get_namewidth_iostat(zpool_handle_t *zhp, void *data)
+{
+	iostat_cbdata_t *cb = data;
+	int name_width, room;
+
+	/*
+	 * get_namewidth() returns the maximum width of any name in that column
+	 * for any pool/vdev/device line that will be output.
+	 */
+	name_width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags,
+	    cb->cb_verbose);
+
+	/*
+	 * The stats take up 42 characters, so this is what remains on a line
+	 * for the header and for padding the names.
+	 */
+	room = get_columns() - 42;
+
+	/*
+	 * Justify every line to the widest name when that fits on the screen.
+	 * If it does not fit, padding to that width would make every line
+	 * wrap, and we only want lines with long names to wrap.  Limit the
+	 * padding to what won't wrap.
+	 */
+	name_width = MIN(name_width, room);
+
+	/*
+	 * And regardless of whatever the screen width is (get_columns can
+	 * return 0 if the width is not known or less than 42 for a narrow
+	 * terminal) have the width be a minimum of 10.
+	 */
+	name_width = MAX(name_width, 10);
+
+	/* Save the calculated width */
+	cb->cb_namewidth = name_width;
+
+	return (0);
+}
+
+/*
+ * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLnpPvy]
+ *              [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]]
+ *              [interval [count]]
+ *
+ *	-c CMD  For each vdev, run command CMD
+ *	-g	Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-P	Display full path for vdev name.
+ *	-v	Display statistics for individual vdevs
+ *	-h	Display help
+ *	-p	Display values in parsable (exact) format.
+ *	-H	Scripted mode.  No headers; properties separated by one tab.
+ *	-l	Display average latency
+ *	-q	Display queue depths
+ *	-w	Display latency histograms
+ *	-r	Display request size histogram
+ *	-T	Display a timestamp in date(1) or Unix format
+ *	-n	Only print headers once
+ *	-y	Skip printing on the first iteration
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes.  The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
+ * on pool_list_update() to detect the addition of new pools.  Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+	int c;
+	int ret;
+	int npools;
+	float interval = 0;
+	unsigned long count = 0;
+	int winheight = 24;
+	zpool_list_t *list;
+	boolean_t verbose = B_FALSE;
+	boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE;
+	boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE;
+	boolean_t omit_since_boot = B_FALSE;
+	boolean_t guid = B_FALSE;
+	boolean_t follow_links = B_FALSE;
+	boolean_t full_name = B_FALSE;
+	boolean_t headers_once = B_FALSE;
+	iostat_cbdata_t cb = { 0 };
+	char *cmd = NULL;
+
+	/* Used for printing error message */
+	const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q',
+	    [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'};
+
+	uint64_t unsupported_flags;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) {
+		switch (c) {
+		case 'c':
+			if (cmd != NULL) {
+				fprintf(stderr,
+				    gettext("Can't set -c flag twice\n"));
+				exit(1);
+			}
+
+			if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL &&
+			    !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) {
+				fprintf(stderr, gettext(
+				    "Can't run -c, disabled by "
+				    "ZPOOL_SCRIPTS_ENABLED.\n"));
+				exit(1);
+			}
+
+			if ((getuid() <= 0 || geteuid() <= 0) &&
+			    !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) {
+				fprintf(stderr, gettext(
+				    "Can't run -c with root privileges "
+				    "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n"));
+				exit(1);
+			}
+			cmd = optarg;
+			verbose = B_TRUE;
+			break;
+		case 'g':
+			guid = B_TRUE;
+			break;
+		case 'L':
+			follow_links = B_TRUE;
+			break;
+		case 'P':
+			full_name = B_TRUE;
+			break;
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 'l':
+			latency = B_TRUE;
+			break;
+		case 'q':
+			queues = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'w':
+			l_histo = B_TRUE;
+			break;
+		case 'r':
+			rq_histo = B_TRUE;
+			break;
+		case 'y':
+			omit_since_boot = B_TRUE;
+			break;
+		case 'n':
+			headers_once = B_TRUE;
+			break;
+		case 'h':
+			usage(B_FALSE);
+			break;
+		case '?':
+			if (optopt == 'c') {
+				print_zpool_script_list("iostat");
+				exit(0);
+			} else {
+				fprintf(stderr,
+				    gettext("invalid option '%c'\n"), optopt);
+			}
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	cb.cb_literal = parsable;
+	cb.cb_scripted = scripted;
+
+	if (guid)
+		cb.cb_name_flags |= VDEV_NAME_GUID;
+	if (follow_links)
+		cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+	if (full_name)
+		cb.cb_name_flags |= VDEV_NAME_PATH;
+	cb.cb_iteration = 0;
+	cb.cb_namewidth = 0;
+	cb.cb_verbose = verbose;
+
+	/* Get our interval and count values (if any) */
+	if (guid) {
+		get_interval_count_filter_guids(&argc, argv, &interval,
+		    &count, &cb);
+	} else {
+		get_interval_count(&argc, argv, &interval, &count);
+	}
+
+	if (argc == 0) {
+		/* No args, so just print the defaults. */
+	} else if (are_all_pools(argc, argv)) {
+		/* All the args are pool names */
+	} else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) {
+		/* All the args are vdevs */
+		cb.cb_vdev_names = argv;
+		cb.cb_vdev_names_count = argc;
+		argc = 0; /* No pools to process */
+	} else if (are_all_pools(1, argv)) {
+		/* The first arg is a pool name */
+		if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) {
+			/* ...and the rest are vdev names */
+			cb.cb_vdev_names = argv + 1;
+			cb.cb_vdev_names_count = argc - 1;
+			argc = 1; /* One pool to process */
+		} else {
+			fprintf(stderr, gettext("Expected either a list of "));
+			fprintf(stderr, gettext("pools, or list of vdevs in"));
+			fprintf(stderr, " \"%s\", ", argv[0]);
+			fprintf(stderr, gettext("but got:\n"));
+			error_list_unresolved_vdevs(argc - 1, argv + 1,
+			    argv[0], &cb);
+			fprintf(stderr, "\n");
+			usage(B_FALSE);
+			return (1);
+		}
+	} else {
+		/*
+		 * The args don't make sense. The first arg isn't a pool name,
+		 * nor are all the args vdevs.
+		 */
+		fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n"));
+		fprintf(stderr, "\n");
+		return (1);
+	}
+
+	if (cb.cb_vdev_names_count != 0) {
+		/*
+		 * If user specified vdevs, it implies verbose.
+		 */
+		cb.cb_verbose = B_TRUE;
+	}
+
+	/*
+	 * Construct the list of all interesting pools.
+	 */
+	ret = 0;
+	if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
+		return (1);
+
+	if (pool_list_count(list) == 0 && argc != 0) {
+		pool_list_free(list);
+		return (1);
+	}
+
+	if (pool_list_count(list) == 0 && interval == 0) {
+		pool_list_free(list);
+		(void) fprintf(stderr, gettext("no pools available\n"));
+		return (1);
+	}
+
+	if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) {
+		pool_list_free(list);
+		(void) fprintf(stderr,
+		    gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n"));
+		usage(B_FALSE);
+		return (1);
+	}
+
+	if (l_histo && rq_histo) {
+		pool_list_free(list);
+		(void) fprintf(stderr,
+		    gettext("Only one of [-r|-w] can be passed at a time\n"));
+		usage(B_FALSE);
+		return (1);
+	}
+
+	/*
+	 * Enter the main iostat loop.
+	 */
+	cb.cb_list = list;
+
+	if (l_histo) {
+		/*
+		 * Histograms tables look out of place when you try to display
+		 * them with the other stats, so make a rule that you can only
+		 * print histograms by themselves.
+		 */
+		cb.cb_flags = IOS_L_HISTO_M;
+	} else if (rq_histo) {
+		cb.cb_flags = IOS_RQ_HISTO_M;
+	} else {
+		cb.cb_flags = IOS_DEFAULT_M;
+		if (latency)
+			cb.cb_flags |= IOS_LATENCY_M;
+		if (queues)
+			cb.cb_flags |= IOS_QUEUES_M;
+	}
+
+	/*
+	 * See if the module supports all the stats we want to display.
+	 */
+	unsupported_flags = cb.cb_flags & ~get_stat_flags(list);
+	if (unsupported_flags) {
+		uint64_t f;
+		int idx;
+		fprintf(stderr,
+		    gettext("The loaded zfs module doesn't support:"));
+
+		/* for each bit set in unsupported_flags */
+		for (f = unsupported_flags; f; f &= ~(1ULL << idx)) {
+			idx = lowbit64(f) - 1;
+			fprintf(stderr, " -%c", flag_to_arg[idx]);
+		}
+
+		fprintf(stderr, ".  Try running a newer module.\n");
+		pool_list_free(list);
+
+		return (1);
+	}
+
+	for (;;) {
+		if ((npools = pool_list_count(list)) == 0)
+			(void) fprintf(stderr, gettext("no pools available\n"));
+		else {
+			/*
+			 * If this is the first iteration and -y was supplied
+			 * we skip any printing.
+			 */
+			boolean_t skip = (omit_since_boot &&
+			    cb.cb_iteration == 0);
+
+			/*
+			 * Refresh all statistics.  This is done as an
+			 * explicit step before calculating the maximum name
+			 * width, so that any configuration changes are
+			 * properly accounted for.
+			 */
+			(void) pool_list_iter(list, B_FALSE, refresh_iostat,
+			    &cb);
+
+			/*
+			 * Iterate over all pools to determine the maximum width
+			 * for the pool / device name column across all pools.
+			 */
+			cb.cb_namewidth = 0;
+			(void) pool_list_iter(list, B_FALSE,
+			    get_namewidth_iostat, &cb);
+
+			if (timestamp_fmt != NODATE)
+				print_timestamp(timestamp_fmt);
+
+			if (cmd != NULL && cb.cb_verbose &&
+			    !(cb.cb_flags & IOS_ANYHISTO_M)) {
+				cb.vcdl = all_pools_for_each_vdev_run(argc,
+				    argv, cmd, g_zfs, cb.cb_vdev_names,
+				    cb.cb_vdev_names_count, cb.cb_name_flags);
+			} else {
+				cb.vcdl = NULL;
+			}
+
+			/*
+			 * Check terminal size so we can print headers even
+			 * when the terminal window has its height changed.
+			 */
+			winheight = terminal_height();
+			/*
+			 * Are we connected to TTY? If not, headers_once
+			 * should be true, to avoid breaking scripts.
+			 */
+			if (winheight < 0)
+				headers_once = B_TRUE;
+
+			/*
+			 * Print the header on an unskipped first iteration,
+			 * when skip != verbose, or (unless headers_once) every
+			 * winheight iterations.  The histogram code prints its
+			 * own header on every vdev, so skip this for
+			 * histograms.
+			 */
+			if (((++cb.cb_iteration == 1 && !skip) ||
+			    (skip != verbose) ||
+			    (!headers_once &&
+			    (cb.cb_iteration % winheight) == 0)) &&
+			    (!(cb.cb_flags & IOS_ANYHISTO_M)) &&
+			    !cb.cb_scripted)
+				print_iostat_header(&cb);
+
+			if (skip) {
+				/* vcdl is reassigned next pass; free it now. */
+				if (cb.vcdl != NULL)
+					free_vdev_cmd_data_list(cb.vcdl);
+				(void) fsleep(interval);
+				continue;
+			}
+
+			pool_list_iter(list, B_FALSE, print_iostat, &cb);
+
+			/*
+			 * If there's more than one pool, and we're not in
+			 * verbose mode (which prints a separator for us),
+			 * then print a separator.
+			 *
+			 * In addition, if we're printing specific vdevs then
+			 * we also want an ending separator.
+			 */
+			if (((npools > 1 && !verbose &&
+			    !(cb.cb_flags & IOS_ANYHISTO_M)) ||
+			    (!(cb.cb_flags & IOS_ANYHISTO_M) &&
+			    cb.cb_vdev_names_count)) &&
+			    !cb.cb_scripted) {
+				print_iostat_separator(&cb);
+				if (cb.vcdl != NULL)
+					print_cmd_columns(cb.vcdl, 1);
+				printf("\n");
+			}
+
+			if (cb.vcdl != NULL)
+				free_vdev_cmd_data_list(cb.vcdl);
+		}
+
+		/*
+		 * Flush the output so that redirection to a file isn't buffered
+		 * indefinitely.
+		 */
+		(void) fflush(stdout);
+
+		if (interval == 0)
+			break;
+
+		if (count != 0 && --count == 0)
+			break;
+
+		(void) fsleep(interval);
+	}
+
+	pool_list_free(list);
+
+	return (ret);
+}
+
+typedef struct list_cbdata {	/* 'zpool list' callback state */
+	boolean_t	cb_verbose;	/* -v: also list per-vdev rows */
+	int		cb_name_flags;	/* VDEV_NAME_* bits (-g, -L, -P) */
+	int		cb_namewidth;	/* width of the name column */
+	boolean_t	cb_scripted;	/* -H: no header, tab-separated */
+	zprop_list_t	*cb_proplist;	/* properties (columns) to print */
+	boolean_t	cb_literal;	/* -p: passed to zpool_get_prop() */
+} list_cbdata_t;
+
+
+/*
+ * Print the header line naming each column in cb->cb_proplist.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZPOOL_MAXPROPLEN];
+	const char *header;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+	size_t width = 0;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		width = pl->pl_width;
+		if (first && cb->cb_verbose) {
+			/* Widen the first column for the verbose vdev list. */
+			width = cb->cb_namewidth;
+		}
+
+		if (!first)
+			(void) printf("  ");
+		else
+			first = B_FALSE;
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zpool_prop_column_name(pl->pl_prop);
+			right_justify = zpool_prop_align_right(pl->pl_prop);
+		} else {
+			size_t i;
+
+			/* Upper-case the name; truncate to fit headerbuf. */
+			for (i = 0; i < sizeof (headerbuf) - 1 &&
+			    pl->pl_user_prop[i] != '\0'; i++)
+				headerbuf[i] = toupper(
+				    (unsigned char)pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", (int)width, header);
+		else
+			(void) printf("%-*s", (int)width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print one row of 'zpool list' output for the pool zhp: the value of each
+ * property in cb->cb_proplist, using the same column layout as
+ * print_header().  Used by zpool_do_list().
+ */
+static void
+print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
+{
+	char value[ZPOOL_MAXPROPLEN];
+	boolean_t leading = B_TRUE;
+	zprop_list_t *col;
+
+	for (col = cb->cb_proplist; col != NULL; col = col->pl_next) {
+		boolean_t is_last = (col->pl_next == NULL);
+		boolean_t align_right = B_FALSE;
+		size_t colwidth = col->pl_width;
+		char *text = "-";
+
+		/* Verbose mode widens the first column for the vdev list. */
+		if (leading && cb->cb_verbose)
+			colwidth = cb->cb_namewidth;
+
+		/*
+		 * Every column but the first is preceded by a separator: a
+		 * tab in scripted mode, two spaces otherwise.
+		 */
+		if (leading)
+			leading = B_FALSE;
+		else if (cb->cb_scripted)
+			(void) printf("\t");
+		else
+			(void) printf("  ");
+
+		if (col->pl_prop == ZPROP_INVAL) {
+			/*
+			 * Not a native pool property.  Names accepted by
+			 * zpool_prop_feature() or zpool_prop_unsupported()
+			 * are looked up with zpool_prop_get_feature();
+			 * anything else, or a failed lookup, prints as "-".
+			 */
+			if ((zpool_prop_feature(col->pl_user_prop) ||
+			    zpool_prop_unsupported(col->pl_user_prop)) &&
+			    zpool_prop_get_feature(zhp, col->pl_user_prop,
+			    value, sizeof (value)) == 0)
+				text = value;
+		} else {
+			/* An unreadable native property prints as "-". */
+			if (zpool_get_prop(zhp, col->pl_prop, value,
+			    sizeof (value), NULL, cb->cb_literal) == 0)
+				text = value;
+
+			align_right = zpool_prop_align_right(col->pl_prop);
+		}
+
+		/*
+		 * Scripted output, and a left-justified last column, are
+		 * printed without any width padding.
+		 */
+		if (cb->cb_scripted || (is_last && !align_right))
+			(void) printf("%s", text);
+		else if (align_right)
+			(void) printf("%*s", (int)colwidth, text);
+		else
+			(void) printf("%-*s", (int)colwidth, text);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print one 'zpool list -v' vdev column: a separator, then 'value' (or 'str'
+ * for the health column) formatted for 'prop', or "-" when !valid.
+ */
+static void
+print_one_column(zpool_prop_t prop, uint64_t value, const char *str,
+    boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format)
+{
+	boolean_t raw = (format == ZFS_NICENUM_RAW);
+	boolean_t is_fixed;
+	size_t colwidth = zprop_width(prop, &is_fixed, ZFS_TYPE_POOL);
+	char buf[64];
+
+	switch (prop) {
+	case ZPOOL_PROP_EXPANDSZ:
+	case ZPOOL_PROP_CHECKPOINT:
+	case ZPOOL_PROP_DEDUPRATIO:
+		if (value != 0)
+			zfs_nicenum_format(value, buf, sizeof (buf), format);
+		else
+			(void) strlcpy(buf, "-", sizeof (buf));
+		break;
+	case ZPOOL_PROP_FRAGMENTATION:
+		if (value == ZFS_FRAG_INVALID)
+			(void) strlcpy(buf, "-", sizeof (buf));
+		else
+			(void) snprintf(buf, sizeof (buf),
+			    raw ? "%llu" : "%llu%%", (unsigned long long)value);
+		break;
+	case ZPOOL_PROP_CAPACITY:
+		/* capacity value is in parts-per-10,000 (aka permyriad) */
+		if (raw)
+			(void) snprintf(buf, sizeof (buf), "%llu",
+			    (unsigned long long)value / 100);
+		else
+			(void) snprintf(buf, sizeof (buf),
+			    value < 1000 ? "%1.2f%%" : value < 10000 ?
+			    "%2.1f%%" : "%3.0f%%", value / 100.0);
+		break;
+	case ZPOOL_PROP_HEALTH:
+		colwidth = 8;
+		snprintf(buf, sizeof (buf), "%-*s", (int)colwidth, str);
+		break;
+	default:
+		zfs_nicenum_format(value, buf, sizeof (buf), format);
+	}
+
+	if (!valid)
+		(void) strlcpy(buf, "-", sizeof (buf));
+
+	if (scripted)
+		(void) printf("\t%s", buf);
+	else
+		(void) printf("  %*s", (int)colwidth, buf);
+}
+
+/*
+ * Print the fixed 'zpool list -v' line for vdev nv (skipped if name is NULL),
+ * then recurse into its children.  Not compatible with '-o' <proplist>.
+ */
+static void
+print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+    list_cbdata_t *cb, int depth, boolean_t isspare)
+{
+	nvlist_t **child;
+	vdev_stat_t *vs;
+	uint_t c, children;
+	char *vname;
+	boolean_t scripted = cb->cb_scripted;
+	uint64_t islog = B_FALSE;
+	char *dashes = "%-*s      -      -      -        -         "
+	    "-      -      -      -  -\n";
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+
+	if (name != NULL) {
+		boolean_t toplevel = (vs->vs_space != 0);
+		uint64_t cap;
+		enum zfs_nicenum_format format;
+		const char *state;
+
+		if (cb->cb_literal)
+			format = ZFS_NICENUM_RAW;
+		else
+			format = ZFS_NICENUM_1024;
+
+		if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+			return;	/* indirect vdevs are not listed */
+
+		if (scripted)
+			(void) printf("\t%s", name);
+		else if (strlen(name) + depth > cb->cb_namewidth)
+			(void) printf("%*s%s", depth, "", name);
+		else
+			(void) printf("%*s%s%*s", depth, "", name,
+			    (int)(cb->cb_namewidth - strlen(name) - depth), "");
+
+		/*
+		 * Print the properties for the individual vdevs. Some
+		 * properties are only applicable to toplevel vdevs. The
+		 * 'toplevel' boolean value is passed to the print_one_column()
+		 * to indicate that the value is valid.
+		 */
+		print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted,
+		    toplevel, format);
+		print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL,
+		    scripted, toplevel, format);
+		print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
+		    NULL, scripted, toplevel, format);
+		print_one_column(ZPOOL_PROP_CHECKPOINT,
+		    vs->vs_checkpoint_space, NULL, scripted, toplevel, format);
+		print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL,
+		    scripted, B_TRUE, format);
+		print_one_column(ZPOOL_PROP_FRAGMENTATION,
+		    vs->vs_fragmentation, NULL, scripted,
+		    (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel),
+		    format);
+		cap = (vs->vs_space == 0) ? 0 :
+		    (vs->vs_alloc * 10000 / vs->vs_space);
+		print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL,
+		    scripted, toplevel, format);
+		print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL,
+		    scripted, toplevel, format);
+		state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+		if (isspare) {
+			if (vs->vs_aux == VDEV_AUX_SPARED)
+				state = "INUSE";
+			else if (vs->vs_state == VDEV_STATE_HEALTHY)
+				state = "AVAIL";
+		}
+		print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted,
+		    B_TRUE, format);
+		(void) printf("\n");
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	/* list the normal vdevs first */
+	for (c = 0; c < children; c++) {
+		uint64_t ishole = B_FALSE;
+
+		if (nvlist_lookup_uint64(child[c],
+		    ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+			continue;
+
+		if (nvlist_lookup_uint64(child[c],
+		    ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog)
+			continue;	/* listed with the classes below */
+
+		if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			continue;	/* likewise for vdevs with a bias */
+
+		vname = zpool_vdev_name(g_zfs, zhp, child[c],
+		    cb->cb_name_flags);
+		print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE);
+		free(vname);
+	}
+
+	/* list the classes: 'logs', 'dedup', and 'special' */
+	for (uint_t n = 0; n < 3; n++) {
+		boolean_t printed = B_FALSE;
+
+		for (c = 0; c < children; c++) {
+			char *bias = NULL;
+			char *type = NULL;
+
+			if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+			    &islog) == 0 && islog) {
+				bias = VDEV_ALLOC_CLASS_LOGS;
+			} else {
+				(void) nvlist_lookup_string(child[c],
+				    ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+				(void) nvlist_lookup_string(child[c],
+				    ZPOOL_CONFIG_TYPE, &type);
+			}
+			if (bias == NULL || strcmp(bias, class_name[n]) != 0)
+				continue;
+			if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+				continue;
+
+			if (!printed) {
+				/* LINTED E_SEC_PRINTF_VAR_FMT */
+				(void) printf(dashes, cb->cb_namewidth,
+				    class_name[n]);
+				printed = B_TRUE;
+			}
+			vname = zpool_vdev_name(g_zfs, zhp, child[c],
+			    cb->cb_name_flags);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2,
+			    B_FALSE);
+			free(vname);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0 && children > 0) {
+		/* LINTED E_SEC_PRINTF_VAR_FMT */
+		(void) printf(dashes, cb->cb_namewidth, "cache");
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, zhp, child[c],
+			    cb->cb_name_flags);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2,
+			    B_FALSE);
+			free(vname);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
+	    &children) == 0 && children > 0) {
+		/* LINTED E_SEC_PRINTF_VAR_FMT */
+		(void) printf(dashes, cb->cb_namewidth, "spare");
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, zhp, child[c],
+			    cb->cb_name_flags);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2,
+			    B_TRUE);
+			free(vname);
+		}
+	}
+}
+
+/*
+ * pool_list_iter() callback for 'zpool list': print the pool's own row and,
+ * in verbose mode, the rows for the vdevs in its config.  Always returns 0.
+ */
+static int
+list_callback(zpool_handle_t *zhp, void *data)
+{
+	list_cbdata_t *cb = data;
+	nvlist_t *vdev_tree;
+
+	print_pool(zhp, cb);
+	if (!cb->cb_verbose)
+		return (0);
+
+	/* The root of the vdev tree must be present in the pool config. */
+	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
+	print_list_stats(zhp, NULL, vdev_tree, cb, 0, B_FALSE);
+
+	return (0);
+}
+
+/*
+ * pool_list_iter() callback that recomputes cb_namewidth, the width of the
+ * pool/vdev name column, by passing the current value and this pool to
+ * get_namewidth().  The stored width is never less than 9.  Always
+ * returns 0.
+ */
+static int
+get_namewidth_list(zpool_handle_t *zhp, void *data)
+{
+	list_cbdata_t *state = data;
+	int namewidth = get_namewidth(zhp, state->cb_namewidth,
+	    state->cb_name_flags, state->cb_verbose);
+
+	/* Enforce the 9-column floor before storing the result. */
+	if (namewidth < 9)
+		namewidth = 9;
+	state->cb_namewidth = namewidth;
+
+	return (0);
+}
+
+/*
+ * zpool list [-gHLpPv] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
+ *
+ *	-g	Display guid for individual vdev name.
+ *	-H	Scripted mode.  No headers; properties separated by one tab.
+ *	-L	Follow links when resolving vdev path name.
+ *	-o	List of properties to display.  Defaults to
+ *		"name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+ *		"capacity,dedupratio,health,altroot"
+ *	-p	Display values in parsable (exact) format.
+ *	-P	Display full path for vdev name.
+ *	-T	Display a timestamp in date(1) or Unix format
+ *	-v	Verbose.  Also list the vdevs of each pool.
+ *
+ * List all pools in the system, whether or not they're healthy.  Output space
+ * statistics for each one, as well as health status summary.
+ */
+int
+zpool_do_list(int argc, char **argv)
+{
+	int c;
+	int ret = 0;
+	list_cbdata_t cb = { 0 };
+	static char default_props[] =
+	    "name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+	    "capacity,dedupratio,health,altroot";
+	char *props = default_props;
+	float interval = 0;
+	unsigned long count = 0;
+	zpool_list_t *list;
+	boolean_t first = B_TRUE;	/* header not printed yet */
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) {
+		switch (c) {
+		case 'g':
+			cb.cb_name_flags |= VDEV_NAME_GUID;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 'L':
+			cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+			break;
+		case 'o':
+			props = optarg;
+			break;
+		case 'P':
+			cb.cb_name_flags |= VDEV_NAME_PATH;
+			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			cb.cb_namewidth = 8;	/* 8 until precalc is avail */
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	get_interval_count(&argc, argv, &interval, &count);
+
+	if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
+		usage(B_FALSE);
+
+	for (;;) {
+		if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
+		    &ret)) == NULL)
+			return (1);
+
+		if (pool_list_count(list) == 0)
+			break;
+
+		cb.cb_namewidth = 0;	/* recomputed on every pass */
+		(void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb);
+
+		if (timestamp_fmt != NODATE)
+			print_timestamp(timestamp_fmt);
+
+		if (!cb.cb_scripted && (first || cb.cb_verbose)) {
+			print_header(&cb);
+			first = B_FALSE;
+		}
+		ret = pool_list_iter(list, B_TRUE, list_callback, &cb);
+
+		if (interval == 0)
+			break;
+
+		if (count != 0 && --count == 0)
+			break;
+
+		pool_list_free(list);	/* a fresh list is built each pass */
+		(void) fsleep(interval);
+	}
+
+	if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
+		(void) printf(gettext("no pools available\n"));
+		ret = 0;
+	}
+
+	pool_list_free(list);
+	zprop_free_list(cb.cb_proplist);
+	return (ret);
+}
+
+/*
+ * Common code for 'zpool attach' and 'zpool replace'; 'replacing' selects
+ * which.  Returns 1 on early failure, else the attach (or -w wait) result.
+ */
+static int
+zpool_do_attach_or_replace(int argc, char **argv, int replacing)
+{
+	boolean_t force = B_FALSE;
+	boolean_t rebuild = B_FALSE;
+	boolean_t wait = B_FALSE;
+	int c;
+	nvlist_t *nvroot;
+	char *poolname, *old_disk, *new_disk;
+	zpool_handle_t *zhp;
+	nvlist_t *props = NULL;
+	char *propval;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "fo:sw")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -o option\n"));
+				usage(B_FALSE);
+			}
+			*propval = '\0';
+			propval++;
+
+			if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
+			    (add_prop_list(optarg, propval, &props, B_TRUE)))
+				usage(B_FALSE);
+			break;
+		case 's':
+			rebuild = B_TRUE;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+	poolname = argv[0];
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(B_FALSE);
+	}
+	old_disk = argv[1];
+
+	if (argc < 3) {
+		if (!replacing) {
+			(void) fprintf(stderr,
+			    gettext("missing <new_device> specification\n"));
+			usage(B_FALSE);
+		}
+		new_disk = old_disk;	/* no <new_device>: reuse <device> */
+		argc -= 1;
+		argv += 1;
+	} else {
+		new_disk = argv[2];
+		argc -= 2;
+		argv += 2;
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	if (zpool_get_config(zhp, NULL) == NULL) {
+		(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+		    poolname);
+		zpool_close(zhp);
+		nvlist_free(props);
+		return (1);
+	}
+
+	/* unless manually specified use "ashift" pool property (if set) */
+	if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) {
+		int intval;
+		zprop_source_t src;
+		char strval[ZPOOL_MAXPROPLEN];
+
+		intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src);
+		if (src != ZPROP_SRC_DEFAULT) {
+			(void) sprintf(strval, "%" PRId32, intval);
+			verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval,
+			    &props, B_TRUE) == 0);
+		}
+	}
+
+	nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
+	    argc, argv);
+	if (nvroot == NULL) {
+		zpool_close(zhp);
+		nvlist_free(props);
+		return (1);
+	}
+
+	ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
+	    rebuild);
+	if (ret == 0 && wait)
+		ret = zpool_wait(zhp,
+		    replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER);
+	nvlist_free(props);
+	nvlist_free(nvroot);
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool replace [-fsw] [-o property=value] <pool> <device> [new_device]
+ *
+ *	-f	Force attach, even if <new_device> appears to be in use.
+ *	-s	Use sequential instead of healing reconstruction for resilver.
+ *	-o	Set property=value; only ZPOOL_CONFIG_ASHIFT is accepted.
+ *	-w	Wait for replacing to complete before returning
+ *
+ * Replace <device> with <new_device> (<device> itself if omitted).
+ */
+/* ARGSUSED */
+int
+zpool_do_replace(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
+}
+
+/*
+ * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
+ *
+ *	-f	Force attach, even if <new_device> appears to be in use.
+ *	-s	Use sequential instead of healing reconstruction for resilver.
+ *	-o	Set property=value; only ZPOOL_CONFIG_ASHIFT is accepted.
+ *	-w	Wait for resilvering to complete before returning
+ *
+ * Attach <new_device> to the mirror containing <device>; if <device> is not
+ * part of a mirror, it becomes a mirror of <device> and <new_device>.  Either
+ * way <new_device> starts with a DTL of [0, now] and resilvers immediately.
+ * Thin wrapper around zpool_do_attach_or_replace() with replacing = B_FALSE.
+ */
+int
+zpool_do_attach(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
+}
+
+/*
+ * zpool detach [-f] <pool> <device>
+ *
+ *	-f	Force detach of <device>, even if DTLs argue against it
+ *		(not supported yet: no option is currently accepted)
+ *
+ * Detach a device from a mirror.  The operation will be refused if <device>
+ * is the last device in the mirror, or if the DTLs indicate that this device
+ * has the only valid copy of some data.
+ *
+ * Returns 1 if <pool> cannot be opened; otherwise returns the result of
+ * zpool_vdev_detach().
+ */
+/* ARGSUSED */
+int
+zpool_do_detach(int argc, char **argv)
+{
+	zpool_handle_t *pool;
+	char *device;
+	int opt, err;
+
+	/* check options: none are defined, so getopt() rejects them all */
+	while ((opt = getopt(argc, argv, "")) != -1) {
+		if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* both the pool name and the device are required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(B_FALSE);
+	}
+
+	device = argv[1];
+
+	pool = zpool_open(g_zfs, argv[0]);
+	if (pool == NULL)
+		return (1);
+
+	err = zpool_vdev_detach(pool, device);
+	zpool_close(pool);
+
+	return (err);
+}
+
+/*
+ * zpool split [-gLlnP] [-o prop=val] ...
+ *		[-o mntopt] ...
+ *		[-R altroot] <pool> <newpool> [<device> ...]
+ *
+ *	-g      Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-n	Do not split the pool, but display the resulting layout if
+ *		it were to be split.
+ *	-o	Set property=value, or set mount options.
+ *	-P	Display full path for vdev name.
+ *	-R	Mount the split-off pool under an alternate root.
+ *	-l	Load encryption keys while importing.
+ *
+ * Splits the named pool and gives it the new pool name.  Devices to be split
+ * off may be listed, provided that no more than one device is specified
+ * per top-level vdev mirror.  The newly split pool is left in an exported
+ * state unless -R is specified.
+ *
+ * Restrictions: the top-level of the pool must only be made up of
+ * mirrors; all devices in the pool must be healthy; no device may be
+ * undergoing a resilvering operation.
+ */
+int
+zpool_do_split(int argc, char **argv)
+{
+	char *srcpool, *newpool, *propval;
+	char *mntopts = NULL;
+	splitflags_t flags;
+	int c, ret = 0;
+	boolean_t loadkeys = B_FALSE;
+	zpool_handle_t *zhp;
+	nvlist_t *config, *props = NULL;
+
+	flags.dryrun = B_FALSE;
+	flags.import = B_FALSE;
+	flags.name_flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) {
+		switch (c) {
+		case 'g':
+			flags.name_flags |= VDEV_NAME_GUID;
+			break;
+		case 'L':
+			flags.name_flags |= VDEV_NAME_FOLLOW_LINKS;
+			break;
+		case 'R':
+			flags.import = B_TRUE;
+			if (add_prop_list(
+			    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
+			    &props, B_TRUE) != 0) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+		case 'l':
+			loadkeys = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) != NULL) {
+				*propval = '\0';
+				propval++;
+				if (add_prop_list(optarg, propval,
+				    &props, B_TRUE) != 0) {
+					nvlist_free(props);
+					usage(B_FALSE);
+				}
+			} else {
+				mntopts = optarg;	/* no '=': mount options */
+			}
+			break;
+		case 'P':
+			flags.name_flags |= VDEV_NAME_PATH;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+			break;
+		}
+	}
+
+	if (!flags.import && mntopts != NULL) {
+		(void) fprintf(stderr, gettext("setting mntopts is only "
+		    "valid when importing the pool\n"));
+		usage(B_FALSE);
+	}
+
+	if (!flags.import && loadkeys) {
+		(void) fprintf(stderr, gettext("loading keys is only "
+		    "valid when importing the pool\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("Missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("Missing new pool name\n"));
+		usage(B_FALSE);
+	}
+
+	srcpool = argv[0];
+	newpool = argv[1];
+
+	argc -= 2;	/* what remains is the optional device list */
+	argv += 2;
+
+	if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
+	if (config == NULL) {
+		ret = 1;
+	} else {
+		if (flags.dryrun) {
+			(void) printf(gettext("would create '%s' with the "
+			    "following layout:\n\n"), newpool);
+			print_vdev_tree(NULL, newpool, config, 0, "",
+			    flags.name_flags);
+		}
+	}
+
+	zpool_close(zhp);
+
+	if (ret != 0 || flags.dryrun || !flags.import) {
+		nvlist_free(config);
+		nvlist_free(props);
+		return (ret);
+	}
+
+	/*
+	 * The split was successful. Now we need to open the new
+	 * pool and import it.
+	 */
+	if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) {
+		nvlist_free(config);
+		nvlist_free(props);
+		return (1);
+	}
+
+	if (loadkeys) {
+		ret = zfs_crypto_attempt_load_keys(g_zfs, newpool);
+		if (ret != 0)
+			ret = 1;
+	}
+
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+		ret = 1;
+		(void) fprintf(stderr, gettext("Split was successful, but "
+		    "the datasets could not all be mounted\n"));
+		(void) fprintf(stderr, gettext("Try doing '%s' with a "
+		    "different altroot\n"), "zpool import");
+	}
+	zpool_close(zhp);
+	nvlist_free(config);
+	nvlist_free(props);
+
+	return (ret);
+}
+
+
+
+/*
+ * zpool online [-e] <pool> <device> ...
+ *
+ *	-e	Pass ZFS_ONLINE_EXPAND to zpool_vdev_online() for every device.
+ *
+ * Onlines each listed device in turn and warns about any device that does
+ * not come back in the healthy state.  Returns 0 only if the pool could be
+ * opened and every device could be onlined, 1 otherwise.
+ */
+int
+zpool_do_online(int argc, char **argv)
+{
+	zpool_handle_t *zhp;
+	vdev_state_t newstate;
+	int online_flags = 0;
+	int status = 0;
+	int opt;
+
+	/* parse the option flags */
+	while ((opt = getopt(argc, argv, "e")) != -1) {
+		if (opt == 'e') {
+			online_flags |= ZFS_ONLINE_EXPAND;
+		} else if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* a pool name followed by at least one device is required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (1);
+
+	for (int dev = 1; dev < argc; dev++) {
+		if (zpool_vdev_online(zhp, argv[dev], online_flags,
+		    &newstate) != 0) {
+			status = 1;
+			continue;
+		}
+
+		if (newstate == VDEV_STATE_HEALTHY)
+			continue;
+
+		/* onlined but not healthy: tell the user what to try next */
+		(void) printf(gettext("warning: device '%s' "
+		    "onlined, but remains in faulted state\n"), argv[dev]);
+		if (newstate == VDEV_STATE_FAULTED) {
+			(void) printf(gettext("use 'zpool "
+			    "clear' to restore a faulted "
+			    "device\n"));
+		} else {
+			(void) printf(gettext("use 'zpool "
+			    "replace' to replace devices "
+			    "that are no longer present\n"));
+		}
+	}
+
+	zpool_close(zhp);
+
+	return (status);
+}
+
+/*
+ * zpool offline [-ft] <pool> <device> ...
+ *
+ *	-f	Force the device into a faulted state (zpool_vdev_fault())
+ *		instead of taking it offline.
+ *
+ *	-t	Temporary.  The offline/faulted state will not be persistent
+ *		across reboots.
+ *
+ * Returns 0 if every device was processed successfully, 1 otherwise.
+ */
+/* ARGSUSED */
+int
+zpool_do_offline(int argc, char **argv)
+{
+	zpool_handle_t *zhp;
+	boolean_t temporary = B_FALSE;
+	boolean_t force_fault = B_FALSE;
+	int status = 0;
+	int opt;
+
+	/* parse the option flags */
+	while ((opt = getopt(argc, argv, "ft")) != -1) {
+		if (opt == 'f') {
+			force_fault = B_TRUE;
+		} else if (opt == 't') {
+			temporary = B_TRUE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* a pool name followed by at least one device is required */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (1);
+
+	for (int dev = 1; dev < argc; dev++) {
+		if (!force_fault) {
+			if (zpool_vdev_offline(zhp, argv[dev], temporary) != 0)
+				status = 1;
+			continue;
+		}
+
+		uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[dev]);
+
+		/* Without -t the fault is forced to persist across imports */
+		vdev_aux_t aux = (temporary == B_FALSE) ?
+		    VDEV_AUX_EXTERNAL_PERSIST : VDEV_AUX_EXTERNAL;
+
+		/* a guid of 0 means the path could not be resolved */
+		if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0)
+			status = 1;
+	}
+
+	zpool_close(zhp);
+
+	return (status);
+}
+
+/*
+ * zpool clear [-F [-n] [-X]] <pool> [device]
+ *
+ * Clear all errors associated with a pool or a particular device.
+ *
+ *	-F	Request a rewind (ZPOOL_DO_REWIND).
+ *	-n	With -F, only try the rewind (ZPOOL_TRY_REWIND).
+ *	-X	With -F, additionally set ZPOOL_EXTREME_REWIND.
+ *
+ * The selected rewind policy is handed to zpool_clear() in an nvlist.
+ * Returns 0 on success and 1 on any failure.
+ */
+int
+zpool_do_clear(int argc, char **argv)
+{
+	int c;
+	int ret = 0;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	nvlist_t *policy = NULL;
+	zpool_handle_t *zhp;
+	char *pool, *device;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "FnX")) != -1) {
+		switch (c) {
+		case 'F':
+			do_rewind = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'X':
+			xtreme_rewind = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if ((dryrun || xtreme_rewind) && !do_rewind) {
+		(void) fprintf(stderr,
+		    gettext("-n or -X only meaningful with -F\n"));
+		usage(B_FALSE);
+	}
+
+	/* -n takes precedence over a plain -F; -X is OR-ed on top of either */
+	if (dryrun)
+		rewind_policy = ZPOOL_TRY_REWIND;
+	else if (do_rewind)
+		rewind_policy = ZPOOL_DO_REWIND;
+	if (xtreme_rewind)
+		rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+	/* In future, further rewind policy choices can be passed along here */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+	    rewind_policy) != 0) {
+		/*
+		 * The list may already have been allocated when the add
+		 * failed, so release it rather than leak it.
+		 */
+		nvlist_free(policy);
+		return (1);
+	}
+
+	pool = argv[0];
+	device = argc == 2 ? argv[1] : NULL;
+
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+		nvlist_free(policy);
+		return (1);
+	}
+
+	if (zpool_clear(zhp, device, policy) != 0)
+		ret = 1;
+
+	zpool_close(zhp);
+
+	nvlist_free(policy);
+
+	return (ret);
+}
+
+/*
+ * zpool reguid <pool>
+ *
+ * Takes no options and exactly one pool name.  Returns 1 if the pool cannot
+ * be opened, otherwise whatever zpool_reguid() returns.
+ */
+int
+zpool_do_reguid(int argc, char **argv)
+{
+	zpool_handle_t *zhp;
+	int status;
+	int opt;
+
+	/* no options are accepted, so anything getopt reports is an error */
+	while ((opt = getopt(argc, argv, "")) != -1) {
+		if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* exactly one pool name is expected */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (1);
+
+	status = zpool_reguid(zhp);
+	zpool_close(zhp);
+
+	return (status);
+}
+
+
+/*
+ * zpool reopen [-n] [pool] ...
+ *
+ * Reopen the pool so that the kernel can update the sizes of all vdevs.
+ *
+ *	-n	Clear the scrub_restart flag handed to zpool_reopen_one().
+ *
+ * With no pool arguments zpool_reopen_one() is run on all pools.
+ */
+int
+zpool_do_reopen(int argc, char **argv)
+{
+	boolean_t scrub_restart = B_TRUE;
+	int opt;
+
+	/* parse the option flags */
+	while ((opt = getopt(argc, argv, "n")) != -1) {
+		if (opt == 'n') {
+			scrub_restart = B_FALSE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* an empty pool list means every pool gets reopened */
+	return (for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one,
+	    &scrub_restart));
+}
+
+/*
+ * Per-invocation arguments handed to scrub_callback() for every pool.
+ */
+typedef struct scrub_cbdata {
+	/* scan function given to zpool_scan(), e.g. POOL_SCAN_SCRUB */
+	int	cb_type;
+	/* scrub command given to zpool_scan(), e.g. POOL_SCRUB_PAUSE */
+	pool_scrub_cmd_t cb_scrub_cmd;
+} scrub_cbdata_t;
+
+/*
+ * Returns B_TRUE when the pool's config carries checkpoint stats whose state
+ * is anything other than CS_NONE.  Returns B_FALSE when there is no config,
+ * no checkpoint stats, or no checkpoint.
+ */
+static boolean_t
+zpool_has_checkpoint(zpool_handle_t *zhp)
+{
+	pool_checkpoint_stat_t *pcs = NULL;
+	nvlist_t *config, *nvroot;
+	uint_t nelem;
+
+	config = zpool_get_config(zhp, NULL);
+	if (config == NULL)
+		return (B_FALSE);
+
+	/* the stats are optional; pcs stays NULL when they are missing */
+	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &nelem);
+
+	if (pcs == NULL || pcs->pcs_state == CS_NONE)
+		return (B_FALSE);
+
+	assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
+	    pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+
+	return (B_TRUE);
+}
+
+/*
+ * for_each_pool() callback that issues the scan described by the
+ * scrub_cbdata_t in 'data' against one pool.  Returns 0 on success and
+ * 1 if the pool is unavailable or zpool_scan() fails.
+ */
+static int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+	scrub_cbdata_t *cb = data;
+	int scan_err;
+
+	/* Faulted pools cannot be scanned. */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		(void) fprintf(stderr, gettext("cannot scan '%s': pool is "
+		    "currently unavailable\n"), zpool_get_name(zhp));
+		return (1);
+	}
+
+	scan_err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
+	if (scan_err != 0)
+		return (1);
+
+	/* a scrub does not cover state owned by a checkpoint */
+	if (zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) {
+		(void) printf(gettext("warning: will not scrub state that "
+		    "belongs to the checkpoint of pool '%s'\n"),
+		    zpool_get_name(zhp));
+	}
+
+	return (0);
+}
+
+/*
+ * for_each_pool() callback: wait on one pool for the activity that 'data'
+ * (a zpool_wait_activity_t *) points at, returning zpool_wait()'s result.
+ */
+static int
+wait_callback(zpool_handle_t *zhp, void *data)
+{
+	zpool_wait_activity_t activity = *(zpool_wait_activity_t *)data;
+
+	return (zpool_wait(zhp, activity));
+}
+
+/*
+ * zpool scrub [-s | -p] [-w] <pool> ...
+ *
+ *	-s	Stop.  Stops any in-progress scrub.
+ *	-p	Pause. Pause in-progress scrub.
+ *	-w	Wait.  Blocks until scrub has completed.
+ *
+ * -s and -p are mutually exclusive, and -w cannot be combined with either.
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+	scrub_cbdata_t cb = {
+		.cb_type = POOL_SCAN_SCRUB,
+		.cb_scrub_cmd = POOL_SCRUB_NORMAL,
+	};
+	boolean_t wait_for_scrub = B_FALSE;
+	int status;
+	int opt;
+
+	/* parse the option flags */
+	while ((opt = getopt(argc, argv, "spw")) != -1) {
+		if (opt == 's') {
+			cb.cb_type = POOL_SCAN_NONE;
+		} else if (opt == 'p') {
+			cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+		} else if (opt == 'w') {
+			wait_for_scrub = B_TRUE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (cb.cb_type == POOL_SCAN_NONE &&
+	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "-s and -p are mutually exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (wait_for_scrub && (cb.cb_type == POOL_SCAN_NONE ||
+	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "-w cannot be used with -p or -s\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	status = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb);
+
+	/* only wait when asked to and every scrub request went through */
+	if (!wait_for_scrub || status)
+		return (status);
+
+	zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB;
+
+	return (for_each_pool(argc, argv, B_TRUE, NULL, wait_callback, &act));
+}
+
+/*
+ * zpool resilver <pool> ...
+ *
+ *	Restarts any in-progress resilver
+ *
+ * Takes no options.  Runs scrub_callback() with POOL_SCAN_RESILVER on every
+ * named pool and returns the result of for_each_pool().
+ */
+int
+zpool_do_resilver(int argc, char **argv)
+{
+	scrub_cbdata_t cb = {
+		.cb_type = POOL_SCAN_RESILVER,
+		.cb_scrub_cmd = POOL_SCRUB_NORMAL,
+	};
+	int opt;
+
+	/* no options are accepted, so anything getopt reports is an error */
+	while ((opt = getopt(argc, argv, "")) != -1) {
+		if (opt == '?') {
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
+/*
+ * zpool trim [-d] [-r <rate>] [-c | -s] [-w] <pool> [<device> ...]
+ *
+ *	-c		Cancel. Ends any in-progress trim.
+ *	-d		Secure trim.  Requires kernel and device support.
+ *	-r <rate>	Sets the TRIM rate in bytes (per second). Supports
+ *			adding a multiplier suffix such as 'k' or 'm'.
+ *	-s		Suspend. TRIM can then be restarted with no flags.
+ *	-w		Wait. Blocks until trimming has completed.
+ *
+ * Each short option also has a long form (--cancel, --secure, --rate,
+ * --suspend, --wait).  When no devices are named, all leaf vdevs of the
+ * pool are collected and trimmed.
+ *
+ * Returns the result of zpool_trim(), or -1 if the pool name is missing or
+ * the pool cannot be opened.
+ */
+int
+zpool_do_trim(int argc, char **argv)
+{
+	struct option long_options[] = {
+		{"cancel",	no_argument,		NULL,	'c'},
+		{"secure",	no_argument,		NULL,	'd'},
+		{"rate",	required_argument,	NULL,	'r'},
+		{"suspend",	no_argument,		NULL,	's'},
+		{"wait",	no_argument,		NULL,	'w'},
+		{0, 0, 0, 0}
+	};
+
+	pool_trim_func_t cmd_type = POOL_TRIM_START;
+	uint64_t rate = 0;
+	boolean_t secure = B_FALSE;
+	boolean_t wait = B_FALSE;
+
+	/*
+	 * NOTE(review): the conflict checks below only look at what has been
+	 * parsed so far, so they appear to be order dependent: "-c -d" is
+	 * rejected, whereas "-d -c" looks like it would be accepted.
+	 */
+	int c;
+	while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL))
+	    != -1) {
+		switch (c) {
+		case 'c':
+			/* repeating -c is fine; -c after -s is not */
+			if (cmd_type != POOL_TRIM_START &&
+			    cmd_type != POOL_TRIM_CANCEL) {
+				(void) fprintf(stderr, gettext("-c cannot be "
+				    "combined with other options\n"));
+				usage(B_FALSE);
+			}
+			cmd_type = POOL_TRIM_CANCEL;
+			break;
+		case 'd':
+			if (cmd_type != POOL_TRIM_START) {
+				(void) fprintf(stderr, gettext("-d cannot be "
+				    "combined with the -c or -s options\n"));
+				usage(B_FALSE);
+			}
+			secure = B_TRUE;
+			break;
+		case 'r':
+			if (cmd_type != POOL_TRIM_START) {
+				(void) fprintf(stderr, gettext("-r cannot be "
+				    "combined with the -c or -s options\n"));
+				usage(B_FALSE);
+			}
+			if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) {
+				(void) fprintf(stderr,
+				    gettext("invalid value for rate\n"));
+				usage(B_FALSE);
+			}
+			break;
+		case 's':
+			/* repeating -s is fine; -s after -c is not */
+			if (cmd_type != POOL_TRIM_START &&
+			    cmd_type != POOL_TRIM_SUSPEND) {
+				(void) fprintf(stderr, gettext("-s cannot be "
+				    "combined with other options\n"));
+				usage(B_FALSE);
+			}
+			cmd_type = POOL_TRIM_SUSPEND;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			/*
+			 * optopt is only set for an unknown short option;
+			 * otherwise report the offending argument itself.
+			 */
+			if (optopt != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%c'\n"), optopt);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%s'\n"),
+				    argv[optind - 1]);
+			}
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+		return (-1);
+	}
+
+	/* waiting only makes sense when a trim is being started */
+	if (wait && (cmd_type != POOL_TRIM_START)) {
+		(void) fprintf(stderr, gettext("-w cannot be used with -c or "
+		    "-s\n"));
+		usage(B_FALSE);
+	}
+
+	char *poolname = argv[0];
+	zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
+	if (zhp == NULL)
+		return (-1);
+
+	/* fullpool is filled in below once the device list is known */
+	trimflags_t trim_flags = {
+		.secure = secure,
+		.rate = rate,
+		.wait = wait,
+	};
+
+	/* names of the vdevs to operate on, stored as boolean nvpairs */
+	nvlist_t *vdevs = fnvlist_alloc();
+	if (argc == 1) {
+		/* no individual leaf vdevs specified, so add them all */
+		nvlist_t *config = zpool_get_config(zhp, NULL);
+		nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+		    ZPOOL_CONFIG_VDEV_TREE);
+		zpool_collect_leaves(zhp, nvroot, vdevs);
+		trim_flags.fullpool = B_TRUE;
+	} else {
+		trim_flags.fullpool = B_FALSE;
+		for (int i = 1; i < argc; i++) {
+			fnvlist_add_boolean(vdevs, argv[i]);
+		}
+	}
+
+	int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags);
+
+	fnvlist_free(vdevs);
+	zpool_close(zhp);
+
+	return (error);
+}
+
+/*
+ * Formats a duration given in seconds as "HH:MM:SS", or as
+ * "<n> days HH:MM:SS" once it spans at least one full day.  The result is
+ * written to 'buf' with sprintf(), so no bounds checking is performed and
+ * the caller must supply a large enough buffer.
+ */
+static void
+secs_to_dhms(uint64_t total, char *buf)
+{
+	const uint64_t secs = total % 60;
+	const uint64_t mins = (total / 60) % 60;
+	const uint64_t hours = (total / (60 * 60)) % 24;
+	const uint64_t days = total / (60 * 60 * 24);
+
+	if (days == 0) {
+		(void) sprintf(buf, "%02llu:%02llu:%02llu",
+		    (u_longlong_t)hours, (u_longlong_t)mins,
+		    (u_longlong_t)secs);
+		return;
+	}
+
+	(void) sprintf(buf, "%llu days %02llu:%02llu:%02llu",
+	    (u_longlong_t)days, (u_longlong_t)hours,
+	    (u_longlong_t)mins, (u_longlong_t)secs);
+}
+
+/*
+ * Print out detailed scrub or resilver status for the "scan:" line of the
+ * pool status output.
+ *
+ * 'ps' may be NULL, in which case "none requested" is printed.  A finished
+ * or canceled scan gets a one-line summary; a scan in progress additionally
+ * gets scanned/issued/total figures, the percentage done and, when the scan
+ * is not paused, an estimate of the time remaining.
+ */
+static void
+print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
+{
+	time_t start, end, pause;
+	uint64_t pass_scanned, scanned, pass_issued, issued, total;
+	uint64_t elapsed, scan_rate, issue_rate;
+	double fraction_done;
+	char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+	char srate_buf[7], irate_buf[7], time_buf[32];
+
+	printf("  ");
+	printf_color(ANSI_BOLD, gettext("scan:"));
+	printf(" ");
+
+	/* If there's never been a scan, there's not much to say. */
+	if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+	    ps->pss_func >= POOL_SCAN_FUNCS) {
+		(void) printf(gettext("none requested\n"));
+		return;
+	}
+
+	start = ps->pss_start_time;
+	end = ps->pss_end_time;
+	pause = ps->pss_pass_scrub_pause;
+
+	zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf));
+
+	assert(ps->pss_func == POOL_SCAN_SCRUB ||
+	    ps->pss_func == POOL_SCAN_RESILVER);
+
+	/*
+	 * Scan is finished or canceled.  The strings returned by ctime() end
+	 * in a newline, which is why these formats do not add one.
+	 */
+	if (ps->pss_state == DSS_FINISHED) {
+		secs_to_dhms(end - start, time_buf);
+
+		if (ps->pss_func == POOL_SCAN_SCRUB) {
+			(void) printf(gettext("scrub repaired %s "
+			    "in %s with %llu errors on %s"), processed_buf,
+			    time_buf, (u_longlong_t)ps->pss_errors,
+			    ctime(&end));
+		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+			(void) printf(gettext("resilvered %s "
+			    "in %s with %llu errors on %s"), processed_buf,
+			    time_buf, (u_longlong_t)ps->pss_errors,
+			    ctime(&end));
+		}
+		return;
+	} else if (ps->pss_state == DSS_CANCELED) {
+		if (ps->pss_func == POOL_SCAN_SCRUB) {
+			(void) printf(gettext("scrub canceled on %s"),
+			    ctime(&end));
+		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+			(void) printf(gettext("resilver canceled on %s"),
+			    ctime(&end));
+		}
+		return;
+	}
+
+	assert(ps->pss_state == DSS_SCANNING);
+
+	/* Scan is in progress. Resilvers can't be paused. */
+	if (ps->pss_func == POOL_SCAN_SCRUB) {
+		/* a zero pause timestamp means the scrub is not paused */
+		if (pause == 0) {
+			(void) printf(gettext("scrub in progress since %s"),
+			    ctime(&start));
+		} else {
+			(void) printf(gettext("scrub paused since %s"),
+			    ctime(&pause));
+			(void) printf(gettext("\tscrub started on %s"),
+			    ctime(&start));
+		}
+	} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		(void) printf(gettext("resilver in progress since %s"),
+		    ctime(&start));
+	}
+
+	scanned = ps->pss_examined;
+	pass_scanned = ps->pss_pass_exam;
+	issued = ps->pss_issued;
+	pass_issued = ps->pss_pass_issued;
+	total = ps->pss_to_examine;
+
+	/*
+	 * we are only done with a block once we have issued the IO for it
+	 *
+	 * NOTE(review): 'total' is not checked for zero here, so the
+	 * percentage printed below would presumably come out as inf/nan
+	 * in that case -- confirm whether pss_to_examine can be 0.
+	 */
+	fraction_done = (double)issued / total;
+
+	/* elapsed time for this pass, rounding up to 1 if it's 0 */
+	elapsed = time(NULL) - ps->pss_pass_start;
+	elapsed -= ps->pss_pass_scrub_spent_paused;
+	elapsed = (elapsed != 0) ? elapsed : 1;
+
+	/* rates are averages over the current pass, in bytes per second */
+	scan_rate = pass_scanned / elapsed;
+	issue_rate = pass_issued / elapsed;
+	/* UINT64_MAX marks "no estimate available" */
+	uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
+	    ((total - issued) / issue_rate) : UINT64_MAX;
+	secs_to_dhms(total_secs_left, time_buf);
+
+	/* format all of the numbers we will be reporting */
+	zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
+	zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
+	zfs_nicebytes(total, total_buf, sizeof (total_buf));
+	zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+	zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+
+	/* do not print the rates if we have a paused scrub */
+	if (pause == 0) {
+		(void) printf(gettext("\t%s scanned at %s/s, "
+		    "%s issued at %s/s, %s total\n"),
+		    scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
+	} else {
+		(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+		    scanned_buf, issued_buf, total_buf);
+	}
+
+	if (ps->pss_func == POOL_SCAN_RESILVER) {
+		(void) printf(gettext("\t%s resilvered, %.2f%% done"),
+		    processed_buf, 100 * fraction_done);
+	} else if (ps->pss_func == POOL_SCAN_SCRUB) {
+		(void) printf(gettext("\t%s repaired, %.2f%% done"),
+		    processed_buf, 100 * fraction_done);
+	}
+
+	/*
+	 * Finish the line.  An estimate is only printed for a scan that is
+	 * not paused, has a usable estimate and is issuing at least
+	 * 10 MiB per second.
+	 */
+	if (pause == 0) {
+		if (total_secs_left != UINT64_MAX &&
+		    issue_rate >= 10 * 1024 * 1024) {
+			(void) printf(gettext(", %s to go\n"), time_buf);
+		} else {
+			(void) printf(gettext(", no estimated "
+			    "completion time\n"));
+		}
+	} else {
+		(void) printf(gettext("\n"));
+	}
+}
+
+/*
+ * Print the "scan:" status lines for the rebuild described by 'vrs' on the
+ * vdev called 'vdev_name'.  Nothing is printed when 'vrs' is NULL or its
+ * state is VDEV_REBUILD_NONE.  A completed or canceled rebuild gets a
+ * one-line summary; an active one also gets progress figures and, when the
+ * scan rate is high enough, an estimate of the time remaining.
+ */
+static void
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
+{
+	if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
+		return;
+
+	printf("  ");
+	printf_color(ANSI_BOLD, gettext("scan:"));
+	printf(" ");
+
+	uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
+	uint64_t bytes_issued = vrs->vrs_bytes_issued;
+	uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
+	uint64_t bytes_est = vrs->vrs_bytes_est;
+	/*
+	 * Per-second rates for the current pass.  The "+ 1" on the
+	 * millisecond count and on the estimate below keeps the divisors
+	 * non-zero.
+	 */
+	uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
+	    (vrs->vrs_pass_time_ms + 1)) * 1000;
+	uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
+	    (vrs->vrs_pass_time_ms + 1)) * 1000;
+	/* percentage scanned, capped at 100 */
+	double scan_pct = MIN((double)bytes_scanned * 100 /
+	    (bytes_est + 1), 100);
+
+	/* Format all of the numbers we will be reporting */
+	char bytes_scanned_buf[7], bytes_issued_buf[7];
+	char bytes_rebuilt_buf[7], bytes_est_buf[7];
+	char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
+	zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
+	    sizeof (bytes_scanned_buf));
+	zfs_nicebytes(bytes_issued, bytes_issued_buf,
+	    sizeof (bytes_issued_buf));
+	zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
+	    sizeof (bytes_rebuilt_buf));
+	zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
+	zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+	zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
+
+	time_t start = vrs->vrs_start_time;
+	time_t end = vrs->vrs_end_time;
+
+	/*
+	 * Rebuild is finished or canceled.  The strings returned by ctime()
+	 * end in a newline, which is why these formats do not add one.
+	 */
+	if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) {
+		secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf);
+		(void) printf(gettext("resilvered (%s) %s in %s "
+		    "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf,
+		    time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end));
+		return;
+	} else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) {
+		(void) printf(gettext("resilver (%s) canceled on %s"),
+		    vdev_name, ctime(&end));
+		return;
+	} else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+		(void) printf(gettext("resilver (%s) in progress since %s"),
+		    vdev_name, ctime(&start));
+	}
+
+	assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
+
+	/* remaining bytes (never negative) over the scan rate (at least 1) */
+	secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
+	    MAX(scan_rate, 1), time_buf);
+
+	(void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
+	    "%s total\n"), bytes_scanned_buf, scan_rate_buf,
+	    bytes_issued_buf, issue_rate_buf, bytes_est_buf);
+	(void) printf(gettext("\t%s resilvered, %.2f%% done"),
+	    bytes_rebuilt_buf, scan_pct);
+
+	/*
+	 * Finish the line.  An estimate is only printed when scanning at
+	 * 10 MiB per second or more.  Given the assert above, the final
+	 * else branch is only reachable when asserts are compiled out.
+	 */
+	if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+		if (scan_rate >= 10 * 1024 * 1024) {
+			(void) printf(gettext(", %s to go\n"), time_buf);
+		} else {
+			(void) printf(gettext(", no estimated "
+			    "completion time\n"));
+		}
+	} else {
+		(void) printf(gettext("\n"));
+	}
+}
+
+/*
+ * Print rebuild status for top-level vdevs.
+ *
+ * Every child of 'nvroot' that carries ZPOOL_CONFIG_REBUILD_STATS is handed
+ * to print_rebuild_status_impl(); children without the stats are skipped.
+ */
+static void
+print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+	nvlist_t **child;
+	uint_t children;
+
+	/* no children array means there is nothing to report */
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (uint_t c = 0; c < children; c++) {
+		vdev_rebuild_stat_t *vrs;
+		uint_t nelem;
+		char *name;
+
+		if (nvlist_lookup_uint64_array(child[c],
+		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs,
+		    &nelem) != 0)
+			continue;
+
+		name = zpool_vdev_name(g_zfs, zhp, child[c],
+		    VDEV_NAME_TYPE_ID);
+		print_rebuild_status_impl(vrs, name);
+		free(name);
+	}
+}
+
+/*
+ * Checkpointed blocks are not scrubbed, so the user is warned that some
+ * blocks were skipped if a checkpoint exists or existed at any time during
+ * the scan.  After a sequential (rather than healing) reconstruction the
+ * blocks were reconstructed, but their checksums have not been verified, so
+ * the warning is still printed.
+ *
+ * Nothing is printed when either argument is NULL, when no checkpoint
+ * exists (or it is being discarded), when no scan has run, or when the scan
+ * ended before the checkpoint was taken.
+ */
+static void
+print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
+{
+	boolean_t scan_over;
+
+	if (ps == NULL || pcs == NULL)
+		return;
+
+	if (pcs->pcs_state == CS_NONE ||
+	    pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+		return;
+
+	assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
+
+	if (ps->pss_state == DSS_NONE)
+		return;
+
+	scan_over = (ps->pss_state == DSS_FINISHED ||
+	    ps->pss_state == DSS_CANCELED);
+
+	if (!scan_over) {
+		assert(ps->pss_state == DSS_SCANNING);
+		(void) printf(gettext("    scan warning: skipping blocks "
+		    "that are only referenced by the checkpoint.\n"));
+		return;
+	}
+
+	/* a scan that ended before the checkpoint began missed nothing */
+	if (ps->pss_end_time < pcs->pcs_start_time)
+		return;
+
+	(void) printf(gettext("    scan warning: skipped blocks "
+	    "that are only referenced by the checkpoint.\n"));
+}
+
+/*
+ * Returns B_TRUE if any child of 'nvroot' has an active rebuild, in which
+ * case 'rebuild_end_time' is set to 0.  Otherwise B_FALSE is returned and
+ * 'rebuild_end_time' is set to the latest end time found among the
+ * children's rebuild stats (0 if there are none).  'rebuild_end_time' may
+ * be NULL when the caller does not need it.
+ */
+static boolean_t
+check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time)
+{
+	nvlist_t **child;
+	uint_t children;
+	uint64_t latest_end = 0;
+	boolean_t active = B_FALSE;
+
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		children = 0;
+
+	for (uint_t c = 0; c < children; c++) {
+		vdev_rebuild_stat_t *vrs;
+		uint_t nelem;
+
+		if (nvlist_lookup_uint64_array(child[c],
+		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs,
+		    &nelem) != 0)
+			continue;
+
+		/* one active rebuild settles it; end times no longer matter */
+		if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+			active = B_TRUE;
+			latest_end = 0;
+			break;
+		}
+
+		if (vrs->vrs_end_time > latest_end)
+			latest_end = vrs->vrs_end_time;
+	}
+
+	if (rebuild_end_time != NULL)
+		*rebuild_end_time = latest_end;
+
+	return (active);
+}
+
+/*
+ * Print the scan status.
+ *
+ * The scrub status is printed whenever the scan stats describe a scrub.
+ * Then the status of whichever of resilver or rebuild is active is printed
+ * or, if neither is active, of whichever finished most recently.  Last
+ * comes the checkpoint scan warning, if one applies.
+ */
+static void
+print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+	pool_scan_stat_t *ps = NULL;
+	pool_checkpoint_stat_t *pcs = NULL;
+	boolean_t have_scrub = B_FALSE, have_resilver = B_FALSE;
+	boolean_t active_resilver = B_FALSE;
+	uint64_t resilver_end_time = 0, rebuild_end_time = 0;
+	uint_t nelem;
+
+	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &nelem) == 0) {
+		have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+		have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
+
+		if (have_resilver) {
+			active_resilver = (ps->pss_state == DSS_SCANNING);
+			resilver_end_time = ps->pss_end_time;
+		}
+	}
+
+	boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
+	boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
+
+	/*
+	 * An active operation always wins.  A past one is only shown if the
+	 * other kind is not active and it ended after the other kind did.
+	 */
+	boolean_t show_resilver = active_resilver ||
+	    (!active_rebuild && have_resilver && resilver_end_time != 0 &&
+	    resilver_end_time > rebuild_end_time);
+	boolean_t show_rebuild = active_rebuild ||
+	    (!active_resilver && have_rebuild && rebuild_end_time != 0 &&
+	    rebuild_end_time > resilver_end_time);
+
+	/* Always print the scrub status when available. */
+	if (have_scrub)
+		print_scan_scrub_resilver_status(ps);
+
+	/* resilver takes precedence when both would qualify */
+	if (show_resilver)
+		print_scan_scrub_resilver_status(ps);
+	else if (show_rebuild)
+		print_rebuild_status(zhp, nvroot);
+
+	/* the checkpoint stats are optional; pcs stays NULL if missing */
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &nelem);
+	print_checkpoint_scan_warning(ps, pcs);
+}
+
+/*
+ * Print out detailed removal status.
+ *
+ * Nothing is printed when 'prs' is NULL or its state is DSS_NONE.
+ * Otherwise a "remove:" line reports a finished, canceled or in-progress
+ * removal, followed by the memory used for removed device mappings when
+ * that is non-zero.
+ */
+static void
+print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
+{
+	char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+	time_t start, end;
+	nvlist_t *config, *nvroot;
+	nvlist_t **child;
+	uint_t children;
+	char *vdev_name;
+
+	if (prs == NULL || prs->prs_state == DSS_NONE)
+		return;
+
+	/*
+	 * Determine name of vdev.
+	 */
+	config = zpool_get_config(zhp, NULL);
+	nvroot = fnvlist_lookup_nvlist(config,
+	    ZPOOL_CONFIG_VDEV_TREE);
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0);
+	assert(prs->prs_removing_vdev < children);
+	vdev_name = zpool_vdev_name(g_zfs, zhp,
+	    child[prs->prs_removing_vdev], B_TRUE);
+
+	(void) printf(gettext("remove: "));
+
+	start = prs->prs_start_time;
+	end = prs->prs_end_time;
+	zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf));
+
+	/*
+	 * Removal is finished or canceled.  The strings returned by ctime()
+	 * end in a newline, which is why these formats do not add one.
+	 */
+	if (prs->prs_state == DSS_FINISHED) {
+		uint64_t minutes_taken = (end - start) / 60;
+
+		(void) printf(gettext("Removal of vdev %llu copied %s "
+		    "in %lluh%um, completed on %s"),
+		    (u_longlong_t)prs->prs_removing_vdev,
+		    copied_buf,
+		    (u_longlong_t)(minutes_taken / 60),
+		    (uint_t)(minutes_taken % 60),
+		    ctime(&end));
+	} else if (prs->prs_state == DSS_CANCELED) {
+		(void) printf(gettext("Removal of %s canceled on %s"),
+		    vdev_name, ctime(&end));
+	} else {
+		uint64_t copied, total, elapsed, mins_left, hours_left;
+		double fraction_done;
+		/*
+		 * 64 bits wide so that copy rates of 4 GiB/s and above are
+		 * not truncated before being printed and used for the ETA.
+		 */
+		uint64_t rate;
+
+		assert(prs->prs_state == DSS_SCANNING);
+
+		/*
+		 * Removal is in progress.
+		 */
+		(void) printf(gettext(
+		    "Evacuation of %s in progress since %s"),
+		    vdev_name, ctime(&start));
+
+		/* clamp to at least 1 so the rate below is never zero-based */
+		copied = prs->prs_copied > 0 ? prs->prs_copied : 1;
+		total = prs->prs_to_copy;
+		/*
+		 * NOTE(review): 'total' is not checked against zero or
+		 * against 'copied', so the percentage and the remaining
+		 * time rely on prs_to_copy >= copied -- confirm.
+		 */
+		fraction_done = (double)copied / total;
+
+		/* elapsed time for this pass */
+		elapsed = time(NULL) - prs->prs_start_time;
+		elapsed = elapsed > 0 ? elapsed : 1;
+		rate = copied / elapsed;
+		rate = rate > 0 ? rate : 1;
+		mins_left = ((total - copied) / rate) / 60;
+		hours_left = mins_left / 60;
+
+		zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
+		zfs_nicenum(total, total_buf, sizeof (total_buf));
+		zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+		/*
+		 * do not print estimated time if hours_left is more than
+		 * 30 days
+		 */
+		(void) printf(gettext("    %s copied out of %s at %s/s, "
+		    "%.2f%% done"),
+		    examined_buf, total_buf, rate_buf, 100 * fraction_done);
+		if (hours_left < (30 * 24)) {
+			(void) printf(gettext(", %lluh%um to go\n"),
+			    (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+		} else {
+			(void) printf(gettext(
+			    ", (copy is slow, no estimated time)\n"));
+		}
+	}
+	free(vdev_name);
+
+	if (prs->prs_mapping_memory > 0) {
+		char mem_buf[7];
+		zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf));
+		(void) printf(gettext("    %s memory used for "
+		    "removed device mappings\n"),
+		    mem_buf);
+	}
+}
+
+/*
+ * Print the "checkpoint:" status line: either when the checkpoint was
+ * created and how much space it consumes, or how much space remains while
+ * it is being discarded.  Nothing is printed when 'pcs' is NULL or no
+ * checkpoint exists.
+ */
+static void
+print_checkpoint_status(pool_checkpoint_stat_t *pcs)
+{
+	char space_buf[7];
+	char *created;
+	time_t start;
+
+	if (pcs == NULL || pcs->pcs_state == CS_NONE)
+		return;
+
+	(void) printf(gettext("checkpoint: "));
+
+	start = pcs->pcs_start_time;
+	zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
+
+	if (pcs->pcs_state != CS_CHECKPOINT_EXISTS) {
+		assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+
+		(void) printf(gettext("discarding, %s remaining.\n"),
+		    space_buf);
+		return;
+	}
+
+	/*
+	 * The string from ctime() ends in a newline.  The "%.*s"
+	 * precision of strlen() - 1 keeps it out of the output.
+	 */
+	created = ctime(&start);
+	(void) printf(gettext("created %.*s, consumes %s\n"),
+	    (int)(strlen(created) - 1), created, space_buf);
+}
+
+/*
+ * Print the path of every object recorded in the pool's error log, one per
+ * line, under an "errors:" heading.  Nothing is printed if the error log
+ * cannot be retrieved.
+ */
+static void
+print_error_log(zpool_handle_t *zhp)
+{
+	const size_t pathlen = MAXPATHLEN * 2;
+	nvlist_t *nverrlist = NULL;
+	nvpair_t *elem;
+	char *pathname;
+
+	if (zpool_get_errlog(zhp, &nverrlist) != 0)
+		return;
+
+	(void) printf("errors: Permanent errors have been "
+	    "detected in the following files:\n\n");
+
+	/* one scratch buffer is reused for every entry */
+	pathname = safe_malloc(pathlen);
+
+	for (elem = nvlist_next_nvpair(nverrlist, NULL); elem != NULL;
+	    elem = nvlist_next_nvpair(nverrlist, elem)) {
+		uint64_t dsobj, obj;
+		nvlist_t *nv;
+
+		/* each entry is an nvlist naming a dataset and an object */
+		verify(nvpair_value_nvlist(elem, &nv) == 0);
+		verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET,
+		    &dsobj) == 0);
+		verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT,
+		    &obj) == 0);
+
+		zpool_obj_to_path(zhp, dsobj, obj, pathname, pathlen);
+		(void) printf("%7s %s\n", "", pathname);
+	}
+
+	free(pathname);
+	nvlist_free(nverrlist);
+}
+
+/*
+ * Print the "spares" section of the status output: a heading followed by
+ * one print_status_config() entry per spare.  Prints nothing when there are
+ * no spares.
+ */
+static void
+print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
+    uint_t nspares)
+{
+	if (nspares == 0)
+		return;
+
+	(void) printf(gettext("\tspares\n"));
+
+	for (uint_t s = 0; s < nspares; s++) {
+		/* the name is allocated by zpool_vdev_name(); free it here */
+		char *vname = zpool_vdev_name(g_zfs, zhp, spares[s],
+		    cb->cb_name_flags);
+
+		print_status_config(zhp, cb, vname, spares[s], 2, B_TRUE,
+		    NULL);
+		free(vname);
+	}
+}
+
+/*
+ * Print the "cache" section of the status config listing: a heading
+ * followed by one print_status_config() entry per L2ARC device.  Nothing
+ * is printed when there are no cache devices.
+ */
+static void
+print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
+    uint_t nl2cache)
+{
+	uint_t idx = 0;
+
+	if (nl2cache != 0) {
+		(void) printf(gettext("\tcache\n"));
+
+		while (idx < nl2cache) {
+			nvlist_t *dev = l2cache[idx];
+			/* zpool_vdev_name() hands back a string we must free */
+			char *vname = zpool_vdev_name(g_zfs, zhp, dev,
+			    cb->cb_name_flags);
+
+			print_status_config(zhp, cb, vname, dev, 2, B_FALSE,
+			    NULL);
+			free(vname);
+			idx++;
+		}
+	}
+}
+
+/*
+ * Print the " dedup:" section of 'zpool status -D'.
+ *
+ * The DDT object statistics are looked up in the pool config; when they
+ * are absent the function prints nothing.  With an empty dedup table a
+ * short notice is printed, otherwise a summary line followed by the DDT
+ * histogram dumped by zpool_dump_ddt().
+ */
+static void
+print_dedup_stats(nvlist_t *config)
+{
+	ddt_object_t *objstats;
+	ddt_stat_t *totals;
+	ddt_histogram_t *histogram;
+	uint_t nelem;
+	char ondisk[6], incore[6];
+
+	/*
+	 * A faulted pool may not have provided a full config, in which
+	 * case the DDT object stats are missing and there is nothing
+	 * to show.
+	 */
+	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
+	    (uint64_t **)&objstats, &nelem) != 0)
+		return;
+
+	(void) printf("\n");
+	(void) printf(gettext(" dedup: "));
+
+	if (objstats->ddo_count != 0) {
+		zfs_nicebytes(objstats->ddo_dspace, ondisk, sizeof (ondisk));
+		zfs_nicebytes(objstats->ddo_mspace, incore, sizeof (incore));
+		(void) printf("DDT entries %llu, size %s on disk, %s in core\n",
+		    (u_longlong_t)objstats->ddo_count,
+		    ondisk,
+		    incore);
+
+		/* With entries present, the detailed stats must exist. */
+		verify(nvlist_lookup_uint64_array(config,
+		    ZPOOL_CONFIG_DDT_STATS,
+		    (uint64_t **)&totals, &nelem) == 0);
+		verify(nvlist_lookup_uint64_array(config,
+		    ZPOOL_CONFIG_DDT_HISTOGRAM,
+		    (uint64_t **)&histogram, &nelem) == 0);
+		zpool_dump_ddt(totals, histogram);
+	} else {
+		(void) printf(gettext("no DDT entries\n"));
+	}
+}
+
+/*
+ * Display a summary of pool status.  Displays a summary such as:
+ *
+ *        pool: tank
+ *       state: DEGRADED
+ *      status: One or more devices ...
+ *      action: ...
+ *         see: https://zfsonlinux.org/msg/ZFS-xxxx-01
+ *      config:
+ *		mirror		DEGRADED
+ *                c1t0d0	OK
+ *                c2t0d0	UNAVAIL
+ *
+ * This is the for_each_pool() callback used by 'zpool status'; 'data' is
+ * the status_cbdata_t describing the requested options.  With '-x'
+ * (cb_explain) pools whose status is OK, VERSION_OLDER or FEAT_DISABLED
+ * are not described in detail.  With '-v' (cb_verbose) the full list of
+ * files with permanent errors is printed instead of just a count, and
+ * with '-D' (cb_dedup_stats) the dedup table statistics are appended.
+ *
+ * Always returns 0.
+ */
+static int
+status_callback(zpool_handle_t *zhp, void *data)
+{
+	status_cbdata_t *cbp = data;
+	nvlist_t *config, *nvroot;
+	char *msgid;
+	zpool_status_t reason;
+	zpool_errata_t errata;
+	const char *health;
+	uint_t c;
+	vdev_stat_t *vs;
+
+	config = zpool_get_config(zhp, NULL);
+	reason = zpool_get_status(zhp, &msgid, &errata);
+
+	cbp->cb_count++;
+
+	/*
+	 * If we were given 'zpool status -x', only report those pools with
+	 * problems.
+	 */
+	if (cbp->cb_explain &&
+	    (reason == ZPOOL_STATUS_OK ||
+	    reason == ZPOOL_STATUS_VERSION_OLDER ||
+	    reason == ZPOOL_STATUS_FEAT_DISABLED)) {
+		if (!cbp->cb_allpools) {
+			(void) printf(gettext("pool '%s' is healthy\n"),
+			    zpool_get_name(zhp));
+			if (cbp->cb_first)
+				cbp->cb_first = B_FALSE;
+		}
+		return (0);
+	}
+
+	/* Separate consecutive pool reports with a blank line. */
+	if (cbp->cb_first)
+		cbp->cb_first = B_FALSE;
+	else
+		(void) printf("\n");
+
+	/*
+	 * NOTE(review): config is dereferenced here, before the
+	 * 'config != NULL' test further down -- confirm whether
+	 * zpool_get_config() can actually return NULL at this point.
+	 */
+	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+
+	health = zpool_get_state_str(zhp);
+
+	printf("  ");
+	printf_color(ANSI_BOLD, gettext("pool:"));
+	printf(" %s\n", zpool_get_name(zhp));
+	printf(" ");
+	printf_color(ANSI_BOLD, gettext("state: "));
+
+	printf_color(health_str_to_color(health), "%s", health);
+
+	printf("\n");
+
+	/*
+	 * Explain the status: a bold "status:" / "action:" label followed
+	 * by the yellow explanatory text for every known reason.
+	 */
+	switch (reason) {
+	case ZPOOL_STATUS_MISSING_DEV_R:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices could "
+		    "not be opened.  Sufficient replicas exist for\n\tthe pool "
+		    "to continue functioning in a degraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Attach the missing device "
+		    "and online it using 'zpool online'.\n"));
+		break;
+
+	case ZPOOL_STATUS_MISSING_DEV_NR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices could "
+		    "not be opened.  There are insufficient\n\treplicas for the"
+		    " pool to continue functioning.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Attach the missing device "
+		    "and online it using 'zpool online'.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_R:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices could "
+		    "not be used because the label is missing or\n\tinvalid.  "
+		    "Sufficient replicas exist for the pool to continue\n\t"
+		    "functioning in a degraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Replace the device using "
+		    "'zpool replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices could "
+		    "not be used because the label is missing \n\tor invalid.  "
+		    "There are insufficient replicas for the pool to "
+		    "continue\n\tfunctioning.\n"));
+		zpool_explain_recover(zpool_get_handle(zhp),
+		    zpool_get_name(zhp), reason, config);
+		break;
+
+	case ZPOOL_STATUS_FAILING_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "experienced an unrecoverable error.  An\n\tattempt was "
+		    "made to correct the error.  Applications are "
+		    "unaffected.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Determine if the "
+		    "device needs to be replaced, and clear the errors\n\tusing"
+		    " 'zpool clear' or replace the device with 'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_OFFLINE_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "been taken offline by the administrator.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning in "
+		    "a\n\tdegraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Online the device "
+		    "using 'zpool online' or replace the device with\n\t'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_REMOVED_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "been removed by the administrator.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning in "
+		    "a\n\tdegraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Online the device "
+		    "using 'zpool online' or replace the device with\n\t'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_RESILVERING:
+	case ZPOOL_STATUS_REBUILDING:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices is "
+		    "currently being resilvered.  The pool will\n\tcontinue "
+		    "to function, possibly in a degraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Wait for the resilver to "
+		    "complete.\n"));
+		break;
+
+	case ZPOOL_STATUS_REBUILD_SCRUB:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices have "
+		    "been sequentially resilvered, scrubbing\n\tthe pool "
+		    "is recommended.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
+		    "verify all data checksums.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_DATA:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "experienced an error resulting in data\n\tcorruption.  "
+		    "Applications may be affected.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Restore the file in question"
+		    " if possible.  Otherwise restore the\n\tentire pool from "
+		    "backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_POOL:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool metadata is "
+		    "corrupted and the pool cannot be opened.\n"));
+		zpool_explain_recover(zpool_get_handle(zhp),
+		    zpool_get_name(zhp), reason, config);
+		break;
+
+	case ZPOOL_STATUS_VERSION_OLDER:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+		    "a legacy on-disk format.  The pool can\n\tstill be used, "
+		    "but some features are unavailable.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Upgrade the pool using "
+		    "'zpool upgrade'.  Once this is done, the\n\tpool will no "
+		    "longer be accessible on software that does not support\n\t"
+		    "feature flags.\n"));
+		break;
+
+	case ZPOOL_STATUS_VERSION_NEWER:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool has been upgraded "
+		    "to a newer, incompatible on-disk version.\n\tThe pool "
+		    "cannot be accessed on this system.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Access the pool from a "
+		    "system running more recent software, or\n\trestore the "
+		    "pool from backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_FEAT_DISABLED:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Some supported features are "
+		    "not enabled on the pool. The pool can\n\tstill be used, "
+		    "but some features are unavailable.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Enable all features using "
+		    "'zpool upgrade'. Once this is done,\n\tthe pool may no "
+		    "longer be accessible by software that does not support\n\t"
+		    "the features. See zpool-features(5) for details.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_READ:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed "
+		    "on this system because it uses the\n\tfollowing feature(s)"
+		    " not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		(void) printf("\n");
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Access the pool from a "
+		    "system that supports the required feature(s),\n\tor "
+		    "restore the pool from backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool can only be "
+		    "accessed in read-only mode on this system. It\n\tcannot be"
+		    " accessed in read-write mode because it uses the "
+		    "following\n\tfeature(s) not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		(void) printf("\n");
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed "
+		    "in read-write mode. Import the pool with\n"
+		    "\t\"-o readonly=on\", access the pool from a system that "
+		    "supports the\n\trequired feature(s), or restore the "
+		    "pool from backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_FAULTED_DEV_R:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "faulted in response to persistent errors.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning "
+		    "in a\n\tdegraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Replace the faulted device, "
+		    "or use 'zpool clear' to mark the device\n\trepaired.\n"));
+		break;
+
+	case ZPOOL_STATUS_FAULTED_DEV_NR:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "faulted in response to persistent errors.  There are "
+		    "insufficient replicas for the pool to\n\tcontinue "
+		    "functioning.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Destroy and re-create the "
+		    "pool from a backup source.  Manually marking the device\n"
+		    "\trepaired using 'zpool clear' may allow some data "
+		    "to be recovered.\n"));
+		break;
+
+	case ZPOOL_STATUS_IO_FAILURE_MMP:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is suspended "
+		    "because multihost writes failed or were delayed;\n\t"
+		    "another system could import the pool undetected.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
+		    " are connected, then reboot your system and\n\timport the "
+		    "pool.\n"));
+		break;
+
+	case ZPOOL_STATUS_IO_FAILURE_WAIT:
+	case ZPOOL_STATUS_IO_FAILURE_CONTINUE:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "faulted in response to IO failures.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Make sure the affected "
+		    "devices are connected, then run 'zpool clear'.\n"));
+		break;
+
+	case ZPOOL_STATUS_BAD_LOG:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("An intent log record "
+		    "could not be read.\n"
+		    "\tWaiting for administrator intervention to fix the "
+		    "faulted pool.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Either restore the affected "
+		    "device(s) and run 'zpool online',\n"
+		    "\tor ignore the intent log records by running "
+		    "'zpool clear'.\n"));
+		break;
+
+	case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+		/* Use the same label/text colouring as every other status. */
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices are "
+		    "configured to use a non-native block size.\n"
+		    "\tExpect reduced performance.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Replace affected devices "
+		    "with devices that support the\n\tconfigured block size, "
+		    "or migrate data to a properly configured\n\tpool.\n"));
+		break;
+
+	case ZPOOL_STATUS_HOSTID_MISMATCH:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid"
+		    " and system hostid on imported pool.\n\tThis pool was "
+		    "previously imported into a system with a different "
+		    "hostid,\n\tand then was verbatim imported into this "
+		    "system.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Export this pool on all "
+		    "systems on which it is imported.\n"
+		    "\tThen import it to correct the mismatch.\n"));
+		break;
+
+	case ZPOOL_STATUS_ERRATA:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"),
+		    errata);
+
+		switch (errata) {
+		case ZPOOL_ERRATA_NONE:
+			break;
+
+		case ZPOOL_ERRATA_ZOL_2094_SCRUB:
+			printf_color(ANSI_BOLD, gettext("action: "));
+			printf_color(ANSI_YELLOW, gettext("To correct the issue"
+			    " run 'zpool scrub'.\n"));
+			break;
+
+		case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION:
+			(void) printf(gettext("\tExisting encrypted datasets "
+			    "contain an on-disk incompatibility\n\twhich "
+			    "needs to be corrected.\n"));
+			printf_color(ANSI_BOLD, gettext("action: "));
+			printf_color(ANSI_YELLOW, gettext("To correct the issue"
+			    " backup existing encrypted datasets to new\n\t"
+			    "encrypted datasets and destroy the old ones. "
+			    "'zfs mount -o ro' can\n\tbe used to temporarily "
+			    "mount existing encrypted datasets readonly.\n"));
+			break;
+
+		case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION:
+			(void) printf(gettext("\tExisting encrypted snapshots "
+			    "and bookmarks contain an on-disk\n\tincompat"
+			    "ibility. This may cause on-disk corruption if "
+			    "they are used\n\twith 'zfs recv'.\n"));
+			printf_color(ANSI_BOLD, gettext("action: "));
+			printf_color(ANSI_YELLOW, gettext("To correct the "
+			    "issue, enable the bookmark_v2 feature. No "
+			    "additional\n\taction is needed if there are no "
+			    "encrypted snapshots or bookmarks.\n\tIf preserving"
+			    " the encrypted snapshots and bookmarks is "
+			    "required, use\n\ta non-raw send to backup and "
+			    "restore them. Alternately, they may be\n\tremoved "
+			    "to resolve the incompatibility.\n"));
+			break;
+
+		default:
+			/*
+			 * All errata which allow the pool to be imported
+			 * must contain an action message.
+			 */
+			assert(0);
+		}
+		break;
+
+	default:
+		/*
+		 * The remaining errors can't actually be generated, yet.
+		 */
+		assert(reason == ZPOOL_STATUS_OK);
+	}
+
+	if (msgid != NULL) {
+		printf("   ");
+		printf_color(ANSI_BOLD, gettext("see:"));
+		printf(gettext(" https://zfsonlinux.org/msg/%s\n"), msgid);
+	}
+
+	if (config != NULL) {
+		uint64_t nerr;
+		nvlist_t **spares, **l2cache;
+		uint_t nspares, nl2cache;
+		pool_checkpoint_stat_t *pcs = NULL;
+		pool_removal_stat_t *prs = NULL;
+
+		print_scan_status(zhp, nvroot);
+
+		/*
+		 * Removal and checkpoint stats are optional; a failed
+		 * lookup leaves the pointer NULL and the printers cope.
+		 */
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+		print_removal_status(zhp, prs);
+
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+		print_checkpoint_status(pcs);
+
+		/* The NAME column is at least 10 characters wide. */
+		cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
+		    cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
+		if (cbp->cb_namewidth < 10)
+			cbp->cb_namewidth = 10;
+
+		color_start(ANSI_BOLD);
+		(void) printf(gettext("config:\n\n"));
+		(void) printf(gettext("\t%-*s  %-8s %5s %5s %5s"),
+		    cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
+		    "CKSUM");
+		color_end();
+
+		if (cbp->cb_print_slow_ios) {
+			printf_color(ANSI_BOLD, " %5s", gettext("SLOW"));
+		}
+
+		if (cbp->vcdl != NULL)
+			print_cmd_columns(cbp->vcdl, 0);
+
+		printf("\n");
+
+		print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
+		    B_FALSE, NULL);
+
+		print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+		print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+		print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS);
+
+		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    &l2cache, &nl2cache) == 0)
+			print_l2cache(zhp, cbp, l2cache, nl2cache);
+
+		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    &spares, &nspares) == 0)
+			print_spares(zhp, cbp, spares, nspares);
+
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
+		    &nerr) == 0) {
+			nvlist_t *nverrlist = NULL;
+
+			/*
+			 * If the approximate error count is small, get a
+			 * precise count by fetching the entire log and
+			 * uniquifying the results.
+			 */
+			if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
+			    zpool_get_errlog(zhp, &nverrlist) == 0) {
+				nvpair_t *elem;
+
+				elem = NULL;
+				nerr = 0;
+				while ((elem = nvlist_next_nvpair(nverrlist,
+				    elem)) != NULL) {
+					nerr++;
+				}
+			}
+			nvlist_free(nverrlist);
+
+			(void) printf("\n");
+
+			if (nerr == 0)
+				(void) printf(gettext("errors: No known data "
+				    "errors\n"));
+			else if (!cbp->cb_verbose)
+				(void) printf(gettext("errors: %llu data "
+				    "errors, use '-v' for a list\n"),
+				    (u_longlong_t)nerr);
+			else
+				print_error_log(zhp);
+		}
+
+		if (cbp->cb_dedup_stats)
+			print_dedup_stats(config);
+	} else {
+		(void) printf(gettext("config: The configuration cannot be "
+		    "determined.\n"));
+	}
+
+	return (0);
+}
+
+/*
+ * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ...
+ *              [interval [count]]
+ *
+ *	-c CMD	For each vdev, run command CMD
+ *	-i	Display vdev initialization status.
+ *	-g	Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-p	Display values in parsable (exact) format.
+ *	-P	Display full path for vdev name.
+ *	-s	Display slow IOs column.
+ *	-v	Display complete error logs
+ *	-x	Display only pools with potential problems
+ *	-D	Display dedup status (undocumented)
+ *	-t	Display vdev TRIM status.
+ *	-T	Display a timestamp in date(1) or Unix format
+ *
+ * Describes the health status of all pools or some subset.  When an
+ * interval is given the report is repeated every 'interval' seconds,
+ * 'count' times if a count is given and indefinitely otherwise.
+ *
+ * Returns 0 on success, or the first non-zero result of for_each_pool().
+ */
+int
+zpool_do_status(int argc, char **argv)
+{
+	status_cbdata_t cb = { 0 };
+	char *script = NULL;
+	unsigned long count = 0;
+	float interval = 0;
+	int opt;
+	int err;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) {
+		switch (opt) {
+		case 'c':
+			/* Only a single -c argument is accepted. */
+			if (script != NULL) {
+				fprintf(stderr,
+				    gettext("Can't set -c flag twice\n"));
+				exit(1);
+			}
+
+			/*
+			 * ZPOOL_SCRIPTS_ENABLED present in the environment
+			 * but not "set" disables scripts altogether.
+			 */
+			if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL &&
+			    !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) {
+				fprintf(stderr, gettext(
+				    "Can't run -c, disabled by "
+				    "ZPOOL_SCRIPTS_ENABLED.\n"));
+				exit(1);
+			}
+
+			/* Root needs to opt in explicitly. */
+			if ((getuid() <= 0 || geteuid() <= 0) &&
+			    !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) {
+				fprintf(stderr, gettext(
+				    "Can't run -c with root privileges "
+				    "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n"));
+				exit(1);
+			}
+			script = optarg;
+			break;
+
+		/* vdev name formatting */
+		case 'g':
+			cb.cb_name_flags |= VDEV_NAME_GUID;
+			break;
+		case 'L':
+			cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+			break;
+		case 'P':
+			cb.cb_name_flags |= VDEV_NAME_PATH;
+			break;
+
+		/* simple on/off switches */
+		case 'i':
+			cb.cb_print_vdev_init = B_TRUE;
+			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 's':
+			cb.cb_print_slow_ios = B_TRUE;
+			break;
+		case 't':
+			cb.cb_print_vdev_trim = B_TRUE;
+			break;
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
+		case 'x':
+			cb.cb_explain = B_TRUE;
+			break;
+		case 'D':
+			cb.cb_dedup_stats = B_TRUE;
+			break;
+
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+
+		case '?':
+			/* A bare "-c" lists the available scripts. */
+			if (optopt == 'c') {
+				print_zpool_script_list("status");
+				exit(0);
+			}
+			fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argv += optind;
+	argc -= optind;
+
+	get_interval_count(&argc, argv, &interval, &count);
+
+	/* Without pool arguments every pool is reported. */
+	if (argc == 0)
+		cb.cb_allpools = B_TRUE;
+
+	cb.cb_print_status = B_TRUE;
+	cb.cb_first = B_TRUE;
+
+	for (;;) {
+		if (timestamp_fmt != NODATE)
+			print_timestamp(timestamp_fmt);
+
+		/* Run the -c script(s) up front for all selected pools. */
+		if (script != NULL)
+			cb.vcdl = all_pools_for_each_vdev_run(argc, argv,
+			    script, NULL, NULL, 0, 0);
+
+		err = for_each_pool(argc, argv, B_TRUE, NULL,
+		    status_callback, &cb);
+
+		if (cb.vcdl != NULL)
+			free_vdev_cmd_data_list(cb.vcdl);
+
+		if (argc == 0 && cb.cb_count == 0)
+			(void) fprintf(stderr, gettext("no pools available\n"));
+		else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+			(void) printf(gettext("all pools are healthy\n"));
+
+		if (err != 0)
+			return (err);
+
+		/*
+		 * Stop after a single pass when no interval was given, or
+		 * once the requested number of passes has been made.
+		 */
+		if (interval == 0 || (count != 0 && --count == 0))
+			break;
+
+		(void) fsleep(interval);
+	}
+
+	return (0);
+}
+
+/*
+ * State shared between zpool_do_upgrade() and its per-pool callbacks.
+ */
+typedef struct upgrade_cbdata {
+	/* Cleared by the callbacks once they have printed or upgraded. */
+	int	cb_first;
+	/* Argument count as passed to zpool_do_upgrade(). */
+	int	cb_argc;
+	/* Target pool version (SPA_VERSION unless -V was given). */
+	uint64_t cb_version;
+	/* Argument vector as passed to zpool_do_upgrade(). */
+	char	**cb_argv;
+} upgrade_cbdata_t;
+
+/*
+ * Filesystem iterator callback: report a filesystem whose ZPL version is
+ * newer than this implementation supports and bump the counter that
+ * 'unsupp_fs' (an int *) points to, then recurse into the child
+ * filesystems.  The handle is closed before returning.  Always returns 0.
+ */
+static int
+check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs)
+{
+	int *nunsupported = (int *)unsupp_fs;
+	int zplver = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+
+	if (zplver > ZPL_VERSION) {
+		(void) printf(gettext("%s (v%d) is not supported by this "
+		    "implementation of ZFS.\n"),
+		    zfs_get_name(zhp), zplver);
+		*nunsupported += 1;
+	}
+
+	/* Descend into the children with the same counter. */
+	(void) zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs);
+
+	zfs_close(zhp);
+
+	return (0);
+}
+
+/*
+ * Upgrade the pool to the given on-disk version.
+ *
+ * The upgrade is refused (return 1) when any filesystem in the pool has a
+ * ZPL version newer than this implementation supports.  Errors from the
+ * filesystem walk or from zpool_upgrade() are returned unchanged; on
+ * success a confirmation is printed and 0 is returned.
+ */
+static int
+upgrade_version(zpool_handle_t *zhp, uint64_t version)
+{
+	nvlist_t *poolcfg = zpool_get_config(zhp, NULL);
+	uint64_t fromver;
+	int nunsupported = 0;
+	int err;
+
+	verify(nvlist_lookup_uint64(poolcfg, ZPOOL_CONFIG_VERSION,
+	    &fromver) == 0);
+
+	assert(SPA_VERSION_IS_SUPPORTED(fromver));
+	assert(fromver < version);
+
+	/* Count filesystems that this implementation cannot handle. */
+	err = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs,
+	    &nunsupported);
+	if (err != 0)
+		return (err);
+
+	if (nunsupported != 0) {
+		(void) fprintf(stderr, gettext("Upgrade not performed due "
+		    "to %d unsupported filesystems (max v%d).\n"),
+		    nunsupported, (int)ZPL_VERSION);
+		return (1);
+	}
+
+	err = zpool_upgrade(zhp, version);
+	if (err != 0)
+		return (err);
+
+	/* Feature-flag pools have no meaningful target version number. */
+	if (version < SPA_VERSION_FEATURES) {
+		(void) printf(gettext("Successfully upgraded "
+		    "'%s' from version %llu to version %llu.\n"),
+		    zpool_get_name(zhp), (u_longlong_t)fromver,
+		    (u_longlong_t)version);
+	} else {
+		(void) printf(gettext("Successfully upgraded "
+		    "'%s' from version %llu to feature flags.\n"),
+		    zpool_get_name(zhp), (u_longlong_t)fromver);
+	}
+
+	return (0);
+}
+
+/*
+ * Enable every feature in spa_feature_table that is not yet enabled on
+ * the pool, printing the name of each feature as it is enabled.
+ *
+ * Returns 0 on success and stores the number of newly enabled features
+ * in *countp (when countp is not NULL).  If setting a feature property
+ * fails, that error is returned immediately and *countp is left untouched.
+ */
+static int
+upgrade_enable_all(zpool_handle_t *zhp, int *countp)
+{
+	nvlist_t *enabled = zpool_get_features(zhp);
+	boolean_t need_header = B_TRUE;
+	int nenabled = 0;
+	int idx;
+
+	for (idx = 0; idx < SPA_FEATURES; idx++) {
+		const char *fname = spa_feature_table[idx].fi_uname;
+		const char *fguid = spa_feature_table[idx].fi_guid;
+		char *propname;
+		int err;
+
+		/* Already enabled: nothing to do for this feature. */
+		if (nvlist_exists(enabled, fguid))
+			continue;
+
+		verify(-1 != asprintf(&propname, "feature@%s", fname));
+		err = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED);
+		free(propname);
+		if (err != 0)
+			return (err);
+
+		nenabled++;
+
+		/* The heading is printed once, before the first feature. */
+		if (need_header) {
+			(void) printf(gettext("Enabled the "
+			    "following features on '%s':\n"),
+			    zpool_get_name(zhp));
+			need_header = B_FALSE;
+		}
+		(void) printf(gettext("  %s\n"), fname);
+	}
+
+	if (countp != NULL)
+		*countp = nenabled;
+	return (0);
+}
+
+/*
+ * Per-pool callback that brings a pool up to cbp->cb_version and, when
+ * that version supports feature flags, enables all supported features.
+ * 'arg' is the upgrade_cbdata_t; cb_first is cleared as soon as anything
+ * was changed.  Returns 0 on success or the error of the failing step.
+ */
+static int
+upgrade_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *poolcfg = zpool_get_config(zhp, NULL);
+	boolean_t changed = B_FALSE;
+	uint64_t poolver;
+	int err;
+
+	verify(nvlist_lookup_uint64(poolcfg, ZPOOL_CONFIG_VERSION,
+	    &poolver) == 0);
+
+	assert(SPA_VERSION_IS_SUPPORTED(poolver));
+
+	if (poolver < cbp->cb_version) {
+		cbp->cb_first = B_FALSE;
+		err = upgrade_version(zhp, cbp->cb_version);
+		if (err != 0)
+			return (err);
+		changed = B_TRUE;
+
+		/*
+		 * If they did "zpool upgrade -a", then we could
+		 * be doing ioctls to different pools.  We need
+		 * to log this history once to each pool, and bypass
+		 * the normal history logging that happens in main().
+		 */
+		(void) zpool_log_history(g_zfs, history_str);
+		log_history = B_FALSE;
+	}
+
+	if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+		int nenabled;
+
+		err = upgrade_enable_all(zhp, &nenabled);
+		if (err != 0)
+			return (err);
+
+		if (nenabled > 0) {
+			changed = B_TRUE;
+			cbp->cb_first = B_FALSE;
+		}
+	}
+
+	/* Leave a blank line after the report of a modified pool. */
+	if (changed)
+		(void) printf(gettext("\n"));
+
+	return (0);
+}
+
+/*
+ * Per-pool callback that lists pools still using a legacy (pre feature
+ * flags) version number.  An explanatory heading is printed before the
+ * first such pool, tracked through cb_first of the upgrade_cbdata_t
+ * passed in 'arg'.  Always returns 0.
+ */
+static int
+upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *poolcfg = zpool_get_config(zhp, NULL);
+	uint64_t poolver;
+
+	verify(nvlist_lookup_uint64(poolcfg, ZPOOL_CONFIG_VERSION,
+	    &poolver) == 0);
+
+	assert(SPA_VERSION_IS_SUPPORTED(poolver));
+
+	/* Pools already on feature flags are not listed here. */
+	if (poolver >= SPA_VERSION_FEATURES)
+		return (0);
+
+	if (cbp->cb_first) {
+		(void) printf(gettext("The following pools are "
+		    "formatted with legacy version numbers and can\n"
+		    "be upgraded to use feature flags.  After "
+		    "being upgraded, these pools\nwill no "
+		    "longer be accessible by software that does not "
+		    "support feature\nflags.\n\n"));
+		(void) printf(gettext("VER  POOL\n"));
+		(void) printf(gettext("---  ------------\n"));
+		cbp->cb_first = B_FALSE;
+	}
+
+	(void) printf("%2llu   %s\n", (u_longlong_t)poolver,
+	    zpool_get_name(zhp));
+
+	return (0);
+}
+
+/*
+ * Per-pool callback that lists, for pools using feature flags, every
+ * supported feature that is not enabled on the pool.
+ *
+ * An explanatory heading is printed before the first pool with a disabled
+ * feature (tracked through cb_first of the upgrade_cbdata_t passed in
+ * 'arg'), and the pool name is printed once before its first feature.
+ * Pools with a legacy version number are ignored.  Always returns 0.
+ */
+static int
+upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *config;
+	uint64_t version;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &version) == 0);
+
+	if (version >= SPA_VERSION_FEATURES) {
+		int i;
+		boolean_t poolfirst = B_TRUE;
+		nvlist_t *enabled = zpool_get_features(zhp);
+
+		for (i = 0; i < SPA_FEATURES; i++) {
+			const char *fguid = spa_feature_table[i].fi_guid;
+			const char *fname = spa_feature_table[i].fi_uname;
+
+			if (nvlist_exists(enabled, fguid))
+				continue;
+
+			if (cbp->cb_first) {
+				(void) printf(gettext("\nSome "
+				    "supported features are not "
+				    "enabled on the following pools. "
+				    "Once a\nfeature is enabled the "
+				    "pool may become incompatible with "
+				    "software\nthat does not support "
+				    "the feature. See "
+				    "zpool-features(5) for "
+				    "details.\n\n"));
+				(void) printf(gettext("POOL  "
+				    "FEATURE\n"));
+				(void) printf(gettext("------"
+				    "---------\n"));
+				cbp->cb_first = B_FALSE;
+			}
+
+			if (poolfirst) {
+				(void) printf(gettext("%s\n"),
+				    zpool_get_name(zhp));
+				poolfirst = B_FALSE;
+			}
+
+			(void) printf(gettext("      %s\n"), fname);
+		}
+
+		/*
+		 * If they did "zpool upgrade -a", then we could
+		 * be doing ioctls to different pools.  We need
+		 * to log this history once to each pool, and bypass
+		 * the normal history logging that happens in main().
+		 *
+		 * This is done after the feature loop rather than inside
+		 * it, so that the history really is logged only once per
+		 * pool instead of once per entry in spa_feature_table.
+		 */
+		(void) zpool_log_history(g_zfs, history_str);
+		log_history = B_FALSE;
+	}
+
+	return (0);
+}
+
+/*
+ * Per-pool callback used when pools are named explicitly on the command
+ * line: upgrade the pool to cbp->cb_version and, when that version
+ * supports feature flags, enable all supported features.
+ *
+ * A pool named "log" is rejected (return 1) because the name is reserved.
+ * A pool that is already at or beyond the requested version is reported
+ * and left alone.  Returns 0 on success or the error of the failing step.
+ */
+/* ARGSUSED */
+static int
+upgrade_one(zpool_handle_t *zhp, void *data)
+{
+	upgrade_cbdata_t *cbp = data;
+	boolean_t changed = B_FALSE;
+	uint64_t poolver;
+	int err;
+
+	if (strcmp("log", zpool_get_name(zhp)) == 0) {
+		(void) fprintf(stderr, gettext("'log' is now a reserved word\n"
+		    "Pool 'log' must be renamed using export and import"
+		    " to upgrade.\n"));
+		return (1);
+	}
+
+	poolver = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+
+	/* Never downgrade: the pool is newer than what was asked for. */
+	if (poolver > cbp->cb_version) {
+		(void) printf(gettext("Pool '%s' is already formatted "
+		    "using more current version '%llu'.\n\n"),
+		    zpool_get_name(zhp), (u_longlong_t)poolver);
+		return (0);
+	}
+
+	/* An explicitly requested legacy version that is already in use. */
+	if (poolver == cbp->cb_version && cbp->cb_version != SPA_VERSION) {
+		(void) printf(gettext("Pool '%s' is already formatted "
+		    "using version %llu.\n\n"), zpool_get_name(zhp),
+		    (u_longlong_t)cbp->cb_version);
+		return (0);
+	}
+
+	if (poolver != cbp->cb_version) {
+		changed = B_TRUE;
+		err = upgrade_version(zhp, cbp->cb_version);
+		if (err != 0)
+			return (err);
+	}
+
+	if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+		int nenabled = 0;
+
+		err = upgrade_enable_all(zhp, &nenabled);
+		if (err != 0)
+			return (err);
+
+		if (nenabled != 0) {
+			changed = B_TRUE;
+		} else if (poolver == SPA_VERSION) {
+			/* Current version and nothing left to enable. */
+			(void) printf(gettext("Pool '%s' already has all "
+			    "supported features enabled.\n"),
+			    zpool_get_name(zhp));
+		}
+	}
+
+	/* Leave a blank line after the report of a modified pool. */
+	if (changed)
+		(void) printf(gettext("\n"));
+
+	return (0);
+}
+
+/*
+ * zpool upgrade
+ * zpool upgrade -v
+ * zpool upgrade [-V version] <-a | pool ...>
+ *
+ * With no arguments, display downrev'd ZFS pools available for upgrade.
+ * Individual pools can be upgraded by specifying the pool, and '-a' will
+ * upgrade all pools.
+ *
+ * Options parsed below:
+ *	-a	Upgrade every pool; may not be combined with pool names.
+ *	-v	List the supported feature flags and legacy versions; may not
+ *		be combined with -a or pool names.
+ *	-V	Target version; must be fully numeric and accepted by
+ *		SPA_VERSION_IS_SUPPORTED(), and requires -a or a pool name.
+ *
+ * Returns the result of whichever pool iteration was run; the '-v' listing
+ * leaves the return value at 0.
+ */
+int
+zpool_do_upgrade(int argc, char **argv)
+{
+	int c;
+	upgrade_cbdata_t cb = { 0 };
+	int ret = 0;
+	boolean_t showversions = B_FALSE;
+	boolean_t upgradeall = B_FALSE;
+	char *end;
+
+
+	/*
+	 * check options; the leading ':' in the option string makes getopt()
+	 * report a missing option argument as ':' (handled below) rather
+	 * than '?'.
+	 */
+	while ((c = getopt(argc, argv, ":avV:")) != -1) {
+		switch (c) {
+		case 'a':
+			upgradeall = B_TRUE;
+			break;
+		case 'v':
+			showversions = B_TRUE;
+			break;
+		case 'V':
+			/* reject trailing garbage and unsupported versions */
+			cb.cb_version = strtoll(optarg, &end, 10);
+			if (*end != '\0' ||
+			    !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
+				(void) fprintf(stderr,
+				    gettext("invalid version '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	/* record the unshifted argument vector before skipping the options */
+	cb.cb_argc = argc;
+	cb.cb_argv = argv;
+	argc -= optind;
+	argv += optind;
+
+	if (cb.cb_version == 0) {
+		/* cb_version still holds its zero initializer: default it */
+		cb.cb_version = SPA_VERSION;
+	} else if (!upgradeall && argc == 0) {
+		/* -V was given without -a and without any pool name */
+		(void) fprintf(stderr, gettext("-V option is "
+		    "incompatible with other arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (showversions) {
+		if (upgradeall || argc != 0) {
+			(void) fprintf(stderr, gettext("-v option is "
+			    "incompatible with other arguments\n"));
+			usage(B_FALSE);
+		}
+	} else if (upgradeall) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("-a option should not "
+			    "be used along with a pool name\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	(void) printf(gettext("This system supports ZFS pool feature "
+	    "flags.\n\n"));
+	if (showversions) {
+		int i;
+
+		/* 'zpool upgrade -v': one entry per spa_feature_table slot */
+		(void) printf(gettext("The following features are "
+		    "supported:\n\n"));
+		(void) printf(gettext("FEAT DESCRIPTION\n"));
+		(void) printf("----------------------------------------------"
+		    "---------------\n");
+		for (i = 0; i < SPA_FEATURES; i++) {
+			zfeature_info_t *fi = &spa_feature_table[i];
+			const char *ro =
+			    (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+			    " (read-only compatible)" : "";
+
+			(void) printf("%-37s%s\n", fi->fi_uname, ro);
+			(void) printf("     %s\n", fi->fi_desc);
+		}
+		(void) printf("\n");
+
+		/* fixed table of the pre-feature-flags version numbers */
+		(void) printf(gettext("The following legacy versions are also "
+		    "supported:\n\n"));
+		(void) printf(gettext("VER  DESCRIPTION\n"));
+		(void) printf("---  -----------------------------------------"
+		    "---------------\n");
+		(void) printf(gettext(" 1   Initial ZFS version\n"));
+		(void) printf(gettext(" 2   Ditto blocks "
+		    "(replicated metadata)\n"));
+		(void) printf(gettext(" 3   Hot spares and double parity "
+		    "RAID-Z\n"));
+		(void) printf(gettext(" 4   zpool history\n"));
+		(void) printf(gettext(" 5   Compression using the gzip "
+		    "algorithm\n"));
+		(void) printf(gettext(" 6   bootfs pool property\n"));
+		(void) printf(gettext(" 7   Separate intent log devices\n"));
+		(void) printf(gettext(" 8   Delegated administration\n"));
+		(void) printf(gettext(" 9   refquota and refreservation "
+		    "properties\n"));
+		(void) printf(gettext(" 10  Cache devices\n"));
+		(void) printf(gettext(" 11  Improved scrub performance\n"));
+		(void) printf(gettext(" 12  Snapshot properties\n"));
+		(void) printf(gettext(" 13  snapused property\n"));
+		(void) printf(gettext(" 14  passthrough-x aclinherit\n"));
+		(void) printf(gettext(" 15  user/group space accounting\n"));
+		(void) printf(gettext(" 16  stmf property support\n"));
+		(void) printf(gettext(" 17  Triple-parity RAID-Z\n"));
+		(void) printf(gettext(" 18  Snapshot user holds\n"));
+		(void) printf(gettext(" 19  Log device removal\n"));
+		(void) printf(gettext(" 20  Compression using zle "
+		    "(zero-length encoding)\n"));
+		(void) printf(gettext(" 21  Deduplication\n"));
+		(void) printf(gettext(" 22  Received properties\n"));
+		(void) printf(gettext(" 23  Slim ZIL\n"));
+		(void) printf(gettext(" 24  System attributes\n"));
+		(void) printf(gettext(" 25  Improved scrub stats\n"));
+		(void) printf(gettext(" 26  Improved snapshot deletion "
+		    "performance\n"));
+		(void) printf(gettext(" 27  Improved snapshot creation "
+		    "performance\n"));
+		(void) printf(gettext(" 28  Multiple vdev replacements\n"));
+		(void) printf(gettext("\nFor more information on a particular "
+		    "version, including supported releases,\n"));
+		(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
+	} else if (argc == 0 && upgradeall) {
+		/*
+		 * 'zpool upgrade -a': run upgrade_cb on every pool.  The
+		 * summary below is printed only if cb_first is still set
+		 * after the iteration.
+		 */
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_cb, &cb);
+		if (ret == 0 && cb.cb_first) {
+			if (cb.cb_version == SPA_VERSION) {
+				(void) printf(gettext("All pools are already "
+				    "formatted using feature flags.\n\n"));
+				(void) printf(gettext("Every feature flags "
+				    "pool already has all supported features "
+				    "enabled.\n"));
+			} else {
+				(void) printf(gettext("All pools are already "
+				    "formatted with version %llu or higher.\n"),
+				    (u_longlong_t)cb.cb_version);
+			}
+		}
+	} else if (argc == 0) {
+		/*
+		 * Plain 'zpool upgrade': two listing passes over all pools,
+		 * each with cb_first re-armed so we can tell afterwards
+		 * whether the pass cleared it.
+		 */
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
+		assert(ret == 0);
+
+		if (cb.cb_first) {
+			(void) printf(gettext("All pools are formatted "
+			    "using feature flags.\n\n"));
+		} else {
+			(void) printf(gettext("\nUse 'zpool upgrade -v' "
+			    "for a list of available legacy versions.\n"));
+		}
+
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
+		assert(ret == 0);
+
+		if (cb.cb_first) {
+			(void) printf(gettext("Every feature flags pool has "
+			    "all supported features enabled.\n"));
+		} else {
+			(void) printf(gettext("\n"));
+		}
+	} else {
+		/* explicit pool names: upgrade each one via upgrade_one */
+		ret = for_each_pool(argc, argv, B_FALSE, NULL,
+		    upgrade_one, &cb);
+	}
+
+	return (ret);
+}
+
+/* State shared between 'zpool history' and its per-pool callback. */
+typedef struct hist_cbdata {
+	/* set by zpool_do_history; cleared once a pool has been visited */
+	boolean_t first;
+	/* -l: append the "[user ... on host:zone]" suffix to each record */
+	boolean_t longfmt;
+	/* -i: also print internal, ioctl and unrecognized records */
+	boolean_t internal;
+} hist_cbdata_t;
+
+/*
+ * Print every record in the ZPOOL_HIST_RECORD array of 'nvhis'.
+ *
+ * Command records are always printed.  Internal events, internally named
+ * events, ioctl records and unrecognized records are printed only when
+ * cb->internal is set.  When cb->longfmt is set, each printed record is
+ * followed by a "[user <uid> (<name>) on <host>:<zone>]" suffix built from
+ * whichever of those fields the record carries.
+ */
+static void
+print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb)
+{
+	nvlist_t **records;
+	uint_t numrecords;
+	uint_t i;
+
+	verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
+	    &records, &numrecords) == 0);
+	for (i = 0; i < numrecords; i++) {
+		nvlist_t *rec = records[i];
+		char tbuf[30] = "";
+
+		if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
+			time_t tsec;
+			struct tm t;
+
+			tsec = fnvlist_lookup_uint64(rec, ZPOOL_HIST_TIME);
+			/* leave the timestamp empty if it cannot be converted */
+			if (localtime_r(&tsec, &t) != NULL) {
+				(void) strftime(tbuf, sizeof (tbuf), "%F.%T",
+				    &t);
+			}
+		}
+
+		if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
+			(void) printf("%s %s", tbuf,
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
+			/*
+			 * Keep the full 64-bit value: narrowing it to int
+			 * could produce a negative number that slips past
+			 * the bounds check and indexes before the start of
+			 * zfs_history_event_names[].
+			 */
+			uint64_t ievent =
+			    fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
+			if (!cb->internal)
+				continue;
+			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
+				(void) printf("%s unrecognized record:\n",
+				    tbuf);
+				dump_nvlist(rec, 4);
+				continue;
+			}
+			(void) printf("%s [internal %s txg:%lld] %s", tbuf,
+			    zfs_history_event_names[ievent],
+			    (longlong_t)fnvlist_lookup_uint64(
+			    rec, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
+			if (!cb->internal)
+				continue;
+			(void) printf("%s [txg:%lld] %s", tbuf,
+			    (longlong_t)fnvlist_lookup_uint64(
+			    rec, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
+			if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
+				(void) printf(" %s (%llu)",
+				    fnvlist_lookup_string(rec,
+				    ZPOOL_HIST_DSNAME),
+				    (u_longlong_t)fnvlist_lookup_uint64(rec,
+				    ZPOOL_HIST_DSID));
+			}
+			(void) printf(" %s", fnvlist_lookup_string(rec,
+			    ZPOOL_HIST_INT_STR));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
+			if (!cb->internal)
+				continue;
+			(void) printf("%s ioctl %s\n", tbuf,
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
+			if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
+				(void) printf("    input:\n");
+				dump_nvlist(fnvlist_lookup_nvlist(rec,
+				    ZPOOL_HIST_INPUT_NVL), 8);
+			}
+			if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
+				(void) printf("    output:\n");
+				dump_nvlist(fnvlist_lookup_nvlist(rec,
+				    ZPOOL_HIST_OUTPUT_NVL), 8);
+			}
+			if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) {
+				(void) printf("    errno: %lld\n",
+				    (longlong_t)fnvlist_lookup_int64(rec,
+				    ZPOOL_HIST_ERRNO));
+			}
+		} else {
+			if (!cb->internal)
+				continue;
+			(void) printf("%s unrecognized record:\n", tbuf);
+			dump_nvlist(rec, 4);
+		}
+
+		/* short format: terminate the line and move on */
+		if (!cb->longfmt) {
+			(void) printf("\n");
+			continue;
+		}
+		(void) printf(" [");
+		if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
+			uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
+			struct passwd *pwd = getpwuid(who);
+			(void) printf("user %d ", (int)who);
+			if (pwd != NULL)
+				(void) printf("(%s) ", pwd->pw_name);
+		}
+		if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
+			(void) printf("on %s",
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
+		}
+		if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
+			(void) printf(":%s",
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
+		}
+
+		(void) printf("]");
+		(void) printf("\n");
+	}
+}
+
+/*
+ * for_each_pool() callback: print the complete history of one pool.
+ *
+ * The history is fetched in chunks with zpool_get_history() until it
+ * reports end-of-file; each chunk is printed and then freed.  Returns 0 on
+ * success or the first non-zero zpool_get_history() result.
+ */
+static int
+get_history_one(zpool_handle_t *zhp, void *data)
+{
+	hist_cbdata_t *cbp = data;
+	boolean_t done = B_FALSE;
+	uint64_t offset = 0;
+	int err;
+
+	/* record that at least one pool has been visited */
+	cbp->first = B_FALSE;
+
+	(void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));
+
+	do {
+		nvlist_t *chunk;
+
+		err = zpool_get_history(zhp, &chunk, &offset, &done);
+		if (err != 0)
+			return (err);
+
+		print_history_records(chunk, cbp);
+		nvlist_free(chunk);
+	} while (!done);
+
+	(void) printf("\n");
+
+	return (err);
+}
+
+/*
+ * zpool history [-il] [pool ...]
+ *
+ * Displays the history of commands that modified pools.
+ *
+ *	-i	Include internal events.
+ *	-l	Long format: append user, host and zone to each record.
+ */
+int
+zpool_do_history(int argc, char **argv)
+{
+	hist_cbdata_t hist = { 0 };
+	int opt;
+	int err;
+
+	/* stays set only if the per-pool callback never runs */
+	hist.first = B_TRUE;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "li")) != -1) {
+		if (opt == 'l') {
+			hist.longfmt = B_TRUE;
+		} else if (opt == 'i') {
+			hist.internal = B_TRUE;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argv += optind;
+	argc -= optind;
+
+	err = for_each_pool(argc, argv, B_FALSE,  NULL, get_history_one,
+	    &hist);
+
+	/* no pool was named and none was visited */
+	if (argc == 0 && hist.first == B_TRUE) {
+		(void) fprintf(stderr, gettext("no pools available\n"));
+		return (0);
+	}
+
+	return (err);
+}
+
+/* Command-line state for 'zpool events'. */
+typedef struct ev_opts {
+	int verbose;	/* -v: dump each event's full nvlist payload */
+	int scripted;	/* -H: no column header, tab after the timestamp */
+	int follow;	/* -f: request ZEVENT_NONE rather than ZEVENT_NONBLOCK */
+	int clear;	/* -c: clear the events instead of printing them */
+	/* optional pool name argument; empty when no pool was given */
+	char poolname[ZFS_MAX_DATASET_NAME_LEN];
+} ev_opts_t;
+
+/*
+ * Print the one-line summary of an event: its timestamp, formatted as
+ * "Mon DD YYYY HH:MM:SS.nnnnnnnnn", followed by its FM_CLASS string.  The
+ * two are separated by a tab when opts->scripted is set and by a space
+ * otherwise.
+ */
+static void
+zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts)
+{
+	char ctime_str[26], str[32], *ptr;
+	int64_t *tv;
+	time_t sec;
+	uint_t n;
+
+	verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0);
+
+	/*
+	 * Convert through a real time_t instead of casting the int64_t
+	 * pointer, which reads the wrong bytes where time_t is narrower
+	 * than 64 bits.
+	 */
+	sec = (time_t)tv[0];
+
+	/* pre-fill with blanks so the gaps between the fields are spaces */
+	memset(str, ' ', sizeof (str));
+	(void) ctime_r(&sec, ctime_str);
+	(void) memcpy(str, ctime_str+4,  6);		/* 'Jun 30' */
+	(void) memcpy(str+7, ctime_str+20, 4);		/* '1993' */
+	(void) memcpy(str+12, ctime_str+11, 8);		/* '21:49:08' */
+	/* bounded: an out-of-range tv[1] must not overrun str */
+	(void) snprintf(str+20, sizeof (str) - 20, ".%09lld",
+	    (longlong_t)tv[1]);				/* '.123456789' */
+	if (opts->scripted)
+		(void) printf(gettext("%s\t"), str);
+	else
+		(void) printf(gettext("%s "), str);
+
+	verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0);
+	(void) printf(gettext("%s\n"), ptr);
+}
+
+/*
+ * Print every pair of 'nvl' as "<name> = <value>", one pair per line,
+ * indented by 'depth' columns.  Scalars are printed in hex, strings in
+ * double quotes, and arrays as a space-separated list.  Embedded nvlists
+ * (and arrays of them) are printed recursively with the indentation grown
+ * by 8.  The two vdev state uint64 values are also shown by name.
+ */
+static void
+zpool_do_events_nvprint(nvlist_t *nvl, int depth)
+{
+	nvpair_t *pair = NULL;
+
+	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+		const char *key = nvpair_name(pair);
+		data_type_t kind = nvpair_type(pair);
+
+		/* scratch storage for the scalar cases */
+		boolean_t flag;
+		uint8_t u8;
+		uint16_t u16;
+		uint32_t u32;
+		uint64_t u64;
+		char *text;
+		nvlist_t *nested;
+		/* element index and count for the array cases */
+		uint_t idx, cnt;
+
+		printf(gettext("%*s%s = "), depth, "", key);
+
+		switch (kind) {
+		case DATA_TYPE_BOOLEAN:
+			printf(gettext("%s"), "1");
+			break;
+
+		case DATA_TYPE_BOOLEAN_VALUE:
+			(void) nvpair_value_boolean_value(pair, &flag);
+			printf(gettext("%s"), flag ? "1" : "0");
+			break;
+
+		case DATA_TYPE_BYTE:
+			(void) nvpair_value_byte(pair, &u8);
+			printf(gettext("0x%x"), u8);
+			break;
+
+		case DATA_TYPE_INT8:
+			(void) nvpair_value_int8(pair, (void *)&u8);
+			printf(gettext("0x%x"), u8);
+			break;
+
+		case DATA_TYPE_UINT8:
+			(void) nvpair_value_uint8(pair, &u8);
+			printf(gettext("0x%x"), u8);
+			break;
+
+		case DATA_TYPE_INT16:
+			(void) nvpair_value_int16(pair, (void *)&u16);
+			printf(gettext("0x%x"), u16);
+			break;
+
+		case DATA_TYPE_UINT16:
+			(void) nvpair_value_uint16(pair, &u16);
+			printf(gettext("0x%x"), u16);
+			break;
+
+		case DATA_TYPE_INT32:
+			(void) nvpair_value_int32(pair, (void *)&u32);
+			printf(gettext("0x%x"), u32);
+			break;
+
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(pair, &u32);
+			printf(gettext("0x%x"), u32);
+			break;
+
+		case DATA_TYPE_INT64:
+			(void) nvpair_value_int64(pair, (void *)&u64);
+			printf(gettext("0x%llx"), (u_longlong_t)u64);
+			break;
+
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(pair, &u64);
+			/*
+			 * Show vdev state values by name as well, to help
+			 * consumers of 'zpool events' output.
+			 */
+			if (strcmp(key,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) != 0 &&
+			    strcmp(key,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) != 0) {
+				printf(gettext("0x%llx"), (u_longlong_t)u64);
+			} else {
+				printf(gettext("\"%s\" (0x%llx)"),
+				    zpool_state_to_name(u64, VDEV_AUX_NONE),
+				    (u_longlong_t)u64);
+			}
+			break;
+
+		case DATA_TYPE_HRTIME:
+			(void) nvpair_value_hrtime(pair, (void *)&u64);
+			printf(gettext("0x%llx"), (u_longlong_t)u64);
+			break;
+
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(pair, &text);
+			printf(gettext("\"%s\""), text ? text : "<NULL>");
+			break;
+
+		case DATA_TYPE_NVLIST:
+			printf(gettext("(embedded nvlist)\n"));
+			(void) nvpair_value_nvlist(pair, &nested);
+			zpool_do_events_nvprint(nested, depth + 8);
+			printf(gettext("%*s(end %s)"), depth, "", key);
+			break;
+
+		case DATA_TYPE_NVLIST_ARRAY: {
+			nvlist_t **lists;
+
+			(void) nvpair_value_nvlist_array(pair, &lists, &cnt);
+			printf(gettext("(%d embedded nvlists)\n"), cnt);
+			for (idx = 0; idx < cnt; idx++) {
+				printf(gettext("%*s%s[%d] = %s\n"),
+				    depth, "", key, idx, "(embedded nvlist)");
+				zpool_do_events_nvprint(lists[idx], depth + 8);
+				printf(gettext("%*s(end %s[%i])\n"),
+				    depth, "", key, idx);
+			}
+			printf(gettext("%*s(end %s)\n"), depth, "", key);
+			break;
+		}
+
+		case DATA_TYPE_INT8_ARRAY: {
+			int8_t *elems;
+
+			(void) nvpair_value_int8_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_UINT8_ARRAY: {
+			uint8_t *elems;
+
+			(void) nvpair_value_uint8_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_INT16_ARRAY: {
+			int16_t *elems;
+
+			(void) nvpair_value_int16_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_UINT16_ARRAY: {
+			uint16_t *elems;
+
+			(void) nvpair_value_uint16_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_INT32_ARRAY: {
+			int32_t *elems;
+
+			(void) nvpair_value_int32_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_UINT32_ARRAY: {
+			uint32_t *elems;
+
+			(void) nvpair_value_uint32_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%x "), elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_INT64_ARRAY: {
+			int64_t *elems;
+
+			(void) nvpair_value_int64_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%llx "),
+				    (u_longlong_t)elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_UINT64_ARRAY: {
+			uint64_t *elems;
+
+			(void) nvpair_value_uint64_array(pair, &elems, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("0x%llx "),
+				    (u_longlong_t)elems[idx]);
+			break;
+		}
+
+		case DATA_TYPE_STRING_ARRAY: {
+			char **strs;
+
+			(void) nvpair_value_string_array(pair, &strs, &cnt);
+			for (idx = 0; idx < cnt; idx++)
+				printf(gettext("\"%s\" "),
+				    strs[idx] ? strs[idx] : "<NULL>");
+			break;
+		}
+
+		/* types with no printable representation here */
+		case DATA_TYPE_BOOLEAN_ARRAY:
+		case DATA_TYPE_BYTE_ARRAY:
+		case DATA_TYPE_DOUBLE:
+		case DATA_TYPE_DONTCARE:
+		case DATA_TYPE_UNKNOWN:
+			printf(gettext("<unknown>"));
+			break;
+		}
+
+		printf(gettext("\n"));
+	}
+}
+
+/*
+ * Print events obtained from zpool_events_next() until it fails or hands
+ * back no event.  With opts->follow the call is made with ZEVENT_NONE,
+ * otherwise with ZEVENT_NONBLOCK.  When opts->poolname is non-empty, events
+ * whose FM_FMRI_ZFS_POOL string names a different pool are skipped; events
+ * without that string are still printed.
+ *
+ * Returns the last zpool_events_next() result.
+ */
+static int
+zpool_do_events_next(ev_opts_t *opts)
+{
+	nvlist_t *nvl;
+	int zevent_fd, ret, dropped;
+	char *pool;
+
+	zevent_fd = open(ZFS_DEV, O_RDWR);
+	VERIFY(zevent_fd >= 0);
+
+	if (!opts->scripted)
+		(void) printf(gettext("%-30s %s\n"), "TIME", "CLASS");
+
+	while (1) {
+		ret = zpool_events_next(g_zfs, &nvl, &dropped,
+		    (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd);
+		if (ret || nvl == NULL)
+			break;
+
+		if (dropped > 0)
+			(void) printf(gettext("dropped %d events\n"), dropped);
+
+		if (opts->poolname[0] != '\0' &&
+		    nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 &&
+		    strcmp(opts->poolname, pool) != 0) {
+			/* filtered out: still ours to free before moving on */
+			nvlist_free(nvl);
+			continue;
+		}
+
+		zpool_do_events_short(nvl, opts);
+
+		if (opts->verbose) {
+			zpool_do_events_nvprint(nvl, 8);
+			printf(gettext("\n"));
+		}
+		/* flush per event so followers see output promptly */
+		(void) fflush(stdout);
+
+		nvlist_free(nvl);
+	}
+
+	VERIFY(0 == close(zevent_fd));
+
+	return (ret);
+}
+
+/*
+ * Clear the events via zpool_events_clear() and, on success, report how
+ * many were cleared.  'opts' is not consulted.  Returns the
+ * zpool_events_clear() result.
+ */
+static int
+zpool_do_events_clear(ev_opts_t *opts)
+{
+	int cleared;
+	int err = zpool_events_clear(g_zfs, &cleared);
+
+	if (err == 0)
+		(void) printf(gettext("cleared %d events\n"), cleared);
+
+	return (err);
+}
+
+/*
+ * zpool events [-vHf [pool] | -c]
+ *
+ * Displays the events logged by ZFS, or clears them with -c.
+ *
+ *	-v	Verbose: print the full payload of each event.
+ *	-H	Scripted mode: no header, tab-separated columns.
+ *	-f	Follow: keep waiting for new events.
+ *	-c	Clear the events; cannot be combined with anything else.
+ */
+int
+zpool_do_events(int argc, char **argv)
+{
+	ev_opts_t opts = { 0 };
+	int opt;
+
+	/* check options */
+	while ((opt = getopt(argc, argv, "vHfc")) != -1) {
+		if (opt == 'v') {
+			opts.verbose = 1;
+		} else if (opt == 'H') {
+			opts.scripted = 1;
+		} else if (opt == 'f') {
+			opts.follow = 1;
+		} else if (opt == 'c') {
+			opts.clear = 1;
+		} else if (opt == '?') {
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argv += optind;
+	argc -= optind;
+
+	/* at most one positional argument: the pool to filter on */
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	} else if (argc == 1) {
+		(void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname));
+		if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) {
+			(void) fprintf(stderr,
+			    gettext("invalid pool name '%s'\n"), opts.poolname);
+			usage(B_FALSE);
+		}
+	}
+
+	/* -c stands alone: no pool name and no display options */
+	if (opts.clear &&
+	    (argc == 1 || opts.verbose || opts.scripted || opts.follow)) {
+		(void) fprintf(stderr,
+		    gettext("invalid options combined with -c\n"));
+		usage(B_FALSE);
+	}
+
+	return (opts.clear ? zpool_do_events_clear(&opts) :
+	    zpool_do_events_next(&opts));
+}
+
+/*
+ * for_each_pool() callback for 'zpool get': print one line per entry of
+ * cbp->cb_proplist for the given pool.  Entries whose value cannot be
+ * retrieved are silently skipped.  Always returns 0.
+ */
+static int
+get_callback(zpool_handle_t *zhp, void *data)
+{
+	zprop_get_cbdata_t *cbp = data;
+	zprop_list_t *entry;
+
+	for (entry = cbp->cb_proplist; entry != NULL;
+	    entry = entry->pl_next) {
+		char buf[MAXNAMELEN];
+		zprop_source_t src;
+		const char *propname;
+
+		/*
+		 * The head of the list may be the fake 'name' placeholder;
+		 * skipping it also skips the name property when 'all' is
+		 * specified.
+		 */
+		if (entry == cbp->cb_proplist &&
+		    entry->pl_prop == ZPOOL_PROP_NAME)
+			continue;
+
+		if (entry->pl_prop == ZPROP_INVAL &&
+		    (zpool_prop_feature(entry->pl_user_prop) ||
+		    zpool_prop_unsupported(entry->pl_user_prop))) {
+			/* feature@... / unsupported@... pseudo-properties */
+			src = ZPROP_SRC_LOCAL;
+
+			if (zpool_prop_get_feature(zhp, entry->pl_user_prop,
+			    buf, sizeof (buf)) != 0)
+				continue;
+
+			propname = entry->pl_user_prop;
+		} else {
+			if (zpool_get_prop(zhp, entry->pl_prop, buf,
+			    sizeof (buf), &src, cbp->cb_literal) != 0)
+				continue;
+
+			propname = zpool_prop_to_name(entry->pl_prop);
+		}
+
+		zprop_print_one_property(zpool_get_name(zhp), cbp, propname,
+		    buf, src, NULL, NULL);
+	}
+	return (0);
+}
+
+/*
+ * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
+ *
+ *	-H	Scripted mode.  Don't display headers, and separate properties
+ *		by a single tab.
+ *	-o	List of columns to display.  Defaults to
+ *		"name,property,value,source".
+ *	-p	Display values in parsable (exact) format.
+ *
+ * Get properties of pools in the system. Output space statistics
+ * for each one as well as other attributes.
+ *
+ * Returns the result of for_each_pool().
+ */
+int
+zpool_do_get(int argc, char **argv)
+{
+	zprop_get_cbdata_t cb = { 0 };
+	zprop_list_t fake_name = { 0 };
+	int ret;
+	int c, i;
+	char *value;
+
+	cb.cb_first = B_TRUE;
+
+	/*
+	 * Set up default columns and sources.
+	 */
+	cb.cb_sources = ZPROP_SRC_ALL;
+	cb.cb_columns[0] = GET_COL_NAME;
+	cb.cb_columns[1] = GET_COL_PROPERTY;
+	cb.cb_columns[2] = GET_COL_VALUE;
+	cb.cb_columns[3] = GET_COL_SOURCE;
+	cb.cb_type = ZFS_TYPE_POOL;
+
+	/*
+	 * check options; the leading ':' in the option string makes getopt()
+	 * return ':' for a missing option argument, so that case must be
+	 * handled explicitly.
+	 */
+	while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
+		switch (c) {
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 'o':
+			/* an explicit -o replaces the default columns */
+			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+			i = 0;
+			while (*optarg != '\0') {
+				static char *col_subopts[] =
+				{ "name", "property", "value", "source",
+				"all", NULL };
+
+				if (i == ZFS_GET_NCOLS) {
+					(void) fprintf(stderr, gettext("too "
+					"many fields given to -o "
+					"option\n"));
+					usage(B_FALSE);
+				}
+
+				switch (getsubopt(&optarg, col_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_columns[i++] = GET_COL_NAME;
+					break;
+				case 1:
+					cb.cb_columns[i++] = GET_COL_PROPERTY;
+					break;
+				case 2:
+					cb.cb_columns[i++] = GET_COL_VALUE;
+					break;
+				case 3:
+					cb.cb_columns[i++] = GET_COL_SOURCE;
+					break;
+				case 4:
+					/* "all" must be the only field */
+					if (i > 0) {
+						(void) fprintf(stderr,
+						    gettext("\"all\" conflicts "
+						    "with specific fields "
+						    "given to -o option\n"));
+						usage(B_FALSE);
+					}
+					cb.cb_columns[0] = GET_COL_NAME;
+					cb.cb_columns[1] = GET_COL_PROPERTY;
+					cb.cb_columns[2] = GET_COL_VALUE;
+					cb.cb_columns[3] = GET_COL_SOURCE;
+					i = ZFS_GET_NCOLS;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid column name "
+					    "'%s'\n"), value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case ':':
+			/* e.g. a trailing '-o' with no column list */
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* first positional argument: "all" or a property list */
+	if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
+	    ZFS_TYPE_POOL) != 0)
+		usage(B_FALSE);
+
+	argc--;
+	argv++;
+
+	/*
+	 * Prepend a stack-allocated 'name' placeholder to the property list;
+	 * get_callback() skips it when printing.
+	 */
+	if (cb.cb_proplist != NULL) {
+		fake_name.pl_prop = ZPOOL_PROP_NAME;
+		fake_name.pl_width = strlen(gettext("NAME"));
+		fake_name.pl_next = cb.cb_proplist;
+		cb.cb_proplist = &fake_name;
+	}
+
+	ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
+	    get_callback, &cb);
+
+	/* the placeholder lives on the stack: free only what follows it */
+	if (cb.cb_proplist == &fake_name)
+		zprop_free_list(fake_name.pl_next);
+	else
+		zprop_free_list(cb.cb_proplist);
+
+	return (ret);
+}
+
+/* Arguments handed to set_callback() by 'zpool set'. */
+typedef struct set_cbdata {
+	char *cb_propname;	/* property name (text before the '=') */
+	char *cb_value;		/* property value (text after the '=') */
+	/* set once zpool_set_prop() has succeeded for at least one pool */
+	boolean_t cb_any_successful;
+} set_cbdata_t;
+
+/*
+ * for_each_pool() callback for 'zpool set': apply the requested property
+ * to one pool and note any success in the callback data.  Returns the
+ * zpool_set_prop() result.
+ */
+static int
+set_callback(zpool_handle_t *zhp, void *data)
+{
+	set_cbdata_t *setcb = data;
+	int err = zpool_set_prop(zhp, setcb->cb_propname, setcb->cb_value);
+
+	if (err == 0)
+		setcb->cb_any_successful = B_TRUE;
+
+	return (err);
+}
+
+/*
+ * zpool set property=value <pool>
+ *
+ * Set one property on exactly one pool.  No options are accepted.  The
+ * "property=value" argument is split in place at its first '='.  Returns
+ * the result of for_each_pool().
+ */
+int
+zpool_do_set(int argc, char **argv)
+{
+	set_cbdata_t setcb = { 0 };
+	char *eq;
+
+	/* anything that looks like an option is rejected outright */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing property=value "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many pool names\n"));
+		usage(B_FALSE);
+	}
+
+	eq = strchr(argv[1], '=');
+	if (eq == NULL) {
+		(void) fprintf(stderr, gettext("missing value in "
+		    "property=value argument\n"));
+		usage(B_FALSE);
+	}
+
+	/* overwrite the '=' so name and value become separate strings */
+	*eq = '\0';
+	setcb.cb_propname = argv[1];
+	setcb.cb_value = eq + 1;
+
+	return (for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
+	    set_callback, &setcb));
+}
+
+/*
+ * Total number of bytes still to be initialized or trimmed (selected by
+ * 'activity') on the vdev described by 'nv' and, recursively, on all of its
+ * children.  A vdev contributes only while the selected activity is in its
+ * ACTIVE state.
+ */
+static uint64_t
+vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
+{
+	uint64_t remaining = 0;
+	vdev_stat_t *stats;
+	nvlist_t **kids;
+	uint_t nstats, nkids, k;
+
+	assert(activity == ZPOOL_WAIT_INITIALIZE ||
+	    activity == ZPOOL_WAIT_TRIM);
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&stats, &nstats) == 0);
+
+	/* this vdev's own contribution, if the activity is running on it */
+	if (activity == ZPOOL_WAIT_INITIALIZE) {
+		if (stats->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+			remaining = stats->vs_initialize_bytes_est -
+			    stats->vs_initialize_bytes_done;
+		}
+	} else if (activity == ZPOOL_WAIT_TRIM) {
+		if (stats->vs_trim_state == VDEV_TRIM_ACTIVE) {
+			remaining = stats->vs_trim_bytes_est -
+			    stats->vs_trim_bytes_done;
+		}
+	}
+
+	/* a vdev without a children array is a leaf */
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		nkids = 0;
+
+	for (k = 0; k < nkids; k++)
+		remaining += vdev_activity_remaining(kids[k], activity);
+
+	return (remaining);
+}
+
+/*
+ * Total number of bytes left to rebuild, summed over the direct children
+ * of 'nv' that carry rebuild stats in the VDEV_REBUILD_ACTIVE state.
+ * Children without rebuild stats contribute nothing.
+ */
+static uint64_t
+vdev_activity_top_remaining(nvlist_t *nv)
+{
+	uint64_t remaining = 0;
+	nvlist_t **kids;
+	uint_t nkids;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		nkids = 0;
+
+	for (uint_t k = 0; k < nkids; k++) {
+		vdev_rebuild_stat_t *rebuild;
+		uint_t nstats;
+
+		if (nvlist_lookup_uint64_array(kids[k],
+		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&rebuild,
+		    &nstats) != 0)
+			continue;
+
+		if (rebuild->vrs_state != VDEV_REBUILD_ACTIVE)
+			continue;
+
+		remaining += (rebuild->vrs_bytes_est -
+		    rebuild->vrs_bytes_rebuilt);
+	}
+
+	return (remaining);
+}
+
+/*
+ * Whether the vdev described by 'nv', or any vdev below it, is a 'spare'
+ * or 'replacing' vdev.  A vdev whose type string cannot be looked up is
+ * treated as neither, but its children are still examined.
+ */
+static boolean_t
+vdev_any_spare_replacing(nvlist_t *nv)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *vdev_type = NULL;
+
+	/*
+	 * Only compare the type when the lookup succeeded; otherwise
+	 * vdev_type would be passed to strcmp() without ever being set.
+	 */
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type) == 0 &&
+	    vdev_type != NULL &&
+	    (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
+	    strcmp(vdev_type, VDEV_TYPE_SPARE) == 0)) {
+		return (B_TRUE);
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		children = 0;
+
+	for (c = 0; c < children; c++) {
+		if (vdev_any_spare_replacing(child[c]))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/* State shared between 'zpool wait' and its status-printing thread. */
+typedef struct wait_data {
+	char *wd_poolname;		/* pool opened by the status thread */
+	boolean_t wd_scripted;		/* no headers, tab-separated columns */
+	boolean_t wd_exact;		/* exact byte counts, not zfs_nicenum */
+	boolean_t wd_headers_once;	/* never reprint the column headers */
+	/* checked by the status thread, under wd_mutex, before it waits */
+	boolean_t wd_should_exit;
+	/* Which activities to wait for */
+	boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES];
+	float wd_interval;		/* seconds between status rows */
+	/* the status thread does its timed wait on wd_cv under wd_mutex */
+	pthread_cond_t wd_cv;
+	pthread_mutex_t wd_mutex;
+} wait_data_t;
+
+/*
+ * Print to stdout a single line, containing one column for each activity that
+ * we are waiting for specifying how many bytes of work are left for that
+ * activity.
+ *
+ * 'row' is the zero-based number of the row being printed.  Unless
+ * wd->wd_scripted is set, the column headers are printed on row 0 and,
+ * unless wd->wd_headers_once is set, again roughly once per screenful.
+ */
+static void
+print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
+{
+	nvlist_t *config, *nvroot;
+	uint_t c;
+	int i;
+	pool_checkpoint_stat_t *pcs = NULL;
+	pool_scan_stat_t *pss = NULL;
+	pool_removal_stat_t *prs = NULL;
+	/* NOTE(review): presumably ordered like zpool_wait_activity_t */
+	char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE",
+	    "REMOVE", "RESILVER", "SCRUB", "TRIM"};
+	int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];
+
+	/* Calculate the width of each column */
+	for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+		/*
+		 * Make sure we have enough space in the col for pretty-printed
+		 * numbers and for the column header, and then leave a couple
+		 * spaces between cols for readability.
+		 */
+		col_widths[i] = MAX(strlen(headers[i]), 6) + 2;
+	}
+
+	/*
+	 * Print header if appropriate.  The height must exceed 1: with a
+	 * one-line terminal the modulus below would divide by zero.
+	 */
+	int term_height = terminal_height();
+	boolean_t reprint_header = (!wd->wd_headers_once && term_height > 1 &&
+	    row % (term_height-1) == 0);
+	if (!wd->wd_scripted && (row == 0 || reprint_header)) {
+		for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+			if (wd->wd_enabled[i])
+				(void) printf("%*s", col_widths[i], headers[i]);
+		}
+		(void) printf("\n");
+	}
+
+	/* Bytes of work remaining in each activity */
+	int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0};
+
+	bytes_rem[ZPOOL_WAIT_FREE] =
+	    zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL);
+
+	config = zpool_get_config(zhp, NULL);
+	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+
+	/* each stats array is optional; its pointer stays NULL if absent */
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+	if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+		bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space;
+
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+	if (prs != NULL && prs->prs_state == DSS_SCANNING)
+		bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy -
+		    prs->prs_copied;
+
+	(void) nvlist_lookup_uint64_array(nvroot,
+	    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c);
+	if (pss != NULL && pss->pss_state == DSS_SCANNING &&
+	    pss->pss_pass_scrub_pause == 0) {
+		int64_t rem = pss->pss_to_examine - pss->pss_issued;
+		if (pss->pss_func == POOL_SCAN_SCRUB)
+			bytes_rem[ZPOOL_WAIT_SCRUB] = rem;
+		else
+			bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
+	} else if (check_rebuilding(nvroot, NULL)) {
+		bytes_rem[ZPOOL_WAIT_RESILVER] =
+		    vdev_activity_top_remaining(nvroot);
+	}
+
+	bytes_rem[ZPOOL_WAIT_INITIALIZE] =
+	    vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
+	bytes_rem[ZPOOL_WAIT_TRIM] =
+	    vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM);
+
+	/*
+	 * A replace finishes after resilvering finishes, so the amount of work
+	 * left for a replace is the same as for resilvering.
+	 *
+	 * It isn't quite correct to say that if we have any 'spare' or
+	 * 'replacing' vdevs and a resilver is happening, then a replace is in
+	 * progress, like we do here. When a hot spare is used, the faulted vdev
+	 * is not removed after the hot spare is resilvered, so parent 'spare'
+	 * vdev is not removed either. So we could have a 'spare' vdev, but be
+	 * resilvering for a different reason. However, we use it as a heuristic
+	 * because we don't have access to the DTLs, which could tell us whether
+	 * or not we have really finished resilvering a hot spare.
+	 */
+	if (vdev_any_spare_replacing(nvroot))
+		bytes_rem[ZPOOL_WAIT_REPLACE] =  bytes_rem[ZPOOL_WAIT_RESILVER];
+
+	if (timestamp_fmt != NODATE)
+		print_timestamp(timestamp_fmt);
+
+	/* one column per enabled activity */
+	for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+		char buf[64];
+		if (!wd->wd_enabled[i])
+			continue;
+
+		if (wd->wd_exact)
+			(void) snprintf(buf, sizeof (buf), "%" PRIi64,
+			    bytes_rem[i]);
+		else
+			zfs_nicenum(bytes_rem[i], buf, sizeof (buf));
+
+		if (wd->wd_scripted)
+			(void) printf(i == 0 ? "%s" : "\t%s", buf);
+		else
+			(void) printf(" %*s", col_widths[i] - 1, buf);
+	}
+	(void) printf("\n");
+	(void) fflush(stdout);
+}
+
+/*
+ * Thread body for the periodic 'zpool wait' status display.  'arg' is the
+ * shared wait_data_t.
+ *
+ * Each iteration refreshes the pool's stats and properties, prints one
+ * status row, and then waits on wd_cv for up to wd_interval seconds.  The
+ * loop ends when the wait returns 0 (taken to mean the main thread signaled
+ * us), when wd_should_exit is already set, or on error.
+ *
+ * Returns (void *)0 on a normal exit or when the pool has gone missing,
+ * and (void *)1 if the pool cannot be opened or refreshed or the timed
+ * wait fails.
+ */
+static void *
+wait_status_thread(void *arg)
+{
+	wait_data_t *wd = arg;
+	zpool_handle_t *pool = zpool_open(g_zfs, wd->wd_poolname);
+	uintptr_t status = 0;
+	int row = 0;
+
+	if (pool == NULL)
+		return ((void *)(uintptr_t)1);
+
+	while (1) {
+		boolean_t missing;
+		struct timespec deadline;
+		int rc = 0;
+
+		/* the deadline is measured from before the refresh/print */
+		(void) clock_gettime(CLOCK_REALTIME, &deadline);
+
+		if (zpool_refresh_stats(pool, &missing) != 0 || missing ||
+		    zpool_props_refresh(pool) != 0) {
+			status = missing ? 0 : 1;
+			break;
+		}
+
+		print_wait_status_row(wd, pool, row);
+
+		/* add the interval, carrying nanosecond overflow over */
+		double whole = floor(wd->wd_interval);
+		deadline.tv_sec += whole;
+		long nsec = deadline.tv_nsec +
+		    (wd->wd_interval - whole) * NANOSEC;
+		if (nsec < NANOSEC) {
+			deadline.tv_nsec = nsec;
+		} else {
+			deadline.tv_sec++;
+			deadline.tv_nsec = nsec - NANOSEC;
+		}
+
+		pthread_mutex_lock(&wd->wd_mutex);
+		if (!wd->wd_should_exit) {
+			rc = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex,
+			    &deadline);
+		}
+		pthread_mutex_unlock(&wd->wd_mutex);
+
+		if (rc == 0)
+			break; /* signaled by main thread */
+
+		if (rc != ETIMEDOUT) {
+			(void) fprintf(stderr, gettext("pthread_cond_timedwait "
+			    "failed: %s\n"), strerror(rc));
+			status = 1;
+			break;
+		}
+
+		row++;
+	}
+
+	zpool_close(pool);
+	return ((void *)status);
+}
+
+/*
+ * Implements 'zpool wait': block until none of the selected pool activities
+ * is in progress.
+ *
+ *	-H	Scripted mode (sets wd_scripted).
+ *	-p	Print exact values (sets wd_exact).
+ *	-T arg	Timestamp selection, handed to get_timestamp_arg().
+ *	-t list	Comma-separated activities to wait for; the default is all.
+ *
+ * Exactly one pool name is required.  An optional interval turns on periodic
+ * status output from a helper thread; a count is rejected.
+ *
+ * Returns 0 on success and nonzero on failure.
+ */
+int
+zpool_do_wait(int argc, char **argv)
+{
+	boolean_t verbose = B_FALSE;
+	/* getopt() returns int; a plain char cannot hold -1 if unsigned */
+	int c;
+	char *value;
+	int i;
+	unsigned long count;
+	pthread_t status_thr;
+	int error = 0;
+	zpool_handle_t *zhp;
+
+	wait_data_t wd;
+	wd.wd_scripted = B_FALSE;
+	wd.wd_exact = B_FALSE;
+	wd.wd_headers_once = B_FALSE;
+	wd.wd_should_exit = B_FALSE;
+
+	pthread_mutex_init(&wd.wd_mutex, NULL);
+	pthread_cond_init(&wd.wd_cv, NULL);
+
+	/* By default, wait for all types of activity. */
+	for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++)
+		wd.wd_enabled[i] = B_TRUE;
+
+	while ((c = getopt(argc, argv, "HpT:t:")) != -1) {
+		switch (c) {
+		case 'H':
+			wd.wd_scripted = B_TRUE;
+			break;
+		case 'n':
+			/*
+			 * NOTE(review): 'n' is not in the getopt() option
+			 * string of this loop, so this case looks
+			 * unreachable -- confirm whether -n was meant to be
+			 * accepted.
+			 */
+			wd.wd_headers_once = B_TRUE;
+			break;
+		case 'p':
+			wd.wd_exact = B_TRUE;
+			break;
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+		case 't':
+		{
+			/* Index of each name is used as the activity number */
+			static char *col_subopts[] = { "discard", "free",
+			    "initialize", "replace", "remove", "resilver",
+			    "scrub", "trim", NULL };
+
+			/* Reset activities array */
+			bzero(&wd.wd_enabled, sizeof (wd.wd_enabled));
+			while (*optarg != '\0') {
+				int activity = getsubopt(&optarg, col_subopts,
+				    &value);
+
+				if (activity < 0) {
+					(void) fprintf(stderr,
+					    gettext("invalid activity '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+
+				wd.wd_enabled[activity] = B_TRUE;
+			}
+			break;
+		}
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	get_interval_count(&argc, argv, &wd.wd_interval, &count);
+	if (count != 0) {
+		/* This subcmd only accepts an interval, not a count */
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Giving an interval is what requests periodic status output */
+	if (wd.wd_interval != 0)
+		verbose = B_TRUE;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing 'pool' argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	wd.wd_poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) {
+		pthread_mutex_destroy(&wd.wd_mutex);
+		pthread_cond_destroy(&wd.wd_cv);
+		return (1);
+	}
+
+	if (verbose) {
+		/*
+		 * We use a separate thread for printing status updates because
+		 * the main thread will call lzc_wait(), which blocks as long
+		 * as an activity is in progress, which can be a long time.
+		 *
+		 * pthread_create() reports failure through its return value
+		 * rather than errno, so that is what gets printed.
+		 */
+		error = pthread_create(&status_thr, NULL, wait_status_thread,
+		    &wd);
+		if (error != 0) {
+			(void) fprintf(stderr, gettext("failed to create "
+			    "status thread: %s\n"), strerror(error));
+			zpool_close(zhp);
+			pthread_mutex_destroy(&wd.wd_mutex);
+			pthread_cond_destroy(&wd.wd_cv);
+			return (1);
+		}
+	}
+
+	/*
+	 * Loop over all activities that we are supposed to wait for until none
+	 * of them are in progress. Note that this means we can end up waiting
+	 * for more activities to complete than just those that were in progress
+	 * when we began waiting; if an activity we are interested in begins
+	 * while we are waiting for another activity, we will wait for both to
+	 * complete before exiting.
+	 */
+	for (;;) {
+		boolean_t missing = B_FALSE;
+		boolean_t any_waited = B_FALSE;
+
+		for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+			boolean_t waited;
+
+			if (!wd.wd_enabled[i])
+				continue;
+
+			error = zpool_wait_status(zhp, i, &missing, &waited);
+			if (error != 0 || missing)
+				break;
+
+			any_waited = (any_waited || waited);
+		}
+
+		/* A full pass with nothing to wait for means we are done */
+		if (error != 0 || missing || !any_waited)
+			break;
+	}
+
+	zpool_close(zhp);
+
+	if (verbose) {
+		uintptr_t status;
+
+		/* Ask the status thread to stop and collect its result */
+		pthread_mutex_lock(&wd.wd_mutex);
+		wd.wd_should_exit = B_TRUE;
+		pthread_cond_signal(&wd.wd_cv);
+		pthread_mutex_unlock(&wd.wd_mutex);
+		(void) pthread_join(status_thr, (void *)&status);
+		if (status != 0)
+			error = status;
+	}
+
+	pthread_mutex_destroy(&wd.wd_mutex);
+	pthread_cond_destroy(&wd.wd_cv);
+	return (error);
+}
+
+/*
+ * Look up 'command' by name in command_table.
+ *
+ * On a match the table index is stored in '*idx' and 0 is returned.  When no
+ * entry matches, 1 is returned and '*idx' is left untouched.  Entries whose
+ * name is NULL are skipped.
+ */
+static int
+find_command_idx(char *command, int *idx)
+{
+	for (int n = 0; n < NCOMMAND; n++) {
+		const char *name = command_table[n].name;
+
+		if (name != NULL && strcmp(command, name) == 0) {
+			*idx = n;
+			return (0);
+		}
+	}
+
+	return (1);
+}
+
+/*
+ * Display the version message.  Returns 1 if zfs_version_print() reports
+ * failure (-1) and 0 otherwise; the arguments are not used.
+ */
+static int
+zpool_do_version(int argc, char **argv)
+{
+	return (zfs_version_print() == -1 ? 1 : 0);
+}
+
+/*
+ * Entry point for the zpool command.
+ *
+ * argv[1] selects what to do.  '-?' and '--help' print usage, '-V' and
+ * '--version' print the version.  Otherwise the name is looked up in
+ * command_table; failing that, an argument containing '=' is dispatched to
+ * the 'set' command, and 'freeze <pool>' is handled inline as a debugging
+ * aid.  Anything else is reported as unrecognized.
+ *
+ * The command line is recorded via zpool_log_history() only when the command
+ * returned 0 and log_history is still set.
+ *
+ * Returns the subcommand's result; 1 on initialization or allocation failure.
+ */
+int
+main(int argc, char **argv)
+{
+	int ret = 0;
+	int i = 0;
+	char *cmdname;
+	char **newargv;
+
+	(void) setlocale(LC_ALL, "");
+	(void) textdomain(TEXT_DOMAIN);
+	srand(time(NULL));
+
+	/* Subcommands print their own diagnostics for bad options */
+	opterr = 0;
+
+	/*
+	 * Make sure the user has specified some command.
+	 */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing command\n"));
+		usage(B_FALSE);
+	}
+
+	cmdname = argv[1];
+
+	/*
+	 * Special case '-?'
+	 */
+	if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0)
+		usage(B_TRUE);
+
+	/*
+	 * Special case '-V|--version'
+	 */
+	if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0))
+		return (zpool_do_version(argc, argv));
+
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (1);
+	}
+
+	libzfs_print_on_error(g_zfs, B_TRUE);
+
+	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+
+	/*
+	 * Many commands modify input strings for string parsing reasons.
+	 * We create a copy to protect the original argv.  Allocation failures
+	 * are reported instead of being dereferenced.
+	 */
+	newargv = malloc((argc + 1) * sizeof (newargv[0]));
+	if (newargv == NULL) {
+		(void) fprintf(stderr,
+		    gettext("internal error: out of memory\n"));
+		libzfs_fini(g_zfs);
+		return (1);
+	}
+	for (i = 0; i < argc; i++) {
+		newargv[i] = strdup(argv[i]);
+		if (newargv[i] == NULL) {
+			(void) fprintf(stderr,
+			    gettext("internal error: out of memory\n"));
+			/* Release the copies made so far */
+			while (i-- > 0)
+				free(newargv[i]);
+			free(newargv);
+			libzfs_fini(g_zfs);
+			return (1);
+		}
+	}
+	newargv[argc] = NULL;
+
+	/*
+	 * Run the appropriate command.
+	 */
+	if (find_command_idx(cmdname, &i) == 0) {
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc - 1, newargv + 1);
+	} else if (strchr(cmdname, '=')) {
+		/* 'prop=value' shorthand: pass the full argv to 'set' */
+		verify(find_command_idx("set", &i) == 0);
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc, newargv);
+	} else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
+		/*
+		 * 'freeze' is a vile debugging abomination, so we treat
+		 * it as such.
+		 */
+		zfs_cmd_t zc = {"\0"};
+
+		(void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
+		ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc);
+		if (ret != 0) {
+			(void) fprintf(stderr,
+			gettext("failed to freeze pool: %d\n"), errno);
+			ret = 1;
+		}
+
+		log_history = 0;
+	} else {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(B_FALSE);
+		ret = 1;
+	}
+
+	for (i = 0; i < argc; i++)
+		free(newargv[i]);
+	free(newargv);
+
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.c b/sys/contrib/openzfs/cmd/zpool/zpool_util.c
new file mode 100644
index 000000000000..1c1eb024f365
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <ctype.h>
+
+#include "zpool_util.h"
+
+/*
+ * Allocate 'size' zero-filled bytes, never returning NULL: if the allocation
+ * fails an error is printed to stderr and the process exits with status 1.
+ */
+void *
+safe_malloc(size_t size)
+{
+	void *buf = calloc(1, size);
+
+	if (buf != NULL)
+		return (buf);
+
+	(void) fprintf(stderr, "internal error: out of memory\n");
+	exit(1);
+}
+
+/*
+ * Report an out-of-memory condition on stderr and terminate the program with
+ * exit status 1.  Callers are expected to arrive here with errno == ENOMEM;
+ * that expectation is asserted.
+ */
+void
+zpool_no_memory(void)
+{
+	assert(errno == ENOMEM);
+	(void) fprintf(stderr, gettext("internal error: out of memory\n"));
+	exit(1);
+}
+
+/*
+ * Count the direct children of 'nv' whose ZPOOL_CONFIG_IS_LOG value is
+ * nonzero.  Returns 0 when 'nv' has no ZPOOL_CONFIG_CHILDREN array; a child
+ * without the IS_LOG entry is treated as not being a log.
+ */
+uint_t
+num_logs(nvlist_t *nv)
+{
+	nvlist_t **kids;
+	uint_t nkids, idx;
+	uint_t count = 0;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &kids, &nkids) != 0)
+		return (0);
+
+	for (idx = 0; idx < nkids; idx++) {
+		uint64_t log_flag = B_FALSE;
+
+		(void) nvlist_lookup_uint64(kids[idx], ZPOOL_CONFIG_IS_LOG,
+		    &log_flag);
+		if (log_flag != 0)
+			count++;
+	}
+
+	return (count);
+}
+
+/*
+ * Find the max element in an array of uint64_t values.
+ *
+ * 'array' holds 'len' elements.  Returns the largest one, or 0 when 'len' is
+ * 0.  The index is unsigned to match 'len', avoiding a signed/unsigned
+ * comparison in the loop condition.
+ */
+uint64_t
+array64_max(uint64_t array[], unsigned int len)
+{
+	uint64_t max = 0;
+	unsigned int i;
+
+	for (i = 0; i < len; i++)
+		max = MAX(max, array[i]);
+
+	return (max);
+}
+
+/*
+ * Locate the most significant set bit of 'i'.
+ * The result is that bit's number plus one, or 0 when no bit is set.
+ */
+int
+highbit64(uint64_t i)
+{
+	int bit = 0;
+
+	/* __builtin_clzll() is undefined for 0, so only call it when nonzero */
+	if (i != 0)
+		bit = NBBY * sizeof (uint64_t) - __builtin_clzll(i);
+
+	return (bit);
+}
+
+/*
+ * Locate the least significant set bit of 'i'.
+ * The result is that bit's number plus one, or 0 when no bit is set.
+ */
+int
+lowbit64(uint64_t i)
+{
+	return (i != 0 ? __builtin_ffsll(i) : 0);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
new file mode 100644
index 000000000000..265aa58953a0
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	ZPOOL_UTIL_H
+#define	ZPOOL_UTIL_H
+
+#include <libnvpair.h>
+#include <libzfs.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* Path to scripts you can run with "zpool status/iostat -c" */
+#define	ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d"
+
+/*
+ * Basic utility functions
+ */
+void *safe_malloc(size_t);
+void zpool_no_memory(void);
+uint_t num_logs(nvlist_t *nv);
+uint64_t array64_max(uint64_t array[], unsigned int len);
+int highbit64(uint64_t i);
+int lowbit64(uint64_t i);
+
+/*
+ * Misc utility functions
+ */
+char *zpool_get_cmd_search_path(void);
+
+/*
+ * Virtual device functions
+ */
+
+nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force,
+    int check_rep, boolean_t replacing, boolean_t dryrun, int argc,
+    char **argv);
+nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
+    nvlist_t *props, splitflags_t flags, int argc, char **argv);
+
+/*
+ * Pool list functions
+ */
+int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
+    zpool_iter_f, void *);
+
+/* Vdev list functions */
+typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
+int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
+
+typedef struct zpool_list zpool_list_t;
+
+zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
+void pool_list_update(zpool_list_t *);
+int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
+void pool_list_free(zpool_list_t *);
+int pool_list_count(zpool_list_t *);
+void pool_list_remove(zpool_list_t *, zpool_handle_t *);
+
+extern libzfs_handle_t *g_zfs;
+
+
+/*
+ * Per-vdev record holding a command's output, split into column names and
+ * lines, together with strings identifying the vdev.  An array of these is
+ * kept in vdev_cmd_data_list_t.data.
+ * NOTE(review): ownership of the strings is not visible from this header --
+ * presumably released by free_vdev_cmd_data_list(); confirm.
+ */
+typedef	struct vdev_cmd_data
+{
+	char **lines;	/* Array of lines of output, minus the column name */
+	int lines_cnt;	/* Number of lines in the array */
+
+	char **cols;	/* Array of column names */
+	int cols_cnt;	/* Number of column names */
+
+
+	char *path;	/* vdev path */
+	char *upath;	/* vdev underlying path */
+	char *pool;	/* Pool name */
+	char *cmd;	/* backpointer to cmd */
+	char *vdev_enc_sysfs_path;	/* enclosure sysfs path (if any) */
+} vdev_cmd_data_t;
+
+/*
+ * A command plus the array of per-vdev records (vdev_cmd_data_t) it applies
+ * to.  Returned by all_pools_for_each_vdev_run() and released with
+ * free_vdev_cmd_data_list().
+ */
+typedef struct vdev_cmd_data_list
+{
+	char *cmd;		/* Command to run */
+	unsigned int count;	/* Number of vdev_cmd_data items (vdevs) */
+
+	/* fields used to select only certain vdevs, if requested */
+	libzfs_handle_t *g_zfs;
+	char **vdev_names;
+	int vdev_names_count;
+	int cb_name_flags;
+
+	vdev_cmd_data_t *data;	/* Array of vdevs */
+
+	/* List of unique column names and widths */
+	char **uniq_cols;
+	int uniq_cols_cnt;
+	int *uniq_cols_width;
+
+} vdev_cmd_data_list_t;
+
+vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv,
+    char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
+    int cb_name_flags);
+
+void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl);
+
+int check_device(const char *path, boolean_t force,
+    boolean_t isspare, boolean_t iswholedisk);
+boolean_t check_sector_size_database(char *path, int *sector_size);
+void vdev_error(const char *fmt, ...);
+int check_file(const char *file, boolean_t force, boolean_t isspare);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* ZPOOL_UTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
new file mode 100644
index 000000000000..9aa09b18c4ae
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
@@ -0,0 +1,1581 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration.  Each entry in the list can be one of:
+ *
+ * 	Device vdevs
+ * 		disk=(path=..., devid=...)
+ * 		file=(path=...)
+ *
+ * 	Group vdevs
+ * 		raidz[1|2]=(...)
+ * 		mirror=(...)
+ *
+ * 	Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs.  All userland verification of devices is contained within
+ * this file.  If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'.  The
+ * function performs several passes:
+ *
+ * 	1. Construct the vdev specification.  Performs syntax validation and
+ *         makes sure each device is valid.
+ * 	2. Check for devices in use.  Using libblkid to make sure that no
+ *         devices are also in use.  Some can be overridden using the 'force'
+ *         flag, others cannot.
+ * 	3. Check for replication errors if the 'force' flag is not specified.
+ *         Validates that the replication level is consistent across the
+ *         entire pool.
+ * 	4. Call libzfs to label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+#include <sys/stat.h>
+
+/*
+ * For any given vdev specification, we can have multiple errors.  The
+ * vdev_error() function keeps track of whether we have seen an error yet, and
+ * prints out a header if it's the first error we've seen.
+ */
+boolean_t error_seen;
+boolean_t is_force;
+
+
+
+
+/*
+ * Print a printf-style vdev specification error to stderr.  The first call
+ * (while error_seen is still clear) also prints a header whose second line
+ * depends on is_force, and then sets error_seen.
+ */
+/*PRINTFLIKE1*/
+void
+vdev_error(const char *fmt, ...)
+{
+	va_list args;
+
+	if (!error_seen) {
+		error_seen = B_TRUE;
+		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
+		if (is_force) {
+			(void) fprintf(stderr, gettext("the following errors "
+			    "must be manually repaired:\n"));
+		} else {
+			(void) fprintf(stderr, gettext("use '-f' to override "
+			    "the following errors:\n"));
+		}
+	}
+
+	va_start(args, fmt);
+	(void) vfprintf(stderr, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Check whether 'file' may be used as a vdev.  The check performed is
+ * zpool_in_use(): whether the file already carries a label tying it to a pool.
+ *
+ * A file labeled as a hot spare is accepted when 'isspare' is set, so that
+ * spares can be shared between pools.  Membership in an active pool, or being
+ * reserved as a spare, is always an error; other states are an error unless
+ * 'force' is set.
+ *
+ * Returns 0 if the file is acceptable (or cannot be opened or examined) and
+ * -1 after reporting a problem through vdev_error().
+ */
+int
+check_file(const char *file, boolean_t force, boolean_t isspare)
+{
+	pool_state_t state;
+	boolean_t inuse;
+	const char *desc;
+	char *name;
+	int rv = 0;
+	int fd;
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0)
+		return (0);
+
+	/* Nothing more to check if the file is not part of any pool */
+	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || !inuse) {
+		(void) close(fd);
+		return (0);
+	}
+
+	if (state == POOL_STATE_ACTIVE)
+		desc = gettext("active");
+	else if (state == POOL_STATE_EXPORTED)
+		desc = gettext("exported");
+	else if (state == POOL_STATE_POTENTIALLY_ACTIVE)
+		desc = gettext("potentially active");
+	else
+		desc = gettext("unknown");
+
+	/*
+	 * Allow hot spares to be shared between pools: a spare being added
+	 * as a spare skips the error reporting entirely.
+	 */
+	if (!(state == POOL_STATE_SPARE && isspare)) {
+		if (state == POOL_STATE_SPARE) {
+			vdev_error(gettext("%s is reserved as a hot spare "
+			    "for pool %s\n"), file, name);
+			rv = -1;
+		} else if (state == POOL_STATE_ACTIVE || !force) {
+			vdev_error(gettext("%s is part of %s pool '%s'\n"),
+			    file, desc, name);
+			rv = -1;
+		}
+	}
+
+	free(name);
+	(void) close(fd);
+	return (rv);
+}
+
+/*
+ * Try to interpret 'arg' as a shorthand device name.
+ *
+ * zfs_resolve_shortname() expands 'arg' into 'path'.  If that works and the
+ * result is either a whole disk or something stat64() can examine, 0 is
+ * returned with '*wholedisk' (and, when stat64() ran, '*statbuf') describing
+ * it.
+ *
+ * In every other case 'path' is reset to a copy of 'arg', '*statbuf' is
+ * zeroed, '*wholedisk' is cleared, and the value zfs_resolve_shortname()
+ * returned is passed back.  Note that this is 0 when the name resolved but
+ * the stat64() call failed.
+ */
+static int
+is_shorthand_path(const char *arg, char *path, size_t path_size,
+    struct stat64 *statbuf, boolean_t *wholedisk)
+{
+	int rc = zfs_resolve_shortname(arg, path, path_size);
+
+	if (rc == 0) {
+		*wholedisk = zfs_dev_is_whole_disk(path);
+		if (*wholedisk)
+			return (0);
+		if (stat64(path, statbuf) == 0)
+			return (0);
+	}
+
+	/* Not usable as a shorthand: hand the caller back its own input */
+	strlcpy(path, arg, path_size);
+	memset(statbuf, 0, sizeof (*statbuf));
+	*wholedisk = B_FALSE;
+
+	return (rc);
+}
+
+/*
+ * Decide whether the device at 'path' is a hot spare.
+ *
+ * The device must open, be reported in use with state POOL_STATE_SPARE, and
+ * have a readable label; otherwise the answer is B_FALSE.  With a NULL
+ * 'config' that label check alone decides.  Otherwise the label's guid must
+ * also match one of the entries in the config's ZPOOL_CONFIG_SPARES array.
+ */
+static boolean_t
+is_spare(nvlist_t *config, const char *path)
+{
+	nvlist_t *label, *vdev_tree;
+	nvlist_t **spare_list;
+	uint_t spare_cnt, idx;
+	uint64_t label_guid, cur_guid;
+	pool_state_t state;
+	boolean_t inuse, labeled_spare;
+	char *poolname = NULL;
+	int fd;
+
+	fd = open(path, O_RDONLY|O_DIRECT);
+	if (fd < 0)
+		return (B_FALSE);
+
+	/* The label is only read once the device is known to be a spare */
+	labeled_spare =
+	    (zpool_in_use(g_zfs, fd, &state, &poolname, &inuse) == 0 &&
+	    inuse &&
+	    state == POOL_STATE_SPARE &&
+	    zpool_read_label(fd, &label, NULL) == 0);
+
+	free(poolname);
+	(void) close(fd);
+
+	if (!labeled_spare)
+		return (B_FALSE);
+
+	if (config == NULL) {
+		nvlist_free(label);
+		return (B_TRUE);
+	}
+
+	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+	    &label_guid) == 0);
+	nvlist_free(label);
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &vdev_tree) == 0);
+	if (nvlist_lookup_nvlist_array(vdev_tree, ZPOOL_CONFIG_SPARES,
+	    &spare_list, &spare_cnt) != 0)
+		return (B_FALSE);
+
+	for (idx = 0; idx < spare_cnt; idx++) {
+		verify(nvlist_lookup_uint64(spare_list[idx],
+		    ZPOOL_CONFIG_GUID, &cur_guid) == 0);
+		if (cur_guid == label_guid)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
+ * device, fill in the device id to make a complete nvlist.  Valid forms for a
+ * leaf vdev are:
+ *
+ *	/dev/xxx	Complete disk path
+ *	/xxx		Full path to file
+ *	xxx		Shorthand for <zfs_vdev_paths>/xxx
+ *
+ * 'props' may be NULL; when it carries an ashift property, that value is
+ * validated and recorded on the vdev.  'is_log' is stored as
+ * ZPOOL_CONFIG_IS_LOG.
+ *
+ * Returns the new nvlist, or NULL after printing a message to stderr.  No
+ * nvlist is leaked on the error paths.
+ */
+static nvlist_t *
+make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
+{
+	char path[MAXPATHLEN];
+	struct stat64 statbuf;
+	nvlist_t *vdev = NULL;
+	char *type = NULL;
+	boolean_t wholedisk = B_FALSE;
+	uint64_t ashift = 0;
+	int err;
+
+	/*
+	 * Determine what type of vdev this is, and put the full path into
+	 * 'path'.  We detect whether this is a device or file afterwards by
+	 * checking the st_mode of the file.
+	 */
+	if (arg[0] == '/') {
+		/*
+		 * Complete device or file path.  Exact type is determined by
+		 * examining the file descriptor afterwards.  Symbolic links
+		 * are resolved to their real paths to determine whole disk
+		 * and S_ISBLK/S_ISREG type checks.  However, we are careful
+		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
+		 * can leverage udev's persistent device labels.
+		 */
+		if (realpath(arg, path) == NULL) {
+			(void) fprintf(stderr,
+			    gettext("cannot resolve path '%s'\n"), arg);
+			return (NULL);
+		}
+
+		wholedisk = zfs_dev_is_whole_disk(path);
+		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
+			(void) fprintf(stderr,
+			    gettext("cannot open '%s': %s\n"),
+			    path, strerror(errno));
+			return (NULL);
+		}
+
+		/* After whole disk check restore original passed path */
+		strlcpy(path, arg, sizeof (path));
+	} else {
+		err = is_shorthand_path(arg, path, sizeof (path),
+		    &statbuf, &wholedisk);
+		if (err != 0) {
+			/*
+			 * If we got ENOENT, then the user gave us
+			 * gibberish, so try to direct them with a
+			 * reasonable error message.  Otherwise,
+			 * regurgitate strerror() since it's the best we
+			 * can do.
+			 */
+			if (err == ENOENT) {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': no such "
+				    "device in %s\n"), arg, DISK_ROOT);
+				(void) fprintf(stderr,
+				    gettext("must be a full path or "
+				    "shorthand device name\n"));
+				return (NULL);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': %s\n"),
+				    path, strerror(errno));
+				return (NULL);
+			}
+		}
+	}
+
+	/*
+	 * Determine whether this is a device or a file.  'statbuf' is only
+	 * consulted when this is not a whole disk.
+	 */
+	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
+		type = VDEV_TYPE_DISK;
+	} else if (S_ISREG(statbuf.st_mode)) {
+		type = VDEV_TYPE_FILE;
+	} else {
+		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
+		    "block device or regular file\n"), path);
+		return (NULL);
+	}
+
+	/*
+	 * Finally, we have the complete device or file, and we know that it is
+	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
+	 * vdevs have a 'path' element, and devices also have a 'devid' element.
+	 */
+	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
+	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+	if (is_log)
+		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
+		    VDEV_ALLOC_BIAS_LOG) == 0);
+	if (strcmp(type, VDEV_TYPE_DISK) == 0)
+		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
+		    (uint64_t)wholedisk) == 0);
+
+	/*
+	 * Override defaults if custom properties are provided.  From here on
+	 * 'vdev' is allocated, so error returns must release it.
+	 */
+	if (props != NULL) {
+		char *value = NULL;
+
+		if (nvlist_lookup_string(props,
+		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
+			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
+				(void) fprintf(stderr,
+				    gettext("ashift must be a number.\n"));
+				nvlist_free(vdev);
+				return (NULL);
+			}
+			/* An ashift of 0 is allowed and means "not set" */
+			if (ashift != 0 &&
+			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
+				(void) fprintf(stderr,
+				    gettext("invalid 'ashift=%" PRIu64 "' "
+				    "property: only values between %" PRId32 " "
+				    "and %" PRId32 " are allowed.\n"),
+				    ashift, ASHIFT_MIN, ASHIFT_MAX);
+				nvlist_free(vdev);
+				return (NULL);
+			}
+		}
+	}
+
+	/*
+	 * If the device is known to incorrectly report its physical sector
+	 * size explicitly provide the known correct value.
+	 */
+	if (ashift == 0) {
+		int sector_size;
+
+		if (check_sector_size_database(path, &sector_size) == B_TRUE)
+			ashift = highbit64(sector_size) - 1;
+	}
+
+	if (ashift > 0)
+		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
+
+	return (vdev);
+}
+
+/*
+ * Go through and verify the replication level of the pool is consistent.
+ * Performs the following checks:
+ *
+ * 	For the new spec, verifies that devices in mirrors and raidz are the
+ * 	same size.
+ *
+ * 	If the current configuration already has inconsistent replication
+ * 	levels, ignore any other potential problems in the new spec.
+ *
+ * 	Otherwise, make sure that the current spec (if there is one) and the new
+ * 	spec have consistent replication levels.
+ *
+ *	If there is no current spec (create), make sure new spec has at least
+ *	one general purpose vdev.
+ */
+/*
+ * Summary of one top-level vdev's replication, as filled in by
+ * get_replication().
+ */
+typedef struct replication_level {
+	char *zprl_type;	/* vdev type string (ZPOOL_CONFIG_TYPE) */
+	uint64_t zprl_children;	/* number of children; 1 for a leaf vdev */
+	uint64_t zprl_parity;	/* ZPOOL_CONFIG_NPARITY for raidz, else 0 */
+} replication_level_t;
+
+/*
+ * Largest difference in st_size between children of one vdev that
+ * get_replication() accepts before reporting devices of different sizes.
+ */
+#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
+
+/*
+ * Test whether 'a' is a raidz and 'b' is a mirror replication level (in that
+ * order).  If so, store them in '*raidz' and '*mirror' and return B_TRUE;
+ * otherwise return B_FALSE and leave the outputs untouched.
+ */
+static boolean_t
+is_raidz_mirror(replication_level_t *a, replication_level_t *b,
+    replication_level_t **raidz, replication_level_t **mirror)
+{
+	if (strcmp(a->zprl_type, "raidz") != 0 ||
+	    strcmp(b->zprl_type, "mirror") != 0)
+		return (B_FALSE);
+
+	*raidz = a;
+	*mirror = b;
+	return (B_TRUE);
+}
+
+/*
+ * Given a list of toplevel vdevs, return the current replication level.  If
+ * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
+ * an error message will be displayed for each self-inconsistent vdev.
+ */
+static replication_level_t *
+get_replication(nvlist_t *nvroot, boolean_t fatal)
+{
+	nvlist_t **top;
+	uint_t t, toplevels;
+	nvlist_t **child;
+	uint_t c, children;
+	nvlist_t *nv;
+	char *type;
+	replication_level_t lastrep = {0};
+	replication_level_t rep;
+	replication_level_t *ret;
+	replication_level_t *raidz, *mirror;
+	boolean_t dontreport;
+
+	ret = safe_malloc(sizeof (replication_level_t));
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+
+	for (t = 0; t < toplevels; t++) {
+		uint64_t is_log = B_FALSE;
+
+		nv = top[t];
+
+		/*
+		 * For separate logs we ignore the top level vdev replication
+		 * constraints.
+		 */
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
+		if (is_log)
+			continue;
+
+		/* Ignore holes introduced by removing aux devices */
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+		if (strcmp(type, VDEV_TYPE_HOLE) == 0)
+			continue;
+
+		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+		    &child, &children) != 0) {
+			/*
+			 * This is a 'file' or 'disk' vdev.
+			 */
+			rep.zprl_type = type;
+			rep.zprl_children = 1;
+			rep.zprl_parity = 0;
+		} else {
+			int64_t vdev_size;
+
+			/*
+			 * This is a mirror or RAID-Z vdev.  Go through and make
+			 * sure the contents are all the same (files vs. disks),
+			 * keeping track of the number of elements in the
+			 * process.
+			 *
+			 * We also check that the size of each vdev (if it can
+			 * be determined) is the same.
+			 */
+			rep.zprl_type = type;
+			rep.zprl_children = 0;
+
+			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+				verify(nvlist_lookup_uint64(nv,
+				    ZPOOL_CONFIG_NPARITY,
+				    &rep.zprl_parity) == 0);
+				assert(rep.zprl_parity != 0);
+			} else {
+				rep.zprl_parity = 0;
+			}
+
+			/*
+			 * The 'dontreport' variable indicates that we've
+			 * already reported an error for this spec, so don't
+			 * bother doing it again.
+			 */
+			type = NULL;
+			dontreport = 0;
+			vdev_size = -1LL;
+			for (c = 0; c < children; c++) {
+				nvlist_t *cnv = child[c];
+				char *path;
+				struct stat64 statbuf;
+				int64_t size = -1LL;
+				char *childtype;
+				int fd, err;
+
+				rep.zprl_children++;
+
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
+
+				/*
+				 * If this is a replacing or spare vdev, then
+				 * get the real first child of the vdev: do this
+				 * in a loop because replacing and spare vdevs
+				 * can be nested.
+				 */
+				while (strcmp(childtype,
+				    VDEV_TYPE_REPLACING) == 0 ||
+				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
+					nvlist_t **rchild;
+					uint_t rchildren;
+
+					verify(nvlist_lookup_nvlist_array(cnv,
+					    ZPOOL_CONFIG_CHILDREN, &rchild,
+					    &rchildren) == 0);
+					assert(rchildren == 2);
+					cnv = rchild[0];
+
+					verify(nvlist_lookup_string(cnv,
+					    ZPOOL_CONFIG_TYPE,
+					    &childtype) == 0);
+				}
+
+				verify(nvlist_lookup_string(cnv,
+				    ZPOOL_CONFIG_PATH, &path) == 0);
+
+				/*
+				 * If we have a raidz/mirror that combines disks
+				 * with files, report it as an error.
+				 */
+				if (!dontreport && type != NULL &&
+				    strcmp(type, childtype) != 0) {
+					if (ret != NULL)
+						free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "mismatched replication "
+						    "level: %s contains both "
+						    "files and devices\n"),
+						    rep.zprl_type);
+					else
+						return (NULL);
+					dontreport = B_TRUE;
+				}
+
+				/*
+				 * According to stat(2), the value of 'st_size'
+				 * is undefined for block devices and character
+				 * devices.  But there is no effective way to
+				 * determine the real size in userland.
+				 *
+				 * Instead, we'll take advantage of an
+				 * implementation detail of spec_size().  If the
+				 * device is currently open, then we (should)
+				 * return a valid size.
+				 *
+				 * If we still don't get a valid size (indicated
+				 * by a size of 0 or MAXOFFSET_T), then ignore
+				 * this device altogether.
+				 */
+				if ((fd = open(path, O_RDONLY)) >= 0) {
+					err = fstat64_blk(fd, &statbuf);
+					(void) close(fd);
+				} else {
+					err = stat64(path, &statbuf);
+				}
+
+				if (err != 0 ||
+				    statbuf.st_size == 0 ||
+				    statbuf.st_size == MAXOFFSET_T)
+					continue;
+
+				size = statbuf.st_size;
+
+				/*
+				 * Also make sure that devices and
+				 * slices have a consistent size.  If
+				 * they differ by a significant amount
+				 * (~16MB) then report an error.
+				 */
+				if (!dontreport &&
+				    (vdev_size != -1LL &&
+				    (llabs(size - vdev_size) >
+				    ZPOOL_FUZZ))) {
+					if (ret != NULL)
+						free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "%s contains devices of "
+						    "different sizes\n"),
+						    rep.zprl_type);
+					else
+						return (NULL);
+					dontreport = B_TRUE;
+				}
+
+				type = childtype;
+				vdev_size = size;
+			}
+		}
+
+		/*
+		 * At this point, we have the replication of the last toplevel
+		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
+		 * different.
+		 */
+		if (lastrep.zprl_type != NULL) {
+			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
+			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
+				/*
+				 * Accepted raidz and mirror when they can
+				 * handle the same number of disk failures.
+				 */
+				if (raidz->zprl_parity !=
+				    mirror->zprl_children - 1) {
+					if (ret != NULL)
+						free(ret);
+					ret = NULL;
+					if (fatal)
+						vdev_error(gettext(
+						    "mismatched replication "
+						    "level: "
+						    "%s and %s vdevs with "
+						    "different redundancy, "
+						    "%llu vs. %llu (%llu-way) "
+						    "are present\n"),
+						    raidz->zprl_type,
+						    mirror->zprl_type,
+						    raidz->zprl_parity,
+						    mirror->zprl_children - 1,
+						    mirror->zprl_children);
+					else
+						return (NULL);
+				}
+			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
+			    0) {
+				if (ret != NULL)
+					free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %s and %s vdevs are "
+					    "present\n"),
+					    lastrep.zprl_type, rep.zprl_type);
+				else
+					return (NULL);
+			} else if (lastrep.zprl_parity != rep.zprl_parity) {
+				if (ret)
+					free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %llu and %llu device parity "
+					    "%s vdevs are present\n"),
+					    lastrep.zprl_parity,
+					    rep.zprl_parity,
+					    rep.zprl_type);
+				else
+					return (NULL);
+			} else if (lastrep.zprl_children != rep.zprl_children) {
+				if (ret)
+					free(ret);
+				ret = NULL;
+				if (fatal)
+					vdev_error(gettext(
+					    "mismatched replication level: "
+					    "both %llu-way and %llu-way %s "
+					    "vdevs are present\n"),
+					    lastrep.zprl_children,
+					    rep.zprl_children,
+					    rep.zprl_type);
+				else
+					return (NULL);
+			}
+		}
+		lastrep = rep;
+	}
+
+	if (ret != NULL)
+		*ret = rep;
+
+	return (ret);
+}
+
+/*
+ * Check the replication level of the vdev spec against the current pool.  Calls
+ * get_replication() to make sure the new spec is self-consistent.  If the pool
+ * is already inconsistent, any mismatch is ignored and 0 is returned; otherwise
+ * any difference between the two is reported.  Returns 0 if OK, -1 otherwise.
+ */
+static int
+check_replication(nvlist_t *config, nvlist_t *newroot)
+{
+	nvlist_t **child;
+	uint_t	children;
+	replication_level_t *current = NULL, *new;
+	replication_level_t *raidz, *mirror;
+	int ret;
+
+	/*
+	 * If we have a current pool configuration, check to see if it's
+	 * self-consistent.  If not, simply return success.
+	 */
+	if (config != NULL) {
+		nvlist_t *nvroot;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvroot) == 0);
+		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
+			return (0);
+	}
+	/*
+	 * For spares there may be no children, and therefore no
+	 * replication level to check.
+	 */
+	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) || (children == 0)) {
+		free(current);
+		return (0);
+	}
+
+	/*
+	 * If all we have is logs then there's no replication level to check.
+	 */
+	if (num_logs(newroot) == children) {
+		free(current);
+		return (0);
+	}
+
+	/*
+	 * Get the replication level of the new vdev spec, reporting any
+	 * inconsistencies found.
+	 */
+	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
+		free(current);
+		return (-1);	/* already reported by get_replication() */
+	}
+
+	/*
+	 * Check to see if the new vdev spec matches the replication level of
+	 * the current pool.
+	 */
+	ret = 0;
+	if (current != NULL) {
+		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
+		    is_raidz_mirror(new, current, &raidz, &mirror)) {
+			if (raidz->zprl_parity != mirror->zprl_children - 1) {
+				vdev_error(gettext(
+				    "mismatched replication level: pool and "
+				    "new vdev with different redundancy, %s "
+				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
+				    raidz->zprl_type,
+				    mirror->zprl_type,
+				    raidz->zprl_parity,
+				    mirror->zprl_children - 1,
+				    mirror->zprl_children);
+				ret = -1;
+			}
+		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %s "
+			    "and new vdev is %s\n"),
+			    current->zprl_type, new->zprl_type);
+			ret = -1;
+		} else if (current->zprl_parity != new->zprl_parity) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %llu "
+			    "device parity and new vdev uses %llu\n"),
+			    current->zprl_parity, new->zprl_parity);
+			ret = -1;
+		} else if (current->zprl_children != new->zprl_children) {
+			vdev_error(gettext(
+			    "mismatched replication level: pool uses %llu-way "
+			    "%s and new vdev uses %llu-way %s\n"),
+			    current->zprl_children, current->zprl_type,
+			    new->zprl_children, new->zprl_type);
+			ret = -1;
+		}
+	}
+
+	free(new);
+	if (current != NULL)
+		free(current);
+
+	return (ret);
+}
+
+/* Zero the first 4 KiB of 'path'.  Returns 0 on success, -1 on any failure. */
+static int
+zero_label(char *path)
+{
+	const int size = 4096;
+	char buf[size];
+	int err, fd, saved_errno;
+
+	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
+		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+		    path, strerror(errno));
+		return (-1);
+	}
+
+	memset(buf, 0, size);
+	err = write(fd, buf, size);
+	saved_errno = errno;	/* fdatasync()/close() may clobber errno */
+	(void) fdatasync(fd);
+	(void) close(fd);
+
+	if (err == -1) {
+		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
+		    "of '%s': %s\n"), size, path, strerror(saved_errno));
+		return (-1);
+	}
+	if (err != size) {
+		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
+		    "of '%s'\n"), err, size, path);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Go through and find any whole disks in the vdev specification, labelling them
+ * as appropriate.  When constructing the vdev spec, we were unable to open this
+ * device in order to provide a devid.  Now that we have labelled the disk and
+ * know that slice 0 is valid, we can construct the devid now.
+ *
+ * If the disk was already labeled with an EFI label, we will have gotten the
+ * devid already (because we were able to open the whole disk).  Otherwise, we
+ * need to get the devid after we label the disk.  Returns 0 on success.
+ */
+static int
+make_disks(zpool_handle_t *zhp, nvlist_t *nv)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	char devpath[MAXPATHLEN];
+	char udevpath[MAXPATHLEN];
+	uint64_t wholedisk;
+	struct stat64 statbuf;
+	int is_exclusive = 0;	/* set if the O_EXCL open fails as busy */
+	int fd;
+	int ret;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+
+		if (strcmp(type, VDEV_TYPE_DISK) != 0)
+			return (0);
+
+		/*
+		 * We have a disk device.  If this is a whole disk write
+		 * out the efi partition table, otherwise write zero's to
+		 * the first 4k of the partition.  This is to ensure that
+		 * libblkid will not misidentify the partition due to a
+		 * magic value left by the previous filesystem.
+		 */
+		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk));
+
+		if (!wholedisk) {
+			/*
+			 * Update device id string for mpath nodes (Linux only)
+			 */
+			if (is_mpath_whole_disk(path))
+				update_vdev_config_dev_strs(nv);
+
+			if (!is_spare(NULL, path))
+				(void) zero_label(path); /* best effort */
+			return (0);
+		}
+
+		if (realpath(path, devpath) == NULL) {
+			ret = errno;
+			(void) fprintf(stderr,
+			    gettext("cannot resolve path '%s'\n"), path);
+			return (ret);
+		}
+
+		/*
+		 * Remove any previously existing symlink from a udev path to
+		 * the device before labeling the disk.  This ensures that
+		 * only newly created links are used.  Otherwise there is a
+		 * window between when udev deletes and recreates the link
+		 * during which access attempts will fail with ENOENT.
+		 */
+		strlcpy(udevpath, path, MAXPATHLEN);
+		(void) zfs_append_partition(udevpath, MAXPATHLEN);
+
+		fd = open(devpath, O_RDWR|O_EXCL);
+		if (fd == -1) {
+			if (errno == EBUSY)
+				is_exclusive = 1;
+#ifdef __FreeBSD__
+			if (errno == EPERM)
+				is_exclusive = 1;
+#endif
+		} else {
+			(void) close(fd);
+		}
+
+		/*
+		 * If the partition exists, contains a valid spare label,
+		 * and is opened exclusively there is no need to partition
+		 * it.  Hot spares have already been partitioned and are
+		 * held open exclusively by the kernel as a safety measure.
+		 *
+		 * If the provided path is for a /dev/disk/ device its
+		 * symbolic link will be removed, partition table created,
+		 * and then block until udev creates the new link.
+		 */
+		if (!is_exclusive && !is_spare(NULL, udevpath)) {
+			char *devnode = strrchr(devpath, '/') + 1;
+
+			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
+			if (ret == 0) {
+				ret = lstat64(udevpath, &statbuf);
+				if (ret == 0 && S_ISLNK(statbuf.st_mode))
+					(void) unlink(udevpath);
+			}
+
+			/*
+			 * When labeling a pool the raw device node name
+			 * is provided as it appears under /dev/.
+			 */
+			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
+				return (-1);
+
+			/*
+			 * Wait for udev to signal the device is available
+			 * by the provided path.
+			 */
+			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
+			if (ret) {
+				(void) fprintf(stderr,
+				    gettext("missing link: %s was "
+				    "partitioned but %s is missing\n"),
+				    devnode, udevpath);
+				return (ret);
+			}
+
+			ret = zero_label(udevpath);
+			if (ret)
+				return (ret);
+		}
+
+		/*
+		 * Update the path to refer to the partition.  The presence of
+		 * the 'whole_disk' field indicates to the CLI that we should
+		 * chop off the partition number when displaying the device in
+		 * future output.
+		 */
+		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
+
+		/*
+		 * Update device id strings for whole disks (Linux only)
+		 */
+		update_vdev_config_dev_strs(nv);
+
+		return (0);
+	}
+
+	for (c = 0; c < children; c++)
+		if ((ret = make_disks(zhp, child[c])) != 0)
+			return (ret);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if ((ret = make_disks(zhp, child[c])) != 0)
+				return (ret);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if ((ret = make_disks(zhp, child[c])) != 0)
+				return (ret);
+
+	return (0);
+}
+
+/*
+ * Go through and find any devices that are in use.  Returns B_TRUE if any leaf
+ * (including spares and l2cache devices) fails check_device()/check_file().
+ */
+static boolean_t
+is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+    boolean_t replacing, boolean_t isspare)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	int ret = 0;
+	char buf[MAXPATHLEN];
+	uint64_t wholedisk = B_FALSE;
+	boolean_t anyinuse = B_FALSE;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+
+		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			verify(!nvlist_lookup_uint64(nv,
+			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
+
+		/*
+		 * As a generic check, we look to see if this is a replace of a
+		 * hot spare within the same pool.  If so, we allow it
+		 * regardless of what libblkid or zpool_in_use() says.
+		 */
+		if (replacing) {
+			(void) strlcpy(buf, path, sizeof (buf));
+			if (wholedisk) {
+				ret = zfs_append_partition(buf, sizeof (buf));
+				if (ret == -1)	/* treat failure as in use */
+					return (B_TRUE);
+			}
+
+			if (is_spare(config, buf))
+				return (B_FALSE);
+		}
+
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			ret = check_device(path, force, isspare, wholedisk);
+
+		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
+			ret = check_file(path, force, isspare);
+
+		return (ret != 0);
+	}
+
+	for (c = 0; c < children; c++)
+		if (is_device_in_use(config, child[c], force, replacing,
+		    B_FALSE))
+			anyinuse = B_TRUE;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_TRUE))
+				anyinuse = B_TRUE;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_FALSE))
+				anyinuse = B_TRUE;
+
+	return (anyinuse);
+}
+
+/*
+ * Map a command-line keyword to its vdev grouping type, or return NULL when
+ * 'type' is not a grouping keyword.  On a match *mindev receives the minimum
+ * child count and *maxdev the maximum (either pointer may be NULL).  Note
+ * that *maxdev is also written for unrecognized non-"raidz" words, and that
+ * "raidzN" is accepted only for N in 1..254 with no leading zero.
+ */
+static const char *
+is_grouping(const char *type, int *mindev, int *maxdev)
+{
+	const char *kind;
+	int least = 1;
+
+	if (strncmp(type, "raidz", 5) == 0) {
+		const char *suffix = type + 5;
+		long nparity = 1;
+
+		if (*suffix == '0')
+			return (NULL); /* no zero prefixes allowed */
+
+		/* A bare "raidz" means single parity; otherwise parse it. */
+		if (*suffix != '\0') {
+			char *end;
+
+			errno = 0;
+			nparity = strtol(suffix, &end, 10);
+			if (errno != 0 || *end != '\0' ||
+			    nparity < 1 || nparity >= 255)
+				return (NULL);
+		}
+
+		if (mindev != NULL)
+			*mindev = nparity + 1;
+		if (maxdev != NULL)
+			*maxdev = 255;
+		return (VDEV_TYPE_RAIDZ);
+	}
+
+	/* Every non-raidz keyword leaves the child count unbounded. */
+	if (maxdev != NULL)
+		*maxdev = INT_MAX;
+
+	if (strcmp(type, "mirror") == 0) {
+		kind = VDEV_TYPE_MIRROR;
+		least = 2;
+	} else if (strcmp(type, "spare") == 0) {
+		kind = VDEV_TYPE_SPARE;
+	} else if (strcmp(type, "log") == 0) {
+		kind = VDEV_TYPE_LOG;
+	} else if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+		kind = type;	/* allocation classes are returned as given */
+	} else if (strcmp(type, "cache") == 0) {
+		kind = VDEV_TYPE_L2CACHE;
+	} else {
+		return (NULL);
+	}
+
+	if (mindev != NULL)
+		*mindev = least;
+
+	return (kind);
+}
+
+/*
+ * Construct a syntactically valid vdev specification from argv, and ensure
+ * that all devices and files exist and can be opened.  Returns the root
+ * nvlist, or NULL on error.  The top, spares and l2cache arrays built here
+ * (and their elements) are always freed at spec_out before returning.
+ */
+static nvlist_t *
+construct_spec(nvlist_t *props, int argc, char **argv)
+{
+	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
+	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
+	const char *type;
+	uint64_t is_log, is_special, is_dedup;
+	boolean_t seen_logs;
+
+	top = NULL;
+	toplevels = 0;
+	spares = NULL;
+	l2cache = NULL;
+	nspares = 0;
+	nlogs = 0;
+	nl2cache = 0;
+	is_log = is_special = is_dedup = B_FALSE;
+	seen_logs = B_FALSE;
+	nvroot = NULL;
+
+	while (argc > 0) {
+		nv = NULL;
+
+		/*
+		 * If it's a mirror or raidz, the subsequent arguments are
+		 * its leaves -- until we encounter the next mirror or raidz.
+		 */
+		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
+			nvlist_t **child = NULL;
+			int c, children = 0;
+
+			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+				if (spares != NULL) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'spare' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				is_log = is_special = is_dedup = B_FALSE;
+			}
+
+			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
+				if (seen_logs) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'log' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				seen_logs = B_TRUE;
+				is_log = B_TRUE;
+				is_special = B_FALSE;
+				is_dedup = B_FALSE;
+				argc--;
+				argv++;
+				/*
+				 * A log is not a real grouping device.
+				 * We just set is_log and continue.
+				 */
+				continue;
+			}
+
+			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
+				is_special = B_TRUE;
+				is_log = B_FALSE;
+				is_dedup = B_FALSE;
+				argc--;
+				argv++;
+				continue;
+			}
+
+			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+				is_dedup = B_TRUE;
+				is_log = B_FALSE;
+				is_special = B_FALSE;
+				argc--;
+				argv++;
+				continue;
+			}
+
+			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+				if (l2cache != NULL) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'cache' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				is_log = is_special = is_dedup = B_FALSE;
+			}
+
+			if (is_log || is_special || is_dedup) {
+				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+					(void) fprintf(stderr, gettext(
+					    "invalid vdev specification: "
+					    "unsupported '%s' device: %s\n"),
+					    is_log ? "log" : is_special ?
+					    "special" : "dedup", type);
+					goto spec_out;
+				}
+				nlogs++;
+			}
+
+			for (c = 1; c < argc; c++) {
+				if (is_grouping(argv[c], NULL, NULL) != NULL)
+					break;
+				children++;
+				child = realloc(child,
+				    children * sizeof (nvlist_t *));
+				if (child == NULL)
+					zpool_no_memory();
+				if ((nv = make_leaf_vdev(props, argv[c],
+				    B_FALSE)) == NULL) {
+					for (c = 0; c < children - 1; c++)
+						nvlist_free(child[c]);
+					free(child);
+					goto spec_out;
+				}
+
+				child[children - 1] = nv;
+			}
+
+			if (children < mindev) {
+				(void) fprintf(stderr, gettext("invalid vdev "
+				    "specification: %s requires at least %d "
+				    "devices\n"), argv[0], mindev);
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+				goto spec_out;
+			}
+
+			if (children > maxdev) {
+				(void) fprintf(stderr, gettext("invalid vdev "
+				    "specification: %s supports no more than "
+				    "%d devices\n"), argv[0], maxdev);
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+				goto spec_out;
+			}
+
+			argc -= c;
+			argv += c;
+
+			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+				spares = child;
+				nspares = children;
+				continue;
+			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+				l2cache = child;
+				nl2cache = children;
+				continue;
+			} else {
+				/* create a top-level vdev with children */
+				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
+				    0) == 0);
+				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+				    type) == 0);
+				verify(nvlist_add_uint64(nv,
+				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+				if (is_log)
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_LOG) == 0);
+				if (is_special) {
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
+				}
+				if (is_dedup) {
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_DEDUP) == 0);
+				}
+				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+					verify(nvlist_add_uint64(nv,
+					    ZPOOL_CONFIG_NPARITY,
+					    mindev - 1) == 0);
+				}
+				verify(nvlist_add_nvlist_array(nv,
+				    ZPOOL_CONFIG_CHILDREN, child,
+				    children) == 0);
+
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+			}
+		} else {
+			/*
+			 * We have a device.  Pass off to make_leaf_vdev() to
+			 * construct the appropriate nvlist describing the vdev.
+			 */
+			if ((nv = make_leaf_vdev(props, argv[0],
+			    is_log)) == NULL)
+				goto spec_out;
+
+			if (is_log)
+				nlogs++;
+			if (is_special) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_ALLOCATION_BIAS,
+				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
+			}
+			if (is_dedup) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_ALLOCATION_BIAS,
+				    VDEV_ALLOC_BIAS_DEDUP) == 0);
+			}
+			argc--;
+			argv++;
+		}
+
+		toplevels++;
+		top = realloc(top, toplevels * sizeof (nvlist_t *));
+		if (top == NULL)
+			zpool_no_memory();
+		top[toplevels - 1] = nv;
+	}
+
+	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev "
+		    "specification: at least one toplevel vdev must be "
+		    "specified\n"));
+		goto spec_out;
+	}
+
+	if (seen_logs && nlogs == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev specification: "
+		    "log requires at least 1 device\n"));
+		goto spec_out;
+	}
+
+	/*
+	 * Finally, create nvroot and add all top-level vdevs to it.
+	 */
+	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    top, toplevels) == 0);
+	if (nspares != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    spares, nspares) == 0);
+	if (nl2cache != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    l2cache, nl2cache) == 0);
+
+spec_out:
+	for (t = 0; t < toplevels; t++)
+		nvlist_free(top[t]);
+	for (t = 0; t < nspares; t++)
+		nvlist_free(spares[t]);
+	for (t = 0; t < nl2cache; t++)
+		nvlist_free(l2cache[t]);
+
+	free(spares);
+	free(l2cache);
+	free(top);
+
+	return (nvroot);
+}
+
+/*
+ * Build an optional spec from argv (rejecting children whose path is a
+ * grouping keyword) and hand it to zpool_vdev_split().  NULL on failure.
+ */
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+    splitflags_t flags, int argc, char **argv)
+{
+	nvlist_t *newroot = NULL, **kids;
+	uint_t i, nkids;
+
+	if (argc > 0) {
+		newroot = construct_spec(props, argc, argv);
+		if (newroot == NULL) {
+			(void) fprintf(stderr, gettext("Unable to build a "
+			    "pool from the specified devices\n"));
+			return (NULL);
+		}
+		if (!flags.dryrun && make_disks(zhp, newroot) != 0)
+			goto fail;
+
+		/* avoid any tricks in the spec */
+		verify(nvlist_lookup_nvlist_array(newroot,
+		    ZPOOL_CONFIG_CHILDREN, &kids, &nkids) == 0);
+		for (i = 0; i < nkids; i++) {
+			const char *kind;
+			char *path;
+			int lo, hi;
+
+			verify(nvlist_lookup_string(kids[i],
+			    ZPOOL_CONFIG_PATH, &path) == 0);
+			kind = is_grouping(path, &lo, &hi);
+			if (kind == NULL)
+				continue;
+			(void) fprintf(stderr, gettext("Cannot use "
+			    "'%s' as a device for splitting\n"), kind);
+			goto fail;
+		}
+	}
+	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) == 0)
+		return (newroot);
+fail:
+	nvlist_free(newroot);
+	return (NULL);
+}
+
+/* Count top-level vdevs that are neither logs nor allocation-biased. */
+static int
+num_normal_vdevs(nvlist_t *nvroot)
+{
+	nvlist_t **tops;
+	uint_t i, ntops, count = 0;
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &tops, &ntops) == 0);
+
+	for (i = 0; i < ntops; i++) {
+		uint64_t is_log = B_FALSE;
+
+		/* IS_LOG may be absent; 'is_log' then stays B_FALSE. */
+		(void) nvlist_lookup_uint64(tops[i], ZPOOL_CONFIG_IS_LOG,
+		    &is_log);
+		if (!is_log &&
+		    !nvlist_exists(tops[i], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			count++;
+	}
+
+	return (count);
+}
+
+/*
+ * Get and validate the contents of the given vdev specification.  This ensures
+ * that the nvlist returned is well-formed, that all the devices exist, and that
+ * they are not currently in use by any other known consumer.  When 'zhp' is
+ * non-NULL its current pool configuration is used for additional checks, such
+ * as changes to the replication level; a NULL 'zhp' means a new pool is being
+ * created.  The 'force' flag controls whether devices should be forcefully
+ * added, even if they appear in use.  Replication is checked only when
+ * 'check_rep' is set, and disks are labeled only when 'dryrun' is clear.
+ * Returns the new root nvlist, or NULL on failure.
+ */
+nvlist_t *
+make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
+{
+	nvlist_t *newroot;
+	nvlist_t *poolconfig = NULL;
+
+	is_force = force;
+
+	/*
+	 * Construct the vdev specification.  On success we know that the
+	 * specification is valid and that all devices can be opened.
+	 */
+	newroot = construct_spec(props, argc, argv);
+	if (newroot == NULL)
+		return (NULL);
+
+	/* A pool handle whose config cannot be fetched is a failure. */
+	if (zhp != NULL) {
+		poolconfig = zpool_get_config(zhp, NULL);
+		if (poolconfig == NULL)
+			goto fail;
+	}
+
+	/*
+	 * Validate each device to make sure that it's not shared with another
+	 * subsystem.  This happens even when 'force' is set, because some
+	 * uses (such as a dedicated dump device) cannot be overridden by '-f'.
+	 */
+	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE))
+		goto fail;
+
+	/*
+	 * Check the replication level of the given vdevs and report any errors
+	 * found.  The existing pool spec, if any, is included so that changes
+	 * against the existing replication level are caught.
+	 */
+	if (check_rep && check_replication(poolconfig, newroot) != 0)
+		goto fail;
+
+	/*
+	 * On pool create the new vdev spec must have one normal vdev.
+	 */
+	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
+		vdev_error(gettext("at least one general top-level vdev must "
+		    "be specified\n"));
+		goto fail;
+	}
+
+	/*
+	 * Run through the vdev specification and label any whole disks found.
+	 */
+	if (!dryrun && make_disks(zhp, newroot) != 0)
+		goto fail;
+
+	return (newroot);
+
+fail:
+	nvlist_free(newroot);
+	return (NULL);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/.gitignore b/sys/contrib/openzfs/cmd/zstream/.gitignore
new file mode 100644
index 000000000000..fd1240d55c4b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/.gitignore
@@ -0,0 +1 @@
+zstream
diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am
new file mode 100644
index 000000000000..5e2ac5d69f1a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am
@@ -0,0 +1,15 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zstream
+
+zstream_SOURCES = \
+	zstream.c \
+	zstream.h \
+	zstream_dump.c \
+	zstream_redup.c \
+	zstream_token.c
+
+zstream_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.c b/sys/contrib/openzfs/cmd/zstream/zstream.c
new file mode 100644
index 000000000000..cbcb560a8638
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.c
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2020 by Datto Inc. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <libintl.h>
+#include <stddef.h>
+#include <libzfs.h>
+#include "zstream.h"
+
+/* Print the zstream usage summary to stderr and exit with status 1. */
+void
+zstream_usage(void)
+{
+	static const char usage_text[] =
+	    "usage: zstream command args ...\n"
+	    "Available commands are:\n" "\n"
+	    "\tzstream dump [-vCd] FILE\n"
+	    "\t... | zstream dump [-vCd]\n" "\n"
+	    "\tzstream token resume_token\n" "\n"
+	    "\tzstream redup [-v] FILE | ...\n";
+
+	(void) fputs(usage_text, stderr);
+	exit(1);
+}
+
+/* Dispatch argv[1] to the matching zstream subcommand; else print usage. */
+int
+main(int argc, char *argv[])
+{
+	if (argc < 2)
+		zstream_usage();
+
+	const char *cmd = argv[1];
+	int sub_argc = argc - 1;
+	char **sub_argv = argv + 1;
+	if (strcmp(cmd, "dump") == 0)
+		return (zstream_do_dump(sub_argc, sub_argv));
+	if (strcmp(cmd, "token") == 0)
+		return (zstream_do_token(sub_argc, sub_argv));
+	if (strcmp(cmd, "redup") == 0)
+		return (zstream_do_redup(sub_argc, sub_argv));
+	zstream_usage();
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.h b/sys/contrib/openzfs/cmd/zstream/zstream.h
new file mode 100644
index 000000000000..319fecb2876b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#ifndef	_ZSTREAM_H
+#define	_ZSTREAM_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern int zstream_do_redup(int, char *[]);
+extern int zstream_do_dump(int, char *[]);
+extern int zstream_do_token(int, char *[]);
+extern void zstream_usage(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZSTREAM_H */
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_dump.c b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c
new file mode 100644
index 000000000000..45cf7b97a147
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c
@@ -0,0 +1,799 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Portions Copyright 2012 Martin Matuska <martin@matuska.org>
+ */
+
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
+#include <zfs_fletcher.h>
+#include "zstream.h"
+
+/*
+ * If dump mode is enabled, the number of bytes to print per line
+ */
+#define	BYTES_PER_LINE	16
+/*
+ * If dump mode is enabled, the number of bytes to group together, separated
+ * by newlines or spaces
+ */
+#define	DUMP_GROUPING	4
+
+/* Running total of bytes consumed from the stream; advanced by ssread(). */
+uint64_t total_stream_len = 0;
+/* Input being dumped: the file named on the command line, or stdin. */
+FILE *send_stream = 0;
+/* Set once the first record's magic shows the stream is byte-swapped. */
+boolean_t do_byteswap = B_FALSE;
+/* Compute/verify fletcher-4 checksums while reading; cleared by -C. */
+boolean_t do_cksum = B_TRUE;
+
+/*
+ * malloc() wrapper that never returns NULL: if the allocation fails, the
+ * requested size is reported on stderr and the process is aborted.
+ */
+static void *
+safe_malloc(size_t size)
+{
+	void *mem = malloc(size);
+
+	if (mem != NULL)
+		return (mem);
+
+	(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+	    size);
+	abort();
+}
+
+/*
+ * ssread - send stream read.
+ *
+ * Read a single item of len bytes from send_stream into buf.  When
+ * checksumming is enabled, fold the bytes into *cksum, using the
+ * byteswapping fletcher-4 variant if the stream is byte-swapped.
+ * total_stream_len is advanced by len on success.
+ *
+ * Returns fread()'s item count: 1 if the whole item was read, 0 otherwise
+ * (in which case neither the checksum nor the length is updated).
+ */
+static size_t
+ssread(void *buf, size_t len, zio_cksum_t *cksum)
+{
+	size_t nitems = fread(buf, len, 1, send_stream);
+
+	if (nitems == 0)
+		return (0);
+
+	if (do_cksum && do_byteswap)
+		fletcher_4_incremental_byteswap(buf, len, cksum);
+	else if (do_cksum)
+		fletcher_4_incremental_native(buf, len, cksum);
+
+	total_stream_len += len;
+	return (nitems);
+}
+
+/*
+ * Read one replay record header from the stream into *drr.
+ *
+ * The header is read in two pieces: everything up to the trailing
+ * per-record checksum, then the checksum itself.  The running checksum as
+ * it stood between the two reads is what the stored checksum must match
+ * (a stored checksum of all zeroes is accepted without comparison).
+ *
+ * Returns sizeof (*drr) on success, or 0 on a failed read or a checksum
+ * mismatch; a mismatch is also reported on stderr/stdout.
+ */
+static size_t
+read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
+{
+	zio_cksum_t *stored = &drr->drr_u.drr_checksum.drr_checksum;
+	zio_cksum_t expected;
+
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+
+	if (ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum) == 0)
+		return (0);
+
+	/* Snapshot the running checksum before it absorbs the stored one. */
+	expected = *cksum;
+
+	if (ssread(stored, sizeof (zio_cksum_t), cksum) == 0)
+		return (0);
+
+	if (!do_cksum || ZIO_CHECKSUM_IS_ZERO(stored) ||
+	    ZIO_CHECKSUM_EQUAL(expected, *stored))
+		return (sizeof (*drr));
+
+	fprintf(stderr, "invalid checksum\n");
+	(void) printf("Incorrect checksum in record header.\n");
+	(void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
+	    (longlong_t)expected.zc_word[0],
+	    (longlong_t)expected.zc_word[1],
+	    (longlong_t)expected.zc_word[2],
+	    (longlong_t)expected.zc_word[3]);
+	return (0);
+}
+
+/*
+ * Print part of a block in ASCII characters
+ *
+ * Writes length bytes of subbuf to stdout, substituting '.' for every
+ * non-printable byte and inserting a space between groups of
+ * DUMP_GROUPING characters, followed by a newline.
+ */
+static void
+print_ascii_block(char *subbuf, int length)
+{
+	int i;
+
+	for (i = 0; i < length; i++) {
+		/*
+		 * The <ctype.h> classifiers are only defined for EOF and
+		 * values representable as unsigned char, so a byte >= 0x80
+		 * must not reach isprint() as a (possibly negative) char.
+		 */
+		unsigned char byte = (unsigned char)subbuf[i];
+		char char_print = isprint(byte) ? subbuf[i] : '.';
+
+		if (i != 0 && i % DUMP_GROUPING == 0) {
+			(void) printf(" ");
+		}
+		(void) printf("%c", char_print);
+	}
+	(void) printf("\n");
+}
+
+/*
+ * print_block - Dump the contents of a modified block to STDOUT
+ *
+ * Each output line shows up to BYTES_PER_LINE bytes as two-digit hex
+ * values, with an extra space in front of every DUMP_GROUPING-aligned
+ * byte, followed by the same bytes rendered by print_ascii_block().
+ *
+ * Assume that buf has capacity evenly divisible by BYTES_PER_LINE
+ */
+static void
+print_block(char *buf, int length)
+{
+	/*
+	 * Column at which the ASCII rendering begins: 3 characters per
+	 * byte (2 hex digits plus 1 space), one separator per grouping,
+	 * plus 2.
+	 */
+	const int ascii_col = BYTES_PER_LINE * 3 +
+	    BYTES_PER_LINE / DUMP_GROUPING + 2;
+	int line_start;
+
+	for (line_start = 0; line_start < length;
+	    line_start += BYTES_PER_LINE) {
+		int line_len = MIN(BYTES_PER_LINE, length - line_start);
+		int line_end = line_start + line_len;
+		int written = 0;
+		int pos;
+
+		for (pos = line_start; pos < line_end; pos++) {
+			/* Separate every DUMP_GROUPING bytes by a space. */
+			if (pos % DUMP_GROUPING == 0)
+				written += printf(" ");
+
+			/* Two-digit hex value of this byte. */
+			written += printf("%02x ", (unsigned char)buf[pos]);
+		}
+
+		/* Pad short lines so the ASCII column stays aligned. */
+		(void) printf("%*s", ascii_col - written, " ");
+
+		print_ascii_block(buf + line_start, line_len);
+	}
+}
+
+/*
+ * Format an array of bytes into str as lowercase hexadecimal, two digits
+ * per byte, and NUL-terminate the result.  str must have
+ * buf_len * 2 + 1 bytes of space.
+ */
+static void
+sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len)
+{
+	static const char hexdigits[] = "0123456789abcdef";
+	const uint8_t *cur = buf;
+	const uint8_t *end = buf + buf_len;
+
+	while (cur != end) {
+		*str++ = hexdigits[*cur >> 4];
+		*str++ = hexdigits[*cur & 0xf];
+		cur++;
+	}
+
+	*str = '\0';
+}
+
+/*
+ * Make sure the payload buffer can hold need bytes.
+ *
+ * Record payload sizes come straight from the (untrusted) stream, so they
+ * must never be assumed to fit the buffer that is already allocated.  If
+ * the current buffer, of *bufszp bytes, is too small it is freed and
+ * replaced (the old contents are not preserved) and *bufszp is updated.
+ * Returns the buffer to use; exits if need cannot be represented as a
+ * size_t, and aborts via safe_malloc() if the allocation fails.
+ */
+static char *
+reserve_payload_buf(char *buf, size_t *bufszp, uint64_t need)
+{
+	if (need <= *bufszp)
+		return (buf);
+
+	if ((uint64_t)(size_t)need != need) {
+		(void) fprintf(stderr, "ERROR; record payload of %llu bytes "
+		    "is too large\n", (u_longlong_t)need);
+		exit(1);
+	}
+
+	free(buf);
+	*bufszp = (size_t)need;
+	return (safe_malloc(*bufszp));
+}
+
+/*
+ * "zstream dump" subcommand: print a human-readable description of a send
+ * stream read from the file named on the command line or from stdin.
+ *
+ * Options:
+ *	-v	verbose: print one line per record (twice: also print each
+ *		record's checksum)
+ *	-C	do not compute or verify checksums
+ *	-d	dump record payloads as hex/ASCII (implies very verbose)
+ *
+ * argv[0] is the subcommand name.  Returns 0 once the stream has been
+ * consumed and the summary printed; exits with status 1 on usage errors,
+ * an unreadable input, a bad magic number or an invalid record type.
+ */
+int
+zstream_do_dump(int argc, char *argv[])
+{
+	size_t bufsz = SPA_MAXBLOCKSIZE;
+	char *buf = safe_malloc(bufsz);
+	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+	uint64_t total_payload_size = 0;
+	uint64_t total_overhead_size = 0;
+	uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 };
+	char salt[ZIO_DATA_SALT_LEN * 2 + 1];
+	char iv[ZIO_DATA_IV_LEN * 2 + 1];
+	char mac[ZIO_DATA_MAC_LEN * 2 + 1];
+	uint64_t total_records = 0;
+	uint64_t payload_size;
+	dmu_replay_record_t thedrr;
+	dmu_replay_record_t *drr = &thedrr;
+	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+	struct drr_end *drre = &thedrr.drr_u.drr_end;
+	struct drr_object *drro = &thedrr.drr_u.drr_object;
+	struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
+	struct drr_write *drrw = &thedrr.drr_u.drr_write;
+	struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
+	struct drr_free *drrf = &thedrr.drr_u.drr_free;
+	struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+	struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
+	struct drr_object_range *drror = &thedrr.drr_u.drr_object_range;
+	struct drr_redact *drrr = &thedrr.drr_u.drr_redact;
+	struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
+	int c;
+	boolean_t verbose = B_FALSE;
+	boolean_t very_verbose = B_FALSE;
+	boolean_t first = B_TRUE;
+	/*
+	 * dump flag controls whether the contents of any modified data blocks
+	 * are printed to the console during processing of the stream. Warning:
+	 * for large streams, this can obviously lead to massive prints.
+	 */
+	boolean_t dump = B_FALSE;
+	int err;
+	zio_cksum_t zc = { { 0 } };
+	zio_cksum_t pcksum = { { 0 } };
+
+	while ((c = getopt(argc, argv, ":vCd")) != -1) {
+		switch (c) {
+		case 'C':
+			do_cksum = B_FALSE;
+			break;
+		case 'v':
+			if (verbose)
+				very_verbose = B_TRUE;
+			verbose = B_TRUE;
+			break;
+		case 'd':
+			dump = B_TRUE;
+			verbose = B_TRUE;
+			very_verbose = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			zstream_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			zstream_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		const char *filename = argv[optind];
+		send_stream = fopen(filename, "r");
+		if (send_stream == NULL) {
+			(void) fprintf(stderr,
+			    "Error while opening file '%s': %s\n",
+			    filename, strerror(errno));
+			exit(1);
+		}
+	} else {
+		if (isatty(STDIN_FILENO)) {
+			(void) fprintf(stderr,
+			    "Error: The send stream is a binary format "
+			    "and can not be read from a\n"
+			    "terminal.  Standard input must be redirected, "
+			    "or a file must be\n"
+			    "specified as a command-line argument.\n");
+			exit(1);
+		}
+		send_stream = stdin;
+	}
+
+	fletcher_4_init();
+	while (read_hdr(drr, &zc)) {
+
+		/*
+		 * If this is the first DMU record being processed, check for
+		 * the magic bytes and figure out the endian-ness based on them.
+		 */
+		if (first) {
+			if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+				do_byteswap = B_TRUE;
+				if (do_cksum) {
+					ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+					/*
+					 * recalculate header checksum now
+					 * that we know it needs to be
+					 * byteswapped.
+					 */
+					fletcher_4_incremental_byteswap(drr,
+					    sizeof (dmu_replay_record_t), &zc);
+				}
+			} else if (drrb->drr_magic != DMU_BACKUP_MAGIC) {
+				(void) fprintf(stderr, "Invalid stream "
+				    "(bad magic number)\n");
+				exit(1);
+			}
+			first = B_FALSE;
+		}
+		if (do_byteswap) {
+			drr->drr_type = BSWAP_32(drr->drr_type);
+			drr->drr_payloadlen =
+			    BSWAP_32(drr->drr_payloadlen);
+		}
+
+		/*
+		 * At this point, the leading fields of the replay record
+		 * (drr_type and drr_payloadlen) have been byte-swapped if
+		 * necessary, but the rest of the data structure (the
+		 * union of type-specific structures) is still in its
+		 * original state.
+		 */
+		if (drr->drr_type >= DRR_NUMTYPES) {
+			(void) printf("INVALID record found: type 0x%x\n",
+			    drr->drr_type);
+			(void) printf("Aborting.\n");
+			exit(1);
+		}
+
+		drr_record_count[drr->drr_type]++;
+		total_overhead_size += sizeof (*drr);
+		total_records++;
+		payload_size = 0;
+
+		switch (drr->drr_type) {
+		case DRR_BEGIN:
+			if (do_byteswap) {
+				drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+				drrb->drr_versioninfo =
+				    BSWAP_64(drrb->drr_versioninfo);
+				drrb->drr_creation_time =
+				    BSWAP_64(drrb->drr_creation_time);
+				drrb->drr_type = BSWAP_32(drrb->drr_type);
+				drrb->drr_flags = BSWAP_32(drrb->drr_flags);
+				drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+				drrb->drr_fromguid =
+				    BSWAP_64(drrb->drr_fromguid);
+			}
+
+			(void) printf("BEGIN record\n");
+			(void) printf("\thdrtype = %lld\n",
+			    DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo));
+			(void) printf("\tfeatures = %llx\n",
+			    DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo));
+			(void) printf("\tmagic = %llx\n",
+			    (u_longlong_t)drrb->drr_magic);
+			(void) printf("\tcreation_time = %llx\n",
+			    (u_longlong_t)drrb->drr_creation_time);
+			(void) printf("\ttype = %u\n", drrb->drr_type);
+			(void) printf("\tflags = 0x%x\n", drrb->drr_flags);
+			(void) printf("\ttoguid = %llx\n",
+			    (u_longlong_t)drrb->drr_toguid);
+			(void) printf("\tfromguid = %llx\n",
+			    (u_longlong_t)drrb->drr_fromguid);
+			(void) printf("\ttoname = %s\n", drrb->drr_toname);
+			(void) printf("\tpayloadlen = %u\n",
+			    drr->drr_payloadlen);
+			if (verbose)
+				(void) printf("\n");
+
+			if (drr->drr_payloadlen != 0) {
+				nvlist_t *nv;
+				/*
+				 * Keep the length unsigned: a signed int
+				 * would turn a huge payloadlen negative and
+				 * slip past the buffer-size check.
+				 */
+				size_t sz = drr->drr_payloadlen;
+
+				buf = reserve_payload_buf(buf, &bufsz, sz);
+				(void) ssread(buf, sz, &zc);
+				if (ferror(send_stream))
+					perror("fread");
+				err = nvlist_unpack(buf, sz, &nv, 0);
+				if (err) {
+					/*
+					 * The failure code is the return
+					 * value, not errno, so perror()
+					 * would append an unrelated message.
+					 */
+					(void) fprintf(stderr,
+					    "nvlist_unpack: %s\n",
+					    strerror(err));
+				} else {
+					nvlist_print(stdout, nv);
+					nvlist_free(nv);
+				}
+				payload_size = sz;
+			}
+			break;
+
+		case DRR_END:
+			if (do_byteswap) {
+				drre->drr_checksum.zc_word[0] =
+				    BSWAP_64(drre->drr_checksum.zc_word[0]);
+				drre->drr_checksum.zc_word[1] =
+				    BSWAP_64(drre->drr_checksum.zc_word[1]);
+				drre->drr_checksum.zc_word[2] =
+				    BSWAP_64(drre->drr_checksum.zc_word[2]);
+				drre->drr_checksum.zc_word[3] =
+				    BSWAP_64(drre->drr_checksum.zc_word[3]);
+			}
+			/*
+			 * We compare against the *previous* checksum
+			 * value, because the stored checksum is of
+			 * everything before the DRR_END record.
+			 */
+			if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum,
+			    pcksum)) {
+				(void) printf("Expected checksum differs from "
+				    "checksum in stream.\n");
+				(void) printf("Expected checksum = "
+				    "%llx/%llx/%llx/%llx\n",
+				    (long long unsigned int)pcksum.zc_word[0],
+				    (long long unsigned int)pcksum.zc_word[1],
+				    (long long unsigned int)pcksum.zc_word[2],
+				    (long long unsigned int)pcksum.zc_word[3]);
+			}
+			(void) printf("END checksum = %llx/%llx/%llx/%llx\n",
+			    (long long unsigned int)
+			    drre->drr_checksum.zc_word[0],
+			    (long long unsigned int)
+			    drre->drr_checksum.zc_word[1],
+			    (long long unsigned int)
+			    drre->drr_checksum.zc_word[2],
+			    (long long unsigned int)
+			    drre->drr_checksum.zc_word[3]);
+
+			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+			break;
+
+		case DRR_OBJECT:
+			if (do_byteswap) {
+				drro->drr_object = BSWAP_64(drro->drr_object);
+				drro->drr_type = BSWAP_32(drro->drr_type);
+				drro->drr_bonustype =
+				    BSWAP_32(drro->drr_bonustype);
+				drro->drr_blksz = BSWAP_32(drro->drr_blksz);
+				drro->drr_bonuslen =
+				    BSWAP_32(drro->drr_bonuslen);
+				drro->drr_raw_bonuslen =
+				    BSWAP_32(drro->drr_raw_bonuslen);
+				drro->drr_toguid = BSWAP_64(drro->drr_toguid);
+				drro->drr_maxblkid =
+				    BSWAP_64(drro->drr_maxblkid);
+			}
+
+			payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+
+			if (verbose) {
+				(void) printf("OBJECT object = %llu type = %u "
+				    "bonustype = %u blksz = %u bonuslen = %u "
+				    "dn_slots = %u raw_bonuslen = %u "
+				    "flags = %u maxblkid = %llu "
+				    "indblkshift = %u nlevels = %u "
+				    "nblkptr = %u\n",
+				    (u_longlong_t)drro->drr_object,
+				    drro->drr_type,
+				    drro->drr_bonustype,
+				    drro->drr_blksz,
+				    drro->drr_bonuslen,
+				    drro->drr_dn_slots,
+				    drro->drr_raw_bonuslen,
+				    drro->drr_flags,
+				    (u_longlong_t)drro->drr_maxblkid,
+				    drro->drr_indblkshift,
+				    drro->drr_nlevels,
+				    drro->drr_nblkptr);
+			}
+			if (drro->drr_bonuslen > 0) {
+				buf = reserve_payload_buf(buf, &bufsz,
+				    payload_size);
+				(void) ssread(buf, payload_size, &zc);
+				if (dump)
+					print_block(buf, payload_size);
+			}
+			break;
+
+		case DRR_FREEOBJECTS:
+			if (do_byteswap) {
+				drrfo->drr_firstobj =
+				    BSWAP_64(drrfo->drr_firstobj);
+				drrfo->drr_numobjs =
+				    BSWAP_64(drrfo->drr_numobjs);
+				drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid);
+			}
+			if (verbose) {
+				(void) printf("FREEOBJECTS firstobj = %llu "
+				    "numobjs = %llu\n",
+				    (u_longlong_t)drrfo->drr_firstobj,
+				    (u_longlong_t)drrfo->drr_numobjs);
+			}
+			break;
+
+		case DRR_WRITE:
+			if (do_byteswap) {
+				drrw->drr_object = BSWAP_64(drrw->drr_object);
+				drrw->drr_type = BSWAP_32(drrw->drr_type);
+				drrw->drr_offset = BSWAP_64(drrw->drr_offset);
+				drrw->drr_logical_size =
+				    BSWAP_64(drrw->drr_logical_size);
+				drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
+				drrw->drr_key.ddk_prop =
+				    BSWAP_64(drrw->drr_key.ddk_prop);
+				drrw->drr_compressed_size =
+				    BSWAP_64(drrw->drr_compressed_size);
+			}
+
+			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
+			/*
+			 * If this is verbose and/or dump output,
+			 * print info on the modified block
+			 */
+			if (verbose) {
+				sprintf_bytes(salt, drrw->drr_salt,
+				    ZIO_DATA_SALT_LEN);
+				sprintf_bytes(iv, drrw->drr_iv,
+				    ZIO_DATA_IV_LEN);
+				sprintf_bytes(mac, drrw->drr_mac,
+				    ZIO_DATA_MAC_LEN);
+
+				(void) printf("WRITE object = %llu type = %u "
+				    "checksum type = %u compression type = %u "
+				    "flags = %u offset = %llu "
+				    "logical_size = %llu "
+				    "compressed_size = %llu "
+				    "payload_size = %llu props = %llx "
+				    "salt = %s iv = %s mac = %s\n",
+				    (u_longlong_t)drrw->drr_object,
+				    drrw->drr_type,
+				    drrw->drr_checksumtype,
+				    drrw->drr_compressiontype,
+				    drrw->drr_flags,
+				    (u_longlong_t)drrw->drr_offset,
+				    (u_longlong_t)drrw->drr_logical_size,
+				    (u_longlong_t)drrw->drr_compressed_size,
+				    (u_longlong_t)payload_size,
+				    (u_longlong_t)drrw->drr_key.ddk_prop,
+				    salt,
+				    iv,
+				    mac);
+			}
+
+			/*
+			 * Read the contents of the block in from STDIN to buf
+			 */
+			buf = reserve_payload_buf(buf, &bufsz, payload_size);
+			(void) ssread(buf, payload_size, &zc);
+			/*
+			 * If in dump mode
+			 */
+			if (dump) {
+				print_block(buf, payload_size);
+			}
+			break;
+
+		case DRR_WRITE_BYREF:
+			if (do_byteswap) {
+				drrwbr->drr_object =
+				    BSWAP_64(drrwbr->drr_object);
+				drrwbr->drr_offset =
+				    BSWAP_64(drrwbr->drr_offset);
+				drrwbr->drr_length =
+				    BSWAP_64(drrwbr->drr_length);
+				drrwbr->drr_toguid =
+				    BSWAP_64(drrwbr->drr_toguid);
+				drrwbr->drr_refguid =
+				    BSWAP_64(drrwbr->drr_refguid);
+				drrwbr->drr_refobject =
+				    BSWAP_64(drrwbr->drr_refobject);
+				drrwbr->drr_refoffset =
+				    BSWAP_64(drrwbr->drr_refoffset);
+				drrwbr->drr_key.ddk_prop =
+				    BSWAP_64(drrwbr->drr_key.ddk_prop);
+			}
+			if (verbose) {
+				(void) printf("WRITE_BYREF object = %llu "
+				    "checksum type = %u props = %llx "
+				    "offset = %llu length = %llu "
+				    "toguid = %llx refguid = %llx "
+				    "refobject = %llu refoffset = %llu\n",
+				    (u_longlong_t)drrwbr->drr_object,
+				    drrwbr->drr_checksumtype,
+				    (u_longlong_t)drrwbr->drr_key.ddk_prop,
+				    (u_longlong_t)drrwbr->drr_offset,
+				    (u_longlong_t)drrwbr->drr_length,
+				    (u_longlong_t)drrwbr->drr_toguid,
+				    (u_longlong_t)drrwbr->drr_refguid,
+				    (u_longlong_t)drrwbr->drr_refobject,
+				    (u_longlong_t)drrwbr->drr_refoffset);
+			}
+			break;
+
+		case DRR_FREE:
+			if (do_byteswap) {
+				drrf->drr_object = BSWAP_64(drrf->drr_object);
+				drrf->drr_offset = BSWAP_64(drrf->drr_offset);
+				drrf->drr_length = BSWAP_64(drrf->drr_length);
+			}
+			if (verbose) {
+				(void) printf("FREE object = %llu "
+				    "offset = %llu length = %lld\n",
+				    (u_longlong_t)drrf->drr_object,
+				    (u_longlong_t)drrf->drr_offset,
+				    (longlong_t)drrf->drr_length);
+			}
+			break;
+		case DRR_SPILL:
+			if (do_byteswap) {
+				drrs->drr_object = BSWAP_64(drrs->drr_object);
+				drrs->drr_length = BSWAP_64(drrs->drr_length);
+				drrs->drr_compressed_size =
+				    BSWAP_64(drrs->drr_compressed_size);
+				drrs->drr_type = BSWAP_32(drrs->drr_type);
+			}
+
+			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+
+			if (verbose) {
+				sprintf_bytes(salt, drrs->drr_salt,
+				    ZIO_DATA_SALT_LEN);
+				sprintf_bytes(iv, drrs->drr_iv,
+				    ZIO_DATA_IV_LEN);
+				sprintf_bytes(mac, drrs->drr_mac,
+				    ZIO_DATA_MAC_LEN);
+
+				(void) printf("SPILL block for object = %llu "
+				    "length = %llu flags = %u "
+				    "compression type = %u "
+				    "compressed_size = %llu "
+				    "payload_size = %llu "
+				    "salt = %s iv = %s mac = %s\n",
+				    (u_longlong_t)drrs->drr_object,
+				    (u_longlong_t)drrs->drr_length,
+				    drrs->drr_flags,
+				    drrs->drr_compressiontype,
+				    (u_longlong_t)drrs->drr_compressed_size,
+				    (u_longlong_t)payload_size,
+				    salt,
+				    iv,
+				    mac);
+			}
+			buf = reserve_payload_buf(buf, &bufsz, payload_size);
+			(void) ssread(buf, payload_size, &zc);
+			if (dump) {
+				print_block(buf, payload_size);
+			}
+			break;
+		case DRR_WRITE_EMBEDDED:
+			if (do_byteswap) {
+				drrwe->drr_object =
+				    BSWAP_64(drrwe->drr_object);
+				drrwe->drr_offset =
+				    BSWAP_64(drrwe->drr_offset);
+				drrwe->drr_length =
+				    BSWAP_64(drrwe->drr_length);
+				drrwe->drr_toguid =
+				    BSWAP_64(drrwe->drr_toguid);
+				drrwe->drr_lsize =
+				    BSWAP_32(drrwe->drr_lsize);
+				drrwe->drr_psize =
+				    BSWAP_32(drrwe->drr_psize);
+			}
+			if (verbose) {
+				(void) printf("WRITE_EMBEDDED object = %llu "
+				    "offset = %llu length = %llu "
+				    "toguid = %llx comp = %u etype = %u "
+				    "lsize = %u psize = %u\n",
+				    (u_longlong_t)drrwe->drr_object,
+				    (u_longlong_t)drrwe->drr_offset,
+				    (u_longlong_t)drrwe->drr_length,
+				    (u_longlong_t)drrwe->drr_toguid,
+				    drrwe->drr_compression,
+				    drrwe->drr_etype,
+				    drrwe->drr_lsize,
+				    drrwe->drr_psize);
+			}
+			/* The embedded payload is padded to 8 bytes. */
+			payload_size = P2ROUNDUP(drrwe->drr_psize, 8);
+			buf = reserve_payload_buf(buf, &bufsz, payload_size);
+			(void) ssread(buf, payload_size, &zc);
+			if (dump) {
+				print_block(buf, payload_size);
+			}
+			break;
+		case DRR_OBJECT_RANGE:
+			if (do_byteswap) {
+				drror->drr_firstobj =
+				    BSWAP_64(drror->drr_firstobj);
+				drror->drr_numslots =
+				    BSWAP_64(drror->drr_numslots);
+				drror->drr_toguid = BSWAP_64(drror->drr_toguid);
+			}
+			if (verbose) {
+				sprintf_bytes(salt, drror->drr_salt,
+				    ZIO_DATA_SALT_LEN);
+				sprintf_bytes(iv, drror->drr_iv,
+				    ZIO_DATA_IV_LEN);
+				sprintf_bytes(mac, drror->drr_mac,
+				    ZIO_DATA_MAC_LEN);
+
+				(void) printf("OBJECT_RANGE firstobj = %llu "
+				    "numslots = %llu flags = %u "
+				    "salt = %s iv = %s mac = %s\n",
+				    (u_longlong_t)drror->drr_firstobj,
+				    (u_longlong_t)drror->drr_numslots,
+				    drror->drr_flags,
+				    salt,
+				    iv,
+				    mac);
+			}
+			break;
+		case DRR_REDACT:
+			if (do_byteswap) {
+				drrr->drr_object = BSWAP_64(drrr->drr_object);
+				drrr->drr_offset = BSWAP_64(drrr->drr_offset);
+				drrr->drr_length = BSWAP_64(drrr->drr_length);
+				drrr->drr_toguid = BSWAP_64(drrr->drr_toguid);
+			}
+			if (verbose) {
+				(void) printf("REDACT object = %llu offset = "
+				    "%llu length = %llu\n",
+				    (u_longlong_t)drrr->drr_object,
+				    (u_longlong_t)drrr->drr_offset,
+				    (u_longlong_t)drrr->drr_length);
+			}
+			break;
+		case DRR_NUMTYPES:
+			/* should never be reached */
+			exit(1);
+		}
+		if (drr->drr_type != DRR_BEGIN && very_verbose) {
+			(void) printf("    checksum = %llx/%llx/%llx/%llx\n",
+			    (longlong_t)drrc->drr_checksum.zc_word[0],
+			    (longlong_t)drrc->drr_checksum.zc_word[1],
+			    (longlong_t)drrc->drr_checksum.zc_word[2],
+			    (longlong_t)drrc->drr_checksum.zc_word[3]);
+		}
+		/* Remember the checksum so far for a following END record. */
+		pcksum = zc;
+		drr_byte_count[drr->drr_type] += payload_size;
+		total_payload_size += payload_size;
+	}
+	free(buf);
+	fletcher_4_fini();
+
+	/* Print final summary */
+
+	(void) printf("SUMMARY:\n");
+	(void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_BEGIN],
+	    (u_longlong_t)drr_byte_count[DRR_BEGIN]);
+	(void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_END],
+	    (u_longlong_t)drr_byte_count[DRR_END]);
+	(void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_OBJECT],
+	    (u_longlong_t)drr_byte_count[DRR_OBJECT]);
+	(void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_FREEOBJECTS],
+	    (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]);
+	(void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE]);
+	(void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_WRITE_BYREF],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]);
+	(void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld (%llu "
+	    "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED],
+	    (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]);
+	(void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_FREE],
+	    (u_longlong_t)drr_byte_count[DRR_FREE]);
+	(void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n",
+	    (u_longlong_t)drr_record_count[DRR_SPILL],
+	    (u_longlong_t)drr_byte_count[DRR_SPILL]);
+	(void) printf("\tTotal records = %lld\n",
+	    (u_longlong_t)total_records);
+	(void) printf("\tTotal payload size = %lld (0x%llx)\n",
+	    (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size);
+	(void) printf("\tTotal header overhead = %lld (0x%llx)\n",
+	    (u_longlong_t)total_overhead_size,
+	    (u_longlong_t)total_overhead_size);
+	(void) printf("\tTotal stream length = %lld (0x%llx)\n",
+	    (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c
new file mode 100644
index 000000000000..379025ce59e5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c
@@ -0,0 +1,469 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include <assert.h>
+#include <cityhash.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libzfs_impl.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <umem.h>
+#include <unistd.h>
+#include <sys/debug.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zio_checksum.h>
+#include "zfs_fletcher.h"
+#include "zstream.h"
+
+
+#define	MAX_RDT_PHYSMEM_PERCENT		20
+#define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
+
+/*
+ * One hash-table entry, inserted by rdt_insert() for each WRITE record.
+ * (rde_guid, rde_object, rde_offset) is the lookup key; rde_stream_offset
+ * is the position in the input from which a matching WRITE_BYREF re-reads
+ * the record.
+ */
+typedef struct redup_entry {
+	struct redup_entry	*rde_next;	/* next entry in this bucket */
+	uint64_t rde_guid;		/* key: guid */
+	uint64_t rde_object;		/* key: object number */
+	uint64_t rde_offset;		/* key: offset */
+	uint64_t rde_stream_offset;	/* value: offset in the input stream */
+} redup_entry_t;
+
+/*
+ * Chained hash table of redup_entry_t, indexed by the low numhashbits bits
+ * of cityhash4(guid, object, offset, 0).
+ */
+typedef struct redup_table {
+	redup_entry_t	**redup_hash_array;	/* bucket heads */
+	umem_cache_t	*ddecache;	/* allocator for redup_entry_t */
+	uint64_t	ddt_count;	/* number of entries inserted */
+	int		numhashbits;	/* hash bits used as bucket index */
+} redup_table_t;
+
+/*
+ * Return the 1-based position of the most significant set bit of i
+ * (so highbit64(1) == 1 and highbit64(1ULL << 63) == 64), or 0 if no bit
+ * is set.
+ */
+int
+highbit64(uint64_t i)
+{
+	const int width = NBBY * sizeof (uint64_t);
+
+	/* __builtin_clzll() is undefined for 0, hence the explicit case. */
+	return (i != 0 ? width - __builtin_clzll(i) : 0);
+}
+
+/*
+ * Allocate n zero-filled bytes.  Never returns NULL: if the allocation
+ * fails, the requested size is reported on stderr and the process exits
+ * with status 1.
+ */
+static void *
+safe_calloc(size_t n)
+{
+	void *rv = calloc(1, n);
+	if (rv == NULL) {
+		/*
+		 * Print the size_t with %zu; narrowing it to int would
+		 * misreport exactly the large requests that tend to fail.
+		 */
+		(void) fprintf(stderr,
+		    "Error: could not allocate %zu bytes of memory\n", n);
+		exit(1);
+	}
+	return (rv);
+}
+
+/*
+ * Safe version of fread(), exits on error.
+ *
+ * Reads a single item of size bytes from fp into buf.  Returns 1 if the
+ * item was read, or 0 if it could not be read without the stream's error
+ * indicator being set (e.g. at end of file).
+ */
+static int
+sfread(void *buf, size_t size, FILE *fp)
+{
+	int nitems = fread(buf, size, 1, fp);
+
+	if (nitems != 0 || !ferror(fp))
+		return (nitems);
+
+	(void) fprintf(stderr, "Error while reading file: %s\n",
+	    strerror(errno));
+	exit(1);
+}
+
+/*
+ * Safe version of pread(), exits on error.
+ *
+ * Reads exactly count bytes from fd at the given offset into buf.  Both a
+ * pread() failure and a short read are treated as fatal: a message is
+ * written to stderr and the process exits with status 1.
+ */
+static void
+spread(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t nread = pread(fd, buf, count, offset);
+
+	if (nread == -1) {
+		(void) fprintf(stderr,
+		    "Error while reading file: %s\n",
+		    strerror(errno));
+	} else if (nread == count) {
+		return;
+	} else {
+		(void) fprintf(stderr,
+		    "Error while reading file: short read\n");
+	}
+	exit(1);
+}
+
+/*
+ * Write one replay record, plus its optional payload, to outfd while
+ * folding everything written into the running stream checksum *zc.
+ *
+ * For every record type except DRR_BEGIN the header's trailing checksum
+ * field must be zero on entry; it is filled in with the running checksum
+ * of everything that precedes it.  Returns 0 on success, or errno if a
+ * write() call fails.
+ */
+static int
+dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
+    zio_cksum_t *zc, int outfd)
+{
+	const size_t cksum_off =
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum);
+	zio_cksum_t *hdr_cksum = &drr->drr_u.drr_checksum.drr_checksum;
+
+	assert(cksum_off ==
+	    sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+
+	/* Checksum the header up to, but not including, its checksum. */
+	fletcher_4_incremental_native(drr, cksum_off, zc);
+	if (drr->drr_type != DRR_BEGIN) {
+		assert(ZIO_CHECKSUM_IS_ZERO(hdr_cksum));
+		*hdr_cksum = *zc;
+	}
+	/* The stored checksum is itself part of the checksummed stream. */
+	fletcher_4_incremental_native(hdr_cksum, sizeof (zio_cksum_t), zc);
+
+	if (write(outfd, drr, sizeof (*drr)) == -1)
+		return (errno);
+
+	if (payload_len == 0)
+		return (0);
+
+	fletcher_4_incremental_native(payload, payload_len, zc);
+	if (write(outfd, payload, payload_len) == -1)
+		return (errno);
+
+	return (0);
+}
+
+/*
+ * Record in the redup table that the record keyed by
+ * (guid, object, offset) can be found at stream_offset.  The new entry is
+ * pushed onto the front of its hash bucket; no check for an existing
+ * entry with the same key is made.
+ */
+static void
+rdt_insert(redup_table_t *rdt,
+    uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
+{
+	uint64_t hash = cityhash4(guid, object, offset, 0);
+	uint64_t bucket = BF64_GET(hash, 0, rdt->numhashbits);
+	redup_entry_t **headp = &rdt->redup_hash_array[bucket];
+	redup_entry_t *entry = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
+
+	entry->rde_guid = guid;
+	entry->rde_object = object;
+	entry->rde_offset = offset;
+	entry->rde_stream_offset = stream_offset;
+
+	/* Link in at the head of the bucket's chain. */
+	entry->rde_next = *headp;
+	*headp = entry;
+
+	rdt->ddt_count++;
+}
+
+/*
+ * Find the redup table entry keyed by (guid, object, offset) and store its
+ * stream offset in *stream_offsetp.  The entry is expected to exist: if it
+ * does not, an assertion fires (and *stream_offsetp is left untouched when
+ * assertions are compiled out).
+ */
+static void
+rdt_lookup(redup_table_t *rdt,
+    uint64_t guid, uint64_t object, uint64_t offset,
+    uint64_t *stream_offsetp)
+{
+	uint64_t hash = cityhash4(guid, object, offset, 0);
+	uint64_t bucket = BF64_GET(hash, 0, rdt->numhashbits);
+	redup_entry_t *entry = rdt->redup_hash_array[bucket];
+
+	/* Walk the bucket's chain looking for an exact key match. */
+	while (entry != NULL) {
+		if (entry->rde_guid == guid &&
+		    entry->rde_object == object &&
+		    entry->rde_offset == offset) {
+			*stream_offsetp = entry->rde_stream_offset;
+			return;
+		}
+		entry = entry->rde_next;
+	}
+	assert(!"could not find expected redup table entry");
+}
+
+/*
+ * Convert a dedup stream (generated by "zfs send -D") to a
+ * non-deduplicated stream.  The entire infd will be converted, including
+ * any substreams in a stream package (generated by "zfs send -RD"). The
+ * infd must be seekable.
+ */
+static void
+zfs_redup_stream(int infd, int outfd, boolean_t verbose)
+{
+	int bufsz = SPA_MAXBLOCKSIZE;
+	dmu_replay_record_t thedrr = { 0 };
+	dmu_replay_record_t *drr = &thedrr;
+	redup_table_t rdt;
+	zio_cksum_t stream_cksum;
+	uint64_t numbuckets;
+	uint64_t num_records = 0;
+	uint64_t num_write_byref_records = 0;
+
+#ifdef _ILP32
+	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
+#else
+	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
+	uint64_t max_rde_size =
+	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
+	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
+#endif
+
+	numbuckets = max_rde_size / (sizeof (redup_entry_t));
+
+	/*
+	 * numbuckets must be a power of 2.  Increase number to
+	 * a power of 2 if necessary.
+	 */
+	if (!ISP2(numbuckets))
+		numbuckets = 1ULL << highbit64(numbuckets);
+
+	rdt.redup_hash_array =
+	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
+	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
+	    NULL, NULL, NULL, NULL, NULL, 0);
+	rdt.numhashbits = highbit64(numbuckets) - 1;
+	rdt.ddt_count = 0;
+
+	char *buf = safe_calloc(bufsz);
+	FILE *ofp = fdopen(infd, "r");
+	long offset = ftell(ofp);
+	while (sfread(drr, sizeof (*drr), ofp) != 0) {
+		num_records++;
+
+		/*
+		 * We need to regenerate the checksum.
+		 */
+		if (drr->drr_type != DRR_BEGIN) {
+			bzero(&drr->drr_u.drr_checksum.drr_checksum,
+			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
+		}
+
+		uint64_t payload_size = 0;
+		switch (drr->drr_type) {
+		case DRR_BEGIN:
+		{
+			struct drr_begin *drrb = &drr->drr_u.drr_begin;
+			int fflags;
+			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+
+			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
+
+			/* clear the DEDUP feature flag for this stream */
+			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
+			    DMU_BACKUP_FEATURE_DEDUPPROPS);
+			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
+
+			int sz = drr->drr_payloadlen;
+			if (sz != 0) {
+				if (sz > bufsz) {
+					free(buf);
+					buf = safe_calloc(sz);
+					bufsz = sz;
+				}
+				(void) sfread(buf, sz, ofp);
+			}
+			payload_size = sz;
+			break;
+		}
+
+		case DRR_END:
+		{
+			struct drr_end *drre = &drr->drr_u.drr_end;
+			/*
+			 * Use the recalculated checksum, unless this is
+			 * the END record of a stream package, which has
+			 * no checksum.
+			 */
+			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
+				drre->drr_checksum = stream_cksum;
+			break;
+		}
+
+		case DRR_OBJECT:
+		{
+			struct drr_object *drro = &drr->drr_u.drr_object;
+
+			if (drro->drr_bonuslen > 0) {
+				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+				(void) sfread(buf, payload_size, ofp);
+			}
+			break;
+		}
+
+		case DRR_SPILL:
+		{
+			struct drr_spill *drrs = &drr->drr_u.drr_spill;
+			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+			(void) sfread(buf, payload_size, ofp);
+			break;
+		}
+
+		case DRR_WRITE_BYREF:
+		{
+			struct drr_write_byref drrwb =
+			    drr->drr_u.drr_write_byref;
+
+			num_write_byref_records++;
+
+			/*
+			 * Look up in hash table by drrwb->drr_refguid,
+			 * drr_refobject, drr_refoffset.  Replace this
+			 * record with the found WRITE record, but with
+			 * drr_object,drr_offset,drr_toguid replaced with ours.
+			 */
+			uint64_t stream_offset = 0;
+			rdt_lookup(&rdt, drrwb.drr_refguid,
+			    drrwb.drr_refobject, drrwb.drr_refoffset,
+			    &stream_offset);
+
+			spread(infd, drr, sizeof (*drr), stream_offset);
+
+			assert(drr->drr_type == DRR_WRITE);
+			struct drr_write *drrw = &drr->drr_u.drr_write;
+			assert(drrw->drr_toguid == drrwb.drr_refguid);
+			assert(drrw->drr_object == drrwb.drr_refobject);
+			assert(drrw->drr_offset == drrwb.drr_refoffset);
+
+			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+			spread(infd, buf, payload_size,
+			    stream_offset + sizeof (*drr));
+
+			drrw->drr_toguid = drrwb.drr_toguid;
+			drrw->drr_object = drrwb.drr_object;
+			drrw->drr_offset = drrwb.drr_offset;
+			break;
+		}
+
+		case DRR_WRITE:
+		{
+			struct drr_write *drrw = &drr->drr_u.drr_write;
+			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+			(void) sfread(buf, payload_size, ofp);
+
+			rdt_insert(&rdt, drrw->drr_toguid,
+			    drrw->drr_object, drrw->drr_offset, offset);
+			break;
+		}
+
+		case DRR_WRITE_EMBEDDED:
+		{
+			struct drr_write_embedded *drrwe =
+			    &drr->drr_u.drr_write_embedded;
+			payload_size =
+			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
+			(void) sfread(buf, payload_size, ofp);
+			break;
+		}
+
+		case DRR_FREEOBJECTS:
+		case DRR_FREE:
+		case DRR_OBJECT_RANGE:
+			break;
+
+		default:
+			(void) fprintf(stderr, "INVALID record type 0x%x\n",
+			    drr->drr_type);
+			/* should never happen, so assert */
+			assert(B_FALSE);
+		}
+
+		if (feof(ofp)) {
+			fprintf(stderr, "Error: unexpected end-of-file\n");
+			exit(1);
+		}
+		if (ferror(ofp)) {
+			fprintf(stderr, "Error while reading file: %s\n",
+			    strerror(errno));
+			exit(1);
+		}
+
+		/*
+		 * We need to recalculate the checksum, and it needs to be
+		 * initially zero to do that.  BEGIN records don't have
+		 * a checksum.
+		 */
+		if (drr->drr_type != DRR_BEGIN) {
+			bzero(&drr->drr_u.drr_checksum.drr_checksum,
+			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
+		}
+		if (dump_record(drr, buf, payload_size,
+		    &stream_cksum, outfd) != 0)
+			break;
+		if (drr->drr_type == DRR_END) {
+			/*
+			 * Typically the END record is either the last
+			 * thing in the stream, or it is followed
+			 * by a BEGIN record (which also zeros the checksum).
+			 * However, a stream package ends with two END
+			 * records.  The last END record's checksum starts
+			 * from zero.
+			 */
+			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+		}
+		offset = ftell(ofp);
+	}
+
+	if (verbose) {
+		char mem_str[16];
+		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
+		    mem_str, sizeof (mem_str));
+		fprintf(stderr, "converted stream with %llu total records, "
+		    "including %llu dedup records, using %sB memory.\n",
+		    (long long)num_records,
+		    (long long)num_write_byref_records,
+		    mem_str);
+	}
+
+	umem_cache_destroy(rdt.ddecache);
+	free(rdt.redup_hash_array);
+	free(buf);
+	(void) fclose(ofp);
+}
+
+int
+zstream_do_redup(int argc, char *argv[])
+{
+	boolean_t verbose = B_FALSE;
+	int c;	/* getopt() returns int; an unsigned char never equals -1 */
+
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			zstream_usage();
+			break;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 1)		/* exactly one operand: the stream file */
+		zstream_usage();
+
+	const char *filename = argv[0];
+
+	if (isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    "Error: Stream can not be written to a terminal.\n"
+		    "You must redirect standard output.\n");
+		return (1);
+	}
+
+	int fd = open(filename, O_RDONLY);
+	if (fd == -1) {
+		(void) fprintf(stderr,
+		    "Error while opening file '%s': %s\n",
+		    filename, strerror(errno));
+		exit(1);
+	}
+
+	fletcher_4_init();
+	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
+	fletcher_4_fini();
+
+	close(fd);
+
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_token.c b/sys/contrib/openzfs/cmd/zstream/zstream_token.c
new file mode 100644
index 000000000000..36a76a4bb851
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_token.c
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Portions Copyright 2012 Martin Matuska <martin@matuska.org>
+ */
+
+/*
+ * Copyright (c) 2020 by Datto Inc. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+
+#include <libzfs.h>
+#include <libzfs_core.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include "zstream.h"
+
+int
+zstream_do_token(int argc, char *argv[])
+{
+	if (argc < 2) {
+		(void) fprintf(stderr, "Need to pass the resume token\n");
+		zstream_usage();
+	}
+
+	/* libzfs_init() can return NULL; hdl is passed to every call below. */
+	libzfs_handle_t *hdl = libzfs_init();
+	if (hdl == NULL) {
+		(void) fprintf(stderr, "Unable to initialize libzfs\n");
+		return (1);
+	}
+
+	/* argv[1] is the resume token string to decode. */
+	nvlist_t *resume_nvl = zfs_send_resume_token_to_nvlist(hdl, argv[1]);
+
+	if (resume_nvl == NULL) {
+		(void) fprintf(stderr, "Unable to parse resume token: %s\n",
+		    libzfs_error_description(hdl));
+		libzfs_fini(hdl);
+		return (1);
+	}
+
+	dump_nvlist(resume_nvl, 5);
+	nvlist_free(resume_nvl);
+
+	libzfs_fini(hdl);
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am
new file mode 100644
index 000000000000..2c04d8513150
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = zstreamdump
diff --git a/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump
new file mode 100755
index 000000000000..fbf02ee687f6
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+zstream dump "$@"
diff --git a/sys/contrib/openzfs/cmd/ztest/.gitignore b/sys/contrib/openzfs/cmd/ztest/.gitignore
new file mode 100644
index 000000000000..d3d498dae693
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/.gitignore
@@ -0,0 +1 @@
+/ztest
diff --git a/sys/contrib/openzfs/cmd/ztest/Makefile.am b/sys/contrib/openzfs/cmd/ztest/Makefile.am
new file mode 100644
index 000000000000..6042b44d1dde
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/Makefile.am
@@ -0,0 +1,23 @@
+include $(top_srcdir)/config/Rules.am
+
+# Get rid of compiler warning for unchecked truncating snprintfs on gcc 7.1.1
+AM_CFLAGS += $(NO_FORMAT_TRUNCATION)
+
+# Includes kernel code, generate warnings for large stack frames
+AM_CFLAGS += $(FRAME_LARGER_THAN)
+
+# Unconditionally enable ASSERTs
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = ztest
+
+ztest_SOURCES = \
+	ztest.c
+
+ztest_LDADD = \
+	$(abs_top_builddir)/lib/libzpool/libzpool.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+ztest_LDADD += -lm
+ztest_LDFLAGS = -pthread
diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c
new file mode 100644
index 000000000000..31205a5bf8cf
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/ztest.c
@@ -0,0 +1,7818 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * The objective of this program is to provide a DMU/ZAP/SPA stress test
+ * that runs entirely in userland, is easy to use, and easy to extend.
+ *
+ * The overall design of the ztest program is as follows:
+ *
+ * (1) For each major functional area (e.g. adding vdevs to a pool,
+ *     creating and destroying datasets, reading and writing objects, etc)
+ *     we have a simple routine to test that functionality.  These
+ *     individual routines do not have to do anything "stressful".
+ *
+ * (2) We turn these simple functionality tests into a stress test by
+ *     running them all in parallel, with as many threads as desired,
+ *     and spread across as many datasets, objects, and vdevs as desired.
+ *
+ * (3) While all this is happening, we inject faults into the pool to
+ *     verify that self-healing data really works.
+ *
+ * (4) Every time we open a dataset, we change its checksum and compression
+ *     functions.  Thus even individual objects vary from block to block
+ *     in which checksum they use and whether they're compressed.
+ *
+ * (5) To verify that we never lose on-disk consistency after a crash,
+ *     we run the entire test in a child of the main process.
+ *     At random times, the child self-immolates with a SIGKILL.
+ *     This is the software equivalent of pulling the power cord.
+ *     The parent then runs the test again, using the existing
+ *     storage pool, as many times as desired. If backwards compatibility
+ *     testing is enabled ztest will sometimes run the "older" version
+ *     of ztest after a SIGKILL.
+ *
+ * (6) To verify that we don't have future leaks or temporal incursions,
+ *     many of the functional tests record the transaction group number
+ *     as part of their data.  When reading old data, they verify that
+ *     the transaction group number is less than the current, open txg.
+ *     If you add a new test, please do this if applicable.
+ *
+ * (7) Threads are created with a reduced stack size, for sanity checking.
+ *     Therefore, it's important not to allocate huge buffers on the stack.
+ *
+ * When run with no arguments, ztest runs for about five minutes and
+ * produces no output if successful.  To get a little bit of information,
+ * specify -V.  To get more information, specify -VV, and so on.
+ *
+ * To turn this into an overnight stress test, use -T to specify run time.
+ *
+ * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
+ * to increase the pool capacity, fanout, and overall stress level.
+ *
+ * Use the -k option to set the desired frequency of kills.
+ *
+ * When ztest invokes itself it passes all relevant information through a
+ * temporary file which is mmap-ed in the child process. This allows shared
+ * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
+ * stored at offset 0 of this file and contains information on the size and
+ * number of shared structures in the file. The information stored in this file
+ * must remain backwards compatible with older versions of ztest so that
+ * ztest can invoke them during backwards compatibility testing (-B).
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dmu_objset.h>
+#include <sys/poll.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_trim.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
+#include <sys/abd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <umem.h>
+#include <ctype.h>
+#include <math.h>
+#include <sys/fs/zfs.h>
+#include <zfs_fletcher.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <sys/crypto/icp.h>
+#ifdef __GLIBC__
+#include <execinfo.h> /* for backtrace() */
+#endif
+
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
+
+typedef struct ztest_shared_hdr {
+	uint64_t	zh_hdr_size;
+	uint64_t	zh_opts_size;
+	uint64_t	zh_size;
+	uint64_t	zh_stats_size;
+	uint64_t	zh_stats_count;
+	uint64_t	zh_ds_size;
+	uint64_t	zh_ds_count;
+} ztest_shared_hdr_t;
+
+static ztest_shared_hdr_t *ztest_shared_hdr;
+
+enum ztest_class_state {
+	ZTEST_VDEV_CLASS_OFF,
+	ZTEST_VDEV_CLASS_ON,
+	ZTEST_VDEV_CLASS_RND
+};
+
+typedef struct ztest_shared_opts {
+	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
+	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
+	char zo_alt_ztest[MAXNAMELEN];
+	char zo_alt_libpath[MAXNAMELEN];
+	uint64_t zo_vdevs;
+	uint64_t zo_vdevtime;
+	size_t zo_vdev_size;
+	int zo_ashift;
+	int zo_mirrors;
+	int zo_raidz;
+	int zo_raidz_parity;
+	int zo_datasets;
+	int zo_threads;
+	uint64_t zo_passtime;
+	uint64_t zo_killrate;
+	int zo_verbose;
+	int zo_init;
+	uint64_t zo_time;
+	uint64_t zo_maxloops;
+	uint64_t zo_metaslab_force_ganging;
+	int zo_mmp_test;
+	int zo_special_vdevs;
+	int zo_dump_dbgmsg;
+} ztest_shared_opts_t;
+
+static const ztest_shared_opts_t ztest_opts_defaults = {
+	.zo_pool = "ztest",
+	.zo_dir = "/tmp",
+	.zo_alt_ztest = { '\0' },
+	.zo_alt_libpath = { '\0' },
+	.zo_vdevs = 5,
+	.zo_ashift = SPA_MINBLOCKSHIFT,
+	.zo_mirrors = 2,
+	.zo_raidz = 4,
+	.zo_raidz_parity = 1,
+	.zo_vdev_size = SPA_MINDEVSIZE * 4,	/* 256m default size */
+	.zo_datasets = 7,
+	.zo_threads = 23,
+	.zo_passtime = 60,		/* 60 seconds */
+	.zo_killrate = 70,		/* 70% kill rate */
+	.zo_verbose = 0,
+	.zo_mmp_test = 0,
+	.zo_init = 1,
+	.zo_time = 300,			/* 5 minutes */
+	.zo_maxloops = 50,		/* max loops during spa_freeze() */
+	.zo_metaslab_force_ganging = 64 << 10,
+	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
+};
+
+extern uint64_t metaslab_force_ganging;
+extern uint64_t metaslab_df_alloc_threshold;
+extern unsigned long zfs_deadman_synctime_ms;
+extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
+extern boolean_t zfs_force_some_double_word_sm_entries;
+extern unsigned long zio_decompress_fail_fraction;
+extern unsigned long zfs_reconstruct_indirect_damage_fraction;
+
+
+static ztest_shared_opts_t *ztest_shared_opts;
+static ztest_shared_opts_t ztest_opts;
+static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";
+
+typedef struct ztest_shared_ds {
+	uint64_t	zd_seq;
+} ztest_shared_ds_t;
+
+static ztest_shared_ds_t *ztest_shared_ds;
+#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
+
+#define	BT_MAGIC	0x123456789abcdefULL
+#define	MAXFAULTS(zs) \
+	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
+
+enum ztest_io_type {
+	ZTEST_IO_WRITE_TAG,
+	ZTEST_IO_WRITE_PATTERN,
+	ZTEST_IO_WRITE_ZEROES,
+	ZTEST_IO_TRUNCATE,
+	ZTEST_IO_SETATTR,
+	ZTEST_IO_REWRITE,
+	ZTEST_IO_TYPES
+};
+
+typedef struct ztest_block_tag {
+	uint64_t	bt_magic;
+	uint64_t	bt_objset;
+	uint64_t	bt_object;
+	uint64_t	bt_dnodesize;
+	uint64_t	bt_offset;
+	uint64_t	bt_gen;
+	uint64_t	bt_txg;
+	uint64_t	bt_crtxg;
+} ztest_block_tag_t;
+
+typedef struct bufwad {
+	uint64_t	bw_index;
+	uint64_t	bw_txg;
+	uint64_t	bw_data;
+} bufwad_t;
+
+/*
+ * It would be better to use a rangelock_t per object.  Unfortunately
+ * the rangelock_t is not a drop-in replacement for rl_t, because we
+ * still need to map from object ID to rangelock_t.
+ */
+typedef enum {
+	RL_READER,
+	RL_WRITER,
+	RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+	void		*rll_writer;
+	int		rll_readers;
+	kmutex_t	rll_lock;
+	kcondvar_t	rll_cv;
+} rll_t;
+
+typedef struct rl {
+	uint64_t	rl_object;
+	uint64_t	rl_offset;
+	uint64_t	rl_size;
+	rll_t		*rl_lock;
+} rl_t;
+
+#define	ZTEST_RANGE_LOCKS	64
+#define	ZTEST_OBJECT_LOCKS	64
+
+/*
+ * Object descriptor.  Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+	uint64_t	od_dir;
+	uint64_t	od_object;
+	dmu_object_type_t od_type;
+	dmu_object_type_t od_crtype;
+	uint64_t	od_blocksize;
+	uint64_t	od_crblocksize;
+	uint64_t	od_crdnodesize;
+	uint64_t	od_gen;
+	uint64_t	od_crgen;
+	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+	ztest_shared_ds_t *zd_shared;
+	objset_t	*zd_os;
+	pthread_rwlock_t zd_zilog_lock;
+	zilog_t		*zd_zilog;
+	ztest_od_t	*zd_od;		/* debugging aid */
+	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
+	kmutex_t	zd_dirobj_lock;
+	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
+	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+	ztest_func_t	*zi_func;	/* test function */
+	uint64_t	zi_iters;	/* iterations per execution */
+	uint64_t	*zi_interval;	/* execute every <interval> seconds */
+	const char	*zi_funcname;	/* name of test function */
+} ztest_info_t;
+
+typedef struct ztest_shared_callstate {
+	uint64_t	zc_count;	/* per-pass count */
+	uint64_t	zc_time;	/* per-pass time */
+	uint64_t	zc_next;	/* next time to call this function */
+} ztest_shared_callstate_t;
+
+static ztest_shared_callstate_t *ztest_shared_callstate;
+#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
+
+ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_write_parallel;
+ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
+ztest_func_t ztest_dmu_commit_callbacks;
+ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
+ztest_func_t ztest_dmu_read_write_zcopy;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
+ztest_func_t ztest_fzap;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_spa_prop_get_set;
+ztest_func_t ztest_spa_create_destroy;
+ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_dmu_snapshot_hold;
+ztest_func_t ztest_mmp_enable_disable;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_LUN_growth;
+ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_vdev_class_add;
+ztest_func_t ztest_vdev_aux_add_remove;
+ztest_func_t ztest_split_pool;
+ztest_func_t ztest_reguid;
+ztest_func_t ztest_spa_upgrade;
+ztest_func_t ztest_device_removal;
+ztest_func_t ztest_spa_checkpoint_create_discard;
+ztest_func_t ztest_initialize;
+ztest_func_t ztest_trim;
+ztest_func_t ztest_fletcher;
+ztest_func_t ztest_fletcher_incr;
+ztest_func_t ztest_verify_dnode_bt;
+
+uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */
+
+#define	ZTI_INIT(func, iters, interval) \
+	{   .zi_func = (func), \
+	    .zi_iters = (iters), \
+	    .zi_interval = (interval), \
+	    .zi_funcname = # func }
+
+ztest_info_t ztest_info[] = {
+	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
+	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
+	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
+	ZTI_INIT(ztest_zap, 30, &zopt_always),
+	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
+	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
+	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
+	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
+	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
+	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
+	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
+#if 0
+	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
+#endif
+	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
+	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
+	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
+	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
+	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
+	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
+	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
+	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
+	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
+	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
+	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
+	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
+	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
+};
+
+#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
+
+/*
+ * The following struct is used to hold a list of uncalled commit callbacks.
+ * The callbacks are ordered by txg number.
+ */
+typedef struct ztest_cb_list {
+	kmutex_t	zcl_callbacks_lock;
+	list_t		zcl_callbacks;
+} ztest_cb_list_t;
+
+/*
+ * Stuff we need to share writably between parent and child.
+ */
+typedef struct ztest_shared {
+	boolean_t	zs_do_init;
+	hrtime_t	zs_proc_start;
+	hrtime_t	zs_proc_stop;
+	hrtime_t	zs_thread_start;
+	hrtime_t	zs_thread_stop;
+	hrtime_t	zs_thread_kill;
+	uint64_t	zs_enospc_count;
+	uint64_t	zs_vdev_next_leaf;
+	uint64_t	zs_vdev_aux;
+	uint64_t	zs_alloc;
+	uint64_t	zs_space;
+	uint64_t	zs_splits;
+	uint64_t	zs_mirrors;
+	uint64_t	zs_metaslab_sz;
+	uint64_t	zs_metaslab_df_alloc_threshold;
+	uint64_t	zs_guid;
+} ztest_shared_t;
+
+#define	ID_PARALLEL	-1ULL
+
+static char ztest_dev_template[] = "%s/%s.%llua";
+static char ztest_aux_template[] = "%s/%s.%s.%llu";
+ztest_shared_t *ztest_shared;
+
+static spa_t *ztest_spa = NULL;
+static ztest_ds_t *ztest_ds;
+
+static kmutex_t ztest_vdev_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
+static boolean_t ztest_pool_scrubbed = B_FALSE;
+static kmutex_t ztest_checkpoint_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
+static pthread_rwlock_t ztest_name_lock;
+
+static boolean_t ztest_dump_core = B_TRUE;
+static boolean_t ztest_exiting;
+
+/* Global commit callback list */
+static ztest_cb_list_t zcl;
+/* Commit cb delay */
+static uint64_t zc_min_txg_delay = UINT64_MAX;
+static int zc_cb_counter = 0;
+
+/*
+ * Minimum number of commit callbacks that need to be registered for us to check
+ * whether the minimum txg delay is acceptable.
+ */
+#define	ZTEST_COMMIT_CB_MIN_REG	100
+
+/*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+#define	ZTEST_COMMIT_CB_THRESH	(TXG_CONCURRENT_STATES + 1000)
+
+enum ztest_object {
+	ZTEST_META_DNODE = 0,
+	ZTEST_DIROBJ,
+	ZTEST_OBJECTS
+};
+
+static void usage(boolean_t) __NORETURN;
+static int ztest_scrub_impl(spa_t *spa);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+	return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+	return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+static void
+dump_debug_buffer(void)
+{
+	ssize_t ret __attribute__((unused));	/* sink for write()'s result */
+
+	if (!ztest_opts.zo_dump_dbgmsg)	/* dumping was not requested */
+		return;
+
+	/*
+	 * We use write() instead of printf() so that this function
+	 * is safe to call from a signal handler.
+	 */
+	ret = write(STDOUT_FILENO, "\n", 1);
+	zfs_dbgmsg_print("ztest");	/* NOTE(review): signal-safe? */
+}
+
+#define	BACKTRACE_SZ	100
+
+static void sig_handler(int signo)
+{
+	struct sigaction action;	/* used to reinstall SIG_DFL below */
+#ifdef __GLIBC__ /* backtrace() is a GNU extension */
+	int nptrs;
+	void *buffer[BACKTRACE_SZ];
+
+	nptrs = backtrace(buffer, BACKTRACE_SZ);	/* at most 100 frames */
+	backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
+#endif
+	dump_debug_buffer();	/* no-op unless zo_dump_dbgmsg is set */
+
+	/*
+	 * Restore default action and re-raise signal so SIGSEGV and
+	 * SIGABRT can trigger a core dump.
+	 */
+	action.sa_handler = SIG_DFL;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+	(void) sigaction(signo, &action, NULL);
+	raise(signo);
+}
+
+#define	FATAL_MSG_SZ	1024
+
+char *fatal_msg;
+
+static void
+fatal(int do_perror, char *message, ...)
+{
+	va_list args;
+	int save_errno = errno;	/* captured before anything can clobber it */
+	char *buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
+
+	(void) fflush(stdout);
+
+	va_start(args, message);
+	(void) snprintf(buf, FATAL_MSG_SZ, "ztest: ");
+	/* LINTED */
+	(void) vsnprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+	    message, args);
+	va_end(args);
+	if (do_perror) {
+		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+		    ": %s", strerror(save_errno));
+	}
+	(void) fprintf(stderr, "%s\n", buf);
+	fatal_msg = buf;			/* to ease debugging */
+
+	if (ztest_dump_core)
+		abort();
+	else
+		dump_debug_buffer();
+
+	exit(3);
+}
+
+static int
+str2shift(const char *buf)
+{
+	const char *ends = "BKMGTPEZ";	/* index i selects a shift of 10*i */
+	int i;
+
+	if (buf[0] == '\0')
+		return (0);
+	for (i = 0; i < strlen(ends); i++) {
+		if (toupper((unsigned char)buf[0]) == ends[i])
+			break;
+	}
+	if (i == strlen(ends)) {
+		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
+		    buf);
+		usage(B_FALSE);
+	}
+	if (buf[1] == '\0' ||
+	    (toupper((unsigned char)buf[1]) == 'B' && buf[2] == '\0'))
+		return (10 * i);
+	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
+	usage(B_FALSE);
+	/* NOTREACHED */
+}
+
+static uint64_t
+nicenumtoull(const char *buf)
+{
+	char *end;
+	uint64_t val;
+
+	val = strtoull(buf, &end, 0);	/* base 0: decimal, hex, or octal */
+	if (end == buf) {		/* nothing numeric was parsed */
+		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
+		usage(B_FALSE);
+	} else if (end[0] == '.') {	/* fraction: reparse as a double */
+		double fval = strtod(buf, &end);
+		fval *= pow(2, str2shift(end));	/* scale by the unit suffix */
+		/*
+		 * UINT64_MAX is not exactly representable as a double.
+		 * The closest representation is UINT64_MAX + 1, so we
+		 * use a >= comparison instead of > for the bounds check.
+		 */
+		if (fval >= (double)UINT64_MAX) {
+			(void) fprintf(stderr, "ztest: value too large: %s\n",
+			    buf);
+			usage(B_FALSE);
+		}
+		val = (uint64_t)fval;
+	} else {
+		int shift = str2shift(end);
+		if (shift >= 64 || (val << shift) >> shift != val) {
+			(void) fprintf(stderr, "ztest: value too large: %s\n",
+			    buf);
+			usage(B_FALSE);
+		}
+		val <<= shift;	/* checked above: no bits are lost */
+	}
+	return (val);
+}
+
+static void
+usage(boolean_t requested)
+{
+	const ztest_shared_opts_t *zo = &ztest_opts_defaults;
+
+	char nice_vdev_size[NN_NUMBUF_SZ];
+	char nice_force_ganging[NN_NUMBUF_SZ];
+	FILE *fp = requested ? stdout : stderr;
+
+	nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
+	nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging,
+	    sizeof (nice_force_ganging));
+
+	(void) fprintf(fp, "Usage: %s\n"
+	    "\t[-v vdevs (default: %llu)]\n"
+	    "\t[-s size_of_each_vdev (default: %s)]\n"
+	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
+	    "\t[-m mirror_copies (default: %d)]\n"
+	    "\t[-r raidz_disks (default: %d)]\n"
+	    "\t[-R raidz_parity (default: %d)]\n"
+	    "\t[-d datasets (default: %d)]\n"
+	    "\t[-t threads (default: %d)]\n"
+	    "\t[-g gang_block_threshold (default: %s)]\n"
+	    "\t[-i init_count (default: %d)] initialize pool i times\n"
+	    "\t[-k kill_percentage (default: %llu%%)]\n"
+	    "\t[-p pool_name (default: %s)]\n"
+	    "\t[-f dir (default: %s)] file directory for vdev files\n"
+	    "\t[-M] Multi-host simulate pool imported on remote host\n"
+	    "\t[-V] verbose (use multiple times for ever more blather)\n"
+	    "\t[-E] use existing pool instead of creating new one\n"
+	    "\t[-T time (default: %llu sec)] total run time\n"
+	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+	    "\t[-P passtime (default: %llu sec)] time per pass\n"
+	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
+	    "\t[-C vdev class state (default: random)] special=on|off|random\n"
+	    "\t[-o variable=value] ... set global variable to an unsigned\n"
+	    "\t    32-bit integer value\n"
+	    "\t[-G] dump zfs_dbgmsg buffer before exiting due to an error\n"
+	    "\t[-h] (print help)\n"
+	    "",
+	    zo->zo_pool,				/* Usage: %s */
+	    (u_longlong_t)zo->zo_vdevs,			/* -v */
+	    nice_vdev_size,				/* -s */
+	    zo->zo_ashift,				/* -a */
+	    zo->zo_mirrors,				/* -m */
+	    zo->zo_raidz,				/* -r */
+	    zo->zo_raidz_parity,			/* -R */
+	    zo->zo_datasets,				/* -d */
+	    zo->zo_threads,				/* -t */
+	    nice_force_ganging,				/* -g */
+	    zo->zo_init,				/* -i */
+	    (u_longlong_t)zo->zo_killrate,		/* -k */
+	    zo->zo_pool,				/* -p */
+	    zo->zo_dir,					/* -f */
+	    (u_longlong_t)zo->zo_time,			/* -T */
+	    (u_longlong_t)zo->zo_maxloops,		/* -F */
+	    (u_longlong_t)zo->zo_passtime);		/* -P */
+	exit(requested ? 0 : 1);	/* asking for help is not a failure */
+}
+
+
+static void
+ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
+{
+	char name[32];	/* private copy of input, split in place at '=' */
+	char *value;
+	int state = ZTEST_VDEV_CLASS_RND;
+
+	(void) strlcpy(name, input, sizeof (name));
+
+	value = strchr(name, '=');
+	if (value == NULL) {
+		(void) fprintf(stderr, "missing value in property=value "
+		    "'-C' argument (%s)\n", input);
+		usage(B_FALSE);
+	}
+	*(value) = '\0';	/* terminate the name half */
+	value++;		/* value now points just past the '=' */
+
+	if (strcmp(value, "on") == 0) {
+		state = ZTEST_VDEV_CLASS_ON;
+	} else if (strcmp(value, "off") == 0) {
+		state = ZTEST_VDEV_CLASS_OFF;
+	} else if (strcmp(value, "random") == 0) {
+		state = ZTEST_VDEV_CLASS_RND;
+	} else {
+		(void) fprintf(stderr, "invalid property value '%s'\n", value);
+		usage(B_FALSE);
+	}
+
+	if (strcmp(name, "special") == 0) {	/* only known property */
+		zo->zo_special_vdevs = state;
+	} else {
+		(void) fprintf(stderr, "invalid property name '%s'\n", name);
+		usage(B_FALSE);
+	}
+	if (zo->zo_verbose >= 3)
+		(void) printf("%s vdev state is '%s'\n", name, value);
+}
+
+/*
+ * Parse the ztest command line into the shared options (ztest_opts).
+ *
+ * The options are first reset to ztest_opts_defaults; each getopt() flag
+ * then overrides one field.  Numeric arguments are converted with
+ * nicenumtoull() and clamped as noted on the individual cases.  Invalid
+ * input ends in usage(B_FALSE); -h ends in usage(B_TRUE).
+ *
+ * When -B names an alternate ztest tree, zo_alt_ztest and zo_alt_libpath
+ * are derived from it and checked with access(); any problem is fatal.
+ */
+static void
+process_options(int argc, char **argv)
+{
+	char *path;
+	ztest_shared_opts_t *zo = &ztest_opts;
+
+	int opt;
+	uint64_t value;
+	char altdir[MAXNAMELEN] = { 0 };
+
+	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
+
+	while ((opt = getopt(argc, argv,
+	    "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
+		value = 0;
+		/* First pass: convert the argument of every numeric option. */
+		switch (opt) {
+		case 'v':
+		case 's':
+		case 'a':
+		case 'm':
+		case 'r':
+		case 'R':
+		case 'd':
+		case 't':
+		case 'g':
+		case 'i':
+		case 'k':
+		case 'T':
+		case 'P':
+		case 'F':
+			value = nicenumtoull(optarg);
+		}
+		/* Second pass: store the option, clamping where required. */
+		switch (opt) {
+		case 'v':
+			zo->zo_vdevs = value;
+			break;
+		case 's':
+			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
+			break;
+		case 'a':
+			zo->zo_ashift = value;
+			break;
+		case 'm':
+			zo->zo_mirrors = value;
+			break;
+		case 'r':
+			zo->zo_raidz = MAX(1, value);
+			break;
+		case 'R':
+			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
+			break;
+		case 'd':
+			zo->zo_datasets = MAX(1, value);
+			break;
+		case 't':
+			zo->zo_threads = MAX(1, value);
+			break;
+		case 'g':
+			zo->zo_metaslab_force_ganging =
+			    MAX(SPA_MINBLOCKSIZE << 1, value);
+			break;
+		case 'i':
+			zo->zo_init = value;
+			break;
+		case 'k':
+			zo->zo_killrate = value;
+			break;
+		case 'p':
+			(void) strlcpy(zo->zo_pool, optarg,
+			    sizeof (zo->zo_pool));
+			break;
+		case 'f':
+			/* Store the canonical form of the vdev directory. */
+			path = realpath(optarg, NULL);
+			if (path == NULL) {
+				(void) fprintf(stderr, "error: %s: %s\n",
+				    optarg, strerror(errno));
+				usage(B_FALSE);
+			} else {
+				(void) strlcpy(zo->zo_dir, path,
+				    sizeof (zo->zo_dir));
+				free(path);
+			}
+			break;
+		case 'M':
+			zo->zo_mmp_test = 1;
+			break;
+		case 'V':
+			zo->zo_verbose++;
+			break;
+		case 'E':
+			zo->zo_init = 0;
+			break;
+		case 'T':
+			zo->zo_time = value;
+			break;
+		case 'P':
+			zo->zo_passtime = MAX(1, value);
+			break;
+		case 'F':
+			zo->zo_maxloops = MAX(1, value);
+			break;
+		case 'B':
+			(void) strlcpy(altdir, optarg, sizeof (altdir));
+			break;
+		case 'C':
+			ztest_parse_name_value(optarg, zo);
+			break;
+		case 'o':
+			if (set_global_var(optarg) != 0)
+				usage(B_FALSE);
+			break;
+		case 'G':
+			zo->zo_dump_dbgmsg = 1;
+			break;
+		case 'h':
+			usage(B_TRUE);
+			break;
+		case '?':
+		default:
+			usage(B_FALSE);
+			break;
+		}
+	}
+
+	/* Parity can never reach the number of raidz children. */
+	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
+
+	zo->zo_vdevtime =
+	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
+	    UINT64_MAX >> 2);
+
+	if (strlen(altdir) > 0) {
+		char *cmd;
+		char *realaltdir;
+		char *bin;
+		char *ztest;
+		char *isa;
+		int isalen;
+
+		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+		VERIFY(NULL != realpath(getexecname(), cmd));
+		if (0 != access(altdir, F_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest path: %s",
+			    altdir);
+		}
+		VERIFY(NULL != realpath(altdir, realaltdir));
+
+		/*
+		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
+		 * We want to extract <isa> to determine if we should use
+		 * 32 or 64 bit binaries.
+		 *
+		 * Do not assume that shape: when the running binary does not
+		 * live below "/usr/bin/", strstr() returns NULL and must not
+		 * be handed to a second strstr() or used in pointer
+		 * arithmetic.
+		 */
+		bin = strstr(cmd, "/usr/bin/");
+		ztest = (bin != NULL) ? strstr(bin, "/ztest") : NULL;
+		if (ztest == NULL) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_FALSE, "unexpected ztest path '%s': expected "
+			    "<prefix>/usr/bin/[<isa>/]ztest", cmd);
+		}
+		isa = bin + 9;
+		isalen = ztest - isa;
+		/*
+		 * With no <isa> component ("/usr/bin/ztest") the match for
+		 * "/ztest" starts one byte before 'isa'.  A negative
+		 * precision makes "%.*s" print the whole string, so clamp
+		 * it to an empty <isa> instead.
+		 */
+		if (isalen < 0)
+			isalen = 0;
+		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
+		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
+		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
+		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);
+
+		if (0 != access(zo->zo_alt_ztest, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest: %s",
+			    zo->zo_alt_ztest);
+		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate lib directory %s",
+			    zo->zo_alt_libpath);
+		}
+
+		umem_free(cmd, MAXPATHLEN);
+		umem_free(realaltdir, MAXPATHLEN);
+	}
+}
+
+/*
+ * Terminate the current ztest process with SIGKILL.  Beforehand, record
+ * the normal class's allocated and total space in the shared state and
+ * write out the pool cachefile.
+ */
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+	spa_t *spa = ztest_spa;
+
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	/*
+	 * The config must be up to date before the process dies; see the
+	 * comment above spa_write_cachefile().  The write is done while
+	 * holding spa_namespace_lock.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	spa_write_cachefile(spa, B_FALSE, B_FALSE);
+	mutex_exit(&spa_namespace_lock);
+
+	(void) kill(getpid(), SIGKILL);
+}
+
+/*
+ * Return a value in [0, range) derived from 64 bits read from
+ * ztest_fd_rand, or 0 when range is 0.  A short read is fatal.
+ */
+static uint64_t
+ztest_random(uint64_t range)
+{
+	uint64_t bits;
+
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
+	/* An empty range has a single answer; do not touch the fd. */
+	if (range == 0)
+		return (0);
+
+	if (read(ztest_fd_rand, &bits, sizeof (bits)) != sizeof (bits))
+		fatal(1, "short read from /dev/urandom");
+
+	return (bits % range);
+}
+
+/*
+ * Count one ENOSPC event in the shared state.  The tag argument is
+ * accepted but not used.
+ */
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+	ztest_shared->zs_enospc_count += 1;
+}
+
+/*
+ * Return the ashift to use for a new vdev: the configured zo_ashift when
+ * it is non-zero, otherwise SPA_MINBLOCKSHIFT plus a random 0..4.
+ */
+static uint64_t
+ztest_get_ashift(void)
+{
+	if (ztest_opts.zo_ashift != 0)
+		return (ztest_opts.zo_ashift);
+
+	return (SPA_MINBLOCKSHIFT + ztest_random(5));
+}
+
+/*
+ * Build the nvlist describing one file vdev.
+ *
+ * path   - backing file; when NULL a name is generated from
+ *          ztest_aux_template (aux != NULL) or ztest_dev_template.
+ * aux    - aux label used in the generated name, or NULL.
+ * pool   - pool name for the generated name; NULL means zo_pool.
+ * size   - when non-zero the file is created/truncated to this size.
+ * ashift - 0 means "pick one" via ztest_get_ashift().
+ *
+ * Returns a newly allocated nvlist owned by the caller.
+ */
+static nvlist_t *
+make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
+{
+	char *scratch = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	nvlist_t *leaf;
+
+	if (ashift == 0)
+		ashift = ztest_get_ashift();
+
+	if (path == NULL) {
+		char *poolname = (pool != NULL) ? pool : ztest_opts.zo_pool;
+		uint64_t id;
+
+		path = scratch;
+
+		if (aux == NULL) {
+			/* Regular leaf: consume the next leaf number. */
+			id = ztest_shared->zs_vdev_next_leaf++;
+			(void) snprintf(path, MAXPATHLEN,
+			    ztest_dev_template, ztest_opts.zo_dir,
+			    poolname, id);
+		} else {
+			id = ztest_shared->zs_vdev_aux;
+			(void) snprintf(path, MAXPATHLEN,
+			    ztest_aux_template, ztest_opts.zo_dir,
+			    poolname, aux, id);
+		}
+	}
+
+	if (size != 0) {
+		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
+
+		if (fd == -1)
+			fatal(1, "can't open %s", path);
+		if (ftruncate(fd, size) != 0)
+			fatal(1, "can't ftruncate %s", path);
+		(void) close(fd);
+	}
+
+	VERIFY(nvlist_alloc(&leaf, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(leaf, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+	VERIFY(nvlist_add_string(leaf, ZPOOL_CONFIG_PATH, path) == 0);
+	VERIFY(nvlist_add_uint64(leaf, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+
+	/* 'path' may point into the scratch buffer; free it last. */
+	umem_free(scratch, MAXPATHLEN);
+
+	return (leaf);
+}
+
+/*
+ * Build a raidz vdev nvlist with r file children and parity
+ * zo_raidz_parity.  With r < 2 a plain file vdev is returned instead.
+ */
+static nvlist_t *
+make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r)
+{
+	nvlist_t *raidz, **kids;
+	size_t kids_size;
+	int i;
+
+	if (r < 2)
+		return (make_vdev_file(path, aux, pool, size, ashift));
+
+	kids_size = r * sizeof (nvlist_t *);
+	kids = umem_alloc(kids_size, UMEM_NOFAIL);
+
+	for (i = 0; i < r; i++)
+		kids[i] = make_vdev_file(path, aux, pool, size, ashift);
+
+	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_RAIDZ) == 0);
+	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
+	    ztest_opts.zo_raidz_parity) == 0);
+	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
+	    kids, r) == 0);
+
+	/* The child nvlists are released once they have been added. */
+	for (i = 0; i < r; i++)
+		nvlist_free(kids[i]);
+	umem_free(kids, kids_size);
+
+	return (raidz);
+}
+
+/*
+ * Build a mirror vdev nvlist with m children, each produced by
+ * make_vdev_raidz().  With m < 1 the raidz (or file) vdev is returned
+ * directly, without a mirror around it.
+ */
+static nvlist_t *
+make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r, int m)
+{
+	nvlist_t *mirror, **kids;
+	size_t kids_size;
+	int i;
+
+	if (m < 1)
+		return (make_vdev_raidz(path, aux, pool, size, ashift, r));
+
+	kids_size = m * sizeof (nvlist_t *);
+	kids = umem_alloc(kids_size, UMEM_NOFAIL);
+
+	for (i = 0; i < m; i++)
+		kids[i] = make_vdev_raidz(path, aux, pool, size, ashift, r);
+
+	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_MIRROR) == 0);
+	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
+	    kids, m) == 0);
+
+	/* The child nvlists are released once they have been added. */
+	for (i = 0; i < m; i++)
+		nvlist_free(kids[i]);
+	umem_free(kids, kids_size);
+
+	return (mirror);
+}
+
+/*
+ * Build a root vdev nvlist with t top-level children, each produced by
+ * make_vdev_mirror().  Every child is tagged with ZPOOL_CONFIG_IS_LOG
+ * (true only when class is "log") and, for a non-empty class, with
+ * ZPOOL_CONFIG_ALLOCATION_BIAS.  The children are stored under 'aux' when
+ * it is given, otherwise under ZPOOL_CONFIG_CHILDREN.
+ */
+static nvlist_t *
+make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
+    const char *class, int r, int m, int t)
+{
+	nvlist_t *root, **tops;
+	size_t tops_size;
+	boolean_t log, biased;
+	int i;
+
+	ASSERT(t > 0);
+
+	log = (class != NULL && strcmp(class, "log") == 0);
+	biased = (class != NULL && class[0] != '\0');
+
+	tops_size = t * sizeof (nvlist_t *);
+	tops = umem_alloc(tops_size, UMEM_NOFAIL);
+
+	for (i = 0; i < t; i++) {
+		tops[i] = make_vdev_mirror(path, aux, pool, size, ashift,
+		    r, m);
+		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_IS_LOG,
+		    log) == 0);
+
+		if (!biased)
+			continue;
+
+		ASSERT(m > 1 || log);   /* expecting a mirror */
+		VERIFY(nvlist_add_string(tops[i],
+		    ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0);
+	}
+
+	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
+	    tops, t) == 0);
+
+	/* The child nvlists are released once they have been added. */
+	for (i = 0; i < t; i++)
+		nvlist_free(tops[i]);
+	umem_free(tops, tops_size);
+
+	return (root);
+}
+
+/*
+ * Pick a random spa version at or above initial_version.
+ *
+ * For initial_version <= SPA_VERSION_BEFORE_FEATURES the result is drawn
+ * from [initial_version, SPA_VERSION_BEFORE_FEATURES]; any later initial
+ * version maps to SPA_VERSION_FEATURES.
+ */
+static uint64_t
+ztest_random_spa_version(uint64_t initial_version)
+{
+	uint64_t version;
+
+	if (initial_version > SPA_VERSION_BEFORE_FEATURES) {
+		version = SPA_VERSION_FEATURES;
+	} else {
+		/* ztest_random(n) is always below n, so this stays legacy. */
+		uint64_t span =
+		    SPA_VERSION_BEFORE_FEATURES - initial_version + 1;
+
+		version = initial_version + ztest_random(span);
+	}
+
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+	return (version);
+}
+
+/*
+ * Return a random power-of-two block size.
+ *
+ * The shift is SPA_MINBLOCKSHIFT plus a random value in
+ * [0, maxbs - spa_max_ashift], where maxbs is 20 (1MB) when the pool
+ * supports SPA_MAXBLOCKSIZE and SPA_OLD_MAXBLOCKSHIFT otherwise.
+ *
+ * NOTE(review): the original comment said "choose a block size >= the
+ * ashift", but the base of the range is SPA_MINBLOCKSHIFT rather than
+ * spa_max_ashift -- confirm which is intended.
+ */
+static int
+ztest_random_blocksize(void)
+{
+	int maxbs;
+	uint64_t block_shift;
+
+	ASSERT(ztest_spa->spa_max_ashift != 0);
+
+	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+		maxbs = 20;
+	else
+		maxbs = SPA_OLD_MAXBLOCKSHIFT;
+
+	block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
+
+	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
+}
+
+/*
+ * Return a random dnode size in bytes (a slot count shifted by
+ * DNODE_SHIFT).  When the pool only allows DNODE_MIN_SLOTS the answer is
+ * always DNODE_MIN_SIZE.
+ */
+static int
+ztest_random_dnodesize(void)
+{
+	int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;
+	uint64_t roll;
+	int slots;
+
+	if (max_slots == DNODE_MIN_SLOTS)
+		return (DNODE_MIN_SIZE);
+
+	/*
+	 * Small dnodes are favoured because they are more likely to
+	 * reflect real-world usage: out of ten rolls, one picks 5 or more
+	 * slots, four pick 2..4 slots and the remaining five pick a single
+	 * slot.
+	 */
+	ASSERT3U(max_slots, >, 4);
+	roll = ztest_random(10);
+	if (roll == 0)
+		slots = 5 + ztest_random(max_slots - 4);
+	else if (roll <= 4)
+		slots = 2 + ztest_random(3);
+	else
+		slots = 1;
+
+	return (slots << DNODE_SHIFT);
+}
+
+/*
+ * Return a random indirect block shift in
+ * [DN_MIN_INDBLKSHIFT, DN_MAX_INDBLKSHIFT].
+ */
+static int
+ztest_random_ibshift(void)
+{
+	uint64_t choices = DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1;
+
+	return (DN_MIN_INDBLKSHIFT + ztest_random(choices));
+}
+
+/*
+ * Return the index of a randomly chosen top-level vdev.  Candidates are
+ * redrawn until one is concrete, has a metaslab group with a class, and
+ * (unless log_ok) is not a log device.  The caller must hold the spa
+ * config lock.
+ */
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd;
+	uint64_t top;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	for (;;) {
+		top = ztest_random(rvd->vdev_children);
+		tvd = rvd->vdev_child[top];
+
+		if (!vdev_is_concrete(tvd))
+			continue;
+		if (tvd->vdev_islog && !log_ok)
+			continue;
+		if (tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL)
+			continue;
+
+		return (top);
+	}
+}
+
+/*
+ * Return a random value for the given dataset property.  For
+ * ZFS_PROP_CHECKSUM the value ZIO_CHECKSUM_OFF is never returned; the
+ * draw is simply repeated.
+ */
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+	for (;;) {
+		uint64_t value =
+		    zfs_prop_random_value(prop, ztest_random(-1ULL));
+
+		if (prop != ZFS_PROP_CHECKSUM || value != ZIO_CHECKSUM_OFF)
+			return (value);
+	}
+}
+
+/*
+ * Set an integer dataset property on osname, either locally or (when
+ * 'inherit' is set) with ZPROP_SRC_NONE, then read it back.  At verbosity
+ * 6 and above the resulting value and its setpoint are printed.
+ *
+ * Returns ENOSPC (after recording it) when the set ran out of space;
+ * any other error is asserted to be zero.
+ */
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+    boolean_t inherit)
+{
+	const char *propname = zfs_prop_to_name(prop);
+	const char *valstr;
+	char *where;
+	uint64_t cur;
+	int error;
+
+	error = dsl_prop_set_int(osname, propname,
+	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	ASSERT0(error);
+
+	/* Read the value back together with the dataset it comes from. */
+	where = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	VERIFY0(dsl_prop_get_integer(osname, propname, &cur, where));
+
+	if (ztest_opts.zo_verbose >= 6) {
+		/* Prefer the symbolic name; fall back to the raw number. */
+		if (zfs_prop_index_to_string(prop, cur, &valstr) == 0) {
+			(void) printf("%s %s = %s at '%s'\n",
+			    osname, propname, valstr, where);
+		} else {
+			(void) printf("%s %s = %llu at '%s'\n", osname,
+			    propname, (unsigned long long)cur, where);
+		}
+	}
+	umem_free(where, MAXPATHLEN);
+
+	return (error);
+}
+
+/*
+ * Set a single integer pool property on ztest_spa.
+ *
+ * Returns ENOSPC (after recording it) when the set ran out of space;
+ * any other error is asserted to be zero.
+ */
+static int
+ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
+{
+	nvlist_t *props = NULL;
+	int error;
+
+	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+	error = spa_prop_set(ztest_spa, props);
+
+	/* The request list is no longer needed, whatever the outcome. */
+	nvlist_free(props);
+
+	if (error != ENOSPC) {
+		ASSERT0(error);
+		return (error);
+	}
+
+	ztest_record_enospc(FTAG);
+	return (error);
+}
+
+/*
+ * Own the objset 'name', loading a wrapping key on demand.
+ *
+ * When 'decrypt' is set and dmu_objset_own() fails with EACCES, the ztest
+ * wrapping key is loaded for the dataset (with any "@snapshot" suffix
+ * removed) and the own is retried once.  If the key cannot be loaded for
+ * that dataset (EINVAL) the parent dataset is tried instead, and so on up
+ * the name.
+ *
+ * Returns 0 on success, otherwise the error from dmu_objset_own() or
+ * spa_keystore_load_wkey().
+ */
+static int
+ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+	int err;
+	char *cp = NULL;
+	char ddname[ZFS_MAX_DATASET_NAME_LEN];
+
+	/*
+	 * Bounded copy: 'name' comes from the caller, so never write past
+	 * the end of the fixed-size buffer.
+	 */
+	(void) strlcpy(ddname, name, sizeof (ddname));
+	cp = strchr(ddname, '@');
+	if (cp != NULL)
+		*cp = '\0';
+
+	err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
+	while (decrypt && err == EACCES) {
+		dsl_crypto_params_t *dcp;
+		nvlist_t *crypto_args = fnvlist_alloc();
+
+		fnvlist_add_uint8_array(crypto_args, "wkeydata",
+		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
+		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+		    crypto_args, &dcp));
+		err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
+		dsl_crypto_params_free(dcp, B_FALSE);
+		fnvlist_free(crypto_args);
+
+		if (err == EINVAL) {
+			/*
+			 * We couldn't load a key for this dataset so try
+			 * the parent. This loop will eventually hit the
+			 * encryption root since ztest only makes clones
+			 * as children of their origin datasets.
+			 */
+			cp = strrchr(ddname, '/');
+			if (cp == NULL)
+				return (err);
+
+			*cp = '\0';
+			err = EACCES;
+			continue;
+		} else if (err != 0) {
+			break;
+		}
+
+		/* Key loaded: retry the own exactly once. */
+		err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
+		break;
+	}
+
+	return (err);
+}
+
+/*
+ * Initialise a reader/writer lock: set up its mutex and condition
+ * variable and mark it as having no writer and no readers.
+ */
+static void
+ztest_rll_init(rll_t *rll)
+{
+	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
+	rll->rll_readers = 0;
+	rll->rll_writer = NULL;
+}
+
+/*
+ * Tear down a reader/writer lock.  The lock is asserted to be idle
+ * (no writer, no readers).
+ */
+static void
+ztest_rll_destroy(rll_t *rll)
+{
+	ASSERT(rll->rll_writer == NULL);
+	ASSERT(rll->rll_readers == 0);
+
+	cv_destroy(&rll->rll_cv);
+	mutex_destroy(&rll->rll_lock);
+}
+
+/*
+ * Acquire the lock as a reader (RL_READER) or as a writer (any other
+ * type).  Readers wait while a writer holds the lock; a writer waits
+ * until there is neither a writer nor any reader.
+ */
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+	boolean_t exclusive = (type != RL_READER);
+
+	mutex_enter(&rll->rll_lock);
+
+	while (rll->rll_writer != NULL ||
+	    (exclusive && rll->rll_readers))
+		(void) cv_wait(&rll->rll_cv, &rll->rll_lock);
+
+	if (exclusive)
+		rll->rll_writer = curthread;
+	else
+		rll->rll_readers++;
+
+	mutex_exit(&rll->rll_lock);
+}
+
+/*
+ * Release one hold on the lock: the writer's if there is one, otherwise
+ * one reader's.  All waiters are woken once the lock is completely idle.
+ */
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+	mutex_enter(&rll->rll_lock);
+
+	if (rll->rll_writer == NULL) {
+		/* Held shared: drop a single reader. */
+		ASSERT(rll->rll_readers != 0);
+		ASSERT(rll->rll_writer == NULL);
+		rll->rll_readers--;
+	} else {
+		/* Held exclusively: there can be no readers. */
+		ASSERT(rll->rll_readers == 0);
+		rll->rll_writer = NULL;
+	}
+
+	if (rll->rll_writer == NULL && rll->rll_readers == 0)
+		cv_broadcast(&rll->rll_cv);
+
+	mutex_exit(&rll->rll_lock);
+}
+
+/*
+ * Lock the per-dataset object lock that 'object' maps to.  The slot is
+ * the object number masked with (ZTEST_OBJECT_LOCKS - 1), so distinct
+ * objects can share one lock.
+ */
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
+
+	ztest_rll_lock(&zd->zd_object_lock[slot], type);
+}
+
+/*
+ * Release the per-dataset object lock that 'object' maps to; the slot is
+ * the object number masked with (ZTEST_OBJECT_LOCKS - 1).
+ */
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+	uint64_t slot = object & (ZTEST_OBJECT_LOCKS - 1);
+
+	ztest_rll_unlock(&zd->zd_object_lock[slot]);
+}
+
+/*
+ * Lock the range lock that (object, offset) hashes to and return a
+ * newly allocated rl_t recording the request.  The handle must be passed
+ * to ztest_range_unlock(), which also frees it.
+ */
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+    uint64_t size, rl_type_t type)
+{
+	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+	rl_t *handle = umem_alloc(sizeof (*handle), UMEM_NOFAIL);
+
+	handle->rl_object = object;
+	handle->rl_offset = offset;
+	handle->rl_size = size;
+	handle->rl_lock = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+
+	ztest_rll_lock(handle->rl_lock, type);
+
+	return (handle);
+}
+
+/*
+ * Release the range lock referenced by 'rl' and free the handle.
+ */
+static void
+ztest_range_unlock(rl_t *rl)
+{
+	ztest_rll_unlock(rl->rl_lock);
+	umem_free(rl, sizeof (*rl));
+}
+
+/*
+ * Initialise a ztest dataset descriptor for objset 'os': record the
+ * objset, its ZIL, its name and the shared per-dataset state (whose
+ * sequence number is reset when present), then set up every lock the
+ * descriptor owns.
+ */
+static void
+ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
+{
+	zd->zd_os = os;
+	zd->zd_zilog = dmu_objset_zil(os);
+	zd->zd_shared = szd;
+	dmu_objset_name(os, zd->zd_name);
+
+	if (szd != NULL)
+		szd->zd_seq = 0;
+
+	VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
+	mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	for (int i = 0; i < ZTEST_OBJECT_LOCKS; i++)
+		ztest_rll_init(&zd->zd_object_lock[i]);
+
+	for (int i = 0; i < ZTEST_RANGE_LOCKS; i++)
+		ztest_rll_init(&zd->zd_range_lock[i]);
+}
+
+/*
+ * Destroy every lock owned by a ztest dataset descriptor.
+ */
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+	mutex_destroy(&zd->zd_dirobj_lock);
+	(void) pthread_rwlock_destroy(&zd->zd_zilog_lock);
+
+	for (int i = 0; i < ZTEST_OBJECT_LOCKS; i++)
+		ztest_rll_destroy(&zd->zd_object_lock[i]);
+
+	for (int i = 0; i < ZTEST_RANGE_LOCKS; i++)
+		ztest_rll_destroy(&zd->zd_range_lock[i]);
+}
+
+/*
+ * Evaluates to TXG_NOWAIT when ztest_random(10) returns 0 and to TXG_WAIT
+ * otherwise.  A fresh random draw is made on every expansion.
+ */
+#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
+/*
+ * Assign 'tx' to a transaction group.
+ *
+ * Returns the (non-zero) txg on success.  On failure the tx is aborted
+ * and 0 is returned: ERESTART is waited out with dmu_tx_wait(), while
+ * anything else is asserted to be ENOSPC and recorded against 'tag'.
+ */
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+{
+	uint64_t txg;
+	int error = dmu_tx_assign(tx, txg_how);
+
+	if (error == 0) {
+		txg = dmu_tx_get_txg(tx);
+		ASSERT(txg != 0);
+		return (txg);
+	}
+
+	if (error != ERESTART) {
+		ASSERT3U(error, ==, ENOSPC);
+		ztest_record_enospc(tag);
+	} else {
+		/* Only a non-blocking assign may ask us to restart. */
+		ASSERT(txg_how == TXG_NOWAIT);
+		dmu_tx_wait(tx);
+	}
+
+	dmu_tx_abort(tx);
+	return (0);
+}
+
+/*
+ * Fill in a block tag: stamp it with BT_MAGIC and the id of 'os', and
+ * store the remaining arguments in the fields of the same name.
+ */
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+    uint64_t crtxg)
+{
+	bt->bt_magic = BT_MAGIC;
+	bt->bt_objset = dmu_objset_id(os);
+	bt->bt_object = object;
+	bt->bt_dnodesize = dnodesize;
+	bt->bt_offset = offset;
+	bt->bt_gen = gen;
+	bt->bt_txg = txg;
+	bt->bt_crtxg = crtxg;
+}
+
+/*
+ * Assert that a block tag is consistent with the expected values.  The
+ * magic, objset id, object, dnodesize, offset and crtxg must match
+ * exactly; 'gen' and 'txg' are upper bounds for the tag's generation and
+ * txg.  All checks are ASSERTs.
+ */
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+    uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+    uint64_t crtxg)
+{
+	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+	ASSERT3U(bt->bt_object, ==, object);
+	ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
+	ASSERT3U(bt->bt_offset, ==, offset);
+	ASSERT3U(bt->bt_gen, <=, gen);
+	ASSERT3U(bt->bt_txg, <=, txg);
+	ASSERT3U(bt->bt_crtxg, ==, crtxg);
+}
+
+/*
+ * Return a pointer to the block tag stored in the last
+ * sizeof (ztest_block_tag_t) bytes of the bonus area of 'db'.
+ */
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+	dmu_object_info_t doi;
+	char *bonus_end;
+
+	dmu_object_info_from_db(db, &doi);
+	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+	ASSERT3U(doi.doi_bonus_size, >=, sizeof (ztest_block_tag_t));
+
+	/* The tag occupies the tail of the bonus data. */
+	bonus_end = (char *)db->db_data + doi.doi_bonus_size;
+
+	return ((void *)(bonus_end - sizeof (ztest_block_tag_t)));
+}
+
+/*
+ * Generate a token to fill up unused bonus buffer space.  Try to make
+ * it unique to the object, generation, and offset to verify that data
+ * is not getting overwritten by data from other dnodes.
+ *
+ * The token is the bitwise OR of ds << 48, gen << 32, obj << 8 and
+ * offset.  The inputs are not masked, so large values spill into the
+ * neighbouring fields.
+ */
+#define	ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
+	(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
+
+/*
+ * Write a verifiable pattern into the bonus buffer from its start up to
+ * (but not including) 'end', which is where the block tag begins.  Each
+ * 64-bit word receives the fill token for its word index.  Keeping the
+ * whole bonus area non-zero helps ensure that dnode traversal code
+ * properly skips the interior regions of large dnodes.
+ */
+static void
+ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+    objset_t *os, uint64_t gen)
+{
+	uint64_t *words = db->db_data;
+	uint64_t *limit = end;
+	uint64_t i;
+
+	ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));
+
+	for (i = 0; &words[i] < limit; i++) {
+		words[i] = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+		    gen, i);
+	}
+}
+
+/*
+ * Check that every 64-bit word of the bonus buffer before 'end' still
+ * holds the fill token expected for its word index.  A mismatch trips
+ * VERIFY3U.
+ */
+static void
+ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+    objset_t *os, uint64_t gen)
+{
+	uint64_t *words = db->db_data;
+	uint64_t *limit = end;
+	uint64_t i;
+
+	for (i = 0; &words[i] < limit; i++) {
+		uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+		    gen, i);
+
+		VERIFY3U(words[i], ==, token);
+	}
+}
+
+/*
+ * ZIL logging ops
+ */
+
+/*
+ * ztest-specific names for lr_create_t members.  The create replay code
+ * uses them (as lr->lrz_type and so on) to carry the new object's type,
+ * block size, indirect block shift, bonus type and dnode size in the
+ * existing record fields named on the right.
+ */
+#define	lrz_type	lr_mode
+#define	lrz_blocksize	lr_uid
+#define	lrz_ibshift	lr_gid
+#define	lrz_bonustype	lr_rdev
+#define	lrz_dnodesize	lr_crtime[1]
+
+/*
+ * Log a TX_CREATE record for 'lr' (and the name that follows it) to the
+ * dataset's ZIL.  Nothing is logged while the ZIL is being replayed.
+ */
+static void
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t lrsize = sizeof (*lr) + strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_CREATE, lrsize);
+
+	/* Copy everything after the common header, name included. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, lrsize - sizeof (lr_t));
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * Log a TX_REMOVE record for 'lr' (and the name that follows it) to the
+ * dataset's ZIL, tagging the itx with the removed object's id.  Nothing
+ * is logged while the ZIL is being replayed.
+ */
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
+{
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	size_t lrsize = sizeof (*lr) + strlen(name) + 1;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_REMOVE, lrsize);
+
+	/* Copy everything after the common header, name included. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, lrsize - sizeof (lr_t));
+
+	itx->itx_oid = object;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * Log a TX_WRITE record for 'lr' to the dataset's ZIL using a randomly
+ * chosen write state.  Writes longer than zil_max_log_data() are always
+ * logged WR_INDIRECT.  For WR_COPIED the data is read back into the itx;
+ * if that read fails the record is rebuilt as WR_NEED_COPY.  About one
+ * record in eight is marked synchronous.  Nothing is logged while the
+ * ZIL is being replayed.
+ */
+static void
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+	uint64_t payload;
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
+		write_state = WR_INDIRECT;
+
+	/* Only a copied write carries its data inside the record. */
+	payload = (write_state == WR_COPIED ? lr->lr_length : 0);
+	itx = zil_itx_create(TX_WRITE, sizeof (*lr) + payload);
+
+	if (write_state == WR_COPIED) {
+		void *dst = ((lr_write_t *)&itx->itx_lr) + 1;
+
+		if (dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset,
+		    lr->lr_length, dst, DMU_READ_NO_PREFETCH) != 0) {
+			zil_itx_destroy(itx);
+			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+			write_state = WR_NEED_COPY;
+		}
+	}
+
+	itx->itx_private = zd;
+	itx->itx_wr_state = write_state;
+	itx->itx_sync = (ztest_random(8) == 0);
+
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+	    sizeof (*lr) - sizeof (lr_t));
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * Log a TX_TRUNCATE record for 'lr' to the dataset's ZIL as a
+ * non-synchronous itx.  Nothing is logged while the ZIL is being
+ * replayed.
+ */
+static void
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+	size_t body = sizeof (*lr) - sizeof (lr_t);
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+	itx->itx_sync = B_FALSE;
+
+	/* Copy everything after the common header. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, body);
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * Log a TX_SETATTR record for 'lr' to the dataset's ZIL as a
+ * non-synchronous itx.  Nothing is logged while the ZIL is being
+ * replayed.
+ */
+static void
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+	size_t body = sizeof (*lr) - sizeof (lr_t);
+	itx_t *itx;
+
+	if (zil_replaying(zd->zd_zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+	itx->itx_sync = B_FALSE;
+
+	/* Copy everything after the common header. */
+	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, body);
+
+	zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * ZIL replay ops
+ */
+/*
+ * Create (or, during replay, re-claim) an object and enter it into the
+ * ztest directory.  Installed as the TX_CREATE entry of
+ * ztest_replay_vector.
+ *
+ * arg1 is the ztest_ds_t of the dataset; arg2 is an lr_create_t followed
+ * by the NUL-terminated entry name.  When lr_foid is 0 a new object is
+ * allocated and its number stored back into lr_foid; otherwise that
+ * object number is claimed.
+ *
+ * Returns 0 on success, ENOSPC when the tx could not be assigned, or the
+ * error from the claim (asserted to be EEXIST).
+ */
+static int
+ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
+{
+	ztest_ds_t *zd = arg1;
+	lr_create_t *lr = arg2;
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	ztest_block_tag_t *bbt;
+	dmu_buf_t *db;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	int error = 0;
+	int bonuslen;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
+	tx = dmu_tx_create(os);
+
+	/* Hold the directory entry plus the object about to be created. */
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	} else {
+		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+	}
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0)
+		return (ENOSPC);
+
+	/* An object number is supplied exactly when the ZIL is replaying. */
+	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+	bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);
+
+	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = zap_create_dnsize(os,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    bonuslen, lr->lrz_dnodesize, tx);
+		} else {
+			error = zap_create_claim_dnsize(os, lr->lr_foid,
+			    lr->lrz_type, lr->lrz_bonustype,
+			    bonuslen, lr->lrz_dnodesize, tx);
+		}
+	} else {
+		if (lr->lr_foid == 0) {
+			lr->lr_foid = dmu_object_alloc_dnsize(os,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    bonuslen, lr->lrz_dnodesize, tx);
+		} else {
+			error = dmu_object_claim_dnsize(os, lr->lr_foid,
+			    lr->lrz_type, 0, lr->lrz_bonustype,
+			    bonuslen, lr->lrz_dnodesize, tx);
+		}
+	}
+
+	/* Only the claim paths set 'error'; the tx is still committed. */
+	if (error) {
+		ASSERT3U(error, ==, EEXIST);
+		ASSERT(zd->zd_zilog->zl_replay);
+		dmu_tx_commit(tx);
+		return (error);
+	}
+
+	ASSERT(lr->lr_foid != 0);
+
+	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+		    lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+	/* Stamp the block tag and fill the rest of the bonus buffer. */
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+	bbt = ztest_bt_bonus(db);
+	dmu_buf_will_dirty(db, tx);
+	ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
+	    lr->lr_gen, txg, txg);
+	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
+	dmu_buf_rele(db, FTAG);
+
+	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+	    &lr->lr_foid, tx));
+
+	(void) ztest_log_create(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Remove the named entry from the ztest directory and free the object it
+ * refers to.  Installed as the TX_REMOVE entry of ztest_replay_vector.
+ *
+ * arg1 is the ztest_ds_t of the dataset; arg2 is an lr_remove_t followed
+ * by the NUL-terminated entry name.  The object is write-locked for the
+ * duration.  Returns 0 on success or ENOSPC when the tx could not be
+ * assigned.
+ */
+static int
+ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+	ztest_ds_t *zd = arg1;
+	lr_remove_t *lr = arg2;
+	char *name = (void *)(lr + 1);		/* name follows lr */
+	objset_t *os = zd->zd_os;
+	dmu_object_info_t info;
+	uint64_t victim, txg;
+	dmu_tx_t *tx;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+	ASSERT(name[0] != '\0');
+
+	/* Resolve the name to the object number that is to be freed. */
+	VERIFY3U(0, ==,
+	    zap_lookup(os, lr->lr_doid, name, sizeof (victim), 1, &victim));
+	ASSERT(victim != 0);
+
+	ztest_object_lock(zd, victim, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_object_info(os, victim, &info));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+	dmu_tx_hold_free(tx, victim, 0, DMU_OBJECT_END);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		ztest_object_unlock(zd, victim);
+		return (ENOSPC);
+	}
+
+	/* ZAP objects have their own destructor. */
+	if (info.doi_type != DMU_OT_ZAP_OTHER) {
+		VERIFY3U(0, ==, dmu_object_free(os, victim, tx));
+	} else {
+		VERIFY3U(0, ==, zap_destroy(os, victim, tx));
+	}
+
+	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+	(void) ztest_log_remove(zd, tx, lr, victim);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, victim);
+
+	return (0);
+}
+
+/*
+ * Write the data that follows an lr_write_t to its object.  Installed as
+ * the TX_WRITE entry of ztest_replay_vector.
+ *
+ * arg1 is the ztest_ds_t of the dataset; arg2 is the lr_write_t with the
+ * data immediately after it.  If the data starts with a block tag, the
+ * tag is verified and regenerated against the object's bonus-buffer tag
+ * before being written.  The object is read-locked and the target range
+ * write-locked for the duration.
+ *
+ * Returns 0 on success or ENOSPC when the tx could not be assigned.
+ */
+static int
+ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+	ztest_ds_t *zd = arg1;
+	lr_write_t *lr = arg2;
+	objset_t *os = zd->zd_os;
+	void *data = lr + 1;			/* data follows lr */
+	uint64_t offset, length;
+	ztest_block_tag_t *bt = data;
+	ztest_block_tag_t *bbt;
+	uint64_t gen, txg, lrtxg, crtxg;
+	dmu_object_info_t doi;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	arc_buf_t *abuf = NULL;
+	rl_t *rl;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	/* If it's a dmu_sync() block, write the whole block */
+	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+		if (length < blocksize) {
+			offset -= offset % blocksize;
+			length = blocksize;
+		}
+	}
+
+	/* Accept a tag of either byte order; without a tag, bt is NULL. */
+	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+		byteswap_uint64_array(bt, sizeof (*bt));
+
+	if (bt->bt_magic != BT_MAGIC)
+		bt = NULL;
+
+	ztest_object_lock(zd, lr->lr_foid, RL_READER);
+	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	dmu_object_info_from_db(db, &doi);
+
+	/* The bonus-buffer tag supplies the reference gen and crtxg. */
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	gen = bbt->bt_gen;
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+	/*
+	 * For an aligned write of exactly one data block, borrow an ARC
+	 * buffer about one time in eight and write through it instead of
+	 * dmu_write().
+	 */
+	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+	    P2PHASE(offset, length) == 0)
+		abuf = dmu_request_arcbuf(db, length);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		/* Undo everything acquired above before giving up. */
+		if (abuf != NULL)
+			dmu_return_arcbuf(abuf);
+		dmu_buf_rele(db, FTAG);
+		ztest_range_unlock(rl);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	if (bt != NULL) {
+		/*
+		 * Usually, verify the old data before writing new data --
+		 * but not always, because we also want to verify correct
+		 * behavior when the data was not recently read into cache.
+		 */
+		ASSERT(offset % doi.doi_data_block_size == 0);
+		if (ztest_random(4) != 0) {
+			int prefetch = ztest_random(2) ?
+			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+			ztest_block_tag_t rbt;
+
+			VERIFY(dmu_read(os, lr->lr_foid, offset,
+			    sizeof (rbt), &rbt, prefetch) == 0);
+			if (rbt.bt_magic == BT_MAGIC) {
+				ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
+				    offset, gen, txg, crtxg);
+			}
+		}
+
+		/*
+		 * Writes can appear to be newer than the bonus buffer because
+		 * the ztest_get_data() callback does a dmu_read() of the
+		 * open-context data, which may be different than the data
+		 * as it was when the write was generated.
+		 */
+		if (zd->zd_zilog->zl_replay) {
+			ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
+			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+			    bt->bt_crtxg);
+		}
+
+		/*
+		 * Set the bt's gen/txg to the bonus buffer's gen/txg
+		 * so that all of the usual ASSERTs will work.
+		 */
+		ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
+		    crtxg);
+	}
+
+	if (abuf == NULL) {
+		dmu_write(os, lr->lr_foid, offset, length, data, tx);
+	} else {
+		bcopy(data, abuf->b_data, length);
+		dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx);
+	}
+
+	(void) ztest_log_write(zd, tx, lr);
+
+	dmu_buf_rele(db, FTAG);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+/*
+ * Free the byte range described by an lr_truncate_t.  Installed as the
+ * TX_TRUNCATE entry of ztest_replay_vector.
+ *
+ * arg1 is the ztest_ds_t of the dataset; arg2 is the lr_truncate_t.  The
+ * object is read-locked and the range write-locked for the duration.
+ * Returns 0 on success or ENOSPC when the tx could not be assigned.
+ */
+static int
+ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+	ztest_ds_t *zd = arg1;
+	lr_truncate_t *lr = arg2;
+	objset_t *os = zd->zd_os;
+	uint64_t object, start, len;
+	dmu_tx_t *tx;
+	rl_t *range;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	/* Read the record only after any byte swap has been applied. */
+	object = lr->lr_foid;
+	start = lr->lr_offset;
+	len = lr->lr_length;
+
+	ztest_object_lock(zd, object, RL_READER);
+	range = ztest_range_lock(zd, object, start, len, RL_WRITER);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, object, start, len);
+
+	if (ztest_tx_assign(tx, TXG_WAIT, FTAG) == 0) {
+		ztest_range_unlock(range);
+		ztest_object_unlock(zd, object);
+		return (ENOSPC);
+	}
+
+	VERIFY(dmu_free_range(os, object, start, len, tx) == 0);
+
+	(void) ztest_log_truncate(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_range_unlock(range);
+	ztest_object_unlock(zd, object);
+
+	return (0);
+}
+
+/*
+ * Resize an object's bonus buffer and rewrite its block tag.  Installed
+ * as the TX_SETATTR entry of ztest_replay_vector.
+ *
+ * arg1 is the ztest_ds_t of the dataset; arg2 is an lr_setattr_t whose
+ * lr_size carries the new bonus size and lr_mode the new generation.
+ * During replay both come from the record; otherwise a random size is
+ * chosen and the generation is the current one plus one.  The object is
+ * write-locked for the duration.
+ *
+ * Returns 0 on success or ENOSPC when the tx could not be assigned.
+ */
+static int
+ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+	ztest_ds_t *zd = arg1;
+	lr_setattr_t *lr = arg2;
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	dmu_buf_t *db;
+	ztest_block_tag_t *bbt;
+	uint64_t txg, lrtxg, crtxg, dnodesize;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+	if (txg == 0) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, lr->lr_foid);
+		return (ENOSPC);
+	}
+
+	/* Remember the current tag's values before the bonus is resized. */
+	bbt = ztest_bt_bonus(db);
+	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+	crtxg = bbt->bt_crtxg;
+	lrtxg = lr->lr_common.lrc_txg;
+	dnodesize = bbt->bt_dnodesize;
+
+	if (zd->zd_zilog->zl_replay) {
+		ASSERT(lr->lr_size != 0);
+		ASSERT(lr->lr_mode != 0);
+		ASSERT(lrtxg != 0);
+	} else {
+		/*
+		 * Randomly change the size and increment the generation.
+		 */
+		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+		    sizeof (*bbt);
+		lr->lr_mode = bbt->bt_gen + 1;
+		ASSERT(lrtxg == 0);
+	}
+
+	/*
+	 * Verify that the current bonus buffer is not newer than our txg.
+	 */
+	ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
+	    MAX(txg, lrtxg), crtxg);
+
+	dmu_buf_will_dirty(db, tx);
+
+	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+	ASSERT3U(lr->lr_size, <=, db->db_size);
+	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
+	/* The tag sits at the end of the bonus area, so look it up again. */
+	bbt = ztest_bt_bonus(db);
+
+	ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
+	    txg, crtxg);
+	ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
+	dmu_buf_rele(db, FTAG);
+
+	(void) ztest_log_setattr(zd, tx, lr);
+
+	dmu_tx_commit(tx);
+
+	ztest_object_unlock(zd, lr->lr_foid);
+
+	return (0);
+}
+
+/*
+ * ZIL replay dispatch table, indexed by transaction type and handed to
+ * zil_replay() by ztest_zil_remount().  A NULL slot means ztest supplies
+ * no replay function for that record type.
+ */
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+	NULL,			/* 0 no such transaction type */
+	ztest_replay_create,	/* TX_CREATE */
+	NULL,			/* TX_MKDIR */
+	NULL,			/* TX_MKXATTR */
+	NULL,			/* TX_SYMLINK */
+	ztest_replay_remove,	/* TX_REMOVE */
+	NULL,			/* TX_RMDIR */
+	NULL,			/* TX_LINK */
+	NULL,			/* TX_RENAME */
+	ztest_replay_write,	/* TX_WRITE */
+	ztest_replay_truncate,	/* TX_TRUNCATE */
+	ztest_replay_setattr,	/* TX_SETATTR */
+	NULL,			/* TX_ACL */
+	NULL,			/* TX_CREATE_ACL */
+	NULL,			/* TX_CREATE_ATTR */
+	NULL,			/* TX_CREATE_ACL_ATTR */
+	NULL,			/* TX_MKDIR_ACL */
+	NULL,			/* TX_MKDIR_ATTR */
+	NULL,			/* TX_MKDIR_ACL_ATTR */
+	NULL,			/* TX_WRITE2 */
+};
+
+/*
+ * ZIL get_data callbacks
+ */
+
+/*
+ * Tear down the state built by ztest_get_data(): release the held dbuf
+ * (if any), drop the range and object locks, and free the zgd itself.
+ * Used both as the dmu_sync() completion callback and on the direct
+ * return path of ztest_get_data().  'error' is not examined.
+ */
+/* ARGSUSED */
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+	rl_t *range = (rl_t *)zgd->zgd_lr;
+	ztest_ds_t *zd = zgd->zgd_private;
+	/* Read the object number before the range lock is given up. */
+	uint64_t object = range->rl_object;
+
+	if (zgd->zgd_db != NULL)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	ztest_range_unlock(range);
+	ztest_object_unlock(zd, object);
+
+	umem_free(zgd, sizeof (*zgd));
+}
+
+/*
+ * ZIL get_data callback (registered through zil_open() in
+ * ztest_zil_remount()).  Supplies the data for the write record 'lr'.
+ *
+ * If 'buf' is non-NULL the data is copied into it with dmu_read()
+ * (immediate write).  Otherwise the block containing the offset is held
+ * and passed to dmu_sync() with ztest_get_done() as the completion
+ * callback and zgd_bp pointing at lr->lr_blkptr.
+ *
+ * Returns 0 on success, ENOENT if the object's creation txg is zero or
+ * greater than the record's txg, or the error from dmu_bonus_hold(),
+ * dmu_read(), dmu_buf_hold() or dmu_sync().
+ */
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
+    zio_t *zio)
+{
+	ztest_ds_t *zd = arg;
+	objset_t *os = zd->zd_os;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	uint64_t txg = lr->lr_common.lrc_txg;
+	uint64_t crtxg;
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	ztest_object_lock(zd, object, RL_READER);
+	error = dmu_bonus_hold(os, object, FTAG, &db);
+	if (error) {
+		ztest_object_unlock(zd, object);
+		return (error);
+	}
+
+	crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+	/* Creation txg is unset or later than this record's txg. */
+	if (crtxg == 0 || crtxg > txg) {
+		dmu_buf_rele(db, FTAG);
+		ztest_object_unlock(zd, object);
+		return (ENOENT);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	dmu_buf_rele(db, FTAG);
+	db = NULL;
+
+	/* From here on, cleanup goes through ztest_get_done(). */
+	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zd;
+
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
+		    object, offset, size, RL_READER);
+
+		error = dmu_read(os, object, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+		ASSERT(error == 0);
+	} else {
+		/* Widen the request to the whole data block at 'offset'. */
+		size = doi.doi_data_block_size;
+		if (ISP2(size)) {
+			offset = P2ALIGN(offset, size);
+		} else {
+			ASSERT(offset < size);
+			offset = 0;
+		}
+
+		zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
+		    object, offset, size, RL_READER);
+
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    ztest_get_done, zgd);
+
+			/*
+			 * On success ztest_get_done() was handed to
+			 * dmu_sync() as its callback, so it is not called
+			 * here.
+			 */
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	ztest_get_done(zgd, error);
+
+	return (error);
+}
+
+/*
+ * Allocate a zeroed log record of 'lrsize' bytes.  When 'name' is given,
+ * extra room is reserved directly after the record and the string
+ * (including its terminating NUL) is copied there.  Release the result
+ * with ztest_lr_free() using the same lrsize and name.
+ */
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+	size_t namesize = 0;
+	char *buf;
+
+	if (name != NULL)
+		namesize = strlen(name) + 1;
+
+	buf = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+	if (name != NULL)
+		bcopy(name, buf + lrsize, namesize);
+
+	return (buf);
+}
+
+/*
+ * Free a log record obtained from ztest_lr_alloc().  'lrsize' and 'name'
+ * must match the allocation so that the same total size is computed.
+ */
+static void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+	size_t total = lrsize;
+
+	if (name != NULL)
+		total += strlen(name) + 1;
+
+	umem_free(lr, total);
+}
+
+/*
+ * Lookup a bunch of objects.  Returns the number of objects not found.
+ *
+ * Each of the 'count' entries in od[] is looked up by od_name in the ZAP
+ * directory od_dir.  For every object found, od_object, od_type,
+ * od_blocksize and od_gen are filled in from the object's info and the
+ * block tag in its bonus buffer; for every object not found, od_object is
+ * left at 0.  The caller must hold zd_dirobj_lock.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+	int i;
+
+	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+	for (i = 0; i < count; i++, od++) {
+		od->od_object = 0;
+		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+		    sizeof (uint64_t), 1, &od->od_object);
+		if (error) {
+			/* ENOENT is the only lookup failure expected here. */
+			ASSERT(error == ENOENT);
+			ASSERT(od->od_object == 0);
+			missing++;
+		} else {
+			dmu_buf_t *db;
+			ztest_block_tag_t *bbt;
+			dmu_object_info_t doi;
+
+			ASSERT(od->od_object != 0);
+			ASSERT(missing == 0);	/* there should be no gaps */
+
+			ztest_object_lock(zd, od->od_object, RL_READER);
+			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+			    od->od_object, FTAG, &db));
+			dmu_object_info_from_db(db, &doi);
+			bbt = ztest_bt_bonus(db);
+			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+			od->od_type = doi.doi_type;
+			od->od_blocksize = doi.doi_data_block_size;
+			od->od_gen = bbt->bt_gen;
+			dmu_buf_rele(db, FTAG);
+			ztest_object_unlock(zd, od->od_object);
+		}
+	}
+
+	return (missing);
+}
+
+/*
+ * Create the 'count' objects described by od[], in array order, by
+ * building a create record for each and applying it through
+ * ztest_replay_create().  After the first failure no further creations
+ * are attempted; the failed entry and every later one get od_object = 0.
+ *
+ * The caller must hold zd_dirobj_lock.  Returns the number of objects
+ * that were not created.
+ */
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int i;
+
+	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+	for (i = 0; i < count; i++, od++) {
+		/* An earlier creation failed: just count the rest. */
+		if (missing) {
+			od->od_object = 0;
+			missing++;
+			continue;
+		}
+
+		/* The object name is stored right after the record. */
+		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
+		lr->lrz_type = od->od_crtype;
+		lr->lrz_blocksize = od->od_crblocksize;
+		lr->lrz_ibshift = ztest_random_ibshift();
+		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+		lr->lrz_dnodesize = od->od_crdnodesize;
+		lr->lr_gen = od->od_crgen;
+		lr->lr_crtime[0] = time(NULL);
+
+		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+			ASSERT(missing == 0);
+			od->od_object = 0;
+			missing++;
+		} else {
+			/* lr_foid now holds the object number to record. */
+			od->od_object = lr->lr_foid;
+			od->od_type = od->od_crtype;
+			od->od_blocksize = od->od_crblocksize;
+			od->od_gen = od->od_crgen;
+			ASSERT(od->od_object != 0);
+		}
+
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+/*
+ * Remove the objects described by od[], walking the array from the last
+ * entry to the first.  Entries whose od_object is 0 are skipped.  After
+ * the first failed removal no further removals are attempted and every
+ * remaining entry is counted as missing.
+ *
+ * The caller must hold zd_dirobj_lock.  Returns the number of entries
+ * counted as missing (0 when every attempted removal succeeded).
+ */
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+	int missing = 0;
+	int error;
+	int i;
+
+	ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+	/* Start at the last template and work backwards. */
+	od += count - 1;
+
+	for (i = count - 1; i >= 0; i--, od--) {
+		if (missing) {
+			missing++;
+			continue;
+		}
+
+		/*
+		 * No object was found.
+		 */
+		if (od->od_object == 0)
+			continue;
+
+		/* The object name is stored right after the record. */
+		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+		lr->lr_doid = od->od_dir;
+
+		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+			/* ENOSPC is the only failure expected here. */
+			ASSERT3U(error, ==, ENOSPC);
+			missing++;
+		} else {
+			od->od_object = 0;
+		}
+		ztest_lr_free(lr, sizeof (*lr), od->od_name);
+	}
+
+	return (missing);
+}
+
+/*
+ * Write 'size' bytes from 'data' to 'object' at 'offset' by building a
+ * write record (header followed by the payload) and applying it through
+ * ztest_replay_write().  Returns that function's result.
+ */
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+    void *data)
+{
+	size_t lrsize = sizeof (lr_write_t) + size;
+	lr_write_t *lr = ztest_lr_alloc(lrsize, NULL);
+	int error;
+
+	BP_ZERO(&lr->lr_blkptr);
+	lr->lr_blkoff = 0;
+	lr->lr_length = size;
+	lr->lr_offset = offset;
+	lr->lr_foid = object;
+
+	/* The payload sits immediately after the fixed-size record. */
+	bcopy(data, lr + 1, size);
+
+	error = ztest_replay_write(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, lrsize, NULL);
+
+	return (error);
+}
+
+/*
+ * Free 'size' bytes of 'object' starting at 'offset' by building a
+ * truncate record and applying it through ztest_replay_truncate().
+ * Returns that function's result.
+ */
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	lr_truncate_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
+	int error;
+
+	lr->lr_length = size;
+	lr->lr_offset = offset;
+	lr->lr_foid = object;
+
+	error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+/*
+ * Apply a setattr operation to 'object' through ztest_replay_setattr().
+ * lr_size and lr_mode are passed as 0; the replay function fills them in
+ * when it is not replaying.  Returns that function's result.
+ */
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+	lr_setattr_t *lr = ztest_lr_alloc(sizeof (*lr), NULL);
+	int error;
+
+	lr->lr_mode = 0;
+	lr->lr_size = 0;
+	lr->lr_foid = object;
+
+	error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+	ztest_lr_free(lr, sizeof (*lr), NULL);
+
+	return (error);
+}
+
+/*
+ * Preallocate 'size' bytes of 'object' at 'offset' with dmu_prealloc()
+ * and wait for the assigned txg to sync.  If ztest_tx_assign() returns 0,
+ * the range is freed with dmu_free_long_range() instead.
+ */
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+	objset_t *os = zd->zd_os;
+	dmu_tx_t *tx;
+	uint64_t txg;
+	rl_t *rl;
+
+	txg_wait_synced(dmu_objset_pool(os), 0);
+
+	/* Object lock as reader; range lock as writer over the target span. */
+	ztest_object_lock(zd, object, RL_READER);
+	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, object, offset, size);
+
+	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+	if (txg != 0) {
+		dmu_prealloc(os, object, offset, size, tx);
+		dmu_tx_commit(tx);
+		txg_wait_synced(dmu_objset_pool(os), txg);
+	} else {
+		/* No txg was assigned; the result of the free is ignored. */
+		(void) dmu_free_long_range(os, object, offset, size);
+	}
+
+	ztest_range_unlock(rl);
+	ztest_object_unlock(zd, object);
+}
+
+/*
+ * Perform one randomly chosen I/O operation on 'object' at 'offset'.
+ * Half of the time the random choice is overridden with a block-tag
+ * write.  The operation itself runs with zd_zilog_lock held as reader,
+ * and the results of the individual write/truncate/setattr helpers are
+ * ignored.
+ */
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+	dmu_object_info_t doi;
+	ztest_block_tag_t tag;
+	enum ztest_io_type io_type;
+	uint64_t blocksize;
+	void *data;
+	int err;
+
+	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+	blocksize = doi.doi_data_block_size;
+	data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+	/*
+	 * Draw a random i/o type, then flip a coin that forces a block-tag
+	 * write, which biases the mix toward tags.
+	 */
+	io_type = ztest_random(ZTEST_IO_TYPES);
+	if (ztest_random(2) == 0)
+		io_type = ZTEST_IO_WRITE_TAG;
+
+	(void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);
+
+	switch (io_type) {
+	case ZTEST_IO_WRITE_TAG:
+		ztest_bt_generate(&tag, zd->zd_os, object, doi.doi_dnodesize,
+		    offset, 0, 0, 0);
+		(void) ztest_write(zd, object, offset, sizeof (tag), &tag);
+		break;
+
+	case ZTEST_IO_WRITE_PATTERN:
+		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
+		if (ztest_random(2) == 0) {
+			/*
+			 * Induce fletcher2 collisions to ensure that
+			 * zio_ddt_collision() detects and resolves them
+			 * when using fletcher2-verify for deduplication.
+			 */
+			((uint64_t *)data)[0] ^= 1ULL << 63;
+			((uint64_t *)data)[4] ^= 1ULL << 63;
+		}
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_WRITE_ZEROES:
+		bzero(data, blocksize);
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	case ZTEST_IO_TRUNCATE:
+		(void) ztest_truncate(zd, object, offset, blocksize);
+		break;
+
+	case ZTEST_IO_SETATTR:
+		(void) ztest_setattr(zd, object);
+		break;
+
+	case ZTEST_IO_REWRITE:
+		/*
+		 * Change the checksum and compression properties, then read
+		 * the block back and write the same data out again.
+		 */
+		(void) pthread_rwlock_rdlock(&ztest_name_lock);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_COMPRESSION,
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
+		    DMU_READ_NO_PREFETCH));
+
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
+
+	default:
+		/* Any other value performs no i/o. */
+		break;
+	}
+
+	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+
+	umem_free(data, blocksize);
+}
+
+/*
+ * Fill in an object description template.  The creation parameters come
+ * from the arguments, with a blocksize or dnodesize of 0 replaced by a
+ * random choice; the "current object" fields start out empty.  The name
+ * is formatted as tag(id)[index].
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+    dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
+    uint64_t gen)
+{
+	/* Nothing has been looked up or created yet. */
+	od->od_object = 0;
+	od->od_type = DMU_OT_NONE;
+	od->od_blocksize = 0;
+	od->od_gen = 0;
+
+	od->od_dir = ZTEST_DIROBJ;
+	od->od_crtype = type;
+	od->od_crgen = gen;
+
+	if (blocksize != 0)
+		od->od_crblocksize = blocksize;
+	else
+		od->od_crblocksize = ztest_random_blocksize();
+
+	if (dnodesize != 0)
+		od->od_crdnodesize = dnodesize;
+	else
+		od->od_crdnodesize = ztest_random_dnodesize();
+
+	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+	    tag, (longlong_t)id, (u_longlong_t)index);
+}
+
+/*
+ * Look up the objects named by the od template array (of 'size' bytes).
+ * When any of them is missing, or when 'remove' is set, the existing
+ * objects are removed and a fresh set is created; otherwise the objects
+ * found are used as they are.  The template is recorded in zd->zd_od
+ * either way.
+ *
+ * Returns 0 on success, or -1 if the removal or the creation did not
+ * fully succeed.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+	int count = size / sizeof (*od);
+	int rv = 0;
+
+	mutex_enter(&zd->zd_dirobj_lock);
+	if (ztest_lookup(zd, od, count) != 0 || remove) {
+		/* Creation is attempted only after a clean removal. */
+		if (ztest_remove(zd, od, count) != 0)
+			rv = -1;
+		else if (ztest_create(zd, od, count) != 0)
+			rv = -1;
+	}
+	zd->zd_od = od;
+	mutex_exit(&zd->zd_dirobj_lock);
+
+	return (rv);
+}
+
+/*
+ * Commit the ZIL for a randomly chosen object number and record the
+ * resulting commit sequence number in the shared dataset state.  Runs
+ * with zd_zilog_lock held as reader.  'id' is unused.
+ */
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+	zilog_t *zilog = zd->zd_zilog;
+
+	(void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);
+
+	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
+
+	/*
+	 * Remember the committed values in zd, which is in parent/child
+	 * shared memory.  If we die, the next iteration of ztest_run()
+	 * will verify that the log really does contain this record.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(zd->zd_shared != NULL);
+	/* The recorded sequence number must never move backwards. */
+	ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
+	zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
+	mutex_exit(&zilog->zl_lock);
+
+	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation.  We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ *
+ * Locks are taken in the order ztest_vdev_lock, zd_dirobj_lock,
+ * zd_zilog_lock (as writer) and released in the reverse order.  'id' is
+ * unused.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+
+	/*
+	 * We hold the ztest_vdev_lock so we don't cause problems with
+	 * other threads that wish to remove a log device, such as
+	 * ztest_device_removal().
+	 */
+	mutex_enter(&ztest_vdev_lock);
+
+	/*
+	 * We grab the zd_dirobj_lock to ensure that no other thread is
+	 * updating the zil (i.e. adding in-memory log records) and the
+	 * zd_zilog_lock to block any I/O.
+	 */
+	mutex_enter(&zd->zd_dirobj_lock);
+	(void) pthread_rwlock_wrlock(&zd->zd_zilog_lock);
+
+	/* zfsvfs_teardown() */
+	zil_close(zd->zd_zilog);
+
+	/*
+	 * zfsvfs_setup(): reopening must hand back the same zilog that was
+	 * just closed, after which the log is replayed.
+	 */
+	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+	zil_replay(os, zd, ztest_replay_vector);
+
+	(void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+	mutex_exit(&zd->zd_dirobj_lock);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ *
+ * Does nothing when the MMP test option is set.  'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_opts_t *zo = &ztest_opts;
+	spa_t *spa;
+	nvlist_t *nvroot;
+
+	if (zo->zo_mmp_test)
+		return;
+
+	/*
+	 * Attempt to create using a bad file.
+	 */
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+	nvlist_free(nvroot);
+
+	/*
+	 * Attempt to create using a bad mirror.
+	 */
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
+	VERIFY3U(ENOENT, ==,
+	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+	nvlist_free(nvroot);
+
+	/*
+	 * Attempt to create an existing pool.  It shouldn't matter
+	 * what's in the nvroot; we should fail with EEXIST.
+	 *
+	 * ztest_name_lock is held as reader from here until the end of
+	 * the destroy attempt below.
+	 */
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+	VERIFY3U(EEXIST, ==,
+	    spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
+	nvlist_free(nvroot);
+
+	/*
+	 * We open a reference to the spa and then we try to export it
+	 * expecting one of the following errors:
+	 *
+	 * EBUSY
+	 *	Because of the reference we just opened.
+	 *
+	 * ZFS_ERR_EXPORT_IN_PROGRESS
+	 *	For the case that there is another ztest thread doing
+	 *	an export concurrently.
+	 */
+	VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
+	int error = spa_destroy(zo->zo_pool);
+	if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) {
+		/* Any other result, including success, is fatal. */
+		fatal(0, "spa_destroy(%s) returned unexpected value %d",
+		    spa->spa_name, error);
+	}
+	spa_close(spa, FTAG);
+
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Start and then stop the MMP threads to ensure the startup and shutdown code
+ * works properly.  Actual protection and property-related code tested via ZTS.
+ *
+ * Does nothing when the MMP test option is set or the pool is suspended.
+ * 'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_opts_t *zo = &ztest_opts;
+	spa_t *spa = ztest_spa;
+
+	if (zo->zo_mmp_test)
+		return;
+
+	/*
+	 * Since enabling MMP involves setting a property, it could not be done
+	 * while the pool is suspended.
+	 */
+	if (spa_suspended(spa))
+		return;
+
+	/* Enable: flip spa_multihost and start the thread under both locks. */
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	mutex_enter(&spa->spa_props_lock);
+
+	zfs_multihost_fail_intervals = 0;
+
+	if (!spa_multihost(spa)) {
+		spa->spa_multihost = B_TRUE;
+		mmp_thread_start(spa);
+	}
+
+	mutex_exit(&spa->spa_props_lock);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	/* Wait for a sync on either side of signalling the MMP threads. */
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	mmp_signal_all_threads();
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/* Disable: stop the thread and clear spa_multihost again. */
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	mutex_enter(&spa->spa_props_lock);
+
+	if (spa_multihost(spa)) {
+		mmp_thread_stop(spa);
+		spa->spa_multihost = B_FALSE;
+	}
+
+	mutex_exit(&spa->spa_props_lock);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * Create a scratch pool named "<pool>_upgrade" at a random pre-feature-flags
+ * SPA version, upgrade it to a randomly chosen later version, and verify
+ * that both the in-core version and the version stored in the pool config
+ * reflect the upgrade.
+ *
+ * Does nothing when the MMP test option is set.  Runs with ztest_vdev_lock
+ * held.  'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa;
+	uint64_t initial_version = SPA_VERSION_INITIAL;
+	uint64_t version, newversion;
+	nvlist_t *nvroot, *props;
+	char *name;
+
+	if (ztest_opts.zo_mmp_test)
+		return;
+
+	mutex_enter(&ztest_vdev_lock);
+	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(name);
+
+	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
+	    NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+
+	/*
+	 * If we're configuring a RAIDZ device then make sure that the
+	 * initial version is capable of supporting that feature.
+	 *
+	 * Parity values other than 0-3 leave initial_version at its
+	 * SPA_VERSION_INITIAL default.
+	 */
+	switch (ztest_opts.zo_raidz_parity) {
+	case 0:
+	case 1:
+		initial_version = SPA_VERSION_INITIAL;
+		break;
+	case 2:
+		initial_version = SPA_VERSION_RAIDZ2;
+		break;
+	case 3:
+		initial_version = SPA_VERSION_RAIDZ3;
+		break;
+	}
+
+	/*
+	 * Create a pool with a spa version that can be upgraded. Pick
+	 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
+	 */
+	do {
+		version = ztest_random_spa_version(initial_version);
+	} while (version > SPA_VERSION_BEFORE_FEATURES);
+
+	props = fnvlist_alloc();
+	fnvlist_add_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
+	VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0);
+	fnvlist_free(nvroot);
+	fnvlist_free(props);
+
+	VERIFY3S(spa_open(name, &spa, FTAG), ==, 0);
+	VERIFY3U(spa_version(spa), ==, version);
+	newversion = ztest_random_spa_version(version + 1);
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("upgrading spa version from %llu to %llu\n",
+		    (u_longlong_t)version, (u_longlong_t)newversion);
+	}
+
+	spa_upgrade(spa, newversion);
+	/* The version must have grown and must match the pool config. */
+	VERIFY3U(spa_version(spa), >, version);
+	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
+	spa_close(spa, FTAG);
+
+	kmem_strfree(name);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Try to checkpoint the pool.  Success and the "device removal in
+ * progress", "discarding checkpoint" and "checkpoint exists" errors are
+ * accepted silently, ENOSPC is recorded, and anything else is fatal.
+ * The caller must hold ztest_checkpoint_lock.
+ */
+static void
+ztest_spa_checkpoint(spa_t *spa)
+{
+	int error;
+
+	ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+	error = spa_checkpoint(spa->spa_name);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+		return;
+	}
+
+	if (error != 0 &&
+	    error != ZFS_ERR_DEVRM_IN_PROGRESS &&
+	    error != ZFS_ERR_DISCARDING_CHECKPOINT &&
+	    error != ZFS_ERR_CHECKPOINT_EXISTS)
+		fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
+}
+
+/*
+ * Try to discard the pool's checkpoint.  Success and the "discarding
+ * checkpoint" and "no checkpoint" errors are accepted; any other result
+ * is fatal.  The caller must hold ztest_checkpoint_lock.
+ */
+static void
+ztest_spa_discard_checkpoint(spa_t *spa)
+{
+	ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+	int error = spa_checkpoint_discard(spa->spa_name);
+
+	switch (error) {
+	case 0:
+	case ZFS_ERR_DISCARDING_CHECKPOINT:
+	case ZFS_ERR_NO_CHECKPOINT:
+		break;
+	default:
+		/* Name the function that was actually called. */
+		fatal(0, "spa_checkpoint_discard(%s) = %d",
+		    spa->spa_name, error);
+	}
+}
+
+/*
+ * With equal probability, either checkpoint the pool or discard its
+ * checkpoint, holding ztest_checkpoint_lock around the operation.
+ * 'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *pool = ztest_spa;
+
+	mutex_enter(&ztest_checkpoint_lock);
+	if (ztest_random(2) != 0)
+		ztest_spa_discard_checkpoint(pool);
+	else
+		ztest_spa_checkpoint(pool);
+	mutex_exit(&ztest_checkpoint_lock);
+}
+
+
+/*
+ * Depth-first search of the vdev tree rooted at 'vd' for a vdev whose
+ * vdev_path equals 'path'.  Returns the first match, or NULL if there is
+ * none.
+ */
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+	int i;
+
+	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+		return (vd);
+
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *found = vdev_lookup_by_path(vd->vdev_child[i], path);
+
+		if (found != NULL)
+			return (found);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Return the number of children of the pool's root vdev.  The caller is
+ * expected to hold SCL_VDEV as reader.
+ */
+static int
+spa_num_top_vdevs(spa_t *spa)
+{
+	ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV);
+
+	return (spa->spa_root_vdev->vdev_children);
+}
+
+/*
+ * Verify that spa_vdev_add() and slog removal via spa_vdev_remove() work
+ * as expected.
+ *
+ * If the pool has slogs, one quarter of the time a log vdev is removed;
+ * otherwise a new top-level vdev is added, one quarter of which are log
+ * devices.  Does nothing when the MMP test option is set.  Runs with
+ * ztest_vdev_lock held.  'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	uint64_t leaves;
+	uint64_t guid;
+	nvlist_t *nvroot;
+	int error;
+
+	if (ztest_opts.zo_mmp_test)
+		return;
+
+	mutex_enter(&ztest_vdev_lock);
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* Record the next leaf number: top-level vdev count times leaves. */
+	ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
+
+	/*
+	 * If we have slogs then remove them 1/4 of the time.
+	 */
+	if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+		metaslab_group_t *mg;
+
+		/*
+		 * find the first real slog in log allocation class
+		 */
+		mg =  spa_log_class(spa)->mc_rotor;
+		while (!mg->mg_vd->vdev_islog)
+			mg = mg->mg_next;
+
+		guid = mg->mg_vd->vdev_guid;
+
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+
+		/*
+		 * We have to grab the ztest_name_lock as writer to
+		 * prevent a race between removing a slog (dmu_objset_find)
+		 * and destroying a dataset. Removing the slog will
+		 * grab a reference on the dataset which may cause
+		 * dsl_destroy_head() to fail with EBUSY thus
+		 * leaving the dataset in an inconsistent state.
+		 */
+		pthread_rwlock_wrlock(&ztest_name_lock);
+		error = spa_vdev_remove(spa, guid, B_FALSE);
+		pthread_rwlock_unlock(&ztest_name_lock);
+
+		switch (error) {
+		case 0:
+		case EEXIST:	/* Generic zil_reset() error */
+		case EBUSY:	/* Replay required */
+		case EACCES:	/* Crypto key not loaded */
+		case ZFS_ERR_CHECKPOINT_EXISTS:
+		case ZFS_ERR_DISCARDING_CHECKPOINT:
+			break;
+		default:
+			fatal(0, "spa_vdev_remove() = %d", error);
+		}
+	} else {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+
+		/*
+		 * Make 1/4 of the devices be log devices
+		 */
+		nvroot = make_vdev_root(NULL, NULL, NULL,
+		    ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
+		    "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+
+		error = spa_vdev_add(spa, nvroot);
+		nvlist_free(nvroot);
+
+		/* Only success and ENOSPC are tolerated. */
+		switch (error) {
+		case 0:
+			break;
+		case ENOSPC:
+			ztest_record_enospc("spa_vdev_add");
+			break;
+		default:
+			fatal(0, "spa_vdev_add() = %d", error);
+		}
+	}
+
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Add a mirrored top-level vdev to an allocation class chosen at random
+ * between "special" and "dedup".  After a successful add, when the special
+ * class has exactly one group, small blocks are enabled on the dataset
+ * half of the time.
+ *
+ * Skipped when class vdevs are turned off, half of the time when they are
+ * set to random, when the pool has fewer than two mirrors, or when the
+ * allocation_classes feature is not enabled.  'id' is unused.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	uint64_t leaves;
+	nvlist_t *nvroot;
+	const char *class = (ztest_random(2) == 0) ?
+	    VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
+	int error;
+
+	/*
+	 * By default add a special vdev 50% of the time
+	 */
+	if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
+	    (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
+	    ztest_random(2) == 0)) {
+		return;
+	}
+
+	mutex_enter(&ztest_vdev_lock);
+
+	/* Only test with mirrors */
+	if (zs->zs_mirrors < 2) {
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/* requires feature@allocation_classes */
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+	/* Record the next leaf number: top-level vdev count times leaves. */
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+	    class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+
+	error = spa_vdev_add(spa, nvroot);
+	nvlist_free(nvroot);
+
+	/* Only success and ENOSPC are tolerated. */
+	if (error == ENOSPC)
+		ztest_record_enospc("spa_vdev_add");
+	else if (error != 0)
+		fatal(0, "spa_vdev_add() = %d", error);
+
+	/*
+	 * 50% of the time allow small blocks in the special class
+	 */
+	if (error == 0 &&
+	    spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
+		if (ztest_opts.zo_verbose >= 3)
+			(void) printf("Enabling special VDEV small blocks\n");
+		(void) ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
+	}
+
+	mutex_exit(&ztest_vdev_lock);
+
+	/*
+	 * NOTE(review): this message is printed even when spa_vdev_add()
+	 * returned ENOSPC, and mc_groups is read after ztest_vdev_lock has
+	 * been dropped -- confirm both are intended.
+	 */
+	if (ztest_opts.zo_verbose >= 3) {
+		metaslab_class_t *mc;
+
+		if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+			mc = spa_special_class(spa);
+		else
+			mc = spa_dedup_class(spa);
+		(void) printf("Added a %s mirrored vdev (of %d)\n",
+		    class, (int)mc->mc_groups);
+	}
+}
+
+/*
+ * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+ *
+ * The aux class (spares or l2cache) is chosen at random.  If that class
+ * already has devices, one quarter of the time a random one is removed;
+ * otherwise an unused device path is found and a new device is added.
+ * Does nothing when the MMP test option is set.  'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	spa_aux_vdev_t *sav;
+	char *aux;
+	char *path;
+	uint64_t guid = 0;
+	int error;
+
+	if (ztest_opts.zo_mmp_test)
+		return;
+
+	path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	if (ztest_random(2) == 0) {
+		sav = &spa->spa_spares;
+		aux = ZPOOL_CONFIG_SPARES;
+	} else {
+		sav = &spa->spa_l2cache;
+		aux = ZPOOL_CONFIG_L2CACHE;
+	}
+
+	mutex_enter(&ztest_vdev_lock);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	if (sav->sav_count != 0 && ztest_random(4) == 0) {
+		/*
+		 * Pick a random device to remove.
+		 */
+		guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
+	} else {
+		/*
+		 * Find an unused device we can add.
+		 */
+		zs->zs_vdev_aux = 0;
+		for (;;) {
+			int c;
+			(void) snprintf(path, MAXPATHLEN, ztest_aux_template,
+			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
+			    zs->zs_vdev_aux);
+			/* Is the path already in use by this aux class? */
+			for (c = 0; c < sav->sav_count; c++)
+				if (strcmp(sav->sav_vdevs[c]->vdev_path,
+				    path) == 0)
+					break;
+			/* ...or by any vdev in the main pool tree? */
+			if (c == sav->sav_count &&
+			    vdev_lookup_by_path(rvd, path) == NULL)
+				break;
+			zs->zs_vdev_aux++;
+		}
+	}
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	/* guid stays 0 unless a device was picked for removal above. */
+	if (guid == 0) {
+		/*
+		 * Add a new device.
+		 */
+		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
+		    (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
+		error = spa_vdev_add(spa, nvroot);
+
+		switch (error) {
+		case 0:
+			break;
+		default:
+			fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
+		}
+		nvlist_free(nvroot);
+	} else {
+		/*
+		 * Remove an existing device.  Sometimes, dirty its
+		 * vdev state first to make sure we handle removal
+		 * of devices that have pending state changes.
+		 */
+		if (ztest_random(2) == 0)
+			(void) vdev_online(spa, guid, 0, NULL);
+
+		error = spa_vdev_remove(spa, guid, B_FALSE);
+
+		switch (error) {
+		case 0:
+		case EBUSY:
+		case ZFS_ERR_CHECKPOINT_EXISTS:
+		case ZFS_ERR_DISCARDING_CHECKPOINT:
+			break;
+		default:
+			/*
+			 * guid is a uint64_t; cast it so the argument
+			 * matches the %llu conversion on every platform.
+			 */
+			fatal(0, "spa_vdev_remove(%llu) = %d",
+			    (u_longlong_t)guid, error);
+		}
+	}
+
+	mutex_exit(&ztest_vdev_lock);
+
+	umem_free(path, MAXPATHLEN);
+}
+
+/*
+ * split a pool if it has mirror tlvdevs
+ *
+ * Builds a split config from the first child of every top-level mirror
+ * (log and hole vdevs are represented by hole placeholders) and hands it
+ * to spa_vdev_split_mirror() to create a pool named "splitp".  On success
+ * the shared split/mirror counters are updated.
+ *
+ * Does nothing when the MMP test option is set, when there are fewer than
+ * three mirrors, or when raidz is in use.  Runs with ztest_vdev_lock held.
+ * 'zd' and 'id' are unused.
+ */
+/* ARGSUSED */
+void
+ztest_split_pool(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	nvlist_t *tree, **child, *config, *split, **schild;
+	uint_t c, children, schildren = 0, lastlogid = 0;
+	int error = 0;
+
+	if (ztest_opts.zo_mmp_test)
+		return;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	/* ensure we have a usable config; mirrors of raidz aren't supported */
+	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/* clean up the old pool, if any */
+	(void) spa_destroy("splitp");
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* generate a config from the existing config */
+	mutex_enter(&spa->spa_props_lock);
+	VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
+	    &tree) == 0);
+	mutex_exit(&spa->spa_props_lock);
+
+	VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0);
+
+	/*
+	 * One slot per top-level vdev.  The array is written immediately
+	 * below, so fail loudly here rather than dereference NULL.
+	 *
+	 * NOTE(review): the array is sized from rvd->vdev_children while the
+	 * loop is bounded by the child count of the config nvlist; the two
+	 * are presumably equal -- confirm.
+	 */
+	schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
+	VERIFY3P(schild, !=, NULL);
+	for (c = 0; c < children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		nvlist_t **mchild;
+		uint_t mchildren;
+
+		if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
+			/* Logs and holes become hole placeholders. */
+			VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
+			    0) == 0);
+			VERIFY(nvlist_add_string(schild[schildren],
+			    ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
+			VERIFY(nvlist_add_uint64(schild[schildren],
+			    ZPOOL_CONFIG_IS_HOLE, 1) == 0);
+			/*
+			 * Remember where the current run of placeholders
+			 * began.  NOTE(review): index 0 cannot be told
+			 * apart from "no run" here.
+			 */
+			if (lastlogid == 0)
+				lastlogid = schildren;
+			++schildren;
+			continue;
+		}
+		/* A real mirror ends any run of placeholders. */
+		lastlogid = 0;
+		VERIFY(nvlist_lookup_nvlist_array(child[c],
+		    ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+		/* Take the first child of the mirror for the new pool. */
+		VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
+	}
+
+	/*
+	 * OK, create a config that can be used to split.  A trailing run of
+	 * placeholders is left out of the child count passed here.
+	 */
+	VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
+	    lastlogid != 0 ? lastlogid : schildren) == 0);
+
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
+
+	/* Every entry that was built is freed, trimmed or not. */
+	for (c = 0; c < schildren; c++)
+		nvlist_free(schild[c]);
+	free(schild);
+	nvlist_free(split);
+
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	(void) pthread_rwlock_wrlock(&ztest_name_lock);
+	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	nvlist_free(config);
+
+	if (error == 0) {
+		(void) printf("successful split - results:\n");
+		mutex_enter(&spa_namespace_lock);
+		show_pool_stats(spa);
+		show_pool_stats(spa_lookup("splitp"));
+		mutex_exit(&spa_namespace_lock);
+		/* One mirror side now lives in the split-off pool. */
+		++zs->zs_splits;
+		--zs->zs_mirrors;
+	}
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that we can attach and detach devices.
+ *
+ * A random leaf vdev is selected and then either detached (only when it
+ * has siblings) or used as the target of an attach/replace with a new
+ * device.  The result of the attach is compared against the error that
+ * the pool layout predicts.  Nothing is done when the MMP test is enabled
+ * or while ztest_device_removal_active is set.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	spa_aux_vdev_t *sav = &spa->spa_spares;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *oldvd, *newvd, *pvd;
+	nvlist_t *root;
+	uint64_t leaves;
+	uint64_t leaf, top;
+	uint64_t ashift = ztest_get_ashift();
+	uint64_t oldguid, pguid;
+	uint64_t oldsize, newsize;
+	char *oldpath, *newpath;
+	int replacing;
+	int oldvd_has_siblings = B_FALSE;
+	int newvd_is_spare = B_FALSE;
+	int oldvd_is_log;
+	int error, expected_error;
+
+	/* Checked before anything is allocated, so a plain return is safe. */
+	if (ztest_opts.zo_mmp_test)
+		return;
+
+	oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	mutex_enter(&ztest_vdev_lock);
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+	/*
+	 * If a vdev is in the process of being removed, its removal may
+	 * finish while we are in progress, leading to an unexpected error
+	 * value.  Don't bother trying to attach while we are in the middle
+	 * of removal.
+	 *
+	 * Leave through the common exit path so that ztest_vdev_lock is
+	 * dropped and the oldpath/newpath buffers are freed.
+	 */
+	if (ztest_device_removal_active) {
+		spa_config_exit(spa, SCL_ALL, FTAG);
+		goto out;
+	}
+
+	/*
+	 * Decide whether to do an attach or a replace.
+	 */
+	replacing = ztest_random(2);
+
+	/*
+	 * Pick a random top-level vdev.
+	 */
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	/*
+	 * Pick a random leaf within it.
+	 */
+	leaf = ztest_random(leaves);
+
+	/*
+	 * Locate this vdev.
+	 */
+	oldvd = rvd->vdev_child[top];
+
+	/* pick a child from the mirror */
+	if (zs->zs_mirrors >= 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
+		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
+	}
+
+	/* pick a child out of the raidz group */
+	if (ztest_opts.zo_raidz > 1) {
+		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
+		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
+	}
+
+	/*
+	 * If we're already doing an attach or replace, oldvd may be a
+	 * mirror vdev -- in which case, pick a random child.
+	 */
+	while (oldvd->vdev_children != 0) {
+		oldvd_has_siblings = B_TRUE;
+		ASSERT(oldvd->vdev_children >= 2);
+		oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
+	}
+
+	/*
+	 * Snapshot everything we need from oldvd while the config lock is
+	 * still held; it is dropped before the detach/attach calls below.
+	 */
+	oldguid = oldvd->vdev_guid;
+	oldsize = vdev_get_min_asize(oldvd);
+	oldvd_is_log = oldvd->vdev_top->vdev_islog;
+	(void) strcpy(oldpath, oldvd->vdev_path);
+	pvd = oldvd->vdev_parent;
+	pguid = pvd->vdev_guid;
+
+	/*
+	 * If oldvd has siblings, then half of the time, detach it.  Prior
+	 * to the detach the pool is scrubbed in order to prevent creating
+	 * unrepairable blocks as a result of the data corruption injection.
+	 */
+	if (oldvd_has_siblings && ztest_random(2) == 0) {
+		spa_config_exit(spa, SCL_ALL, FTAG);
+
+		error = ztest_scrub_impl(spa);
+		if (error)
+			goto out;
+
+		error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+		if (error != 0 && error != ENODEV && error != EBUSY &&
+		    error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
+		    error != ZFS_ERR_DISCARDING_CHECKPOINT)
+			fatal(0, "detach (%s) returned %d", oldpath, error);
+		goto out;
+	}
+
+	/*
+	 * For the new vdev, choose with equal probability between the two
+	 * standard paths (ending in either 'a' or 'b') or a random hot spare.
+	 */
+	if (sav->sav_count != 0 && ztest_random(3) == 0) {
+		newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		newvd_is_spare = B_TRUE;
+		(void) strcpy(newpath, newvd->vdev_path);
+	} else {
+		(void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
+		if (ztest_random(2) == 0)
+			newpath[strlen(newpath) - 1] = 'b';
+		newvd = vdev_lookup_by_path(rvd, newpath);
+	}
+
+	if (newvd) {
+		/*
+		 * Reopen to ensure the vdev's asize field isn't stale.
+		 */
+		vdev_reopen(newvd);
+		newsize = vdev_get_min_asize(newvd);
+	} else {
+		/*
+		 * Make newsize a little bigger or smaller than oldsize.
+		 * If it's smaller, the attach should fail.
+		 * If it's larger, and we're doing a replace,
+		 * we should get dynamic LUN growth when we're done.
+		 */
+		newsize = 10 * oldsize / (9 + ztest_random(3));
+	}
+
+	/*
+	 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+	 * unless it's a replace; in that case any non-replacing parent is OK.
+	 *
+	 * If newvd is already part of the pool, it should fail with EBUSY.
+	 *
+	 * If newvd is too small, it should fail with EOVERFLOW.
+	 */
+	if (pvd->vdev_ops != &vdev_mirror_ops &&
+	    pvd->vdev_ops != &vdev_root_ops && (!replacing ||
+	    pvd->vdev_ops == &vdev_replacing_ops ||
+	    pvd->vdev_ops == &vdev_spare_ops))
+		expected_error = ENOTSUP;
+	else if (newvd_is_spare && (!replacing || oldvd_is_log))
+		expected_error = ENOTSUP;
+	else if (newvd == oldvd)
+		expected_error = replacing ? 0 : EBUSY;
+	else if (vdev_lookup_by_path(rvd, newpath) != NULL)
+		expected_error = EBUSY;
+	else if (newsize < oldsize)
+		expected_error = EOVERFLOW;
+	else if (ashift > oldvd->vdev_top->vdev_ashift)
+		expected_error = EDOM;
+	else
+		expected_error = 0;
+
+	spa_config_exit(spa, SCL_ALL, FTAG);
+
+	/*
+	 * Build the nvlist describing newpath.
+	 */
+	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
+	    ashift, NULL, 0, 0, 1);
+
+	/*
+	 * When supported select either a healing or sequential resilver.
+	 */
+	boolean_t rebuilding = B_FALSE;
+	if (pvd->vdev_ops == &vdev_mirror_ops ||
+	    pvd->vdev_ops ==  &vdev_root_ops) {
+		rebuilding = !!ztest_random(2);
+	}
+
+	error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);
+
+	nvlist_free(root);
+
+	/*
+	 * If our parent was the replacing vdev, but the replace completed,
+	 * then instead of failing with ENOTSUP we may either succeed,
+	 * fail with ENODEV, or fail with EOVERFLOW.
+	 */
+	if (expected_error == ENOTSUP &&
+	    (error == 0 || error == ENODEV || error == EOVERFLOW))
+		expected_error = error;
+
+	/*
+	 * If someone grew the LUN, the replacement may be too small.
+	 */
+	if (error == EOVERFLOW || error == EBUSY)
+		expected_error = error;
+
+	/* Checkpoint, resilver and rebuild activity are tolerated as-is. */
+	if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
+	    error == ZFS_ERR_DISCARDING_CHECKPOINT ||
+	    error == ZFS_ERR_RESILVER_IN_PROGRESS ||
+	    error == ZFS_ERR_REBUILD_IN_PROGRESS)
+		expected_error = error;
+
+	/*
+	 * The sizes are uint64_t; cast them so that they match the %llu
+	 * conversions regardless of how uint64_t is defined.
+	 */
+	if (error != expected_error && expected_error != EBUSY) {
+		fatal(0, "attach (%s %llu, %s %llu, %d) "
+		    "returned %d, expected %d",
+		    oldpath, (u_longlong_t)oldsize, newpath,
+		    (u_longlong_t)newsize, replacing, error, expected_error);
+	}
+out:
+	mutex_exit(&ztest_vdev_lock);
+
+	umem_free(oldpath, MAXPATHLEN);
+	umem_free(newpath, MAXPATHLEN);
+}
+
+/*
+ * Remove a random top-level vdev, wait for the removal to finish, and
+ * then scrub the pool.  ztest_device_removal_active is set for the whole
+ * time the removal and the follow-up scrub are running.
+ */
+/* ARGSUSED */
+void
+ztest_device_removal(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	vdev_t *tvd;
+	uint64_t top_guid;
+	int err;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	/* Nothing to do if a removal is already flagged as in flight. */
+	if (ztest_device_removal_active) {
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/*
+	 * Choose the top-level vdev to remove and record its guid while
+	 * the config lock is held as reader.
+	 */
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	tvd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
+	top_guid = tvd->vdev_guid;
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	err = spa_vdev_remove(spa, top_guid, B_FALSE);
+	if (err != 0) {
+		/* The removal could not be started; give up quietly. */
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	ztest_device_removal_active = B_TRUE;
+	mutex_exit(&ztest_vdev_lock);
+
+	/*
+	 * spa->spa_vdev_removal is set up by a sync task started with
+	 * dsl_sync_task_nowait(), which may not have run by the time
+	 * spa_vdev_remove() returns.  Wait for at least one txg so the
+	 * removal struct is guaranteed to exist before we poll it.
+	 */
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+		txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Scrub the pool now that the removal is complete.  Skipping this
+	 * may produce checksum errors because of the way
+	 * ztest_fault_inject() decides which offsets are redundant and
+	 * therefore safe to damage.
+	 */
+	err = spa_scan(spa, POOL_SCAN_SCRUB);
+	if (err == 0) {
+		while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+			txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	mutex_enter(&ztest_vdev_lock);
+	ztest_device_removal_active = B_FALSE;
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * vdev_walk_tree() callback that grows the file backing a leaf vdev to
+ * the size pointed to by arg.
+ *
+ * Returns NULL once the file has been resized, or vd itself when the
+ * backing file could not be opened (which stops the tree walk).
+ */
+static vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa __maybe_unused = vd->vdev_spa;
+	size_t *newsize = arg;
+	size_t oldsize;
+	int fd;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	fd = open(vd->vdev_path, O_RDWR);
+	if (fd == -1)
+		return (vd);
+
+	/* The previous length is only needed for the verbose message. */
+	oldsize = lseek(fd, 0, SEEK_END);
+	VERIFY(ftruncate(fd, *newsize) == 0);
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("%s grew from %lu to %lu bytes\n",
+		    vd->vdev_path, (ulong_t)oldsize, (ulong_t)*newsize);
+	}
+
+	(void) close(fd);
+
+	return (NULL);
+}
+
+/*
+ * vdev_walk_tree() callback that expands a leaf vdev by calling
+ * vdev_online() with ZFS_ONLINE_EXPAND.
+ *
+ * Returns NULL when the vdev came back healthy and the pool's config
+ * generation advanced by exactly one; otherwise returns vd so that the
+ * caller knows the expansion was abandoned.
+ */
+/* ARGSUSED */
+static vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *tvd = vd->vdev_top;
+	uint64_t guid = vd->vdev_guid;
+	/* Sampled before the lock is dropped below. */
+	uint64_t expected_gen = spa->spa_config_generation + 1;
+	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+	int err;
+
+	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	/*
+	 * vdev_online() initializes the new metaslabs.  The SCL_STATE
+	 * hold is released around the call and retaken afterwards.
+	 */
+	spa_config_exit(spa, SCL_STATE, spa);
+	err = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/*
+	 * Abort the expand if vdev_online() failed or if the underlying
+	 * vdev_open() failed; the latter is only visible through the
+	 * state handed back in newstate.
+	 */
+	if (err || newstate != VDEV_STATE_HEALTHY) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Unable to expand vdev, state %llu, "
+			    "error %d\n", (u_longlong_t)newstate, err);
+		}
+		return (vd);
+	}
+	ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+	/*
+	 * The lock was dropped, so the vdev may have been detached or
+	 * replaced while we were onlining it.  A config generation other
+	 * than the one we expected means we can no longer trust that we
+	 * are looking at the original vdev.
+	 */
+	if (expected_gen != spa->spa_config_generation) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("vdev configuration has changed, "
+			    "guid %llu, state %llu, expected gen %llu, "
+			    "got gen %llu\n",
+			    (u_longlong_t)guid,
+			    (u_longlong_t)tvd->vdev_state,
+			    (u_longlong_t)expected_gen,
+			    (u_longlong_t)spa->spa_config_generation);
+		}
+		return (vd);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Depth-first walk of the vdev tree rooted at vd, invoking func on each
+ * leaf vdev with arg.
+ *
+ * The walk ends as soon as func returns non-NULL, and that value is
+ * returned; NULL is returned when every leaf has been visited without
+ * such a result.  With a NULL func the first leaf encountered is
+ * returned instead.
+ */
+static vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+	uint_t i;
+
+	/* Leaf: either hand it straight back or let the callback decide. */
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (func != NULL ? func(vd, arg) : vd);
+
+	/* Interior vdev: recurse into the children in index order. */
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *hit = vdev_walk_tree(vd->vdev_child[i], func, arg);
+
+		if (hit != NULL)
+			return (hit);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Verify that dynamic LUN growth works as expected.
+ *
+ * Grows every leaf of a random top-level vdev, onlines the leaves with
+ * expansion enabled, waits for the asynchronous config update, and then
+ * checks that both the vdev's metaslab count and its metaslab class
+ * space increased.  ztest_checkpoint_lock and ztest_vdev_lock are held
+ * for the whole test.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	vdev_t *vd, *tvd;
+	metaslab_class_t *mc;
+	metaslab_group_t *mg;
+	size_t psize, newsize;
+	uint64_t top;
+	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+	mutex_enter(&ztest_checkpoint_lock);
+	mutex_enter(&ztest_vdev_lock);
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/*
+	 * If there is a vdev removal in progress, it could complete while
+	 * we are running, in which case we would not be able to verify
+	 * that the metaslab_class space increased (because it decreases
+	 * when the device removal completes).
+	 */
+	if (ztest_device_removal_active) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		mutex_exit(&ztest_checkpoint_lock);
+		return;
+	}
+
+	top = ztest_random_vdev_top(spa, B_TRUE);
+
+	/* Record the "before" values that the checks below compare with. */
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	mg = tvd->vdev_mg;
+	mc = mg->mg_class;
+	old_ms_count = tvd->vdev_ms_count;
+	old_class_space = metaslab_class_get_space(mc);
+
+	/*
+	 * Determine the size of the first leaf vdev associated with
+	 * our top-level device.
+	 */
+	vd = vdev_walk_tree(tvd, NULL, NULL);
+	ASSERT3P(vd, !=, NULL);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	psize = vd->vdev_psize;
+
+	/*
+	 * We only try to expand the vdev if it's healthy, less than 4x its
+	 * original size, and it has a valid psize.
+	 */
+	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		mutex_exit(&ztest_checkpoint_lock);
+		return;
+	}
+	ASSERT(psize > 0);
+	newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
+	ASSERT3U(newsize, >, psize);
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("Expanding LUN %s from %lu to %lu\n",
+		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+	}
+
+	/*
+	 * Growing the vdev is a two step process:
+	 *	1). expand the physical size (i.e. relabel)
+	 *	2). online the vdev to create the new metaslabs
+	 */
+	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Could not expand LUN because "
+			    "the vdev configuration changed.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		mutex_exit(&ztest_checkpoint_lock);
+		return;
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+
+	/*
+	 * Expanding the LUN will update the config asynchronously,
+	 * thus we must wait for the async thread to complete any
+	 * pending tasks before proceeding.
+	 */
+	for (;;) {
+		boolean_t done;
+		mutex_enter(&spa->spa_async_lock);
+		done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+		mutex_exit(&spa->spa_async_lock);
+		if (done)
+			break;
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		(void) poll(NULL, 0, 100);
+	}
+
+	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+	/* Re-read the top-level vdev; the config lock was dropped above. */
+	tvd = spa->spa_root_vdev->vdev_child[top];
+	new_ms_count = tvd->vdev_ms_count;
+	new_class_space = metaslab_class_get_space(mc);
+
+	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+		if (ztest_opts.zo_verbose >= 5) {
+			(void) printf("Could not verify LUN expansion due to "
+			    "intervening vdev offline or remove.\n");
+		}
+		spa_config_exit(spa, SCL_STATE, spa);
+		mutex_exit(&ztest_vdev_lock);
+		mutex_exit(&ztest_checkpoint_lock);
+		return;
+	}
+
+	/*
+	 * Make sure we were able to grow the vdev.  The message reports
+	 * the old count followed by the new one.
+	 */
+	if (new_ms_count <= old_ms_count) {
+		fatal(0, "LUN expansion failed: ms_count %llu >= %llu\n",
+		    (u_longlong_t)old_ms_count, (u_longlong_t)new_ms_count);
+	}
+
+	/*
+	 * Make sure we were able to grow the pool.  The message reports
+	 * the old class space followed by the new one.
+	 */
+	if (new_class_space <= old_class_space) {
+		fatal(0, "LUN expansion failed: class_space %llu >= %llu\n",
+		    (u_longlong_t)old_class_space,
+		    (u_longlong_t)new_class_space);
+	}
+
+	if (ztest_opts.zo_verbose >= 5) {
+		char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
+
+		nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
+		nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
+		(void) printf("%s grew from %s to %s\n",
+		    spa->spa_name, oldnumbuf, newnumbuf);
+	}
+
+	spa_config_exit(spa, SCL_STATE, spa);
+	mutex_exit(&ztest_vdev_lock);
+	mutex_exit(&ztest_checkpoint_lock);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+
+/*
+ * Creation callback handed to dmu_objset_create() by
+ * ztest_dataset_create().  It claims the ZAP object ZTEST_DIROBJ in the
+ * new objset within the creating transaction tx, and panics via VERIFY
+ * if the claim fails.  The arg and cr parameters are not used.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+	/*
+	 * Create the objects common to all ztest datasets.
+	 */
+	VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+/*
+ * Create the dataset dsname.  About half of the time the dataset is
+ * encrypted, and roughly one in five successfully created datasets is
+ * additionally switched to sync=always.
+ *
+ * Returns the error from dmu_objset_create(), or the result of setting
+ * the sync property when that step is taken.
+ */
+static int
+ztest_dataset_create(char *dsname)
+{
+	dsl_crypto_params_t *dcp = NULL;
+	uint64_t roll;
+	int err;
+
+	/*
+	 * Half of the time the dataset is created encrypted, with a
+	 * randomly chosen cipher suite and a hard-coded wrapping key.
+	 */
+	if (ztest_random(2) != 0) {
+		nvlist_t *crypto_args = fnvlist_alloc();
+		nvlist_t *props = fnvlist_alloc();
+		uint64_t suite;
+
+		/*
+		 * Values below the first real suite are mapped to the
+		 * default, which biases the choice slightly towards it.
+		 */
+		suite = ztest_random(ZIO_CRYPT_FUNCTIONS);
+		if (suite < ZIO_CRYPT_AES_128_CCM)
+			suite = ZIO_CRYPT_ON;
+
+		fnvlist_add_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_ENCRYPTION), suite);
+		fnvlist_add_uint8_array(crypto_args, "wkeydata",
+		    (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
+
+		/*
+		 * The kernel does not really use the following
+		 * parameters; they are only stored so that userspace
+		 * knows how to load the wrapping key.
+		 */
+		fnvlist_add_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW);
+		fnvlist_add_string(props,
+		    zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt");
+		fnvlist_add_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL);
+		fnvlist_add_uint64(props,
+		    zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL);
+
+		VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props,
+		    crypto_args, &dcp));
+
+		/*
+		 * Rotate through every available encryption
+		 * implementation so that interoperability is exercised.
+		 */
+		VERIFY0(gcm_impl_set("cycle"));
+		VERIFY0(aes_impl_set("cycle"));
+
+		fnvlist_free(crypto_args);
+		fnvlist_free(props);
+	}
+
+	err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp,
+	    ztest_objset_create_cb, NULL);
+	dsl_crypto_params_free(dcp, err != 0);
+
+	/* On failure, or 80% of the time, we are done at this point. */
+	roll = ztest_random(100);
+	if (err != 0 || roll < 80)
+		return (err);
+
+	if (ztest_opts.zo_verbose >= 5)
+		(void) printf("Setting dataset %s to sync always\n", dsname);
+
+	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+	    ZFS_SYNC_ALWAYS, B_FALSE));
+}
+
+/*
+ * dmu_objset_find() callback that sanity-checks and then destroys the
+ * dataset or snapshot called name.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+ztest_objset_destroy_cb(const char *name, void *arg)
+{
+	objset_t *os;
+	dmu_object_info_t doi;
+	int error;
+
+	/*
+	 * The dataset is expected to contain the directory object.  It
+	 * may legitimately be missing (ENOENT) if we crashed part-way
+	 * through destroying the dataset on a previous run.
+	 */
+	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+	    B_TRUE, FTAG, &os));
+	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+	if (error != ENOENT) {
+		ASSERT0(error);
+		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
+	}
+	dmu_objset_disown(os, B_TRUE, FTAG);
+
+	/* Snapshots (names containing '@') must always be destroyable. */
+	if (strchr(name, '@') != NULL) {
+		VERIFY0(dsl_destroy_snapshot(name, B_TRUE));
+		return (0);
+	}
+
+	/*
+	 * Head datasets may fail with ENOSPC (a checkpoint exists or
+	 * there is insufficient slop space) or with EBUSY (there is a
+	 * hold on the dataset); anything else is unexpected.
+	 */
+	error = dsl_destroy_head(name);
+	if (error == ENOSPC) {
+		ztest_record_enospc(FTAG);
+	} else if (error != EBUSY) {
+		ASSERT0(error);
+	}
+
+	return (0);
+}
+
+/*
+ * Take the snapshot "<osname>@<id>".
+ *
+ * Returns B_FALSE when the pool is out of space (which is recorded),
+ * and B_TRUE when the snapshot was created or already existed.  Any
+ * other error is fatal.
+ */
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	int err;
+
+	(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
+
+	err = dmu_objset_snapshot_one(osname, snapname);
+	switch (err) {
+	case 0:
+	case EEXIST:
+		break;
+	case ENOSPC:
+		ztest_record_enospc(FTAG);
+		return (B_FALSE);
+	default:
+		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+		    snapname, err);
+		break;
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Destroy the snapshot "<osname>@<id>".  A snapshot that does not exist
+ * is not an error; any other failure is fatal.  Always returns B_TRUE.
+ */
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	int err;
+
+	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
+	    (u_longlong_t)id);
+
+	err = dsl_destroy_snapshot(snapname, B_FALSE);
+	if (err == 0 || err == ENOENT)
+		return (B_TRUE);
+
+	fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, err);
+	return (B_TRUE);
+}
+
+/*
+ * Exercise dataset creation and destruction on "<pool>/temp_<id>".
+ *
+ * Any instance left over from a previous run is optionally replayed and
+ * then destroyed together with its snapshots.  A fresh dataset is then
+ * created, owned, populated with a few objects and snapshots, and used
+ * to check that duplicate creation fails with EEXIST, that an owned
+ * objset can still be held, and that it cannot be owned a second time.
+ * ztest_name_lock is held as reader throughout.
+ */
+/* ARGSUSED */
+void
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_ds_t *zdtmp;
+	int iters;
+	int error;
+	objset_t *os, *os2;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	zilog_t *zilog;
+	int i;
+
+	/* Scratch dataset descriptor; freed at the end of the function. */
+	zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	(void) snprintf(name, sizeof (name), "%s/temp_%llu",
+	    ztest_opts.zo_pool, (u_longlong_t)id);
+
+	/*
+	 * If this dataset exists from a previous run, process its replay log
+	 * half of the time.  If we don't replay it, then dsl_destroy_head()
+	 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
+	 */
+	if (ztest_random(2) == 0 &&
+	    ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
+	    B_TRUE, FTAG, &os) == 0) {
+		ztest_zd_init(zdtmp, NULL, os);
+		zil_replay(os, zdtmp, ztest_replay_vector);
+		ztest_zd_fini(zdtmp);
+		dmu_objset_disown(os, B_TRUE, FTAG);
+	}
+
+	/*
+	 * There may be an old instance of the dataset we're about to
+	 * create lying around from a previous run.  If so, destroy it
+	 * and all of its snapshots.
+	 */
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+	/*
+	 * Verify that the destroyed dataset is no longer in the namespace.
+	 */
+	VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+	    B_TRUE, FTAG, &os));
+
+	/*
+	 * Verify that we can create a new dataset.
+	 * Running out of space is recorded and ends the test early; any
+	 * other failure is fatal.
+	 */
+	error = ztest_dataset_create(name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_create(%s) = %d", name, error);
+	}
+
+	/* Own the new dataset; it stays owned until the disown below. */
+	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE,
+	    FTAG, &os));
+
+	ztest_zd_init(zdtmp, NULL, os);
+
+	/*
+	 * Open the intent log for it.
+	 */
+	zilog = zil_open(os, ztest_get_data);
+
+	/*
+	 * Put some objects in there, do a little I/O to them,
+	 * and randomly take a couple of snapshots along the way.
+	 */
+	iters = ztest_random(5);
+	for (i = 0; i < iters; i++) {
+		ztest_dmu_object_alloc_free(zdtmp, id);
+		if (ztest_random(iters) == 0)
+			(void) ztest_snapshot_create(name, i);
+	}
+
+	/*
+	 * Verify that we cannot create an existing dataset.
+	 */
+	VERIFY3U(EEXIST, ==,
+	    dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL));
+
+	/*
+	 * Verify that we can hold an objset that is also owned.
+	 */
+	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+	dmu_objset_rele(os2, FTAG);
+
+	/*
+	 * Verify that we cannot own an objset that is already owned.
+	 */
+	VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER,
+	    B_FALSE, B_TRUE, FTAG, &os2));
+
+	/* Tear down in the reverse order of setup. */
+	zil_close(zilog);
+	dmu_objset_disown(os, B_TRUE, FTAG);
+	ztest_zd_fini(zdtmp);
+out:
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	umem_free(zdtmp, sizeof (ztest_ds_t));
+}
+
+/*
+ * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
+ *
+ * The snapshot named after id is destroyed (if present) and then taken
+ * again on this test's dataset, all under ztest_name_lock as reader.
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+	char *osname = zd->zd_name;
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	/* Results are deliberately ignored; the helpers handle errors. */
+	(void) ztest_snapshot_destroy(osname, id);
+	(void) ztest_snapshot_create(osname, id);
+
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Cleanup non-standard snapshots and clones.
+ *
+ * Destroys, in dependency order, the clones and snapshots that
+ * ztest_dsl_dataset_promote_busy() creates under osname for the given
+ * id.  Objects that do not exist are skipped; any other failure is
+ * fatal.
+ */
+static void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
+{
+	const size_t namelen = ZFS_MAX_DATASET_NAME_LEN;
+	u_longlong_t tag = (u_longlong_t)id;
+	char *snap1name = umem_alloc(namelen, UMEM_NOFAIL);
+	char *clone1name = umem_alloc(namelen, UMEM_NOFAIL);
+	char *snap2name = umem_alloc(namelen, UMEM_NOFAIL);
+	char *clone2name = umem_alloc(namelen, UMEM_NOFAIL);
+	char *snap3name = umem_alloc(namelen, UMEM_NOFAIL);
+	int err;
+
+	/* Build the five names; snap2 and snap3 hang off clone1. */
+	(void) snprintf(snap1name, namelen, "%s@s1_%llu", osname, tag);
+	(void) snprintf(clone1name, namelen, "%s/c1_%llu", osname, tag);
+	(void) snprintf(snap2name, namelen, "%s@s2_%llu", clone1name, tag);
+	(void) snprintf(clone2name, namelen, "%s/c2_%llu", osname, tag);
+	(void) snprintf(snap3name, namelen, "%s@s3_%llu", clone1name, tag);
+
+	/* Destroy from the outermost dependent back to the first snapshot. */
+	err = dsl_destroy_head(clone2name);
+	if (err != 0 && err != ENOENT)
+		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, err);
+
+	err = dsl_destroy_snapshot(snap3name, B_FALSE);
+	if (err != 0 && err != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, err);
+
+	err = dsl_destroy_snapshot(snap2name, B_FALSE);
+	if (err != 0 && err != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, err);
+
+	err = dsl_destroy_head(clone1name);
+	if (err != 0 && err != ENOENT)
+		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, err);
+
+	err = dsl_destroy_snapshot(snap1name, B_FALSE);
+	if (err != 0 && err != ENOENT)
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, err);
+
+	umem_free(snap1name, namelen);
+	umem_free(clone1name, namelen);
+	umem_free(snap2name, namelen);
+	umem_free(clone2name, namelen);
+	umem_free(snap3name, namelen);
+}
+
+/*
+ * Verify dsl_dataset_promote handles EBUSY
+ *
+ * Builds the chain osname@s1 -> clone1 -> clone1@s2, clone1@s3 -> clone2,
+ * takes ownership of clone1@s2, and then checks that promoting clone2
+ * fails with EBUSY.  Running out of space at any step is recorded and
+ * ends the test early.  Everything created here is removed again by
+ * ztest_dsl_dataset_cleanup() before returning.
+ */
+void
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os;
+	char *snap1name;
+	char *clone1name;
+	char *snap2name;
+	char *clone2name;
+	char *snap3name;
+	char *osname = zd->zd_name;
+	int error;
+
+	snap1name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+	clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+	snap2name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+	clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+	snap3name  = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	/* Start from a clean slate in case an earlier run left debris. */
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN,
+	    "%s@s1_%llu", osname, (u_longlong_t)id);
+	(void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN,
+	    "%s/c1_%llu", osname, (u_longlong_t)id);
+	(void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN,
+	    "%s@s2_%llu", clone1name, (u_longlong_t)id);
+	(void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN,
+	    "%s/c2_%llu", osname, (u_longlong_t)id);
+	(void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN,
+	    "%s@s3_%llu", clone1name, (u_longlong_t)id);
+
+	/*
+	 * The snapshot calls take the short name, i.e. the part of the
+	 * full name that follows the '@'.
+	 */
+	error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap1name,
+		    error);
+	}
+
+	error = dmu_objset_clone(clone1name, snap1name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
+	}
+
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap2name,
+		    error);
+	}
+
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
+	if (error && error != EEXIST) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap3name,
+		    error);
+	}
+
+	error = dmu_objset_clone(clone2name, snap3name);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc(FTAG);
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
+	}
+
+	/*
+	 * Own snap2 and attempt the promote while it is owned; the
+	 * promote is expected to be refused with EBUSY.
+	 */
+	error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE,
+	    FTAG, &os);
+	if (error)
+		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
+	error = dsl_dataset_promote(clone2name, NULL);
+	if (error == ENOSPC) {
+		dmu_objset_disown(os, B_TRUE, FTAG);
+		ztest_record_enospc(FTAG);
+		goto out;
+	}
+	if (error != EBUSY)
+		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+		    error);
+	dmu_objset_disown(os, B_TRUE, FTAG);
+
+out:
+	ztest_dsl_dataset_cleanup(osname, id);
+
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN);
+	umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN);
+	umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN);
+	umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN);
+	umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN);
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	4
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ *
+ * A batch of OD_ARRAY_SIZE object descriptors is initialized for this
+ * test id, the objects are (re)created through ztest_object_init(), and
+ * a random number of I/Os is then issued against randomly chosen
+ * members of the batch.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+	int batchsize;
+	int size;
+	int b;
+
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+	batchsize = OD_ARRAY_SIZE;
+
+	for (b = 0; b < batchsize; b++)
+		ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER,
+		    0, 0, 0);
+
+	/*
+	 * Destroy the previous batch of objects, create a new batch,
+	 * and do some I/O on the new objects.  If the batch cannot be
+	 * set up, release the descriptor array before bailing out so
+	 * that it is not leaked.
+	 */
+	if (ztest_object_init(zd, od, size, B_TRUE) != 0) {
+		umem_free(od, size);
+		return;
+	}
+
+	/* Each pass has a 1 in (4 * batchsize) chance of ending the loop. */
+	while (ztest_random(4 * batchsize) != 0)
+		ztest_io(zd, od[ztest_random(batchsize)].od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+	umem_free(od, size);
+}
+
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ *
+ * The objset's next-chunk cursor is moved back to a random, chunk-aligned
+ * object number below its current value.  The id argument is unused.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+	uint64_t rewind_to;
+
+	mutex_enter(&os->os_obj_lock);
+
+	/*
+	 * Picking a lower object number forces backfilling and the
+	 * reclamation of recently freed dnodes.  The cursor is read and
+	 * rewritten under os_obj_lock.
+	 */
+	rewind_to = ztest_random(os->os_obj_next_chunk);
+	os->os_obj_next_chunk = P2ALIGN(rewind_to, dnodes_per_chunk);
+
+	mutex_exit(&os->os_obj_lock);
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	2
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+void
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
+{
+	int size;
+	ztest_od_t *od;
+
+	objset_t *os = zd->zd_os;
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+	dmu_tx_t *tx;
+	int i, freeit, error;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 40;
+	int free_percent = 5;
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is arbitrary. It doesn't have to be a power of two,
+	 * and it doesn't have any relation to the object blocksize.
+	 * The only requirement is that it can hold at least two bufwads.
+	 *
+	 * Normally, we write the bufwad to each of these locations.
+	 * However, free_percent of the time we instead write zeroes to
+	 * packobj and perform a dmu_free_range() on bigobj.  By comparing
+	 * bigobj to packobj, we can verify that the DMU is correctly
+	 * tracking which parts of an object are allocated and free,
+	 * and that the contents of the allocated blocks are correct.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 */
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
+	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+	    chunksize);
+
+	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+		umem_free(od, size);
+		return;
+	}
+
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	chunksize = od[0].od_gen;
+	ASSERT(chunksize == od[1].od_gen);
+
+	/*
+	 * Prefetch a random chunk of the big object.
+	 * Our aim here is to get some async reads in flight
+	 * for blocks that we may free below; the DMU should
+	 * handle this race correctly.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(2 * width - 1);
+	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
+	    ZIO_PRIORITY_SYNC_READ);
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+	/*
+	 * free_percent of the time, free a range of bigobj rather than
+	 * overwriting it.
+	 */
+	freeit = (ztest_random(100) < free_percent);
+
+	/*
+	 * Read the current contents of our objects.
+	 */
+	error = dmu_read(os, packobj, packoff, packsize, packbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT0(error);
+	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
+	    DMU_READ_PREFETCH);
+	ASSERT0(error);
+
+	/*
+	 * Get a tx for the mods to both packobj and bigobj.
+	 */
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_write(tx, packobj, packoff, packsize);
+
+	if (freeit)
+		dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
+	else
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+	/* This accounts for setting the checksum/compression. */
+	dmu_tx_hold_bonus(tx, bigobj);
+
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0) {
+		umem_free(packbuf, packsize);
+		umem_free(bigbuf, bigsize);
+		umem_free(od, size);
+		return;
+	}
+
+	enum zio_checksum cksum;
+	do {
+		cksum = (enum zio_checksum)
+		    ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+	dmu_object_set_checksum(os, bigobj, cksum, tx);
+
+	enum zio_compress comp;
+	do {
+		comp = (enum zio_compress)
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+	dmu_object_set_compress(os, bigobj, comp, tx);
+
+	/*
+	 * For each index from n to n + s, verify that the existing bufwad
+	 * in packobj matches the bufwads at the head and tail of the
+	 * corresponding chunk in bigobj.  Then update all three bufwads
+	 * with the new values we want to write out.
+	 */
+	for (i = 0; i < s; i++) {
+		/* LINTED */
+		pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+		/* LINTED */
+		bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+		/* LINTED */
+		bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		if (pack->bw_txg > txg)
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+
+		if (pack->bw_data != 0 && pack->bw_index != n + i)
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+		if (freeit) {
+			bzero(pack, sizeof (bufwad_t));
+		} else {
+			pack->bw_index = n + i;
+			pack->bw_txg = txg;
+			pack->bw_data = 1 + ztest_random(-2ULL);
+		}
+		*bigH = *pack;
+		*bigT = *pack;
+	}
+
+	/*
+	 * We've verified all the old bufwads, and made new ones.
+	 * Now write them out.
+	 */
+	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+
+	if (freeit) {
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("freeing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
+	} else {
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
+	}
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Sanity check the stuff we just wrote.
+	 */
+	{
+		void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+		void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+		VERIFY(0 == dmu_read(os, packobj, packoff,
+		    packsize, packcheck, DMU_READ_PREFETCH));
+		VERIFY(0 == dmu_read(os, bigobj, bigoff,
+		    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+		ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+		ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+		umem_free(packcheck, packsize);
+		umem_free(bigcheck, bigsize);
+	}
+
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(od, size);
+}
+
+/*
+ * Verify and refresh s consecutive bufwads starting at index n.
+ *
+ * For each index, the bufwad in packbuf must match the bufwads found at
+ * the head and at the tail of the corresponding chunksize-byte chunk of
+ * bigbuf.  After the checks pass, all three copies are overwritten with
+ * a new bufwad carrying the index, the given txg, and
+ * 1 + ztest_random(-2ULL) as its data.  Any mismatch is fatal.
+ */
+static void
+compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
+    uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
+{
+	bufwad_t *pack = packbuf;
+	char *chunk = (char *)bigbuf;
+	uint64_t i;
+
+	/* pack and chunk advance in lockstep with i. */
+	for (i = 0; i < s; i++, pack++, chunk += chunksize) {
+		/* LINTED */
+		bufwad_t *bigH = (bufwad_t *)chunk;
+		/* LINTED */
+		bufwad_t *bigT = (bufwad_t *)(chunk + chunksize) - 1;
+
+		ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+		ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+		/* Nothing on disk may claim a txg later than ours. */
+		if (pack->bw_txg > txg) {
+			fatal(0, "future leak: got %llx, open txg is %llx",
+			    pack->bw_txg, txg);
+		}
+
+		/* A bufwad with nonzero data must record its own index. */
+		if (pack->bw_data != 0 && pack->bw_index != n + i) {
+			fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+			    pack->bw_index, n, i);
+		}
+
+		if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) {
+			fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+		}
+
+		if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) {
+			fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+		}
+
+		/* Stamp the new contents, then mirror them into the chunk. */
+		pack->bw_index = n + i;
+		pack->bw_txg = txg;
+		pack->bw_data = 1 + ztest_random(-2ULL);
+
+		*bigT = *bigH = *pack;
+	}
+}
+
+#undef OD_ARRAY_SIZE
+#define	OD_ARRAY_SIZE	2
+
+/*
+ * Test writes to bigobj that go through loaned arc buffers
+ * (dmu_request_arcbuf() / dmu_assign_arcbuf_by_dbuf()) while packobj is
+ * written with plain dmu_write() in the same tx.  The same range is
+ * rewritten seven times; each pass verifies the previous contents via
+ * compare_and_update_pbbufs() and reads the new contents back.
+ *
+ * Returns early, after releasing everything it holds, if the object
+ * directory cannot be set up or if a tx cannot be assigned.
+ */
+void
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	dmu_tx_t *tx;
+	uint64_t i;
+	int error;
+	int size;
+	uint64_t n, s, txg;
+	bufwad_t *packbuf, *bigbuf;
+	uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+	uint64_t blocksize = ztest_random_blocksize();
+	uint64_t chunksize = blocksize;
+	uint64_t regions = 997;
+	uint64_t stride = 123456789ULL;
+	uint64_t width = 9;
+	dmu_buf_t *bonus_db;
+	arc_buf_t **bigbuf_arcbufs;
+	dmu_object_info_t doi;
+
+	size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+	od = umem_alloc(size, UMEM_NOFAIL);
+
+	/*
+	 * This test uses two objects, packobj and bigobj, that are always
+	 * updated together (i.e. in the same tx) so that their contents are
+	 * in sync and can be compared.  Their contents relate to each other
+	 * in a simple way: packobj is a dense array of 'bufwad' structures,
+	 * while bigobj is a sparse array of the same bufwads.  Specifically,
+	 * for any index n, there are three bufwads that should be identical:
+	 *
+	 *	packobj, at offset n * sizeof (bufwad_t)
+	 *	bigobj, at the head of the nth chunk
+	 *	bigobj, at the tail of the nth chunk
+	 *
+	 * The chunk size is set equal to bigobj block size so that
+	 * dmu_assign_arcbuf_by_dbuf() can be tested for object updates.
+	 */
+
+	/*
+	 * Read the directory info.  If it's the first time, set things up.
+	 * od[0] describes bigobj (created with the chosen blocksize) and
+	 * od[1] describes packobj (which records chunksize as its last
+	 * ztest_od_init() argument, read back below as od_gen).
+	 */
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+	ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+	    chunksize);
+
+
+	if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+		umem_free(od, size);
+		return;
+	}
+
+	/*
+	 * The objects may already exist from an earlier call, so take the
+	 * block size from the directory entry rather than from the random
+	 * value picked above.
+	 */
+	bigobj = od[0].od_object;
+	packobj = od[1].od_object;
+	blocksize = od[0].od_blocksize;
+	chunksize = blocksize;
+	ASSERT(chunksize == od[1].od_gen);
+
+	/* Each chunk must be exactly one block and hold two bufwads. */
+	VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+	VERIFY(ISP2(doi.doi_data_block_size));
+	VERIFY(chunksize == doi.doi_data_block_size);
+	VERIFY(chunksize >= 2 * sizeof (bufwad_t));
+
+	/*
+	 * Pick a random index and compute the offsets into packobj and bigobj.
+	 */
+	n = ztest_random(regions) * stride + ztest_random(width);
+	s = 1 + ztest_random(width - 1);
+
+	packoff = n * sizeof (bufwad_t);
+	packsize = s * sizeof (bufwad_t);
+
+	bigoff = n * chunksize;
+	bigsize = s * chunksize;
+
+	/*
+	 * Zero-filled so that a pass which skips the initial read (see
+	 * below) still hands consistent buffers to the comparison.
+	 */
+	packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
+	bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
+
+	/* Two slots per chunk: iteration 5 loans two half-size arcbufs. */
+	bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
+
+	/*
+	 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
+	 * Iteration 1 test zcopy to already referenced dbufs.
+	 * Iteration 2 test zcopy to dirty dbuf in the same txg.
+	 * Iteration 3 test zcopy to dbuf dirty in previous txg.
+	 * Iteration 4 test zcopy when dbuf is no longer dirty.
+	 * Iteration 5 test zcopy when it can't be done.
+	 * Iteration 6 one more zcopy write.
+	 */
+	for (i = 0; i < 7; i++) {
+		uint64_t j;
+		uint64_t off;
+
+		/*
+		 * In iteration 5 (i == 5) use arcbufs
+		 * that don't match bigobj blksz to test
+		 * dmu_assign_arcbuf_by_dbuf() when it can't directly
+		 * assign an arcbuf to a dbuf.
+		 *
+		 * The split is skipped when chunksize is smaller than
+		 * two minimum-size blocks; full-size arcbufs are used
+		 * instead.  The same condition is repeated wherever the
+		 * arcbufs are filled, assigned or returned.
+		 */
+		for (j = 0; j < s; j++) {
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				bigbuf_arcbufs[j] =
+				    dmu_request_arcbuf(bonus_db, chunksize);
+			} else {
+				bigbuf_arcbufs[2 * j] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+				bigbuf_arcbufs[2 * j + 1] =
+				    dmu_request_arcbuf(bonus_db, chunksize / 2);
+			}
+		}
+
+		/*
+		 * Get a tx for the mods to both packobj and bigobj.
+		 */
+		tx = dmu_tx_create(os);
+
+		dmu_tx_hold_write(tx, packobj, packoff, packsize);
+		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0) {
+			/*
+			 * No tx: hand back the arcbufs loaned for this
+			 * pass and release everything else before
+			 * bailing out.
+			 */
+			umem_free(packbuf, packsize);
+			umem_free(bigbuf, bigsize);
+			for (j = 0; j < s; j++) {
+				if (i != 5 ||
+				    chunksize < (SPA_MINBLOCKSIZE * 2)) {
+					dmu_return_arcbuf(bigbuf_arcbufs[j]);
+				} else {
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j]);
+					dmu_return_arcbuf(
+					    bigbuf_arcbufs[2 * j + 1]);
+				}
+			}
+			umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+			umem_free(od, size);
+			dmu_buf_rele(bonus_db, FTAG);
+			return;
+		}
+
+		/*
+		 * 50% of the time don't read objects in the 1st iteration to
+		 * test dmu_assign_arcbuf_by_dbuf() for the case when there are
+		 * no existing dbufs for the specified offsets.
+		 */
+		if (i != 0 || ztest_random(2) != 0) {
+			error = dmu_read(os, packobj, packoff,
+			    packsize, packbuf, DMU_READ_PREFETCH);
+			ASSERT0(error);
+			error = dmu_read(os, bigobj, bigoff, bigsize,
+			    bigbuf, DMU_READ_PREFETCH);
+			ASSERT0(error);
+		}
+		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
+		    n, chunksize, txg);
+
+		/*
+		 * We've verified all the old bufwads, and made new ones.
+		 * Now write them out.
+		 */
+		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+		if (ztest_opts.zo_verbose >= 7) {
+			(void) printf("writing offset %llx size %llx"
+			    " txg %llx\n",
+			    (u_longlong_t)bigoff,
+			    (u_longlong_t)bigsize,
+			    (u_longlong_t)txg);
+		}
+		for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
+			dmu_buf_t *dbt;
+			/* Copy this chunk's new contents into its arcbuf(s). */
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[j]->b_data, chunksize);
+			} else {
+				bcopy((caddr_t)bigbuf + (off - bigoff),
+				    bigbuf_arcbufs[2 * j]->b_data,
+				    chunksize / 2);
+				bcopy((caddr_t)bigbuf + (off - bigoff) +
+				    chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1]->b_data,
+				    chunksize / 2);
+			}
+
+			/*
+			 * Iteration 1: keep our own hold on the dbuf
+			 * across the assignment so that it is already
+			 * referenced (dbt is only set and released when
+			 * i == 1).
+			 */
+			if (i == 1) {
+				VERIFY(dmu_buf_hold(os, bigobj, off,
+				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
+			}
+			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+				    off, bigbuf_arcbufs[j], tx));
+			} else {
+				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+				    off, bigbuf_arcbufs[2 * j], tx));
+				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+				    off + chunksize / 2,
+				    bigbuf_arcbufs[2 * j + 1], tx));
+			}
+			if (i == 1) {
+				dmu_buf_rele(dbt, FTAG);
+			}
+		}
+		dmu_tx_commit(tx);
+
+		/*
+		 * Sanity check the stuff we just wrote.
+		 */
+		{
+			void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+			void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+			VERIFY(0 == dmu_read(os, packobj, packoff,
+			    packsize, packcheck, DMU_READ_PREFETCH));
+			VERIFY(0 == dmu_read(os, bigobj, bigoff,
+			    bigsize, bigcheck, DMU_READ_PREFETCH));
+
+			ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+			ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+			umem_free(packcheck, packsize);
+			umem_free(bigcheck, bigsize);
+		}
+		/*
+		 * Set up the dbuf state the next iteration expects (see
+		 * the table above): after iteration 2 wait for a txg to
+		 * open, after iteration 3 wait for a sync.
+		 * NOTE(review): presumably a txg argument of 0 means
+		 * "relative to the current txg" -- confirm against txg.c.
+		 */
+		if (i == 2) {
+			txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
+		} else if (i == 3) {
+			txg_wait_synced(dmu_objset_pool(os), 0);
+		}
+	}
+
+	dmu_buf_rele(bonus_db, FTAG);
+	umem_free(packbuf, packsize);
+	umem_free(bigbuf, bigsize);
+	umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+	umem_free(od, size);
+}
+
+/*
+ * Issue a random number of ztest_io() calls against one large offset of
+ * the ID_PARALLEL object.  The caller's id is not used.
+ */
+/* ARGSUSED */
+void
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+	uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+
+	/*
+	 * Have multiple threads write to large offsets in an object
+	 * to verify that parallel writes to an object -- even to the
+	 * same blocks within the object -- doesn't cause any trouble.
+	 */
+	ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+
+	/* Free the descriptor on failure too; it used to be leaked here. */
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	/* Keep going until ztest_random(10) returns 0. */
+	while (ztest_random(10) != 0)
+		ztest_io(zd, od->od_object, offset);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Truncate and then preallocate a random range of count blocks in this
+ * test's object, then issue a random series of block-sized zero writes
+ * and ztest_io() calls inside that range.  Stops quietly if the object
+ * cannot be set up, the truncate fails, or a write fails.
+ */
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_od_t *od;
+	uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+	    (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+	uint64_t count = ztest_random(20) + 1;
+	uint64_t blocksize = ztest_random_blocksize();
+	uint64_t length;
+	void *data;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+	    !ztest_random(2)) != 0)
+		goto out;
+
+	/* The same byte length is truncated and then preallocated. */
+	length = count * blocksize;
+
+	if (ztest_truncate(zd, od->od_object, offset, length) != 0)
+		goto out;
+
+	ztest_prealloc(zd, od->od_object, offset, length);
+
+	data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+	for (;;) {
+		uint64_t randoff;
+
+		if (ztest_random(count) == 0)
+			break;
+
+		/* Pick one of the count blocks in the range. */
+		randoff = offset + (ztest_random(count) * blocksize);
+		if (ztest_write(zd, od->od_object, randoff, blocksize,
+		    data) != 0)
+			break;
+
+		while (ztest_random(4) != 0)
+			ztest_io(zd, od->od_object, randoff);
+	}
+
+	umem_free(data, blocksize);
+out:
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */
+#define	ZTEST_ZAP_MIN_INTS	1
+#define	ZTEST_ZAP_MAX_INTS	4
+#define	ZTEST_ZAP_MAX_PROPS	1000
+
+/*
+ * Runs three phases against this test's ZAP object:
+ *
+ *   1. add, look up and remove two names that the comment below
+ *      describes as a known hash collision;
+ *   2. validate, then atomically update, a random txg_N / prop_N pair;
+ *   3. remove another random txg_N / prop_N pair if it exists.
+ *
+ * The function returns early (freeing od) whenever the object cannot be
+ * set up or a tx cannot be assigned.
+ */
+void
+ztest_zap(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t object;
+	uint64_t txg, last_txg;
+	uint64_t value[ZTEST_ZAP_MAX_INTS];
+	uint64_t zl_ints, zl_intsize, prop;
+	int i, ints;
+	dmu_tx_t *tx;
+	char propname[100], txgname[100];
+	int error;
+	char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+	    !ztest_random(2)) != 0)
+		goto out;
+
+	object = od->od_object;
+
+	/*
+	 * Generate a known hash collision, and verify that
+	 * we can lookup and remove both entries.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+	for (i = 0; i < 2; i++) {
+		value[i] = i;
+		VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+		    1, &value[i], tx));
+	}
+	for (i = 0; i < 2; i++) {
+		/* A second add of the same name must be rejected. */
+		VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+		    sizeof (uint64_t), 1, &value[i], tx));
+		VERIFY3U(0, ==,
+		    zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+	}
+	for (i = 0; i < 2; i++) {
+		VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+	}
+	dmu_tx_commit(tx);
+
+	/*
+	 * Generate a bunch of random entries.
+	 *
+	 * The number of integers per property is derived from the object
+	 * number, so every visit to the same object expects the same
+	 * value length.
+	 */
+	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
+
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) snprintf(propname, sizeof (propname), "prop_%llu",
+	    (u_longlong_t)prop);
+	(void) snprintf(txgname, sizeof (txgname), "txg_%llu",
+	    (u_longlong_t)prop);
+	bzero(value, sizeof (value));
+	last_txg = 0;
+
+	/*
+	 * If these zap entries already exist, validate their contents.
+	 */
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+	if (error == 0) {
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, 1);
+
+		VERIFY(zap_lookup(os, object, txgname, zl_intsize,
+		    zl_ints, &last_txg) == 0);
+
+		VERIFY(zap_length(os, object, propname, &zl_intsize,
+		    &zl_ints) == 0);
+
+		ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+		ASSERT3U(zl_ints, ==, ints);
+
+		VERIFY(zap_lookup(os, object, propname, zl_intsize,
+		    zl_ints, value) == 0);
+
+		for (i = 0; i < ints; i++) {
+			ASSERT3U(value[i], ==, last_txg + object + i);
+		}
+	} else {
+		ASSERT3U(error, ==, ENOENT);
+	}
+
+	/*
+	 * Atomically update two entries in our zap object.
+	 * The first is named txg_%llu, and contains the txg
+	 * in which the property was last updated.  The second
+	 * is named prop_%llu, and the nth element of its value
+	 * should be txg + object + n.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+
+	/* The recorded txg can never be later than the one we are in. */
+	if (last_txg > txg) {
+		fatal(0, "zap future leak: old %llu new %llu",
+		    (u_longlong_t)last_txg, (u_longlong_t)txg);
+	}
+
+	for (i = 0; i < ints; i++)
+		value[i] = txg + object + i;
+
+	VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+	    1, &txg, tx));
+	VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+	    ints, value, tx));
+
+	dmu_tx_commit(tx);
+
+	/*
+	 * Remove a random pair of entries.
+	 */
+	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+	(void) snprintf(propname, sizeof (propname), "prop_%llu",
+	    (u_longlong_t)prop);
+	(void) snprintf(txgname, sizeof (txgname), "txg_%llu",
+	    (u_longlong_t)prop);
+
+	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+	/* Nothing to remove if this pair was never written. */
+	if (error == ENOENT)
+		goto out;
+
+	ASSERT0(error);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+	if (txg == 0)
+		goto out;
+	VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+	VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+	dmu_tx_commit(tx);
+out:
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Exercise the upgrade of a microzap to a fatzap by adding many
+ * entries, one per tx, to this test's ZAP object.
+ */
+void
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t object, txg;
+	uint64_t value;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+	    !ztest_random(2)) == 0) {
+		object = od->od_object;
+
+		/*
+		 * Add entries to this ZAP and make sure it spills over
+		 * and gets upgraded to a fatzap. Also, since we are adding
+		 * 2050 entries we should see ptrtbl growth and leaf-block
+		 * split.  The loop counter is also the value stored.
+		 */
+		for (value = 0; value < 2050; value++) {
+			char name[ZFS_MAX_DATASET_NAME_LEN];
+			dmu_tx_t *tx;
+			int error;
+
+			(void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+			    (u_longlong_t)id, (u_longlong_t)value);
+
+			tx = dmu_tx_create(os);
+			dmu_tx_hold_zap(tx, object, B_TRUE, name);
+			txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+			if (txg == 0)
+				break;
+
+			/* The name may be left over from an earlier run. */
+			error = zap_add(os, object, name, sizeof (uint64_t), 1,
+			    &value, tx);
+			ASSERT(error == 0 || error == EEXIST);
+			dmu_tx_commit(tx);
+		}
+	}
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Perform one randomly chosen ZAP operation (length, lookup, add, update
+ * or remove) on a randomly named entry of an ID_PARALLEL ZAP object.
+ * The caller's id is not used.
+ *
+ * 'micro' selects which of two objects is used (it is passed as the
+ * object index to ztest_od_init()) and forces the entry to be a single
+ * uint64_t.
+ * NOTE(review): presumably that keeps the micro object eligible to stay
+ * a microzap -- confirm against the ZAP implementation.
+ */
+/* ARGSUSED */
+void
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+	dmu_tx_t *tx;
+	int i, namelen, error;
+	int micro = ztest_random(2);
+	char name[20], string_value[20];
+	void *data;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	object = od->od_object;
+
+	/*
+	 * Generate a random name of the form 'xxx.....' where each
+	 * x is a random printable character and the dots are dots.
+	 * There are 94 such characters, and the name length goes from
+	 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+	 *
+	 * namelen counts the terminating NUL, so it never exceeds
+	 * sizeof (name).
+	 */
+	namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+	for (i = 0; i < 3; i++)
+		name[i] = '!' + ztest_random('~' - '!' + 1);
+	for (; i < namelen - 1; i++)
+		name[i] = '.';
+	name[i] = '\0';
+
+	/*
+	 * The entry's shape is a function of its name length: odd lengths
+	 * (and every entry of the micro object) hold one uint64_t, the
+	 * txg; even lengths hold the name itself as namelen single bytes.
+	 */
+	if ((namelen & 1) || micro) {
+		wsize = sizeof (txg);
+		wc = 1;
+		data = &txg;
+	} else {
+		wsize = 1;
+		wc = namelen;
+		data = string_value;
+	}
+
+	/* Preset count to a sentinel so we can tell zap_count() set it. */
+	count = -1ULL;
+	VERIFY0(zap_count(os, object, &count));
+	ASSERT(count != -1ULL);
+
+	/*
+	 * Select an operation: length, lookup, add, update, remove.
+	 */
+	i = ztest_random(5);
+
+	/* Only the modifying operations (add, update, remove) need a tx. */
+	if (i >= 2) {
+		tx = dmu_tx_create(os);
+		dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+		txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+		if (txg == 0) {
+			umem_free(od, sizeof (ztest_od_t));
+			return;
+		}
+		bcopy(name, string_value, namelen);
+	} else {
+		tx = NULL;
+		txg = 0;
+		bzero(string_value, namelen);
+	}
+
+	switch (i) {
+
+	case 0:
+		/* length: if the entry exists it must have the shape above. */
+		error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+		if (error == 0) {
+			ASSERT3U(wsize, ==, zl_wsize);
+			ASSERT3U(wc, ==, zl_wc);
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+		break;
+
+	case 1:
+		/* lookup: a string-valued entry must equal its own name. */
+		error = zap_lookup(os, object, name, wsize, wc, data);
+		if (error == 0) {
+			if (data == string_value &&
+			    bcmp(name, data, namelen) != 0)
+				fatal(0, "name '%s' != val '%s' len %d",
+				    name, data, namelen);
+		} else {
+			ASSERT3U(error, ==, ENOENT);
+		}
+		break;
+
+	case 2:
+		/* add: the entry may already exist. */
+		error = zap_add(os, object, name, wsize, wc, data, tx);
+		ASSERT(error == 0 || error == EEXIST);
+		break;
+
+	case 3:
+		/* update: must succeed whether or not the entry exists. */
+		VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+		break;
+
+	case 4:
+		/* remove: the entry may not exist. */
+		error = zap_remove(os, object, name, tx);
+		ASSERT(error == 0 || error == ENOENT);
+		break;
+	}
+
+	if (tx != NULL)
+		dmu_tx_commit(tx);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Commit callback data.
+ *
+ * One of these is allocated per registered commit callback and passed
+ * to ztest_commit_callback() as its argument.
+ */
+typedef struct ztest_cb_data {
+	list_node_t		zcd_node;	/* link in zcl.zcl_callbacks */
+	uint64_t		zcd_txg;	/* tx's txg; 0 if not assigned */
+	int			zcd_expected_err; /* error the callback must get */
+	boolean_t		zcd_added;	/* inserted into the list */
+	boolean_t		zcd_called;	/* callback has already run */
+	spa_t			*zcd_spa;	/* pool, for last synced txg */
+} ztest_cb_data_t;
+
+/*
+ * This is the actual commit callback function.
+ *
+ * arg is the ztest_cb_data_t registered with the tx and error is the
+ * status the callback is invoked with.  The callback checks that it
+ * received the expected error, that it runs only once, and that it does
+ * not run before its txg has synced.  On the non-cancelled path it also
+ * unlinks the record from zcl.zcl_callbacks and frees it.
+ */
+static void
+ztest_commit_callback(void *arg, int error)
+{
+	ztest_cb_data_t *data = arg;
+	uint64_t synced_txg;
+
+	VERIFY(data != NULL);
+	VERIFY3S(data->zcd_expected_err, ==, error);
+	VERIFY(!data->zcd_called);
+
+	synced_txg = spa_last_synced_txg(data->zcd_spa);
+	if (data->zcd_txg > synced_txg)
+		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+		    synced_txg);
+
+	data->zcd_called = B_TRUE;
+
+	if (error == ECANCELED) {
+		/* A cancelled tx has no txg and was never put on the list. */
+		ASSERT0(data->zcd_txg);
+		ASSERT(!data->zcd_added);
+
+		/*
+		 * The private callback data should be destroyed here, but
+		 * since we are going to check the zcd_called field after
+		 * dmu_tx_abort(), we will destroy it there.
+		 */
+		return;
+	}
+
+	ASSERT(data->zcd_added);
+	ASSERT3U(data->zcd_txg, !=, 0);
+
+	(void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+	/* See if this cb was called more quickly */
+	if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
+		zc_min_txg_delay = synced_txg - data->zcd_txg;
+
+	/* Remove our callback from the list */
+	list_remove(&zcl.zcl_callbacks, data);
+
+	(void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+	/* The record is ours to free once it is off the list. */
+	umem_free(data, sizeof (ztest_cb_data_t));
+}
+
+/*
+ * Return a newly allocated, zero-filled callback record for the given
+ * txg, bound to the pool that owns os and with its list node
+ * initialized.  The remaining fields stay zero.
+ */
+static ztest_cb_data_t *
+ztest_create_cb_data(objset_t *os, uint64_t txg)
+{
+	ztest_cb_data_t *zcd = umem_zalloc(sizeof (*zcd), UMEM_NOFAIL);
+
+	list_link_init(&zcd->zcd_node);
+	zcd->zcd_spa = dmu_objset_spa(os);
+	zcd->zcd_txg = txg;
+
+	return (zcd);
+}
+
+/*
+ * Commit callback test.
+ *
+ * Registers three commit callbacks on one tx: before the tx is
+ * assigned, right after the assignment attempt, and once more if the
+ * assignment succeeded.  About one call in a hundred skips the
+ * assignment on purpose so that the abort path is exercised.  On the
+ * success path the callback records are inserted into the txg-ordered
+ * zcl.zcl_callbacks list, from which ztest_commit_callback() removes
+ * and frees them.
+ */
+void
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	ztest_od_t *od;
+	dmu_tx_t *tx;
+	ztest_cb_data_t *cb_data[3], *tmp_cb;
+	uint64_t old_txg, txg;
+	int i, error = 0;
+
+	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+
+	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	tx = dmu_tx_create(os);
+
+	/* First callback: registered before the tx has a txg. */
+	cb_data[0] = ztest_create_cb_data(os, 0);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
+
+	/* Every once in a while, abort the transaction on purpose */
+	if (ztest_random(100) == 0)
+		error = -1;
+
+	if (!error)
+		error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+	txg = error ? 0 : dmu_tx_get_txg(tx);
+
+	/* Second callback: registered after the assignment attempt. */
+	cb_data[0]->zcd_txg = txg;
+	cb_data[1] = ztest_create_cb_data(os, txg);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
+	if (error) {
+		/*
+		 * It's not a strict requirement to call the registered
+		 * callbacks from inside dmu_tx_abort(), but that's what
+		 * is supposed to happen in the current implementation,
+		 * so we will check for that.
+		 */
+		for (i = 0; i < 2; i++) {
+			cb_data[i]->zcd_expected_err = ECANCELED;
+			VERIFY(!cb_data[i]->zcd_called);
+		}
+
+		dmu_tx_abort(tx);
+
+		/*
+		 * The callback leaves cancelled records alone, so they
+		 * are freed here after zcd_called has been checked.
+		 */
+		for (i = 0; i < 2; i++) {
+			VERIFY(cb_data[i]->zcd_called);
+			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+		}
+
+		umem_free(od, sizeof (ztest_od_t));
+		return;
+	}
+
+	/* Third callback: registered on an assigned tx. */
+	cb_data[2] = ztest_create_cb_data(os, txg);
+	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
+
+	/*
+	 * Read existing data to make sure there isn't a future leak.
+	 */
+	VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
+	    &old_txg, DMU_READ_PREFETCH));
+
+	if (old_txg > txg)
+		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+		    old_txg, txg);
+
+	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
+
+	(void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+	/*
+	 * Since commit callbacks don't have any ordering requirement and since
+	 * it is theoretically possible for a commit callback to be called
+	 * after an arbitrary amount of time has elapsed since its txg has been
+	 * synced, it is difficult to reliably determine whether a commit
+	 * callback hasn't been called due to high load or due to a flawed
+	 * implementation.
+	 *
+	 * In practice, we will assume that if after a certain number of txgs a
+	 * commit callback hasn't been called, then most likely there's an
+	 * implementation bug.
+	 */
+	tmp_cb = list_head(&zcl.zcl_callbacks);
+	if (tmp_cb != NULL &&
+	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
+		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+	}
+
+	/*
+	 * Let's find the place to insert our callbacks.
+	 *
+	 * Even though the list is ordered by txg, it is possible for the
+	 * insertion point to not be the end because our txg may already be
+	 * quiescing at this point and other callbacks in the open txg
+	 * (from other objsets) may have sneaked in.
+	 */
+	tmp_cb = list_tail(&zcl.zcl_callbacks);
+	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+	/*
+	 * Add the 3 callbacks to the list.  Each one goes right after the
+	 * previous one, so they stay together and in registration order.
+	 */
+	for (i = 0; i < 3; i++) {
+		if (tmp_cb == NULL)
+			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+		else
+			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+			    cb_data[i]);
+
+		cb_data[i]->zcd_added = B_TRUE;
+		VERIFY(!cb_data[i]->zcd_called);
+
+		tmp_cb = cb_data[i];
+	}
+
+	zc_cb_counter += 3;
+
+	(void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+	/* Commit only after the records are on the list. */
+	dmu_tx_commit(tx);
+
+	umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Visit each object in the dataset.  For every object whose bonus
+ * buffer holds a block tag with the expected magic, verify that the
+ * tag is consistent with what was stored in it when the object was
+ * created, and that the unused bonus buffer space has not been
+ * overwritten.  Objects whose bonus buffer cannot be held are skipped.
+ */
+/* ARGSUSED */
+void
+ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+	uint64_t obj;
+	int err;
+
+	for (obj = 0, err = 0; err == 0;
+	    err = dmu_object_next(os, &obj, FALSE, 0)) {
+		dmu_object_info_t doi;
+		dmu_buf_t *db;
+
+		ztest_object_lock(zd, obj, RL_READER);
+
+		if (dmu_bonus_hold(os, obj, FTAG, &db) == 0) {
+			ztest_block_tag_t *bt = NULL;
+
+			/* Only look for a tag if the bonus can hold one. */
+			dmu_object_info_from_db(db, &doi);
+			if (doi.doi_bonus_size >= sizeof (*bt))
+				bt = ztest_bt_bonus(db);
+
+			if (bt != NULL && bt->bt_magic == BT_MAGIC) {
+				ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
+				    bt->bt_offset, bt->bt_gen, bt->bt_txg,
+				    bt->bt_crtxg);
+				ztest_verify_unused_bonus(db, bt, obj, os,
+				    bt->bt_gen);
+			}
+
+			dmu_buf_rele(db, FTAG);
+		}
+
+		ztest_object_unlock(zd, obj);
+	}
+}
+
+/*
+ * Set a handful of dataset properties to random values while holding
+ * ztest_name_lock as a reader.  Failures setting checksum, compression,
+ * copies and dedup are ignored; setting recordsize must succeed.
+ */
+/* ARGSUSED */
+void
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+	zfs_prop_t proplist[] = {
+		ZFS_PROP_CHECKSUM,
+		ZFS_PROP_COMPRESSION,
+		ZFS_PROP_COPIES,
+		ZFS_PROP_DEDUP
+	};
+	const int nprops = sizeof (proplist) / sizeof (proplist[0]);
+	int p = 0;
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	while (p < nprops) {
+		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+		p++;
+	}
+
+	VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE,
+	    ztest_random_blocksize(), (int)ztest_random(2)));
+
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Set the pool's autotrim property to a random 0/1 value (ignoring the
+ * result), then fetch the pool properties and, at verbosity 6 or
+ * higher, dump them.  Runs with ztest_name_lock held as a reader.
+ */
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+	nvlist_t *props;
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));
+
+	props = NULL;
+	VERIFY0(spa_prop_get(ztest_spa, &props));
+
+	if (ztest_opts.zo_verbose >= 6) {
+		dump_nvlist(props, 4);
+	}
+
+	nvlist_free(props);
+
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Release the single user hold 'holdname' on snapshot 'snapname'.
+ * Returns whatever dsl_dataset_user_release() returns.
+ */
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+	nvlist_t *holds = fnvlist_alloc();
+	nvlist_t *snaps = fnvlist_alloc();
+	int error;
+
+	/* Build { snapname -> { holdname } }. */
+	fnvlist_add_boolean(holds, holdname);
+	fnvlist_add_nvlist(snaps, snapname, holds);
+
+	/*
+	 * The inner list is freed before snaps is used.
+	 * NOTE(review): presumably fnvlist_add_nvlist() stores a copy.
+	 */
+	fnvlist_free(holds);
+
+	error = dsl_dataset_user_release(snaps, NULL);
+	fnvlist_free(snaps);
+
+	return (error);
+}
+
+/*
+ * Test snapshot hold/release and deferred destroy.
+ *
+ * The whole test runs with ztest_name_lock held as reader.  All names are
+ * derived from the objset name and id: snapshot "sh1_<id>", clone
+ * "ch1_<id>" and hold tag "tag_<id>".  ENOSPC from creating the snapshot,
+ * the clone or the hold is recorded via ztest_record_enospc() and ends the
+ * test early; any other unexpected result is reported through fatal() or
+ * an assertion.
+ */
+void
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
+{
+	int error;
+	objset_t *os = zd->zd_os;
+	objset_t *origin;	/* only used as an out-parameter below */
+	char snapname[100];	/* short snapshot name, "sh1_<id>" */
+	char fullname[100];	/* "<osname>@<snapname>" */
+	char clonename[100];	/* "<osname>/ch1_<id>" */
+	char tag[100];		/* user hold tag, "tag_<id>" */
+	char osname[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *holds;
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	dmu_objset_name(os, osname);
+
+	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
+	    (u_longlong_t)id);
+	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+	(void) snprintf(clonename, sizeof (clonename),
+	    "%s/ch1_%llu", osname, (u_longlong_t)id);
+	(void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);
+
+	/*
+	 * Clean up from any previous run.  Each step tolerates the object
+	 * not existing (ENOENT, and additionally ESRCH for the hold
+	 * release); anything else trips ASSERT0.
+	 */
+	error = dsl_destroy_head(clonename);
+	if (error != ENOENT)
+		ASSERT0(error);
+	error = user_release_one(fullname, tag);
+	if (error != ESRCH && error != ENOENT)
+		ASSERT0(error);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != ENOENT)
+		ASSERT0(error);
+
+	/*
+	 * Create snapshot, clone it, mark snap for deferred destroy,
+	 * destroy clone, verify snap was also destroyed.
+	 */
+	error = dmu_objset_snapshot_one(osname, snapname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+	}
+
+	error = dmu_objset_clone(clonename, fullname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_clone");
+			goto out;
+		}
+		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_TRUE);	/* deferred */
+	if (error) {
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+		    fullname, error);
+	}
+
+	error = dsl_destroy_head(clonename);
+	if (error)
+		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
+
+	error = dmu_objset_hold(fullname, FTAG, &origin);
+	if (error != ENOENT)	/* the snapshot must be gone by now */
+		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
+
+	/*
+	 * Create snapshot, add temporary hold, verify that we can't
+	 * destroy a held snapshot, mark for deferred destroy,
+	 * release hold, verify snapshot was destroyed.
+	 */
+	error = dmu_objset_snapshot_one(osname, snapname);
+	if (error) {
+		if (error == ENOSPC) {
+			ztest_record_enospc("dmu_objset_snapshot");
+			goto out;
+		}
+		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+	}
+
+	holds = fnvlist_alloc();
+	fnvlist_add_string(holds, fullname, tag);
+	error = dsl_dataset_user_hold(holds, 0, NULL);
+	fnvlist_free(holds);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc("dsl_dataset_user_hold");
+		goto out;
+	} else if (error) {
+		fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+		    fullname, tag, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != EBUSY) {	/* an immediate destroy must be refused */
+		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
+		    fullname, error);
+	}
+
+	error = dsl_destroy_snapshot(fullname, B_TRUE);	/* deferred */
+	if (error) {
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+		    fullname, error);
+	}
+
+	error = user_release_one(fullname, tag);
+	if (error)
+		fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
+
+	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
+
+out:
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Inject random faults into the on-disk data.
+ *
+ * Does nothing while device removal is active or before the pool has been
+ * scrubbed.  Otherwise it picks either a regular/slog top-level vdev or an
+ * l2cache device, may mark the first leaf unreadable or unwritable or
+ * close its file descriptor, may online or offline that leaf, and finally
+ * overwrites words of the chosen leaf's backing file with a fixed bad
+ * pattern at offsets selected as described in the loop below.
+ */
+/* ARGSUSED */
+void
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
+{
+	ztest_shared_t *zs = ztest_shared;
+	spa_t *spa = ztest_spa;
+	int fd;
+	uint64_t offset;
+	uint64_t leaves;
+	uint64_t bad = 0x1990c0ffeedecadeull;	/* word written as damage */
+	uint64_t top, leaf;
+	char *path0;	/* path of the first leaf (or the l2cache device) */
+	char *pathrand;	/* path of the leaf that gets corrupted */
+	size_t fsize;
+	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* log2 of per-leaf stride */
+	int iters = 1000;	/* loop below makes iters - 1 passes */
+	int maxfaults;
+	int mirror_save;	/* zs_mirrors as sampled under the lock */
+	vdev_t *vd0 = NULL;
+	uint64_t guid0 = 0;	/* stays 0 unless a vdev was selected */
+	boolean_t islog = B_FALSE;
+
+	path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+	mutex_enter(&ztest_vdev_lock);
+
+	/*
+	 * Device removal is in progress, fault injection must be disabled
+	 * until it completes and the pool is scrubbed.  The fault injection
+	 * strategy for damaging blocks does not take into account evacuated
+	 * blocks which may have already been damaged.
+	 */
+	if (ztest_device_removal_active) {
+		mutex_exit(&ztest_vdev_lock);
+		goto out;
+	}
+
+	maxfaults = MAXFAULTS(zs);
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+	mirror_save = zs->zs_mirrors;
+	mutex_exit(&ztest_vdev_lock);
+
+	ASSERT(leaves >= 1);
+
+	/*
+	 * While ztest is running the number of leaves will not change.  This
+	 * is critical for the fault injection logic as it determines where
+	 * errors can be safely injected such that they are always repairable.
+	 *
+	 * When restarting ztest a different number of leaves may be requested
+	 * which will shift the regions to be damaged.  This is fine as long
+	 * as the pool has been scrubbed prior to using the new mapping.
+	 * Failure to do so can result in non-repairable damage being injected.
+	 */
+	if (ztest_pool_scrubbed == B_FALSE)
+		goto out;
+
+	/*
+	 * Grab the name lock as reader. There are some operations
+	 * which don't like to have their vdevs changed while
+	 * they are in progress (i.e. spa_change_guid). Those
+	 * operations will have grabbed the name lock as writer.
+	 */
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	/*
+	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
+	 */
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+	if (ztest_random(2) == 0) {
+		/*
+		 * Inject errors on a normal data device or slog device.
+		 */
+		top = ztest_random_vdev_top(spa, B_TRUE);
+		leaf = ztest_random(leaves) + zs->zs_splits;
+
+		/*
+		 * Generate paths to the first leaf in this top-level vdev,
+		 * and to the random leaf we selected.  We'll induce transient
+		 * write failures and random online/offline activity on leaf 0,
+		 * and we'll write random garbage to the randomly chosen leaf.
+		 */
+		(void) snprintf(path0, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + zs->zs_splits);
+		(void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
+
+		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+			islog = B_TRUE;
+
+		/*
+		 * If the top-level vdev needs to be resilvered
+		 * then we only allow faults on the device that is
+		 * resilvering.
+		 */
+		if (vd0 != NULL && maxfaults != 1 &&
+		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+		    vd0->vdev_resilver_txg != 0)) {
+			/*
+			 * Make vd0 explicitly claim to be unreadable,
+			 * or unwriteable, or reach behind its back
+			 * and close the underlying fd.  We can do this if
+			 * maxfaults == 0 because we'll fail and reexecute,
+			 * and we can do it if maxfaults >= 2 because we'll
+			 * have enough redundancy.  If maxfaults == 1, the
+			 * combination of this with injection of random data
+			 * corruption below exceeds the pool's fault tolerance.
+			 */
+			vdev_file_t *vf = vd0->vdev_tsd;
+
+			zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
+			    (long long)vd0->vdev_id, (int)maxfaults);
+
+			if (vf != NULL && ztest_random(3) == 0) {
+				(void) close(vf->vf_file->f_fd);
+				vf->vf_file->f_fd = -1;
+			} else if (ztest_random(2) == 0) {
+				vd0->vdev_cant_read = B_TRUE;
+			} else {
+				vd0->vdev_cant_write = B_TRUE;
+			}
+			guid0 = vd0->vdev_guid;
+		}
+	} else {
+		/*
+		 * Inject errors on an l2cache device.
+		 */
+		spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+		if (sav->sav_count == 0) {
+			spa_config_exit(spa, SCL_STATE, FTAG);
+			(void) pthread_rwlock_unlock(&ztest_name_lock);
+			goto out;
+		}
+		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		guid0 = vd0->vdev_guid;
+		(void) strcpy(path0, vd0->vdev_path);
+		(void) strcpy(pathrand, vd0->vdev_path);
+
+		leaf = 0;
+		leaves = 1;
+		maxfaults = INT_MAX;	/* no limit on cache devices */
+	}
+
+	spa_config_exit(spa, SCL_STATE, FTAG);
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	/*
+	 * If we can tolerate two or more faults, or we're dealing
+	 * with a slog, randomly online/offline vd0.
+	 */
+	if ((maxfaults >= 2 || islog) && guid0 != 0) {
+		if (ztest_random(10) < 6) {
+			int flags = (ztest_random(2) == 0 ?
+			    ZFS_OFFLINE_TEMPORARY : 0);
+
+			/*
+			 * We have to grab the zs_name_lock as writer to
+			 * prevent a race between offlining a slog and
+			 * destroying a dataset. Offlining the slog will
+			 * grab a reference on the dataset which may cause
+			 * dsl_destroy_head() to fail with EBUSY thus
+			 * leaving the dataset in an inconsistent state.
+			 */
+			if (islog)
+				(void) pthread_rwlock_wrlock(&ztest_name_lock);
+
+			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+			if (islog)
+				(void) pthread_rwlock_unlock(&ztest_name_lock);
+		} else {
+			/*
+			 * Ideally we would like to be able to randomly
+			 * call vdev_[on|off]line without holding locks
+			 * to force unpredictable failures but the side
+			 * effects of vdev_[on|off]line prevent us from
+			 * doing so. We grab the ztest_vdev_lock here to
+			 * prevent a race between injection testing and
+			 * aux_vdev removal.
+			 */
+			mutex_enter(&ztest_vdev_lock);
+			(void) vdev_online(spa, guid0, 0, NULL);
+			mutex_exit(&ztest_vdev_lock);
+		}
+	}
+
+	if (maxfaults == 0)
+		goto out;
+
+	/*
+	 * We have at least single-fault tolerance, so inject data corruption.
+	 */
+	fd = open(pathrand, O_RDWR);
+
+	if (fd == -1) /* we hit a gap in the device namespace */
+		goto out;
+
+	fsize = lseek(fd, 0, SEEK_END);
+
+	while (--iters != 0) {
+		/*
+		 * The offset must be chosen carefully to ensure that
+		 * we do not inject a given logical block with errors
+		 * on two different leaf devices, because ZFS can not
+		 * tolerate that (if maxfaults==1).
+		 *
+		 * To achieve this we divide each leaf device into
+		 * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4).
+		 * Each chunk is further divided into error-injection
+		 * ranges (can accept errors) and clear ranges (we do
+		 * not inject errors in those). Each error-injection
+		 * range can accept errors only for a single leaf vdev.
+		 * Error-injection ranges are separated by clear ranges.
+		 *
+		 * For example, with 3 leaves, each chunk looks like:
+		 *    0 to  32M: injection range for leaf 0
+		 *  32M to  64M: clear range - no injection allowed
+		 *  64M to  96M: injection range for leaf 1
+		 *  96M to 128M: clear range - no injection allowed
+		 * 128M to 160M: injection range for leaf 2
+		 * 160M to 192M: clear range - no injection allowed
+		 *
+		 * Each clear range must be large enough such that a
+		 * single block cannot straddle it. This way a block
+		 * can't be a target in two different injection ranges
+		 * (on different leaf vdevs).
+		 *
+		 * The three terms below are: start of a random chunk,
+		 * start of this leaf's slot within the chunk, and a
+		 * random 8-byte-aligned position within the first half
+		 * of that slot.
+		 */
+		offset = ztest_random(fsize / (leaves << bshift)) *
+		    (leaves << bshift) + (leaf << bshift) +
+		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
+
+		/*
+		 * Only allow damage to the labels at one end of the vdev.
+		 *
+		 * If all labels are damaged, the device will be totally
+		 * inaccessible, which will result in loss of data,
+		 * because we also damage (parts of) the other side of
+		 * the mirror/raidz.
+		 *
+		 * Additionally, we will always have both an even and an
+		 * odd label, so that we can handle crashes in the
+		 * middle of vdev_config_sync().
+		 */
+		if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
+			continue;
+
+		/*
+		 * The two end labels are stored at the "end" of the disk, but
+		 * the end of the disk (vdev_psize) is aligned to
+		 * sizeof (vdev_label_t).
+		 */
+		uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
+		if ((leaf & 1) == 1 &&
+		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
+			continue;
+
+		mutex_enter(&ztest_vdev_lock);
+		if (mirror_save != zs->zs_mirrors) {	/* layout changed */
+			mutex_exit(&ztest_vdev_lock);
+			(void) close(fd);
+			goto out;
+		}
+
+		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+			fatal(1, "can't inject bad word at 0x%llx in %s",
+			    offset, pathrand);
+
+		mutex_exit(&ztest_vdev_lock);
+
+		if (ztest_opts.zo_verbose >= 7)
+			(void) printf("injected bad word into %s,"
+			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+	}
+
+	(void) close(fd);
+out:
+	umem_free(path0, MAXPATHLEN);
+	umem_free(pathrand, MAXPATHLEN);
+}
+
+/*
+ * By design ztest will never inject uncorrectable damage into the pool.
+ * Issue a scrub, wait for it to complete, and verify there is never
+ * any persistent damage.
+ *
+ * Only after a full scrub has been completed is it safe to start injecting
+ * data corruption.  See the comment in ztest_fault_inject().
+ *
+ * Returns the spa_scan() error if the scrub could not be started, ECKSUM
+ * if the error log is non-empty once scrubbing has stopped, and 0
+ * otherwise, in which case ztest_pool_scrubbed is set to B_TRUE.
+ */
+static int
+ztest_scrub_impl(spa_t *spa)
+{
+	int rc;
+
+	rc = spa_scan(spa, POOL_SCAN_SCRUB);
+	if (rc != 0)
+		return (rc);
+
+	/* Keep syncing txgs for as long as a scrub is reported running. */
+	for (;;) {
+		if (!dsl_scan_scrubbing(spa_get_dsl(spa)))
+			break;
+		txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	/* Anything left in the error log counts as persistent damage. */
+	if (spa_get_errlog_size(spa) > 0) {
+		return (ECKSUM);
+	}
+
+	ztest_pool_scrubbed = B_TRUE;
+	return (0);
+}
+
+/*
+ * Scrub the pool.
+ *
+ * Skipped entirely while device removal is active.  Otherwise a scrub is
+ * requested, and after a 100 ms pause ztest_scrub_impl() requests another
+ * one and waits for it; EBUSY from that call is tolerated, any other
+ * error trips ASSERT0.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	int error;
+	int rc;
+
+	/* Device removal is scrubbing the pool already. */
+	if (ztest_device_removal_active) {
+		return;
+	}
+
+	/* Kick off a scrub and give it a moment ... */
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+	(void) poll(NULL, 0, 100);
+
+	/* ... then force a restart and wait for the result. */
+	rc = ztest_scrub_impl(spa);
+	error = (rc == EBUSY) ? 0 : rc;
+	ASSERT0(error);
+}
+
+/*
+ * Change the guid for the pool.
+ *
+ * Not attempted when the MMP test option is set.  spa_change_guid() is
+ * called with ztest_name_lock held as writer; if it fails the test simply
+ * ends.  On success the pool guid must differ from the one sampled
+ * beforehand while the load guid must be unchanged.
+ */
+/* ARGSUSED */
+void
+ztest_reguid(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	uint64_t orig, load;
+	int rc;
+
+	if (ztest_opts.zo_mmp_test) {
+		return;
+	}
+
+	/* Remember both guids as they were before the change. */
+	orig = spa_guid(spa);
+	load = spa_load_guid(spa);
+
+	(void) pthread_rwlock_wrlock(&ztest_name_lock);
+	rc = spa_change_guid(spa);
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	if (rc == 0) {
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Changed guid old %llu -> %llu\n",
+			    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
+		}
+
+		VERIFY3U(orig, !=, spa_guid(spa));
+		VERIFY3U(load, ==, spa_load_guid(spa));
+	}
+}
+
+/*
+ * Cross-check the fletcher-4 implementations.
+ *
+ * Until roughly one second (NANOSEC) has elapsed: fill a buffer of
+ * ztest_random_blocksize() bytes with random words, copy it into two ABDs
+ * (allocated with B_FALSE and B_TRUE as the second abd_alloc() argument),
+ * compute native and byteswapped reference checksums with the "scalar"
+ * implementation, then switch to "cycle" and verify 100 times that the
+ * buffer and both ABDs produce the same checksums.
+ */
+void
+ztest_fletcher(ztest_ds_t *zd, uint64_t id)
+{
+	const hrtime_t end = gethrtime() + NANOSEC;
+
+	while (gethrtime() <= end) {
+		zio_cksum_t zc_ref;
+		zio_cksum_t zc_ref_byteswap;
+		struct abd *abd_data, *abd_meta;
+		uint32_t size;
+		void *buf;
+		int *words;
+		int run_count;
+		int i;
+
+		size = ztest_random_blocksize();
+
+		buf = umem_alloc(size, UMEM_NOFAIL);
+		abd_data = abd_alloc(size, B_FALSE);
+		abd_meta = abd_alloc(size, B_TRUE);
+
+		/* Random payload, one int at a time. */
+		words = buf;
+		for (i = 0; i < size / sizeof (*words); i++)
+			words[i] = ztest_random(UINT_MAX);
+
+		abd_copy_from_buf_off(abd_data, buf, 0, size);
+		abd_copy_from_buf_off(abd_meta, buf, 0, size);
+
+		/* Reference values come from the scalar implementation. */
+		VERIFY0(fletcher_4_impl_set("scalar"));
+		fletcher_4_native(buf, size, NULL, &zc_ref);
+		fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap);
+
+		VERIFY0(fletcher_4_impl_set("cycle"));
+		for (run_count = 100; run_count > 0; run_count--) {
+			zio_cksum_t zc;
+			zio_cksum_t zc_byteswap;
+
+			/* Linear buffer. */
+			fletcher_4_byteswap(buf, size, NULL, &zc_byteswap);
+			fletcher_4_native(buf, size, NULL, &zc);
+
+			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+			    sizeof (zc_byteswap)));
+
+			/* ABD allocated with B_FALSE (data). */
+			abd_fletcher_4_byteswap(abd_data, size, NULL,
+			    &zc_byteswap);
+			abd_fletcher_4_native(abd_data, size, NULL, &zc);
+
+			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+			    sizeof (zc_byteswap)));
+
+			/* ABD allocated with B_TRUE (metadata). */
+			abd_fletcher_4_byteswap(abd_meta, size, NULL,
+			    &zc_byteswap);
+			abd_fletcher_4_native(abd_meta, size, NULL, &zc);
+
+			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+			    sizeof (zc_byteswap)));
+		}
+
+		umem_free(buf, size);
+		abd_free(abd_data);
+		abd_free(abd_meta);
+	}
+}
+
+/*
+ * Check the incremental fletcher-4 interface against the one-shot one.
+ *
+ * Until roughly one second (NANOSEC) has elapsed: fill a buffer of
+ * ztest_random_blocksize() bytes with random words, compute reference
+ * checksums with the "scalar" implementation, switch to "cycle", and then
+ * 100 times feed the buffer to the incremental functions, first in
+ * randomly sized pieces and then in a single call, verifying that both
+ * approaches reproduce the reference checksums.
+ */
+void
+ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
+{
+	zio_cksum_t zc_ref;
+	zio_cksum_t zc_ref_bswap;
+	const hrtime_t end = gethrtime() + NANOSEC;
+
+	while (gethrtime() <= end) {
+		size_t size;
+		void *buf;
+		int *words;
+		int run_count;
+		int i;
+
+		size = ztest_random_blocksize();
+		buf = umem_alloc(size, UMEM_NOFAIL);
+
+		/* Random payload, one int at a time. */
+		words = buf;
+		for (i = 0; i < size / sizeof (*words); i++)
+			words[i] = ztest_random(UINT_MAX);
+
+		VERIFY0(fletcher_4_impl_set("scalar"));
+		fletcher_4_native(buf, size, NULL, &zc_ref);
+		fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap);
+
+		VERIFY0(fletcher_4_impl_set("cycle"));
+
+		for (run_count = 100; run_count > 0; run_count--) {
+			zio_cksum_t zc;
+			zio_cksum_t zc_bswap;
+			size_t pos = 0;
+
+			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
+
+			/* Piecewise pass over the whole buffer. */
+			while (pos < size) {
+				size_t inc = 64 * ztest_random(size / 67);
+
+				/* sometimes add few bytes to test non-simd */
+				if (ztest_random(100) < 10) {
+					inc += P2ALIGN(ztest_random(64),
+					    sizeof (uint32_t));
+				}
+
+				/* Never step past the end of the buffer. */
+				if (inc > (size - pos)) {
+					inc = size - pos;
+				}
+
+				fletcher_4_incremental_native(buf + pos, inc,
+				    &zc);
+				fletcher_4_incremental_byteswap(buf + pos, inc,
+				    &zc_bswap);
+
+				pos += inc;
+			}
+
+			VERIFY3U(pos, ==, size);
+
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+
+			/*
+			 * A single incremental call over the entire buffer
+			 * has to match the non-incremental result as well.
+			 */
+			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
+
+			fletcher_4_incremental_native(buf, size, &zc);
+			fletcher_4_incremental_byteswap(buf, size, &zc_bswap);
+
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+		}
+
+		umem_free(buf, size);
+	}
+}
+
+/*
+ * Report whether path can be stat()ed: returns 1 when stat() succeeds
+ * and 0 when it fails.
+ */
+static int
+ztest_check_path(char *path)
+{
+	struct stat sb;
+
+	return (stat(path, &sb) == 0);
+}
+
+/*
+ * Store in bin (a buffer of len bytes) the zdb command to execute.
+ *
+ * Order of preference:
+ *   1. $ZDB_PATH, which must exist or the run is aborted via fatal();
+ *   2. the in-tree binary: the resolved path of this executable with
+ *      everything from "/ztest/" onwards replaced by "/zdb/zdb", if
+ *      that file exists;
+ *   3. plain "zdb", leaving the lookup to popen()'s PATH search.
+ */
+static void
+ztest_get_zdb_bin(char *bin, int len)
+{
+	char *env_path = getenv("ZDB_PATH");
+	char *cut;
+
+	if (env_path != NULL) {
+		/* In env */
+		strlcpy(bin, env_path, len);
+		if (!ztest_check_path(bin)) {
+			ztest_dump_core = 0;
+			fatal(1, "invalid ZDB_PATH '%s'", bin);
+		}
+		return;
+	}
+
+	VERIFY(realpath(getexecname(), bin) != NULL);
+
+	cut = strstr(bin, "/ztest/");
+	if (cut != NULL) {
+		/* In-tree */
+		*cut = '\0';
+		strcat(bin, "/zdb/zdb");
+		if (ztest_check_path(bin))
+			return;
+	}
+
+	strcpy(bin, "zdb");
+}
+
+/*
+ * Walk from vd down to a randomly chosen leaf and return it.
+ *
+ * At each level the candidates are the children whose top-level vdev is
+ * not being removed and which either have children of their own or are
+ * concrete and not detached.  At least one candidate must exist at every
+ * level (VERIFY).  Returns NULL only when vd itself is NULL, and vd
+ * itself when it has no children.
+ */
+static vdev_t *
+ztest_random_concrete_vdev_leaf(vdev_t *vd)
+{
+	/* Descend one level per pass until a childless vdev is reached. */
+	while (vd != NULL && vd->vdev_children != 0) {
+		vdev_t *eligible[vd->vdev_children];
+		int eligible_idx = 0;
+		int c;
+
+		for (c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			if (cvd->vdev_top->vdev_removing)
+				continue;
+
+			/* A leaf qualifies only if concrete and attached. */
+			if (cvd->vdev_children == 0 &&
+			    (!vdev_is_concrete(cvd) || cvd->vdev_detached))
+				continue;
+
+			eligible[eligible_idx++] = cvd;
+		}
+		VERIFY(eligible_idx > 0);
+
+		vd = eligible[ztest_random(eligible_idx)];
+	}
+
+	return (vd);
+}
+
+/*
+ * Issue a random initialize command against a random leaf vdev.
+ *
+ * A leaf is picked with ztest_random_concrete_vdev_leaf() while holding
+ * SCL_VDEV as reader; its guid, path and "initialize thread exists" state
+ * are copied before the config lock is dropped.  A command number drawn
+ * from ztest_random(POOL_INITIALIZE_FUNCS) is then passed to
+ * spa_vdev_initialize().  The outcome is only reported (verbosity >= 4),
+ * never asserted.  ztest_vdev_lock is held for the whole function.
+ */
+/* ARGSUSED */
+void
+ztest_initialize(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	int error = 0;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* Random leaf vdev */
+	vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+	if (rand_vd == NULL) {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/*
+	 * The random vdev we've selected may change as soon as we
+	 * drop the spa_config_lock. We create local copies of things
+	 * we're interested in.
+	 */
+	uint64_t guid = rand_vd->vdev_guid;
+	char *path = strdup(rand_vd->vdev_path);
+	boolean_t active = rand_vd->vdev_initialize_thread != NULL;
+
+	/*
+	 * "%p" rather than the kernel-only "%px", which a libc formatter
+	 * renders as the pointer followed by a stray 'x'; the guid is cast
+	 * to match "%llu" as is done elsewhere in this file.
+	 */
+	zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
+
+	nvlist_t *vdev_guids = fnvlist_alloc();
+	nvlist_t *vdev_errlist = fnvlist_alloc();
+	fnvlist_add_uint64(vdev_guids, path, guid);
+	error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist);
+	fnvlist_free(vdev_guids);
+	fnvlist_free(vdev_errlist);
+
+	/* Purely informational: describe what was attempted. */
+	switch (cmd) {
+	case POOL_INITIALIZE_CANCEL:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Cancel initialize %s", path);
+			if (!active)
+				(void) printf(" failed (no initialize active)");
+			(void) printf("\n");
+		}
+		break;
+	case POOL_INITIALIZE_START:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Start initialize %s", path);
+			if (active && error == 0)
+				(void) printf(" failed (already active)");
+			else if (error != 0)
+				(void) printf(" failed (error %d)", error);
+			(void) printf("\n");
+		}
+		break;
+	case POOL_INITIALIZE_SUSPEND:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Suspend initialize %s", path);
+			if (!active)
+				(void) printf(" failed (no initialize active)");
+			(void) printf("\n");
+		}
+		break;
+	}
+	free(path);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Issue a random TRIM command against a random leaf vdev.
+ *
+ * A leaf is picked with ztest_random_concrete_vdev_leaf() while holding
+ * SCL_VDEV as reader; its guid, path and "trim thread exists" state are
+ * copied before the config lock is dropped.  A command number drawn from
+ * ztest_random(POOL_TRIM_FUNCS), a power-of-two rate and random
+ * partial/secure flags are then passed to spa_vdev_trim().  The outcome is
+ * only reported (verbosity >= 4), never asserted.  ztest_vdev_lock is held
+ * for the whole function.
+ */
+/* ARGSUSED */
+void
+ztest_trim(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	int error = 0;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* Random leaf vdev */
+	vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+	if (rand_vd == NULL) {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/*
+	 * The random vdev we've selected may change as soon as we
+	 * drop the spa_config_lock. We create local copies of things
+	 * we're interested in.
+	 */
+	uint64_t guid = rand_vd->vdev_guid;
+	char *path = strdup(rand_vd->vdev_path);
+	boolean_t active = rand_vd->vdev_trim_thread != NULL;
+
+	/* Cast so the argument matches "%llu", as elsewhere in this file. */
+	zfs_dbgmsg("vd %p, guid %llu", rand_vd, (u_longlong_t)guid);
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+
+	uint64_t cmd = ztest_random(POOL_TRIM_FUNCS);
+	/* Shift a 64-bit one so the result type matches rate. */
+	uint64_t rate = 1ULL << ztest_random(30);
+	boolean_t partial = (ztest_random(5) > 0);
+	boolean_t secure = (ztest_random(5) > 0);
+
+	nvlist_t *vdev_guids = fnvlist_alloc();
+	nvlist_t *vdev_errlist = fnvlist_alloc();
+	fnvlist_add_uint64(vdev_guids, path, guid);
+	error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial,
+	    secure, vdev_errlist);
+	fnvlist_free(vdev_guids);
+	fnvlist_free(vdev_errlist);
+
+	/* Purely informational: describe what was attempted. */
+	switch (cmd) {
+	case POOL_TRIM_CANCEL:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Cancel TRIM %s", path);
+			if (!active)
+				(void) printf(" failed (no TRIM active)");
+			(void) printf("\n");
+		}
+		break;
+	case POOL_TRIM_START:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Start TRIM %s", path);
+			if (active && error == 0)
+				(void) printf(" failed (already active)");
+			else if (error != 0)
+				(void) printf(" failed (error %d)", error);
+			(void) printf("\n");
+		}
+		break;
+	case POOL_TRIM_SUSPEND:
+		if (ztest_opts.zo_verbose >= 4) {
+			(void) printf("Suspend TRIM %s", path);
+			if (!active)
+				(void) printf(" failed (no TRIM active)");
+			(void) printf("\n");
+		}
+		break;
+	}
+	free(path);
+	mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify pool integrity by running zdb.
+ *
+ * Builds a zdb command line for the given pool (binary chosen by
+ * ztest_get_zdb_bin()), runs it through popen(), echoes its output when
+ * the verbosity is at least 3, and reports through fatal() if the command
+ * could not be started or did not finish with status 0.
+ */
+static void
+ztest_run_zdb(char *pool)
+{
+	int status;
+	char *bin;
+	char *zdb;
+	char *zbuf;
+	const char *shown;
+	const int len = MAXPATHLEN + MAXNAMELEN + 20;
+	FILE *fp;
+
+	bin = umem_alloc(len, UMEM_NOFAIL);
+	zdb = umem_alloc(len, UMEM_NOFAIL);
+	zbuf = umem_alloc(1024, UMEM_NOFAIL);
+
+	ztest_get_zdb_bin(bin, len);
+
+	/*
+	 * Bound the write: bin alone may occupy up to len bytes, so an
+	 * unbounded sprintf() could run past the end of zdb.
+	 */
+	(void) snprintf(zdb, len,
+	    "%s -bcc%s%s -G -d -Y -e -y -p %s %s",
+	    bin,
+	    ztest_opts.zo_verbose >= 3 ? "s" : "",
+	    ztest_opts.zo_verbose >= 4 ? "v" : "",
+	    ztest_opts.zo_dir,
+	    pool);
+
+	if (ztest_opts.zo_verbose >= 5) {
+		/*
+		 * Drop the leading directories when the command contains
+		 * "zdb "; otherwise show it in full instead of handing a
+		 * NULL pointer to printf().
+		 */
+		shown = strstr(zdb, "zdb ");
+		(void) printf("Executing %s\n", shown != NULL ? shown : zdb);
+	}
+
+	fp = popen(zdb, "r");
+	if (fp == NULL) {
+		/* fgets()/pclose() must not be given a NULL stream. */
+		ztest_dump_core = 0;
+		fatal(1, "popen('%s') failed", zdb);
+		goto out;
+	}
+
+	while (fgets(zbuf, 1024, fp) != NULL)
+		if (ztest_opts.zo_verbose >= 3)
+			(void) printf("%s", zbuf);
+
+	status = pclose(fp);
+
+	if (status == 0)
+		goto out;
+
+	ztest_dump_core = 0;
+	if (WIFEXITED(status))
+		fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
+	else
+		fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
+out:
+	umem_free(bin, len);
+	umem_free(zdb, len);
+	umem_free(zbuf, 1024);
+}
+
+/*
+ * Iterate over every pool returned by spa_next() while holding
+ * spa_namespace_lock.  At verbosity >= 6 the header line and each pool
+ * name are printed; below that the walk produces no output.
+ */
+static void
+ztest_walk_pool_directory(char *header)
+{
+	spa_t *spa;
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("%s\n", header);
+	}
+
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
+		if (ztest_opts.zo_verbose < 6)
+			continue;
+		(void) printf("\t%s\n", spa_name(spa));
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Export the pool oldname and import it again as newname, checking along
+ * the way that a second import is refused with EEXIST, that the old name
+ * no longer opens (ENOENT), and that the pool opened under the new name
+ * still has the guid it had before the export.
+ */
+static void
+ztest_spa_import_export(char *oldname, char *newname)
+{
+	nvlist_t *config, *newconfig;
+	uint64_t pool_guid;
+	spa_t *spa;
+	int rc;
+
+	if (ztest_opts.zo_verbose >= 4)
+		(void) printf("import/export: old = %s, new = %s\n",
+		    oldname, newname);
+
+	/* Remove whatever an earlier run left behind under newname. */
+	(void) spa_destroy(newname);
+
+	/* Open the pool long enough to record its guid. */
+	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+	/* Half of the time, start a scrub to tickle scrub/export races. */
+	if (ztest_random(2) == 0) {
+		(void) spa_scan(spa, POOL_SCAN_SCRUB);
+	}
+
+	pool_guid = spa_guid(spa);
+	spa_close(spa, FTAG);
+
+	ztest_walk_pool_directory("pools before export");
+
+	/* Export; the pool's configuration is handed back in config. */
+	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
+
+	ztest_walk_pool_directory("pools after export");
+
+	/* A trial import of that configuration should yield a config. */
+	newconfig = spa_tryimport(config);
+	ASSERT(newconfig != NULL);
+	nvlist_free(newconfig);
+
+	/* The real import, under the new name. */
+	rc = spa_import(newname, config, NULL, 0);
+	if (rc != 0) {
+		dump_nvlist(config, 0);
+		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
+		    oldname, newname, rc);
+	}
+
+	ztest_walk_pool_directory("pools after import");
+
+	/* Importing a second time, under either name, must hit EEXIST. */
+	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
+	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
+
+	/* The old name must no longer resolve to a pool. */
+	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
+
+	/* The new name must open, and refer to the very same pool. */
+	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+	ASSERT(pool_guid == spa_guid(spa));
+	spa_close(spa, FTAG);
+
+	nvlist_free(config);
+}
+
+/*
+ * Clear the pool's vdev state and resume its I/O: vdev_clear() is run for
+ * the whole pool (NULL vdev) between spa_vdev_state_enter() and
+ * spa_vdev_state_exit(), followed by zio_resume().  A note is printed
+ * first when the pool is suspended and the verbosity is at least 6.
+ */
+static void
+ztest_resume(spa_t *spa)
+{
+	if (spa_suspended(spa)) {
+		if (ztest_opts.zo_verbose >= 6)
+			(void) printf("resuming from suspended state\n");
+	}
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+	vdev_clear(spa, NULL);
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	(void) zio_resume(spa);
+}
+
+/*
+ * Thread body; arg is the spa_t to watch.  Until ztest_exiting is set it
+ * resumes the pool whenever it is found suspended, sleeps for 100 ms, and
+ * with a one-in-ten chance each re-rolls the zfs_compressed_arc_enabled
+ * and zfs_abd_scatter_enabled tunables.
+ */
+static void
+ztest_resume_thread(void *arg)
+{
+	spa_t *spa = arg;
+
+	for (;;) {
+		if (ztest_exiting)
+			break;
+
+		if (spa_suspended(spa)) {
+			ztest_resume(spa);
+		}
+		(void) poll(NULL, 0, 100);
+
+		/* Occasionally flip the compressed ARC setting. */
+		if (ztest_random(10) == 0) {
+			zfs_compressed_arc_enabled = ztest_random(2);
+		}
+
+		/* Occasionally flip the ABD scatter setting. */
+		if (ztest_random(10) == 0) {
+			zfs_abd_scatter_enabled = ztest_random(2);
+		}
+	}
+
+	thread_exit();
+}
+
+/*
+ * Watchdog thread body; arg is the shared ztest state.
+ *
+ * The first check happens once the planned thread run time
+ * (zs_thread_stop - zs_thread_start) plus zfs_deadman_synctime_ms has
+ * passed; subsequent checks are zfs_deadman_checktime_ms apart.  Each
+ * check aborts the run via fatal() if the pool is suspended or has no
+ * root vdev, calls vdev_deadman() on the root vdev, and aborts as well if
+ * the process is more than zfs_deadman_synctime_ms past zs_proc_stop.
+ */
+static void
+ztest_deadman_thread(void *arg)
+{
+	ztest_shared_t *zs = arg;
+	spa_t *spa = ztest_spa;
+	hrtime_t last_run = gethrtime();
+	hrtime_t delay;
+	hrtime_t overdue;
+
+	delay = (zs->zs_thread_stop - zs->zs_thread_start) +
+	    MSEC2NSEC(zfs_deadman_synctime_ms);
+
+	for (;;) {
+		if (ztest_exiting)
+			break;
+
+		/*
+		 * Not due yet: nap for a second so that a request to
+		 * exit is noticed reasonably quickly.
+		 */
+		if (gethrtime() < last_run + delay) {
+			(void) poll(NULL, 0, 1000);
+			continue;
+		}
+
+		/*
+		 * A suspended pool is an immediate failure.  Otherwise
+		 * let vdev_deadman() judge whether I/O is still making
+		 * progress; it will abort the test if it is not.
+		 */
+		if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "pool has transitioned to a suspended state.",
+			    zfs_deadman_synctime_ms / 1000);
+		}
+		vdev_deadman(spa->spa_root_vdev, FTAG);
+
+		/*
+		 * The process gets a grace period of
+		 * zfs_deadman_synctime_ms beyond its expected finish
+		 * time; past that it is presumed hung and terminated.
+		 */
+		overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
+		if (gethrtime() > overdue) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "the process is overdue for termination.",
+			    (gethrtime() - zs->zs_proc_start) / NANOSEC);
+		}
+
+		(void) printf("ztest has been running for %lld seconds\n",
+		    (gethrtime() - zs->zs_proc_start) / NANOSEC);
+
+		/* From now on, check at the regular deadman interval. */
+		last_run = gethrtime();
+		delay = MSEC2NSEC(zfs_deadman_checktime_ms);
+	}
+
+	thread_exit();
+}
+
+/*
+ * Run test function number `test` (described by zi) zi->zi_iters times on
+ * the dataset selected by id modulo the number of datasets, then add one
+ * call and the elapsed time to that test's shared call state.  The
+ * elapsed time is also printed when the verbosity is at least 4.
+ */
+static void
+ztest_execute(int test, ztest_info_t *zi, uint64_t id)
+{
+	ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
+	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
+	hrtime_t started = gethrtime();
+	hrtime_t elapsed;
+	int iter = 0;
+
+	while (iter < zi->zi_iters) {
+		zi->zi_func(zd, id);
+		iter++;
+	}
+
+	elapsed = gethrtime() - started;
+
+	/* Shared with other threads, hence the atomic updates. */
+	atomic_add_64(&zc->zc_count, 1);
+	atomic_add_64(&zc->zc_time, elapsed);
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("%6.2f sec in %s\n",
+		    (double)elapsed / NANOSEC, zi->zi_funcname);
+	}
+}
+
+/*
+ * Worker thread body; arg carries the thread's id.
+ *
+ * Until zs_thread_stop is reached (or more than ten ENOSPC events have
+ * been counted) the thread keeps picking a random test.  A test is run
+ * only when its next-call time has arrived and this thread wins the
+ * compare-and-swap that pushes that time forward by a random amount of up
+ * to twice the test's interval.  Once zs_thread_kill has passed,
+ * ztest_kill() is invoked.
+ */
+static void
+ztest_thread(void *arg)
+{
+	ztest_shared_t *zs = ztest_shared;
+	uint64_t id = (uintptr_t)arg;
+
+	for (;;) {
+		ztest_shared_callstate_t *zc;
+		ztest_info_t *zi;
+		uint64_t call_next;
+		hrtime_t now;
+		int rand;
+
+		now = gethrtime();
+		if (now >= zs->zs_thread_stop)
+			break;
+
+		/* Time to force a crash? */
+		if (now > zs->zs_thread_kill) {
+			ztest_kill(zs);
+		}
+
+		/* Give up once ENOSPC shows up with some regularity. */
+		if (zs->zs_enospc_count > 10)
+			break;
+
+		/* Choose a test at random. */
+		rand = ztest_random(ZTEST_FUNCS);
+		zi = &ztest_info[rand];
+		zc = ZTEST_GET_SHARED_CALLSTATE(rand);
+		call_next = zc->zc_next;
+
+		/* Not yet due. */
+		if (now < call_next)
+			continue;
+
+		/* Another thread claimed this slot first. */
+		if (atomic_cas_64(&zc->zc_next, call_next, call_next +
+		    ztest_random(2 * zi->zi_interval[0] + 1)) != call_next)
+			continue;
+
+		ztest_execute(rand, zi, id);
+	}
+
+	thread_exit();
+}
+
+/*
+ * Format the name of dataset number d in the given pool, "<pool>/ds_<d>",
+ * into dsname.  At most ZFS_MAX_DATASET_NAME_LEN bytes are written, so
+ * the caller's buffer has to be at least that large.
+ */
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+	const size_t cap = ZFS_MAX_DATASET_NAME_LEN;
+
+	(void) snprintf(dsname, cap, "%s/ds_%d", pool, d);
+}
+
+/*
+ * Destroy dataset number d of the test pool: first the per-thread clones
+ * and snapshots via ztest_dsl_dataset_cleanup(), then everything that
+ * dmu_objset_find() reports beneath the dataset (snapshots and children
+ * included) via ztest_objset_destroy_cb.
+ */
+static void
+ztest_dataset_destroy(int d)
+{
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	int t = d;
+
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+	if (ztest_opts.zo_verbose >= 3) {
+		(void) printf("Destroying %s to free up space\n", name);
+	}
+
+	/*
+	 * Thread t generally works on dataset (t % zopt_datasets), so
+	 * several threads may have left non-standard clones and
+	 * snapshots on this one; clean up after each of them.
+	 */
+	while (t < ztest_opts.zo_threads) {
+		ztest_dsl_dataset_cleanup(name, t);
+		t += ztest_opts.zo_datasets;
+	}
+
+	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+/*
+ * Check zd's dataset for leaked objects by comparing the number of
+ * entries in ZTEST_DIROBJ with the number of objects in use.
+ */
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+	uint64_t dirobjs;
+	uint64_t usedobjs;
+	uint64_t scratch;	/* sink for the values we do not need */
+
+	/*
+	 * Every object in the dataset is listed in ZTEST_DIROBJ, the
+	 * dataset-wide object directory.  The objects in use must
+	 * therefore number one more than the directory's entries, the
+	 * extra one being ZTEST_DIROBJ itself; any other count means an
+	 * object has leaked.
+	 *
+	 * The comparison is only valid from ztest_dataset_open(), where
+	 * open context and syncing context agree: zap_count() reports
+	 * the open-context value whereas dmu_objset_space() reports the
+	 * rootbp fill count.
+	 */
+	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+
+	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+
+	ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+/*
+ * Create (if necessary), own and prepare dataset number d for testing.
+ *
+ * The dataset is created and owned with ztest_name_lock held as reader,
+ * ztest_ds[d] is initialised for it, its intent log is replayed with an
+ * object-leak check before and after, and the ZIL is opened.  It is fatal
+ * for the claimed or replayed log sequence number to be non-zero yet
+ * smaller than the sequence number recorded in the shared dataset state.
+ *
+ * Returns 0 on success, or ENOSPC (after recording it) when the dataset
+ * could not be created for lack of space.
+ */
+static int
+ztest_dataset_open(int d)
+{
+	ztest_ds_t *zd = &ztest_ds[d];
+	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	zilog_t *zilog;
+	objset_t *os;
+	int error;
+
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	error = ztest_dataset_create(name);
+	if (error == ENOSPC) {
+		(void) pthread_rwlock_unlock(&ztest_name_lock);
+		ztest_record_enospc(FTAG);
+		return (error);
+	}
+	/* The dataset may well exist already. */
+	ASSERT(error == 0 || error == EEXIST);
+
+	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
+	    B_TRUE, zd, &os));
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
+
+	zilog = zd->zd_zilog;
+
+	/* Records that were committed must not be missing from the claim. */
+	if (zilog->zl_header->zh_claim_lr_seq != 0) {
+		if (zilog->zl_header->zh_claim_lr_seq < committed_seq)
+			fatal(0, "missing log records: claimed %llu < "
+			    "committed %llu",
+			    zilog->zl_header->zh_claim_lr_seq, committed_seq);
+	}
+
+	/* Replay the log, checking for leaked objects on either side. */
+	ztest_dataset_dirobj_verify(zd);
+	zil_replay(os, zd, ztest_replay_vector);
+	ztest_dataset_dirobj_verify(zd);
+
+	if (ztest_opts.zo_verbose >= 6) {
+		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+		    zd->zd_name,
+		    (u_longlong_t)zilog->zl_parse_blk_count,
+		    (u_longlong_t)zilog->zl_parse_lr_count,
+		    (u_longlong_t)zilog->zl_replaying_seq);
+	}
+
+	zilog = zil_open(os, ztest_get_data);
+
+	/* Likewise, the replay must have reached the committed record. */
+	if (zilog->zl_replaying_seq != 0) {
+		if (zilog->zl_replaying_seq < committed_seq)
+			fatal(0, "missing log records: replayed %llu < "
+			    "committed %llu",
+			    zilog->zl_replaying_seq, committed_seq);
+	}
+
+	return (0);
+}
+/* Undo ztest_dataset_open(): close the ZIL, disown the objset, finish zd. */
+static void
+ztest_dataset_close(int d)
+{
+	ztest_ds_t *zd = &ztest_ds[d];
+
+	zil_close(zd->zd_zilog);
+	dmu_objset_disown(zd->zd_os, B_TRUE, zd);
+
+	ztest_zd_fini(zd);
+}
+/* dmu_objset_find() callback: own the named objset and replay its ZIL. */
+/* ARGSUSED */
+static int
+ztest_replay_zil_cb(const char *name, void *arg)
+{
+	objset_t *os;
+	ztest_ds_t *zdtmp;
+
+	VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE,
+	    B_TRUE, FTAG, &os));
+
+	zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
+
+	ztest_zd_init(zdtmp, NULL, os);	/* no shared dataset state */
+	zil_replay(os, zdtmp, ztest_replay_vector);
+	ztest_zd_fini(zdtmp);
+
+	if (dmu_objset_zil(os)->zl_parse_lr_count != 0 &&
+	    ztest_opts.zo_verbose >= 6) {
+		zilog_t *zilog = dmu_objset_zil(os);
+
+		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+		    name,
+		    (u_longlong_t)zilog->zl_parse_blk_count,
+		    (u_longlong_t)zilog->zl_parse_lr_count,
+		    (u_longlong_t)zilog->zl_replaying_seq);
+	}
+
+	umem_free(zdtmp, sizeof (ztest_ds_t));
+
+	dmu_objset_disown(os, B_TRUE, FTAG);
+	return (0);	/* failures above trip VERIFY0 instead */
+}
+/* Exercise spa_freeze(): record changes only in the ZIL, then replay them. */
+static void
+ztest_freeze(void)
+{
+	ztest_ds_t *zd = &ztest_ds[0];
+	spa_t *spa;
+	int numloops = 0;	/* iterations of the logging loop below */
+
+	if (ztest_opts.zo_verbose >= 3)
+		(void) printf("testing spa_freeze()...\n");
+
+	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	ztest_spa = spa;
+
+	/*
+	 * Force the first log block to be transactionally allocated.
+	 * We have to do this before we freeze the pool -- otherwise
+	 * the log chain won't be anchored.
+	 */
+	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+		ztest_dmu_object_alloc_free(zd, 0);
+		zil_commit(zd->zd_zilog, 0);
+	}
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Freeze the pool.  This stops spa_sync() from doing anything,
+	 * so that the only way to record changes from now on is the ZIL.
+	 */
+	spa_freeze(spa);
+
+	/*
+	 * Because it is hard to predict how much space a write will actually
+	 * require beforehand, we leave ourselves some fudge space to write over
+	 * capacity.
+	 */
+	uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
+
+	/*
+	 * Run tests that generate log records but don't alter the pool config
+	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+	 * We do a txg_wait_synced() after each iteration to force the txg
+	 * to increase well beyond the last synced value in the uberblock.
+	 * The ZIL should be OK with that.
+	 *
+	 * Run a random number of times less than zo_maxloops and ensure we do
+	 * not run out of space on the pool.
+	 */
+	while (ztest_random(10) != 0 &&
+	    numloops++ < ztest_opts.zo_maxloops &&
+	    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+		ztest_od_t od;
+		ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+		VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+		ztest_io(zd, od.od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+		txg_wait_synced(spa_get_dsl(spa), 0);
+	}
+
+	/*
+	 * Commit all of the changes we just generated.
+	 */
+	zil_commit(zd->zd_zilog, 0);
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	/*
+	 * Close our dataset and close the pool.
+	 */
+	ztest_dataset_close(0);
+	spa_close(spa, FTAG);
+	kernel_fini();
+
+	/*
+	 * Open and close the pool and dataset to induce log replay.
+	 */
+	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	ztest_spa = spa;
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	ztest_dataset_close(0);
+	ztest_reguid(NULL, 0);
+
+	spa_close(spa, FTAG);
+	kernel_fini();
+}
+/* Find the pool's config under zo_dir and import it; zs is unused here. */
+static void
+ztest_import_impl(ztest_shared_t *zs)
+{
+	importargs_t args = { 0 };
+	nvlist_t *cfg = NULL;
+	char *searchdirs[1];	/* single search path: zo_dir */
+	int flags = ZFS_IMPORT_MISSING_LOG;
+
+	searchdirs[0] = ztest_opts.zo_dir;
+	args.paths = 1;
+	args.path = searchdirs;
+	args.can_be_active = B_FALSE;
+
+	VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args,
+	    &libzpool_config_ops));
+	VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags));
+	/* The config list was allocated for us above; release it once used. */
+	nvlist_free(cfg);
+}
+
+/*
+ * Import an existing pool, cache its metaslab size and run the freeze test.
+ */
+static void
+ztest_import(ztest_shared_t *zs)
+{
+	spa_t *spa;
+
+	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+
+	ztest_import_impl(zs);
+
+	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	zs->zs_metaslab_sz =
+	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+	spa_close(spa, FTAG);
+
+	kernel_fini();
+
+	if (!ztest_opts.zo_mmp_test) {	/* skipped when zo_mmp_test is set */
+		ztest_run_zdb(ztest_opts.zo_pool);
+		ztest_freeze();
+		ztest_run_zdb(ztest_opts.zo_pool);
+	}
+
+	(void) pthread_rwlock_destroy(&ztest_name_lock);
+	mutex_destroy(&ztest_vdev_lock);
+	mutex_destroy(&ztest_checkpoint_lock);
+}
+
+/*
+ * Run one test pass: open the pool, run all test threads, verify, close.
+ */
+static void
+ztest_run(ztest_shared_t *zs)
+{
+	spa_t *spa;
+	objset_t *os;
+	kthread_t *resume_thread, *deadman_thread;
+	kthread_t **run_threads;
+	uint64_t object;
+	int error;
+	int t, d;
+
+	ztest_exiting = B_FALSE;
+
+	/*
+	 * Initialize parent/child shared state.
+	 */
+	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+	zs->zs_thread_start = gethrtime();
+	zs->zs_thread_stop =
+	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
+	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+	zs->zs_thread_kill = zs->zs_thread_stop;
+	if (ztest_random(100) < ztest_opts.zo_killrate) {
+		zs->zs_thread_kill -=
+		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
+	}
+
+	mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
+	    offsetof(ztest_cb_data_t, zcd_node));
+
+	/*
+	 * Open our pool.  It may need to be imported first depending on
+	 * what tests were running when the previous pass was terminated.
+	 */
+	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+	error = spa_open(ztest_opts.zo_pool, &spa, FTAG);
+	if (error) {
+		VERIFY3S(error, ==, ENOENT);
+		ztest_import_impl(zs);
+		VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+		zs->zs_metaslab_sz =
+		    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+	}
+
+	metaslab_preload_limit = ztest_random(20) + 1;
+	ztest_spa = spa;
+
+	VERIFY0(vdev_raidz_impl_set("cycle"));
+
+	dmu_objset_stats_t dds;
+	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
+	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+	zs->zs_guid = dds.dds_guid;
+	dmu_objset_disown(os, B_TRUE, FTAG);
+
+	/*
+	 * Create a thread to periodically resume suspended I/O.
+	 */
+	resume_thread = thread_create(NULL, 0, ztest_resume_thread,
+	    spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+	/*
+	 * Create a deadman thread and set to panic if we hang.
+	 */
+	deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
+	    zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+	spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+
+	/*
+	 * Verify that we can safely inquire about any object,
+	 * whether it's allocated or not.  To make it interesting,
+	 * we probe offsets -5 through +5 around each power of two.
+	 * This hits all edge cases, including zero and the max.
+	 */
+	for (t = 0; t < 64; t++) {
+		for (d = -5; d <= 5; d++) {
+			error = dmu_object_info(spa->spa_meta_objset,
+			    (1ULL << t) + d, NULL);
+			ASSERT(error == 0 || error == ENOENT ||
+			    error == EINVAL);
+		}
+	}
+
+	/*
+	 * If we got any ENOSPC errors on the previous run, destroy something.
+	 */
+	if (zs->zs_enospc_count != 0) {
+		int d = ztest_random(ztest_opts.zo_datasets);
+		ztest_dataset_destroy(d);
+	}
+	zs->zs_enospc_count = 0;
+
+	/*
+	 * If we were in the middle of ztest_device_removal() and were killed
+	 * we need to ensure the removal and scrub complete before running
+	 * any tests that check ztest_device_removal_active. The removal will
+	 * be restarted automatically when the spa is opened, but we need to
+	 * initiate the scrub manually if it is not already in progress. Note
+	 * that we always run the scrub whenever an indirect vdev exists
+	 * because we have no way of knowing for sure if ztest_device_removal()
+	 * fully completed its scrub before the pool was reimported.
+	 */
+	if (spa->spa_removing_phys.sr_state == DSS_SCANNING ||
+	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+		while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+			txg_wait_synced(spa_get_dsl(spa), 0);
+
+		error = ztest_scrub_impl(spa);
+		if (error == EBUSY)
+			error = 0;
+		ASSERT0(error);
+	}
+
+	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
+	    UMEM_NOFAIL);
+
+	if (ztest_opts.zo_verbose >= 4)
+		(void) printf("starting main threads...\n");
+
+	/*
+	 * Replay all logs of all datasets in the pool. This is primarily for
+	 * temporary datasets which wouldn't otherwise get replayed, which
+	 * can trigger failures when attempting to offline a SLOG in
+	 * ztest_fault_inject().
+	 */
+	(void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb,
+	    NULL, DS_FIND_CHILDREN);
+
+	/*
+	 * Kick off all the tests that run in parallel.
+	 */
+	for (t = 0; t < ztest_opts.zo_threads; t++) {
+		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
+			umem_free(run_threads, ztest_opts.zo_threads *
+			    sizeof (kthread_t *));
+			return;	/* NOTE(review): skips all teardown below */
+		}
+
+		run_threads[t] = thread_create(NULL, 0, ztest_thread,
+		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
+		    defclsyspri);
+	}
+
+	/*
+	 * Wait for all of the tests to complete.
+	 */
+	for (t = 0; t < ztest_opts.zo_threads; t++)
+		VERIFY0(thread_join(run_threads[t]));
+
+	/*
+	 * Close all datasets. This must be done after all the threads
+	 * are joined so we can be sure none of the datasets are in-use
+	 * by any of the threads.
+	 */
+	for (t = 0; t < ztest_opts.zo_threads; t++) {
+		if (t < ztest_opts.zo_datasets)
+			ztest_dataset_close(t);
+	}
+
+	txg_wait_synced(spa_get_dsl(spa), 0);
+
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
+
+	/* Kill the resume and deadman threads */
+	ztest_exiting = B_TRUE;
+	VERIFY0(thread_join(resume_thread));
+	VERIFY0(thread_join(deadman_thread));
+	ztest_resume(spa);
+
+	/*
+	 * Right before closing the pool, kick off a bunch of async I/O;
+	 * spa_close() should wait for it to complete.
+	 */
+	for (object = 1; object < 50; object++) {
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+
+	/* Verify that at least one commit cb was called in a timely fashion */
+	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
+		VERIFY0(zc_min_txg_delay);
+
+	spa_close(spa, FTAG);
+
+	/*
+	 * Verify that we can loop over all pools.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+		if (ztest_opts.zo_verbose > 3)
+			(void) printf("spa_next: found %s\n", spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+
+	/*
+	 * Verify that we can export the pool and reimport it under a
+	 * different name.
+	 */
+	if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) {
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		(void) snprintf(name, sizeof (name), "%s_import",
+		    ztest_opts.zo_pool);
+		ztest_spa_import_export(ztest_opts.zo_pool, name);
+		ztest_spa_import_export(name, ztest_opts.zo_pool);
+	}
+
+	kernel_fini();
+
+	list_destroy(&zcl.zcl_callbacks);
+	mutex_destroy(&zcl.zcl_callbacks_lock);
+	(void) pthread_rwlock_destroy(&ztest_name_lock);
+	mutex_destroy(&ztest_vdev_lock);
+	mutex_destroy(&ztest_checkpoint_lock);
+}
+/* Format t (nanoseconds) into timebuf, e.g. "1d02h03m04s" or "5s". */
+static void
+print_time(hrtime_t t, char *timebuf)
+{
+	hrtime_t total_secs = t / NANOSEC;
+	hrtime_t days = total_secs / (24 * 60 * 60);
+	hrtime_t hours = (total_secs / (60 * 60)) % 24;
+	hrtime_t mins = (total_secs / 60) % 60;
+	hrtime_t secs = total_secs % 60;
+
+	timebuf[0] = '\0';
+
+	/* Start at the most significant non-zero unit and print downwards. */
+	if (days != 0) {
+		(void) sprintf(timebuf,
+		    "%llud%02lluh%02llum%02llus", days, hours, mins, secs);
+	} else if (hours != 0) {
+		(void) sprintf(timebuf, "%lluh%02llum%02llus",
+		    hours, mins, secs);
+	} else if (mins != 0) {
+		(void) sprintf(timebuf, "%llum%02llus", mins, secs);
+	} else {
+		(void) sprintf(timebuf, "%llus", secs);
+	}
+}
+/* Allocate a props nvlist; ztest_random(2) decides if autoreplace=1 is set. */
+static nvlist_t *
+make_random_props(void)
+{
+	nvlist_t *nvl;
+
+	VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0));
+
+	/* A draw of 0 hands the list back empty; otherwise add autoreplace. */
+	if (ztest_random(2) != 0) {
+		VERIFY0(nvlist_add_uint64(nvl,
+		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1));
+	}
+
+	return (nvl);
+}
+
+/*
+ * Create a storage pool with the given name and initial vdev size.
+ * Then test spa_freeze() functionality, unless zo_mmp_test is set.
+ */
+static void
+ztest_init(ztest_shared_t *zs)
+{
+	spa_t *spa;
+	nvlist_t *nvroot, *props;
+	int i;
+
+	mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+	kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+
+	/*
+	 * Create the storage pool.
+	 */
+	(void) spa_destroy(ztest_opts.zo_pool);
+	ztest_shared->zs_vdev_next_leaf = 0;
+	zs->zs_splits = 0;
+	zs->zs_mirrors = ztest_opts.zo_mirrors;
+	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+	    NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+	props = make_random_props();
+
+	/*
+	 * We don't expect the pool to suspend unless maxfaults == 0,
+	 * in which case ztest_fault_inject() temporarily takes away
+	 * the only valid replica.
+	 */
+	VERIFY0(nvlist_add_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
+	    MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT));
+
+	for (i = 0; i < SPA_FEATURES; i++) {
+		char *buf;	/* "feature@<name>" property key */
+
+		/*
+		 * 75% chance of using the log space map feature. We want ztest
+		 * to exercise both the code paths that use the log space map
+		 * feature and the ones that don't.
+		 */
+		if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
+			continue;
+
+		VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
+		    spa_feature_table[i].fi_uname));
+		VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
+		free(buf);
+	}
+
+	VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL));
+	nvlist_free(nvroot);
+	nvlist_free(props);
+
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	zs->zs_metaslab_sz =
+	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+	spa_close(spa, FTAG);
+
+	kernel_fini();
+
+	if (!ztest_opts.zo_mmp_test) {
+		ztest_run_zdb(ztest_opts.zo_pool);
+		ztest_freeze();
+		ztest_run_zdb(ztest_opts.zo_pool);
+	}
+
+	(void) pthread_rwlock_destroy(&ztest_name_lock);
+	mutex_destroy(&ztest_vdev_lock);
+	mutex_destroy(&ztest_checkpoint_lock);
+}
+/* Create the already-unlinked temp file behind ztest_fd_data. */
+static void
+setup_data_fd(void)
+{
+	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
+
+	ztest_fd_data = mkstemp(ztest_name_data);
+	VERIFY3S(ztest_fd_data, >=, 0);	/* VERIFY: checked in all builds */
+	(void) unlink(ztest_name_data);
+}
+/* Total bytes described by hdr: three fixed parts plus two sized arrays. */
+static int
+shared_data_size(ztest_shared_hdr_t *hdr)
+{
+	/*
+	 * Header, options and shared state are single objects; the stats
+	 * and dataset regions are arrays of (size * count) bytes each.
+	 */
+	return (hdr->zh_hdr_size +
+	    hdr->zh_opts_size +
+	    hdr->zh_size +
+	    hdr->zh_stats_size * hdr->zh_stats_count +
+	    hdr->zh_ds_size * hdr->zh_ds_count);
+}
+/* Size the backing file and fill in the header that describes its layout. */
+static void
+setup_hdr(void)
+{
+	int size;
+	ztest_shared_hdr_t *hdr;
+
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY(hdr != MAP_FAILED);	/* VERIFY: checked in all builds */
+
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
+
+	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
+	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
+	hdr->zh_size = sizeof (ztest_shared_t);
+	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
+	hdr->zh_stats_count = ZTEST_FUNCS;
+	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
+	hdr->zh_ds_count = ztest_opts.zo_datasets;
+
+	size = shared_data_size(hdr);	/* grow the file to the full layout */
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+}
+/* Map the whole shared file and point the ztest_shared_* globals into it. */
+static void
+setup_data(void)
+{
+	int size, offset;
+	ztest_shared_hdr_t *hdr;
+	uint8_t *buf;
+
+	/* Map just the header first, read-only, to learn the total size. */
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY(hdr != MAP_FAILED);	/* VERIFY: checked in all builds */
+	size = shared_data_size(hdr);
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	VERIFY(hdr != MAP_FAILED);
+	buf = (uint8_t *)hdr;
+
+	offset = hdr->zh_hdr_size;
+	ztest_shared_opts = (void *)&buf[offset];
+	offset += hdr->zh_opts_size;
+	ztest_shared = (void *)&buf[offset];
+	offset += hdr->zh_size;
+	ztest_shared_callstate = (void *)&buf[offset];
+	offset += hdr->zh_stats_size * hdr->zh_stats_count;
+	ztest_shared_ds = (void *)&buf[offset];
+}
+/* Run cmd (default: ourselves) as a child; B_TRUE iff it died of SIGKILL. */
+static boolean_t
+exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
+{
+	pid_t pid;
+	int status;
+	char *cmdbuf = NULL;
+
+	pid = fork();
+
+	if (cmd == NULL) {	/* default to the running executable */
+		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
+		cmd = cmdbuf;
+	}
+
+	if (pid == -1)
+		fatal(1, "fork failed");
+
+	if (pid == 0) {	/* child */
+		char *emptyargv[2] = { cmd, NULL };
+		char fd_data_str[12];
+
+		struct rlimit rl = { 1024, 1024 };
+		(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+		(void) close(ztest_fd_rand);
+		VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+		VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
+		(void) enable_extended_FILE_stdio(-1, -1);
+		if (libpath != NULL)
+			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
+		(void) execv(cmd, emptyargv);
+		ztest_dump_core = B_FALSE;	/* only reached if execv failed */
+		fatal(B_TRUE, "exec failed: %s", cmd);
+	}
+
+	if (cmdbuf != NULL) {
+		umem_free(cmdbuf, MAXPATHLEN);
+		cmd = NULL;
+	}
+
+	while (waitpid(pid, &status, 0) != pid)
+		continue;
+	if (statusp != NULL)
+		*statusp = status;
+
+	if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) != 0) {
+			(void) fprintf(stderr, "child exited with code %d\n",
+			    WEXITSTATUS(status));
+			exit(2);
+		}
+		return (B_FALSE);	/* child exited with status 0 */
+	} else if (WIFSIGNALED(status)) {
+		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
+			(void) fprintf(stderr, "child died with signal %d\n",
+			    WTERMSIG(status));
+			exit(3);
+		}
+		return (B_TRUE);	/* SIGKILL tolerated via ignorekill */
+	} else {
+		(void) fprintf(stderr, "something strange happened to child\n");
+		exit(4);
+		/* NOTREACHED */
+	}
+}
+/* Init pass: import the pool when zo_init == 0, else create it zo_init times. */
+static void
+ztest_run_init(void)
+{
+	ztest_shared_t *shared = ztest_shared;
+	int pass;
+
+	/* Start clean: remove any existing copy of zpool.cache. */
+	(void) remove(spa_config_path);
+
+	if (ztest_opts.zo_init != 0) {
+		/*
+		 * Create and initialize our storage pool, starting every
+		 * pass from zeroed shared state.
+		 */
+		for (pass = 1; pass <= ztest_opts.zo_init; pass++) {
+			bzero(shared, sizeof (ztest_shared_t));
+			if (ztest_opts.zo_init != 1 &&
+			    ztest_opts.zo_verbose >= 3) {
+				(void) printf("ztest_init(), pass %d\n",
+				    pass);
+			}
+			ztest_init(shared);
+		}
+		return;
+	}
+
+	/* zo_init == 0: import the pool instead of creating one. */
+	if (ztest_opts.zo_verbose >= 1)
+		(void) printf("Importing pool %s\n",
+		    ztest_opts.zo_pool);
+	ztest_import(shared);
+}
+/* ztest entry: the parent schedules child passes, a child runs one pass. */
+int
+main(int argc, char **argv)
+{
+	int kills = 0;
+	int iters = 0;
+	int older = 0;
+	int newer = 0;
+	ztest_shared_t *zs;
+	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
+	char timebuf[100];
+	char numbuf[NN_NUMBUF_SZ];
+	char *cmd;
+	boolean_t hasalt;
+	int f;
+	char *fd_data_str = getenv("ZTEST_FD_DATA");	/* set only in children */
+	struct sigaction action;
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	dprintf_setup(&argc, argv);
+	zfs_deadman_synctime_ms = 300000;
+	zfs_deadman_checktime_ms = 30000;
+	/*
+	 * As two-word space map entries may not come up often (especially
+	 * if pool and vdev sizes are small) we want to force at least some
+	 * of them so the feature get tested.
+	 */
+	zfs_force_some_double_word_sm_entries = B_TRUE;
+
+	/*
+	 * Verify that even extensively damaged split blocks with many
+	 * segments can be reconstructed in a reasonable amount of time
+	 * when reconstruction is known to be possible.
+	 *
+	 * Note: the lower this value is, the more damage we inflict, and
+	 * the more time ztest spends in recovering that damage. We chose
+	 * to induce damage 1/100th of the time so recovery is tested but
+	 * not so frequently that ztest doesn't get to test other code paths.
+	 */
+	zfs_reconstruct_indirect_damage_fraction = 100;
+
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Force random_get_bytes() to use /dev/urandom in order to prevent
+	 * ztest from needlessly depleting the system entropy pool.
+	 */
+	random_path = "/dev/urandom";
+	ztest_fd_rand = open(random_path, O_RDONLY);
+	VERIFY3S(ztest_fd_rand, >=, 0);	/* VERIFY: checked in all builds */
+
+	if (!fd_data_str) {
+		process_options(argc, argv);
+
+		setup_data_fd();
+		setup_hdr();
+		setup_data();
+		bcopy(&ztest_opts, ztest_shared_opts,
+		    sizeof (*ztest_shared_opts));
+	} else {
+		ztest_fd_data = atoi(fd_data_str);
+		setup_data();
+		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
+	}
+	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
+
+	/* Override location of zpool.cache */
+	VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+	    ztest_opts.zo_dir) != -1);
+
+	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
+	    UMEM_NOFAIL);
+	zs = ztest_shared;
+
+	if (fd_data_str) {	/* child: run a single pass, then exit */
+		metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
+		metaslab_df_alloc_threshold =
+		    zs->zs_metaslab_df_alloc_threshold;
+
+		if (zs->zs_do_init)
+			ztest_run_init();
+		else
+			ztest_run(zs);
+		exit(0);
+	}
+
+	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
+
+	if (ztest_opts.zo_verbose >= 1) {
+		(void) printf("%llu vdevs, %d datasets, %d threads,"
+		    " %llu seconds...\n",
+		    (u_longlong_t)ztest_opts.zo_vdevs,
+		    ztest_opts.zo_datasets,
+		    ztest_opts.zo_threads,
+		    (u_longlong_t)ztest_opts.zo_time);
+	}
+
+	/* MAXPATHLEN matches the buffer exec_child() uses for the same path. */
+	cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+	(void) strlcpy(cmd, getexecname(), MAXPATHLEN);
+	zs->zs_do_init = B_TRUE;
+	if (hasalt) {
+		if (ztest_opts.zo_verbose >= 1) {
+			(void) printf("Executing older ztest for "
+			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
+		}
+		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
+		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
+	} else {
+		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
+	}
+	zs->zs_do_init = B_FALSE;
+
+	zs->zs_proc_start = gethrtime();
+	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
+
+	for (f = 0; f < ZTEST_FUNCS; f++) {
+		zi = &ztest_info[f];
+		zc = ZTEST_GET_SHARED_CALLSTATE(f);
+		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+			zc->zc_next = UINT64_MAX;
+		else
+			zc->zc_next = zs->zs_proc_start +
+			    ztest_random(2 * zi->zi_interval[0] + 1);
+	}
+
+	/*
+	 * Run the tests in a loop.  These tests include fault injection
+	 * to verify that self-healing data works, and forced crashes
+	 * to verify that we never lose on-disk consistency.
+	 */
+	while (gethrtime() < zs->zs_proc_stop) {
+		int status;
+		boolean_t killed;
+
+		/*
+		 * Initialize the workload counters for each function.
+		 */
+		for (f = 0; f < ZTEST_FUNCS; f++) {
+			zc = ZTEST_GET_SHARED_CALLSTATE(f);
+			zc->zc_count = 0;
+			zc->zc_time = 0;
+		}
+
+		/* Set the allocation switch size */
+		zs->zs_metaslab_df_alloc_threshold =
+		    ztest_random(zs->zs_metaslab_sz / 4) + 1;
+
+		if (!hasalt || ztest_random(2) == 0) {
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing newer ztest: %s\n",
+				    cmd);
+			}
+			newer++;
+			killed = exec_child(cmd, NULL, B_TRUE, &status);
+		} else {
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing older ztest: %s\n",
+				    ztest_opts.zo_alt_ztest);
+			}
+			older++;
+			killed = exec_child(ztest_opts.zo_alt_ztest,
+			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
+		}
+
+		if (killed)
+			kills++;
+		iters++;
+
+		if (ztest_opts.zo_verbose >= 1) {
+			hrtime_t now = gethrtime();
+
+			now = MIN(now, zs->zs_proc_stop);
+			print_time(zs->zs_proc_stop - now, timebuf);
+			nicenum(zs->zs_space, numbuf, sizeof (numbuf));
+
+			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
+			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
+			    iters,
+			    WIFEXITED(status) ? "Complete" : "SIGKILL",
+			    (u_longlong_t)zs->zs_enospc_count,
+			    100.0 * zs->zs_alloc / MAX(1, zs->zs_space),
+			    numbuf,
+			    100.0 * (now - zs->zs_proc_start) /
+			    (ztest_opts.zo_time * NANOSEC), timebuf);
+		}
+
+		if (ztest_opts.zo_verbose >= 2) {
+			(void) printf("\nWorkload summary:\n\n");
+			(void) printf("%7s %9s   %s\n",
+			    "Calls", "Time", "Function");
+			(void) printf("%7s %9s   %s\n",
+			    "-----", "----", "--------");
+			for (f = 0; f < ZTEST_FUNCS; f++) {
+				zi = &ztest_info[f];
+				zc = ZTEST_GET_SHARED_CALLSTATE(f);
+				print_time(zc->zc_time, timebuf);
+				(void) printf("%7llu %9s   %s\n",
+				    (u_longlong_t)zc->zc_count, timebuf,
+				    zi->zi_funcname);
+			}
+			(void) printf("\n");
+		}
+
+		if (!ztest_opts.zo_mmp_test)
+			ztest_run_zdb(ztest_opts.zo_pool);
+	}
+
+	if (ztest_opts.zo_verbose >= 1) {
+		if (hasalt) {
+			(void) printf("%d runs of older ztest: %s\n", older,
+			    ztest_opts.zo_alt_ztest);
+			(void) printf("%d runs of newer ztest: %s\n", newer,
+			    cmd);
+		}
+		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
+		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
+	}
+
+	umem_free(cmd, MAXPATHLEN);
+
+	return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zvol_id/.gitignore b/sys/contrib/openzfs/cmd/zvol_id/.gitignore
new file mode 100644
index 000000000000..8b757a2d6781
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/.gitignore
@@ -0,0 +1 @@
+zvol_id
diff --git a/sys/contrib/openzfs/cmd/zvol_id/Makefile.am b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am
new file mode 100644
index 000000000000..a584875081eb
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am
@@ -0,0 +1,10 @@
+include $(top_srcdir)/config/Rules.am
+
+# Disable GCC stack protection for zvol_id.  This is a kludge and should be
+# removed once https://github.com/zfsonlinux/zfs/issues/569 is resolved.
+AM_CFLAGS += -fno-stack-protector
+
+udev_PROGRAMS = zvol_id
+
+zvol_id_SOURCES = \
+	zvol_id_main.c
diff --git a/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c
new file mode 100644
index 000000000000..4a2d74cc203c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c
@@ -0,0 +1,110 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Fajar A. Nugraha.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <linux/ioctl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+static int
+ioctl_get_msg(char *var, int fd)
+{
+	int error = 0;
+	char msg[ZFS_MAX_DATASET_NAME_LEN];
+
+	error = ioctl(fd, BLKZNAME, msg);
+	if (error < 0) {
+		return (error);
+	}
+
+	snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg);
+	return (error);
+}
+/* Print the zvol name (plus "-partN") for a device node; 0 on success. */
+int
+main(int argc, char **argv)
+{
+	int fd, error = 0;
+	char zvol_name[ZFS_MAX_DATASET_NAME_LEN];
+	char *zvol_name_part = NULL;
+	char *dev_name, *p;
+	struct stat64 statbuf;
+	int dev_minor, dev_part;
+	int rc;
+
+	if (argc < 2) {
+		printf("Usage: %s /dev/zvol_device_node\n", argv[0]);
+		return (EINVAL);
+	}
+
+	dev_name = argv[1];
+	if (stat64(dev_name, &statbuf) != 0) {
+		error = errno;	/* save it: printf() may change errno */
+		printf("Unable to access device file: %s\n", dev_name);
+		return (error);
+	}
+
+	dev_minor = minor(statbuf.st_rdev);
+	dev_part = dev_minor % ZVOL_MINORS;
+
+	fd = open(dev_name, O_RDONLY);
+	if (fd < 0) {
+		error = errno;
+		printf("Unable to open device file: %s\n", dev_name);
+		return (error);
+	}
+
+	if (ioctl_get_msg(zvol_name, fd) < 0) {
+		error = errno;
+		printf("ioctl_get_msg failed:%s\n", strerror(error));
+		goto out;	/* close fd on this path as well */
+	}
+	if (dev_part > 0)
+		rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name,
+		    dev_part);
+	else
+		rc = asprintf(&zvol_name_part, "%s", zvol_name);
+	if (rc == -1 || zvol_name_part == NULL) {
+		error = ENOMEM;	/* allocation failure must not exit 0 */
+		goto out;
+	}
+
+	for (p = zvol_name_part; *p != '\0'; p++) {
+		/* ctype functions need an unsigned char value. */
+		if (isblank((unsigned char)*p))
+			*p = '+';
+	}
+	printf("%s\n", zvol_name_part);
+	free(zvol_name_part);
+out:
+	(void) close(fd);
+	return (error);
+}
diff --git a/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am
new file mode 100644
index 000000000000..564031c9799d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zvol_wait
diff --git a/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait b/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait
new file mode 100755
index 000000000000..9a3948da5564
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+# Print the number of lines in $zvols, treating an empty value as 0.
+count_zvols() {
+	case "$zvols" in
+	"") echo 0 ;;
+	*) echo "$zvols" | wc -l ;;
+	esac
+}
+
+# Copy stdin to stdout, dropping names whose /dev/zvol symlink exists.
+filter_out_zvols_with_links() {
+	while read -r zvol; do
+		[ -L "/dev/zvol/$zvol" ] && continue
+		echo "$zvol"
+	done
+}
+
+# Copy stdin to stdout, keeping only names that "zfs list" succeeds on.
+filter_out_deleted_zvols() {
+	while read -r zvol; do
+		zfs list "$zvol" >/dev/null 2>&1 || continue
+		echo "$zvol"
+	done
+}
+
+# Print the name of every zvol that is expected to get a /dev/zvol link.
+list_zvols() {
+	# -H output is tab-separated; splitting on tabs only keeps dataset
+	# names that contain spaces in one piece.
+	tab=$(printf '\t')
+	zfs list -t volume -H -o \
+		name,volmode,receive_resume_token,redact_snaps |
+		while IFS="$tab" read -r name volmode token redacted; do
+		#
+		# /dev links are not created for zvols with volmode = "none"
+		# or for redacted zvols.
+		#
+		[ "$volmode" = "none" ] && continue
+		[ "$redacted" = "-" ] || continue
+		#
+		# We also ignore partially received zvols if it is
+		# not an incremental receive, as those won't even have a block
+		# device minor node created yet.
+		#
+		if [ "$token" != "-" ]; then
+			#
+			# Incremental receives create an invisible clone that
+			# is not automatically displayed by zfs list.
+			#
+			if ! zfs list "$name/%recv" >/dev/null 2>&1; then
+				continue
+			fi
+		fi
+		echo "$name"
+	done
+}
+
+# Collect the zvols to wait for; leave right away if there are none.
+zvols=$(list_zvols)
+zvols_count=$(count_zvols)
+if [ "$zvols_count" -eq 0 ]; then
+	echo "No zvols found, nothing to do."
+	exit 0
+fi
+
+echo "Testing $zvols_count zvol links"
+
+# Up to 20 rounds of 30 one-second polls each.
+round=1
+while [ "$round" -le 20 ]; do
+	round=$((round + 1))
+	old_zvols_count=$(count_zvols)
+
+	poll=1
+	while [ "$poll" -le 30 ]; do
+		poll=$((poll + 1))
+		# Keep only the zvols whose links are still missing.
+		zvols="$(echo "$zvols" | filter_out_zvols_with_links)"
+		zvols_count=$(count_zvols)
+		if [ "$zvols_count" -eq 0 ]; then
+			echo "All zvol links are now present."
+			exit 0
+		fi
+		sleep 1
+	done
+
+	echo "Still waiting on $zvols_count zvol links ..."
+
+	# A round that made progress needs no further checks.
+	[ "$old_zvols_count" -eq "$zvols_count" ] || continue
+
+	#
+	# Although zvols should normally not be deleted at boot time,
+	# if that is the case then their links will be missing and
+	# we would stall.
+	#
+	echo "No progress since last loop."
+	echo "Checking if any zvols were deleted."
+	zvols=$(echo "$zvols" | filter_out_deleted_zvols)
+	zvols_count=$(count_zvols)
+
+	if [ "$old_zvols_count" -ne "$zvols_count" ]; then
+		echo "$((old_zvols_count - zvols_count)) zvol(s) deleted."
+	fi
+
+	if [ "$zvols_count" -eq 0 ]; then
+		echo "All zvol links are now present."
+		exit 0
+	fi
+	echo "Remaining zvols:"
+	echo "$zvols"
+done
+
+echo "Timed out waiting on zvol links"
+exit 1