diff options
Diffstat (limited to 'sys/contrib/openzfs/cmd')
165 files changed, 59838 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am new file mode 100644 index 000000000000..88d32b1c538c --- /dev/null +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -0,0 +1,10 @@ +SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest +SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path + +if USING_PYTHON +SUBDIRS += arcstat arc_summary dbufstat +endif + +if BUILD_LINUX +SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait +endif diff --git a/sys/contrib/openzfs/cmd/arc_summary/.gitignore b/sys/contrib/openzfs/cmd/arc_summary/.gitignore new file mode 100644 index 000000000000..50ba15f034e2 --- /dev/null +++ b/sys/contrib/openzfs/cmd/arc_summary/.gitignore @@ -0,0 +1 @@ +arc_summary diff --git a/sys/contrib/openzfs/cmd/arc_summary/Makefile.am b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am new file mode 100644 index 000000000000..1a26c2c199f8 --- /dev/null +++ b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am @@ -0,0 +1,13 @@ +bin_SCRIPTS = arc_summary + +CLEANFILES = arc_summary +EXTRA_DIST = arc_summary2 arc_summary3 + +if USING_PYTHON_2 +SCRIPT = arc_summary2 +else +SCRIPT = arc_summary3 +endif + +arc_summary: $(SCRIPT) + cp $< $@ diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 new file mode 100755 index 000000000000..5dc40d759dce --- /dev/null +++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 @@ -0,0 +1,1093 @@ +#!/usr/bin/env python2 +# +# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $ +# +# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>, +# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>, +# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# If you are having troubles when using this script from cron(8) please try +# adjusting your PATH before reporting problems. +# +# Note some of this code uses older code (eg getopt instead of argparse, +# subprocess.Popen() instead of subprocess.run()) because we need to support +# some very old versions of Python. +# + +"""Print statistics on the ZFS Adjustable Replacement Cache (ARC) + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the +in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. 
+""" + +import getopt +import os +import sys +import time +import errno + +from subprocess import Popen, PIPE +from decimal import Decimal as D + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + base = 'kstat.zfs.misc.%s.' % namespace + return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)] + + def load_tunables(): + return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs')) + +elif sys.platform.startswith('linux'): + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + kstat = 'kstat.zfs.misc.%s.%%s' % namespace + path = '/proc/spl/kstat/zfs/%s' % namespace + with open(path) as f: + entries = [line.strip().split() for line in f][2:] # Skip header + return [(kstat % name, D(value)) for name, _, value in entries] + + def load_tunables(): + basepath = '/sys/module/zfs/parameters' + tunables = {} + for name in os.listdir(basepath): + if not name: + continue + path = '%s/%s' % (basepath, name) + with open(path) as f: + value = f.read() + tunables[name] = value.strip() + return tunables + + +show_tunable_descriptions = False +alternate_tunable_layout = False + + +def handle_Exception(ex_cls, ex, tb): + if ex is IOError: + if ex.errno == errno.EPIPE: + sys.exit() + + if ex is KeyboardInterrupt: + sys.exit() + + +sys.excepthook = handle_Exception + + +def get_Kstat(): + """Collect information on the ZFS subsystem from the /proc virtual + file system. The name "kstat" is a holdover from the Solaris utility + of the same name. + """ + + Kstat = {} + Kstat.update(load_kstats('arcstats')) + Kstat.update(load_kstats('zfetchstats')) + Kstat.update(load_kstats('vdev_cache_stats')) + return Kstat + + +def fBytes(b=0): + """Return human-readable representation of a byte value in + powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal + points. 
Values smaller than one KiB are returned without + decimal points. + """ + + prefixes = [ + [2**80, "YiB"], # yobibytes (yotta) + [2**70, "ZiB"], # zebibytes (zetta) + [2**60, "EiB"], # exbibytes (exa) + [2**50, "PiB"], # pebibytes (peta) + [2**40, "TiB"], # tebibytes (tera) + [2**30, "GiB"], # gibibytes (giga) + [2**20, "MiB"], # mebibytes (mega) + [2**10, "KiB"]] # kibibytes (kilo) + + if b >= 2**10: + + for limit, unit in prefixes: + + if b >= limit: + value = b / limit + break + + result = "%0.2f\t%s" % (value, unit) + + else: + + result = "%d\tBytes" % b + + return result + + +def fHits(hits=0): + """Create a human-readable representation of the number of hits. + The single-letter symbols used are SI to avoid the confusion caused + by the different "short scale" and "long scale" representations in + English, which use the same words for different values. See + https://en.wikipedia.org/wiki/Names_of_large_numbers and + https://physics.nist.gov/cuu/Units/prefixes.html + """ + + numbers = [ + [10**24, 'Y'], # yotta (septillion) + [10**21, 'Z'], # zetta (sextillion) + [10**18, 'E'], # exa (quintrillion) + [10**15, 'P'], # peta (quadrillion) + [10**12, 'T'], # tera (trillion) + [10**9, 'G'], # giga (billion) + [10**6, 'M'], # mega (million) + [10**3, 'k']] # kilo (thousand) + + if hits >= 1000: + + for limit, symbol in numbers: + + if hits >= limit: + value = hits/limit + break + + result = "%0.2f%s" % (value, symbol) + + else: + + result = "%d" % hits + + return result + + +def fPerc(lVal=0, rVal=0, Decimal=2): + """Calculate percentage value and return in human-readable format""" + + if rVal > 0: + return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%" + else: + return str("%0." 
+ str(Decimal) + "f") % 100 + "%" + + +def get_arc_summary(Kstat): + """Collect general data on the ARC""" + + output = {} + memory_throttle_count = Kstat[ + "kstat.zfs.misc.arcstats.memory_throttle_count" + ] + + if memory_throttle_count > 0: + output['health'] = 'THROTTLED' + else: + output['health'] = 'HEALTHY' + + output['memory_throttle_count'] = fHits(memory_throttle_count) + + # ARC Misc. + deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] + mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] + evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"] + + # ARC Misc. + output["arc_misc"] = {} + output["arc_misc"]["deleted"] = fHits(deleted) + output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) + output["arc_misc"]['evict_skips'] = fHits(evict_skip) + + # ARC Sizing + arc_size = Kstat["kstat.zfs.misc.arcstats.size"] + mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"] + mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"] + meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"] + meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"] + dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"] + dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"] + target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"] + target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"] + target_size = Kstat["kstat.zfs.misc.arcstats.c"] + + target_size_ratio = (target_max_size / target_min_size) + + # ARC Sizing + output['arc_sizing'] = {} + output['arc_sizing']['arc_size'] = { + 'per': fPerc(arc_size, target_max_size), + 'num': fBytes(arc_size), + } + output['arc_sizing']['target_max_size'] = { + 'ratio': target_size_ratio, + 'num': fBytes(target_max_size), + } + output['arc_sizing']['target_min_size'] = { + 'per': fPerc(target_min_size, target_max_size), + 'num': fBytes(target_min_size), + } + output['arc_sizing']['target_size'] = { + 'per': fPerc(target_size, target_max_size), + 'num': fBytes(target_size), + } + output['arc_sizing']['meta_limit'] = { + 
'per': fPerc(meta_limit, target_max_size), + 'num': fBytes(meta_limit), + } + output['arc_sizing']['meta_size'] = { + 'per': fPerc(meta_size, meta_limit), + 'num': fBytes(meta_size), + } + output['arc_sizing']['dnode_limit'] = { + 'per': fPerc(dnode_limit, meta_limit), + 'num': fBytes(dnode_limit), + } + output['arc_sizing']['dnode_size'] = { + 'per': fPerc(dnode_size, dnode_limit), + 'num': fBytes(dnode_size), + } + + # ARC Hash Breakdown + output['arc_hash_break'] = {} + output['arc_hash_break']['hash_chain_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chain_max" + ] + output['arc_hash_break']['hash_chains'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chains" + ] + output['arc_hash_break']['hash_collisions'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_collisions" + ] + output['arc_hash_break']['hash_elements'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements" + ] + output['arc_hash_break']['hash_elements_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements_max" + ] + + output['arc_size_break'] = {} + output['arc_size_break']['recently_used_cache_size'] = { + 'per': fPerc(mru_size, mru_size + mfu_size), + 'num': fBytes(mru_size), + } + output['arc_size_break']['frequently_used_cache_size'] = { + 'per': fPerc(mfu_size, mru_size + mfu_size), + 'num': fBytes(mfu_size), + } + + # ARC Hash Breakdown + hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"] + hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"] + hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"] + hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"] + hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"] + + output['arc_hash_break'] = {} + output['arc_hash_break']['elements_max'] = fHits(hash_elements_max) + output['arc_hash_break']['elements_current'] = { + 'per': fPerc(hash_elements, hash_elements_max), + 'num': fHits(hash_elements), + } + output['arc_hash_break']['collisions'] = fHits(hash_collisions) + output['arc_hash_break']['chain_max'] = 
fHits(hash_chain_max) + output['arc_hash_break']['chains'] = fHits(hash_chains) + + return output + + +def _arc_summary(Kstat): + """Print information on the ARC""" + + # ARC Sizing + arc = get_arc_summary(Kstat) + + sys.stdout.write("ARC Summary: (%s)\n" % arc['health']) + + sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" % + arc['memory_throttle_count']) + sys.stdout.write("\n") + + # ARC Misc. + sys.stdout.write("ARC Misc:\n") + sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) + sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % + arc['arc_misc']['mutex_miss']) + sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % + arc['arc_misc']['evict_skips']) + sys.stdout.write("\n") + + # ARC Sizing + sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['arc_size']['per'], + arc['arc_sizing']['arc_size']['num'] + ) + ) + sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_size']['per'], + arc['arc_sizing']['target_size']['num'], + ) + ) + + sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_min_size']['per'], + arc['arc_sizing']['target_min_size']['num'], + ) + ) + + sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % ( + arc['arc_sizing']['target_max_size']['ratio'], + arc['arc_sizing']['target_max_size']['num'], + ) + ) + + sys.stdout.write("\nARC Size Breakdown:\n") + sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['recently_used_cache_size']['per'], + arc['arc_size_break']['recently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['frequently_used_cache_size']['per'], + arc['arc_size_break']['frequently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['meta_limit']['per'], + arc['arc_sizing']['meta_limit']['num'], + ) + ) + sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" 
% ( + arc['arc_sizing']['meta_size']['per'], + arc['arc_sizing']['meta_size']['num'], + ) + ) + sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_limit']['per'], + arc['arc_sizing']['dnode_limit']['num'], + ) + ) + sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_size']['per'], + arc['arc_sizing']['dnode_size']['num'], + ) + ) + + sys.stdout.write("\n") + + # ARC Hash Breakdown + sys.stdout.write("ARC Hash Breakdown:\n") + sys.stdout.write("\tElements Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['elements_max']) + sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % ( + arc['arc_hash_break']['elements_current']['per'], + arc['arc_hash_break']['elements_current']['num'], + ) + ) + sys.stdout.write("\tCollisions:\t\t\t\t%s\n" % + arc['arc_hash_break']['collisions']) + sys.stdout.write("\tChain Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['chain_max']) + sys.stdout.write("\tChains:\t\t\t\t\t%s\n" % + arc['arc_hash_break']['chains']) + + +def get_arc_efficiency(Kstat): + """Collect information on the efficiency of the ARC""" + + output = {} + + arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"] + arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"] + demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"] + demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"] + demand_metadata_hits = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_hits" + ] + demand_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_misses" + ] + mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"] + mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"] + mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"] + mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"] + prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"] + prefetch_data_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_data_misses" + ] + prefetch_metadata_hits = Kstat[ + 
"kstat.zfs.misc.arcstats.prefetch_metadata_hits" + ] + prefetch_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_metadata_misses" + ] + + anon_hits = arc_hits - ( + mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits + ) + arc_accesses_total = (arc_hits + arc_misses) + demand_data_total = (demand_data_hits + demand_data_misses) + prefetch_data_total = (prefetch_data_hits + prefetch_data_misses) + real_hits = (mfu_hits + mru_hits) + + output["total_accesses"] = fHits(arc_accesses_total) + output["cache_hit_ratio"] = { + 'per': fPerc(arc_hits, arc_accesses_total), + 'num': fHits(arc_hits), + } + output["cache_miss_ratio"] = { + 'per': fPerc(arc_misses, arc_accesses_total), + 'num': fHits(arc_misses), + } + output["actual_hit_ratio"] = { + 'per': fPerc(real_hits, arc_accesses_total), + 'num': fHits(real_hits), + } + output["data_demand_efficiency"] = { + 'per': fPerc(demand_data_hits, demand_data_total), + 'num': fHits(demand_data_total), + } + + if prefetch_data_total > 0: + output["data_prefetch_efficiency"] = { + 'per': fPerc(prefetch_data_hits, prefetch_data_total), + 'num': fHits(prefetch_data_total), + } + + if anon_hits > 0: + output["cache_hits_by_cache_list"] = {} + output["cache_hits_by_cache_list"]["anonymously_used"] = { + 'per': fPerc(anon_hits, arc_hits), + 'num': fHits(anon_hits), + } + + output["most_recently_used"] = { + 'per': fPerc(mru_hits, arc_hits), + 'num': fHits(mru_hits), + } + output["most_frequently_used"] = { + 'per': fPerc(mfu_hits, arc_hits), + 'num': fHits(mfu_hits), + } + output["most_recently_used_ghost"] = { + 'per': fPerc(mru_ghost_hits, arc_hits), + 'num': fHits(mru_ghost_hits), + } + output["most_frequently_used_ghost"] = { + 'per': fPerc(mfu_ghost_hits, arc_hits), + 'num': fHits(mfu_ghost_hits), + } + + output["cache_hits_by_data_type"] = {} + output["cache_hits_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_hits, arc_hits), + 'num': fHits(demand_data_hits), + } + 
output["cache_hits_by_data_type"]["prefetch_data"] = { + 'per': fPerc(prefetch_data_hits, arc_hits), + 'num': fHits(prefetch_data_hits), + } + output["cache_hits_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_hits, arc_hits), + 'num': fHits(demand_metadata_hits), + } + output["cache_hits_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_hits, arc_hits), + 'num': fHits(prefetch_metadata_hits), + } + + output["cache_misses_by_data_type"] = {} + output["cache_misses_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_misses, arc_misses), + 'num': fHits(demand_data_misses), + } + output["cache_misses_by_data_type"]["prefetch_data"] = { + 'per': fPerc(prefetch_data_misses, arc_misses), + 'num': fHits(prefetch_data_misses), + } + output["cache_misses_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_misses, arc_misses), + 'num': fHits(demand_metadata_misses), + } + output["cache_misses_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_misses, arc_misses), + 'num': fHits(prefetch_metadata_misses), + } + + return output + + +def _arc_efficiency(Kstat): + """Print information on the efficiency of the ARC""" + + arc = get_arc_efficiency(Kstat) + + sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" % + arc['total_accesses']) + sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % ( + arc['cache_hit_ratio']['per'], + arc['cache_hit_ratio']['num'], + ) + ) + sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % ( + arc['cache_miss_ratio']['per'], + arc['cache_miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % ( + arc['actual_hit_ratio']['per'], + arc['actual_hit_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % ( + arc['data_demand_efficiency']['per'], + arc['data_demand_efficiency']['num'], + ) + ) + + if 'data_prefetch_efficiency' in arc: + sys.stdout.write("\tData Prefetch 
Efficiency:\t%s\t%s\n" % ( + arc['data_prefetch_efficiency']['per'], + arc['data_prefetch_efficiency']['num'], + ) + ) + sys.stdout.write("\n") + + sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n") + if 'cache_hits_by_cache_list' in arc: + sys.stdout.write("\t Anonymously Used:\t\t%s\t%s\n" % ( + arc['cache_hits_by_cache_list']['anonymously_used']['per'], + arc['cache_hits_by_cache_list']['anonymously_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used:\t\t%s\t%s\n" % ( + arc['most_recently_used']['per'], + arc['most_recently_used']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used:\t\t%s\t%s\n" % ( + arc['most_frequently_used']['per'], + arc['most_frequently_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used Ghost:\t%s\t%s\n" % ( + arc['most_recently_used_ghost']['per'], + arc['most_recently_used_ghost']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used Ghost:\t%s\t%s\n" % ( + arc['most_frequently_used_ghost']['per'], + arc['most_frequently_used_ghost']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_data']['per'], + arc["cache_hits_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_data']['per'], + arc["cache_hits_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_metadata']['per'], + arc["cache_hits_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_metadata']['per'], + arc["cache_hits_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_data']['per'], + 
arc["cache_misses_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_data']['per'], + arc["cache_misses_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_metadata']['per'], + arc["cache_misses_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_metadata']['per'], + arc["cache_misses_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + +def get_l2arc_summary(Kstat): + """Collection information on the L2ARC""" + + output = {} + + l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"] + l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"] + l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"] + l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"] + l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"] + l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"] + l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"] + l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"] + l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"] + l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"] + l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"] + l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"] + l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"] + l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"] + l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"] + l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"] + + l2_access_total = (l2_hits + l2_misses) + output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error) + + output['l2_access_total'] = l2_access_total + output['l2_size'] = l2_size + output['l2_asize'] = l2_asize + + if l2_size > 0 and 
l2_access_total > 0: + + if output['l2_health_count'] > 0: + output["health"] = "DEGRADED" + else: + output["health"] = "HEALTHY" + + output["low_memory_aborts"] = fHits(l2_abort_lowmem) + output["free_on_write"] = fHits(l2_free_on_write) + output["rw_clashes"] = fHits(l2_rw_clash) + output["bad_checksums"] = fHits(l2_cksum_bad) + output["io_errors"] = fHits(l2_io_error) + + output["l2_arc_size"] = {} + output["l2_arc_size"]["adative"] = fBytes(l2_size) + output["l2_arc_size"]["actual"] = { + 'per': fPerc(l2_asize, l2_size), + 'num': fBytes(l2_asize) + } + output["l2_arc_size"]["head_size"] = { + 'per': fPerc(l2_hdr_size, l2_size), + 'num': fBytes(l2_hdr_size), + } + + output["l2_arc_evicts"] = {} + output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry) + output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading) + + output['l2_arc_breakdown'] = {} + output['l2_arc_breakdown']['value'] = fHits(l2_access_total) + output['l2_arc_breakdown']['hit_ratio'] = { + 'per': fPerc(l2_hits, l2_access_total), + 'num': fHits(l2_hits), + } + output['l2_arc_breakdown']['miss_ratio'] = { + 'per': fPerc(l2_misses, l2_access_total), + 'num': fHits(l2_misses), + } + output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds) + + output['l2_arc_buffer'] = {} + + output['l2_arc_writes'] = {} + output['l2_writes_done'] = l2_writes_done + output['l2_writes_sent'] = l2_writes_sent + if l2_writes_done != l2_writes_sent: + output['l2_arc_writes']['writes_sent'] = { + 'value': "FAULTED", + 'num': fHits(l2_writes_sent), + } + output['l2_arc_writes']['done_ratio'] = { + 'per': fPerc(l2_writes_done, l2_writes_sent), + 'num': fHits(l2_writes_done), + } + output['l2_arc_writes']['error_ratio'] = { + 'per': fPerc(l2_writes_error, l2_writes_sent), + 'num': fHits(l2_writes_error), + } + else: + output['l2_arc_writes']['writes_sent'] = { + 'per': fPerc(100), + 'num': fHits(l2_writes_sent), + } + + return output + + +def _l2arc_summary(Kstat): + """Print information on the L2ARC""" + + arc = 
get_l2arc_summary(Kstat) + + if arc['l2_size'] > 0 and arc['l2_access_total'] > 0: + sys.stdout.write("L2 ARC Summary: ") + if arc['l2_health_count'] > 0: + sys.stdout.write("(DEGRADED)\n") + else: + sys.stdout.write("(HEALTHY)\n") + sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" % + arc['low_memory_aborts']) + sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write']) + sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes']) + sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums']) + sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" % + arc["l2_arc_size"]["adative"]) + sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["actual"]["per"], + arc["l2_arc_size"]["actual"]["num"], + ) + ) + sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["head_size"]["per"], + arc["l2_arc_size"]["head_size"]["num"], + ) + ) + sys.stdout.write("\n") + + if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ + arc["l2_arc_evicts"]["reading"] != '0': + sys.stdout.write("L2 ARC Evicts:\n") + sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]['lock_retries']) + sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]["reading"]) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['hit_ratio']['per'], + arc['l2_arc_breakdown']['hit_ratio']['num'], + ) + ) + + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['miss_ratio']['per'], + arc['l2_arc_breakdown']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['feeds']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Writes:\n") + if arc['l2_writes_done'] != arc['l2_writes_sent']: + 
sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['value'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + sys.stdout.write("\t Done Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['done_ratio']['per'], + arc['l2_arc_writes']['done_ratio']['num'], + ) + ) + sys.stdout.write("\t Error Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['error_ratio']['per'], + arc['l2_arc_writes']['error_ratio']['num'], + ) + ) + else: + sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['per'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + + +def get_dmu_summary(Kstat): + """Collect information on the DMU""" + + output = {} + + zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"] + zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"] + + zfetch_access_total = (zfetch_hits + zfetch_misses) + output['zfetch_access_total'] = zfetch_access_total + + if zfetch_access_total > 0: + output['dmu'] = {} + output['dmu']['efficiency'] = {} + output['dmu']['efficiency']['value'] = fHits(zfetch_access_total) + output['dmu']['efficiency']['hit_ratio'] = { + 'per': fPerc(zfetch_hits, zfetch_access_total), + 'num': fHits(zfetch_hits), + } + output['dmu']['efficiency']['miss_ratio'] = { + 'per': fPerc(zfetch_misses, zfetch_access_total), + 'num': fHits(zfetch_misses), + } + + return output + + +def _dmu_summary(Kstat): + """Print information on the DMU""" + + arc = get_dmu_summary(Kstat) + + if arc['zfetch_access_total'] > 0: + sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" % + arc['dmu']['efficiency']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['hit_ratio']['per'], + arc['dmu']['efficiency']['hit_ratio']['num'], + ) + ) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['miss_ratio']['per'], + arc['dmu']['efficiency']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + + +def get_vdev_summary(Kstat): + 
"""Collect information on the VDEVs""" + + output = {} + + vdev_cache_delegations = \ + Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"] + vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"] + vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"] + vdev_cache_total = (vdev_cache_misses + vdev_cache_hits + + vdev_cache_delegations) + + output['vdev_cache_total'] = vdev_cache_total + + if vdev_cache_total > 0: + output['summary'] = fHits(vdev_cache_total) + output['hit_ratio'] = { + 'per': fPerc(vdev_cache_hits, vdev_cache_total), + 'num': fHits(vdev_cache_hits), + } + output['miss_ratio'] = { + 'per': fPerc(vdev_cache_misses, vdev_cache_total), + 'num': fHits(vdev_cache_misses), + } + output['delegations'] = { + 'per': fPerc(vdev_cache_delegations, vdev_cache_total), + 'num': fHits(vdev_cache_delegations), + } + + return output + + +def _vdev_summary(Kstat): + """Print information on the VDEVs""" + + arc = get_vdev_summary(Kstat) + + if arc['vdev_cache_total'] > 0: + sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['hit_ratio']['per'], + arc['hit_ratio']['num'], + )) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['miss_ratio']['per'], + arc['miss_ratio']['num'], + )) + sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % ( + arc['delegations']['per'], + arc['delegations']['num'], + )) + + +def _tunable_summary(Kstat): + """Print information on tunables, including descriptions if requested""" + + global show_tunable_descriptions + global alternate_tunable_layout + + tunables = load_tunables() + descriptions = {} + + if show_tunable_descriptions: + + command = ["/sbin/modinfo", "zfs", "-0"] + + try: + p = Popen(command, stdin=PIPE, stdout=PIPE, + stderr=PIPE, shell=False, close_fds=True) + p.wait() + + # By default, Python 2 returns a string as the first element of the + # tuple from p.communicate(), while Python 3 returns bytes which + # must be 
decoded first. The better way to do this would be with + # subprocess.run() or at least .check_output(), but this fails on + # CentOS 6 because of its old version of Python 2 + desc = bytes.decode(p.communicate()[0]) + description_list = desc.strip().split('\0') + + if p.returncode == 0: + for tunable in description_list: + if tunable[0:5] == 'parm:': + tunable = tunable[5:].strip() + name, description = tunable.split(':', 1) + if not description: + description = "Description unavailable" + descriptions[name] = description + else: + sys.stderr.write("%s: '%s' exited with code %i\n" % + (sys.argv[0], command[0], p.returncode)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + except OSError as e: + sys.stderr.write("%s: Cannot run '%s': %s\n" % + (sys.argv[0], command[0], e.strerror)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + + sys.stdout.write("ZFS Tunables:\n") + + if alternate_tunable_layout: + fmt = "\t%s=%s\n" + else: + fmt = "\t%-50s%s\n" + + for name in sorted(tunables.keys()): + if show_tunable_descriptions and name in descriptions: + sys.stdout.write("\t# %s\n" % descriptions[name]) + + sys.stdout.write(fmt % (name, tunables[name])) + + +unSub = [ + _arc_summary, + _arc_efficiency, + _l2arc_summary, + _dmu_summary, + _vdev_summary, + _tunable_summary +] + + +def zfs_header(): + """Print title string with date""" + + daydate = time.strftime('%a %b %d %H:%M:%S %Y') + + sys.stdout.write('\n'+'-'*72+'\n') + sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate) + sys.stdout.write('\n') + + +def usage(): + """Print usage information""" + + sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n") + sys.stdout.write("\t -h, --help : " + "Print this help message and exit\n") + sys.stdout.write("\t -a, --alternate : " + "Show an alternate sysctl layout\n") + sys.stdout.write("\t -d, --description : " + "Show the sysctl descriptions\n") + sys.stdout.write("\t -p PAGE, --page=PAGE : " + "Select a single output 
def main():
    """Parse the command line and print the requested report pages.

    Options: -a/--alternate and -d/--description set the module-level
    tunable layout flags, -p/--page selects a single page (1-based index
    into unSub), -h/--help prints usage and exits with status 0. Invalid
    options or an out-of-range/non-numeric page exit with status 1.
    """

    global show_tunable_descriptions
    global alternate_tunable_layout

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "adp:h", ["alternate", "description", "page=", "help"]
        )
    except getopt.error as e:
        sys.stderr.write("Error: %s\n" % e.msg)
        usage()
        sys.exit(1)

    args = {}
    for opt, arg in opts:
        if opt in ('-a', '--alternate'):
            args['a'] = True
        if opt in ('-d', '--description'):
            args['d'] = True
        if opt in ('-p', '--page'):
            args['p'] = arg
        if opt in ('-h', '--help'):
            usage()
            sys.exit(0)

    Kstat = get_Kstat()

    alternate_tunable_layout = 'a' in args
    show_tunable_descriptions = 'd' in args

    if 'p' in args:
        # Validate explicitly: relying on IndexError alone lets 0 and
        # negative numbers wrap around to pages counted from the end, and
        # a non-numeric argument would escape as an uncaught ValueError.
        try:
            page_number = int(args['p'])
        except ValueError:
            page_number = 0

        if not 1 <= page_number <= len(unSub):
            sys.stderr.write('the argument to -p must be between 1 and ' +
                             str(len(unSub)) + '\n')
            sys.exit(1)

        pages = [unSub[page_number - 1]]
    else:
        pages = unSub

    zfs_header()
    for page in pages:
        page(Kstat)
        sys.stdout.write("\n")
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +"""Print statistics on the ZFS ARC Cache and other information + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See +the in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. 
+The original introduction to arc_summary can be found at +http://cuddletech.com/?p=454 +""" + +import argparse +import os +import subprocess +import sys +import time + +DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux' +INDENT = ' '*8 +LINE_LENGTH = 72 +DATE_FORMAT = '%a %b %d %H:%M:%S %Y' +TITLE = 'ZFS Subsystem Report' + +SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() +SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' + +# Tunables and SPL are handled separately because they come from +# different sources +SECTION_PATHS = {'arc': 'arcstats', + 'dmu': 'dmu_tx', + 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats + 'vdev': 'vdev_cache_stats', + 'xuio': 'xuio_stats', + 'zfetch': 'zfetchstats', + 'zil': 'zil'} + +parser = argparse.ArgumentParser(description=DESCRIPTION) +parser.add_argument('-a', '--alternate', action='store_true', default=False, + help='use alternate formatting for tunables and SPL', + dest='alt') +parser.add_argument('-d', '--description', action='store_true', default=False, + help='print descriptions with tunables and SPL', + dest='desc') +parser.add_argument('-g', '--graph', action='store_true', default=False, + help='print graph on ARC use and exit', dest='graph') +parser.add_argument('-p', '--page', type=int, dest='page', + help='print page by number (DEPRECATED, use "-s")') +parser.add_argument('-r', '--raw', action='store_true', default=False, + help='dump all available data with minimal formatting', + dest='raw') +parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) +ARGS = parser.parse_args() + + +if sys.platform.startswith('freebsd'): + # Requires py36-sysctl on FreeBSD + import sysctl + + VDEV_CACHE_SIZE = 'vdev.cache_size' + + def load_kstats(section): + base = 'kstat.zfs.misc.{section}.'.format(section=section) + # base is removed from the name + fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):], + value=kstat.value) + return [fmt(kstat) 
def get_params(basepath):
    """Read every parameter file found directly under basepath.

    Used for both the SPL parameters and the ZFS tunables, depending on
    the directory passed in. Returns a dict mapping each file name to the
    file's contents with surrounding whitespace removed. The path is not
    validated before use.
    """
    def read_value(name):
        with open(os.path.join(basepath, name)) as handle:
            return handle.read().strip()

    return {name: read_value(name) for name in os.listdir(basepath)}
def cleanup_line(single_line):
    """Reduce one raw kstat line to its (name, value) pair.

    A line consists of three whitespace-separated fields; the middle one
    is dropped. For example "arc_no_grow 4 0" yields the tuple
    ("arc_no_grow", "0"). A line with any other number of fields raises
    ValueError from the unpacking.
    """
    name, _middle, value = single_line.split()
    return (name, value)
def f_bytes(byte_string):
    """Return a human-readable representation of a byte count.

    The argument is converted with int(). Values of at least one KiB are
    shown in the largest fitting power-of-two unit ("KiB" up to "YiB")
    with one decimal place; smaller values are shown as a plain integer
    followed by "Bytes". The parameter is not called "bytes" to avoid
    shadowing the builtin of that name.
    """

    units = ((2**80, "YiB"),    # yobibytes (yotta)
             (2**70, "ZiB"),    # zebibytes (zetta)
             (2**60, "EiB"),    # exbibytes (exa)
             (2**50, "PiB"),    # pebibytes (peta)
             (2**40, "TiB"),    # tebibytes (tera)
             (2**30, "GiB"),    # gibibytes (giga)
             (2**20, "MiB"),    # mebibytes (mega)
             (2**10, "KiB"))    # kibibytes (kilo)

    count = int(byte_string)

    if count < 2**10:
        return '{0} Bytes'.format(count)

    # Units are ordered largest first, and count >= 2**10 here, so the
    # loop always returns by the time it reaches the KiB entry.
    for threshold, suffix in units:
        if count >= threshold:
            return '{0:.1f} {1}'.format(count / threshold, suffix)
def get_version(request):
    """Return the version string of ZFS or SPL for the report header.

    Only the requests 'spl' and 'zfs' are forwarded to the platform
    specific get_version_impl(). Any other request does not raise; an
    error string naming the bad request is returned instead.
    """

    if request in ('spl', 'zfs'):
        return get_version_impl(request)

    return '(ERROR: "{0}" requested)'.format(request)
def isolate_section(section_name, kstats_dict):
    """Return the data of one section as a name -> value dict.

    kstats_dict maps section names to lists of raw lines; each line of the
    requested section is split with cleanup_line(). If the section is not
    present, an error naming it is printed and the program exits with
    status 1.
    """

    try:
        section_data = kstats_dict[section_name]
    except KeyError:
        # Report the requested name: section_data was never assigned on
        # this path, so formatting it would raise instead of printing.
        print('ERROR: Data on {0} not available'.format(section_name))
        sys.exit(1)

    section_dict = dict(cleanup_line(line) for line in section_data)

    return section_dict
+ """ + + arc_stats = isolate_section('arcstats', kstats_dict) + + throttle = arc_stats['memory_throttle_count'] + + if throttle == '0': + health = 'HEALTHY' + else: + health = 'THROTTLED' + + prt_1('ARC status:', health) + prt_i1('Memory throttle count:', throttle) + print() + + arc_size = arc_stats['size'] + arc_target_size = arc_stats['c'] + arc_max = arc_stats['c_max'] + arc_min = arc_stats['c_min'] + mfu_size = arc_stats['mfu_size'] + mru_size = arc_stats['mru_size'] + meta_limit = arc_stats['arc_meta_limit'] + meta_size = arc_stats['arc_meta_used'] + dnode_limit = arc_stats['arc_dnode_limit'] + dnode_size = arc_stats['dnode_size'] + target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) + + prt_2('ARC size (current):', + f_perc(arc_size, arc_max), f_bytes(arc_size)) + prt_i2('Target size (adaptive):', + f_perc(arc_target_size, arc_max), f_bytes(arc_target_size)) + prt_i2('Min size (hard limit):', + f_perc(arc_min, arc_max), f_bytes(arc_min)) + prt_i2('Max size (high water):', + target_size_ratio, f_bytes(arc_max)) + caches_size = int(mfu_size)+int(mru_size) + prt_i2('Most Frequently Used (MFU) cache size:', + f_perc(mfu_size, caches_size), f_bytes(mfu_size)) + prt_i2('Most Recently Used (MRU) cache size:', + f_perc(mru_size, caches_size), f_bytes(mru_size)) + prt_i2('Metadata cache size (hard limit):', + f_perc(meta_limit, arc_max), f_bytes(meta_limit)) + prt_i2('Metadata cache size (current):', + f_perc(meta_size, meta_limit), f_bytes(meta_size)) + prt_i2('Dnode cache size (hard limit):', + f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) + prt_i2('Dnode cache size (current):', + f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) + print() + + print('ARC hash breakdown:') + prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) + prt_i2('Elements current:', + f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), + f_hits(arc_stats['hash_elements'])) + prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) + + 
prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) + prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) + print() + + print('ARC misc:') + prt_i1('Deleted:', f_hits(arc_stats['deleted'])) + prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) + prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) + print() + + +def section_archits(kstats_dict): + """Print information on how the caches are accessed ("arc hits"). + """ + + arc_stats = isolate_section('arcstats', kstats_dict) + all_accesses = int(arc_stats['hits'])+int(arc_stats['misses']) + actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits']) + + prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses)) + ta_todo = (('Cache hit ratio:', arc_stats['hits']), + ('Cache miss ratio:', arc_stats['misses']), + ('Actual hit ratio (MFU + MRU hits):', actual_hits)) + + for title, value in ta_todo: + prt_i2(title, f_perc(value, all_accesses), f_hits(value)) + + dd_total = int(arc_stats['demand_data_hits']) +\ + int(arc_stats['demand_data_misses']) + prt_i2('Data demand efficiency:', + f_perc(arc_stats['demand_data_hits'], dd_total), + f_hits(dd_total)) + + dp_total = int(arc_stats['prefetch_data_hits']) +\ + int(arc_stats['prefetch_data_misses']) + prt_i2('Data prefetch efficiency:', + f_perc(arc_stats['prefetch_data_hits'], dp_total), + f_hits(dp_total)) + + known_hits = int(arc_stats['mfu_hits']) +\ + int(arc_stats['mru_hits']) +\ + int(arc_stats['mfu_ghost_hits']) +\ + int(arc_stats['mru_ghost_hits']) + + anon_hits = int(arc_stats['hits'])-known_hits + + print() + print('Cache hits by cache type:') + cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), + ('Most recently used (MRU):', arc_stats['mru_hits']), + ('Most frequently used (MFU) ghost:', + arc_stats['mfu_ghost_hits']), + ('Most recently used (MRU) ghost:', + arc_stats['mru_ghost_hits'])) + + for title, value in cl_todo: + prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) + + # For some reason, 
def section_dmu(kstats_dict):
    """Print the DMU prefetch statistics taken from the zfetchstats section.

    Shows the total number of prefetch accesses (hits plus misses) followed
    by the hit and miss counts, each with its share of the total.
    """

    stats = isolate_section('zfetchstats', kstats_dict)
    total = int(stats['hits'])+int(stats['misses'])

    prt_1('DMU prefetch efficiency:', f_hits(total))

    for title, key in (('Hit ratio:', 'hits'), ('Miss ratio:', 'misses')):
        prt_i2(title, f_perc(stats[key], total), f_hits(stats[key]))

    print()
+ """ + + # The L2ARC statistics live in the same section as the normal ARC stuff + arc_stats = isolate_section('arcstats', kstats_dict) + + if arc_stats['l2_size'] == '0': + print('L2ARC not detected, skipping section\n') + return + + l2_errors = int(arc_stats['l2_writes_error']) +\ + int(arc_stats['l2_cksum_bad']) +\ + int(arc_stats['l2_io_error']) + + l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) + health = 'HEALTHY' + + if l2_errors > 0: + health = 'DEGRADED' + + prt_1('L2ARC status:', health) + + l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), + ('Free on write:', 'l2_free_on_write'), + ('R/W clashes:', 'l2_rw_clash'), + ('Bad checksums:', 'l2_cksum_bad'), + ('I/O errors:', 'l2_io_error')) + + for title, value in l2_todo: + prt_i1(title, f_hits(arc_stats[value])) + + print() + prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) + prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_asize'])) + prt_i2('Header size:', + f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_hdr_size'])) + + print() + prt_1('L2ARC breakdown:', f_hits(l2_access_total)) + prt_i2('Hit ratio:', + f_perc(arc_stats['l2_hits'], l2_access_total), + f_hits(arc_stats['l2_hits'])) + prt_i2('Miss ratio:', + f_perc(arc_stats['l2_misses'], l2_access_total), + f_hits(arc_stats['l2_misses'])) + prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) + + print() + print('L2ARC writes:') + + if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: + prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) + prt_i2('Done ratio:', + f_perc(arc_stats['l2_writes_done'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_done'])) + prt_i2('Error ratio:', + f_perc(arc_stats['l2_writes_error'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_error'])) + else: + prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + + print() + print('L2ARC 
def section_tunables(*_):
    """Print the tunables in sorted order, honouring the alternate format
    and description options. Positional arguments are accepted and ignored;
    this section does not use kstats.
    """

    tunables = get_tunable_params()
    print('Tunables:')

    # Descriptions are only fetched when the user asked for them
    descriptions = get_descriptions('zfs') if ARGS.desc else None

    for key in sorted(tunables.keys()):

        if descriptions is not None:
            # Fall back to a placeholder for tunables without description
            print(INDENT+'#', descriptions.get(key, '(No description found)'))

        print(format_raw_line(key, tunables[key]))

    print()
def main():
    """Run program. The options to draw a graph and to print all data raw are
    treated separately because they come with their own call.

    Exits with status 0 on success and status 1 when an unknown section
    or an unsupported page number was requested.
    """

    kstats = get_kstats()

    if ARGS.graph:
        draw_graph(kstats)
        sys.exit(0)

    print_header()

    if ARGS.raw:
        print_raw(kstats)

    elif ARGS.section is not None:

        # Only the lookup is guarded: a KeyError raised while the section
        # itself runs (e.g. a missing kstat) must not be reported as an
        # unknown section name.
        try:
            section_call = section_calls[ARGS.section]
        except KeyError:
            print('Error: Section "{0}" unknown'.format(ARGS.section))
            sys.exit(1)

        section_call(kstats)

    elif ARGS.page is not None:
        # Compare against None so that "-p 0" is rejected as an unsupported
        # page instead of being treated as if no option had been given.
        print('WARNING: Pages are deprecated, please use "--section"\n')

        pages_to_calls = {1: 'arc',
                          2: 'archits',
                          3: 'l2arc',
                          4: 'dmu',
                          5: 'vdev',
                          6: 'tunables'}

        try:
            call = pages_to_calls[ARGS.page]
        except KeyError:
            print('Error: Page "{0}" not supported'.format(ARGS.page))
            sys.exit(1)
        else:
            section_calls[call](kstats)

    else:
        # If no parameters were given, we print all sections. We might want to
        # change the sequence by hand
        for section in sorted(section_calls):
            section_calls[section](kstats)

    sys.exit(0)
Nadgir, originally published on his Sun blog on +# 09/18/2007 +# http://blogs.sun.com/realneel/entry/zfs_arc_statistics +# +# A new version aimed to improve upon the original by adding features +# and fixing bugs as needed. This version was maintained by Mike +# Harsch and was hosted in a public open source repository: +# http://github.com/mharsch/arcstat +# +# but has since moved to the illumos-gate repository. +# +# This Python port was written by John Hixson for FreeNAS, introduced +# in commit e2c29f: +# https://github.com/freenas/freenas +# +# and has been improved by many people since. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Fields have a fixed width. Every interval, we fill the "v" +# hash with its corresponding value (v[field]=value) using calculate(). +# @hdr is the array of fields that needs to be printed, so we +# just iterate over this array and print the values using our pretty printer. +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. 
+# + +import sys +import time +import getopt +import re +import copy + +from signal import signal, SIGINT, SIGWINCH, SIG_DFL + + +cols = { + # HDR: [Size, Scale, Description] + "time": [8, -1, "Time"], + "hits": [4, 1000, "ARC reads per second"], + "miss": [4, 1000, "ARC misses per second"], + "read": [4, 1000, "Total ARC accesses per second"], + "hit%": [4, 100, "ARC hit percentage"], + "miss%": [5, 100, "ARC miss percentage"], + "dhit": [4, 1000, "Demand hits per second"], + "dmis": [4, 1000, "Demand misses per second"], + "dh%": [3, 100, "Demand hit percentage"], + "dm%": [3, 100, "Demand miss percentage"], + "phit": [4, 1000, "Prefetch hits per second"], + "pmis": [4, 1000, "Prefetch misses per second"], + "ph%": [3, 100, "Prefetch hits percentage"], + "pm%": [3, 100, "Prefetch miss percentage"], + "mhit": [4, 1000, "Metadata hits per second"], + "mmis": [4, 1000, "Metadata misses per second"], + "mread": [5, 1000, "Metadata accesses per second"], + "mh%": [3, 100, "Metadata hit percentage"], + "mm%": [3, 100, "Metadata miss percentage"], + "arcsz": [5, 1024, "ARC size"], + "size": [4, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], + "eskip": [5, 1000, "evict_skip per second"], + "mtxmis": [6, 1000, "mutex_miss per second"], + "dread": [5, 1000, "Demand accesses per second"], + "pread": [5, 1000, "Prefetch accesses per second"], + "l2hits": [6, 1000, "L2ARC hits per second"], + "l2miss": [6, 1000, "L2ARC misses per second"], + "l2read": [6, 1000, "Total L2ARC accesses per second"], + "l2hit%": [6, 100, "L2ARC access hit percentage"], + "l2miss%": [7, 100, "L2ARC access miss percentage"], + "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], + "l2size": [6, 1024, "Size of the L2ARC"], + "l2bytes": [7, 1024, "Bytes read per second from the 
L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], + "avail": [5, 1024, "ARC available memory"], + "waste": [5, 1024, "Wasted memory due to round up to pagesize"], +} + +v = {} +hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", + "mm%", "size", "c", "avail"] +xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread", + "pread", "read"] +sint = 1 # Default interval is 1 second +count = 1 # Default count is 1 +hdr_intr = 20 # Print header every 20 lines of output +opfile = None +sep = " " # Default separator is 2 spaces +version = "0.4" +l2exist = False +cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " + "[count]]\n") +cur = {} +d = {} +out = None +kstat = None + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def kstat_update(): + global kstat + + k = sysctl.filter('kstat.zfs.misc.arcstats') + + if not k: + sys.exit(1) + + kstat = {} + + for s in k: + if not s: + continue + + name, value = s.name, s.value + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + +elif sys.platform.startswith('linux'): + def kstat_update(): + global kstat + + k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + + if not k: + sys.exit(1) + + del k[0:2] + kstat = {} + + for s in k: + if not s: + continue + + name, unused, value = s.split() + kstat[name] = int(value) + + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("Field definitions are as follows:\n") + for key in cols: + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(0) + + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + 
sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -v\n") + sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\n") + + sys.exit(1) + + +def snap_stats(): + global cur + global kstat + + prev = copy.deepcopy(cur) + kstat_update() + + cur = kstat + for key in cur: + if re.match(key, "class"): + continue + if key in prev: + d[key] = cur[key] - prev[key] + else: + d[key] = cur[key] + + +def prettynum(sz, scale, num=0): + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + # Special case for date field + if scale == -1: + return "%s" % num + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while abs(num) > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if abs(save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + + +def print_values(): + global hdr + global sep + global v + + sys.stdout.write(sep.join( + prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr)) + + sys.stdout.write("\n") + sys.stdout.flush() + + +def print_header(): + global hdr + global sep + + sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr)) + + sys.stdout.write("\n") + + +def get_terminal_lines(): + try: + import fcntl + import termios + import struct + data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') + sz = struct.unpack('hh', data) + return sz[0] + except Exception: + pass + + +def update_hdr_intr(): + global hdr_intr + + lines = get_terminal_lines() + if lines and lines > 3: + 
hdr_intr = lines - 3 + + +def resize_handler(signum, frame): + update_hdr_intr() + + +def init(): + global sint + global count + global hdr + global xhdr + global opfile + global sep + global out + global l2exist + + desired_cols = None + xflag = False + hflag = False + vflag = False + i = 1 + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "xo:hvs:f:", + [ + "extended", + "outfile", + "help", + "verbose", + "separator", + "columns" + ] + ) + except getopt.error as msg: + sys.stderr.write("Error: %s\n" % str(msg)) + usage() + opts = None + + for opt, arg in opts: + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-o', '--outfile'): + opfile = arg + i += 1 + if opt in ('-h', '--help'): + hflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-s', '--separator'): + sep = arg + i += 1 + if opt in ('-f', '--columns'): + desired_cols = arg + i += 1 + i += 1 + + argv = sys.argv[i:] + sint = int(argv[0]) if argv else sint + count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + if xflag: + hdr = xhdr + + update_hdr_intr() + + # check if L2ARC exists + snap_stats() + l2_size = cur.get("l2_size") + if l2_size: + l2exist = True + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif not l2exist and ele.startswith("l2"): + sys.stdout.write("No L2ARC Here\n%s\n" % ele) + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! 
-- %s\n" % + incompat) + usage() + + if opfile: + try: + out = open(opfile, "w") + sys.stdout = out + + except IOError: + sys.stderr.write("Cannot open %s for writing\n" % opfile) + sys.exit(1) + + +def calculate(): + global d + global v + global l2exist + + v = dict() + v["time"] = time.strftime("%H:%M:%S", time.localtime()) + v["hits"] = d["hits"] / sint + v["miss"] = d["misses"] / sint + v["read"] = v["hits"] + v["miss"] + v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 + v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 + + v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint + v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint + + v["dread"] = v["dhit"] + v["dmis"] + v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 + v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 + + v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint + v["pmis"] = (d["prefetch_data_misses"] + + d["prefetch_metadata_misses"]) / sint + + v["pread"] = v["phit"] + v["pmis"] + v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 + v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 + + v["mhit"] = (d["prefetch_metadata_hits"] + + d["demand_metadata_hits"]) / sint + v["mmis"] = (d["prefetch_metadata_misses"] + + d["demand_metadata_misses"]) / sint + + v["mread"] = v["mhit"] + v["mmis"] + v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 + v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 + + v["arcsz"] = cur["size"] + v["size"] = cur["size"] + v["c"] = cur["c"] + v["mfu"] = d["mfu_hits"] / sint + v["mru"] = d["mru_hits"] / sint + v["mrug"] = d["mru_ghost_hits"] / sint + v["mfug"] = d["mfu_ghost_hits"] / sint + v["eskip"] = d["evict_skip"] / sint + v["mtxmis"] = d["mutex_miss"] / sint + + if l2exist: + v["l2hits"] = d["l2_hits"] / sint + v["l2miss"] = d["l2_misses"] / sint + v["l2read"] = v["l2hits"] + v["l2miss"] + v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if 
v["l2read"] > 0 else 0 + + v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 + v["l2asize"] = cur["l2_asize"] + v["l2size"] = cur["l2_size"] + v["l2bytes"] = d["l2_read_bytes"] / sint + + v["grow"] = 0 if cur["arc_no_grow"] else 1 + v["need"] = cur["arc_need_free"] + v["free"] = cur["memory_free_bytes"] + v["avail"] = cur["memory_available_bytes"] + v["waste"] = cur["abd_chunk_waste_size"] + + +def main(): + global sint + global count + global hdr_intr + + i = 0 + count_flag = 0 + + init() + if count > 0: + count_flag = 1 + + signal(SIGINT, SIG_DFL) + signal(SIGWINCH, resize_handler) + while True: + if i == 0: + print_header() + + snap_stats() + calculate() + print_values() + + if count_flag == 1: + if count <= 1: + break + count -= 1 + + i = 0 if i >= hdr_intr else i + 1 + time.sleep(sint) + + if out: + out.close() + + +if __name__ == '__main__': + main() diff --git a/sys/contrib/openzfs/cmd/dbufstat/.gitignore b/sys/contrib/openzfs/cmd/dbufstat/.gitignore new file mode 100644 index 000000000000..2c2e913cef70 --- /dev/null +++ b/sys/contrib/openzfs/cmd/dbufstat/.gitignore @@ -0,0 +1 @@ +dbufstat diff --git a/sys/contrib/openzfs/cmd/dbufstat/Makefile.am b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am new file mode 100644 index 000000000000..e672a01a4227 --- /dev/null +++ b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +bin_SCRIPTS = dbufstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in new file mode 100755 index 000000000000..98eb79057388 --- /dev/null +++ b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in @@ -0,0 +1,669 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out statistics for all cached dmu buffers. This information +# is available through the dbufs kstat and may be post-processed as +# needed by the script. 
+# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (C) 2013 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. 
+# + +import sys +import getopt +import errno +import re + +bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] +bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", + "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", + "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", + "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] +bincompat = ["cached", "direct", "indirect", "bonus", "spill"] + +dhdr = ["pool", "objset", "object", "dtype", "cached"] +dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", + "indirect", "bonus", "spill"] +dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", + "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", + "l2_comp", "aholds"] + +thdr = ["pool", "objset", "dtype", "cached"] +txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", + "bonus", "spill"] +tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", + "dbc", "dbholds", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize"] + +cols = { + # hdr: [size, scale, description] + "pool": [15, -1, "pool name"], + "objset": [6, -1, "dataset identification number"], + "object": [10, -1, "object number"], + "level": [5, -1, "indirection level of buffer"], + "blkid": [8, -1, "block number of buffer"], + "offset": [12, 1024, "offset in object of buffer"], + "dbsize": [7, 1024, "size of buffer"], + "meta": [4, -1, "is this buffer metadata?"], + "state": [5, -1, "state of buffer (read, cached, etc)"], + "dbholds": [7, 1000, "number of holds on buffer"], + "dbc": [3, -1, 
"in dbuf cache"], + "list": [4, -1, "which ARC list contains this buffer"], + "atype": [7, -1, "ARC header type (data or metadata)"], + "flags": [9, -1, "ARC read flags"], + "count": [5, -1, "ARC data count"], + "asize": [7, 1024, "size of this ARC buffer"], + "access": [10, -1, "time this ARC buffer was last accessed"], + "mru": [5, 1000, "hits while on the ARC's MRU list"], + "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], + "mfu": [5, 1000, "hits while on the ARC's MFU list"], + "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], + "l2": [5, 1000, "hits while on the L2ARC"], + "l2_dattr": [8, -1, "L2ARC disk address/offset"], + "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], + "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], + "aholds": [6, 1000, "number of holds on this ARC buffer"], + "dtype": [27, -1, "dnode type"], + "btype": [27, -1, "bonus buffer type"], + "data_bs": [7, 1024, "data block size"], + "meta_bs": [7, 1024, "metadata block size"], + "bsize": [6, 1024, "bonus buffer size"], + "lvls": [6, -1, "number of indirection levels"], + "dholds": [6, 1000, "number of holds on dnode"], + "blocks": [8, 1000, "number of allocated blocks"], + "dsize": [12, 1024, "size of dnode"], + "cached": [6, 1024, "bytes cached for all blocks"], + "direct": [6, 1024, "bytes cached for direct blocks"], + "indirect": [8, 1024, "bytes cached for indirect blocks"], + "bonus": [5, 1024, "bytes cached for bonus buffer"], + "spill": [5, 1024, "bytes cached for spill block"], +} + +hdr = None +xhdr = None +sep = " " # Default separator is 2 spaces +cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] " + "[-s string] [-F filter]\n") +raw = 0 + + +def print_incompat_helper(incompat): + cnt = 0 + for key in sorted(incompat): + if cnt is 0: + sys.stderr.write("\t") + elif cnt > 8: + sys.stderr.write(",\n\t") + cnt = 0 + else: + sys.stderr.write(", ") + + sys.stderr.write("%s" % key) + cnt += 1 + + 
sys.stderr.write("\n\n") + + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + + sys.stderr.write("Field definitions incompatible with '-b' option:\n") + print_incompat_helper(bincompat) + + sys.stderr.write("Field definitions incompatible with '-d' option:\n") + print_incompat_helper(dincompat) + + sys.stderr.write("Field definitions incompatible with '-t' option:\n") + print_incompat_helper(tincompat) + + sys.stderr.write("Field definitions are as follows:\n") + for key in sorted(cols.keys()): + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(0) + + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -b : Print table of information for each dbuf\n") + sys.stderr.write("\t -d : Print table of information for each dnode\n") + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -n : Exclude header from output\n") + sys.stderr.write("\t -r : Print raw values\n") + sys.stderr.write("\t -t : Print table of information for each dnode type" + "\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -i : Redirect input from the specified file\n") + sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\t -F : Filter output by value or regex\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n") + sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n") + sys.stderr.write("\tdbufstat -v\n") + sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n") + sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n") + sys.stderr.write("\n") + + sys.exit(1) + + +def prettynum(sz, scale, num=0): + global raw + + 
suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + if raw or scale == -1: + return "%*s" % (sz, num) + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while num > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if (save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + + +def print_values(v): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%s%s" % ( + prettynum(cols[col][0], cols[col][1], v[col]), sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + + +def print_header(): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + + +def get_typestring(t): + ot_strings = [ + "DMU_OT_NONE", + # general: + "DMU_OT_OBJECT_DIRECTORY", + "DMU_OT_OBJECT_ARRAY", + "DMU_OT_PACKED_NVLIST", + "DMU_OT_PACKED_NVLIST_SIZE", + "DMU_OT_BPOBJ", + "DMU_OT_BPOBJ_HDR", + # spa: + "DMU_OT_SPACE_MAP_HEADER", + "DMU_OT_SPACE_MAP", + # zil: + "DMU_OT_INTENT_LOG", + # dmu: + "DMU_OT_DNODE", + "DMU_OT_OBJSET", + # dsl: + "DMU_OT_DSL_DIR", + "DMU_OT_DSL_DIR_CHILD_MAP", + "DMU_OT_DSL_DS_SNAP_MAP", + "DMU_OT_DSL_PROPS", + "DMU_OT_DSL_DATASET", + # zpl: + "DMU_OT_ZNODE", + "DMU_OT_OLDACL", + "DMU_OT_PLAIN_FILE_CONTENTS", + "DMU_OT_DIRECTORY_CONTENTS", + "DMU_OT_MASTER_NODE", + "DMU_OT_UNLINKED_SET", + # zvol: + "DMU_OT_ZVOL", + "DMU_OT_ZVOL_PROP", + # other; for testing only! 
+ "DMU_OT_PLAIN_OTHER", + "DMU_OT_UINT64_OTHER", + "DMU_OT_ZAP_OTHER", + # new object types: + "DMU_OT_ERROR_LOG", + "DMU_OT_SPA_HISTORY", + "DMU_OT_SPA_HISTORY_OFFSETS", + "DMU_OT_POOL_PROPS", + "DMU_OT_DSL_PERMS", + "DMU_OT_ACL", + "DMU_OT_SYSACL", + "DMU_OT_FUID", + "DMU_OT_FUID_SIZE", + "DMU_OT_NEXT_CLONES", + "DMU_OT_SCAN_QUEUE", + "DMU_OT_USERGROUP_USED", + "DMU_OT_USERGROUP_QUOTA", + "DMU_OT_USERREFS", + "DMU_OT_DDT_ZAP", + "DMU_OT_DDT_STATS", + "DMU_OT_SA", + "DMU_OT_SA_MASTER_NODE", + "DMU_OT_SA_ATTR_REGISTRATION", + "DMU_OT_SA_ATTR_LAYOUTS", + "DMU_OT_SCAN_XLATE", + "DMU_OT_DEDUP", + "DMU_OT_DEADLIST", + "DMU_OT_DEADLIST_HDR", + "DMU_OT_DSL_CLONES", + "DMU_OT_BPOBJ_SUBOBJ"] + otn_strings = { + 0x80: "DMU_OTN_UINT8_DATA", + 0xc0: "DMU_OTN_UINT8_METADATA", + 0x81: "DMU_OTN_UINT16_DATA", + 0xc1: "DMU_OTN_UINT16_METADATA", + 0x82: "DMU_OTN_UINT32_DATA", + 0xc2: "DMU_OTN_UINT32_METADATA", + 0x83: "DMU_OTN_UINT64_DATA", + 0xc3: "DMU_OTN_UINT64_METADATA", + 0x84: "DMU_OTN_ZAP_DATA", + 0xc4: "DMU_OTN_ZAP_METADATA", + 0xa0: "DMU_OTN_UINT8_ENC_DATA", + 0xe0: "DMU_OTN_UINT8_ENC_METADATA", + 0xa1: "DMU_OTN_UINT16_ENC_DATA", + 0xe1: "DMU_OTN_UINT16_ENC_METADATA", + 0xa2: "DMU_OTN_UINT32_ENC_DATA", + 0xe2: "DMU_OTN_UINT32_ENC_METADATA", + 0xa3: "DMU_OTN_UINT64_ENC_DATA", + 0xe3: "DMU_OTN_UINT64_ENC_METADATA", + 0xa4: "DMU_OTN_ZAP_ENC_DATA", + 0xe4: "DMU_OTN_ZAP_ENC_METADATA"} + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % t + + try: + if t < len(ot_strings): + return ot_strings[t] + else: + return otn_strings[t] + except (IndexError, KeyError): + return "(UNKNOWN)" + + +def get_compstring(c): + comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", + "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", + "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", + "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", + "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", + "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", + "ZIO_COMPRESS_GZIP_8", 
"ZIO_COMPRESS_GZIP_9", + "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", + "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % c + + try: + return comp_strings[c] + except IndexError: + return "%i" % c + + +def parse_line(line, labels): + global hdr + + new = dict() + val = None + for col in hdr: + # These are "special" fields computed in the update_dict + # function, prevent KeyError exception on labels[col] for these. + if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: + val = line[labels[col]] + + if col in ['pool', 'flags']: + new[col] = str(val) + elif col in ['dtype', 'btype']: + new[col] = get_typestring(int(val)) + elif col in ['l2_comp']: + new[col] = get_compstring(int(val)) + else: + new[col] = int(val) + + return new + + +def update_dict(d, k, line, labels): + pool = line[labels['pool']] + objset = line[labels['objset']] + key = line[labels[k]] + + dbsize = int(line[labels['dbsize']]) + blkid = int(line[labels['blkid']]) + level = int(line[labels['level']]) + + if pool not in d: + d[pool] = dict() + + if objset not in d[pool]: + d[pool][objset] = dict() + + if key not in d[pool][objset]: + d[pool][objset][key] = parse_line(line, labels) + d[pool][objset][key]['bonus'] = 0 + d[pool][objset][key]['cached'] = 0 + d[pool][objset][key]['direct'] = 0 + d[pool][objset][key]['indirect'] = 0 + d[pool][objset][key]['spill'] = 0 + + d[pool][objset][key]['cached'] += dbsize + + if blkid == -1: + d[pool][objset][key]['bonus'] += dbsize + elif blkid == -2: + d[pool][objset][key]['spill'] += dbsize + else: + if level == 0: + d[pool][objset][key]['direct'] += dbsize + else: + d[pool][objset][key]['indirect'] += dbsize + + return d + + +def skip_line(vals, filters): + ''' + Determines if a line should be skipped during printing + based on a set of filters + ''' + if len(filters) == 0: + return False + + for key in vals: + if key in filters: + val = 
prettynum(cols[key][0], cols[key][1], vals[key]).strip() + # we want a full match here + if re.match("(?:" + filters[key] + r")\Z", val) is None: + return True + + return False + + +def print_dict(d, filters, noheader): + if not noheader: + print_header() + for pool in list(d.keys()): + for objset in list(d[pool].keys()): + for v in list(d[pool][objset].values()): + if not skip_line(v, filters): + print_values(v) + + +def dnodes_build_dict(filehandle): + labels = dict() + dnodes = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(dnodes, 'object', line.split(), labels) + + return dnodes + + +def types_build_dict(filehandle): + labels = dict() + types = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(types, 'dtype', line.split(), labels) + + return types + + +def buffers_print_all(filehandle, filters, noheader): + labels = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + if not noheader: + print_header() + + # The rest of the file is buffer information + for line in filehandle: + vals = parse_line(line.split(), labels) + if not skip_line(vals, filters): + print_values(vals) + + +def main(): + global hdr + global sep + global raw + + desired_cols = None + bflag = False + dflag = False + hflag = False + ifile = None + ofile = None + 
tflag = False + vflag = False + xflag = False + nflag = False + filters = dict() + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "bdf:hi:o:rs:tvxF:n", + [ + "buffers", + "dnodes", + "columns", + "help", + "infile", + "outfile", + "separator", + "types", + "verbose", + "extended", + "filter" + ] + ) + except getopt.error: + usage() + opts = None + + for opt, arg in opts: + if opt in ('-b', '--buffers'): + bflag = True + if opt in ('-d', '--dnodes'): + dflag = True + if opt in ('-f', '--columns'): + desired_cols = arg + if opt in ('-h', '--help'): + hflag = True + if opt in ('-i', '--infile'): + ifile = arg + if opt in ('-o', '--outfile'): + ofile = arg + if opt in ('-r', '--raw'): + raw += 1 + if opt in ('-s', '--separator'): + sep = arg + if opt in ('-t', '--types'): + tflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-n', '--noheader'): + nflag = True + if opt in ('-F', '--filter'): + fils = [x.strip() for x in arg.split(",")] + + for fil in fils: + f = [x.strip() for x in fil.split("=")] + + if len(f) != 2: + sys.stderr.write("Invalid filter '%s'.\n" % fil) + sys.exit(1) + + if f[0] not in cols: + sys.stderr.write("Invalid field '%s' in filter.\n" % f[0]) + sys.exit(1) + + if f[0] in filters: + sys.stderr.write("Field '%s' specified multiple times in " + "filter.\n" % f[0]) + sys.exit(1) + + try: + re.compile("(?:" + f[1] + r")\Z") + except re.error: + sys.stderr.write("Invalid regex for field '%s' in " + "filter.\n" % f[0]) + sys.exit(1) + + filters[f[0]] = f[1] + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + # Ensure at most only one of b, d, or t flags are set + if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): + usage() + + if bflag: + hdr = bxhdr if xflag else bhdr + elif tflag: + hdr = txhdr if xflag else thdr + else: # Even if dflag is False, it's the default if none set + dflag = True + hdr = dxhdr if xflag else dhdr + 
+ if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif ((bflag and bincompat and ele in bincompat) or + (dflag and dincompat and ele in dincompat) or + (tflag and tincompat and ele in tincompat)): + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! -- %s\n" % + incompat) + usage() + + if ofile: + try: + tmp = open(ofile, "w") + sys.stdout = tmp + + except IOError: + sys.stderr.write("Cannot open %s for writing\n" % ofile) + sys.exit(1) + + if not ifile: + ifile = '/proc/spl/kstat/zfs/dbufs' + + if ifile is not "-": + try: + tmp = open(ifile, "r") + sys.stdin = tmp + except IOError: + sys.stderr.write("Cannot open %s for reading\n" % ifile) + sys.exit(1) + + if bflag: + buffers_print_all(sys.stdin, filters, nflag) + + if dflag: + print_dict(dnodes_build_dict(sys.stdin), filters, nflag) + + if tflag: + print_dict(types_build_dict(sys.stdin), filters, nflag) + + +if __name__ == '__main__': + main() diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am new file mode 100644 index 000000000000..2380f56fa4d4 --- /dev/null +++ b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am @@ -0,0 +1 @@ +dist_sbin_SCRIPTS = fsck.zfs diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs new file mode 100755 index 000000000000..129a7f39c388 --- /dev/null +++ b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs @@ -0,0 +1,9 @@ +#!/bin/sh +# +# fsck.zfs: A fsck helper to accommodate distributions that expect +# to be able to execute a fsck on all filesystem types. Currently +# this script does nothing but it could be extended to act as a +# compatibility wrapper for 'zpool scrub'. 
+# + +exit 0 diff --git a/sys/contrib/openzfs/cmd/mount_zfs/.gitignore b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore new file mode 100644 index 000000000000..cd9254bde3da --- /dev/null +++ b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore @@ -0,0 +1 @@ +mount.zfs diff --git a/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am new file mode 100644 index 000000000000..6c4d6ff79f16 --- /dev/null +++ b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +# +# Ignore the prefix for the mount helper. It must be installed in /sbin/ +# because this path is hardcoded in the mount(8) for security reasons. +# However, if needed, the configure option --with-mounthelperdir= can be used +# to override the default install location. +# +sbindir=$(mounthelperdir) +sbin_PROGRAMS = mount.zfs + +mount_zfs_SOURCES = \ + mount_zfs.c + +mount_zfs_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +mount_zfs_LDADD += $(LTLIBINTL) diff --git a/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c new file mode 100644 index 000000000000..87d2ccadcded --- /dev/null +++ b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c @@ -0,0 +1,408 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Security, LLC. + */ + +#include <libintl.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/mntent.h> +#include <sys/stat.h> +#include <libzfs.h> +#include <libzutil.h> +#include <locale.h> +#include <getopt.h> +#include <fcntl.h> +#include <errno.h> + +#define ZS_COMMENT 0x00000000 /* comment */ +#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ + +libzfs_handle_t *g_zfs; + +/* + * Return the pool/dataset to mount given the name passed to mount. This + * is expected to be of the form pool/dataset, however may also refer to + * a block device if that device contains a valid zfs label. + */ +static char * +parse_dataset(char *dataset) +{ + char cwd[PATH_MAX]; + struct stat64 statbuf; + int error; + int len; + + /* + * We expect a pool/dataset to be provided, however if we're + * given a device which is a member of a zpool we attempt to + * extract the pool name stored in the label. Given the pool + * name we can mount the root dataset. + */ + error = stat64(dataset, &statbuf); + if (error == 0) { + nvlist_t *config; + char *name; + int fd; + + fd = open(dataset, O_RDONLY); + if (fd < 0) + goto out; + + error = zpool_read_label(fd, &config, NULL); + (void) close(fd); + if (error) + goto out; + + error = nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &name); + if (error) { + nvlist_free(config); + } else { + dataset = strdup(name); + nvlist_free(config); + return (dataset); + } + } +out: + /* + * If a file or directory in your current working directory is + * named 'dataset' then mount(8) will prepend your current working + * directory to the dataset. 
There is no way to prevent this + * behavior so we simply check for it and strip the prepended + * path when it is added. + */ + if (getcwd(cwd, PATH_MAX) == NULL) + return (dataset); + + len = strlen(cwd); + + /* Do not add one when cwd already ends in a trailing '/' */ + if (strncmp(cwd, dataset, len) == 0) + return (dataset + len + (cwd[len-1] != '/')); + + return (dataset); +} + +/* + * Update the mtab_* code to use the libmount library when it is commonly + * available otherwise fallback to legacy mode. The mount(8) utility will + * manage the lock file for us to prevent racing updates to /etc/mtab. + */ +static int +mtab_is_writeable(void) +{ + struct stat st; + int error, fd; + + error = lstat("/etc/mtab", &st); + if (error || S_ISLNK(st.st_mode)) + return (0); + + fd = open("/etc/mtab", O_RDWR | O_CREAT, 0644); + if (fd < 0) + return (0); + + close(fd); + return (1); +} + +static int +mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) +{ + struct mntent mnt; + FILE *fp; + int error; + + mnt.mnt_fsname = dataset; + mnt.mnt_dir = mntpoint; + mnt.mnt_type = type; + mnt.mnt_opts = mntopts ? 
mntopts : ""; + mnt.mnt_freq = 0; + mnt.mnt_passno = 0; + + fp = setmntent("/etc/mtab", "a+"); + if (!fp) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be opened due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + error = addmntent(fp, &mnt); + if (error) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be updated due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + (void) endmntent(fp); + + return (MOUNT_SUCCESS); +} + +int +main(int argc, char **argv) +{ + zfs_handle_t *zhp; + char prop[ZFS_MAXPROPLEN]; + uint64_t zfs_version = 0; + char mntopts[MNT_LINE_MAX] = { '\0' }; + char badopt[MNT_LINE_MAX] = { '\0' }; + char mtabopt[MNT_LINE_MAX] = { '\0' }; + char mntpoint[PATH_MAX]; + char *dataset; + unsigned long mntflags = 0, zfsflags = 0, remount = 0; + int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0; + int error, c; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + opterr = 0; + + /* check options */ + while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) { + switch (c) { + case 's': + sloppy = 1; + break; + case 'f': + fake = 1; + break; + case 'n': + nomtab = 1; + break; + case 'v': + verbose++; + break; + case 'o': + (void) strlcpy(mntopts, optarg, sizeof (mntopts)); + break; + case 'h': + case '?': + (void) fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + (void) fprintf(stderr, gettext("Usage: mount.zfs " + "[-sfnv] [-o options] <dataset> <mountpoint>\n")); + return (MOUNT_USAGE); + } + } + + argc -= optind; + argv += optind; + + /* check that we only have two arguments */ + if (argc != 2) { + if (argc == 0) + (void) fprintf(stderr, gettext("missing dataset " + "argument\n")); + else if (argc == 1) + (void) fprintf(stderr, + gettext("missing mountpoint argument\n")); + else + (void) fprintf(stderr, gettext("too many arguments\n")); + (void) fprintf(stderr, "usage: 
mount <dataset> <mountpoint>\n"); + return (MOUNT_USAGE); + } + + dataset = parse_dataset(argv[0]); + + /* canonicalize the mount point */ + if (realpath(argv[1], mntpoint) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted at '%s' due to canonicalization error %d.\n"), + dataset, argv[1], errno); + return (MOUNT_SYSERR); + } + + /* validate mount options and set mntflags */ + error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy, + badopt, mtabopt); + if (error) { + switch (error) { + case ENOMEM: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to a memory allocation " + "failure.\n"), dataset); + return (MOUNT_SYSERR); + case ENOENT: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to invalid option " + "'%s'.\n"), dataset, badopt); + (void) fprintf(stderr, gettext("Use the '-s' option " + "to ignore the bad mount option.\n")); + return (MOUNT_USAGE); + default: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to internal error %d.\n"), + dataset, error); + return (MOUNT_SOFTWARE); + } + } + + if (verbose) + (void) fprintf(stdout, gettext("mount.zfs:\n" + " dataset: \"%s\"\n mountpoint: \"%s\"\n" + " mountflags: 0x%lx\n zfsflags: 0x%lx\n" + " mountopts: \"%s\"\n mtabopts: \"%s\"\n"), + dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); + + if (mntflags & MS_REMOUNT) { + nomtab = 1; + remount = 1; + } + + if (zfsflags & ZS_ZFSUTIL) + zfsutil = 1; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (MOUNT_SYSERR); + } + + /* try to open the dataset to access the mount point */ + if ((zhp = zfs_open(g_zfs, dataset, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted, unable to open the dataset\n"), dataset); + libzfs_fini(g_zfs); + return (MOUNT_USAGE); + } + + zfs_adjust_mount_options(zhp, 
mntpoint, mntopts, mtabopt); + + /* treat all snapshots as legacy mount points */ + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) + (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); + else + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, + sizeof (prop), NULL, NULL, 0, B_FALSE); + + /* + * Fetch the max supported zfs version in case we get ENOTSUP + * back from the mount command, since we need the zfs handle + * to do so. + */ + zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (zfs_version == 0) { + fprintf(stderr, gettext("unable to fetch " + "ZFS version for filesystem '%s'\n"), dataset); + return (MOUNT_SYSERR); + } + + zfs_close(zhp); + libzfs_fini(g_zfs); + + /* + * Legacy mount points may only be mounted using 'mount', never using + * 'zfs mount'. However, since 'zfs mount' actually invokes 'mount' + * we differentiate the two cases using the 'zfsutil' mount option. + * This mount option should only be supplied by the 'zfs mount' util. + * + * The only exception to the above rule is '-o remount' which is + * always allowed for non-legacy datasets. This is done because when + * using zfs as your root file system both rc.sysinit/umountroot and + * systemd depend on 'mount -o remount <mountpoint>' to work. 
+ */ + if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'zfs mount'.\n" + "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n" + "See zfs(8) for more information.\n"), + dataset, mntpoint, dataset, mntpoint); + return (MOUNT_USAGE); + } + + if (!zfsutil && !(remount || fake) && + strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'mount'.\n" + "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n" + "See zfs(8) for more information.\n"), + dataset, "legacy", dataset); + return (MOUNT_USAGE); + } + + if (!fake) { + error = mount(dataset, mntpoint, MNTTYPE_ZFS, + mntflags, mntopts); + } + + if (error) { + switch (errno) { + case ENOENT: + (void) fprintf(stderr, gettext("mount point " + "'%s' does not exist\n"), mntpoint); + return (MOUNT_SYSERR); + case EBUSY: + (void) fprintf(stderr, gettext("filesystem " + "'%s' is already mounted\n"), dataset); + return (MOUNT_BUSY); + case ENOTSUP: + if (zfs_version > ZPL_VERSION) { + (void) fprintf(stderr, + gettext("filesystem '%s' (v%d) is not " + "supported by this implementation of " + "ZFS (max v%d).\n"), dataset, + (int)zfs_version, (int)ZPL_VERSION); + } else { + (void) fprintf(stderr, + gettext("filesystem '%s' mount " + "failed for unknown reason.\n"), dataset); + } + return (MOUNT_SYSERR); +#ifdef MS_MANDLOCK + case EPERM: + if (mntflags & MS_MANDLOCK) { + (void) fprintf(stderr, gettext("filesystem " + "'%s' has the 'nbmand=on' property set, " + "this mount\noption may be disabled in " + "your kernel. 
Use 'zfs set nbmand=off'\n" + "to disable this option and try to " + "mount the filesystem again.\n"), dataset); + return (MOUNT_SYSERR); + } + /* fallthru */ +#endif + default: + (void) fprintf(stderr, gettext("filesystem " + "'%s' can not be mounted: %s\n"), dataset, + strerror(errno)); + return (MOUNT_USAGE); + } + } + + if (!nomtab && mtab_is_writeable()) { + error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt); + if (error) + return (error); + } + + return (MOUNT_SUCCESS); +} diff --git a/sys/contrib/openzfs/cmd/raidz_test/.gitignore b/sys/contrib/openzfs/cmd/raidz_test/.gitignore new file mode 100644 index 000000000000..f8b83d9cce03 --- /dev/null +++ b/sys/contrib/openzfs/cmd/raidz_test/.gitignore @@ -0,0 +1 @@ +/raidz_test diff --git a/sys/contrib/openzfs/cmd/raidz_test/Makefile.am b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am new file mode 100644 index 000000000000..72c914e641e4 --- /dev/null +++ b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +bin_PROGRAMS = raidz_test + +raidz_test_SOURCES = \ + raidz_test.h \ + raidz_test.c \ + raidz_bench.c + +raidz_test_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la + +raidz_test_LDADD += -lm diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c new file mode 100644 index 000000000000..8a2cec4ca685 --- /dev/null +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c @@ -0,0 +1,227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <stdio.h> + +#include <sys/time.h> + +#include "raidz_test.h" + +#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32) +#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29) +#define BENCH_ASHIFT 12 +#define MIN_CS_SHIFT BENCH_ASHIFT +#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT + +static zio_t zio_bench; +static raidz_map_t *rm_bench; +static size_t max_data_size = SPA_MAXBLOCKSIZE; + +static void +bench_init_raidz_map(void) +{ + zio_bench.io_offset = 0; + zio_bench.io_size = max_data_size; + + /* + * To permit larger column sizes these have to be done + * allocated using aligned alloc instead of zio_abd_buf_alloc + */ + zio_bench.io_abd = raidz_alloc(max_data_size); + + init_zio_abd(&zio_bench); +} + +static void +bench_fini_raidz_maps(void) +{ + /* tear down golden zio */ + raidz_free(zio_bench.io_abd, max_data_size); + bzero(&zio_bench, sizeof (zio_t)); +} + +static inline void +run_gen_bench_impl(const char *impl) +{ + int fn, ncols; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + + /* Benchmark generate functions */ + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + for (ds = MIN_CS_SHIFT; ds <= 
MAX_CS_SHIFT; ds++) { + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + fn + 1; + zio_bench.io_size = 1ULL << ds; + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + + /* estimate iteration count */ + iter_cnt = GEN_BENCH_MEMORY; + iter_cnt /= zio_bench.io_size; + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_generate_parity(rm_bench); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)disksize; + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_gen_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)(ncols), + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +static void +run_gen_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_gen_bench_impl(*impl_name); + } +} + +static void +run_rec_bench_impl(const char *impl) +{ + int fn, ncols, nbad; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + static const int tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + PARITY_PQR; + zio_bench.io_size = 1ULL << ds; + + /* + * raidz block is too short to test + * the requested method + */ + if 
(zio_bench.io_size / rto_opts.rto_dcols < + (1ULL << BENCH_ASHIFT)) + continue; + + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + + /* estimate iteration count */ + iter_cnt = (REC_BENCH_MEMORY); + iter_cnt /= zio_bench.io_size; + + /* calculate how many bad columns there are */ + nbad = MIN(3, raidz_ncols(rm_bench) - + raidz_parity(rm_bench)); + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)(disksize); + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_rec_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)ncols, + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +static void +run_rec_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_rec_bench_impl(*impl_name); + } +} + +void +run_raidz_benchmark(void) +{ + bench_init_raidz_map(); + + run_gen_bench(); + run_rec_bench(); + + bench_fini_raidz_maps(); +} diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c new file mode 100644 index 000000000000..66f36b0d56ca --- /dev/null +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c @@ -0,0 +1,782 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <umem.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <assert.h> +#include <stdio.h> +#include "raidz_test.h" + +static int *rand_data; +raidz_test_opts_t rto_opts; + +static char gdb[256]; +static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d"; + +static void sig_handler(int signo) +{ + struct sigaction action; + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. 
+ */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + + if (rto_opts.rto_gdb) + if (system(gdb)) { } + + raise(signo); +} + +static void print_opts(raidz_test_opts_t *opts, boolean_t force) +{ + char *verbose; + switch (opts->rto_v) { + case 0: + verbose = "no"; + break; + case 1: + verbose = "info"; + break; + default: + verbose = "debug"; + break; + } + + if (force || opts->rto_v >= D_INFO) { + (void) fprintf(stdout, DBLSEP "Running with options:\n" + " (-a) zio ashift : %zu\n" + " (-o) zio offset : 1 << %zu\n" + " (-d) number of raidz data columns : %zu\n" + " (-s) size of DATA : 1 << %zu\n" + " (-S) sweep parameters : %s \n" + " (-v) verbose : %s \n\n", + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ + } +} + +static void usage(boolean_t requested) +{ + const raidz_test_opts_t *o = &rto_opts_defaults; + + FILE *fp = requested ? stdout : stderr; + + (void) fprintf(fp, "Usage:\n" + "\t[-a zio ashift (default: %zu)]\n" + "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" + "\t[-d number of raidz data columns (default: %zu)]\n" + "\t[-s zio size, exponent radix 2 (default: %zu)]\n" + "\t[-S parameter sweep (default: %s)]\n" + "\t[-t timeout for parameter sweep test]\n" + "\t[-B benchmark all raidz implementations]\n" + "\t[-v increase verbosity (default: %zu)]\n" + "\t[-h (print help)]\n" + "\t[-T test the test, see if failure would be detected]\n" + "\t[-D debug (attach gdb on SIGSEGV)]\n" + "", + o->rto_ashift, /* -a */ + ilog2(o->rto_offset), /* -o */ + o->rto_dcols, /* -d */ + ilog2(o->rto_dsize), /* -s */ + rto_opts.rto_sweep ? "yes" : "no", /* -S */ + o->rto_v); /* -d */ + + exit(requested ? 
0 : 1); +} + +static void process_options(int argc, char **argv) +{ + size_t value; + int opt; + + raidz_test_opts_t *o = &rto_opts; + + bcopy(&rto_opts_defaults, o, sizeof (*o)); + + while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + value = 0; + + switch (opt) { + case 'a': + value = strtoull(optarg, NULL, 0); + o->rto_ashift = MIN(13, MAX(9, value)); + break; + case 'o': + value = strtoull(optarg, NULL, 0); + o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; + break; + case 'd': + value = strtoull(optarg, NULL, 0); + o->rto_dcols = MIN(255, MAX(1, value)); + break; + case 's': + value = strtoull(optarg, NULL, 0); + o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, + MAX(SPA_MINBLOCKSHIFT, value)); + break; + case 't': + value = strtoull(optarg, NULL, 0); + o->rto_sweep_timeout = value; + break; + case 'v': + o->rto_v++; + break; + case 'S': + o->rto_sweep = 1; + break; + case 'B': + o->rto_benchmark = 1; + break; + case 'D': + o->rto_gdb = 1; + break; + case 'T': + o->rto_sanity = 1; + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } +} + +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) +#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) + +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) +#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) + +static int +cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) +{ + int i, ret = 0; + + VERIFY(parity >= 1 && parity <= 3); + + for (i = 0; i < parity; i++) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } + } + return (ret); +} + +static int +cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) +{ + int i, ret = 0; + int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + + for (i = 0; i < dcols; i++) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) 
+ != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } + } + return (ret); +} + +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *)data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + +static void +corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) +{ + int i; + raidz_col_t *col; + + for (i = 0; i < cnt; i++) { + col = &rm->rm_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + } +} + +void +init_zio_abd(zio_t *zio) +{ + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); +} + +static void +fini_raidz_map(zio_t **zio, raidz_map_t **rm) +{ + vdev_raidz_map_free(*rm); + raidz_free((*zio)->io_abd, (*zio)->io_size); + umem_free(*zio, sizeof (zio_t)); + + *zio = NULL; + *rm = NULL; +} + +static int +init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) +{ + int err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + const size_t total_ncols = opts->rto_dcols + parity; + + if (opts->rm_golden) { + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + } + + opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; + opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; + + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); + + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); + + VERIFY0(vdev_raidz_impl_set("original")); + + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + + VERIFY(opts->zio_golden); + VERIFY(opts->rm_golden); + + vdev_raidz_generate_parity(opts->rm_golden); + vdev_raidz_generate_parity(rm_test); + + /* sanity check */ + err |= 
cmp_data(opts, rm_test); + err |= cmp_code(opts, rm_test, parity); + + if (err) + ERR("initializing the golden copy ... [FAIL]!\n"); + + /* tear down raidz_map of test zio */ + fini_raidz_map(&zio_test, &rm_test); + + return (err); +} + +static raidz_map_t * +init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) +{ + raidz_map_t *rm = NULL; + const size_t alloc_dsize = opts->rto_dsize; + const size_t total_ncols = opts->rto_dcols + parity; + const int ccols[] = { 0, 1, 2 }; + + VERIFY(zio); + VERIFY(parity <= 3 && parity >= 1); + + *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + (*zio)->io_offset = 0; + (*zio)->io_size = alloc_dsize; + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); + + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + VERIFY(rm); + + /* Make sure code columns are destroyed */ + corrupt_colums(rm, ccols, parity); + + return (rm); +} + +static int +run_gen_check(raidz_test_opts_t *opts) +{ + char **impl_name; + int fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing parity generation...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (0 != vdev_raidz_impl_set(*impl_name)) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else { + LOG(D_INFO, "[SUPPORTED]\n"); + } + + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, fn+1); + VERIFY(rm_test); + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_gen_name[fn]); + + if (!opts->rto_sanity) + vdev_raidz_generate_parity(rm_test); + + if (cmp_code(opts, rm_test, fn+1) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + } else + LOG(D_INFO, 
"[PASS]\n"); + + fini_raidz_map(&zio_test, &rm_test); + } + } + + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) +{ + int x0, x1, x2; + int tgtidx[3]; + int err = 0; + static const int rec_tgts[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); + + if (fn < RAIDZ_REC_PQ) { + /* can reconstruct 1 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d] ", x0); + + tgtidx[2] = x0 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+2, 1); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); + } + } + + } else if (fn < RAIDZ_REC_PQR) { + /* can reconstruct 2 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d] ", x0, x1); + + tgtidx[1] = x0 + raidz_parity(rm); + tgtidx[2] = x1 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+1, 2); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d %d]... 
" + "[FAIL]\n", x0, x1); + } + } + } + } else { + /* can reconstruct 3 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { + if (x2 >= + rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); + + tgtidx[0] = x0 + raidz_parity(rm); + tgtidx[1] = x1 + raidz_parity(rm); + tgtidx[2] = x2 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx, 3); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, + tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, + "\nREC D[%d %d %d]... " + "[FAIL]\n", x0, x1, x2); + } + } + } + } + } + return (err); +} + +static int +run_rec_check(raidz_test_opts_t *opts) +{ + char **impl_name; + unsigned fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing data reconstruction...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (vdev_raidz_impl_set(*impl_name) != 0) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else + LOG(D_INFO, "[SUPPORTED]\n"); + + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); + /* generate parity */ + vdev_raidz_generate_parity(rm_test); + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_rec_name[fn]); + + if (run_rec_check_impl(opts, rm_test, fn) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + + } else + LOG(D_INFO, "[PASS]\n"); + + } + /* tear down test raidz_map */ + fini_raidz_map(&zio_test, &rm_test); + } + + 
fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_test(raidz_test_opts_t *opts) +{ + int err = 0; + + if (opts == NULL) + opts = &rto_opts; + + print_opts(opts, B_FALSE); + + err |= run_gen_check(opts); + err |= run_rec_check(opts); + + return (err); +} + +#define SWEEP_RUNNING 0 +#define SWEEP_FINISHED 1 +#define SWEEP_ERROR 2 +#define SWEEP_TIMEOUT 3 + +static int sweep_state = 0; +static raidz_test_opts_t failed_opts; + +static kmutex_t sem_mtx; +static kcondvar_t sem_cv; +static int max_free_slots; +static int free_slots; + +static void +sweep_thread(void *arg) +{ + int err = 0; + raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; + VERIFY(opts != NULL); + + err = run_test(opts); + + if (rto_opts.rto_sanity) { + /* 25% chance that a sweep test fails */ + if (rand() < (RAND_MAX/4)) + err = 1; + } + + if (0 != err) { + mutex_enter(&sem_mtx); + memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); + sweep_state = SWEEP_ERROR; + mutex_exit(&sem_mtx); + } + + umem_free(opts, sizeof (raidz_test_opts_t)); + + /* signal the next thread */ + mutex_enter(&sem_mtx); + free_slots++; + cv_signal(&sem_cv); + mutex_exit(&sem_mtx); + + thread_exit(); +} + +static int +run_sweep(void) +{ + static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; + static const size_t ashift_v[] = { 9, 12, 14 }; + static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), + 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; + + (void) setvbuf(stdout, NULL, _IONBF, 0); + + ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * + ARRAY_SIZE(dcols_v); + ulong_t tried_comb = 0; + hrtime_t time_diff, start_time = gethrtime(); + raidz_test_opts_t *opts; + int a, d, s; + + max_free_slots = free_slots = MAX(2, boot_ncpus); + + mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); + + for (s = 0; s < ARRAY_SIZE(size_v); s++) + for (a = 0; a < ARRAY_SIZE(ashift_v); a++) + for (d = 0; 
d < ARRAY_SIZE(dcols_v); d++) { + + if (size_v[s] < (1 << ashift_v[a])) { + total_comb--; + continue; + } + + if (++tried_comb % 20 == 0) + LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); + + /* wait for signal to start new thread */ + mutex_enter(&sem_mtx); + while (cv_timedwait_sig(&sem_cv, &sem_mtx, + ddi_get_lbolt() + hz)) { + + /* check if should stop the test (timeout) */ + time_diff = (gethrtime() - start_time) / NANOSEC; + if (rto_opts.rto_sweep_timeout > 0 && + time_diff >= rto_opts.rto_sweep_timeout) { + sweep_state = SWEEP_TIMEOUT; + rto_opts.rto_should_stop = B_TRUE; + mutex_exit(&sem_mtx); + goto exit; + } + + /* check if should stop the test (error) */ + if (sweep_state != SWEEP_RUNNING) { + mutex_exit(&sem_mtx); + goto exit; + } + + /* exit loop if a slot is available */ + if (free_slots > 0) { + break; + } + } + + free_slots--; + mutex_exit(&sem_mtx); + + opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); + opts->rto_ashift = ashift_v[a]; + opts->rto_dcols = dcols_v[d]; + opts->rto_offset = (1 << ashift_v[a]) * rand(); + opts->rto_dsize = size_v[s]; + opts->rto_v = 0; /* be quiet */ + + VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, + 0, NULL, TS_RUN, defclsyspri), !=, NULL); + } + +exit: + LOG(D_ALL, "\nWaiting for test threads to finish...\n"); + mutex_enter(&sem_mtx); + VERIFY(free_slots <= max_free_slots); + while (free_slots < max_free_slots) { + (void) cv_wait(&sem_cv, &sem_mtx); + } + mutex_exit(&sem_mtx); + + if (sweep_state == SWEEP_ERROR) { + ERR("Sweep test failed! Failed option: \n"); + print_opts(&failed_opts, B_TRUE); + } else { + if (sweep_state == SWEEP_TIMEOUT) + LOG(D_ALL, "Test timeout (%lus). Stopping...\n", + (ulong_t)rto_opts.rto_sweep_timeout); + + LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", + (ulong_t)tried_comb); + } + + mutex_destroy(&sem_mtx); + + return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); +} + +int +main(int argc, char **argv) +{ + size_t i; + struct sigaction action; + int err = 0; + + /* init gdb string early */ + (void) sprintf(gdb, gdb_tmpl, getpid()); + + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + + if (sigaction(SIGSEGV, &action, NULL) < 0) { + ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + dprintf_setup(&argc, argv); + + process_options(argc, argv); + + kernel_init(SPA_MODE_READ); + + /* setup random data because rand() is not reentrant */ + rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + srand((unsigned)time(NULL) * getpid()); + for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) + rand_data[i] = rand(); + + mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); + + if (rto_opts.rto_benchmark) { + run_raidz_benchmark(); + } else if (rto_opts.rto_sweep) { + err = run_sweep(); + } else { + err = run_test(NULL); + } + + umem_free(rand_data, SPA_MAXBLOCKSIZE); + kernel_fini(); + + return (err); +} diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h new file mode 100644 index 000000000000..09c825ae43c7 --- /dev/null +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h @@ -0,0 +1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#ifndef RAIDZ_TEST_H +#define RAIDZ_TEST_H + +#include <sys/spa.h> + +static const char *raidz_impl_names[] = { + "original", + "scalar", + "sse2", + "ssse3", + "avx2", + "avx512f", + "avx512bw", + "aarch64_neon", + "aarch64_neonx2", + "powerpc_altivec", + NULL +}; + +typedef struct raidz_test_opts { + size_t rto_ashift; + size_t rto_offset; + size_t rto_dcols; + size_t rto_dsize; + size_t rto_v; + size_t rto_sweep; + size_t rto_sweep_timeout; + size_t rto_benchmark; + size_t rto_sanity; + size_t rto_gdb; + + /* non-user options */ + boolean_t rto_should_stop; + + zio_t *zio_golden; + raidz_map_t *rm_golden; +} raidz_test_opts_t; + +static const raidz_test_opts_t rto_opts_defaults = { + .rto_ashift = 9, + .rto_offset = 1ULL << 0, + .rto_dcols = 8, + .rto_dsize = 1<<19, + .rto_v = 0, + .rto_sweep = 0, + .rto_benchmark = 0, + .rto_sanity = 0, + .rto_gdb = 0, + .rto_should_stop = B_FALSE +}; + +extern raidz_test_opts_t rto_opts; + +static inline size_t ilog2(size_t a) +{ + return (a > 1 ? 1 + ilog2(a >> 1) : 0); +} + + +#define D_ALL 0 +#define D_INFO 1 +#define D_DEBUG 2 + +#define LOG(lvl, a...) \ +{ \ + if (rto_opts.rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define LOG_OPT(lvl, opt, a...) \ +{ \ + if (opt->rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define ERR(a...) 
(void) fprintf(stderr, a) + + +#define DBLSEP "================\n" +#define SEP "----------------\n" + + +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) + + +void init_zio_abd(zio_t *zio); + +void run_raidz_benchmark(void); + +#endif /* RAIDZ_TEST_H */ diff --git a/sys/contrib/openzfs/cmd/vdev_id/Makefile.am b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am new file mode 100644 index 000000000000..fb815faad084 --- /dev/null +++ b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am @@ -0,0 +1 @@ +dist_udev_SCRIPTS = vdev_id diff --git a/sys/contrib/openzfs/cmd/vdev_id/vdev_id b/sys/contrib/openzfs/cmd/vdev_id/vdev_id new file mode 100755 index 000000000000..8a75e638b67e --- /dev/null +++ b/sys/contrib/openzfs/cmd/vdev_id/vdev_id @@ -0,0 +1,605 @@ +#!/bin/sh +# +# vdev_id: udev helper to generate user-friendly names for JBOD disks +# +# This script parses the file /etc/zfs/vdev_id.conf to map a +# physical path in a storage topology to a channel name. The +# channel name is combined with a disk enclosure slot number to +# create an alias that reflects the physical location of the drive. +# This is particularly helpful when it comes to tasks like replacing +# failed drives. Slot numbers may also be re-mapped in case the +# default numbering is unsatisfactory. The drive aliases will be +# created as symbolic links in /dev/disk/by-vdev. +# +# The currently supported topologies are sas_direct and sas_switch. +# A multipath mode is supported in which dm-mpath devices are +# handled by examining the first-listed running component disk. In +# multipath mode the configuration file should contain a channel +# definition with the same name for each path to a given enclosure. +# +# The alias keyword provides a simple way to map already-existing +# device symlinks to more convenient names. It is suitable for +# small, static configurations or for sites that have some automated +# way to generate the mapping file. 
+# +# +# Some example configuration files are given below. + +# # +# # Example vdev_id.conf - sas_direct. +# # +# +# multipath no +# topology sas_direct +# phys_per_port 4 +# slot bay +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 C +# channel 86:00.0 0 D +# +# # Custom mapping for Channel A +# +# # Linux Mapped +# # Slot Slot Channel +# slot 1 7 A +# slot 2 10 A +# slot 3 3 A +# slot 4 6 A +# +# # Default mapping for B, C, and D +# slot 1 4 +# slot 2 2 +# slot 3 1 +# slot 4 3 + +# # +# # Example vdev_id.conf - sas_switch +# # +# +# topology sas_switch +# +# # SWITCH PORT CHANNEL NAME +# channel 1 A +# channel 2 B +# channel 3 C +# channel 4 D + +# # +# # Example vdev_id.conf - multipath +# # +# +# multipath yes +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 A +# channel 86:00.0 0 B + +# # +# # Example vdev_id.conf - alias +# # +# +# # by-vdev +# # name fully qualified or base name of device link +# alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca +# alias d2 wwn-0x5000c5002def789e + +PATH=/bin:/sbin:/usr/bin:/usr/sbin +CONFIG=/etc/zfs/vdev_id.conf +PHYS_PER_PORT= +DEV= +MULTIPATH= +TOPOLOGY= +BAY= + +usage() { + cat << EOF +Usage: vdev_id [-h] + vdev_id <-d device> [-c config_file] [-p phys_per_port] + [-g sas_direct|sas_switch|scsi] [-m] + + -c specify name of an alternative config file [default=$CONFIG] + -d specify basename of device (i.e. 
sda) + -e Create enclose device symlinks only (/dev/by-enclosure) + -g Storage network topology [default="$TOPOLOGY"] + -m Run in multipath mode + -p number of phy's per switch port [default=$PHYS_PER_PORT] + -h show this summary +EOF + exit 0 +} + +map_slot() { + LINUX_SLOT=$1 + CHANNEL=$2 + + MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ + \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG` + if [ -z "$MAPPED_SLOT" ] ; then + MAPPED_SLOT=$LINUX_SLOT + fi + printf "%d" ${MAPPED_SLOT} +} + +map_channel() { + MAPPED_CHAN= + PCI_ID=$1 + PORT=$2 + + case $TOPOLOGY in + "sas_switch") + MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \ + { print \\$3; exit }" $CONFIG` + ;; + "sas_direct"|"scsi") + MAPPED_CHAN=`awk "\\$1 == \"channel\" && \ + \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \ + { print \\$4; exit }" $CONFIG` + ;; + esac + printf "%s" ${MAPPED_CHAN} +} + +sas_handler() { + if [ -z "$PHYS_PER_PORT" ] ; then + PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ + {print \\$2; exit}" $CONFIG` + fi + PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" + exit 1 + fi + + if [ -z "$MULTIPATH_MODE" ] ; then + MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ + {print \\$2; exit}" $CONFIG` + fi + + # Use first running component device if we're handling a dm-mpath device + if [ "$MULTIPATH_MODE" = "yes" ] ; then + # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper + if [ -z "$DM_NAME" ] ; then + DM_NAME=`ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}"` + fi + + # For raw disks udev exports DEVTYPE=partition when + # handling partitions, and the rules can be written to + # take advantage of this to append a -part suffix. For + # dm devices we get DEVTYPE=disk even for partitions so + # we have to append the -part suffix directly in the + # helper. 
+ if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. + DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In sas_switch mode, the directory four levels beneath + # /sys/.../hostX contains symlinks to phy devices that reveal + # the switch port number. In sas_direct mode, the phy links one + # directory down reveal the HBA port. + port_dir=$scsi_host_dir + case $TOPOLOGY in + "sas_switch") j=$(($i + 4)) ;; + "sas_direct") j=$(($i + 1)) ;; + esac + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + if [ -z "$PHY" ] ; then + PHY=0 + fi + PORT=$(( $PHY / $PHYS_PER_PORT )) + + # Look in /sys/.../sas_device/end_device-X for the bay_identifier + # attribute. 
+ end_device_dir=$port_dir + while [ $i -lt $num_dirs ] ; do + d=$(eval echo \${$i}) + end_device_dir="$end_device_dir/$d" + if echo $d | grep -q '^end_device' ; then + end_device_dir="$end_device_dir/sas_device/$d" + break + fi + i=$(($i + 1)) + done + + SLOT= + case $BAY in + "bay") + SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + ;; + "phy") + SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + ;; + "port") + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "id") + i=$(($i + 1)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "lun") + i=$(($i + 2)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "ses") + # look for this SAS path in all SCSI Enclosure Services + # (SES) enclosures + sas_address=`cat $end_device_dir/sas_address 2>/dev/null` + enclosures=`lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + for enclosure in $enclosures; do + set -- $(sg_ses -p aes $enclosure | \ + awk "/device slot number:/{slot=\$12} \ + /SAS address: $sas_address/\ + {print slot}") + SLOT=$1 + if [ -n "$SLOT" ] ; then + break + fi + done + ;; + esac + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +scsi_handler() { + if [ -z "$FIRST_BAY_NUMBER" ] ; then + FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ + {print \\$2; exit}" $CONFIG` + fi + FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} + + if [ -z "$PHYS_PER_PORT" ] ; then + PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ + {print \\$2; exit}" $CONFIG` + fi + PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! 
echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" + exit 1 + fi + + if [ -z "$MULTIPATH_MODE" ] ; then + MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ + {print \\$2; exit}" $CONFIG` + fi + + # Use first running component device if we're handling a dm-mpath device + if [ "$MULTIPATH_MODE" = "yes" ] ; then + # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper + if [ -z "$DM_NAME" ] ; then + DM_NAME=`ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}"` + fi + + # For raw disks udev exports DEVTYPE=partition when + # handling partitions, and the rules can be written to + # take advantage of this to append a -part suffix. For + # dm devices we get DEVTYPE=disk even for partitions so + # we have to append the -part suffix directly in the + # helper. + if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. 
+ DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # expect sys_path like this, for example: + # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In scsi mode, the directory two levels beneath + # /sys/.../hostX reveals the port and slot. + port_dir=$scsi_host_dir + j=$(($i + 2)) + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') + PORT=$1 + SLOT=$(($2 + $FIRST_BAY_NUMBER)) + + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +# Figure out the name for the enclosure symlink +enclosure_handler () { + # We get all the info we need from udev's DEVPATH variable: + # + # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0 + + # Get the enclosure ID ("0:0:0:0") + ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) + if [ ! -d /sys/class/enclosure/$ENC ] ; then + # Not an enclosure, bail out + return + fi + + # Get the long sysfs device path to our enclosure. 
Looks like: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0 + + ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC) + + # Grab the full path to the hosts port dir: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 + PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') + + # Get the port number + PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$") + + # The PCI directory is two directories up from the port directory + # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 + PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../..")) + + # Strip down the PCI address from 0000:05:00.0 to 05:00.0 + PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') + + # Name our device according to vdev_id.conf (like "L0" or "U1"). + NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ + \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + + echo "${NAME}" +} + +alias_handler () { + # Special handling is needed to correctly append a -part suffix + # to partitions of device mapper devices. The DEVTYPE attribute + # is normally set to "disk" instead of "partition" in this case, + # so the udev rules won't handle that for us as they do for + # "plain" block devices. + # + # For example, we may have the following links for a device and its + # partitions, + # + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3 + # + # and the following alias in vdev_id.conf. + # + # alias A0 dm-name-isw_dibgbfcije_ARRAY0 + # + # The desired outcome is for the following links to be created + # without having explicitly defined aliases for the partitions. 
+ # + # /dev/disk/by-vdev/A0 -> ../../dm-0 + # /dev/disk/by-vdev/A0-part1 -> ../../dm-1 + # /dev/disk/by-vdev/A0-part2 -> ../../dm-3 + # + # Warning: The following grep pattern will misidentify whole-disk + # devices whose names end with 'p' followed by a string of + # digits as partitions, causing alias creation to fail. This + # ambiguity seems unavoidable, so devices using this facility + # must not use such names. + DM_PART= + if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then + if [ "$DEVTYPE" != "partition" ] ; then + DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + fi + + # DEVLINKS attribute must have been populated by already-run udev rules. + for link in $DEVLINKS ; do + # Remove partition information to match key of top-level device. + if [ -n "$DM_PART" ] ; then + link=`echo $link | sed 's/p[0-9][0-9]*$//'` + fi + # Check both the fully qualified and the base name of link. + for l in $link `basename $link` ; do + alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ + { print \\$2; exit }" $CONFIG` + if [ -n "$alias" ] ; then + echo ${alias}${DM_PART} + return + fi + done + done +} + +while getopts 'c:d:eg:mp:h' OPTION; do + case ${OPTION} in + c) + CONFIG=${OPTARG} + ;; + d) + DEV=${OPTARG} + ;; + e) + # When udev sees a scsi_generic device, it calls this script with -e to + # create the enclosure device symlinks only. We also need + # "enclosure_symlinks yes" set in vdev_id.conf to actually create the + # symlink. + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + if [ "$ENCLOSURE_MODE" != "yes" ] ; then + exit 0 + fi + ;; + g) + TOPOLOGY=$OPTARG + ;; + p) + PHYS_PER_PORT=${OPTARG} + ;; + m) + MULTIPATH_MODE=yes + ;; + h) + usage + ;; + esac +done + +if [ ! 
-r $CONFIG ] ; then + exit 0 +fi + +if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then + echo "Error: missing required option -d" + exit 1 +fi + +if [ -z "$TOPOLOGY" ] ; then + TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` +fi + +if [ -z "$BAY" ] ; then + BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` +fi + +TOPOLOGY=${TOPOLOGY:-sas_direct} + +# Should we create /dev/by-enclosure symlinks? +if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then + ID_ENCLOSURE=$(enclosure_handler) + if [ -z "$ID_ENCLOSURE" ] ; then + exit 0 + fi + + # Just create the symlinks to the enclosure devices and then exit. + ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG) + if [ -z "$ENCLOSURE_PREFIX" ] ; then + ENCLOSURE_PREFIX="enc" + fi + echo "ID_ENCLOSURE=$ID_ENCLOSURE" + echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE" + exit 0 +fi + +# First check if an alias was defined for this device. +ID_VDEV=`alias_handler` + +if [ -z "$ID_VDEV" ] ; then + BAY=${BAY:-bay} + case $TOPOLOGY in + sas_direct|sas_switch) + ID_VDEV=`sas_handler` + ;; + scsi) + ID_VDEV=`scsi_handler` + ;; + *) + echo "Error: unknown topology $TOPOLOGY" + exit 1 + ;; + esac +fi + +if [ -n "$ID_VDEV" ] ; then + echo "ID_VDEV=${ID_VDEV}" + echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}" +fi diff --git a/sys/contrib/openzfs/cmd/zdb/.gitignore b/sys/contrib/openzfs/cmd/zdb/.gitignore new file mode 100644 index 000000000000..f64a3fc5a160 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zdb/.gitignore @@ -0,0 +1 @@ +/zdb diff --git a/sys/contrib/openzfs/cmd/zdb/Makefile.am b/sys/contrib/openzfs/cmd/zdb/Makefile.am new file mode 100644 index 000000000000..b325cb060bd2 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zdb/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +# Unconditionally enable debugging for zdb +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = zdb + +zdb_SOURCES = \ + zdb.c \ + zdb_il.c \ + zdb.h + 
+zdb_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c new file mode 100644 index 000000000000..e7211711a41c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -0,0 +1,8606 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. + * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2020 Datto Inc. + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
+ */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_sa.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_bookmark.h> +#include <sys/dbuf.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/dmu_send.h> +#include <sys/dmu_traverse.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/zfs_fuid.h> +#include <sys/arc.h> +#include <sys/arc_impl.h> +#include <sys/ddt.h> +#include <sys/zfeature.h> +#include <sys/abd.h> +#include <sys/blkptr.h> +#include <sys/dsl_crypt.h> +#include <sys/dsl_scan.h> +#include <sys/btree.h> +#include <zfs_comutil.h> +#include <sys/zstd/zstd.h> + +#include <libnvpair.h> +#include <libzutil.h> + +#include "zdb.h" + +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? 
\ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) + +static char * +zdb_ot_name(dmu_object_type_t type) +{ + if (type < DMU_OT_NUMTYPES) + return (dmu_ot[type].ot_name); + else if ((type & DMU_OT_NEWTYPE) && + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); + else + return ("UNKNOWN"); +} + +extern int reference_tracking_enable; +extern int zfs_recover; +extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; +extern int zfs_vdev_async_read_max_active; +extern boolean_t spa_load_verify_dryrun; +extern int zfs_reconstruct_indirect_combinations_max; +extern int zfs_btree_verify_intensity; + +static const char cmdname[] = "zdb"; +uint8_t dump_opt[256]; + +typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); + +uint64_t *zopt_metaslab = NULL; +static unsigned zopt_metaslab_args = 0; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; +zopt_object_range_t *zopt_object_ranges = NULL; +static unsigned zopt_object_args = 0; + +static int flagbits[256]; + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ + ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ +static int leaked_objects = 0; +static range_tree_t *mos_refd_objs; + +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, + boolean_t); +static void mos_obj_refd(uint64_t); +static void mos_obj_refd_multiple(uint64_t); +static 
int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx); + +typedef struct sublivelist_verify { + /* all ALLOC'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_allocs; + + /* all FREE'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_frees; + + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. 
+ */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +static void zdb_print_blkptr(const blkptr_t *bp, int flags); + +static int +sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx) +{ + ASSERT3P(tx, ==, NULL); + struct sublivelist_verify *sv = arg; + char blkbuf[BP_SPRINTF_LEN]; + zfs_btree_index_t where; + if (free) { + zfs_btree_add(&sv->sv_pair, bp); + /* Check if the FREE is a duplicate */ + if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); + } + } else { + /* Check if the ALLOC has been freed */ + if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { + zfs_btree_remove_idx(&sv->sv_pair, &where); + } else { + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + if (DVA_IS_EMPTY(&bp->blk_dva[i])) + break; + sublivelist_verify_block_t svb = { + .svb_dva = bp->blk_dva[i], + .svb_allocated_txg = bp->blk_birth + }; + + if (zfs_btree_find(&sv->sv_leftover, &svb, + &where) == NULL) { + zfs_btree_add_idx(&sv->sv_leftover, + &svb, &where); + } + } + } + /* Check if the ALLOC is a duplicate */ + if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); + } + } + return (0); +} + +static int +sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) +{ + int err; + char blkbuf[BP_SPRINTF_LEN]; + struct sublivelist_verify *sv = args; + + zfs_btree_create(&sv->sv_all_allocs, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_all_frees, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_pair, livelist_compare, + sizeof (blkptr_t)); + + err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, + 
sv, NULL); + + zfs_btree_clear(&sv->sv_all_allocs); + zfs_btree_destroy(&sv->sv_all_allocs); + + zfs_btree_clear(&sv->sv_all_frees); + zfs_btree_destroy(&sv->sv_all_frees); + + blkptr_t *e; + zfs_btree_index_t *cookie = NULL; + while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); + (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); + } + zfs_btree_destroy(&sv->sv_pair); + + return (err); +} + +static int +livelist_block_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_t *l = larg; + const sublivelist_verify_block_t *r = rarg; + + if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) + return (-1); + else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) + return (+1); + + if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) + return (-1); + else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) + return (+1); + + if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) + return (-1); + else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) + return (+1); + + return (0); +} + +/* + * Check for errors in a livelist while tracking all unfreed ALLOCs in the + * sublivelist_verify_t: sv->sv_leftover + */ +static void +livelist_verify(dsl_deadlist_t *dl, void *arg) +{ + sublivelist_verify_t *sv = arg; + dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); +} + +/* + * Check for errors in the livelist entry and discard the intermediary + * data structures + */ +/* ARGSUSED */ +static int +sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) +{ + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + int err = sublivelist_verify_func(&sv, dle); + zfs_btree_clear(&sv.sv_leftover); + zfs_btree_destroy(&sv.sv_leftover); + return (err); +} + +typedef struct metaslab_verify { + /* + * Tree containing all the leftover 
ALLOCs from the livelists + * that are part of this metaslab. + */ + zfs_btree_t mv_livelist_allocs; + + /* + * Metaslab information. + */ + uint64_t mv_vdid; + uint64_t mv_msid; + uint64_t mv_start; + uint64_t mv_end; + + /* + * What's currently allocated for this metaslab. + */ + range_tree_t *mv_allocated; +} metaslab_verify_t; + +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, + void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, + uint64_t offset, uint64_t size) +{ + sublivelist_verify_block_t svb; + DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); + DVA_SET_OFFSET(&svb.svb_dva, offset); + DVA_SET_ASIZE(&svb.svb_dva, size); + zfs_btree_index_t where; + uint64_t end_offset = offset + size; + + /* + * Look for an exact match for spacemap entry in the 
livelist entries. + * Then, look for other livelist entries that fall within the range + * of the spacemap entry as it may have been condensed + */ + sublivelist_verify_block_t *found = + zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); + if (found == NULL) { + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); + } + for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && + DVA_GET_OFFSET(&found->svb_dva) < end_offset; + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + if (found->svb_allocated_txg <= txg) { + (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " + "from TXG %llx FREED at TXG %llx\n", + (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), + (u_longlong_t)found->svb_allocated_txg, + (u_longlong_t)txg); + } + } +} + +static int +metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t txg = sme->sme_txg; + + if (sme->sme_type == SM_ALLOC) { + if (range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE ALLOC: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_add(mv->mv_allocated, + offset, size); + } + } else { + if (!range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE FREE: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_remove(mv->mv_allocated, + offset, size); + } + } + + if (sme->sme_type != SM_ALLOC) { + /* + * If something is freed in the spacemap, verify that + * it is not listed as allocated in the livelist. 
+ */ + verify_livelist_allocs(mv, txg, offset, size); + } + return (0); +} + +static int +spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + if (vdev_id != mv->mv_vdid) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + if (ms->ms_id != mv->mv_msid) + return (0); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + + ASSERT3U(txg, ==, sme->sme_txg); + return (metaslab_spacemap_validation_cb(sme, mv)); +} + +static void +spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) +{ + iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); +} + +static void +spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) +{ + if (sm == NULL) + return; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + metaslab_spacemap_validation_cb, mv)); +} + +static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); + +/* + * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if + * they are part of that metaslab (mv_msid). 
+ */ +static void +mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) +{ + zfs_btree_index_t where; + sublivelist_verify_block_t *svb; + ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); + for (svb = zfs_btree_first(&sv->sv_leftover, &where); + svb != NULL; + svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { + if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && + (DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) + continue; + + if ((DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + zfs_btree_add(&mv->mv_livelist_allocs, svb); + } + + for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); + svb != NULL; + svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + zfs_btree_remove(&sv->sv_leftover, svb); + } +} + +/* + * [Livelist Check] + * Iterate through all the sublivelists and: + * - report leftover frees + * - report double ALLOCs/FREEs + * - record leftover ALLOCs together with their TXG [see Cross Check] + * + * [Spacemap Check] + * for each metaslab: + * - iterate over spacemap and then the metaslab's entries in the + * spacemap log, then report any double FREEs and ALLOCs (do not + * blow up). 
+ * + * [Cross Check] + * After finishing the Livelist Check phase and while being in the + * Spacemap Check phase, we find all the recorded leftover ALLOCs + * of the livelist check that are part of the metaslab that we are + * currently looking at in the Spacemap Check. We report any entries + * that are marked as ALLOCs in the livelists but have been actually + * freed (and potentially allocated again) after their TXG stamp in + * the spacemaps. Also report any ALLOCs from the livelists that + * belong to indirect vdevs (e.g. their vdev completed removal). + * + * Note that this will miss Log Spacemap entries that cancelled each other + * out before being flushed to the metaslab, so we are not guaranteed + * to match all erroneous ALLOCs. + */ +static void +livelist_metaslab_validate(spa_t *spa) +{ + (void) printf("Verifying deleted livelist entries\n"); + + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + iterate_deleted_livelists(spa, livelist_verify, &sv); + + (void) printf("Verifying metaslab entries\n"); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (!vdev_is_concrete(vd)) + continue; + + for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { + metaslab_t *m = vd->vdev_ms[mid]; + + (void) fprintf(stderr, + "\rverifying concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)mid, + (longlong_t)vd->vdev_ms_count); + + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, m, + &start, &shift); + metaslab_verify_t mv; + mv.mv_allocated = range_tree_create(NULL, + type, NULL, start, shift); + mv.mv_vdid = vd->vdev_id; + mv.mv_msid = m->ms_id; + mv.mv_start = m->ms_start; + mv.mv_end = m->ms_start + m->ms_size; + zfs_btree_create(&mv.mv_livelist_allocs, + livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + + 
mv_populate_livelist_allocs(&mv, &sv); + + spacemap_check_ms_sm(m->ms_sm, &mv); + spacemap_check_sm_log(spa, &mv); + + range_tree_vacate(mv.mv_allocated, NULL, NULL); + range_tree_destroy(mv.mv_allocated); + zfs_btree_clear(&mv.mv_livelist_allocs); + zfs_btree_destroy(&mv.mv_livelist_allocs); + } + } + (void) fprintf(stderr, "\n"); + + /* + * If there are any segments in the leftover tree after we walked + * through all the metaslabs in the concrete vdevs then this means + * that we have segments in the livelists that belong to indirect + * vdevs and are marked as allocated. + */ + if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { + zfs_btree_destroy(&sv.sv_leftover); + return; + } + (void) printf("ERROR: Found livelist blocks marked as allocated " + "for indirect vdevs:\n"); + + zfs_btree_index_t *where = NULL; + sublivelist_verify_block_t *svb; + while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != + NULL) { + int vdev_id = DVA_GET_VDEV(&svb->svb_dva); + ASSERT3U(vdev_id, <, rvd->vdev_children); + vdev_t *vd = rvd->vdev_child[vdev_id]; + ASSERT(!vdev_is_concrete(vd)); + (void) printf("<%d:%llx:%llx> TXG %llx\n", + vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), + (u_longlong_t)svb->svb_allocated_txg); + } + (void) printf("\n"); + zfs_btree_destroy(&sv.sv_leftover); +} + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] " + "[-I <inflight I/Os>]\n" + "\t\t[-o <var>=<value>]... 
[-t <txg>] [-U <cache>] [-x <dumpdir>]\n" + "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n" + "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>]\n" + "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n" + "\t%s [-v] <bookmark>\n" + "\t%s -C [-A] [-U <cache>]\n" + "\t%s -l [-Aqu] <device>\n" + "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] " + "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n" + "\t%s -O <dataset> <path>\n" + "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n" + "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n" + "\t%s -E [-A] word0:word1:...:word15\n" + "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] " + "<poolname>\n\n", + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, + cmdname, cmdname, cmdname); + + (void) fprintf(stderr, " Dataset name must include at least one " + "separator character '/' or '@'\n"); + (void) fprintf(stderr, " If dataset name is specified, only that " + "dataset is dumped\n"); + (void) fprintf(stderr, " If object numbers or object number " + "ranges are specified, only those\n" + " objects or ranges are dumped.\n\n"); + (void) fprintf(stderr, + " Object ranges take the form <start>:<end>[:<flags>]\n" + " start Starting object number\n" + " end Ending object number, or -1 for no upper bound\n" + " flags Optional flags to select object types:\n" + " A All objects (this is the default)\n" + " d ZFS directories\n" + " f ZFS files \n" + " m SPA space maps\n" + " z ZAPs\n" + " - Negate effect of next flag\n\n"); + (void) fprintf(stderr, " Options to control amount of output:\n"); + (void) fprintf(stderr, " -b block statistics\n"); + (void) fprintf(stderr, " -c checksum all metadata (twice for " + "all data) blocks\n"); + (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); + (void) fprintf(stderr, " -d dataset(s)\n"); + (void) fprintf(stderr, " -D dedup statistics\n"); + (void) fprintf(stderr, " -E decode and display block from an " + 
"embedded block pointer\n"); + (void) fprintf(stderr, " -h pool history\n"); + (void) fprintf(stderr, " -i intent logs\n"); + (void) fprintf(stderr, " -l read label contents\n"); + (void) fprintf(stderr, " -k examine the checkpointed state " + "of the pool\n"); + (void) fprintf(stderr, " -L disable leak tracking (do not " + "load spacemaps)\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); + (void) fprintf(stderr, " -O perform object lookups by path\n"); + (void) fprintf(stderr, " -R read and display block from a " + "device\n"); + (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); + (void) fprintf(stderr, " -v verbose (applies to all " + "others)\n"); + (void) fprintf(stderr, " -y perform livelist and metaslab " + "validation on any livelists being deleted\n\n"); + (void) fprintf(stderr, " Below options are intended for use " + "with other options:\n"); + (void) fprintf(stderr, " -A ignore assertions (-A), enable " + "panic recovery (-AA) or both (-AAA)\n"); + (void) fprintf(stderr, " -e pool is exported/destroyed/" + "has altroot/not in a cachefile\n"); + (void) fprintf(stderr, " -F attempt automatic rewind within " + "safe range of transaction groups\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); + (void) fprintf(stderr, " -I <number of inflight I/Os> -- " + "specify the maximum number of\n " + "checksumming I/Os [default is 200]\n"); + (void) fprintf(stderr, " -o <variable>=<value> set global " + "variable to an unsigned 32-bit integer\n"); + (void) fprintf(stderr, " -p <path> -- use one or more with " + "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -P print numbers in parseable form\n"); + (void) fprintf(stderr, " -q don't print label contents\n"); + (void) fprintf(stderr, " -t <txg> -- highest txg to use when " + "searching for uberblocks\n"); + (void) fprintf(stderr, " -u 
uberblock\n"); + (void) fprintf(stderr, " -U <cachefile_path> -- use alternate " + "cachefile\n"); + (void) fprintf(stderr, " -V do verbatim import\n"); + (void) fprintf(stderr, " -x <dumpdir> -- " + "dump all read blocks into specified directory\n"); + (void) fprintf(stderr, " -X attempt extreme rewind (does not " + "work with dataset)\n"); + (void) fprintf(stderr, " -Y attempt all reconstruction " + "combinations for split blocks\n"); + (void) fprintf(stderr, " -Z show ZSTD headers \n"); + (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " + "to make only that option verbose\n"); + (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); + exit(1); +} + +static void +dump_debug_buffer(void) +{ + if (dump_opt['G']) { + (void) printf("\n"); + (void) fflush(stdout); + zfs_dbgmsg_print("zdb"); + } +} + +/* + * Called for usage errors that are discovered after a call to spa_open(), + * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. + */ + +static void +fatal(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + (void) fprintf(stderr, "%s: ", cmdname); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); + (void) fprintf(stderr, "\n"); + + dump_debug_buffer(); + + exit(1); +} + +/* ARGSUSED */ +static void +dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) +{ + nvlist_t *nv; + size_t nvsize = *(uint64_t *)data; + char *packed = umem_alloc(nvsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); + + VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); + + umem_free(packed, nvsize); + + dump_nvlist(nv, 8); + + nvlist_free(nv); +} + +/* ARGSUSED */ +static void +dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) +{ + spa_history_phys_t *shp = data; + + if (shp == NULL) + return; + + (void) printf("\t\tpool_create_len = %llu\n", + (u_longlong_t)shp->sh_pool_create_len); + (void) printf("\t\tphys_max_off = %llu\n", + (u_longlong_t)shp->sh_phys_max_off); + (void) printf("\t\tbof = %llu\n", + (u_longlong_t)shp->sh_bof); + (void) printf("\t\teof = %llu\n", + (u_longlong_t)shp->sh_eof); + (void) printf("\t\trecords_lost = %llu\n", + (u_longlong_t)shp->sh_records_lost); +} + +static void +zdb_nicenum(uint64_t num, char *buf, size_t buflen) +{ + if (dump_opt['P']) + (void) snprintf(buf, buflen, "%llu", (longlong_t)num); + else + nicenum(num, buf, sizeof (buf)); +} + +static const char histo_stars[] = "****************************************"; +static const uint64_t histo_width = sizeof (histo_stars) - 1; + +static void +dump_histogram(const uint64_t *histo, int size, int offset) +{ + int i; + int minidx = size - 1; + int maxidx = 0; + uint64_t max = 0; + + for (i = 0; i < size; i++) { + if (histo[i] > max) + max = histo[i]; + if (histo[i] > 0 && i > maxidx) + maxidx = i; + if (histo[i] > 0 && i < minidx) + minidx = i; + } + + if (max < histo_width) + max = histo_width; + + for (i = minidx; i <= maxidx; i++) { + (void) printf("\t\t\t%3u: %6llu %s\n", + i 
+ offset, (u_longlong_t)histo[i], + &histo_stars[(max - histo[i]) * histo_width / max]); + } +} + +static void +dump_zap_stats(objset_t *os, uint64_t object) +{ + int error; + zap_stats_t zs; + + error = zap_get_stats(os, object, &zs); + if (error) + return; + + if (zs.zs_ptrtbl_len == 0) { + ASSERT(zs.zs_num_blocks == 1); + (void) printf("\tmicrozap: %llu bytes, %llu entries\n", + (u_longlong_t)zs.zs_blocksize, + (u_longlong_t)zs.zs_num_entries); + return; + } + + (void) printf("\tFat ZAP stats:\n"); + + (void) printf("\t\tPointer table:\n"); + (void) printf("\t\t\t%llu elements\n", + (u_longlong_t)zs.zs_ptrtbl_len); + (void) printf("\t\t\tzt_blk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_blk); + (void) printf("\t\t\tzt_numblks: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_numblks); + (void) printf("\t\t\tzt_shift: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_shift); + (void) printf("\t\t\tzt_blks_copied: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_blks_copied); + (void) printf("\t\t\tzt_nextblk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_nextblk); + + (void) printf("\t\tZAP entries: %llu\n", + (u_longlong_t)zs.zs_num_entries); + (void) printf("\t\tLeaf blocks: %llu\n", + (u_longlong_t)zs.zs_num_leafs); + (void) printf("\t\tTotal blocks: %llu\n", + (u_longlong_t)zs.zs_num_blocks); + (void) printf("\t\tzap_block_type: 0x%llx\n", + (u_longlong_t)zs.zs_block_type); + (void) printf("\t\tzap_magic: 0x%llx\n", + (u_longlong_t)zs.zs_magic); + (void) printf("\t\tzap_salt: 0x%llx\n", + (u_longlong_t)zs.zs_salt); + + (void) printf("\t\tLeafs with 2^n pointers:\n"); + dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks with n*5 entries:\n"); + dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks n/10 full:\n"); + dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tEntries with n chunks:\n"); + dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); + + 
(void) printf("\t\tBuckets with n entries:\n"); + dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); +} + +/*ARGSUSED*/ +static void +dump_none(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) +{ + (void) printf("\tUNKNOWN OBJECT TYPE\n"); +} + +/*ARGSUSED*/ +static void +dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) +{ + uint64_t *arr; + uint64_t oursize; + if (dump_opt['d'] < 6) + return; + + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. + */ + oursize = MIN(size, 1 << 20); + arr = kmem_alloc(oursize, KM_SLEEP); + + int err = dmu_read(os, object, 0, oursize, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, oursize); + return; + } + } else { + /* + * Even though the allocation is already done in this code path, + * we still cap the size to prevent excessive printing. + */ + oursize = MIN(size, 1 << 20); + arr = data; + } + + if (size == 0) { + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); + for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { + if (i % 4 != 0) + (void) printf(", %0llx", (u_longlong_t)arr[i]); + else + (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); + } + if (oursize != size) + (void) printf(", ... 
"); + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, oursize); +} + +/*ARGSUSED*/ +static void +dump_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + void *prop; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + prop = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + (void) zap_lookup(os, object, attr.za_name, + attr.za_integer_length, attr.za_num_integers, prop); + if (attr.za_integer_length == 1) { + (void) printf("%s", (char *)prop); + } else { + for (i = 0; i < attr.za_num_integers; i++) { + switch (attr.za_integer_length) { + case 2: + (void) printf("%u ", + ((uint16_t *)prop)[i]); + break; + case 4: + (void) printf("%u ", + ((uint32_t *)prop)[i]); + break; + case 8: + (void) printf("%lld ", + (u_longlong_t)((int64_t *)prop)[i]); + break; + } + } + } + (void) printf("\n"); + umem_free(prop, attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +static void +dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) +{ + bpobj_phys_t *bpop = data; + uint64_t i; + char bytes[32], comp[32], uncomp[32]; + + /* make sure the output won't get truncated */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (bpop == NULL) + return; + + zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); + zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); + + (void) printf("\t\tnum_blkptrs = %llu\n", + (u_longlong_t)bpop->bpo_num_blkptrs); + (void) printf("\t\tbytes = %s\n", bytes); + if (size >= BPOBJ_SIZE_V1) { + (void) printf("\t\tcomp = %s\n", comp); + 
(void) printf("\t\tuncomp = %s\n", uncomp); + } + if (size >= BPOBJ_SIZE_V2) { + (void) printf("\t\tsubobjs = %llu\n", + (u_longlong_t)bpop->bpo_subobjs); + (void) printf("\t\tnum_subobjs = %llu\n", + (u_longlong_t)bpop->bpo_num_subobjs); + } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tnum_freed = %llu\n", + (u_longlong_t)bpop->bpo_num_freed); + } + + if (dump_opt['d'] < 5) + return; + + for (i = 0; i < bpop->bpo_num_blkptrs; i++) { + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t bp; + + int err = dmu_read(os, object, + i * sizeof (bp), sizeof (bp), &bp, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + break; + } + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, + BP_GET_FREE(&bp)); + (void) printf("\t%s\n", blkbuf); + } +} + +/* ARGSUSED */ +static void +dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) +{ + dmu_object_info_t doi; + int64_t i; + + VERIFY0(dmu_object_info(os, object, &doi)); + uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); + + int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(subobjs, doi.doi_max_offset); + return; + } + + int64_t last_nonzero = -1; + for (i = 0; i < doi.doi_max_offset / 8; i++) { + if (subobjs[i] != 0) + last_nonzero = i; + } + + for (i = 0; i <= last_nonzero; i++) { + (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); + } + kmem_free(subobjs, doi.doi_max_offset); +} + +/*ARGSUSED*/ +static void +dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + dump_zap_stats(os, object); + /* contents are printed elsewhere, properly decoded */ +} + +/*ARGSUSED*/ +static void +dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + 
zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + (void) printf(" %llx : [%d:%d:%d]\n", + (u_longlong_t)attr.za_first_integer, + (int)ATTR_LENGTH(attr.za_first_integer), + (int)ATTR_BSWAP(attr.za_first_integer), + (int)ATTR_NUM(attr.za_first_integer)); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + uint16_t *layout_attrs; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = [", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + + VERIFY(attr.za_integer_length == 2); + layout_attrs = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + + VERIFY(zap_lookup(os, object, attr.za_name, + attr.za_integer_length, + attr.za_num_integers, layout_attrs) == 0); + + for (i = 0; i != attr.za_num_integers; i++) + (void) printf(" %d ", (int)layout_attrs[i]); + (void) printf("]\n"); + umem_free(layout_attrs, + attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + const char *typenames[] = { + /* 0 */ "not specified", + /* 1 */ "FIFO", + /* 2 */ "Character Device", + /* 3 */ "3 (invalid)", + /* 4 */ "Directory", + /* 5 */ "5 (invalid)", + /* 6 */ "Block Device", + /* 7 */ "7 (invalid)", + /* 8 */ "Regular File", + /* 9 */ "9 (invalid)", + /* 10 */ "Symbolic Link", + /* 11 */ "11 (invalid)", + /* 12 */ "Socket", + /* 13 */ "Door", + /* 14 */ "Event Port", + /* 15 */ "15 (invalid)", + }; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, 
object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = %lld (type: %s)\n", + attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), + typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); + } + zap_cursor_fini(&zc); +} + +static int +get_dtl_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_ops->vdev_op_leaf) { + space_map_t *sm = vd->vdev_dtl_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + return (1); + return (0); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_dtl_refcount(vd->vdev_child[c]); + return (refcount); +} + +static int +get_metaslab_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd) { + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + space_map_t *sm = vd->vdev_ms[m]->ms_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + refcount++; + } + } + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_metaslab_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_obsolete_refcount(vdev_t *vd) +{ + uint64_t obsolete_sm_object; + int refcount = 0; + + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (vd->vdev_top == vd && obsolete_sm_object != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, + obsolete_sm_object, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + refcount++; + } + } else { + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); + ASSERT3U(obsolete_sm_object, ==, 0); + } + for (unsigned c = 0; c < vd->vdev_children; c++) { + refcount += get_obsolete_refcount(vd->vdev_child[c]); + } + + return (refcount); +} + +static int +get_prev_obsolete_spacemap_refcount(spa_t *spa) +{ + uint64_t prev_obj = + spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; + if (prev_obj != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); + if 
(doi.doi_bonus_size == sizeof (space_map_phys_t)) { + return (1); + } + } + return (0); +} + +static int +get_checkpoint_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && + zap_contains(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) + refcount++; + + for (uint64_t c = 0; c < vd->vdev_children; c++) + refcount += get_checkpoint_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + +static int +verify_spacemap_refcounts(spa_t *spa) +{ + uint64_t expected_refcount = 0; + uint64_t actual_refcount; + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], + &expected_refcount); + actual_refcount = get_dtl_refcount(spa->spa_root_vdev); + actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); + actual_refcount += get_prev_obsolete_spacemap_refcount(spa); + actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); + + if (expected_refcount != actual_refcount) { + (void) printf("space map refcount mismatch: expected %lld != " + "actual %lld\n", + (longlong_t)expected_refcount, + (longlong_t)actual_refcount); + return (2); + } + return (0); +} + +static void +dump_spacemap(objset_t *os, space_map_t *sm) +{ + const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", + "INVALID", "INVALID", "INVALID", "INVALID" }; + + if (sm == NULL) + return; + + (void) printf("space map object %llu:\n", + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); + (void) printf(" smp_alloc = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_alloc); + + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + + /* + * Print out the freelist entries in both encoded and decoded form. 
+ */ + uint8_t mapshift = sm->sm_shift; + int64_t alloc = 0; + uint64_t word, entry_id = 0; + for (uint64_t offset = 0; offset < space_map_length(sm); + offset += sizeof (word)) { + + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (word), &word, DMU_READ_PREFETCH)); + + if (sm_entry_is_debug(word)) { + uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); + uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); + if (de_txg == 0) { + (void) printf( + "\t [%6llu] PADDING\n", + (u_longlong_t)entry_id); + } else { + (void) printf( + "\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)de_txg, + (u_longlong_t)de_sync_pass); + } + entry_id++; + continue; + } + + uint8_t words; + char entry_type; + uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; + + if (sm_entry_is_single_word(word)) { + entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? + 'A' : 'F'; + entry_off = (SM_OFFSET_DECODE(word) << mapshift) + + sm->sm_start; + entry_run = SM_RUN_DECODE(word) << mapshift; + words = 1; + } else { + /* it is a two-word entry so we read another word */ + ASSERT(sm_entry_is_double_word(word)); + + uint64_t extra_word; + offset += sizeof (extra_word); + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (extra_word), &extra_word, + DMU_READ_PREFETCH)); + + ASSERT3U(offset, <=, space_map_length(sm)); + + entry_run = SM2_RUN_DECODE(word) << mapshift; + entry_vdev = SM2_VDEV_DECODE(word); + entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 
+ 'A' : 'F'; + entry_off = (SM2_OFFSET_DECODE(extra_word) << + mapshift) + sm->sm_start; + words = 2; + } + + (void) printf("\t [%6llu] %c range:" + " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", + (u_longlong_t)entry_id, + entry_type, (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run), + (u_longlong_t)entry_run, + (u_longlong_t)entry_vdev, words); + + if (entry_type == 'A') + alloc += entry_run; + else + alloc -= entry_run; + entry_id++; + } + if (alloc != space_map_allocated(sm)) { + (void) printf("space_map_object alloc (%lld) INCONSISTENT " + "with space map summary (%lld)\n", + (longlong_t)space_map_allocated(sm), (longlong_t)alloc); + } +} + +static void +dump_metaslab_stats(metaslab_t *msp) +{ + char maxbuf[32]; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *t = &msp->ms_allocatable_by_size; + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + + /* max sure nicenum has enough space */ + CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); + + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); + + (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, + "freepct", free_pct); + (void) printf("\tIn-memory histogram:\n"); + dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +dump_metaslab(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + char freebuf[32]; + + zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, + sizeof (freebuf)); + + (void) printf( + "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", + (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, + (u_longlong_t)space_map_object(sm), freebuf); + + if (dump_opt['m'] > 2 && !dump_opt['L']) { + mutex_enter(&msp->ms_lock); + VERIFY0(metaslab_load(msp)); + range_tree_stat_verify(msp->ms_allocatable); + dump_metaslab_stats(msp); + metaslab_unload(msp); + mutex_exit(&msp->ms_lock); + } 
+ + if (dump_opt['m'] > 1 && sm != NULL && + spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + /* + * The space map histogram represents free space in chunks + * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). + */ + (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", + (u_longlong_t)msp->ms_fragmentation); + dump_histogram(sm->sm_phys->smp_histogram, + SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); + } + + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } +} + +static void +print_vdev_metaslab_header(vdev_t *vd) +{ + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *bias_str = ""; + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } + + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } + + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); + + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } + + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", + "metaslabs", (u_longlong_t)vd->vdev_ms_count, + "offset", "spacemap", "free"); + (void) printf("\t%15s %19s %15s %12s\n", + "---------------", "-------------------", + "---------------", "------------"); +} + +static void +dump_metaslab_groups(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + metaslab_class_t *mc = spa_normal_class(spa); + 
uint64_t fragmentation; + + metaslab_class_histogram_verify(mc); + + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != mc) + continue; + + metaslab_group_histogram_verify(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" + "fragmentation", + (u_longlong_t)tvd->vdev_id, + (u_longlong_t)tvd->vdev_ms_count); + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + (void) printf("%3s\n", "-"); + } else { + (void) printf("%3llu%%\n", + (u_longlong_t)mg->mg_fragmentation); + } + dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + } + + (void) printf("\tpool %s\tfragmentation", spa_name(spa)); + fragmentation = metaslab_class_fragmentation(mc); + if (fragmentation == ZFS_FRAG_INVALID) + (void) printf("\t%3s\n", "-"); + else + (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); + dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +print_vdev_indirect(vdev_t *vd) +{ + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vim == NULL) { + ASSERT3P(vib, ==, NULL); + return; + } + + ASSERT3U(vdev_indirect_mapping_object(vim), ==, + vic->vic_mapping_object); + ASSERT3U(vdev_indirect_births_object(vib), ==, + vic->vic_births_object); + + (void) printf("indirect births obj %llu:\n", + (longlong_t)vic->vic_births_object); + (void) printf(" vib_count = %llu\n", + (longlong_t)vdev_indirect_births_count(vib)); + for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { + vdev_indirect_birth_entry_phys_t *cur_vibe = + &vib->vib_entries[i]; + (void) printf("\toffset %llx -> txg %llu\n", + (longlong_t)cur_vibe->vibe_offset, + (longlong_t)cur_vibe->vibe_phys_birth_txg); + } + (void) printf("\n"); + + (void) printf("indirect 
mapping obj %llu:\n", + (longlong_t)vic->vic_mapping_object); + (void) printf(" vim_max_offset = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_max_offset(vim)); + (void) printf(" vim_bytes_mapped = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); + (void) printf(" vim_count = %llu\n", + (longlong_t)vdev_indirect_mapping_num_entries(vim)); + + if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) + return; + + uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + (void) printf("\t<%llx:%llx:%llx> -> " + "<%llx:%llx:%llx> (%x obsolete)\n", + (longlong_t)vd->vdev_id, + (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), + (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + counts[i]); + } + (void) printf("\n"); + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + (void) printf("obsolete space map object %llu:\n", + (u_longlong_t)obsolete_sm_object); + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, + obsolete_sm_object); + dump_spacemap(mos, vd->vdev_obsolete_sm); + (void) printf("\n"); + } +} + +static void +dump_metaslabs(spa_t *spa) +{ + vdev_t *vd, *rvd = spa->spa_root_vdev; + uint64_t m, c = 0, children = rvd->vdev_children; + + (void) printf("\nMetaslabs:\n"); + + if (!dump_opt['d'] && zopt_metaslab_args > 0) { + c = zopt_metaslab[0]; + + if (c >= children) + (void) fatal("bad vdev id: %llu", (u_longlong_t)c); + + if (zopt_metaslab_args > 1) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + for (m = 1; m < zopt_metaslab_args; m++) { + if (zopt_metaslab[m] < vd->vdev_ms_count) 
+ dump_metaslab( + vd->vdev_ms[zopt_metaslab[m]]); + else + (void) fprintf(stderr, "bad metaslab " + "number %llu\n", + (u_longlong_t)zopt_metaslab[m]); + } + (void) printf("\n"); + return; + } + children = c + 1; + } + for (; c < children; c++) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + print_vdev_indirect(vd); + + for (m = 0; m < vd->vdev_ms_count; m++) + dump_metaslab(vd->vdev_ms[m]); + (void) printf("\n"); + } +} + +static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + +static void +dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +{ + const ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &dde->dde_key; + const char *types[4] = { "ditto", "single", "double", "triple" }; + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t blk; + int p; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); + (void) printf("index %llx refcnt %llu %s %s\n", + (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + types[p], blkbuf); + } +} + +static void +dump_dedup_ratio(const ddt_stat_t *dds) +{ + double rL, rP, rD, D, dedup, compress, copies; + + if (dds->dds_blocks == 0) + return; + + rL = (double)dds->dds_ref_lsize; + rP = (double)dds->dds_ref_psize; + rD = (double)dds->dds_ref_dsize; + D = (double)dds->dds_dsize; + + dedup = 
rD / D; + compress = rL / rP; + copies = rD / rP; + + (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " + "dedup * compress / copies = %.2f\n\n", + dedup, compress, copies, dedup * compress / copies); +} + +static void +dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + char name[DDT_NAMELEN]; + ddt_entry_t dde; + uint64_t walk = 0; + dmu_object_info_t doi; + uint64_t count, dspace, mspace; + int error; + + error = ddt_object_info(ddt, type, class, &doi); + + if (error == ENOENT) + return; + ASSERT(error == 0); + + error = ddt_object_count(ddt, type, class, &count); + ASSERT(error == 0); + if (count == 0) + return; + + dspace = doi.doi_physical_blocks_512 << 9; + mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ddt_object_name(ddt, type, class, name); + + (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", + name, + (u_longlong_t)count, + (u_longlong_t)(dspace / count), + (u_longlong_t)(mspace / count)); + + if (dump_opt['D'] < 3) + return; + + zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); + + if (dump_opt['D'] < 4) + return; + + if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) + return; + + (void) printf("%s contents:\n\n", name); + + while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) + dump_dde(ddt, &dde, walk); + + ASSERT3U(error, ==, ENOENT); + + (void) printf("\n"); +} + +static void +dump_all_ddts(spa_t *spa) +{ + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + dump_ddt(ddt, type, class); + } + } + } + + ddt_get_dedup_stats(spa, &dds_total); + + if (dds_total.dds_blocks == 0) { + (void) printf("All DDTs are empty\n"); + return; + } + + (void) printf("\n"); + + 
if (dump_opt['D'] > 1) { + (void) printf("DDT histogram (aggregated over all DDTs):\n"); + ddt_get_dedup_histogram(spa, &ddh_total); + zpool_dump_ddt(&dds_total, &ddh_total); + } + + dump_dedup_ratio(&dds_total); +} + +static void +dump_dtl_seg(void *arg, uint64_t start, uint64_t size) +{ + char *prefix = arg; + + (void) printf("%s [%llu,%llu) length %llu\n", + prefix, + (u_longlong_t)start, + (u_longlong_t)(start + size), + (u_longlong_t)(size)); +} + +static void +dump_dtl(vdev_t *vd, int indent) +{ + spa_t *spa = vd->vdev_spa; + boolean_t required; + const char *name[DTL_TYPES] = { "missing", "partial", "scrub", + "outage" }; + char prefix[256]; + + spa_vdev_state_enter(spa, SCL_NONE); + required = vdev_dtl_required(vd); + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (indent == 0) + (void) printf("\nDirty time logs:\n\n"); + + (void) printf("\t%*s%s [%s]\n", indent, "", + vd->vdev_path ? vd->vdev_path : + vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), + required ? "DTL-required" : "DTL-expendable"); + + for (int t = 0; t < DTL_TYPES; t++) { + range_tree_t *rt = vd->vdev_dtl[t]; + if (range_tree_space(rt) == 0) + continue; + (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", + indent + 2, "", name[t]); + range_tree_walk(rt, dump_dtl_seg, prefix); + if (dump_opt['d'] > 5 && vd->vdev_children == 0) + dump_spacemap(spa->spa_meta_objset, + vd->vdev_dtl_sm); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + dump_dtl(vd->vdev_child[c], indent + 4); +} + +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char *buf; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { + (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", + __func__); + return; + } + + do { + len = SPA_OLD_MAXBLOCKSIZE; + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) 
fprintf(stderr, "Unable to read history: " + "error %d\n", error); + free(buf); + return; + } + + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (unsigned i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + boolean_t printed = B_FALSE; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + goto next; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + goto next; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) + goto next; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + zfs_history_event_names[ievent], + (longlong_t)txg, intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + printed = B_TRUE; + +next: + if (dump_opt['h'] > 1) { + if (!printed) + (void) printf("unrecognized record:\n"); + dump_nvlist(events[i], 2); + } + } + free(buf); +} + +/*ARGSUSED*/ +static void +dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static uint64_t +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb) +{ + if (dnp == NULL) { + ASSERT(zb->zb_level < 0); + if (zb->zb_object == 0) + return (zb->zb_blkid); + return (zb->zb_blkid * BP_GET_LSIZE(bp)); + } + + ASSERT(zb->zb_level >= 0); + + return ((zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); +} + +static void +snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, + const blkptr_t *bp) +{ + abd_t *pabd; + void *buf; + zio_t *zio; + 
zfs_zstdhdr_t zstd_hdr; + int error; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) + return; + + if (BP_IS_HOLE(bp)) + return; + + if (BP_IS_EMBEDDED(bp)) { + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } + decode_embedded_bp_compressed(bp, buf); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + free(buf); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + return; + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + zio = zio_root(spa, NULL, NULL, 0); + + /* Decrypt but don't decompress so we can read the compression header */ + zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, + NULL)); + error = zio_wait(zio); + if (error) { + (void) fprintf(stderr, "read failed: %d\n", error); + return; + } + buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:NORMAL", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + + abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); +} + +static void +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, + boolean_t bp_freed) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; + int i; + + if (dump_opt['b'] >= 6) { + snprintf_blkptr(blkbuf, buflen, bp); + if (bp_freed) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + } + return; + } + + if (BP_IS_EMBEDDED(bp)) { + (void) sprintf(blkbuf, + "EMBEDDED et=%u %llxL/%llxP B=%llu", + (int)BPE_GET_ETYPE(bp), + (u_longlong_t)BPE_GET_LSIZE(bp), + (u_longlong_t)BPE_GET_PSIZE(bp), + (u_longlong_t)bp->blk_birth); + return; + } + + blkbuf[0] = '\0'; + + for (i = 0; i < ndvas; i++) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", + (u_longlong_t)DVA_GET_VDEV(&dva[i]), + (u_longlong_t)DVA_GET_OFFSET(&dva[i]), + (u_longlong_t)DVA_GET_ASIZE(&dva[i])); + + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL B=%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)BP_GET_FILL(bp), + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (bp_freed) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + } +} + +static void +print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, + const dnode_phys_t *dnp) +{ + char blkbuf[BP_SPRINTF_LEN]; + int l; + + if (!BP_IS_EMBEDDED(bp)) { + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + } + + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); + + ASSERT(zb->zb_level >= 0); + + for (l = dnp->dn_nlevels - 1; l >= -1; 
l--) { + if (l == zb->zb_level) { + (void) printf("L%llx", (u_longlong_t)zb->zb_level); + } else { + (void) printf(" "); + } + } + + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); + if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) + snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static int +visit_indirect(spa_t *spa, const dnode_phys_t *dnp, + blkptr_t *bp, const zbookmark_phys_t *zb) +{ + int err = 0; + + if (bp->blk_birth == 0) + return (0); + + print_indirect(spa, bp, zb, dnp); + + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + uint64_t fill = 0; + ASSERT(!BP_IS_REDACTED(bp)); + + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + ASSERT(buf->b_data); + + /* recursively visit blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = visit_indirect(spa, dnp, cbp, &czb); + if (err) + break; + fill += BP_GET_FILL(cbp); + } + if (!err) + ASSERT3U(fill, ==, BP_GET_FILL(bp)); + arc_buf_destroy(buf, &buf); + } + + return (err); +} + +/*ARGSUSED*/ +static void +dump_indirect(dnode_t *dn) +{ + dnode_phys_t *dnp = dn->dn_phys; + int j; + zbookmark_phys_t czb; + + (void) printf("Indirect blocks:\n"); + + SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), + dn->dn_object, dnp->dn_nlevels - 1, 0); + for (j = 0; j < dnp->dn_nblkptr; j++) { + czb.zb_blkid = j; + (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, + &dnp->dn_blkptr[j], &czb); + } + + (void) printf("\n"); +} + +/*ARGSUSED*/ +static void +dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dir_phys_t *dd = data; + time_t crtime; + char 
nice[32]; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); + + if (dd == NULL) + return; + + ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); + + crtime = dd->dd_creation_time; + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\thead_dataset_obj = %llu\n", + (u_longlong_t)dd->dd_head_dataset_obj); + (void) printf("\t\tparent_dir_obj = %llu\n", + (u_longlong_t)dd->dd_parent_obj); + (void) printf("\t\torigin_obj = %llu\n", + (u_longlong_t)dd->dd_origin_obj); + (void) printf("\t\tchild_dir_zapobj = %llu\n", + (u_longlong_t)dd->dd_child_dir_zapobj); + zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); + (void) printf("\t\tused_bytes = %s\n", nice); + zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tcompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tuncompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); + (void) printf("\t\tquota = %s\n", nice); + zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); + (void) printf("\t\treserved = %s\n", nice); + (void) printf("\t\tprops_zapobj = %llu\n", + (u_longlong_t)dd->dd_props_zapobj); + (void) printf("\t\tdeleg_zapobj = %llu\n", + (u_longlong_t)dd->dd_deleg_zapobj); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)dd->dd_flags); + +#define DO(which) \ + zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ + sizeof (nice)); \ + (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) + DO(HEAD); + DO(SNAP); + DO(CHILD); + DO(CHILD_RSRV); + DO(REFRSRV); +#undef DO + (void) printf("\t\tclones = %llu\n", + (u_longlong_t)dd->dd_clones); +} + +/*ARGSUSED*/ +static void +dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dataset_phys_t *ds = data; + time_t crtime; + char used[32], compressed[32], uncompressed[32], unique[32]; + char blkbuf[BP_SPRINTF_LEN]; + + /* make sure nicenum has enough space */ 
+ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); + + if (ds == NULL) + return; + + ASSERT(size == sizeof (*ds)); + crtime = ds->ds_creation_time; + zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); + zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); + zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, + sizeof (uncompressed)); + zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); + + (void) printf("\t\tdir_obj = %llu\n", + (u_longlong_t)ds->ds_dir_obj); + (void) printf("\t\tprev_snap_obj = %llu\n", + (u_longlong_t)ds->ds_prev_snap_obj); + (void) printf("\t\tprev_snap_txg = %llu\n", + (u_longlong_t)ds->ds_prev_snap_txg); + (void) printf("\t\tnext_snap_obj = %llu\n", + (u_longlong_t)ds->ds_next_snap_obj); + (void) printf("\t\tsnapnames_zapobj = %llu\n", + (u_longlong_t)ds->ds_snapnames_zapobj); + (void) printf("\t\tnum_children = %llu\n", + (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\tcreation_txg = %llu\n", + (u_longlong_t)ds->ds_creation_txg); + (void) printf("\t\tdeadlist_obj = %llu\n", + (u_longlong_t)ds->ds_deadlist_obj); + (void) printf("\t\tused_bytes = %s\n", used); + (void) printf("\t\tcompressed_bytes = %s\n", compressed); + (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); + (void) printf("\t\tunique = %s\n", unique); + (void) printf("\t\tfsid_guid = %llu\n", + (u_longlong_t)ds->ds_fsid_guid); + (void) printf("\t\tguid = %llu\n", + (u_longlong_t)ds->ds_guid); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)ds->ds_flags); + (void) printf("\t\tnext_clones_obj = %llu\n", + (u_longlong_t)ds->ds_next_clones_obj); + (void) printf("\t\tprops_obj = %llu\n", + 
(u_longlong_t)ds->ds_props_obj); + (void) printf("\t\tbp = %s\n", blkbuf); +} + +/* ARGSUSED */ +static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + +static void +dump_bptree(objset_t *os, uint64_t obj, const char *name) +{ + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + if (dump_opt['d'] < 5) + return; + + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int +dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + ASSERT(bp->blk_birth != 0); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); + (void) printf("\t%s\n", blkbuf); + return (0); +} + +static void +dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) +{ + char bytes[32]; + char comp[32]; + char uncomp[32]; + uint64_t i; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu local " + 
"blkptrs, %llu freed, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } else { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } + + for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + dump_full_bpobj(&subbpo, "subobj", indent + 1); + bpobj_close(&subbpo); + } + } else { + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%llu freed, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + bytes); + } else { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); + } + } + + if (dump_opt['d'] < 5) + return; + + + if (indent == 0) { + (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); + (void) printf("\n"); + } +} + +static int +dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, + boolean_t print_list) +{ + int err = 0; + zfs_bookmark_phys_t prop; + objset_t *mos = dp->dp_spa->spa_meta_objset; + 
err = dsl_bookmark_lookup(dp, name, NULL, &prop); + + if (err != 0) { + return (err); + } + + (void) printf("\t#%s: ", strchr(name, '#') + 1); + (void) printf("{guid: %llx creation_txg: %llu creation_time: " + "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, + (u_longlong_t)prop.zbm_creation_txg, + (u_longlong_t)prop.zbm_creation_time, + (u_longlong_t)prop.zbm_redaction_obj); + + IMPLY(print_list, print_redact); + if (!print_redact || prop.zbm_redaction_obj == 0) + return (0); + + redaction_list_t *rl; + VERIFY0(dsl_redaction_list_hold_obj(dp, + prop.zbm_redaction_obj, FTAG, &rl)); + + redaction_list_phys_t *rlp = rl->rl_phys; + (void) printf("\tRedacted:\n\t\tProgress: "); + if (rlp->rlp_last_object != UINT64_MAX || + rlp->rlp_last_blkid != UINT64_MAX) { + (void) printf("%llu %llu (incomplete)\n", + (u_longlong_t)rlp->rlp_last_object, + (u_longlong_t)rlp->rlp_last_blkid); + } else { + (void) printf("complete\n"); + } + (void) printf("\t\tSnapshots: ["); + for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { + if (i > 0) + (void) printf(", "); + (void) printf("%0llu", + (u_longlong_t)rlp->rlp_snaps[i]); + } + (void) printf("]\n\t\tLength: %llu\n", + (u_longlong_t)rlp->rlp_num_entries); + + if (!print_list) { + dsl_redaction_list_rele(rl, FTAG); + return (0); + } + + if (rlp->rlp_num_entries == 0) { + dsl_redaction_list_rele(rl, FTAG); + (void) printf("\t\tRedaction List: []\n\n"); + return (0); + } + + redact_block_phys_t *rbp_buf; + uint64_t size; + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); + size = doi.doi_max_offset; + rbp_buf = kmem_alloc(size, KM_SLEEP); + + err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, + rbp_buf, 0); + if (err != 0) { + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + return (err); + } + + (void) printf("\t\tRedaction List: [{object: %llx, offset: " + "%llx, blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[0].rbp_object, + (u_longlong_t)rbp_buf[0].rbp_blkid, + 
(uint_t)(redact_block_get_size(&rbp_buf[0])), + (u_longlong_t)redact_block_get_count(&rbp_buf[0])); + + for (size_t i = 1; i < rlp->rlp_num_entries; i++) { + (void) printf(",\n\t\t{object: %llx, offset: %llx, " + "blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[i].rbp_object, + (u_longlong_t)rbp_buf[i].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[i])), + (u_longlong_t)redact_block_get_count(&rbp_buf[i])); + } + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + (void) printf("]\n\n"); + return (0); +} + +static void +dump_bookmarks(objset_t *os, int verbosity) +{ + zap_cursor_t zc; + zap_attribute_t attr; + dsl_dataset_t *ds = dmu_objset_ds(os); + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + objset_t *mos = os->os_spa->spa_meta_objset; + if (verbosity < 4) + return; + dsl_pool_config_enter(dp, FTAG); + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char osname[ZFS_MAX_DATASET_NAME_LEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dmu_objset_name(os, osname); + VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, + attr.za_name)); + (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); + } + zap_cursor_fini(&zc); + dsl_pool_config_exit(dp, FTAG); +} + +static void +bpobj_count_refd(bpobj_t *bpo) +{ + mos_obj_refd(bpo->bpo_object); + + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + mos_obj_refd(bpo->bpo_phys->bpo_subobjs); + for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + bpobj_count_refd(&subbpo); + bpobj_close(&subbpo); + } + } +} + +static int 
+dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
+{
+	/*
+	 * Deadlist iteration callback: arg is the spa_t.  Count the MOS
+	 * objects referenced by this entry's bpobj, skipping the pool's
+	 * shared empty bpobj.  Always returns 0.
+	 */
+	spa_t *spa = arg;
+	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+	if (dle->dle_bpobj.bpo_object != empty_bpobj)
+		bpobj_count_refd(&dle->dle_bpobj);
+	return (0);
+}
+
+/*
+ * Deadlist iteration callback that prints one deadlist entry (arg must be
+ * NULL).  With -ddddd or higher the entry's bpobj is dumped in full using
+ * "mintxg N -> obj M" as its label; otherwise only that one-line summary
+ * is printed.  Always returns 0.
+ */
+static int
+dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
+{
+	ASSERT(arg == NULL);
+	if (dump_opt['d'] >= 5) {
+		char buf[128];
+		/* %llu takes an unsigned long long, so cast accordingly */
+		(void) snprintf(buf, sizeof (buf),
+		    "mintxg %llu -> obj %llu",
+		    (u_longlong_t)dle->dle_mintxg,
+		    (u_longlong_t)dle->dle_bpobj.bpo_object);
+
+		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+	} else {
+		(void) printf("mintxg %llu -> obj %llu\n",
+		    (u_longlong_t)dle->dle_mintxg,
+		    (u_longlong_t)dle->dle_bpobj.bpo_object);
+	}
+	return (0);
+}
+
+static void
+dump_blkptr_list(dsl_deadlist_t *dl, char *name)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+	char entries[32];
+	spa_t *spa = dmu_objset_spa(dl->dl_os);
+	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+
+	/* MOS reference counting happens regardless of verbosity */
+	if (dl->dl_oldfmt) {
+		if (dl->dl_bpobj.bpo_object != empty_bpobj)
+			bpobj_count_refd(&dl->dl_bpobj);
+	} else {
+		mos_obj_refd(dl->dl_object);
+		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
+	}
+
+	/* make sure nicenum has enough space */
+	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+	CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	if (dl->dl_oldfmt) {
+		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+		return;
+	}
+
+	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
+	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
+	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
+	(void) printf("\n %s: %s (%s/%s comp), %s entries\n",
+	    name, bytes, comp, uncomp, entries);
+
+	if (dump_opt['d'] < 4)
+		return;
+
+	(void) printf("\n");
+
+	dsl_deadlist_iterate(dl,
dsl_deadlist_entry_dump, NULL); +} + +static int +verify_dd_livelist(objset_t *os) +{ + uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + + ASSERT(!dmu_objset_is_snapshot(os)); + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return (0); + + /* Iterate through the livelist to check for duplicates */ + dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, + NULL); + + dsl_pool_config_enter(dp, FTAG); + dsl_deadlist_space(&dd->dd_livelist, &ll_used, + &ll_comp, &ll_uncomp); + + dsl_dataset_t *origin_ds; + ASSERT(dsl_pool_config_held(dp)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); + VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, + &used, &comp, &uncomp)); + dsl_dataset_rele(origin_ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + /* + * It's possible that the dataset's uncomp space is larger than the + * livelist's because livelists do not track embedded block pointers + */ + if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { + char nice_used[32], nice_comp[32], nice_uncomp[32]; + (void) printf("Discrepancy in space accounting:\n"); + zdb_nicenum(used, nice_used, sizeof (nice_used)); + zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("dir: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); + zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("livelist: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + return (1); + } + return (0); +} + +static avl_tree_t idx_tree; +static avl_tree_t domain_tree; +static boolean_t fuid_table_loaded; +static objset_t *sa_os = NULL; +static sa_attr_type_t *sa_attr_table = NULL; + +static int 
+open_objset(const char *path, void *tag, objset_t **osp) +{ + int err; + uint64_t sa_attrs = 0; + uint64_t version = 0; + + VERIFY3P(sa_os, ==, NULL); + /* + * We can't own an objset if it's redacted. Therefore, we do this + * dance: hold the objset, then acquire a long hold on its dataset, then + * release the pool (which is held as part of holding the objset). + */ + err = dmu_objset_hold(path, tag, osp); + if (err != 0) { + (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", + path, strerror(err)); + return (err); + } + dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); + dsl_pool_rele(dmu_objset_pool(*osp), tag); + + if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version); + if (version >= ZPL_VERSION_SA) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs); + } + err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, + &sa_attr_table); + if (err != 0) { + (void) fprintf(stderr, "sa_setup failed: %s\n", + strerror(err)); + dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); + dsl_dataset_rele(dmu_objset_ds(*osp), tag); + *osp = NULL; + } + } + sa_os = *osp; + + return (0); +} + +static void +close_objset(objset_t *os, void *tag) +{ + VERIFY3P(os, ==, sa_os); + if (os->os_sa != NULL) + sa_tear_down(os); + dsl_dataset_long_rele(dmu_objset_ds(os), tag); + dsl_dataset_rele(dmu_objset_ds(os), tag); + sa_attr_table = NULL; + sa_os = NULL; +} + +static void +fuid_table_destroy(void) +{ + if (fuid_table_loaded) { + zfs_fuid_table_destroy(&idx_tree, &domain_tree); + fuid_table_loaded = B_FALSE; + } +} + +/* + * print uid or gid information. + * For normal POSIX id just the id is printed in decimal format. + * For CIFS files with FUID the fuid is printed in hex followed by + * the domain-rid string. 
+ */ +static void +print_idstr(uint64_t id, const char *id_type) +{ + if (FUID_INDEX(id)) { + char *domain; + + domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); + (void) printf("\t%s %llx [%s-%d]\n", id_type, + (u_longlong_t)id, domain, (int)FUID_RID(id)); + } else { + (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); + } + +} + +static void +dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) +{ + uint32_t uid_idx, gid_idx; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + + /* Load domain table, if not already loaded */ + if (!fuid_table_loaded && (uid_idx || gid_idx)) { + uint64_t fuid_obj; + + /* first find the fuid object. It lives in the master node */ + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, + 8, 1, &fuid_obj) == 0); + zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); + (void) zfs_fuid_table_load(os, fuid_obj, + &idx_tree, &domain_tree); + fuid_table_loaded = B_TRUE; + } + + print_idstr(uid, "uid"); + print_idstr(gid, "gid"); +} + +static void +dump_znode_sa_xattr(sa_handle_t *hdl) +{ + nvlist_t *sa_xattr; + nvpair_t *elem = NULL; + int sa_xattr_size = 0; + int sa_xattr_entries = 0; + int error; + char *sa_xattr_packed; + + error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); + if (error || sa_xattr_size == 0) + return; + + sa_xattr_packed = malloc(sa_xattr_size); + if (sa_xattr_packed == NULL) + return; + + error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], + sa_xattr_packed, sa_xattr_size); + if (error) { + free(sa_xattr_packed); + return; + } + + error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); + if (error) { + free(sa_xattr_packed); + return; + } + + while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) + sa_xattr_entries++; + + (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", + sa_xattr_size, sa_xattr_entries); + while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { + uchar_t *value; + uint_t cnt, idx; + + (void) printf("\t\t%s = ", nvpair_name(elem)); + 
nvpair_value_byte_array(elem, &value, &cnt); + for (idx = 0; idx < cnt; ++idx) { + if (isprint(value[idx])) + (void) putchar(value[idx]); + else + (void) printf("\\%3.3o", value[idx]); + } + (void) putchar('\n'); + } + + nvlist_free(sa_xattr); + free(sa_xattr_packed); +} + +static void +dump_znode_symlink(sa_handle_t *hdl) +{ + int sa_symlink_size = 0; + char linktarget[MAXPATHLEN]; + linktarget[0] = '\0'; + int error; + + error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); + if (error || sa_symlink_size == 0) { + return; + } + if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], + &linktarget, sa_symlink_size) == 0) + (void) printf("\ttarget %s\n", linktarget); +} + +/*ARGSUSED*/ +static void +dump_znode(objset_t *os, uint64_t object, void *data, size_t size) +{ + char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_handle_t *hdl; + uint64_t xattr, rdev, gen; + uint64_t uid, gid, mode, fsize, parent, links; + uint64_t pflags; + uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; + time_t z_crtime, z_atime, z_mtime, z_ctime; + sa_bulk_attr_t bulk[12]; + int idx = 0; + int error; + + VERIFY3P(os, ==, sa_os); + if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { + (void) printf("Failed to get handle for SA znode\n"); + return; + } + + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, + &links, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, + &fsize, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, + acctm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, + modtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, 
sa_attr_table[ZPL_CRTIME], NULL, + crtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, + chgtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, + &pflags, 8); + + if (sa_bulk_lookup(hdl, bulk, idx)) { + (void) sa_handle_destroy(hdl); + return; + } + + z_crtime = (time_t)crtm[0]; + z_atime = (time_t)acctm[0]; + z_mtime = (time_t)modtm[0]; + z_ctime = (time_t)chgtm[0]; + + if (dump_opt['d'] > 4) { + error = zfs_obj_to_path(os, object, path, sizeof (path)); + if (error == ESTALE) { + (void) snprintf(path, sizeof (path), "on delete queue"); + } else if (error != 0) { + leaked_objects++; + (void) snprintf(path, sizeof (path), + "path not found, possibly leaked"); + } + (void) printf("\tpath %s\n", path); + } + + if (S_ISLNK(mode)) + dump_znode_symlink(hdl); + dump_uidgid(os, uid, gid); + (void) printf("\tatime %s", ctime(&z_atime)); + (void) printf("\tmtime %s", ctime(&z_mtime)); + (void) printf("\tctime %s", ctime(&z_ctime)); + (void) printf("\tcrtime %s", ctime(&z_crtime)); + (void) printf("\tgen %llu\n", (u_longlong_t)gen); + (void) printf("\tmode %llo\n", (u_longlong_t)mode); + (void) printf("\tsize %llu\n", (u_longlong_t)fsize); + (void) printf("\tparent %llu\n", (u_longlong_t)parent); + (void) printf("\tlinks %llu\n", (u_longlong_t)links); + (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); + if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { + uint64_t projid; + + if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, + sizeof (uint64_t)) == 0) + (void) printf("\tprojid %llu\n", (u_longlong_t)projid); + } + if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, + sizeof (uint64_t)) == 0) + (void) printf("\txattr %llu\n", (u_longlong_t)xattr); + if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, + sizeof (uint64_t)) == 0) + (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); + dump_znode_sa_xattr(hdl); + sa_handle_destroy(hdl); +} + +/*ARGSUSED*/ +static void +dump_acl(objset_t *os, uint64_t object, 
void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { + dump_none, /* unallocated */ + dump_zap, /* object directory */ + dump_uint64, /* object array */ + dump_none, /* packed nvlist */ + dump_packed_nvlist, /* packed nvlist size */ + dump_none, /* bpobj */ + dump_bpobj, /* bpobj header */ + dump_none, /* SPA space map header */ + dump_none, /* SPA space map */ + dump_none, /* ZIL intent log */ + dump_dnode, /* DMU dnode */ + dump_dmu_objset, /* DMU objset */ + dump_dsl_dir, /* DSL directory */ + dump_zap, /* DSL directory child map */ + dump_zap, /* DSL dataset snap map */ + dump_zap, /* DSL props */ + dump_dsl_dataset, /* DSL dataset */ + dump_znode, /* ZFS znode */ + dump_acl, /* ZFS V0 ACL */ + dump_uint8, /* ZFS plain file */ + dump_zpldir, /* ZFS directory */ + dump_zap, /* ZFS master node */ + dump_zap, /* ZFS delete queue */ + dump_uint8, /* zvol object */ + dump_zap, /* zvol prop */ + dump_uint8, /* other uint8[] */ + dump_uint64, /* other uint64[] */ + dump_zap, /* other ZAP */ + dump_zap, /* persistent error log */ + dump_uint8, /* SPA history */ + dump_history_offsets, /* SPA history offsets */ + dump_zap, /* Pool properties */ + dump_zap, /* DSL permissions */ + dump_acl, /* ZFS ACL */ + dump_uint8, /* ZFS SYSACL */ + dump_none, /* FUID nvlist */ + dump_packed_nvlist, /* FUID nvlist size */ + dump_zap, /* DSL dataset next clones */ + dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group/project used */ + dump_zap, /* ZFS user/group/project quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_znode, /* SA object */ + dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ + 
dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ + dump_zap, /* dsl clones */ + dump_bpobj_subobjs, /* bpobj subobjs */ + dump_unknown, /* Unknown type, must be last */ +}; + +static boolean_t +match_object_type(dmu_object_type_t obj_type, uint64_t flags) +{ + boolean_t match = B_TRUE; + + switch (obj_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (!(flags & ZOR_FLAG_DIRECTORY)) + match = B_FALSE; + break; + case DMU_OT_PLAIN_FILE_CONTENTS: + if (!(flags & ZOR_FLAG_PLAIN_FILE)) + match = B_FALSE; + break; + case DMU_OT_SPACE_MAP: + if (!(flags & ZOR_FLAG_SPACE_MAP)) + match = B_FALSE; + break; + default: + if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { + if (!(flags & ZOR_FLAG_ZAP)) + match = B_FALSE; + break; + } + + /* + * If all bits except some of the supported flags are + * set, the user combined the all-types flag (A) with + * a negated flag to exclude some types (e.g. A-f to + * show all object types except plain files). + */ + if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) + match = B_FALSE; + + break; + } + + return (match); +} + +static void +dump_object(objset_t *os, uint64_t object, int verbosity, + boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) +{ + dmu_buf_t *db = NULL; + dmu_object_info_t doi; + dnode_t *dn; + boolean_t dnode_held = B_FALSE; + void *bonus = NULL; + size_t bsize = 0; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; + char bonus_size[32]; + char aux[50]; + int error; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); + + if (*print_header) { + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", + "lsize", "%full", "type"); + *print_header = 0; + } + + if (object == 0) { + dn = DMU_META_DNODE(os); + 
dmu_object_info_from_dnode(dn, &doi); + } else { + /* + * Encrypted datasets will have sensitive bonus buffers + * encrypted. Therefore we cannot hold the bonus buffer and + * must hold the dnode itself instead. + */ + error = dmu_object_info(os, object, &doi); + if (error) + fatal("dmu_object_info() failed, errno %u", error); + + if (os->os_encrypted && + DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { + error = dnode_hold(os, object, FTAG, &dn); + if (error) + fatal("dnode_hold() failed, errno %u", error); + dnode_held = B_TRUE; + } else { + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) + fatal("dmu_bonus_hold(%llu) failed, errno %u", + object, error); + bonus = db->db_data; + bsize = db->db_size; + dn = DB_DNODE((dmu_buf_impl_t *)db); + } + } + + /* + * Default to showing all object types if no flags were specified. + */ + if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && + !match_object_type(doi.doi_type, flags)) + goto out; + + if (dnode_slots_used) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + + zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); + zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); + zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); + zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); + zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); + + aux[0] = '\0'; + + if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); + } + + if (doi.doi_compress == ZIO_COMPRESS_INHERIT && + ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { + const char *compname = NULL; + if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, + ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), + &compname) == 0) { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), " (Z=inherit=%s)", + compname); + } else { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), + " (Z=inherit=%s-unknown)", + ZDB_COMPRESS_NAME(os->os_compress)); + } + } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); + } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); + } + + (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); + + if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { + (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", "", bonus_size, "bonus", + zdb_ot_name(doi.doi_bonus_type)); + } + + if (verbosity >= 4) { + (void) printf("\tdnode flags: %s%s%s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? + "USEROBJUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
+ "SPILL_BLKPTR" : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + if (!dnode_held) { + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, + object, bonus, bsize); + } else { + (void) printf("\t\t(bonus encrypted)\n"); + } + + if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, + NULL, 0); + } else { + (void) printf("\t\t(object encrypted)\n"); + } + + *print_header = B_TRUE; + } + + if (verbosity >= 5) + dump_indirect(dn); + + if (verbosity >= 5) { + /* + * Report the list of segments that comprise the object. + */ + uint64_t start = 0; + uint64_t end; + uint64_t blkfill = 1; + int minlvl = 1; + + if (dn->dn_type == DMU_OT_DNODE) { + minlvl = 0; + blkfill = DNODES_PER_BLOCK; + } + + for (;;) { + char segsize[32]; + /* make sure nicenum has enough space */ + CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); + error = dnode_next_offset(dn, + 0, &start, minlvl, blkfill, 0); + if (error) + break; + end = start; + error = dnode_next_offset(dn, + DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); + zdb_nicenum(end - start, segsize, sizeof (segsize)); + (void) printf("\t\tsegment [%016llx, %016llx)" + " size %5s\n", (u_longlong_t)start, + (u_longlong_t)end, segsize); + if (error) + break; + start = end; + } + } + +out: + if (db != NULL) + dmu_buf_rele(db, FTAG); + if (dnode_held) + dnode_rele(dn, FTAG); +} + +static void +count_dir_mos_objects(dsl_dir_t *dd) +{ + mos_obj_refd(dd->dd_object); + mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_clones); + + /* + * The dd_crypto_obj can be referenced by multiple dsl_dir's. + * Ignore the references after the first one. 
+ */ + mos_obj_refd_multiple(dd->dd_crypto_obj); +} + +static void +count_ds_mos_objects(dsl_dataset_t *ds) +{ + mos_obj_refd(ds->ds_object); + mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); + mos_obj_refd(ds->ds_bookmarks_obj); + + if (!dsl_dataset_is_snapshot(ds)) { + count_dir_mos_objects(ds->ds_dir); + } +} + +static const char *objset_types[DMU_OST_NUMTYPES] = { + "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; + +/* + * Parse a string denoting a range of object IDs of the form + * <start>[:<end>[:flags]], and store the results in zor. + * Return 0 on success. On error, return 1 and update the msg + * pointer to point to a descriptive error message. + */ +static int +parse_object_range(char *range, zopt_object_range_t *zor, char **msg) +{ + uint64_t flags = 0; + char *p, *s, *dup, *flagstr; + size_t len; + int i; + int rc = 0; + + if (strchr(range, ':') == NULL) { + zor->zor_obj_start = strtoull(range, &p, 0); + if (*p != '\0') { + *msg = "Invalid characters in object ID"; + rc = 1; + } + zor->zor_obj_end = zor->zor_obj_start; + return (rc); + } + + if (strchr(range, ':') == range) { + *msg = "Invalid leading colon"; + rc = 1; + return (rc); + } + + len = strlen(range); + if (range[len - 1] == ':') { + *msg = "Invalid trailing colon"; + rc = 1; + return (rc); + } + + dup = strdup(range); + s = strtok(dup, ":"); + zor->zor_obj_start = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in start object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + zor->zor_obj_end = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in end object ID"; + rc = 1; + goto out; + } + + if (zor->zor_obj_start > zor->zor_obj_end) { + *msg = "Start object ID may not exceed end object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + if (s == NULL) { + 
zor->zor_flags = ZOR_FLAG_ALL_TYPES; + goto out; + } else if (strtok(NULL, ":") != NULL) { + *msg = "Invalid colon-delimited field after flags"; + rc = 1; + goto out; + } + + flagstr = s; + for (i = 0; flagstr[i]; i++) { + int bit; + boolean_t negation = (flagstr[i] == '-'); + + if (negation) { + i++; + if (flagstr[i] == '\0') { + *msg = "Invalid trailing negation operator"; + rc = 1; + goto out; + } + } + bit = flagbits[(uchar_t)flagstr[i]]; + if (bit == 0) { + *msg = "Invalid flag"; + rc = 1; + goto out; + } + if (negation) + flags &= ~bit; + else + flags |= bit; + } + zor->zor_flags = flags; + +out: + free(dup); + return (rc); +} + +static void +dump_objset(objset_t *os) +{ + dmu_objset_stats_t dds = { 0 }; + uint64_t object, object_count; + uint64_t refdbytes, usedobjs, scratch; + char numbuf[32]; + char blkbuf[BP_SPRINTF_LEN + 20]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + const char *type = "UNKNOWN"; + int verbosity = dump_opt['d']; + boolean_t print_header; + unsigned i; + int error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; + uint64_t obj_start; + uint64_t obj_end; + uint64_t flags; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); + + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + + print_header = B_TRUE; + + if (dds.dds_type < DMU_OST_NUMTYPES) + type = objset_types[dds.dds_type]; + + if (dds.dds_type == DMU_OST_META) { + dds.dds_creation_txg = TXG_INITIAL; + usedobjs = BP_GET_FILL(os->os_rootbp); + refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> + dd_used_bytes; + } else { + dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); + } + + ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); + + zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); + + if (verbosity >= 4) { + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + 
strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); + } else { + blkbuf[0] = '\0'; + } + + dmu_objset_name(os, osname); + + (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " + "%s, %llu objects%s%s\n", + osname, type, (u_longlong_t)dmu_objset_id(os), + (u_longlong_t)dds.dds_creation_txg, + numbuf, (u_longlong_t)usedobjs, blkbuf, + (dds.dds_inconsistent) ? " (inconsistent)" : ""); + + for (i = 0; i < zopt_object_args; i++) { + obj_start = zopt_object_ranges[i].zor_obj_start; + obj_end = zopt_object_ranges[i].zor_obj_end; + flags = zopt_object_ranges[i].zor_flags; + + object = obj_start; + if (object == 0 || obj_start == obj_end) + dump_object(os, object, verbosity, &print_header, NULL, + flags); + else + object--; + + while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && + object <= obj_end) { + dump_object(os, object, verbosity, &print_header, NULL, + flags); + } + } + + if (zopt_object_args > 0) { + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); + + if (dmu_objset_ds(os) != NULL) { + dsl_dataset_t *ds = dmu_objset_ds(os); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); + if (verify_dd_livelist(os) != 0) + fatal("livelist is incorrect"); + } + + if (dsl_dataset_remap_deadlist_exists(ds)) { + (void) printf("ds_remap_deadlist:\n"); + dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); + } + count_ds_mos_objects(ds); + } + + if (dmu_objset_ds(os) != NULL) + dump_bookmarks(os, verbosity); + + if (verbosity < 2) + return; + + if (BP_IS_HOLE(os->os_rootbp)) + return; + + dump_object(os, 0, verbosity, &print_header, NULL, 0); + object_count = 0; + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL, 0); + dump_object(os, 
DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL, 0); + } + + if (DMU_PROJECTUSED_DNODE(os) != NULL && + DMU_PROJECTUSED_DNODE(os)->dn_type != 0) + dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, + &print_header, NULL, 0); + + object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + dump_object(os, object, verbosity, &print_header, &dnode_slots, + 0); + object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; + } + + (void) printf("\n"); + + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + (void) printf("\n"); + + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } + + ASSERT3U(object_count, ==, usedobjs); + + if (leaked_objects != 0) { + (void) printf("%d potentially leaked objects detected\n", + leaked_objects); + leaked_objects = 0; + } +} + +static void +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) +{ + time_t timestamp = ub->ub_timestamp; + + (void) printf("%s", header ? 
header : ""); + (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); + (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); + (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); + (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); + (void) printf("\ttimestamp = %llu UTC = %s", + (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp))); + + (void) printf("\tmmp_magic = %016llx\n", + (u_longlong_t)ub->ub_mmp_magic); + if (MMP_VALID(ub)) { + (void) printf("\tmmp_delay = %0llu\n", + (u_longlong_t)ub->ub_mmp_delay); + if (MMP_SEQ_VALID(ub)) + (void) printf("\tmmp_seq = %u\n", + (unsigned int) MMP_SEQ(ub)); + if (MMP_FAIL_INT_VALID(ub)) + (void) printf("\tmmp_fail = %u\n", + (unsigned int) MMP_FAIL_INT(ub)); + if (MMP_INTERVAL_VALID(ub)) + (void) printf("\tmmp_write = %u\n", + (unsigned int) MMP_INTERVAL(ub)); + /* After MMP_* to make summarize_uberblock_mmp cleaner */ + (void) printf("\tmmp_valid = %x\n", + (unsigned int) ub->ub_mmp_config & 0xFF); + } + + if (dump_opt['u'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\trootbp = %s\n", blkbuf); + } + (void) printf("\tcheckpoint_txg = %llu\n", + (u_longlong_t)ub->ub_checkpoint_txg); + (void) printf("%s", footer ? 
footer : ""); +} + +static void +dump_config(spa_t *spa) +{ + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; + + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); + } +} + +static void +dump_cachefile(const char *cachefile) +{ + int fd; + struct stat64 statbuf; + char *buf; + nvlist_t *config; + + if ((fd = open64(cachefile, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if (fstat64(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if ((buf = malloc(statbuf.st_size)) == NULL) { + (void) fprintf(stderr, "failed to allocate %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) fprintf(stderr, "failed to read %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "failed to unpack nvlist\n"); + exit(1); + } + + free(buf); + + dump_nvlist(config, 0); + + nvlist_free(config); +} + +/* + * ZFS label nvlist stats + */ +typedef struct zdb_nvl_stats { + int zns_list_count; + int zns_leaf_count; + size_t zns_leaf_largest; + size_t zns_leaf_total; + nvlist_t *zns_string; + nvlist_t *zns_uint64; + nvlist_t *zns_boolean; +} zdb_nvl_stats_t; + +static void +collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) +{ + nvlist_t *list, **array; + nvpair_t *nvp = NULL; + char *name; + uint_t i, items; + + stats->zns_list_count++; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) 
!= NULL) { + name = nvpair_name(nvp); + + switch (nvpair_type(nvp)) { + case DATA_TYPE_STRING: + fnvlist_add_string(stats->zns_string, name, + fnvpair_value_string(nvp)); + break; + case DATA_TYPE_UINT64: + fnvlist_add_uint64(stats->zns_uint64, name, + fnvpair_value_uint64(nvp)); + break; + case DATA_TYPE_BOOLEAN: + fnvlist_add_boolean(stats->zns_boolean, name); + break; + case DATA_TYPE_NVLIST: + if (nvpair_value_nvlist(nvp, &list) == 0) + collect_nvlist_stats(list, stats); + break; + case DATA_TYPE_NVLIST_ARRAY: + if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) + break; + + for (i = 0; i < items; i++) { + collect_nvlist_stats(array[i], stats); + + /* collect stats on leaf vdev */ + if (strcmp(name, "children") == 0) { + size_t size; + + (void) nvlist_size(array[i], &size, + NV_ENCODE_XDR); + stats->zns_leaf_total += size; + if (size > stats->zns_leaf_largest) + stats->zns_leaf_largest = size; + stats->zns_leaf_count++; + } + } + break; + default: + (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); + } + } +} + +static void +dump_nvlist_stats(nvlist_t *nvl, size_t cap) +{ + zdb_nvl_stats_t stats = { 0 }; + size_t size, sum = 0, total; + size_t noise; + + /* requires nvlist with non-unique names for stat collection */ + VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); + VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); + + (void) printf("\n\nZFS Label NVList Config Stats:\n"); + + VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); + (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", + (int)total, (int)(cap - total), 100.0 * total / cap); + + collect_nvlist_stats(nvl, &stats); + + VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", + (int)fnvlist_num_pairs(stats.zns_uint64), + (int)size, 100.0 * size / total); + + 
VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", + (int)fnvlist_num_pairs(stats.zns_string), + (int)size, 100.0 * size / total); + + VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", + (int)fnvlist_num_pairs(stats.zns_boolean), + (int)size, 100.0 * size / total); + + size = total - sum; /* treat remainder as nvlist overhead */ + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", + stats.zns_list_count, (int)size, 100.0 * size / total); + + if (stats.zns_leaf_count > 0) { + size_t average = stats.zns_leaf_total / stats.zns_leaf_count; + + (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", + stats.zns_leaf_count, (int)average); + (void) printf("%24d bytes largest\n", + (int)stats.zns_leaf_largest); + + if (dump_opt['l'] >= 3 && average > 0) + (void) printf(" space for %d additional leaf vdevs\n", + (int)((cap - total) / average)); + } + (void) printf("\n"); + + nvlist_free(stats.zns_string); + nvlist_free(stats.zns_uint64); + nvlist_free(stats.zns_boolean); +} + +typedef struct cksum_record { + zio_cksum_t cksum; + boolean_t labels[VDEV_LABELS]; + avl_node_t link; +} cksum_record_t; + +static int +cksum_record_compare(const void *x1, const void *x2) +{ + const cksum_record_t *l = (cksum_record_t *)x1; + const cksum_record_t *r = (cksum_record_t *)x2; + int arraysize = ARRAY_SIZE(l->cksum.zc_word); + int difference; + + for (int i = 0; i < arraysize; i++) { + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + if (difference) + break; + } + + return (difference); +} + +static cksum_record_t * +cksum_record_alloc(zio_cksum_t *cksum, int l) +{ + cksum_record_t *rec; + + rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); + rec->cksum = *cksum; + rec->labels[l] = B_TRUE; + + return (rec); +} + +static cksum_record_t * 
+cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) +{ + cksum_record_t lookup = { .cksum = *cksum }; + avl_index_t where; + + return (avl_find(tree, &lookup, &where)); +} + +static cksum_record_t * +cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) +{ + cksum_record_t *rec; + + rec = cksum_record_lookup(tree, cksum); + if (rec) { + rec->labels[l] = B_TRUE; + } else { + rec = cksum_record_alloc(cksum, l); + avl_add(tree, rec); + } + + return (rec); +} + +static int +first_label(cksum_record_t *rec) +{ + for (int i = 0; i < VDEV_LABELS; i++) + if (rec->labels[i]) + return (i); + + return (-1); +} + +static void +print_label_numbers(char *prefix, cksum_record_t *rec) +{ + printf("%s", prefix); + for (int i = 0; i < VDEV_LABELS; i++) + if (rec->labels[i] == B_TRUE) + printf("%d ", i); + printf("\n"); +} + +#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) + +typedef struct zdb_label { + vdev_label_t label; + nvlist_t *config_nv; + cksum_record_t *config; + cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; + boolean_t header_printed; + boolean_t read_failed; +} zdb_label_t; + +static void +print_label_header(zdb_label_t *label, int l) +{ + + if (dump_opt['q']) + return; + + if (label->header_printed == B_TRUE) + return; + + (void) printf("------------------------------------\n"); + (void) printf("LABEL %d\n", l); + (void) printf("------------------------------------\n"); + + label->header_printed = B_TRUE; +} + +static void +print_l2arc_header(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device header\n"); + (void) printf("------------------------------------\n"); +} + +static void +print_l2arc_log_blocks(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device log blocks\n"); + (void) printf("------------------------------------\n"); +} + +static void +dump_l2arc_log_entries(uint64_t log_entries, + l2arc_log_ent_phys_t *le, uint64_t i) +{ + 
for (int j = 0; j < log_entries; j++) { + dva_t dva = le[j].le_dva; + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, + (u_longlong_t)DVA_GET_ASIZE(&dva), + (u_longlong_t)DVA_GET_VDEV(&dva), + (u_longlong_t)DVA_GET_OFFSET(&dva)); + (void) printf("|\t\t\t\tbirth: %llu\n", + (u_longlong_t)le[j].le_birth); + (void) printf("|\t\t\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tpsize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcompr: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcomplevel: %llu\n", + (u_longlong_t)(&le[j])->le_complevel); + (void) printf("|\t\t\t\ttype: %llu\n", + (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprotected: %llu\n", + (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprefetch: %llu\n", + (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); + (void) printf("|\t\t\t\taddress: %llu\n", + (u_longlong_t)le[j].le_daddr); + (void) printf("|\n"); + } + (void) printf("\n"); +} + +static void +dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) +{ + (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr); + (void) printf("|\t\tpayload_asize: %llu\n", + (u_longlong_t)lbps.lbp_payload_asize); + (void) printf("|\t\tpayload_start: %llu\n", + (u_longlong_t)lbps.lbp_payload_start); + (void) printf("|\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tasize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tcompralgo: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); + (void) printf("|\t\tcksumalgo: %llu\n", + (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop)); + (void) printf("|\n\n"); +} + +static void +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) +{ 
+ l2arc_log_blk_phys_t this_lb; + uint64_t asize; + l2arc_log_blkptr_t lbps[2]; + abd_t *abd; + zio_cksum_t cksum; + int failed = 0; + l2arc_dev_t dev; + + if (!dump_opt['q']) + print_l2arc_log_blocks(); + bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); + + dev.l2ad_evict = l2dhdr.dh_evict; + dev.l2ad_start = l2dhdr.dh_start; + dev.l2ad_end = l2dhdr.dh_end; + + if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { + /* no log blocks to read */ + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } + return; + } else { + dev.l2ad_hand = lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + } + + dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + for (;;) { + if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) + break; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } + break; + } + + fletcher_4_native_varsize(&this_lb, asize, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { + failed++; + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } + break; + } + + switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + default: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); + zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &this_lb, + asize, sizeof (this_lb), NULL); + abd_free(abd); + break; + } + + if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(&this_lb, sizeof (this_lb)); + if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); + break; + } + + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) 
printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, + (u_longlong_t)this_lb.lb_magic); + dump_l2arc_log_blkptr(lbps[0]); + } + + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); + + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) + break; + + lbps[0] = lbps[1]; + lbps[1] = this_lb.lb_prev_lbp; + } + + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } +} + +static int +dump_l2arc_header(int fd) +{ + l2arc_dev_hdr_phys_t l2dhdr, rebuild; + int error = B_FALSE; + + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, sizeof (rebuild)); + + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), + VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { + error = B_TRUE; + } else { + if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); + + if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) + error = B_TRUE; + } + + if (error) { + (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); + } else if (!dump_opt['q']) { + print_l2arc_header(); + + (void) printf(" magic: %llu\n", + (u_longlong_t)l2dhdr.dh_magic); + (void) printf(" version: %llu\n", + (u_longlong_t)l2dhdr.dh_version); + (void) printf(" pool_guid: %llu\n", + (u_longlong_t)l2dhdr.dh_spa_guid); + (void) printf(" flags: %llu\n", + (u_longlong_t)l2dhdr.dh_flags); + (void) printf(" start_lbps[0]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[0].lbp_daddr); + (void) printf(" start_lbps[1]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[1].lbp_daddr); + (void) printf(" log_blk_ent: %llu\n", + (u_longlong_t)l2dhdr.dh_log_entries); + 
(void) printf(" start: %llu\n", + (u_longlong_t)l2dhdr.dh_start); + (void) printf(" end: %llu\n", + (u_longlong_t)l2dhdr.dh_end); + (void) printf(" evict: %llu\n", + (u_longlong_t)l2dhdr.dh_evict); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_count); + (void) printf(" trim_action_time: %llu\n", + (u_longlong_t)l2dhdr.dh_trim_action_time); + (void) printf(" trim_state: %llu\n\n", + (u_longlong_t)l2dhdr.dh_trim_state); + } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. 
+ */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); +} + +static void +dump_config_from_label(zdb_label_t *label, size_t buflen, int l) +{ + if (dump_opt['q']) + return; + + if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) + return; + + print_label_header(label, l); + dump_nvlist(label->config_nv, 4); + print_label_numbers(" labels = ", label->config); + + if (dump_opt['l'] >= 2) + dump_nvlist_stats(label->config_nv, buflen); +} + +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) +{ + + vdev_t vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)&label->label + uoff); + cksum_record_t *rec = label->uberblocks[i]; + + if (rec == NULL) { + if (dump_opt['u'] >= 2) { + print_label_header(label, label_num); + (void) printf(" Uberblock[%d] invalid\n", i); + } + continue; + } + + if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) + continue; + + if ((dump_opt['u'] < 4) && + (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && + (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) + continue; + + print_label_header(label, label_num); + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + " Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + print_label_numbers(" labels = ", rec); + } +} + +static char curpath[PATH_MAX]; + +/* + * Iterate through the path components, recursively passing + * current one's obj and remaining path until we find the obj + * for the last one. 
+ */ +static int +dump_path_impl(objset_t *os, uint64_t obj, char *name) +{ + int err; + boolean_t header = B_TRUE; + uint64_t child_obj; + char *s; + dmu_buf_t *db; + dmu_object_info_t doi; + + if ((s = strchr(name, '/')) != NULL) + *s = '\0'; + err = zap_lookup(os, obj, name, 8, 1, &child_obj); + + (void) strlcat(curpath, name, sizeof (curpath)); + + if (err != 0) { + (void) fprintf(stderr, "failed to lookup %s: %s\n", + curpath, strerror(err)); + return (err); + } + + child_obj = ZFS_DIRENT_OBJ(child_obj); + err = sa_buf_hold(os, child_obj, FTAG, &db); + if (err != 0) { + (void) fprintf(stderr, + "failed to get SA dbuf for obj %llu: %s\n", + (u_longlong_t)child_obj, strerror(err)); + return (EINVAL); + } + dmu_object_info_from_db(db, &doi); + sa_buf_rele(db, FTAG); + + if (doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) { + (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", + doi.doi_bonus_type, (u_longlong_t)child_obj); + return (EINVAL); + } + + if (dump_opt['v'] > 6) { + (void) printf("obj=%llu %s type=%d bonustype=%d\n", + (u_longlong_t)child_obj, curpath, doi.doi_type, + doi.doi_bonus_type); + } + + (void) strlcat(curpath, "/", sizeof (curpath)); + + switch (doi.doi_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (s != NULL && *(s + 1) != '\0') + return (dump_path_impl(os, child_obj, s + 1)); + /*FALLTHROUGH*/ + case DMU_OT_PLAIN_FILE_CONTENTS: + dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); + return (0); + default: + (void) fprintf(stderr, "object %llu has non-file/directory " + "type %d\n", (u_longlong_t)obj, doi.doi_type); + break; + } + + return (EINVAL); +} + +/* + * Dump the blocks for the object specified by path inside the dataset. 
+ */ +static int +dump_path(char *ds, char *path) +{ + int err; + objset_t *os; + uint64_t root_obj; + + err = open_objset(ds, FTAG, &os); + if (err != 0) + return (err); + + err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); + if (err != 0) { + (void) fprintf(stderr, "can't lookup root znode: %s\n", + strerror(err)); + close_objset(os, FTAG); + return (EINVAL); + } + + (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); + + err = dump_path_impl(os, root_obj, path); + + close_objset(os, FTAG); + return (err); +} + +static int +dump_label(const char *dev) +{ + char path[MAXPATHLEN]; + zdb_label_t labels[VDEV_LABELS]; + uint64_t psize, ashift, l2cache; + struct stat64 statbuf; + boolean_t config_found = B_FALSE; + boolean_t error = B_FALSE; + boolean_t read_l2arc_header = B_FALSE; + avl_tree_t config_tree; + avl_tree_t uberblock_tree; + void *node, *cookie; + int fd; + + bzero(labels, sizeof (labels)); + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending expected disk paths and partition numbers. 
+ */ + (void) strlcpy(path, dev, sizeof (path)); + if (dev[0] != '/' && stat64(path, &statbuf) != 0) { + int error; + + error = zfs_resolve_shortname(dev, path, MAXPATHLEN); + if (error == 0 && zfs_dev_is_whole_disk(path)) { + if (zfs_append_partition(path, MAXPATHLEN) == -1) + error = ENOENT; + } + + if (error || (stat64(path, &statbuf) != 0)) { + (void) printf("failed to find device %s, try " + "specifying absolute path instead\n", dev); + return (1); + } + } + + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + exit(1); + } + + if (fstat64_blk(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", path, + strerror(errno)); + (void) close(fd); + exit(1); + } + + if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) + (void) printf("failed to invalidate cache '%s' : %s\n", path, + strerror(errno)); + + avl_create(&config_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + avl_create(&uberblock_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + + psize = statbuf.st_size; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + ashift = SPA_MINBLOCKSHIFT; + + /* + * 1. Read the label from disk + * 2. Unpack the configuration and insert in config tree. + * 3. Traverse all uberblocks and insert in uberblock tree. 
+ */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + char *buf = label->label.vl_vdev_phys.vp_nvlist; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + nvlist_t *config; + cksum_record_t *rec; + zio_cksum_t cksum; + vdev_t vd; + + if (pread64(fd, &label->label, sizeof (label->label), + vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { + if (!dump_opt['q']) + (void) printf("failed to read label %d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } + + label->read_failed = B_FALSE; + + if (nvlist_unpack(buf, buflen, &config, 0) == 0) { + nvlist_t *vdev_tree = NULL; + size_t size; + + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + + if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) + size = buflen; + + /* If the device is a cache device clear the header. */ + if (!read_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + read_l2arc_header = B_TRUE; + } + } + + fletcher_4_native_varsize(buf, size, &cksum); + rec = cksum_record_insert(&config_tree, &cksum, l); + + label->config = rec; + label->config_nv = config; + config_found = B_TRUE; + } else { + error = B_TRUE; + } + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)label + uoff); + + if (uberblock_verify(ub)) + continue; + + fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); + rec = cksum_record_insert(&uberblock_tree, &cksum, l); + + label->uberblocks[i] = rec; + } + } + + /* + * Dump the label and uberblocks. 
+ */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + + if (label->read_failed == B_TRUE) + continue; + + if (label->config_nv) { + dump_config_from_label(label, buflen, l); + } else { + if (!dump_opt['q']) + (void) printf("failed to unpack label %d\n", l); + } + + if (dump_opt['u']) + dump_label_uberblocks(label, ashift, l); + + nvlist_free(label->config_nv); + } + + /* + * Dump the L2ARC header, if existent. + */ + if (read_l2arc_header) + error |= dump_l2arc_header(fd); + + cookie = NULL; + while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + cookie = NULL; + while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + avl_destroy(&config_tree); + avl_destroy(&uberblock_tree); + + (void) close(fd); + + return (config_found == B_FALSE ? 2 : + (error == B_TRUE ? 1 : 0)); +} + +static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t global_feature_count[SPA_FEATURES]; +static uint64_t remap_deadlist_count = 0; + +/*ARGSUSED*/ +static int +dump_one_objset(const char *dsname, void *arg) +{ + int error; + objset_t *os; + spa_feature_t f; + + error = open_objset(dsname, FTAG, &os); + if (error != 0) + return (0); + + for (f = 0; f < SPA_FEATURES; f++) { + if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) + continue; + ASSERT(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET); + dataset_feature_count[f]++; + } + + if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { + remap_deadlist_count++; + } + + for (dsl_bookmark_node_t *dbn = + avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; + dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { + mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); + if (dbn->dbn_phys.zbm_redaction_obj != 0) + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if 
(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; + } + + if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + global_feature_count[SPA_FEATURE_LIVELIST]++; + } + + dump_objset(os); + close_objset(os, FTAG); + fuid_table_destroy(); + return (0); +} + +/* + * Block statistics. + */ +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; + uint64_t zb_gangs; + uint64_t zb_ditto_samevdev; + uint64_t zb_ditto_same_ms; + uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; +} zdb_blkstats_t; + +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. + */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) + +static const char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "other", + "Total", +}; + +#define ZB_TOTAL DN_MAX_LEVELS +#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) + +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_removing_size; + uint64_t zcb_checkpoint_size; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_total; + uint64_t zcb_lsize_total; + uint64_t zcb_asize_total; + uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; + uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] + [BPE_PAYLOAD_SIZE + 1]; + uint64_t zcb_start; + hrtime_t zcb_lastprint; + uint64_t zcb_totalasize; + uint64_t zcb_errors[256]; + int zcb_readfails; + int 
zcb_haderrors; + spa_t *zcb_spa; + uint32_t **zcb_vd_obsolete_counts; +} zdb_cb_t; + +/* test if two DVA offsets from same vdev are within the same metaslab */ +static boolean_t +same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t ms_shift = vd->vdev_ms_shift; + + return ((off1 >> ms_shift) == (off2 >> ms_shift)); +} + +/* + * Used to simplify reporting of the histogram data. + */ +typedef struct one_histo { + char *name; + uint64_t *count; + uint64_t *len; + uint64_t cumulative; +} one_histo_t; + +/* + * The number of separate histograms processed for psize, lsize and asize. + */ +#define NUM_HISTO 3 + +/* + * This routine will create a fixed column size output of three different + * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M + * the count, length and cumulative length of the psize, lsize and + * asize blocks. + * + * All three types of blocks are listed on a single line + * + * By default the table is printed in nicenumber format (e.g. 123K) but + * if the '-P' parameter is specified then the full raw number (parseable) + * is printed out. + */ +static void +dump_size_histograms(zdb_cb_t *zcb) +{ + /* + * A temporary buffer that allows us to convert a number into + * a string using zdb_nicenumber to allow either raw or human + * readable numbers to be output. + */ + char numbuf[32]; + + /* + * Define titles which are used in the headers of the tables + * printed by this routine. + */ + const char blocksize_title1[] = "block"; + const char blocksize_title2[] = "size"; + const char count_title[] = "Count"; + const char length_title[] = "Size"; + const char cumulative_title[] = "Cum."; + + /* + * Setup the histogram arrays (psize, lsize, and asize). 
+ */ + one_histo_t parm_histo[NUM_HISTO]; + + parm_histo[0].name = "psize"; + parm_histo[0].count = zcb->zcb_psize_count; + parm_histo[0].len = zcb->zcb_psize_len; + parm_histo[0].cumulative = 0; + + parm_histo[1].name = "lsize"; + parm_histo[1].count = zcb->zcb_lsize_count; + parm_histo[1].len = zcb->zcb_lsize_len; + parm_histo[1].cumulative = 0; + + parm_histo[2].name = "asize"; + parm_histo[2].count = zcb->zcb_asize_count; + parm_histo[2].len = zcb->zcb_asize_len; + parm_histo[2].cumulative = 0; + + + (void) printf("\nBlock Size Histogram\n"); + /* + * Print the first line titles + */ + if (dump_opt['P']) + (void) printf("\n%s\t", blocksize_title1); + else + (void) printf("\n%7s ", blocksize_title1); + + for (int j = 0; j < NUM_HISTO; j++) { + if (dump_opt['P']) { + if (j < NUM_HISTO - 1) { + (void) printf("%s\t\t\t", parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf(" %s", parm_histo[j].name); + } + } else { + if (j < NUM_HISTO - 1) { + /* Left aligned strings in the output */ + (void) printf("%-7s ", + parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf("%s", parm_histo[j].name); + } + } + } + (void) printf("\n"); + + /* + * Print the second line titles + */ + if (dump_opt['P']) { + (void) printf("%s\t", blocksize_title2); + } else { + (void) printf("%7s ", blocksize_title2); + } + + for (int i = 0; i < NUM_HISTO; i++) { + if (dump_opt['P']) { + (void) printf("%s\t%s\t%s\t", + count_title, length_title, cumulative_title); + } else { + (void) printf("%7s%7s%7s", + count_title, length_title, cumulative_title); + } + } + (void) printf("\n"); + + /* + * Print the rows + */ + for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { + + /* + * Print the first column showing the blocksize + */ + zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); + + if (dump_opt['P']) { + printf("%s", numbuf); + } else { + printf("%7s:", numbuf); + } + + /* + * Print the remaining set of 3 columns per size: + * for 
psize, lsize and asize + */ + for (int j = 0; j < NUM_HISTO; j++) { + parm_histo[j].cumulative += parm_histo[j].len[i]; + + zdb_nicenum(parm_histo[j].count[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].len[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].cumulative, + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + } + (void) printf("\n"); + } +} + +/* + * Fold one block pointer into the traversal statistics kept in zcb. + * Every block updates the per-level and per-type totals in zcb_type. + * Embedded blocks then update the embedded counters and stop there; all + * other blocks update the power-of-two psize/lsize/asize bins. A block + * for which zil_bp_tree_add() returns nonzero is not counted at all. + * Unless -L was given, the block is finally claimed with zio_claim(); + * for a dedup block one DDT reference is dropped first. + */ +static void +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, + dmu_object_type_t type) +{ + uint64_t refcnt = 0; + int i; + + ASSERT(type < ZDB_OT_TOTAL); + + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; + + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + + for (i = 0; i < 4; i++) { + /* i picks {own level, ZB_TOTAL} x {ZDB_OT_TOTAL, own type} */ + int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; + int t = (i & 1) ? type : ZDB_OT_TOTAL; + int equal; + zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; + + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_count++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. 
+ */ + unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; + + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) { + zb->zb_ditto_samevdev++; + + if (same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + } + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal != 0) { + zb->zb_ditto_samevdev++; + + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + } + break; + } + } + + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + + if (BP_IS_EMBEDDED(bp)) { + zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; + zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] + [BPE_GET_PSIZE(bp)]++; + return; + } + /* + * The binning histogram bins by powers of two up to + * SPA_MAXBLOCKSIZE rather than creating bins for + * every possible blocksize found in the pool. 
+ */ + int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + + zcb->zcb_psize_count[bin]++; + zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); + zcb->zcb_psize_total += BP_GET_PSIZE(bp); + + bin = highbit64(BP_GET_LSIZE(bp)) - 1; + + zcb->zcb_lsize_count[bin]++; + zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); + zcb->zcb_lsize_total += BP_GET_LSIZE(bp); + + bin = highbit64(BP_GET_ASIZE(bp)) - 1; + + zcb->zcb_asize_count[bin]++; + zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); + zcb->zcb_asize_total += BP_GET_ASIZE(bp); + + /* with -L only the statistics above are gathered; nothing is claimed */ + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(zcb->zcb_spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); + } + ddt_exit(ddt); + } + + /* + * A dedup block that still has references left after the decref + * above is claimed with txg 0; every other block is claimed at + * spa_min_claim_txg(). The claim must succeed. + */ + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); +} + +/* + * Completion callback for the verification reads issued by + * zdb_blkptr_cb(). Frees the data buffer, gives the block's psize back + * to spa_load_verify_bytes and wakes any waiter on spa_scrub_io_cv. A + * failed read that was not issued as speculative is recorded in + * zcb_errors[] and reported on stdout. 
+ */ +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_phys_t *zb = &zio->io_bookmark; + + abd_free(zio->io_abd); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + else + blkbuf[0] = '\0'; + + /* zb_level is printed with %lld, so pass it as a signed value */ + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + 
+/* + * traverse_pool() callback, invoked for each block pointer visited. + * Bookmarks at ZB_DNODE_LEVEL, holes and redacted blocks are ignored. + * Every other block is counted via zdb_count_block() and, when -c was + * given, read back asynchronously (metadata only for a single -c, all + * blocks for -cc); those reads complete in zdb_blkptr_done(). A progress + * estimate is written to stderr at most once a second. Always returns 0. + */ +static int +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + zdb_cb_t *zcb = arg; + dmu_object_type_t type; + boolean_t is_metadata; + + if (zb->zb_level == ZB_DNODE_LEVEL) + return (0); + + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) + return (0); + + type = BP_GET_TYPE(bp); + + zdb_count_block(zcb, zilog, bp, + (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); + + if (!BP_IS_EMBEDDED(bp) && + (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { + size_t size = BP_GET_PSIZE(bp); + abd_t *abd = abd_alloc(size, B_FALSE); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; + + /* throttle: wait while more than max_inflight_bytes are in flight */ + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_load_verify_bytes > max_inflight_bytes) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_load_verify_bytes += size; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, abd, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + } + + zcb->zcb_readfails = 0; + + /* only call gethrtime() every 100 blocks */ + static int iters; + if (++iters > 100) + iters = 0; + else + return (0); + + if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { + uint64_t now = gethrtime(); + char buf[10]; + uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; + /* + * Keep the rate and the ETA in 64 bits: both are quotients of + * 64-bit byte counts and the remaining-seconds estimate does + * not fit in an int on large pools. + */ + uint64_t kb_per_sec = + 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); + uint64_t sec_remaining = + (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); + + zfs_nicebytes(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "\r%5s completed (%4lluMB/s) " + "estimated time remaining: %lluhr %02llumin %02llusec ", + buf, (u_longlong_t)(kb_per_sec / 1024), + (u_longlong_t)(sec_remaining / 60 / 60), + (u_longlong_t)(sec_remaining / 60 % 60), + (u_longlong_t)(sec_remaining % 60)); + + zcb->zcb_lastprint = now; + } + + return (0); +} + +/* + * Range-tree callback that prints one leaked segment; arg is the vdev + * the segment belongs to. 
+ */ +static void +zdb_leak(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +/* + * Metaslab ops with no alloc routine. zdb_leak_init() installs them on + * the normal and log classes while ms_allocatable is overloaded. + */ +static metaslab_ops_t zdb_metaslab_ops = { + NULL /* alloc */ +}; + +/* + * Space-map-log callback used by zdb_claim_removing(); arg is the + * spa_vdev_removal_t. Applies one unflushed entry of the vdev being + * removed to svr_allocd_segs: SM_ALLOC adds the range, SM_FREE removes + * it. Entries for other vdevs, entries older than the metaslab's + * unflushed txg and entries at or past the mapping's max offset are + * skipped. + */ +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + 
metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT(vim != NULL); + if (offset >= vdev_indirect_mapping_max_offset(vim)) + return (0); + + if (sme->sme_type == SM_ALLOC) + range_tree_add(svr->svr_allocd_segs, offset, size); + else + range_tree_remove(svr->svr_allocd_segs, offset, size); + + return (0); +} + +/* + * Remap callback: claim one destination segment on vd at + * spa_min_claim_txg(). The claim must succeed. + */ +/* ARGSUSED */ +static void +claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + /* + * This callback was called through a remap from + * a device being removed. Therefore, the vdev that + * this callback is applied to is a concrete + * vdev. + */ + ASSERT(vdev_is_concrete(vd)); + + VERIFY0(metaslab_claim_impl(vd, offset, size, + spa_min_claim_txg(vd->vdev_spa))); +} + +/* + * Range-tree callback: remap one segment of the vdev passed in arg + * through vdev_indirect_ops and claim every piece it maps to. + */ +static void +claim_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + claim_segment_impl_cb, NULL); +} + +/* + * After accounting for all allocated blocks that are directly referenced, + * we might have missed a reference to a block from a partially complete + * (and thus unused) indirect mapping object. We perform a secondary pass + * through the metaslabs we have already mapped and claim the destination + * blocks. 
+ */ +static void +zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) +{ + if (dump_opt['L']) + return; + + if (spa->spa_vdev_removal == NULL) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(allocs)); + if (msp->ms_sm != NULL) + VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); + range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); + } + range_tree_destroy(allocs); + + iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for + * it yet. 
+ */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); + + zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +static int +increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + spa_t *spa = zcb->zcb_spa; + vdev_t *vd; + const dva_t *dva = &bp->blk_dva[0]; + + ASSERT(!bp_freed); + ASSERT(!dump_opt['L']); + ASSERT3U(BP_GET_NDVAS(bp), ==, 1); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); + ASSERT3P(vd, !=, NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); + + vdev_indirect_mapping_increment_obsolete_count( + vd->vdev_indirect_mapping, + DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + + return (0); +} + +static uint32_t * +zdb_load_obsolete_counts(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint64_t obsolete_sm_object; + uint32_t *counts; + + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); + counts = vdev_indirect_mapping_load_obsolete_counts(vim); + if (vd->vdev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + vd->vdev_obsolete_sm); + } + if (scip->scip_vdev == vd->vdev_id && + scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + 
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + prev_obsolete_sm); + space_map_close(prev_obsolete_sm); + } + return (counts); +} + +static void +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + ddt_bookmark_t ddb; + ddt_entry_t dde; + int error; + int p; + + ASSERT(!dump_opt['L']); + + bzero(&ddb, sizeof (ddb)); + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + + ASSERT(error == ENOENT); +} + +typedef struct checkpoint_sm_exclude_entry_arg { + vdev_t *cseea_vd; + uint64_t cseea_checkpoint_size; +} checkpoint_sm_exclude_entry_arg_t; + +static int +checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) +{ + checkpoint_sm_exclude_entry_arg_t *cseea = arg; + vdev_t *vd = cseea->cseea_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + ASSERT(sme->sme_type == SM_FREE); + + /* + * Since the vdev_checkpoint_sm exists in the vdev level + * and the ms_sm space maps exist in the metaslab level, + * an entry in the checkpoint space map could theoretically + * cross the boundaries of the metaslab that it belongs. + * + * In reality, because of the way that we populate and + * manipulate the checkpoint's space maps currently, + * there shouldn't be any entries that cross metaslabs. + * Hence the assertion below. 
+ * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * By removing the entry from the allocated segments we + * also verify that the entry is there to begin with. + */ + mutex_enter(&ms->ms_lock); + range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + mutex_exit(&ms->ms_lock); + + cseea->cseea_checkpoint_size += sme->sme_run; + return (0); +} + +static void +zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) +{ + spa_t *spa = vd->vdev_spa; + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + /* + * If there is no vdev_top_zap, we are in a pool whose + * version predates the pool checkpoint feature. + */ + if (vd->vdev_top_zap == 0) + return; + + /* + * If there is no reference of the vdev_checkpoint_sm in + * the vdev_top_zap, then one of the following scenarios + * is true: + * + * 1] There is no checkpoint + * 2] There is a checkpoint, but no checkpointed blocks + * have been freed yet + * 3] The current vdev is indirect + * + * In these cases we return immediately. 
+ */ + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + return; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, + &checkpoint_sm_obj)); + + checkpoint_sm_exclude_entry_arg_t cseea; + cseea.cseea_vd = vd; + cseea.cseea_checkpoint_size = 0; + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + + VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), + checkpoint_sm_exclude_entry_cb, &cseea)); + space_map_close(checkpoint_sm); + + zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; +} + +static void +zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) +{ + ASSERT(!dump_opt['L']); + + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); + zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); + } +} + +static int +count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + int64_t *ualloc_space = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + *ualloc_space += sme->sme_run; + else + *ualloc_space -= sme->sme_run; + + return (0); +} + +static int64_t +get_unflushed_alloc_space(spa_t *spa) +{ + if (dump_opt['L']) + return (0); + + int64_t ualloc_space = 0; + iterate_through_spacemap_logs(spa, count_unflushed_space_cb, + &ualloc_space); + return (ualloc_space); +} + +static int +load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) +{ + maptype_t *uic_maptype = 
arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (*uic_maptype == sme->sme_type) + range_tree_add(ms->ms_allocatable, offset, size); + else + range_tree_remove(ms->ms_allocatable, offset, size); + + return (0); +} + +static void +load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) +{ + iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); +} + +static void +load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + ASSERT3U(i, ==, vd->vdev_id); + + if (vd->vdev_ops == &vdev_indirect_ops) + continue; + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rloading concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)msp->ms_id, + (longlong_t)vd->vdev_ms_count); + + mutex_enter(&msp->ms_lock); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear the range_tree ops. 
+ */ + msp->ms_allocatable->rt_ops = NULL; + + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + msp->ms_allocatable, maptype)); + } + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); + } + } + + load_unflushed_to_ms_allocatables(spa, maptype); +} + +/* + * vm_idxp is an in-out parameter which (for indirect vdevs) is the + * index in vim_entries that has the first entry in this metaslab. + * On return, it will be set to the first entry after this metaslab. + */ +static void +load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, + uint64_t *vim_idxp) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + mutex_enter(&msp->ms_lock); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear the range_tree ops. + */ + msp->ms_allocatable->rt_ops = NULL; + + for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); + (*vim_idxp)++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[*vim_idxp]; + uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); + ASSERT3U(ent_offset, >=, msp->ms_start); + if (ent_offset >= msp->ms_start + msp->ms_size) + break; + + /* + * Mappings do not cross metaslab boundaries, + * because we create them by walking the metaslabs. 
+ */ + ASSERT3U(ent_offset + ent_len, <=, + msp->ms_start + msp->ms_size); + range_tree_add(msp->ms_allocatable, ent_offset, ent_len); + } + + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); +} + +static void +zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) +{ + ASSERT(!dump_opt['L']); + + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + ASSERT3U(c, ==, vd->vdev_id); + + if (vd->vdev_ops != &vdev_indirect_ops) + continue; + + /* + * Note: we don't check for mapping leaks on + * removing vdevs because their ms_allocatable's + * are used to look for leaks in allocated space. + */ + zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); + + /* + * Normally, indirect vdevs don't have any + * metaslabs. We want to set them up for + * zio_claim(). + */ + VERIFY0(vdev_metaslab_init(vd, 0)); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t vim_idx = 0; + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + + (void) fprintf(stderr, + "\rloading indirect vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vd->vdev_ms[m]->ms_id, + (longlong_t)vd->vdev_ms_count); + + load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], + &vim_idx); + } + ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); + } +} + +static void +zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + zcb->zcb_spa = spa; + + if (dump_opt['L']) + return; + + dsl_pool_t *dp = spa->spa_dsl_pool; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * We are going to be changing the meaning of the metaslab's + * ms_allocatable. Ensure that the allocator doesn't try to + * use the tree. 
+ */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); + + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. 
+ * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + zdb_ddt_leak_init(spa, zcb); + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static boolean_t +zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) +{ + boolean_t leaks = B_FALSE; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t total_leaked = 0; + boolean_t are_precise = B_FALSE; + + ASSERT(vim != NULL); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + uint64_t obsolete_bytes = 0; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + /* + * This is not very efficient but it's easy to + * verify correctness. 
+ */ + for (uint64_t inner_offset = 0; + inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); + inner_offset += 1 << vd->vdev_ashift) { + if (range_tree_contains(msp->ms_allocatable, + offset + inner_offset, 1 << vd->vdev_ashift)) { + obsolete_bytes += 1 << vd->vdev_ashift; + } + } + + int64_t bytes_leaked = obsolete_bytes - + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; + ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); + + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { + (void) printf("obsolete indirect mapping count " + "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (u_longlong_t)bytes_leaked); + } + total_leaked += ABS(bytes_leaked); + } + + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (!are_precise && total_leaked > 0) { + int pct_leaked = total_leaked * 100 / + vdev_indirect_mapping_bytes_mapped(vim); + (void) printf("cannot verify obsolete indirect mapping " + "counts of vdev %llu because precise feature was not " + "enabled when it was removed: %d%% (%llx bytes) of mapping" + "unreferenced\n", + (u_longlong_t)vd->vdev_id, pct_leaked, + (u_longlong_t)total_leaked); + } else if (total_leaked > 0) { + (void) printf("obsolete indirect mapping count mismatch " + "for vdev %llu -- %llx total bytes mismatched\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)total_leaked); + leaks |= B_TRUE; + } + + vdev_indirect_mapping_free_obsolete_counts(vim, + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; + + return (leaks); +} + +static boolean_t +zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) +{ + if (dump_opt['L']) + return (B_FALSE); + + boolean_t leaks = B_FALSE; + vdev_t *rvd = spa->spa_root_vdev; + for (unsigned c = 0; c < rvd->vdev_children; c++) { + 
vdev_t *vd = rvd->vdev_child[c]; + metaslab_group_t *mg __maybe_unused = vd->vdev_mg; + + if (zcb->zcb_vd_obsolete_counts[c] != NULL) { + leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); + + /* + * ms_allocatable has been overloaded + * to contain allocated segments. Now that + * we finished traversing all blocks, any + * block that remains in the ms_allocatable + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_allocatable. For indirect + * vdevs, space remaining in the tree + * represents parts of the mapping that are + * not referenced, which is not a bug. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_allocatable, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_allocatable, + zdb_leak, vd); + } + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } + } + } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; + + return (leaks); +} + +/* ARGSUSED */ +static int +count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 5) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return (0); +} + +/* + * Iterate over livelists which have been destroyed by the user but + * are still present in the MOS, waiting to be freed + */ +static void +iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + ASSERT0(err); + + zap_cursor_t zc; + 
zap_attribute_t attr; + dsl_deadlist_t ll; + /* NULL out os prior to dsl_deadlist_open in case it's garbage */ + ll.dl_os = NULL; + for (zap_cursor_init(&zc, mos, zap_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + dsl_deadlist_open(&ll, mos, attr.za_first_integer); + func(&ll, arg); + dsl_deadlist_close(&ll); + } + zap_cursor_fini(&zc); +} + +static int +bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (count_block_cb(arg, bp, tx)); +} + +static int +livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) +{ + zdb_cb_t *zbc = args; + bplist_t blks; + bplist_create(&blks); + /* determine which blocks have been alloc'd but not freed */ + VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); + /* count those blocks */ + (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); + bplist_destroy(&blks); + return (0); +} + +static void +livelist_count_blocks(dsl_deadlist_t *ll, void *arg) +{ + dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); +} + +/* + * Count the blocks in the livelists that have been destroyed by the user + * but haven't yet been freed. + */ +static void +deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) +{ + iterate_deleted_livelists(spa, livelist_count_blocks, zbc); +} + +static void +dump_livelist_cb(dsl_deadlist_t *ll, void *arg) +{ + ASSERT3P(arg, ==, NULL); + global_feature_count[SPA_FEATURE_LIVELIST]++; + dump_blkptr_list(ll, "Deleted Livelist"); + dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); +} + +/* + * Print out, register object references to, and increment feature counts for + * livelists that have been destroyed by the user but haven't yet been freed. 
+ */ +static void +deleted_livelists_dump_mos(spa_t *spa) +{ + uint64_t zap_obj; + objset_t *mos = spa->spa_meta_objset; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) /* no DMU_POOL_DELETED_CLONES entry: nothing to dump */ + return; + mos_obj_refd(zap_obj); /* NOTE(review): zap_obj looks unset if zap_lookup() fails with anything other than ENOENT; iterate_deleted_livelists() asserts err == 0 here -- confirm */ + iterate_deleted_livelists(spa, dump_livelist_cb, NULL); /* dump_livelist_cb() asserts that its arg is NULL */ +} + +static int +dump_block_stats(spa_t *spa) +{ + zdb_cb_t zcb; + zdb_blkstats_t *zb, *tzb; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; + boolean_t leaks = B_FALSE; + int e, c, err; + bp_embedded_type_t i; + + bzero(&zcb, sizeof (zcb)); + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); + + /* + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. + */ + bzero(&zcb, sizeof (zdb_cb_t)); + zdb_leak_init(spa, &zcb); + + /* + * If there's a deferred-free bplist, process that first. 
+ */ + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + bpobj_count_block_cb, &zcb, NULL); + + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + bpobj_count_block_cb, &zcb, NULL); + } + + zdb_claim_removing(spa, &zcb); + + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); + } + + deleted_livelists_count_blocks(spa, &zcb); + + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); + err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + for (c = 0; c < max_ncpus; c++) { + (void) zio_wait(spa->spa_async_zio_root[c]); + spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + } + ASSERT0(spa->spa_load_verify_bytes); + + /* + * Done after zio_wait() since zcb_haderrors is modified in + * zdb_blkptr_done() + */ + zcb.zcb_haderrors |= err; + + if (zcb.zcb_haderrors) { + (void) printf("\nError counts:\n\n"); + (void) printf("\t%5s %s\n", "errno", "count"); + for (e = 0; e < 256; e++) { + if (zcb.zcb_errors[e] != 0) { + (void) printf("\t%5d %llu\n", + e, (u_longlong_t)zcb.zcb_errors[e]); + } + } + } + + /* + * Report any leaked segments. 
+ */ + leaks |= zdb_leak_fini(spa, &zcb); + + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; + + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); + + total_alloc = norm_alloc + + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize + + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; + + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { + (void) printf("block traversal size %llu != alloc %llu " + "(%s %lld)\n", + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, + (dump_opt['L']) ? "unreachable" : "leaked", + (longlong_t)(total_alloc - total_found)); + leaks = B_TRUE; + } + + if (tzb->zb_count == 0) + return (2); + + (void) printf("\n"); + (void) printf("\t%-16s %14llu\n", "bp count:", + (u_longlong_t)tzb->zb_count); + (void) printf("\t%-16s %14llu\n", "ganged count:", + (longlong_t)tzb->zb_gangs); + (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", + (u_longlong_t)tzb->zb_lsize, + (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp physical:", (u_longlong_t)tzb->zb_psize, + (u_longlong_t)(tzb->zb_psize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_psize); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp allocated:", (u_longlong_t)tzb->zb_asize, + (u_longlong_t)(tzb->zb_asize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_asize); + (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", + "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu used: 
%5.2f%%\n", "Normal class:", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + + if (spa_special_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + if (spa_dedup_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_dedup_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_dedup_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Dedup class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { + if (zcb.zcb_embedded_blocks[i] == 0) + continue; + (void) printf("\n"); + (void) printf("\tadditional, non-pointer bps of type %u: " + "%10llu\n", + i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + + if (dump_opt['b'] >= 3) { + (void) printf("\t number of (compressed) bytes: " + "number of bps\n"); + dump_histogram(zcb.zcb_embedded_histogram[i], + sizeof (zcb.zcb_embedded_histogram[i]) / + sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + } + } + + if (tzb->zb_ditto_samevdev != 0) { + (void) printf("\tDittoed blocks on same vdev: %llu\n", + (longlong_t)tzb->zb_ditto_samevdev); + } + if (tzb->zb_ditto_same_ms != 0) { + (void) printf("\tDittoed blocks in same metaslab: %llu\n", + (longlong_t)tzb->zb_ditto_same_ms); + } + + for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + if (vim == NULL) { + continue; + } + + char mem[32]; + zdb_nicenum(vdev_indirect_mapping_num_entries(vim), + mem, vdev_indirect_mapping_size(vim)); + + (void) printf("\tindirect vdev id %llu has %llu segments " + "(%s in memory)\n", + (longlong_t)vd->vdev_id, + (longlong_t)vdev_indirect_mapping_num_entries(vim), 
mem); + } + + if (dump_opt['b'] >= 2) { + int l, t, level; + (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" + "\t avg\t comp\t%%Total\tType\n"); + + for (t = 0; t <= ZDB_OT_TOTAL; t++) { + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32], gang[32]; + const char *typename; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); + + if (t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; + + if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { + (void) printf("%6s\t%5s\t%5s\t%5s" + "\t%5s\t%5s\t%6s\t%s\n", + "-", + "-", + "-", + "-", + "-", + "-", + "-", + typename); + continue; + } + + for (l = ZB_TOTAL - 1; l >= -1; l--) { + level = (l == -1 ? ZB_TOTAL : l); + zb = &zcb.zcb_type[level][t]; + + if (zb->zb_asize == 0) + continue; + + if (dump_opt['b'] < 3 && level != ZB_TOTAL) + continue; + + if (level == 0 && zb->zb_asize == + zcb.zcb_type[ZB_TOTAL][t].zb_asize) + continue; + + zdb_nicenum(zb->zb_count, csize, + sizeof (csize)); + zdb_nicenum(zb->zb_lsize, lsize, + sizeof (lsize)); + zdb_nicenum(zb->zb_psize, psize, + sizeof (psize)); + zdb_nicenum(zb->zb_asize, asize, + sizeof (asize)); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg, + sizeof (avg)); + zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); + + (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5.2f\t%6.2f\t", + csize, lsize, psize, asize, avg, + (double)zb->zb_lsize / zb->zb_psize, + 100.0 * zb->zb_asize / tzb->zb_asize); + + if (level == ZB_TOTAL) + (void) printf("%s\n", typename); + else + (void) printf(" L%d %s\n", + level, typename); + + if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { + (void) printf("\t number of ganged " + "blocks: %s\n", gang); + } + + if (dump_opt['b'] >= 4) { + (void) 
printf("psize " + "(in 512-byte sectors): " + "number of blocks\n"); + dump_histogram(zb->zb_psize_histogram, + PSIZE_HISTO_SIZE, 0); + } + } + } + + /* Output a table summarizing block sizes in the pool */ + if (dump_opt['b'] >= 2) { + dump_size_histograms(&zcb); + } + } + + (void) printf("\n"); + + if (leaks) + return (2); + + if (zcb.zcb_haderrors) + return (3); + + return (0); +} + +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)BP_GET_FILL(bp), + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); 
+ avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], + &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + +static int +verify_device_removal_feature_counts(spa_t *spa) +{ + uint64_t dr_feature_refcount = 0; + uint64_t oc_feature_refcount = 0; + uint64_t indirect_vdev_count = 0; + uint64_t precise_vdev_count = 0; + uint64_t obsolete_counts_object_count = 0; + uint64_t obsolete_sm_count = 0; + uint64_t obsolete_counts_count = 0; + uint64_t scip_count = 0; + uint64_t obsolete_bpobj_count = 0; + int ret = 0; + + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + if (scip->scip_next_mapping_object != 0) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + (void) printf("Condensing indirect vdev %llu: new mapping " + "object %llu, prev obsolete sm %llu\n", + (u_longlong_t)scip->scip_vdev, 
+ (u_longlong_t)scip->scip_next_mapping_object, + (u_longlong_t)scip->scip_prev_obsolete_sm_object); + if (scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, + spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, + 0, vd->vdev_asize, 0)); + dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); + (void) printf("\n"); + space_map_close(prev_obsolete_sm); + } + + scip_count += 2; + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (vic->vic_mapping_object != 0) { + ASSERT(vd->vdev_ops == &vdev_indirect_ops || + vd->vdev_removing); + indirect_vdev_count++; + + if (vd->vdev_indirect_mapping->vim_havecounts) { + obsolete_counts_count++; + } + } + + boolean_t are_precise; + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (are_precise) { + ASSERT(vic->vic_mapping_object != 0); + precise_vdev_count++; + } + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + ASSERT(vic->vic_mapping_object != 0); + obsolete_sm_count++; + } + } + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], + &dr_feature_refcount); + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], + &oc_feature_refcount); + + if (dr_feature_refcount != indirect_vdev_count) { + ret = 1; + (void) printf("Number of indirect vdevs (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)indirect_vdev_count, + (u_longlong_t)dr_feature_refcount); + } else { + (void) printf("Verified device_removal feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)dr_feature_refcount); + } + + if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ) == 0) { + obsolete_bpobj_count++; + } + + + 
obsolete_counts_object_count = precise_vdev_count; + obsolete_counts_object_count += obsolete_sm_count; + obsolete_counts_object_count += obsolete_counts_count; + obsolete_counts_object_count += scip_count; + obsolete_counts_object_count += obsolete_bpobj_count; + obsolete_counts_object_count += remap_deadlist_count; + + if (oc_feature_refcount != obsolete_counts_object_count) { + ret = 1; + (void) printf("Number of obsolete counts objects (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)obsolete_counts_object_count, + (u_longlong_t)oc_feature_refcount); + (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " + "ob:%llu rd:%llu\n", + (u_longlong_t)precise_vdev_count, + (u_longlong_t)obsolete_sm_count, + (u_longlong_t)obsolete_counts_count, + (u_longlong_t)scip_count, + (u_longlong_t)obsolete_bpobj_count, + (u_longlong_t)remap_deadlist_count); + } else { + (void) printf("Verified indirect_refcount feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)oc_feature_refcount); + } + return (ret); +} + +static void +zdb_set_skip_mmp(char *target) +{ + spa_t *spa; + + /* + * Disable the activity check to allow examination of + * active pools. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL) { + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + } + mutex_exit(&spa_namespace_lock); +} + +#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" +/* + * Import the checkpointed state of the pool specified by the target + * parameter as readonly. The function also accepts a pool config + * as an optional parameter, else it attempts to infer the config by + * the name of the target pool. + * + * Note that the checkpointed state's pool name will be the name of + * the original pool with the above suffix appended to it. In addition, + * if the target is not a pool name (e.g. a path to a dataset) then + * the new_path parameter is populated with the updated path to + * reflect the fact that we are looking into the checkpointed state. 
+ * + * The function returns a newly-allocated copy of the name of the + * pool containing the checkpointed state. When this copy is no + * longer needed it should be freed with free(3C). Same thing + * applies to the new_path parameter if allocated. + */ +static char * +import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +{ + int error = 0; + char *poolname, *bogus_name = NULL; + + /* If the target is not a pool, the extract the pool name */ + char *path_start = strchr(target, '/'); + if (path_start != NULL) { + size_t poolname_len = path_start - target; + poolname = strndup(target, poolname_len); + } else { + poolname = target; + } + + if (cfg == NULL) { + zdb_set_skip_mmp(poolname); + error = spa_get_stats(poolname, &cfg, NULL, 0); + if (error != 0) { + fatal("Tried to read config of pool \"%s\" but " + "spa_get_stats() failed with error %d\n", + poolname, error); + } + } + + if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) + return (NULL); + fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); + + error = spa_import(bogus_name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | + ZFS_IMPORT_SKIP_MMP); + if (error != 0) { + fatal("Tried to import pool \"%s\" but spa_import() failed " + "with error %d\n", bogus_name, error); + } + + if (new_path != NULL && path_start != NULL) { + if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { + if (path_start != NULL) + free(poolname); + return (NULL); + } + } + + if (target != poolname) + free(poolname); + + return (bogus_name); +} + +typedef struct verify_checkpoint_sm_entry_cb_arg { + vdev_t *vcsec_vd; + + /* the following fields are only used for printing progress */ + uint64_t vcsec_entryid; + uint64_t vcsec_num_entries; +} verify_checkpoint_sm_entry_cb_arg_t; + +#define ENTRIES_PER_PROGRESS_UPDATE 10000 + +static int +verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) +{ + verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; + vdev_t *vd = 
vcsec->vcsec_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + ASSERT(sme->sme_type == SM_FREE); + + if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { + (void) fprintf(stderr, + "\rverifying vdev %llu, space map entry %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vcsec->vcsec_entryid, + (longlong_t)vcsec->vcsec_num_entries); + } + vcsec->vcsec_entryid++; + + /* + * See comment in checkpoint_sm_exclude_entry_cb() + */ + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * The entries in the vdev_checkpoint_sm should be marked as + * allocated in the checkpointed state of the pool, therefore + * their respective ms_allocateable trees should not contain them. + */ + mutex_enter(&ms->ms_lock); + range_tree_verify_not_present(ms->ms_allocatable, + sme->sme_offset, sme->sme_run); + mutex_exit(&ms->ms_lock); + + return (0); +} + +/* + * Verify that all segments in the vdev_checkpoint_sm are allocated + * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's + * ms_allocatable). + * + * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of + * each vdev in the current state of the pool to the metaslab space maps + * (ms_sm) of the checkpointed state of the pool. + * + * Note that the function changes the state of the ms_allocatable + * trees of the current spa_t. The entries of these ms_allocatable + * trees are cleared out and then repopulated from with the free + * entries of their respective ms_sm space maps. 
+ */ +static void +verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); + + for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; + vdev_t *current_vd = current_rvd->vdev_child[c]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * Since we don't allow device removal in a pool + * that has a checkpoint, we expect that all removed + * vdevs were removed from the pool before the + * checkpoint. + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + /* + * If the checkpoint space map doesn't exist, then nothing + * here is checkpointed so there's nothing to verify. + */ + if (current_vd->vdev_top_zap == 0 || + zap_contains(spa_meta_objset(current), + current_vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(current), + current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), + checkpoint_sm_obj, 0, current_vd->vdev_asize, + current_vd->vdev_ashift)); + + verify_checkpoint_sm_entry_cb_arg_t vcsec; + vcsec.vcsec_vd = ckpoint_vd; + vcsec.vcsec_entryid = 0; + vcsec.vcsec_num_entries = + space_map_length(checkpoint_sm) / sizeof (uint64_t); + VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), + verify_checkpoint_sm_entry_cb, &vcsec)); + if (dump_opt['m'] > 3) + dump_spacemap(current->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } + + /* + * If we've added vdevs since we took the checkpoint, ensure + * that their checkpoint space maps are empty. 
+ */ + if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { + for (uint64_t c = ckpoint_rvd->vdev_children; + c < current_rvd->vdev_children; c++) { + vdev_t *current_vd = current_rvd->vdev_child[c]; + ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +/* + * Verifies that all space that's allocated in the checkpoint is + * still allocated in the current version, by checking that everything + * in checkpoint's ms_allocatable (which is actually allocated, not + * allocatable/free) is not present in current's ms_allocatable. + * + * Note that the function changes the state of the ms_allocatable + * trees of both spas when called. The entries of all ms_allocatable + * trees are cleared out and then repopulated from their respective + * ms_sm space maps. In the checkpointed state we load the allocated + * entries, and in the current state we load the free entries. + */ +static void +verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); + load_concrete_ms_allocatable_trees(current, SM_FREE); + + for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; + vdev_t *current_vd = current_rvd->vdev_child[i]; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * See comment in verify_checkpoint_vdev_spacemaps() + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { + metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; + metaslab_t *current_msp = current_vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rverifying vdev %llu of %llu, " + "metaslab %llu of %llu ...", + (longlong_t)current_vd->vdev_id, + (longlong_t)current_rvd->vdev_children, + 
(longlong_t)current_vd->vdev_ms[m]->ms_id, + (longlong_t)current_vd->vdev_ms_count); + + /* + * We walk through the ms_allocatable trees that + * are loaded with the allocated blocks from the + * ms_sm spacemaps of the checkpoint. For each + * one of these ranges we ensure that none of them + * exists in the ms_allocatable trees of the + * current state which are loaded with the ranges + * that are currently free. + * + * This way we ensure that none of the blocks that + * are part of the checkpoint were freed by mistake. + */ + range_tree_walk(ckpoint_msp->ms_allocatable, + (range_tree_func_t *)range_tree_verify_not_present, + current_msp->ms_allocatable); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +static void +verify_checkpoint_blocks(spa_t *spa) +{ + ASSERT(!dump_opt['L']); + + spa_t *checkpoint_spa; + char *checkpoint_pool; + nvlist_t *config = NULL; + int error = 0; + + /* + * We import the checkpointed state of the pool (under a different + * name) so we can do verification on it against the current state + * of the pool. + */ + checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + NULL); + ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); + + error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but spa_open() failed with " + "error %d\n", checkpoint_pool, error); + } + + /* + * Ensure that ranges in the checkpoint space maps of each vdev + * are allocated according to the checkpointed state's metaslab + * space maps. + */ + verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); + + /* + * Ensure that allocated ranges in the checkpoint's metaslab + * space maps remain allocated in the metaslab space maps of + * the current state. + */ + verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); + + /* + * Once we are done, we get rid of the checkpointed state. 
+ */ + spa_close(checkpoint_spa, FTAG); + free(checkpoint_pool); +} + +static void +dump_leftover_checkpoint_blocks(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (vd->vdev_top_zap == 0) + continue; + + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + dump_spacemap(spa->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } +} + +static int +verify_checkpoint(spa_t *spa) +{ + uberblock_t checkpoint; + int error; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (0); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT && !dump_opt['L']) { + /* + * If the feature is active but the uberblock is missing + * then we must be in the middle of discarding the + * checkpoint. 
+ */ + (void) printf("\nPartially discarded checkpoint " + "state found:\n"); + if (dump_opt['m'] > 3) + dump_leftover_checkpoint_blocks(spa); + return (0); + } else if (error != 0) { + (void) printf("lookup error %d when looking for " + "checkpointed uberblock in MOS\n", error); + return (error); + } + dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); + + if (checkpoint.ub_checkpoint_txg == 0) { + (void) printf("\nub_checkpoint_txg not set in checkpointed " + "uberblock\n"); + error = 3; + } + + if (error == 0 && !dump_opt['L']) + verify_checkpoint_blocks(spa); + + return (error); +} + +/* ARGSUSED */ +static void +mos_leaks_cb(void *arg, uint64_t start, uint64_t size) +{ + for (uint64_t i = start; i < size; i++) { + (void) printf("MOS object %llu referenced but not allocated\n", + (u_longlong_t)i); + } +} + +static void +mos_obj_refd(uint64_t obj) +{ + if (obj != 0 && mos_refd_objs != NULL) + range_tree_add(mos_refd_objs, obj, 1); +} + +/* + * Call on a MOS object that may already have been referenced. 
+ */ +static void +mos_obj_refd_multiple(uint64_t obj) +{ + if (obj != 0 && mos_refd_objs != NULL && + !range_tree_contains(mos_refd_objs, obj, 1)) + range_tree_add(mos_refd_objs, obj, 1); +} + +static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + +static void +mos_leak_vdev(vdev_t *vd) +{ + mos_obj_refd(vd->vdev_dtl_object); + mos_obj_refd(vd->vdev_ms_array); + mos_obj_refd(vd->vdev_indirect_config.vic_births_object); + mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); + mos_obj_refd(vd->vdev_leaf_zap); + if (vd->vdev_checkpoint_sm != NULL) + mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); + if (vd->vdev_indirect_mapping != NULL) { + mos_obj_refd(vd->vdev_indirect_mapping-> + vim_phys->vimp_counts_object); + } + if (vd->vdev_obsolete_sm != NULL) + mos_obj_refd(vd->vdev_obsolete_sm->sm_object); + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + mos_obj_refd(space_map_object(ms->ms_sm)); + } + + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + mos_leak_vdev(vd->vdev_child[c]); + } +} + +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + +static int +dump_mos_leaks(spa_t *spa) +{ + int rv = 0; + objset_t *mos = 
spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + + /* Visit and mark all referenced objects in the MOS */ + + mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); + mos_obj_refd(spa->spa_pool_props_object); + mos_obj_refd(spa->spa_config_object); + mos_obj_refd(spa->spa_ddt_stat_object); + mos_obj_refd(spa->spa_feat_desc_obj); + mos_obj_refd(spa->spa_feat_enabled_txg_obj); + mos_obj_refd(spa->spa_feat_for_read_obj); + mos_obj_refd(spa->spa_feat_for_write_obj); + mos_obj_refd(spa->spa_history); + mos_obj_refd(spa->spa_errlog_last); + mos_obj_refd(spa->spa_errlog_scrub); + mos_obj_refd(spa->spa_all_vdev_zaps); + mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); + bpobj_count_refd(&spa->spa_deferred_bpobj); + mos_obj_refd(dp->dp_empty_bpobj); + bpobj_count_refd(&dp->dp_obsolete_bpobj); + bpobj_count_refd(&dp->dp_free_bpobj); + mos_obj_refd(spa->spa_l2cache.sav_object); + mos_obj_refd(spa->spa_spares.sav_object); + + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + + mos_obj_refd(spa->spa_condensing_indirect_phys. + scip_next_mapping_object); + mos_obj_refd(spa->spa_condensing_indirect_phys. 
+ scip_prev_obsolete_sm_object); + if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { + vdev_indirect_mapping_t *vim = + vdev_indirect_mapping_open(mos, + spa->spa_condensing_indirect_phys.scip_next_mapping_object); + mos_obj_refd(vim->vim_phys->vimp_counts_object); + vdev_indirect_mapping_close(vim); + } + deleted_livelists_dump_mos(spa); + + if (dp->dp_origin_snap != NULL) { + dsl_dataset_t *ds; + + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, + FTAG, &ds)); + count_ds_mos_objects(ds); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + + count_ds_mos_objects(dp->dp_origin_snap); + dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); + } + count_dir_mos_objects(dp->dp_mos_dir); + if (dp->dp_free_dir != NULL) + count_dir_mos_objects(dp->dp_free_dir); + if (dp->dp_leak_dir != NULL) + count_dir_mos_objects(dp->dp_leak_dir); + + mos_leak_vdev(spa->spa_root_vdev); + + for (uint64_t class = 0; class < DDT_CLASSES; class++) { + for (uint64_t type = 0; type < DDT_TYPES; type++) { + for (uint64_t cksum = 0; + cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { + ddt_t *ddt = spa->spa_ddt[cksum]; + mos_obj_refd(ddt->ddt_object[type][class]); + } + } + } + + /* + * Visit all allocated objects and make sure they are referenced. 
+ */ + uint64_t object = 0; + while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { + if (range_tree_contains(mos_refd_objs, object, 1)) { + range_tree_remove(mos_refd_objs, object, 1); + } else { + dmu_object_info_t doi; + const char *name; + dmu_object_info(mos, object, &doi); + if (doi.doi_type & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(doi.doi_type); + name = dmu_ot_byteswap[bswap].ob_name; + } else { + name = dmu_ot[doi.doi_type].ot_name; + } + + (void) printf("MOS object %llu (%s) leaked\n", + (u_longlong_t)object, name); + rv = 2; + } + } + (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); + if (!range_tree_is_empty(mos_refd_objs)) + rv = 2; + range_tree_vacate(mos_refd_objs, NULL, NULL); + range_tree_destroy(mos_refd_objs); + return (rv); +} + +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> 
vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + +static void +dump_zpool(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + int rc = 0; + + if (dump_opt['y']) { + livelist_metaslab_validate(spa); + } + + if (dump_opt['S']) { + dump_simulated_ddt(spa); + return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + + if (dump_opt['u']) + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); + if (dump_opt['M']) + dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } + + if (dump_opt['d'] || dump_opt['i']) { + spa_feature_t f; + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); + dump_objset(dp->dp_meta_objset); + + if (dump_opt['d'] >= 3) { + dsl_pool_t *dp = spa->spa_dsl_pool; + dump_full_bpobj(&spa->spa_deferred_bpobj, + 
"Deferred frees", 0); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_full_bpobj(&dp->dp_free_bpobj, + "Pool snapshot frees", 0); + } + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + dump_full_bpobj(&dp->dp_obsolete_bpobj, + "Pool obsolete blocks", 0); + } + + if (spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dump_bptree(spa->spa_meta_objset, + dp->dp_bptree_obj, + "Pool dataset frees"); + } + dump_dtl(spa->spa_root_vdev, 0); + } + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) + global_feature_count[f] = UINT64_MAX; + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + global_feature_count[SPA_FEATURE_LIVELIST] = 0; + + (void) dmu_objset_find(spa_name(spa), dump_one_objset, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + if (rc == 0 && !dump_opt['L']) + rc = dump_mos_leaks(spa); + + for (f = 0; f < SPA_FEATURES; f++) { + uint64_t refcount; + + uint64_t *arr; + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + if (global_feature_count[f] == UINT64_MAX) + continue; + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(global_feature_count[f]); + continue; + } + arr = global_feature_count; + } else { + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + arr = dataset_feature_count; + } + if (feature_get_refcount(spa, &spa_feature_table[f], + &refcount) == ENOTSUP) + continue; + if (arr[f] != refcount) { + (void) printf("%s feature refcount mismatch: " + "%lld consumers != %lld refcount\n", + spa_feature_table[f].fi_uname, + (longlong_t)arr[f], (longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified %s feature refcount " + "of %llu is correct\n", + spa_feature_table[f].fi_uname, + (longlong_t)refcount); + } + } + + if (rc == 0) + rc = verify_device_removal_feature_counts(spa); + } + + if (rc == 0 && (dump_opt['b'] || 
dump_opt['c'])) + rc = dump_block_stats(spa); + + if (rc == 0) + rc = verify_spacemap_refcounts(spa); + + if (dump_opt['s']) + show_pool_stats(spa); + + if (dump_opt['h']) + dump_history(spa); + + if (rc == 0) + rc = verify_checkpoint(spa); + + if (rc != 0) { + dump_debug_buffer(); + exit(rc); + } +} + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +static int flagbits[256]; +static char flagbitstr[16]; + +static void +zdb_print_blkptr(const blkptr_t *bp, int flags) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static void +zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) +{ + int i; + + for (i = 0; i < nbps; i++) + zdb_print_blkptr(&bp[i], flags); +} + +static void +zdb_dump_gbh(void *buf, int flags) +{ + zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +} + +static void +zdb_dump_block_raw(void *buf, uint64_t size, int flags) +{ + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array(buf, size); + VERIFY(write(fileno(stdout), buf, size) == size); +} + +static void +zdb_dump_block(char *label, void *buf, uint64_t size, int flags) +{ + uint64_t *d = (uint64_t *)buf; + unsigned nwords = size / sizeof (uint64_t); + int do_bswap = !!(flags & ZDB_FLAG_BSWAP); + unsigned i, j; + const char *hdr; + char *c; + + + if (do_bswap) + hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; + else + hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; + + (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); + +#ifdef _LITTLE_ENDIAN + /* correct the endianness */ + do_bswap = !do_bswap; +#endif + for (i = 0; i < nwords; i += 2) { + (void) printf("%06llx: %016llx %016llx ", + (u_longlong_t)(i * sizeof 
(uint64_t)), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); + + c = (char *)&d[i]; + for (j = 0; j < 2 * sizeof (uint64_t); j++) + (void) printf("%c", isprint(c[j]) ? c[j] : '.'); + (void) printf("\n"); + } +} + +/* + * There are two acceptable formats: + * leaf_name - For example: c1t0d0 or /tmp/ztest.0a + * child[.child]* - For example: 0.1.1 + * + * The second form can be used to specify arbitrary vdevs anywhere + * in the hierarchy. For example, in a pool with a mirror of + * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . + */ +static vdev_t * +zdb_vdev_lookup(vdev_t *vdev, const char *path) +{ + char *s, *p, *q; + unsigned i; + + if (vdev == NULL) + return (NULL); + + /* First, assume the x.x.x.x format */ + i = strtoul(path, &s, 10); + if (s == path || (s && *s != '.' && *s != '\0')) + goto name; + if (i >= vdev->vdev_children) + return (NULL); + + vdev = vdev->vdev_child[i]; + if (s && *s == '\0') + return (vdev); + return (zdb_vdev_lookup(vdev, s+1)); + +name: + for (i = 0; i < vdev->vdev_children; i++) { + vdev_t *vc = vdev->vdev_child[i]; + + if (vc->vdev_path == NULL) { + vc = zdb_vdev_lookup(vc, path); + if (vc == NULL) + continue; + else + return (vc); + } + + p = strrchr(vc->vdev_path, '/'); + p = p ? 
p + 1 : vc->vdev_path; + q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; + + if (strcmp(vc->vdev_path, path) == 0) + return (vc); + if (strcmp(p, path) == 0) + return (vc); + if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) + return (vc); + } + + return (NULL); +} + +static int +name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) +{ + dsl_dataset_t *ds; + + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, + NULL, &ds); + if (error != 0) { + (void) fprintf(stderr, "failed to hold objset %llu: %s\n", + (u_longlong_t)objset_id, strerror(error)); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (error); + } + dsl_dataset_name(ds, outstr); + dsl_dataset_rele(ds, NULL); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (0); +} + +static boolean_t +zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) +{ + char *s0, *s1; + + if (sizes == NULL) + return (B_FALSE); + + s0 = strtok(sizes, "/"); + if (s0 == NULL) + return (B_FALSE); + s1 = strtok(NULL, "/"); + *lsize = strtoull(s0, NULL, 16); + *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; + return (*lsize >= *psize && *psize > 0); +} + +#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) + +static boolean_t +zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, + uint64_t psize, int flags) +{ + boolean_t exceeded = B_FALSE; + /* + * We don't know how the data was compressed, so just try + * every decompress function at every inflated blocksize. + */ + void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; + int *cfuncp = cfuncs; + uint64_t maxlsize = SPA_MAXBLOCKSIZE; + uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | + ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | + (getenv("ZDB_NO_ZLE") ? 
ZIO_COMPRESS_MASK(ZLE) : 0); + *cfuncp++ = ZIO_COMPRESS_LZ4; + *cfuncp++ = ZIO_COMPRESS_LZJB; + mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); + for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) + if (((1ULL << c) & mask) == 0) + *cfuncp++ = c; + + /* + * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this + * could take a while and we should let the user know + * we are not stuck. On the other hand, printing progress + * info gets old after a while. User can specify 'v' flag + * to see the progression. + */ + if (lsize == psize) + lsize += SPA_MINBLOCKSIZE; + else + maxlsize = lsize; + for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { + for (cfuncp = cfuncs; *cfuncp; cfuncp++) { + if (flags & ZDB_FLAG_VERBOSE) { + (void) fprintf(stderr, + "Trying %05llx -> %05llx (%s)\n", + (u_longlong_t)psize, + (u_longlong_t)lsize, + zio_compress_table[*cfuncp].\ + ci_name); + } + + /* + * We randomize lbuf2, and decompress to both + * lbuf and lbuf2. This way, we will know if + * decompression fill exactly to lsize. + */ + VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); + + if (zio_decompress_data(*cfuncp, pabd, + lbuf, psize, lsize, NULL) == 0 && + zio_decompress_data(*cfuncp, pabd, + lbuf2, psize, lsize, NULL) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (*cfuncp != 0) + break; + } + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize > maxlsize) { + exceeded = B_TRUE; + } + buf = lbuf; + if (*cfuncp == ZIO_COMPRESS_ZLE) { + printf("\nZLE decompression was selected. If you " + "suspect the results are wrong,\ntry avoiding ZLE " + "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); + } + + return (exceeded); +} + +/* + * Read a block from a pool and print it out. 
The syntax of the + * block descriptor is: + * + * pool:vdev_specifier:offset:[lsize/]psize[:flags] + * + * pool - The name of the pool you wish to read from + * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) + * offset - offset, in hex, in bytes + * size - Amount of data to read, in hex, in bytes + * flags - A string of characters specifying options + * b: Decode a blkptr at given offset within block + * c: Calculate and display checksums + * d: Decompress data before dumping + * e: Byteswap data before dumping + * g: Display data as a gang block header + * i: Display as an indirect block + * r: Dump raw data to stdout + * v: Verbose + * + */ +static void +zdb_read_block(char *thing, spa_t *spa) +{ + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + int flags = 0; + uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; + zio_t *zio; + vdev_t *vd; + abd_t *pabd; + void *lbuf, *buf; + char *s, *p, *dup, *vdev, *flagstr, *sizes; + int i, error; + boolean_t borrowed = B_FALSE, found = B_FALSE; + + dup = strdup(thing); + s = strtok(dup, ":"); + vdev = s ? s : ""; + s = strtok(NULL, ":"); + offset = strtoull(s ? s : "", NULL, 16); + sizes = strtok(NULL, ":"); + s = strtok(NULL, ":"); + flagstr = strdup(s ? 
s : ""); + + s = NULL; + if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) + s = "invalid size(s)"; + if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) + s = "size must be a multiple of sector size"; + if (!IS_P2ALIGNED(offset, DEV_BSIZE)) + s = "offset must be a multiple of sector size"; + if (s) { + (void) printf("Invalid block specifier: %s - %s\n", thing, s); + goto done; + } + + for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { + for (i = 0; i < strlen(flagstr); i++) { + int bit = flagbits[(uchar_t)flagstr[i]]; + + if (bit == 0) { + (void) printf("***Ignoring flag: %c\n", + (uchar_t)flagstr[i]); + continue; + } + found = B_TRUE; + flags |= bit; + + p = &flagstr[i + 1]; + if (*p != ':' && *p != '\0') { + int j = 0, nextbit = flagbits[(uchar_t)*p]; + char *end, offstr[8] = { 0 }; + if ((bit == ZDB_FLAG_PRINT_BLKPTR) && + (nextbit == 0)) { + /* look ahead to isolate the offset */ + while (nextbit == 0 && + strchr(flagbitstr, *p) == NULL) { + offstr[j] = *p; + j++; + if (i + j > strlen(flagstr)) + break; + p++; + nextbit = flagbits[(uchar_t)*p]; + } + blkptr_offset = strtoull(offstr, &end, + 16); + i += j; + } else if (nextbit == 0) { + (void) printf("***Ignoring flag arg:" + " '%c'\n", (uchar_t)*p); + } + } + } + } + if (blkptr_offset % sizeof (blkptr_t)) { + printf("Block pointer offset 0x%llx " + "must be divisible by 0x%x\n", + (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); + goto done; + } + if (found == B_FALSE && strlen(flagstr) > 0) { + printf("Invalid flag arg: '%s'\n", flagstr); + goto done; + } + + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); + if (vd == NULL) { + (void) printf("***Invalid vdev: %s\n", vdev); + free(dup); + return; + } else { + if (vd->vdev_path) + (void) fprintf(stderr, "Found vdev: %s\n", + vd->vdev_path); + else + (void) fprintf(stderr, "Found vdev type: %s\n", + vd->vdev_ops->vdev_op_type); + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + lbuf = umem_alloc(SPA_MAXBLOCKSIZE, 
UMEM_NOFAIL); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + zio = zio_root(spa, NULL, NULL, 0); + + if (vd == vd->vdev_top) { + /* + * Treat this as a normal block read. + */ + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); + } else { + /* + * Treat this as a vdev child I/O. + */ + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + + error = zio_wait(zio); + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error) { + (void) printf("Read of %s failed, error: %d\n", thing, error); + goto out; + } + + uint64_t orig_lsize = lsize; + buf = lbuf; + if (flags & ZDB_FLAG_DECOMPRESS) { + boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, + lsize, psize, flags); + if (failed) { + (void) printf("Decompress of %s failed\n", thing); + goto out; + } + } else { + buf = abd_borrow_buf_copy(pabd, lsize); + borrowed = B_TRUE; + } + /* + * Try to detect invalid block pointer. If invalid, try + * decompressing. 
+ */ + if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && + !(flags & ZDB_FLAG_DECOMPRESS)) { + const blkptr_t *b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == + B_FALSE) { + abd_return_buf_copy(pabd, buf, lsize); + borrowed = B_FALSE; + buf = lbuf; + boolean_t failed = zdb_decompress_block(pabd, buf, + lbuf, lsize, psize, flags); + b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (failed || zfs_blkptr_verify(spa, b, B_FALSE, + BLK_VERIFY_LOG) == B_FALSE) { + printf("invalid block pointer at this DVA\n"); + goto out; + } + } + } + + if (flags & ZDB_FLAG_PRINT_BLKPTR) + zdb_print_blkptr((blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); + else if (flags & ZDB_FLAG_RAW) + zdb_dump_block_raw(buf, lsize, flags); + else if (flags & ZDB_FLAG_INDIRECT) + zdb_dump_indirect((blkptr_t *)buf, + orig_lsize / sizeof (blkptr_t), flags); + else if (flags & ZDB_FLAG_GBH) + zdb_dump_gbh(buf, flags); + else + zdb_dump_block(thing, buf, lsize, flags); + + /* + * If :c was specified, iterate through the checksum table to + * calculate and display each checksum for our specified + * DVA and length. 
+ */ + if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && + !(flags & ZDB_FLAG_GBH)) { + zio_t *czio; + (void) printf("\n"); + for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; + ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { + + if ((zio_checksum_table[ck].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED) || + ck == ZIO_CHECKSUM_NOPARITY) { + continue; + } + BP_SET_CHECKSUM(bp, ck); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + czio->io_bp = bp; + + if (vd == vd->vdev_top) { + zio_nowait(zio_read(czio, spa, bp, pabd, psize, + NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_DONT_RETRY, NULL)); + } else { + zio_nowait(zio_vdev_child_io(czio, bp, vd, + offset, pabd, psize, ZIO_TYPE_READ, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + error = zio_wait(czio); + if (error == 0 || error == ECKSUM) { + zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + ck_zio->io_offset = + DVA_GET_OFFSET(&bp->blk_dva[0]); + ck_zio->io_bp = bp; + zio_checksum_compute(ck_zio, ck, pabd, lsize); + printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + zio_checksum_table[ck].ci_name, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + zio_wait(ck_zio); + } else { + printf("error %d reading block\n", error); + } + spa_config_exit(spa, SCL_STATE, FTAG); + } + } + + if (borrowed) + abd_return_buf_copy(pabd, buf, lsize); + +out: + abd_free(pabd); + umem_free(lbuf, SPA_MAXBLOCKSIZE); +done: + free(flagstr); + free(dup); +} + +static void +zdb_embedded_block(char *thing) +{ + blkptr_t bp; + unsigned long long *words = (void *)&bp; + char *buf; + int err; + + bzero(&bp, sizeof (bp)); + err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" + 
/*
 * zdb entry point: parse the command line, set up the userland ZFS
 * environment, open the requested pool or dataset read-only and run the
 * selected dumps.  Returns the last error recorded (0 on success); most
 * failures end the program earlier through usage(), fatal() or exit().
 */
int
main(int argc, char **argv)
{
	int c;
	struct rlimit rl = { 1024, 1024 };
	spa_t *spa = NULL;
	objset_t *os = NULL;
	int dump_all = 1;
	int verbose = 0;
	int error = 0;
	char **searchdirs = NULL;
	int nsearch = 0;
	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
	nvlist_t *policy = NULL;
	uint64_t max_txg = UINT64_MAX;
	int64_t objset_id = -1;
	int flags = ZFS_IMPORT_MISSING_LOG;
	int rewind = ZPOOL_NEVER_REWIND;
	char *spa_config_path_env, *objset_str;
	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
	nvlist_t *cfg = NULL;

	(void) setrlimit(RLIMIT_NOFILE, &rl);
	(void) enable_extended_FILE_stdio(-1, -1);

	dprintf_setup(&argc, argv);

	/*
	 * If there is an environment variable SPA_CONFIG_PATH it overrides
	 * default spa_config_path setting. If -U flag is specified it will
	 * override this environment variable settings once again.
	 */
	spa_config_path_env = getenv("SPA_CONFIG_PATH");
	if (spa_config_path_env != NULL)
		spa_config_path = spa_config_path_env;

	/*
	 * For performance reasons, we set this tunable down. We do so before
	 * the arg parsing section so that the user can override this value if
	 * they choose.
	 */
	zfs_btree_verify_intensity = 3;

	while ((c = getopt(argc, argv,
	    "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) {
		switch (c) {
		/*
		 * Options that select something to dump: count them and
		 * turn off the "dump everything" default.
		 */
		case 'b':
		case 'c':
		case 'C':
		case 'd':
		case 'D':
		case 'E':
		case 'G':
		case 'h':
		case 'i':
		case 'l':
		case 'm':
		case 'M':
		case 'O':
		case 'R':
		case 's':
		case 'S':
		case 'u':
		case 'y':
		case 'Z':
			dump_opt[c]++;
			dump_all = 0;
			break;
		/* Counted as well, but these leave dump_all untouched. */
		case 'A':
		case 'e':
		case 'F':
		case 'k':
		case 'L':
		case 'P':
		case 'q':
		case 'X':
			dump_opt[c]++;
			break;
		case 'Y':
			zfs_reconstruct_indirect_combinations_max = INT_MAX;
			zfs_deadman_enabled = 0;
			break;
		/* NB: Sort single match options below. */
		case 'I':
			max_inflight_bytes = strtoull(optarg, NULL, 0);
			if (max_inflight_bytes == 0) {
				(void) fprintf(stderr, "maximum number "
				    "of inflight bytes must be greater "
				    "than 0\n");
				usage();
			}
			break;
		case 'o':
			error = set_global_var(optarg);
			if (error != 0)
				usage();
			break;
		case 'p':
			/*
			 * Each -p appends one search directory; the array
			 * is reallocated one slot larger every time.
			 */
			if (searchdirs == NULL) {
				searchdirs = umem_alloc(sizeof (char *),
				    UMEM_NOFAIL);
			} else {
				char **tmp = umem_alloc((nsearch + 1) *
				    sizeof (char *), UMEM_NOFAIL);
				bcopy(searchdirs, tmp, nsearch *
				    sizeof (char *));
				umem_free(searchdirs,
				    nsearch * sizeof (char *));
				searchdirs = tmp;
			}
			searchdirs[nsearch++] = optarg;
			break;
		case 't':
			max_txg = strtoull(optarg, NULL, 0);
			if (max_txg < TXG_INITIAL) {
				(void) fprintf(stderr, "incorrect txg "
				    "specified: %s\n", optarg);
				usage();
			}
			break;
		case 'U':
			spa_config_path = optarg;
			if (spa_config_path[0] != '/') {
				(void) fprintf(stderr,
				    "cachefile must be an absolute path "
				    "(i.e. start with a slash)\n");
				usage();
			}
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			flags = ZFS_IMPORT_VERBATIM;
			break;
		case 'x':
			vn_dumpdir = optarg;
			break;
		default:
			usage();
			break;
		}
	}

	if (!dump_opt['e'] && searchdirs != NULL) {
		(void) fprintf(stderr, "-p option requires use of -e\n");
		usage();
	}
	if (dump_opt['d']) {
		/* <pool>[/<dataset | objset id> is accepted */
		/*
		 * NOTE(review): this looks at argv[2] before argv has been
		 * advanced by optind, i.e. it assumes the target is the
		 * second command-line word -- confirm that is intended.
		 */
		if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL &&
		    objset_str++ != NULL) {
			char *endptr;
			errno = 0;
			objset_id = strtoull(objset_str, &endptr, 0);
			/* dataset 0 is the same as opening the pool */
			if (errno == 0 && endptr != objset_str &&
			    objset_id != 0) {
				target_is_spa = B_FALSE;
				dataset_lookup = B_TRUE;
			} else if (objset_id != 0) {
				printf("failed to open objset %s "
				    "%llu %s", objset_str,
				    (u_longlong_t)objset_id,
				    strerror(errno));
				exit(1);
			}
			/* normal dataset name not an objset ID */
			if (endptr == objset_str) {
				objset_id = -1;
			}
		}
	}

#if defined(_LP64)
	/*
	 * ZDB does not typically re-read blocks; therefore limit the ARC
	 * to 256 MB, which can be used entirely for metadata.
	 */
	zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT;
	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
#endif

	/*
	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
	 * "zdb -b" uses traversal prefetch which uses async reads.
	 * For good performance, let several of them be active at once.
	 */
	zfs_vdev_async_read_max_active = 10;

	/*
	 * Disable reference tracking for better performance.
	 */
	reference_tracking_enable = B_FALSE;

	/*
	 * Do not fail spa_load when spa_load_verify fails. This is needed
	 * to load non-idle pools.
	 */
	spa_load_verify_dryrun = B_TRUE;

	kernel_init(SPA_MODE_READ);

	if (dump_all)
		verbose = MAX(verbose, 1);

	/*
	 * With no dump option given, switch on every option except the
	 * ones listed in the string below; then add the verbosity to each
	 * option that is set.
	 */
	for (c = 0; c < 256; c++) {
		if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
			dump_opt[c] = 1;
		if (dump_opt[c])
			dump_opt[c] += verbose;
	}

	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
	zfs_recover = (dump_opt['A'] > 1);

	/* from here on argv[0] is the first non-option argument */
	argc -= optind;
	argv += optind;
	if (argc < 2 && dump_opt['R'])
		usage();

	if (dump_opt['E']) {
		if (argc != 1)
			usage();
		zdb_embedded_block(argv[0]);
		return (0);
	}

	if (argc < 1) {
		if (!dump_opt['e'] && dump_opt['C']) {
			dump_cachefile(spa_config_path);
			return (0);
		}
		usage();
	}

	if (dump_opt['l'])
		return (dump_label(argv[0]));

	if (dump_opt['O']) {
		if (argc != 2)
			usage();
		dump_opt['v'] = verbose + 3;
		return (dump_path(argv[0], argv[1]));
	}

	if (dump_opt['X'] || dump_opt['F'])
		rewind = ZPOOL_DO_REWIND |
		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

	/* load policy: the txg limit from -t and the rewind mode from -F/-X */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
		fatal("internal error: %s", strerror(ENOMEM));

	error = 0;
	target = argv[0];

	/*
	 * A target containing '/' or '@' names something inside a pool:
	 * keep a separate copy truncated to the pool name and strip one
	 * trailing '/' from the target itself.
	 */
	if (strpbrk(target, "/@") != NULL) {
		size_t targetlen;

		target_pool = strdup(target);
		*strpbrk(target_pool, "/@") = '\0';

		target_is_spa = B_FALSE;
		targetlen = strlen(target);
		if (targetlen && target[targetlen - 1] == '/')
			target[targetlen - 1] = '\0';
	} else {
		target_pool = target;
	}

	if (dump_opt['e']) {
		importargs_t args = { 0 };

		args.paths = nsearch;
		args.path = searchdirs;
		args.can_be_active = B_TRUE;

		error = zpool_find_config(NULL, target_pool, &cfg, &args,
		    &libzpool_config_ops);

		if (error == 0) {

			if (nvlist_add_nvlist(cfg,
			    ZPOOL_LOAD_POLICY, policy) != 0) {
				fatal("can't open '%s': %s",
				    target, strerror(ENOMEM));
			}

			if (dump_opt['C'] > 1) {
				(void) printf("\nConfiguration for import:\n");
				dump_nvlist(cfg, 8);
			}

			/*
			 * Disable the activity check to allow examination of
			 * active pools.
			 */
			error = spa_import(target_pool, cfg, NULL,
			    flags | ZFS_IMPORT_SKIP_MMP);
		}
	}

	/*
	 * import_checkpointed_state makes the assumption that the
	 * target pool that we pass it is already part of the spa
	 * namespace. Because of that we need to make sure to call
	 * it always after the -e option has been processed, which
	 * imports the pool to the namespace if it's not in the
	 * cachefile.
	 */
	char *checkpoint_pool = NULL;
	char *checkpoint_target = NULL;
	if (dump_opt['k']) {
		checkpoint_pool = import_checkpointed_state(target, cfg,
		    &checkpoint_target);

		if (checkpoint_target != NULL)
			target = checkpoint_target;
	}

	/* target_pool differs from target when it is the strdup() copy above */
	if (target_pool != target)
		free(target_pool);

	if (error == 0) {
		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
			ASSERT(checkpoint_pool != NULL);
			ASSERT(checkpoint_target == NULL);

			error = spa_open(checkpoint_pool, &spa, FTAG);
			if (error != 0) {
				fatal("Tried to open pool \"%s\" but "
				    "spa_open() failed with error %d\n",
				    checkpoint_pool, error);
			}

		} else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
			zdb_set_skip_mmp(target);
			error = spa_open_rewind(target, &spa, FTAG, policy,
			    NULL);
			if (error) {
				/*
				 * If we're missing the log device then
				 * try opening the pool after clearing the
				 * log state.
				 */
				mutex_enter(&spa_namespace_lock);
				if ((spa = spa_lookup(target)) != NULL &&
				    spa->spa_log_state == SPA_LOG_MISSING) {
					spa->spa_log_state = SPA_LOG_CLEAR;
					error = 0;
				}
				mutex_exit(&spa_namespace_lock);

				if (!error) {
					error = spa_open_rewind(target, &spa,
					    FTAG, policy, NULL);
				}
			}
		} else if (strpbrk(target, "#") != NULL) {
			/* a '#' in the target is handled as a bookmark dump */
			dsl_pool_t *dp;
			error = dsl_pool_hold(target, FTAG, &dp);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
			dsl_pool_rele(dp, FTAG);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			return (error);
		} else {
			zdb_set_skip_mmp(target);
			if (dataset_lookup == B_TRUE) {
				/*
				 * Use the supplied id to get the name
				 * for open_objset.
				 */
				error = spa_open(target, &spa, FTAG);
				if (error == 0) {
					error = name_from_objset_id(spa,
					    objset_id, dsname);
					spa_close(spa, FTAG);
					if (error == 0)
						target = dsname;
				}
			}
			if (error == 0)
				error = open_objset(target, FTAG, &os);
			if (error == 0)
				spa = dmu_objset_spa(os);
		}
	}
	nvlist_free(policy);

	if (error)
		fatal("can't open '%s': %s", target, strerror(error));

	/*
	 * Set the pool failure mode to panic in order to prevent the pool
	 * from suspending.  A suspended I/O will have no way to resume and
	 * can prevent the zdb(8) command from terminating as expected.
	 */
	if (spa != NULL)
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	/* skip the target; what remains are object, metaslab or block args */
	argv++;
	argc--;
	if (!dump_opt['R']) {
		/* without -R, flagbits maps letters to object-range flags */
		flagbits['d'] = ZOR_FLAG_DIRECTORY;
		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
		flagbits['z'] = ZOR_FLAG_ZAP;
		flagbits['A'] = ZOR_FLAG_ALL_TYPES;

		if (argc > 0 && dump_opt['d']) {
			zopt_object_args = argc;
			zopt_object_ranges = calloc(zopt_object_args,
			    sizeof (zopt_object_range_t));
			for (unsigned i = 0; i < zopt_object_args; i++) {
				int err;
				char *msg = NULL;

				err = parse_object_range(argv[i],
				    &zopt_object_ranges[i], &msg);
				if (err != 0)
					fatal("Bad object or range: '%s': %s\n",
					    argv[i], msg ? msg : "");
			}
		} else if (argc > 0 && dump_opt['m']) {
			zopt_metaslab_args = argc;
			zopt_metaslab = calloc(zopt_metaslab_args,
			    sizeof (uint64_t));
			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
				errno = 0;
				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
				if (zopt_metaslab[i] == 0 && errno != 0)
					fatal("bad number %s: %s", argv[i],
					    strerror(errno));
			}
		}
		if (os != NULL) {
			dump_objset(os);
		} else if (zopt_object_args > 0 && !dump_opt['m']) {
			dump_objset(spa->spa_meta_objset);
		} else {
			dump_zpool(spa);
		}
	} else {
		/* with -R, flagbits maps letters to zdb_read_block() flags */
		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
		flagbits['c'] = ZDB_FLAG_CHECKSUM;
		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
		flagbits['e'] = ZDB_FLAG_BSWAP;
		flagbits['g'] = ZDB_FLAG_GBH;
		flagbits['i'] = ZDB_FLAG_INDIRECT;
		flagbits['r'] = ZDB_FLAG_RAW;
		flagbits['v'] = ZDB_FLAG_VERBOSE;

		for (int i = 0; i < argc; i++)
			zdb_read_block(argv[i], spa);
	}

	if (dump_opt['k']) {
		free(checkpoint_pool);
		if (!target_is_spa)
			free(checkpoint_target);
	}

	/* release whichever handle was opened above */
	if (os != NULL) {
		close_objset(os, FTAG);
	} else {
		spa_close(spa, FTAG);
	}

	fuid_table_destroy();

	dump_debug_buffer();

	kernel_fini();

	return (error);
}
000000000000..49579811efbb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zdb/zdb.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2017 Spectra Logic Corp Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _ZDB_H +#define _ZDB_H + +void dump_intent_log(zilog_t *); +extern uint8_t dump_opt[256]; + +#endif /* _ZDB_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb_il.c b/sys/contrib/openzfs/cmd/zdb/zdb_il.c new file mode 100644 index 000000000000..c12178effae0 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zdb/zdb_il.c @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +/* + * Print intent log header and statistics. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/spa_impl.h> +#include <sys/abd.h> + +#include "zdb.h" + +extern uint8_t dump_opt[256]; + +static char tab_prefix[4] = "\t\t\t"; + +static void +print_log_bp(const blkptr_t *bp, const char *prefix) +{ + char blkbuf[BP_SPRINTF_LEN]; + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s%s\n", prefix, blkbuf); +} + +/* ARGSUSED */ +static void +zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) +{ + lr_create_t *lr = arg; + time_t crtime = lr->lr_crtime[0]; + char *name, *link; + lr_attr_t *lrattr; + + name = (char *)(lr + 1); + + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", tab_prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", tab_prefix, name); + } + + (void) printf("%s%s", tab_prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", + tab_prefix, (u_longlong_t)lr->lr_doid, + (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid), + (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid), + 
(longlong_t)lr->lr_mode); + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + tab_prefix, + (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, + (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); +} + +/* ARGSUSED */ +static void +zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) +{ + lr_remove_t *lr = arg; + + (void) printf("%sdoid %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) +{ + lr_link_t *lr = arg; + + (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, + (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) +{ + lr_rename_t *lr = arg; + char *snm = (char *)(lr + 1); + char *tnm = snm + strlen(snm) + 1; + + (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, + (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); + (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); +} + +/* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + + for (size_t i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ +static void +zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) +{ + lr_write_t *lr = arg; + abd_t *data; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int error; + + (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); + + if (txtype == TX_WRITE2 || verbose < 5) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= 
spa_min_claim_txg(zilog->zl_spa) ? + "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + (void) printf("%s<hole>\n", tab_prefix); + return; + } + if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + (void) printf("%s<block already committed>\n", + tab_prefix); + return; + } + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); + error = zio_wait(zio_read(NULL, zilog->zl_spa, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + if (error) + goto out; + } else { + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); + } + + (void) printf("%s", tab_prefix); + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 
20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); + (void) printf("\n"); + +out: + abd_free(data); +} + +/* ARGSUSED */ +static void +zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) +{ + lr_truncate_t *lr = arg; + + (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); +} + +/* ARGSUSED */ +static void +zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) +{ + lr_setattr_t *lr = arg; + time_t atime = (time_t)lr->lr_atime[0]; + time_t mtime = (time_t)lr->lr_mtime[0]; + + (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); + + if (lr->lr_mask & AT_MODE) { + (void) printf("%sAT_MODE %llo\n", tab_prefix, + (longlong_t)lr->lr_mode); + } + + if (lr->lr_mask & AT_UID) { + (void) printf("%sAT_UID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_uid); + } + + if (lr->lr_mask & AT_GID) { + (void) printf("%sAT_GID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_gid); + } + + if (lr->lr_mask & AT_SIZE) { + (void) printf("%sAT_SIZE %llu\n", tab_prefix, + (u_longlong_t)lr->lr_size); + } + + if (lr->lr_mask & AT_ATIME) { + (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_atime[0], + (u_longlong_t)lr->lr_atime[1], + ctime(&atime)); + } + + if (lr->lr_mask & AT_MTIME) { + (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_mtime[0], + (u_longlong_t)lr->lr_mtime[1], + ctime(&mtime)); + } +} + +/* ARGSUSED */ +static void +zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) +{ + lr_acl_t *lr = arg; + + (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); +} + +typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); +typedef struct zil_rec_info { + zil_prt_rec_func_t zri_print; + const char *zri_name; + uint64_t zri_count; +} zil_rec_info_t; + +static zil_rec_info_t 
zil_rec_info[TX_MAX_TYPE] = { + {.zri_print = NULL, .zri_name = "Total "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, + {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, + {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, + {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, + {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, +}; + +/* + * zil_parse() record callback: print a one-line summary of the log record + * and, at verbosity 3 or higher, its type-specific details (or "(encrypted)" + * for an encrypted objset). Updates the per-type and total record counts. + * Always returns 0. + */ +/* ARGSUSED */ +static int +print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) +{ + int txtype; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + /* reduce size of txtype to strip off TX_CI bit */ + txtype = lr->lrc_txtype; + + ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); + ASSERT(lr->lrc_txg); + + /* + * The ASSERT above offers no protection where assertions are + * disabled, so never index zil_rec_info[] with an out-of-range + * type read from a damaged log; report the record and skip it. + */ + if ((uint_t)txtype >= TX_MAX_TYPE) { + (void) printf("\t\t%s<unknown txtype %d> len %6llu, " + "txg %llu, seq %llu\n", + (lr->lrc_txtype & TX_CI) ? "CI-" : "", txtype, + (u_longlong_t)lr->lrc_reclen, + (u_longlong_t)lr->lrc_txg, + (u_longlong_t)lr->lrc_seq); + return (0); + } + + (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n", + (lr->lrc_txtype & TX_CI) ?
"CI-" : "", + zil_rec_info[txtype].zri_name, + (u_longlong_t)lr->lrc_reclen, + (u_longlong_t)lr->lrc_txg, + (u_longlong_t)lr->lrc_seq); + + if (txtype && verbose >= 3) { + if (!zilog->zl_os->os_encrypted) { + zil_rec_info[txtype].zri_print(zilog, txtype, lr); + } else { + (void) printf("%s(encrypted)\n", tab_prefix); + } + } + + zil_rec_info[txtype].zri_count++; + zil_rec_info[0].zri_count++; + + return (0); +} + +/* ARGSUSED */ +static int +print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + char blkbuf[BP_SPRINTF_LEN + 10]; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + const char *claim; + + if (verbose <= 3) + return (0); + + if (verbose >= 5) { + (void) strcpy(blkbuf, ", "); + snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), bp); + } else { + blkbuf[0] = '\0'; + } + + if (claim_txg != 0) + claim = "already claimed"; + else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) + claim = "will claim"; + else + claim = "won't claim"; + + (void) printf("\tBlock seqno %llu, %s%s\n", + (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); +} + +/* + * Print the per-type record counts gathered in zil_rec_info[], right-aligned + * to the width of the total. Types with a zero count are shown only at + * verbosity 3 or higher; nothing is listed when no records were counted. + */ +static void +print_log_stats(int verbose) +{ + unsigned i; + int w = 1; + + if (verbose > 3) + (void) printf("\n"); + + if (zil_rec_info[0].zri_count == 0) + return; + + /* + * w = number of decimal digits in the total. Dividing the 64-bit + * count cannot overflow, unlike multiplying up a power of ten. + */ + for (uint64_t n = zil_rec_info[0].zri_count; n >= 10; n /= 10) + w++; + + for (i = 0; i < TX_MAX_TYPE; i++) + if (zil_rec_info[i].zri_count || verbose >= 3) + (void) printf("\t\t%s %*llu\n", + zil_rec_info[i].zri_name, w, + (u_longlong_t)zil_rec_info[i].zri_count); + (void) printf("\n"); +} + +/* ARGSUSED */ +void +dump_intent_log(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int i; + + if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) + return; + + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, +
(u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); + + for (i = 0; i < TX_MAX_TYPE; i++) + zil_rec_info[i].zri_count = 0; + + /* see comment in zil_claim() or zil_check_log_chain() */ + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return; + + if (verbose >= 2) { + (void) printf("\n"); + (void) zil_parse(zilog, print_log_block, print_log_record, NULL, + zh->zh_claim_txg, B_FALSE); + print_log_stats(verbose); + } +} diff --git a/sys/contrib/openzfs/cmd/zed/.gitignore b/sys/contrib/openzfs/cmd/zed/.gitignore new file mode 100644 index 000000000000..76557bb6bb3a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/.gitignore @@ -0,0 +1 @@ +/zed diff --git a/sys/contrib/openzfs/cmd/zed/Makefile.am b/sys/contrib/openzfs/cmd/zed/Makefile.am new file mode 100644 index 000000000000..4bd8ac4a53e6 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/Makefile.am @@ -0,0 +1,49 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS) + +SUBDIRS = zed.d + +sbin_PROGRAMS = zed + +ZED_SRC = \ + zed.c \ + zed.h \ + zed_conf.c \ + zed_conf.h \ + zed_disk_event.c \ + zed_disk_event.h \ + zed_event.c \ + zed_event.h \ + zed_exec.c \ + zed_exec.h \ + zed_file.c \ + zed_file.h \ + zed_log.c \ + zed_log.h \ + zed_strings.c \ + zed_strings.h + +FMA_SRC = \ + agents/zfs_agents.c \ + agents/zfs_agents.h \ + agents/zfs_diagnosis.c \ + agents/zfs_mod.c \ + agents/zfs_retire.c \ + agents/fmd_api.c \ + agents/fmd_api.h \ + agents/fmd_serd.c \ + agents/fmd_serd.h + +zed_SOURCES = $(ZED_SRC) $(FMA_SRC) + +zed_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS) +zed_LDFLAGS = -pthread + 
+EXTRA_DIST = agents/README.md diff --git a/sys/contrib/openzfs/cmd/zed/agents/README.md b/sys/contrib/openzfs/cmd/zed/agents/README.md new file mode 100644 index 000000000000..e35b97668a9d --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/README.md @@ -0,0 +1,112 @@ +## Fault Management Logic for ZED ## + +The integration of Fault Management Daemon (FMD) logic from illumos +is being deployed in three phases. This logic is encapsulated in +several software modules inside ZED. + +### ZED+FM Phase 1 ### + +All the phase 1 work is in the current master branch. Phase 1 work includes: + +* Add new paths to the persistent VDEV label for device matching. +* Add a disk monitor for generating _disk-add_ and _disk-change_ events. +* Add support for automated VDEV auto-online, auto-replace and auto-expand. +* Expand the statechange event to include all VDEV state transitions. + +### ZED+FM Phase 2 (WIP) ### + +The phase 2 work primarily entails the _Diagnosis Engine_ and the +_Retire Agent_ modules. It also includes infrastructure to support a +crude FMD environment to host these modules. For additional +information see the **FMD Components in ZED** and **Implementation +Notes** sections below. + +### ZED+FM Phase 3 ### + +Future work will add additional functionality and will likely include: + +* Add FMD module garbage collection (periodically call `fmd_module_gc()`). +* Add real module property retrieval (currently hard-coded in accessors). +* Additional diagnosis telemetry (like latency outliers and SMART data). +* Export FMD module statistics. +* Zedlet parallel execution and resiliency (add watchdog). + +### ZFS Fault Management Overview ### + +The primary purpose of ZFS fault management is automated diagnosis +and isolation of VDEV faults. A fault is something we can associate +with an impact (e.g. loss of data redundancy) and a corrective action +(e.g. offline or replace a disk). A typical ZFS fault management stack +is composed of _error detectors_ (e.g.
`zfs_ereport_post()`), a _disk +monitor_, a _diagnosis engine_ and _response agents_. + +After detecting a software error, the ZFS kernel module sends error +events to the ZED user daemon which in turn routes the events to its +internal FMA modules based on their event subscriptions. Likewise, if +a disk is added or changed in the system, the disk monitor sends disk +events which are consumed by a response agent. + +### FMD Components in ZED ### + +There are three FMD modules (aka agents) that are now built into ZED. + + 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`) + 2. A _Retire Agent_ module (`agents/zfs_retire.c`) + 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`) + +To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum +ereports and feeds them into a Soft Error Rate Discrimination (SERD) +algorithm which will generate a corresponding fault diagnosis when the +tracked VDEV encounters **N** events in a given **T** time window. The +initial N and T values for the SERD algorithm are estimates inherited +from illumos (10 errors in 10 minutes). + +In turn, a **Retire Agent** responds to diagnosed faults by isolating +the faulty VDEV. It will notify the ZFS kernel module of the new VDEV +state (degraded or faulted). The retire agent is also responsible for +managing hot spares across all pools. When it encounters a device fault +or a device removal it will replace the device with an appropriate +spare if available. + +Finally, a **Disk Add Agent** responds to events from a libudev disk +monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or +expand the associated VDEV. This agent is also known as the `zfs_mod` +or Sysevent Loadable Module (SLM) on the illumos platform. The added +disk is matched to a specific VDEV using its device id, physical path +or VDEV GUID. + +Note that the _auto-replace_ feature (aka hot plug) is opt-in and you +must set the pool's `autoreplace` property to enable it. 
The new disk +will be matched to the corresponding leaf VDEV by physical location +and labeled with a GPT partition before replacing the original VDEV +in the pool. + +### Implementation Notes ### + +* The FMD module API required for logic modules is emulated and implemented + in the `fmd_api.c` and `fmd_serd.c` source files. This support includes + module registration, memory allocation, module property accessors, basic + case management, one-shot timers and SERD engines. + For detailed information on the FMD module API, see the document -- + _"Fault Management Daemon Programmer's Reference Manual"_. + +* The event subscriptions for the modules (located in a module specific + configuration file on illumos) are currently hard-coded into the ZED + `zfs_agent_dispatch()` function. + +* The FMD modules are called one at a time from a single thread that + consumes events queued to the modules. These events are sourced from + the normal ZED events and also include events posted from the diagnosis + engine and the libudev disk event monitor. + +* The FMD code modules have minimal changes and were intentionally left + as similar as possible to their upstream source files. + +* The sysevent namespace in ZED differs from illumos. For example: + * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"` + * Linux uses `"sysevent.fs.zfs.vdev_remove"` + +* The FMD Modules port was produced by Intel Federal, LLC under award + number B609815 between the U.S. Department of Energy (DOE) and Intel + Federal, LLC. + diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c new file mode 100644 index 000000000000..607b387ca3a8 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * This file implements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. 
+ */ + +#include <sys/types.h> +#include <sys/fm/protocol.h> +#include <uuid/uuid.h> +#include <signal.h> +#include <strings.h> +#include <time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). 
+ */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). 
+ */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. 
+ */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, 
cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, 
"fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void +fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr 
!= NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = 
(fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. + */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, 
/*
 * sourced from fmd_string.c
 *
 * Match string 's' against pattern 'p', in which '*' matches any run of
 * characters (including none) and every other character matches itself.
 * A NULL 's' is treated as the empty string; a NULL 'p' never matches.
 *
 * Returns 1 when the whole string matches the whole pattern, 0 otherwise.
 */
static int
fmd_strmatch(const char *s, const char *p)
{
	if (p == NULL)
		return (0);

	if (s == NULL)
		s = "";	/* treat NULL string as the empty string */

	for (;; s++) {
		char pc = *p++;

		/* Pattern exhausted: match only if the string is too. */
		if (pc == '\0')
			return (*s == '\0');

		if (pc == '*') {
			/* consecutive *'s can be collapsed */
			while (*p == '*')
				p++;

			/* A trailing '*' swallows the rest of the string. */
			if (*p == '\0')
				return (1);

			/* Try the remaining pattern at every suffix of s. */
			for (; *s != '\0'; s++) {
				if (fmd_strmatch(s, p) != 0)
					return (1);
			}

			return (0);
		}

		/* Literal character: must equal the current string char. */
		if (pc != *s)
			return (0);
	}
}
+ */ +void +fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + fmd_event_t faux_event = {0}; + int64_t *tv; + uint_t n; + + /* + * Will need to normalized this if we persistently store the case data + */ + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) + faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; + else + faux_event.ev_hrt = 0; + + ops->fmdo_recv(hdl, &faux_event, nvl, class); + + mp->mod_stats.ms_accepted.fmds_value.ui64++; + + /* TBD - should we initiate fm_module_gc() periodically? */ +} diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h new file mode 100644 index 000000000000..4f06fb244b7b --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include <sys/types.h> +#include <sys/time.h> +#include <time.h> +#include <libnvpair.h> +#include <stdarg.h> +#include <umem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 /* case has been solved */ +#define FMD_CF_ISOLATED 0x04 /* case has been isolated */ +#define FMD_CF_REPAIRED 0x08 /* case has been repaired */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ + + +#define FMD_TYPE_BOOL 0 /* int */ +#define FMD_TYPE_INT32 1 /* int32_t */ +#define FMD_TYPE_UINT32 2 /* uint32_t */ +#define FMD_TYPE_INT64 3 /* int64_t */ +#define 
FMD_TYPE_UINT64 4 /* uint64_t */ +#define FMD_TYPE_TIME 5 /* uint64_t */ +#define FMD_TYPE_SIZE 6 /* uint64_t */ + +typedef struct fmd_prop { + const char *fmdp_name; /* property name */ + uint_t fmdp_type; /* property type (see above) */ + const char *fmdp_defv; /* default value */ +} fmd_prop_t; + +typedef struct fmd_stat { + char fmds_name[32]; /* statistic name */ + uint_t fmds_type; /* statistic type (see above) */ + char fmds_desc[64]; /* statistic description */ + union { + int bool; /* FMD_TYPE_BOOL */ + int32_t i32; /* FMD_TYPE_INT32 */ + uint32_t ui32; /* FMD_TYPE_UINT32 */ + int64_t i64; /* FMD_TYPE_INT64 */ + uint64_t ui64; /* FMD_TYPE_UINT64 */ + } fmds_value; +} fmd_stat_t; + +typedef struct fmd_hdl_ops { + void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); + void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); + void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); + void (*fmdo_stats)(fmd_hdl_t *); + void (*fmdo_gc)(fmd_hdl_t *); +} fmd_hdl_ops_t; + +#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ +#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ +#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ + +typedef struct fmd_hdl_info { + const char *fmdi_desc; /* fmd client description string */ + const char *fmdi_vers; /* fmd client version string */ + const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ + const fmd_prop_t *fmdi_props; /* array of configuration props */ +} fmd_hdl_info_t; + +extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); +extern void fmd_hdl_unregister(fmd_hdl_t *); + +extern void fmd_hdl_setspecific(fmd_hdl_t *, void *); +extern void *fmd_hdl_getspecific(fmd_hdl_t *); + +#define FMD_SLEEP UMEM_NOFAIL + +extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); +extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); +extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); + +extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); +extern void fmd_hdl_strfree(fmd_hdl_t *, 
char *); + +extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); +extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); + +extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); +extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); + +#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ +#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ + +extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); +extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); +extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); + +extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); +extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); + +extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); +extern void fmd_case_uuclose(fmd_hdl_t *, const char *); +extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); +extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); +extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); + +extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *); +extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); +extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); + +extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); +extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); + +extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); +extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); +extern void 
fmd_buf_read(fmd_hdl_t *, fmd_case_t *, + const char *, void *, size_t); +extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, + const char *, const void *, size_t); +extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); + +extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); +extern void fmd_serd_destroy(fmd_hdl_t *, const char *); +extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern void fmd_serd_reset(fmd_hdl_t *, const char *); +extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); +extern int fmd_serd_fired(fmd_hdl_t *, const char *); +extern int fmd_serd_empty(fmd_hdl_t *, const char *); + +extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); +extern void fmd_timer_remove(fmd_hdl_t *, id_t); + +extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, + const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); + +extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); + +#define FMD_HAS_FAULT_FRU 0 +#define FMD_HAS_FAULT_ASRU 1 +#define FMD_HAS_FAULT_RESOURCE 2 + +extern void fmd_repair_fru(fmd_hdl_t *, const char *); +extern int fmd_repair_asru(fmd_hdl_t *, const char *); + +extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); +extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); + +/* + * ZED Specific Interfaces + */ + +extern fmd_hdl_t *fmd_module_hdl(const char *); +extern boolean_t fmd_module_initialized(fmd_hdl_t *); +extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); + +/* ZFS FMA Retire Agent */ +extern void _zfs_retire_init(fmd_hdl_t *); +extern void _zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c new file mode 100644 index 000000000000..d4ec37fb7691 --- 
/dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include <assert.h> +#include <stddef.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/list.h> +#include <sys/time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? 
new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + 
shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). 
+ */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + if (sep == NULL) + return; + + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= 
FMD_SERD_DIRTY; + } +} diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h new file mode 100644 index 000000000000..c35c9acc7785 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_SERD_H +#define _FMD_SERD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/list.h> +#include <sys/time.h> + +typedef struct fmd_serd_elem { + list_node_t se_list; /* linked list forward/back pointers */ + hrtime_t se_hrt; /* upper bound on event hrtime */ +} fmd_serd_elem_t; + +typedef struct fmd_serd_eng { + char *sg_name; /* string name for this engine */ + struct fmd_serd_eng *sg_next; /* next engine on hash chain */ + list_t sg_list; /* list of fmd_serd_elem_t's */ + uint_t sg_count; /* count of events in sg_list */ + uint_t sg_flags; /* engine flags (see below) */ + uint_t sg_n; /* engine N parameter (event count) */ + hrtime_t sg_t; /* engine T parameter (nanoseconds) */ +} fmd_serd_eng_t; + +#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ +#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ + +typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); + +typedef struct fmd_serd_hash { + fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ + uint_t sh_hashlen; /* length of hash bucket array */ + uint_t sh_count; /* count of engines in hash */ +} fmd_serd_hash_t; + +extern void fmd_serd_hash_create(fmd_serd_hash_t *); +extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); +extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); + +extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, + const char *, uint32_t, hrtime_t); + +extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); +extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); + +extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); +extern int fmd_serd_eng_fired(fmd_serd_eng_t *); +extern int fmd_serd_eng_empty(fmd_serd_eng_t *); + +extern void fmd_serd_eng_reset(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_SERD_H */ diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c 
b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c new file mode 100644 index 000000000000..006e0ab99f47 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> + */ + +#include <libnvpair.h> +#include <libzfs.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/list.h> +#include <sys/time.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <pthread.h> +#include <unistd.h> + +#include "zfs_agents.h" +#include "fmd_api.h" +#include "../zed_log.h" + +/* + * agent dispatch code + */ + +static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER; +static list_t agent_events; /* list of pending events */ +static int agent_exiting; + +typedef struct agent_event { + char ae_class[64]; + char ae_subclass[32]; + nvlist_t *ae_nvl; + list_node_t ae_node; +} agent_event_t; + +pthread_t g_agents_tid; + +libzfs_handle_t *g_zfs_hdl; + +/* guid search data */ +typedef enum device_type { + DEVICE_TYPE_L2ARC, /* l2arc device */ + DEVICE_TYPE_SPARE, /* spare device */ + DEVICE_TYPE_PRIMARY /* any primary pool storage device */ +} device_type_t; + +typedef struct guid_search { + uint64_t gs_pool_guid; + uint64_t gs_vdev_guid; + char *gs_devid; + device_type_t gs_vdev_type; + uint64_t gs_vdev_expandtime; /* vdev expansion time */ +} guid_search_t; + +/* + * Walks the vdev tree 
recursively looking for a matching devid. + * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise. + */ +static boolean_t +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY; + return (B_TRUE); + } + } + } + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; + return (B_TRUE); + } + } + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_SPARE; + return (B_TRUE); + } + } + } + /* + * On a devid match, grab the vdev guid and expansion time, if any. 
+ */ + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, + &gsp->gs_vdev_expandtime); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + (void) zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. 
+ */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + guid_search_t search = { 0 }; + device_type_t devtype = DEVICE_TYPE_PRIMARY; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + /* + * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or + * ZFS_EV_POOL_GUID may be missing so find them. + */ + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; + + /* + * We want to avoid reporting "remove" events coming from + * libudev for VDEVs which were expanded recently (10s) and + * avoid activating spares in response to partitions being + * deleted and created in rapid succession. 
+ */ + if (search.gs_vdev_expandtime != 0 && + search.gs_vdev_expandtime + 10 > tv.tv_sec) { + zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' " + "for recently expanded device '%s'", EC_DEV_REMOVE, + search.gs_devid); + goto out; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + switch (devtype) { + case DEVICE_TYPE_L2ARC: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + VDEV_TYPE_L2CACHE); + break; + case DEVICE_TYPE_SPARE: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE); + break; + case DEVICE_TYPE_PRIMARY: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK); + break; + } + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + +out: + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf + */ + if (strstr(class, "ereport.fs.zfs.") != NULL || + strstr(class, "resource.fs.zfs.") != NULL || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 || + strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class); + } + + /* + * The retire agent subscribes to the following events. 
+ * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-retire.conf + * + * NOTE: faults events come directly from our diagnosis engine + * and will not pass through the zfs kernel module. + */ + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, "resource.fs.zfs.removed") == 0 || + strcmp(class, "resource.fs.zfs.statechange") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class); + } + + /* + * The SLM module only consumes disk events and vdev check events + * + * NOTE: disk events come directly from disk monitor and will + * not pass through the zfs kernel module. + */ + if (strstr(class, "EC_dev_") != NULL || + strcmp(class, EC_ZFS) == 0) { + (void) zfs_slm_event(class, subclass, nvl); + } +} + +/* + * Events are consumed and dispatched from this thread + * An agent can also post an event so event list lock + * is not held when calling an agent. + * One event is consumed at a time. 
+ */ +static void * +zfs_agent_consumer_thread(void *arg) +{ + for (;;) { + agent_event_t *event; + + (void) pthread_mutex_lock(&agent_lock); + + /* wait for an event to show up */ + while (!agent_exiting && list_is_empty(&agent_events)) + (void) pthread_cond_wait(&agent_cond, &agent_lock); + + if (agent_exiting) { + (void) pthread_mutex_unlock(&agent_lock); + zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: " + "exiting"); + return (NULL); + } + + if ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + + (void) pthread_mutex_unlock(&agent_lock); + + /* dispatch to all event subscribers */ + zfs_agent_dispatch(event->ae_class, event->ae_subclass, + event->ae_nvl); + + nvlist_free(event->ae_nvl); + free(event); + continue; + } + + (void) pthread_mutex_unlock(&agent_lock); + } + + return (NULL); +} + +void +zfs_agent_init(libzfs_handle_t *zfs_hdl) +{ + fmd_hdl_t *hdl; + + g_zfs_hdl = zfs_hdl; + + if (zfs_slm_init() != 0) + zed_log_die("Failed to initialize zfs slm"); + zed_log_msg(LOG_INFO, "Add Agent: init"); + + hdl = fmd_module_hdl("zfs-diagnosis"); + _zfs_diagnosis_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs diagnosis"); + + hdl = fmd_module_hdl("zfs-retire"); + _zfs_retire_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs retire"); + + list_create(&agent_events, sizeof (agent_event_t), + offsetof(struct agent_event, ae_node)); + + if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread, + NULL) != 0) { + list_destroy(&agent_events); + zed_log_die("Failed to initialize agents"); + } +} + +void +zfs_agent_fini(void) +{ + fmd_hdl_t *hdl; + agent_event_t *event; + + agent_exiting = 1; + (void) pthread_cond_signal(&agent_cond); + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_agents_tid, NULL); + + /* drain any pending events */ + while ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, 
event); + nvlist_free(event->ae_nvl); + free(event); + } + + list_destroy(&agent_events); + + if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) { + _zfs_retire_fini(hdl); + fmd_hdl_unregister(hdl); + } + if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) { + _zfs_diagnosis_fini(hdl); + fmd_hdl_unregister(hdl); + } + + zed_log_msg(LOG_INFO, "Add Agent: fini"); + zfs_slm_fini(); + + g_zfs_hdl = NULL; +} diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h new file mode 100644 index 000000000000..d1a459139b1e --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. 
+ */
+
+#ifndef	ZFS_AGENTS_H
+#define	ZFS_AGENTS_H
+
+#include <libzfs.h>
+#include <libnvpair.h>
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Agent abstraction presented to ZED
+ *
+ * zfs_agent_init() / zfs_agent_fini() start and stop the agents.
+ * zfs_agent_post_event() takes a class, a subclass (may be NULL) and the
+ * event nvlist; the nvlist is duplicated, so the caller keeps ownership.
+ */
+extern void zfs_agent_init(libzfs_handle_t *);
+extern void zfs_agent_fini(void);
+extern void zfs_agent_post_event(const char *, const char *, nvlist_t *);
+
+/*
+ * ZFS Sysevent Linkable Module (SLM)
+ *
+ * zfs_slm_event() takes the same (class, subclass, nvlist) triple.
+ */
+extern int zfs_slm_init(void);
+extern void zfs_slm_fini(void);
+extern void zfs_slm_event(const char *, const char *, nvlist_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* !ZFS_AGENTS_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
new file mode 100644
index 000000000000..0b27f6702ee8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <libzfs.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+/*
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'.  This
+ * #define reserves enough space for two 64-bit hex values plus the length of
+ * the longest string.
+ */
+#define	MAX_SERDLEN	(16 * 2 + sizeof ("zfs___checksum"))
+
+/*
+ * On-disk case structure.  This must maintain backwards compatibility with
+ * previous versions of the DE.  By default, any members appended to the end
+ * will be filled with zeros if they don't exist in a previous version.
+ */
+typedef struct zfs_case_data {
+	uint64_t	zc_version;	/* CASE_DATA_VERSION_* */
+	uint64_t	zc_ena;		/* ENA of the ereport that opened it */
+	uint64_t	zc_pool_guid;
+	uint64_t	zc_vdev_guid;	/* 0 for a pool-level case */
+	int		zc_pool_state;	/* pool context of the first ereport */
+	char		zc_serd_checksum[MAX_SERDLEN];	/* "" until created */
+	char		zc_serd_io[MAX_SERDLEN];	/* "" until created */
+	int		zc_has_remove_timer;	/* non-zero while timer armed */
+} zfs_case_data_t;
+
+/*
+ * Time-of-day
+ *
+ * NOTE(review): filled from two-element time arrays (seconds, then a
+ * sub-second part); confirm the unit of the second element with producers.
+ */
+typedef struct er_timeval {
+	uint64_t	ertv_sec;
+	uint64_t	ertv_nsec;
+} er_timeval_t;
+
+/*
+ * In-core case structure.
+ */
+typedef struct zfs_case {
+	boolean_t	zc_present;	/* scratch flag for zfs_purge_cases() */
+	uint32_t	zc_version;
+	zfs_case_data_t	zc_data;	/* persistent part of the case */
+	fmd_case_t	*zc_case;	/* owning fmd case */
+	uu_list_node_t	zc_node;	/* linkage on zfs_cases */
+	id_t		zc_remove_timer;	/* valid if zc_has_remove_timer */
+	char		*zc_fru;
+	er_timeval_t	zc_when;	/* load time of the case's pool */
+} zfs_case_t;
+
+#define	CASE_DATA			"data"
+#define	CASE_FRU			"fru"
+#define	CASE_DATA_VERSION_INITIAL	1
+#define	CASE_DATA_VERSION_SERD		2
+
+/* Counters for events the diagnosis engine dropped, by reason. */
+typedef struct zfs_de_stats {
+	fmd_stat_t	old_drops;
+	fmd_stat_t	dev_drops;
+	fmd_stat_t	vdev_drops;
+	fmd_stat_t	import_drops;
+	fmd_stat_t	resource_drops;
+} zfs_de_stats_t;
+
+zfs_de_stats_t zfs_stats = {
+	{ "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
+	{ "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
+	{ "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
+	{ "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
+	{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+};
+
+/* Value of the "remove_timeout" property, read at module init. */
+static hrtime_t zfs_remove_timeout;
+
+/* All in-core cases known to the diagnosis engine. */
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+/* Build "<resource|ereport class>.<zfs class>.<type>" at compile time. */
+#define	ZFS_MAKE_RSRC(type)	\
+    FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
+#define	ZFS_MAKE_EREPORT(type)	\
+    FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+
+/*
+ * Write out the persistent representation of an active case.
+ *
+ * NOTE: as written this only stamps the in-core copy with the current data
+ * version; nothing is written to a case buffer here.
+ */
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
+}
+
+/*
+ * Read back the persistent representation of an active case.
+ */ +static zfs_case_t * +zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + zfs_case_t *zcp; + + zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); + zcp->zc_case = cp; + + fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, + sizeof (zcp->zc_data)); + + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + return (NULL); + } + + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + + if (zcp->zc_data.zc_has_remove_timer) + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_remove_timeout); + + uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); + (void) uu_list_insert_before(zfs_cases, NULL, zcp); + + fmd_case_setspecific(hdl, cp, zcp); + + return (zcp); +} + +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) +{ + uint64_t vdev_guid = 0; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + + (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) { + zcp->zc_present = B_TRUE; + zcp->zc_when = *loaded; + } + } + + /* + * Iterate over all children. 
+ */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } +} + +/*ARGSUSED*/ +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid; + uint64_t *tod; + er_timeval_t loaded = { 0 }; + nvlist_t *config, *vd; + uint_t nelem = 0; + int ret; + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem); + if (nelem == 2) { + loaded.ertv_sec = tod[0]; + loaded.ertv_nsec = tod[1]; + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) { + zcp->zc_when = loaded; + } + } + } + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + if (ret) { + zpool_close(zhp); + return (-1); + } + + zfs_mark_vdev(pool_guid, vd, &loaded); + + zpool_close(zhp); + + return (0); +} + +struct load_time_arg { + uint64_t lt_guid; + er_timeval_t *lt_time; + boolean_t lt_found; +}; + +static int +zpool_find_load_time(zpool_handle_t *zhp, void *arg) +{ + struct load_time_arg *lta = arg; + uint64_t pool_guid; + uint64_t 
*tod; + nvlist_t *config; + uint_t nelem; + + if (lta->lt_found) { + zpool_close(zhp); + return (0); + } + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + if (pool_guid != lta->lt_guid) { + zpool_close(zhp); + return (0); + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem) == 0 && nelem == 2) { + lta->lt_found = B_TRUE; + lta->lt_time->ertv_sec = tod[0]; + lta->lt_time->ertv_nsec = tod[1]; + } + + zpool_close(zhp); + + return (0); +} + +static void +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach an O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases as not present. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. 
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", + (long long unsigned int)pool_guid, + (long long unsigned int)vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + nvlist_t *fru = NULL; + fmd_hdl_debug(hdl, "solving fault '%s'", faultname); + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. 
+ */ + detector = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid); + if (zcp->zc_data.zc_vdev_guid != 0) { + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid); + } + + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, + fru, detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + + nvlist_free(fru); + + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +static boolean_t +timeval_earlier(er_timeval_t *a, er_timeval_t *b) +{ + return (a->ertv_sec < b->ertv_sec || + (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); +} + +/*ARGSUSED*/ +static void +zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) +{ + int64_t *tod; + uint_t nelem; + + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, + &nelem) == 0 && nelem == 2) { + when->ertv_sec = tod[0]; + when->ertv_nsec = tod[1]; + } else { + when->ertv_sec = when->ertv_nsec = UINT64_MAX; + } +} + +/* + * Main fmd entry point. + */ +/*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +{ + zfs_case_t *zcp, *dcp; + int32_t pool_state; + uint64_t ena, pool_guid, vdev_guid; + er_timeval_t pool_load; + er_timeval_t er_when; + nvlist_t *detector; + boolean_t pool_found = B_FALSE; + boolean_t isresource; + char *type; + + /* + * We subscribe to notifications for vdev or pool removal. In these + * cases, there may be cases that no longer apply. Purge any cases + * that no longer apply. 
+ */ + if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { + fmd_hdl_debug(hdl, "purging orphaned cases from %s", + strrchr(class, '.') + 1); + zfs_purge_cases(hdl); + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); + + if (isresource) { + /* + * For resources, we don't have a normal payload. + */ + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + pool_state = SPA_LOAD_OPEN; + else + pool_state = SPA_LOAD_NONE; + detector = NULL; + } else { + (void) nvlist_lookup_nvlist(nvl, + FM_EREPORT_DETECTOR, &detector); + (void) nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); + } + + /* + * We also ignore all ereports generated during an import of a pool, + * since the only possible fault (.pool) would result in import failure, + * and hence no persistent fault. Some day we may want to do something + * with these ereports, so we continue generating them internally. + */ + if (pool_state == SPA_LOAD_IMPORT) { + zfs_stats.import_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "ignoring '%s' during import", class); + return; + } + + /* + * Device I/O errors are ignored during pool open. + */ + if (pool_state == SPA_LOAD_OPEN && + (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { + fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); + zfs_stats.dev_drops.fmds_value.ui64++; + return; + } + + /* + * We ignore ereports for anything except disks and files. 
+ */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &type) == 0) { + if (strcmp(type, VDEV_TYPE_DISK) != 0 && + strcmp(type, VDEV_TYPE_FILE) != 0) { + zfs_stats.vdev_drops.fmds_value.ui64++; + return; + } + } + + /* + * Determine if this ereport corresponds to an open case. + * Each vdev or pool can have a single case. + */ + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) + ena = 0; + + zfs_ereport_when(hdl, nvl, &er_when); + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid) { + pool_found = B_TRUE; + pool_load = zcp->zc_when; + } + if (zcp->zc_data.zc_vdev_guid == vdev_guid) + break; + } + + /* + * Avoid falsely accusing a pool of being faulty. Do so by + * not replaying ereports that were generated prior to the + * current import. If the failure that generated them was + * transient because the device was actually removed but we + * didn't receive the normal asynchronous notification, we + * don't want to mark it as faulted and potentially panic. If + * there is still a problem we'd expect not to be able to + * import the pool, or that new ereports will be generated + * once the pool is used. + */ + if (pool_found && timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, er_when.ertv_nsec, + pool_load.ertv_sec, pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + + if (!pool_found) { + /* + * Haven't yet seen this pool, but same situation + * may apply. 
+ */ + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + struct load_time_arg la; + + la.lt_guid = pool_guid; + la.lt_time = &pool_load; + la.lt_found = B_FALSE; + + if (zhdl != NULL && + zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && + la.lt_found == B_TRUE) { + pool_found = B_TRUE; + + if (timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, " + "pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, + er_when.ertv_nsec, pool_load.ertv_sec, + pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + } + } + + if (zcp == NULL) { + fmd_case_t *cs; + zfs_case_data_t data = { 0 }; + + /* + * If this is one of our 'fake' resource ereports, and there is + * no case open, simply discard it. + */ + if (isresource) { + zfs_stats.resource_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + class, vdev_guid); + return; + } + + /* + * Skip tracking some ereports + */ + if (strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Open a new case. + */ + cs = fmd_case_open(hdl, NULL); + + fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", + vdev_guid, class); + + /* + * Initialize the case buffer. To commonize code, we actually + * create the buffer with existing data, and then call + * zfs_case_unserialize() to instantiate the in-core structure. 
+ */ + fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); + + data.zc_version = CASE_DATA_VERSION_SERD; + data.zc_ena = ena; + data.zc_pool_guid = pool_guid; + data.zc_vdev_guid = vdev_guid; + data.zc_pool_state = (int)pool_state; + + fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); + + zcp = zfs_case_unserialize(hdl, cs); + assert(zcp != NULL); + if (pool_found) + zcp->zc_when = pool_load; + } + + if (isresource) { + fmd_hdl_debug(hdl, "resource event '%s'", class); + + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. 
+ */ + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { + uint64_t state = 0; + + if (zcp != NULL && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && + state == VDEV_STATE_HEALTHY) { + fmd_hdl_debug(hdl, "closing case after a " + "device statechange to healthy"); + fmd_case_close(hdl, zcp->zc_case); + } + } + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Associate the ereport with this case. + */ + fmd_case_add_ereport(hdl, zcp->zc_case, ep); + + /* + * Don't do anything else if this case is already solved. + */ + if (fmd_case_solved(hdl, zcp->zc_case)) + return; + + fmd_hdl_debug(hdl, "error event '%s'", class); + + /* + * Determine if we should solve the case and generate a fault. We solve + * a case if: + * + * a. A pool failed to open (ereport.fs.zfs.pool) + * b. A device failed to open (ereport.fs.zfs.pool) while a pool + * was up and running. + * + * We may see a series of ereports associated with a pool open, all + * chained together by the same ENA. If the pool open succeeds, then + * we'll see no further ereports. To detect when a pool open has + * succeeded, we associate a timer with the event. When it expires, we + * close the case. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { + /* + * Pool level fault. Before solving the case, go through and + * close any open device cases that may be pending. 
+ */ + for (dcp = uu_list_first(zfs_cases); dcp != NULL; + dcp = uu_list_next(zfs_cases, dcp)) { + if (dcp->zc_data.zc_pool_guid == + zcp->zc_data.zc_pool_guid && + dcp->zc_data.zc_vdev_guid != 0) + fmd_case_close(hdl, dcp->zc_case); + } + + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { + /* + * Pool level fault for reading the intent logs. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { + /* + * Device fault. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { + char *failmode = NULL; + boolean_t checkremove = B_FALSE; + + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. 
+ */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep)) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.checksum", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && + (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && + failmode != NULL) { + if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, + strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_continue", + B_FALSE); + } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, + strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_wait", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { +#ifndef __linux__ + /* This causes an unexpected fault diagnosis on linux */ + checkremove = B_TRUE; +#endif + } + + /* + * Because I/O errors may be due to device removal, we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. 
+ */ + if (checkremove) { + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_remove_timeout); + if (!zcp->zc_data.zc_has_remove_timer) { + zcp->zc_data.zc_has_remove_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } + } +} + +/* + * The timeout is fired when we diagnosed an I/O error, and it was not due to + * device removal (which would cause the timeout to be cancelled). + */ +/* ARGSUSED */ +static void +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + zfs_case_t *zcp = data; + + if (id == zcp->zc_remove_timer) + zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE); +} + +/* + * The specified case has been closed and any case-specific + * data structures should be deallocated. + */ +static void +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) +{ + zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); +} + +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. 
+ */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ + NULL, /* fmdo_stats */ + zfs_fm_gc, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "remove_timeout", FMD_TYPE_TIME, "15sec" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_diagnosis_init(fmd_hdl_t *hdl) +{ + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", + sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), + NULL, UU_LIST_POOL_DEBUG)) == NULL) { + libzfs_fini(zhdl); + return; + } + + if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, + UU_LIST_DEBUG)) == NULL) { + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + fmd_hdl_setspecific(hdl, zhdl); + + (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / + sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); + + zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); +} + +void +_zfs_diagnosis_fini(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl; + + /* + * Remove all active cases. 
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + libzfs_fini(zhdl); +} diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c new file mode 100644 index 000000000000..8d0a3b420086 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -0,0 +1,956 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, 2017, Intel Corporation. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + */ + +/* + * ZFS syseventd module. 
+ * + * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. If no vdevs are found, then search for any vdevs whose udev path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK + * event indicates that a device failed to open during pool load, but the + * autoreplace property was set. In this case, we deferred the associated + * FMA fault until our module had a chance to process the autoreplace logic. + * If the device could not be replaced, then the second online attempt will + * trigger the FMA fault that we skipped earlier. 
+ * + * ZFS on Linux porting notes: + * Linux udev provides a disk insert for both the disk and the partition + * + */ + +#include <ctype.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libzfs.h> +#include <libzutil.h> +#include <limits.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <thread_pool.h> +#include <pthread.h> +#include <unistd.h> +#include <errno.h> +#include "zfs_agents.h" +#include "../zed_log.h" + +#define DEV_BYID_PATH "/dev/disk/by-id/" +#define DEV_BYPATH_PATH "/dev/disk/by-path/" +#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; +list_t g_pool_list; /* list of unavailable pools at initialization */ +list_t g_device_list; /* list of disks with asynchronous label request */ +tpool_t *g_tpool; +boolean_t g_enumeration_done; +pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ + +typedef struct unavailpool { + zpool_handle_t *uap_zhp; + list_node_t uap_node; +} unavailpool_t; + +typedef struct pendingdev { + char pd_physpath[128]; + list_node_t pd_node; +} pendingdev_t; + +static int +zfs_toplevel_state(zpool_handle_t *zhp) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + unsigned int c; + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + return (vs->vs_state); +} + +static int +zfs_unavail_pool(zpool_handle_t *zhp, void *data) +{ + zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", + zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); + + if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { + unavailpool_t *uap; + uap = malloc(sizeof (unavailpool_t)); + uap->uap_zhp = zhp; + list_insert_tail((list_t *)data, 
uap); + } else { + zpool_close(zhp); + } + return (0); +} + +/* + * Two stage replace on Linux + * since we get disk notifications + * we can wait for partitioned disk slice to show up! + * + * First stage tags the disk, initiates async partitioning, and returns + * Second stage finds the tag and proceeds to ZFS labeling/replace + * + * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach + * + * 1. physical match with no fs, no partition + * tag it top, partition disk + * + * 2. physical match again, see partition and tag + * + */ + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk. 
+ * + * Also can arrive here from a ESC_ZFS_VDEV_CHECK event + */ +static void +zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) +{ + char *path; + vdev_state_t newstate; + nvlist_t *nvroot, *newvd; + pendingdev_t *device; + uint64_t wholedisk = 0ULL; + uint64_t offline = 0ULL; + uint64_t guid = 0ULL; + char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; + char rawpath[PATH_MAX], fullpath[PATH_MAX]; + char devpath[PATH_MAX]; + int ret; + boolean_t is_dm = B_FALSE; + boolean_t is_sd = B_FALSE; + uint_t c; + vdev_stat_t *vs; + + if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) + return; + + /* Skip healthy disks */ + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (vs->vs_state == VDEV_STATE_HEALTHY) { + zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", + __func__, path); + return; + } + + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); + + if (offline) + return; /* don't intervene if it was taken offline */ + + is_dm = zfs_dev_is_dm(path); + zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" + " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path, + physpath ? physpath : "NULL", wholedisk, is_dm ? 
"is" : "not", + (long long unsigned int)guid); + + /* + * The VDEV guid is preferred for identification (gets passed in path) + */ + if (guid != 0) { + (void) snprintf(fullpath, sizeof (fullpath), "%llu", + (long long unsigned int)guid); + } else { + /* + * otherwise use path sans partition suffix for whole disks + */ + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) { + char *spath = zfs_strip_partition(fullpath); + if (!spath) { + zed_log_msg(LOG_INFO, "%s: Can't alloc", + __func__); + return; + } + + (void) strlcpy(fullpath, spath, sizeof (fullpath)); + free(spath); + } + } + + /* + * Attempt to online the device. + */ + if (zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && + (newstate == VDEV_STATE_HEALTHY || + newstate == VDEV_STATE_DEGRADED)) { + zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s", + fullpath, (newstate == VDEV_STATE_HEALTHY) ? + "HEALTHY" : "DEGRADED"); + return; + } + + /* + * vdev_id alias rule for using scsi_debug devices (FMA automated + * testing) + */ + if (physpath != NULL && strcmp("scsidebug", physpath) == 0) + is_sd = B_TRUE; + + /* + * If the pool doesn't have the autoreplace property set, then use + * vdev online to trigger a FMA fault by posting an ereport. + */ + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !(wholedisk || is_dm) || (physpath == NULL)) { + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " + "not a whole disk for '%s'", fullpath); + return; + } + + /* + * Convert physical path into its current device node. Rawpath + * needs to be /dev/disk/by-vdev for a scsi_debug device since + * /dev/disk/by-path will not be present. + */ + (void) snprintf(rawpath, sizeof (rawpath), "%s%s", + is_sd ? 
DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); + + if (realpath(rawpath, devpath) == NULL && !is_dm) { + zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", + rawpath, strerror(errno)); + + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", + fullpath, libzfs_error_description(g_zfshdl)); + return; + } + + /* Only autoreplace bad disks */ + if ((vs->vs_state != VDEV_STATE_DEGRADED) && + (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_CANT_OPEN)) { + return; + } + + nvlist_lookup_string(vdev, "new_devid", &new_devid); + + if (is_dm) { + /* Don't label device mapper or multipath disks. */ + } else if (!labeled) { + /* + * we're auto-replacing a raw disk, so label it first + */ + char *leafname; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. Before we can label the disk, we need + * to map the physical string that was matched on to the under + * lying device node. + * + * If any part of this process fails, then do a force online + * to trigger a ZFS fault for the device (and any hot spare + * replacement). + */ + leafname = strrchr(devpath, '/') + 1; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. + */ + if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { + zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + "label '%s' (%s)", leafname, + libzfs_error_description(g_zfshdl)); + + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + /* + * The disk labeling is asynchronous on Linux. Just record + * this label request and return as there will be another + * disk add event for the partition after the labeling is + * completed. 
+ */ + device = malloc(sizeof (pendingdev_t)); + (void) strlcpy(device->pd_physpath, physpath, + sizeof (device->pd_physpath)); + list_insert_tail(&g_device_list, device); + + zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + leafname, (u_longlong_t)guid); + + return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ + + } else /* labeled */ { + boolean_t found = B_FALSE; + /* + * match up with request above to label the disk + */ + for (device = list_head(&g_device_list); device != NULL; + device = list_next(&g_device_list, device)) { + if (strcmp(physpath, device->pd_physpath) == 0) { + list_remove(&g_device_list, device); + free(device); + found = B_TRUE; + break; + } + zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", + physpath, device->pd_physpath); + } + if (!found) { + /* unexpected partition slice encountered */ + zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", + fullpath); + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", + physpath, (u_longlong_t)guid); + + (void) snprintf(devpath, sizeof (devpath), "%s%s", + DEV_BYID_PATH, new_devid); + } + + /* + * Construct the root vdev to pass to zpool_vdev_attach(). While adding + * the entire vdev structure is harmless, we construct a reduced set of + * path/physpath/wholedisk to keep it simple. 
+ */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + return; + } + if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + nvlist_free(nvroot); + return; + } + + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || + (physpath != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || + (enc_sysfs_path != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || + nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || + nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, + 1) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); + nvlist_free(newvd); + nvlist_free(nvroot); + return; + } + + nvlist_free(newvd); + + /* + * Wait for udev to verify the links exist, then auto-replace + * the leaf disk at same physical location. + */ + if (zpool_label_disk_wait(path, 3000) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " + "disk %s is missing", path); + nvlist_free(nvroot); + return; + } + + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + + zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + fullpath, path, (ret == 0) ? "no errors" : + libzfs_error_description(g_zfshdl)); + + nvlist_free(nvroot); +} + +/* + * Utility functions to find a vdev matching given criteria. 
+ */ +typedef struct dev_data { + const char *dd_compare; + const char *dd_prop; + zfs_process_func_t dd_func; + boolean_t dd_found; + boolean_t dd_islabeled; + uint64_t dd_pool_guid; + uint64_t dd_vdev_guid; + const char *dd_new_devid; +} dev_data_t; + +static void +zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) +{ + dev_data_t *dp = data; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* once a vdev was matched and processed there is nothing left to do */ + if (dp->dd_found) + return; + + /* + * Match by GUID if available otherwise fallback to devid or physical + */ + if (dp->dd_vdev_guid != 0) { + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != dp->dd_vdev_guid) { + return; + } + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); + dp->dd_found = B_TRUE; + + } else if (dp->dd_compare != NULL) { + /* + * NOTE: On Linux there is an event for partition, so unlike + * illumos, substring matching is not required to accommodate + * the partition suffix. An exact match will be present in + * the dp->dd_compare value. 
+ */ + if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || + strcmp(dp->dd_compare, path) != 0) + return; + + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", + dp->dd_prop, path); + dp->dd_found = B_TRUE; + + /* pass the new devid for use by replacing code */ + if (dp->dd_new_devid != NULL) { + (void) nvlist_add_string(nvl, "new_devid", + dp->dd_new_devid); + } + } + + (dp->dd_func)(zhp, nvl, dp->dd_islabeled); +} + +static void +zfs_enable_ds(void *arg) +{ + unavailpool_t *pool = (unavailpool_t *)arg; + + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + zpool_close(pool->uap_zhp); + free(pool); +} + +static int +zfs_iter_pool(zpool_handle_t *zhp, void *data) +{ + nvlist_t *config, *nvl; + dev_data_t *dp = data; + uint64_t pool_guid; + unavailpool_t *pool; + + zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", + zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop); + + /* + * For each vdev in this pool, look for a match to apply dd_func + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (dp->dd_pool_guid == 0 || + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { + (void) nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvl); + zfs_iter_vdev(zhp, nvl, data); + } + } + + /* + * if this pool was originally unavailable, + * then enable its datasets asynchronously + */ + if (g_enumeration_done) { + for (pool = list_head(&g_pool_list); pool != NULL; + pool = list_next(&g_pool_list, pool)) { + + if (strcmp(zpool_get_name(zhp), + zpool_get_name(pool->uap_zhp))) + continue; + if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { + list_remove(&g_pool_list, pool); + (void) tpool_dispatch(g_tpool, zfs_enable_ds, + pool); + break; + } + } + } + + zpool_close(zhp); + return (dp->dd_found); /* cease iteration after a match */ +} + +/* + * Given a physical device location, iterate over all + * (pool, vdev) pairs which correspond to that location. 
+ */ +static boolean_t +devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, + boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = physical; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; /* used by auto replace code */ + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Given a device identifier, find any vdevs with a matching devid. + * On Linux we can match devid directly which is always a whole disk. + */ +static boolean_t +devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = devid; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_DEVID; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Handle a EC_DEV_ADD.ESC_DISK event. 
+ * + * illumos + * Expects: DEV_PHYS_PATH string in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/dsk/c0t1d0s0' (persistent) + * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' + * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' + * + * linux + * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/sdc1' (not persistent) + * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' + */ +static int +zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) +{ + char *devpath = NULL, *devid; + boolean_t is_slice; + + /* + * Expecting a devid string and an optional physical location + */ + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) + return (-1); + + (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + + is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); + + zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", + devid, devpath ? devpath : "NULL", is_slice); + + /* + * Iterate over all vdevs looking for a match in the following order: + * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) + * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). + * + * For disks, we only want to pay attention to vdevs marked as whole + * disks or are a multipath device. + */ + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) + (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); + + return (0); +} + +/* + * Called when we receive a VDEV_CHECK event, which indicates a device could not + * be opened during initial pool open, but the autoreplace property was set on + * the pool. In this case, we treat it as if it were an add event. 
+ */ +static int +zfs_deliver_check(nvlist_t *nvl) +{ + dev_data_t data = { 0 }; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, + &data.dd_pool_guid) != 0 || + nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, + &data.dd_vdev_guid) != 0 || + data.dd_vdev_guid == 0) + return (0); + + zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", + data.dd_pool_guid, data.dd_vdev_guid); + + data.dd_func = zfs_process_add; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (0); +} + +static int +zfsdle_vdev_online(zpool_handle_t *zhp, void *data) +{ + char *devname = data; + boolean_t avail_spare, l2cache; + nvlist_t *tgt; + int error; + + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", + devname, zpool_get_name(zhp)); + + if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, + &avail_spare, &l2cache, NULL)) != NULL) { + char *path, fullpath[MAXPATHLEN]; + uint64_t wholedisk; + + error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); + if (error) { + zpool_close(zhp); + return (0); + } + + error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (error) + wholedisk = 0; + + if (wholedisk) { + path = strrchr(path, '/'); + if (path != NULL) { + path = zfs_strip_partition(path + 1); + if (path == NULL) { + zpool_close(zhp); + return (0); + } + } else { + zpool_close(zhp); + return (0); + } + + (void) strlcpy(fullpath, path, sizeof (fullpath)); + free(path); + + /* + * We need to reopen the pool associated with this + * device so that the kernel can update the size of + * the expanded device. When expanding there is no + * need to restart the scrub from the beginning. 
+ */ + boolean_t scrub_restart = B_FALSE; + (void) zpool_reopen_one(zhp, &scrub_restart); + } else { + (void) strlcpy(fullpath, path, sizeof (fullpath)); + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + vdev_state_t newstate; + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + error = zpool_vdev_online(zhp, fullpath, 0, + &newstate); + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " + "setting device '%s' to ONLINE state " + "in pool '%s': %d", fullpath, + zpool_get_name(zhp), error); + } + } + zpool_close(zhp); + return (1); + } + zpool_close(zhp); + return (0); +} + +/* + * This function handles the ESC_DEV_DLE device change event. Use the + * provided vdev guid when looking up a disk or partition, when the guid + * is not present assume the entire disk is owned by ZFS and append the + * expected -part1 partition information then lookup by physical path. + */ +static int +zfs_deliver_dle(nvlist_t *nvl) +{ + char *devname, name[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(name, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { + strlcpy(name, devname, MAXPATHLEN); + zfs_append_partition(name, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); + } + + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " + "found", name); + return (1); + } + + return (0); +} + +/* + * syseventd daemon module event handler + * + * Handles syseventd daemon zfs device related events: + * + * EC_DEV_ADD.ESC_DISK + * EC_DEV_STATUS.ESC_DEV_DLE + * EC_ZFS.ESC_ZFS_VDEV_CHECK + * + * Note: assumes only one thread active at a time (not thread safe) + */ +static int +zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + int ret; + boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE; + + if (strcmp(class, EC_DEV_ADD) 
== 0) { + /* + * We're mainly interested in disk additions, but we also listen + * for new loop devices, to allow for simplified testing. + */ + if (strcmp(subclass, ESC_DISK) == 0) + is_lofi = B_FALSE; + else if (strcmp(subclass, ESC_LOFI) == 0) + is_lofi = B_TRUE; + else + return (0); + + is_check = B_FALSE; + } else if (strcmp(class, EC_ZFS) == 0 && + strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { + /* + * This event signifies that a device failed to open + * during pool load, but the 'autoreplace' property was + * set, so we should pretend it's just been added. + */ + is_check = B_TRUE; + } else if (strcmp(class, EC_DEV_STATUS) == 0 && + strcmp(subclass, ESC_DEV_DLE) == 0) { + is_dle = B_TRUE; + } else { + return (0); + } + + if (is_dle) + ret = zfs_deliver_dle(nvl); + else if (is_check) + ret = zfs_deliver_check(nvl); + else + ret = zfs_deliver_add(nvl, is_lofi); + + return (ret); +} + +/*ARGSUSED*/ +static void * +zfs_enum_pools(void *arg) +{ + (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); + /* + * Linux - instead of using a thread pool, each list entry + * will spawn a thread when an unavailable pool transitions + * to available. zfs_slm_fini will wait for these threads. 
+ */ + g_enumeration_done = B_TRUE; + return (NULL); +} + +/* + * called from zed daemon at startup + * + * sent messages from zevents or udev monitor + * + * For now, each agent has its own libzfs instance + */ +int +zfs_slm_init() +{ + if ((g_zfshdl = libzfs_init()) == NULL) + return (-1); + + /* + * collect a list of unavailable pools (asynchronously, + * since this can take a while) + */ + list_create(&g_pool_list, sizeof (struct unavailpool), + offsetof(struct unavailpool, uap_node)); + + if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { + list_destroy(&g_pool_list); + libzfs_fini(g_zfshdl); + return (-1); + } + + list_create(&g_device_list, sizeof (struct pendingdev), + offsetof(struct pendingdev, pd_node)); + + return (0); +} + +void +zfs_slm_fini() +{ + unavailpool_t *pool; + pendingdev_t *device; + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_zfs_tid, NULL); + /* destroy the thread pool */ + if (g_tpool != NULL) { + tpool_wait(g_tpool); + tpool_destroy(g_tpool); + } + + while ((pool = (list_head(&g_pool_list))) != NULL) { + list_remove(&g_pool_list, pool); + zpool_close(pool->uap_zhp); + free(pool); + } + list_destroy(&g_pool_list); + + while ((device = (list_head(&g_device_list))) != NULL) { + list_remove(&g_device_list, device); + free(device); + } + list_destroy(&g_device_list); + + libzfs_fini(g_zfshdl); +} + +void +zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); + (void) zfs_slm_deliver_event(class, subclass, nvl); +} diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c new file mode 100644 index 000000000000..9e95e20d5683 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c @@ -0,0 +1,557 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the 
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> + */ + +/* + * The ZFS retire agent is responsible for managing hot spares across all pools. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 
+ */ + +#include <sys/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <libzfs.h> +#include <string.h> + +#include "zfs_agents.h" +#include "fmd_api.h" + + +typedef struct zfs_retire_repaired { + struct zfs_retire_repaired *zrr_next; + uint64_t zrr_pool; + uint64_t zrr_vdev; +} zfs_retire_repaired_t; + +typedef struct zfs_retire_data { + libzfs_handle_t *zrd_hdl; + zfs_retire_repaired_t *zrd_repaired; +} zfs_retire_data_t; + +static void +zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) +{ + zfs_retire_repaired_t *zrp; + + while ((zrp = zdp->zrd_repaired) != NULL) { + zdp->zrd_repaired = zrp->zrr_next; + fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); + } +} + +/* + * Find a pool with a matching GUID. + */ +typedef struct find_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; + nvlist_t *cb_vdev; +} find_cbdata_t; + +static int +find_pool(zpool_handle_t *zhp, void *data) +{ + find_cbdata_t *cbp = data; + + if (cbp->cb_guid == + zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Find a vdev within a tree with a matching GUID. 
+ */ +static nvlist_t * +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + nvlist_t *ret; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + guid == search_guid) { + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "matched vdev %llu", guid); + return (nv); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + return (NULL); +} + +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (vdev_guid != 0) { + if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + } + + return (zhp); +} + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds or we run out of devices to try. 
+ * Return whether we were successful or not in replacing the device. + */ +static boolean_t +replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + zprop_source_t source; + int ashift; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return (B_FALSE); + + /* + * Find out if there are any hot spares available in the pool. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return (B_FALSE); + + /* + * lookup "ashift" pool property, we may need it for the replacement + */ + ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); + + replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT); + + dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* + * Try to replace each spare, ending when we successfully + * replace it. + */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + /* if set, add the "ashift" pool property to the spare nvlist */ + if (source != ZPROP_SRC_DEFAULT) + (void) nvlist_add_uint64(spares[s], + ZPOOL_CONFIG_ASHIFT, ashift); + + (void) nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1); + + fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", + dev_name, basename(spare_name)); + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE, B_FALSE) == 0) { + free(dev_name); + nvlist_free(replacement); + return (B_TRUE); + } + } + + free(dev_name); + nvlist_free(replacement); + + return (B_FALSE); +} + +/* + * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and + * ASRU is now usable. ZFS has found the device to be present and + * functioning. 
+ */ +/*ARGSUSED*/ +static void +zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + zfs_retire_repaired_t *zrp; + uint64_t pool_guid, vdev_guid; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + return; + + /* + * Before checking the state of the ASRU, go through and see if we've + * already made an attempt to repair this ASRU. This list is cleared + * whenever we receive any kind of list event, and is designed to + * prevent us from generating a feedback loop when we attempt repairs + * against a faulted pool. The problem is that checking the unusable + * state of the ASRU can involve opening the pool, which can post + * statechange events but otherwise leave the pool in the faulted + * state. This list allows us to detect when a statechange event is + * due to our own request. + */ + for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { + if (zrp->zrr_pool == pool_guid && + zrp->zrr_vdev == vdev_guid) + return; + } + + zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); + zrp->zrr_next = zdp->zrd_repaired; + zrp->zrr_pool = pool_guid; + zrp->zrr_vdev = vdev_guid; + zdp->zrd_repaired = zrp; + + fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", + vdev_guid, pool_guid); +} + +/*ARGSUSED*/ +static void +zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + uint64_t pool_guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + libzfs_handle_t *zhdl = zdp->zrd_hdl; + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev = NULL; + char *uuid; + int repair_done = 0; + boolean_t retire; + boolean_t is_disk; + vdev_aux_t aux; + uint64_t state = 0; + + fmd_hdl_debug(hdl, 
"zfs_retire_recv: '%s'", class); + + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + + /* + * If this is a resource notifying us of device removal then simply + * check for an available spare and continue unless the device is a + * l2arc vdev, in which case we just offline it. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_REMOVED)) { + char *devtype; + char *devname; + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* Can't replace l2arc with a spare: offline the device */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) { + fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); + zpool_vdev_offline(zhp, devname, B_TRUE); + } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || + replace_with_spare(hdl, zhp, vdev) == B_FALSE) { + /* Could not handle with spare */ + fmd_hdl_debug(hdl, "no spare for '%s'", devname); + } + + free(devname); + zpool_close(zhp); + return; + } + + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + + /* + * Note: on zfsonlinux statechange events are more than just + * healthy ones so we need to confirm the actual state value. 
+ */ + if (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_HEALTHY) { + zfs_vdev_repair(hdl, nvl); + return; + } + if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + zfs_vdev_repair(hdl, nvl); + return; + } + + zfs_retire_clear_data(hdl, zdp); + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; + + /* + * We subscribe to zfs faults as well as all repair events. + */ + if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, + &faults, &nfaults) != 0) + return; + + for (f = 0; f < nfaults; f++) { + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; + is_disk = B_FALSE; + + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + + /* + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). We also subscribe to fault.io.* events, so that + * faulty disks will be faulted in the ZFS configuration. + */ + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { + fault_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) { + degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.device")) { + fault_device = B_FALSE; + } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { + is_disk = B_TRUE; + fault_device = B_TRUE; + } else { + continue; + } + + if (is_disk) { + continue; + } else { + /* + * This is a ZFS fault. Lookup the resource, and + * attempt to find the matching vdev. 
+ */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) { + if (is_repair) + vdev_guid = 0; + else + continue; + } + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + continue; + + aux = VDEV_AUX_ERR_EXCEEDED; + } + + if (vdev_guid == 0) { + /* + * For pool-level repair events, clear the entire pool. + */ + fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", + zpool_get_name(zhp)); + (void) zpool_clear(zhp, NULL, NULL); + zpool_close(zhp); + continue; + } + + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + repair_done = 1; + fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", + zpool_get_name(zhp), vdev_guid); + (void) zpool_vdev_clear(zhp, vdev_guid); + zpool_close(zhp); + continue; + } + + /* + * Actively fault the device if needed. + */ + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid, aux); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid, aux); + + if (fault_device || degrade_device) + fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", + fault_device ? "fault" : "degrade", vdev_guid, + zpool_get_name(zhp)); + + /* + * Attempt to substitute a hot spare. 
+ */ + (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + } + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && + nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) + fmd_case_uuresolved(hdl, uuid); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_retire_recv, /* fmdo_recv */ + NULL, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_retire_init(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp; + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + libzfs_fini(zhdl); + return; + } + + zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); + zdp->zrd_hdl = zhdl; + + fmd_hdl_setspecific(hdl, zdp); +} + +void +_zfs_retire_fini(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + + if (zdp != NULL) { + zfs_retire_clear_data(hdl, zdp); + libzfs_fini(zdp->zrd_hdl); + fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); + } +} diff --git a/sys/contrib/openzfs/cmd/zed/zed.c b/sys/contrib/openzfs/cmd/zed/zed.c new file mode 100644 index 000000000000..0784e3834733 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.c @@ -0,0 +1,306 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
+ * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_event.h" +#include "zed_file.h" +#include "zed_log.h" + +static volatile sig_atomic_t _got_exit = 0; +static volatile sig_atomic_t _got_hup = 0; + +/* + * Signal handler for SIGINT & SIGTERM. + */ +static void +_exit_handler(int signum) +{ + _got_exit = 1; +} + +/* + * Signal handler for SIGHUP. + */ +static void +_hup_handler(int signum) +{ + _got_hup = 1; +} + +/* + * Register signal handlers. + */ +static void +_setup_sig_handlers(void) +{ + struct sigaction sa; + + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = SA_RESTART; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGPIPE, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGPIPE"); + + sa.sa_handler = _exit_handler; + if (sigaction(SIGINT, &sa, NULL) < 0) + zed_log_die("Failed to register SIGINT handler"); + + if (sigaction(SIGTERM, &sa, NULL) < 0) + zed_log_die("Failed to register SIGTERM handler"); + + sa.sa_handler = _hup_handler; + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to register SIGHUP handler"); +} + +/* + * Lock all current and future pages in the virtual memory address space. + * Access to locked pages will never be delayed by a page fault. + * + * EAGAIN is tested up to max_tries in case this is a transient error. + * + * Note that memory locks are not inherited by a child created via fork() + * and are automatically removed during an execve(). As such, this must + * be called after the daemon fork()s (when running in the background). 
+ */ +static void +_lock_memory(void) +{ +#if HAVE_MLOCKALL + int i = 0; + const int max_tries = 10; + + for (i = 0; i < max_tries; i++) { + if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) { + zed_log_msg(LOG_INFO, "Locked all pages in memory"); + return; + } + if (errno != EAGAIN) + break; + } + zed_log_die("Failed to lock memory pages: %s", strerror(errno)); + +#else /* HAVE_MLOCKALL */ + zed_log_die("Failed to lock memory pages: mlockall() not supported"); +#endif /* HAVE_MLOCKALL */ +} + +/* + * Start daemonization of the process including the double fork(). + * + * The parent process will block here until _finish_daemonize() is called + * (in the grandchild process), at which point the parent process will exit. + * This prevents the parent process from exiting until initialization is + * complete. + */ +static void +_start_daemonize(void) +{ + pid_t pid; + struct sigaction sa; + + /* Create pipe for communicating with child during daemonization. */ + zed_log_pipe_open(); + + /* Background process and ensure child is not process group leader. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create child process: %s", + strerror(errno)); + } else if (pid > 0) { + + /* Close writes since parent will only read from pipe. */ + zed_log_pipe_close_writes(); + + /* Wait for notification that daemonization is complete. */ + zed_log_pipe_wait(); + + zed_log_pipe_close_reads(); + _exit(EXIT_SUCCESS); + } + + /* Close reads since child will only write to pipe. */ + zed_log_pipe_close_reads(); + + /* Create independent session and detach from terminal. */ + if (setsid() < 0) + zed_log_die("Failed to create new session: %s", + strerror(errno)); + + /* Prevent child from terminating on HUP when session leader exits. 
*/ + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = 0; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGHUP"); + + /* Ensure process cannot re-acquire terminal. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create grandchild process: %s", + strerror(errno)); + } else if (pid > 0) { + _exit(EXIT_SUCCESS); + } +} + +/* + * Finish daemonization of the process by closing stdin/stdout/stderr. + * + * This must be called at the end of initialization after all external + * communication channels are established and accessible. + */ +static void +_finish_daemonize(void) +{ + int devnull; + + /* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */ + devnull = open("/dev/null", O_RDWR); + if (devnull < 0) + zed_log_die("Failed to open /dev/null: %s", strerror(errno)); + + if (dup2(devnull, STDIN_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdin: %s", + strerror(errno)); + + if (dup2(devnull, STDOUT_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdout: %s", + strerror(errno)); + + if (dup2(devnull, STDERR_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stderr: %s", + strerror(errno)); + + if ((devnull > STDERR_FILENO) && (close(devnull) < 0)) + zed_log_die("Failed to close /dev/null: %s", strerror(errno)); + + /* Notify parent that daemonization is complete. */ + zed_log_pipe_close_writes(); +} + +/* + * ZFS Event Daemon (ZED). 
+ */ +int +main(int argc, char *argv[]) +{ + struct zed_conf *zcp; + uint64_t saved_eid; + int64_t saved_etime[2]; + + zed_log_init(argv[0]); + zed_log_stderr_open(LOG_NOTICE); + zcp = zed_conf_create(); + zed_conf_parse_opts(zcp, argc, argv); + if (zcp->do_verbose) + zed_log_stderr_open(LOG_INFO); + + if (geteuid() != 0) + zed_log_die("Must be run as root"); + + zed_conf_parse_file(zcp); + + zed_file_close_from(STDERR_FILENO + 1); + + (void) umask(0); + + if (chdir("/") < 0) + zed_log_die("Failed to change to root directory"); + + if (zed_conf_scan_dir(zcp) < 0) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) { + _start_daemonize(); + zed_log_syslog_open(LOG_DAEMON); + } + _setup_sig_handlers(); + + if (zcp->do_memlock) + _lock_memory(); + + if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force)) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) + _finish_daemonize(); + + zed_log_msg(LOG_NOTICE, + "ZFS Event Daemon %s-%s (PID %d)", + ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid()); + + if (zed_conf_open_state(zcp) < 0) + exit(EXIT_FAILURE); + + if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) + exit(EXIT_FAILURE); + +idle: + /* + * If -I is specified, attempt to open /dev/zfs repeatedly until + * successful. + */ + do { + if (!zed_event_init(zcp)) + break; + /* Wait for some time and try again. tunable? 
*/ + sleep(30); + } while (!_got_exit && zcp->do_idle); + + if (_got_exit) + goto out; + + zed_event_seek(zcp, saved_eid, saved_etime); + + while (!_got_exit) { + int rv; + if (_got_hup) { + _got_hup = 0; + (void) zed_conf_scan_dir(zcp); + } + rv = zed_event_service(zcp); + + /* ENODEV: When kernel module is unloaded (osx) */ + if (rv == ENODEV) + break; + } + + zed_log_msg(LOG_NOTICE, "Exiting"); + zed_event_fini(zcp); + + if (zcp->do_idle && !_got_exit) + goto idle; + +out: + zed_conf_destroy(zcp); + zed_log_fini(); + exit(EXIT_SUCCESS); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore new file mode 100644 index 000000000000..46a00945aa7c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore @@ -0,0 +1 @@ +history_event-zfs-list-cacher.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am new file mode 100644 index 000000000000..8b2d0c200286 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am @@ -0,0 +1,53 @@ +include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Substfiles.am + +EXTRA_DIST += README + +zedconfdir = $(sysconfdir)/zfs/zed.d + +dist_zedconf_DATA = \ + zed-functions.sh \ + zed.rc + +zedexecdir = $(zfsexecdir)/zed.d + +dist_zedexec_SCRIPTS = \ + all-debug.sh \ + all-syslog.sh \ + data-notify.sh \ + generic-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh \ + trim_finish-notify.sh + +nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh + +SUBSTFILES += $(nodist_zedexec_SCRIPTS) + +zedconfdefaults = \ + all-syslog.sh \ + data-notify.sh \ + history_event-zfs-list-cacher.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + 
pool_import-led.sh \ + resilver_finish-start-scrub.sh + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" + for f in $(zedconfdefaults); do \ + test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ + ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ + done + chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/README b/sys/contrib/openzfs/cmd/zed/zed.d/README new file mode 100644 index 000000000000..7279b93704e2 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/README @@ -0,0 +1,30 @@ +Shell scripts are the recommended choice for ZEDLETs that mostly call +other utilities and do relatively little data manipulation. + +Shell scripts MUST work on both bash and dash. + +Shell scripts MUST run cleanly through ShellCheck: + http://www.shellcheck.net/ + +General functions reside in "zed-functions.sh". Use them where applicable. + +Additional references that may be of use: + + Google Shell Style Guide + https://github.com/google/styleguide/blob/gh-pages/shell.xml + + Dash as /bin/sh + https://wiki.ubuntu.com/DashAsBinSh + + Common shell script mistakes + http://www.pixelbeat.org/programming/shell_script_mistakes.html + + Filenames and Pathnames in Shell: How to do it Correctly + http://www.dwheeler.com/essays/filenames-in-shell.html + + Autoconf: Portable Shell Programming + https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell + +Please BE CONSISTENT with the existing style, check for errors, +minimize dependencies where possible, try to be portable, +and comment anything non-obvious. Festina lente. diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh new file mode 100755 index 000000000000..14b39caacd9d --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# +# Log all environment variables to ZED_DEBUG_LOG. 
+# +# This can be a useful aid when developing/debugging ZEDLETs since it shows the +# environment variables defined for each zevent. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}" + +zed_exit_if_ignoring_this_event + +lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" + +umask 077 +zed_lock "${lockfile}" +exec >> "${ZED_DEBUG_LOG}" + +printenv | sort +echo + +exec >&- +zed_unlock "${lockfile}" +exit 0 diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh new file mode 100755 index 000000000000..cb9286500136 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# +# Log the zevent via syslog. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event + +zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ + "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \ + "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \ + "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" +exit 0 diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh new file mode 100755 index 000000000000..639b459bdd3b --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# +# Send notification in response to a DATA error. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool/[vdev] combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool/[vdev]. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 +[ -n "${ZED_NOTIFY_DATA}" ] || exit 3 + +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" +zed_rate_limit "${rate_limit_tag}" || exit 3 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has detected a data error:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + echo " error: ${ZEVENT_ZIO_ERR}" + echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}" + echo " pool: ${ZEVENT_POOL}" +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh new file mode 100755 index 000000000000..e438031a088a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Send notification in response to a given zevent. +# +# This is a generic script than can be symlinked to a file in the +# enabled-zedlets directory to have a notification sent when a particular +# class of zevents occurs. The symlink filename must begin with the zevent +# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure" +# subclass). Refer to the zed(8) manpage for details. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +# Rate-limit the notification based in part on the filename. +# +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")" +rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}" +zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3 + +umask 077 +pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" +host_str=" on $(hostname)" +note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has posted the following event:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + + [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \ + && "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in new file mode 100755 index 000000000000..053b4414a768 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -0,0 +1,85 @@ +#!/bin/sh +# +# Track changes to enumerated pools for use in early-boot +set -ef + +FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache" +FSLIST_TMP="@runstatedir@/zfs-list.cache.new" +FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" + +# If the pool specific cache file is not writeable, abort +[ -w "${FSLIST}" ] || exit 0 + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event +zed_check_cmd "${ZFS}" sort diff grep + +# If we are acting on a snapshot, we have nothing to do +printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0 + +# We obtain a lock on zfs-list to avoid any simultaneous writes. +# If we run into trouble, log and drop the lock +abort_alter() { + zed_log_msg "Error updating zfs-list.cache!" + zed_unlock zfs-list +} + +finished() { + zed_unlock zfs-list + trap - EXIT + exit 0 +} + +case "${ZEVENT_HISTORY_INTERNAL_NAME}" in + create|"finish receiving"|import|destroy|rename) + ;; + + export) + zed_lock zfs-list + trap abort_alter EXIT + echo > "${FSLIST}" + finished + ;; + + set|inherit) + # Only act if one of the tracked properties is altered. + case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in + canmount|mountpoint|atime|relatime|devices|exec|readonly| \ + setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \ + org.openzfs.systemd:requires-mounts-for| \ + org.openzfs.systemd:before|org.openzfs.systemd:after| \ + org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \ + org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \ + ) ;; + *) exit 0 ;; + esac + ;; + + *) + # Ignore all other events. 
+ exit 0 + ;; +esac + +zed_lock zfs-list +trap abort_alter EXIT + +PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\ +,readonly,setuid,nbmand,encroot,keylocation\ +,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\ +,org.openzfs.systemd:before,org.openzfs.systemd:after\ +,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\ +,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore" + +"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}" + +# Sort the output so that it is stable +sort "${FSLIST_TMP}" -o "${FSLIST_TMP}" + +# Don't modify the file if it hasn't changed +diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}" +rm -f "${FSLIST_TMP}" + +finished diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh new file mode 120000 index 000000000000..e4c56bc5f816 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh @@ -0,0 +1 @@ +scrub_finish-notify.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh new file mode 100755 index 000000000000..c7cfd1ddba80 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# resilver_finish-start-scrub.sh +# Run a scrub after a resilver +# +# Exit codes: +# 1: Internal error +# 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 +[ -n "${ZEVENT_POOL}" ] || exit 1 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 1 +zed_check_cmd "${ZPOOL}" || exit 1 + +zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}" +"${ZPOOL}" scrub "${ZEVENT_POOL}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh new file mode 100755 index 000000000000..2145a100a3fa --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# +# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH. +# +# By default, "zpool status" output will only be included for a scrub_finish +# zevent if the pool is not healthy; to always include its output, set +# ZED_NOTIFY_VERBOSE=1. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then + action="resilver" +elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + action="scrub" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 +fi + +zed_check_cmd "${ZPOOL}" || exit 9 + +# For scrub, suppress notification if the pool is healthy +# and verbosity is not enabled. +# +if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \ + | grep "'${ZEVENT_POOL}' is healthy")" + [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3 +fi + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a ${action}:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh new file mode 100755 index 000000000000..e656e125d378 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# +# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes. +# +# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. +# Turn the LED off when it's back ONLINE again. +# +# This script run in two basic modes: +# +# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then +# only set the LED for that particular VDEV. This is the case for statechange +# events and some vdev_* events. +# +# 2. 
If those vars are not set, then check the state of all VDEVs in the pool +# and set the LEDs accordingly. This is the case for pool_import events. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI enclosure services (ses) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: enclosure led successfully set +# 1: enclosure leds not available +# 2: enclosure leds administratively disabled +# 3: The led sysfs path passed from ZFS does not exist +# 4: $ZPOOL not set +# 5: awk is not installed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + exit 1 +fi + +if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then + exit 2 +fi + +zed_check_cmd "$ZPOOL" || exit 4 +zed_check_cmd awk || exit 5 + +# Global used in set_led debug print +vdev="" + +# check_and_set_led (file, val) +# +# Read an enclosure sysfs file, and write it if it's not already set to 'val' +# +# Arguments +# file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault) +# val: value to set it to +# +# Return +# 0 on success, 3 on missing sysfs path +# +check_and_set_led() +{ + file="$1" + val="$2" + + if [ ! -e "$file" ] ; then + return 3 + fi + + # If another process is accessing the LED when we attempt to update it, + # the update will be lost so retry until the LED actually changes or we + # timeout. + for _ in $(seq 1 5); do + # We want to check the current state first, since writing to the + # 'fault' entry always causes a SES command, even if the + # current state is already what you want. + current=$(cat "${file}") + + # On some enclosures if you write 1 to fault, and read it back, + # it will return 2. Treat all non-zero values as 1 for + # simplicity. 
+ if [ "$current" != "0" ] ; then + current=1 + fi + + if [ "$current" != "$val" ] ; then + echo "$val" > "$file" + zed_log_msg "vdev $vdev set '$file' LED to $val" + else + break + fi + done +} + +state_to_val() +{ + state="$1" + if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \ + [ "$state" = "UNAVAIL" ] ; then + echo 1 + elif [ "$state" = "ONLINE" ] ; then + echo 0 + fi +} + +# process_pool ([pool]) +# +# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to +# the VDEV's state. +# +# Arguments +# pool: Optional pool name. If not specified, iterate though all pools. +# +# Return +# 0 on success, 3 on missing sysfs path +# +process_pool() +{ + pool="$1" + rc=0 + + # Lookup all the current LED values and paths in parallel + #shellcheck disable=SC2016 + cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",' + out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=') + + #shellcheck disable=SC2034 + echo "$out" | while read -r vdev state read write chksum therest; do + # Read out current LED value and path + tmp=$(echo "$therest" | sed 's/^.*led_token=//g') + vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}') + current_val=$(echo "$tmp" | awk -F ',' '{print $1}') + + if [ "$current_val" != "0" ] ; then + current_val=1 + fi + + if [ -z "$vdev_enc_sysfs_path" ] ; then + # Skip anything with no sysfs LED entries + continue + fi + + if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then + #shellcheck disable=SC2030 + rc=1 + zed_log_msg "vdev $vdev '$file/fault' doesn't exist" + continue; + fi + + val=$(state_to_val "$state") + + if [ "$current_val" = "$val" ] ; then + # LED is already set correctly + continue; + fi + + if ! 
check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then + rc=1 + fi + + done + + #shellcheck disable=SC2031 + if [ "$rc" = "0" ] ; then + return 0 + else + # We didn't see a sysfs entry that we wanted to set + return 3 + fi +} + +if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then + # Got a statechange for an individual VDEV + val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") + vdev=$(basename "$ZEVENT_VDEV_PATH") + check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val" +else + # Process the entire pool + poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") + process_pool "$poolname" +fi diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh new file mode 100755 index 000000000000..f46080a03239 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh @@ -0,0 +1,74 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. +# You may not use this file except in compliance with the license. +# +# CDDL HEADER END +# + +# +# Send notification in response to a fault induced statechange +# +# ZEVENT_SUBCLASS: 'statechange' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: statechange not relevant +# 4: statechange string missing (unexpected) + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4 + +if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + exit 3 +fi + +umask 077 +note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then + echo "The number of I/O errors associated with a ZFS device exceeded" + echo "acceptable levels. ZFS has marked the device as faulted." + elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then + echo "The number of checksum errors associated with a ZFS device" + echo "exceeded acceptable levels. ZFS has marked the device as" + echo "degraded." + else + echo "ZFS has detected that a device was removed." + fi + + echo + echo " impact: Fault tolerance of the pool may be compromised." + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " state: ${ZEVENT_VDEV_STATE_STR}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}" + + echo " pool: ${ZEVENT_POOL_GUID}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? + +rm -f "${note_pathname}" +exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh new file mode 100755 index 000000000000..5075302997e3 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Send notification in response to a TRIM_FINISH. 
The event +# will be received for each vdev in the pool which was trimmed. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +zed_check_cmd "${ZPOOL}" || exit 9 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a trim:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status -t "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh @@ -0,0 +1 @@ +statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh new file mode 100755 index 000000000000..44a9b8d23303 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh @@ -0,0 +1,538 @@ +#!/bin/sh +# shellcheck disable=SC2039 +# zed-functions.sh +# +# ZED helper functions for use in ZEDLETs + + +# Variable Defaults +# +: "${ZED_LOCKDIR:="/var/lock"}" +: "${ZED_NOTIFY_INTERVAL_SECS:=3600}" +: "${ZED_NOTIFY_VERBOSE:=0}" +: "${ZED_RUNDIR:="/var/run"}" +: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}" +: "${ZED_SYSLOG_TAG:="zed"}" + +ZED_FLOCK_FD=8 + + +# zed_check_cmd (cmd, ...) +# +# For each argument given, search PATH for the executable command [cmd]. +# Log a message if [cmd] is not found. +# +# Arguments +# cmd: name of executable command for which to search +# +# Return +# 0 if all commands are found in PATH and are executable +# n for a count of the command executables that are not found +# +zed_check_cmd() +{ + local cmd + local rv=0 + + for cmd; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + zed_log_err "\"${cmd}\" not installed" + rv=$((rv + 1)) + fi + done + return "${rv}" +} + + +# zed_log_msg (msg, ...) +# +# Write all argument strings to the system log. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# +# Return +# nothing +# +zed_log_msg() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@" +} + + +# zed_log_err (msg, ...) +# +# Write an error message to the system log. This message will contain the +# script name, EID, and all argument strings. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# ZEVENT_EID +# +# Return +# nothing +# +zed_log_err() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \ + "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@" +} + + +# zed_lock (lockfile, [fd]) +# +# Obtain an exclusive (write) lock on [lockfile]. 
If the lock cannot be +# immediately acquired, wait until it becomes available. +# +# Every zed_lock() must be paired with a corresponding zed_unlock(). +# +# By default, flock-style locks associate the lockfile with file descriptor 8. +# The bash manpage warns that file descriptors >9 should be used with care as +# they may conflict with file descriptors used internally by the shell. File +# descriptor 9 is reserved for zed_rate_limit(). If concurrent locks are held +# within the same process, they must use different file descriptors (preferably +# decrementing from 8); otherwise, obtaining a new lock with a given file +# descriptor will release the previous lock associated with that descriptor. +# +# Arguments +# lockfile: pathname of the lock file; the lock will be stored in +# ZED_LOCKDIR unless the pathname contains a "/". +# fd: integer for the file descriptor used by flock (OPTIONAL unless holding +# concurrent locks) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_lock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local umask_bak + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + umask_bak="$(umask)" + umask 077 + + # Obtain a lock on the file bound to the given file descriptor. + # + eval "exec ${fd}> '${lockfile}'" + err="$(flock --exclusive "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to lock \"${lockfile}\": ${err}" + fi + + umask "${umask_bak}" +} + + +# zed_unlock (lockfile, [fd]) +# +# Release the lock on [lockfile]. 
+# +# Arguments +# lockfile: pathname of the lock file +# fd: integer for the file descriptor used by flock (must match the file +# descriptor passed to the zed_lock function call) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_unlock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + # Release the lock and close the file descriptor. + err="$(flock --unlock "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to unlock \"${lockfile}\": ${err}" + fi + eval "exec ${fd}>&-" +} + + +# zed_notify (subject, pathname) +# +# Send a notification via all available methods. +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Return +# 0: notification succeeded via at least one method +# 1: notification failed +# 2: no notification methods configured +# +zed_notify() +{ + local subject="$1" + local pathname="$2" + local num_success=0 + local num_failure=0 + + zed_notify_email "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_pushbullet "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + [ "${num_success}" -gt 0 ] && return 0 + [ "${num_failure}" -gt 0 ] && return 1 + return 2 +} + + +# zed_notify_email (subject, pathname) +# +# Send a notification via email to the address specified by ZED_EMAIL_ADDR. 
+# +# Requires the mail executable to be installed in the standard PATH, or +# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of +# reading a message body from stdin. +# +# Command-line options to the mail executable can be specified in +# ZED_EMAIL_OPTS. This undergoes the following keyword substitutions: +# - @ADDRESS@ is replaced with the space-delimited recipient email address(es) +# - @SUBJECT@ is replaced with the notification subject +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_EMAIL_PROG +# ZED_EMAIL_OPTS +# ZED_EMAIL_ADDR +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_email() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + + : "${ZED_EMAIL_PROG:="mail"}" + : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}" + + # For backward compatibility with ZED_EMAIL. + if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then + ZED_EMAIL_ADDR="${ZED_EMAIL}" + fi + [ -n "${ZED_EMAIL_ADDR}" ] || return 2 + + zed_check_cmd "${ZED_EMAIL_PROG}" || return 1 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err \ + "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\"" + return 1 + fi + + ZED_EMAIL_OPTS="$(echo "${ZED_EMAIL_OPTS}" \ + | sed -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \ + -e "s/@SUBJECT@/${subject}/g")" + + # shellcheck disable=SC2086 + eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1 + rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}" + return 1 + fi + return 0 +} + + +# zed_notify_pushbullet (subject, pathname) +# +# Send a notification via Pushbullet <https://www.pushbullet.com/>. +# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the +# Pushbullet server. 
The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is +# for pushing to notification feeds that can be subscribed to; if a channel is +# not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://docs.pushbullet.com/ +# https://www.pushbullet.com/security +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_PUSHBULLET_ACCESS_TOKEN +# ZED_PUSHBULLET_CHANNEL_TAG +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_pushbullet() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="https://api.pushbullet.com/v2/pushes" + + [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "pushbullet cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Push to a channel if one is configured. + # + [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \ + '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")" + + # Construct the JSON message for pushing a note. + # + msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \ + "${msg_tag}" "${subject}" "${msg_body}")" + + # Send the POST request and check for errors. 
+ # + msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "pushbullet \"${msg_err}"\" + return 1 + fi + return 0 +} + + +# zed_notify_slack_webhook (subject, pathname) +# +# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>. +# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the +# Slack channel. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://api.slack.com/incoming-webhooks +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_SLACK_WEBHOOK_URL +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_slack_webhook() +{ + [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2 + + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="${ZED_SLACK_WEBHOOK_URL}" + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "slack webhook cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Construct the JSON message for posting. 
+ # + msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )" + + # Send the POST request and check for errors. + # + msg_out="$(curl -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "slack webhook \"${msg_err}"\" + return 1 + fi + return 0 +} + +# zed_rate_limit (tag, [interval]) +# +# Check whether an event of a given type [tag] has already occurred within the +# last [interval] seconds. +# +# This function obtains a lock on the statefile using file descriptor 9. +# +# Arguments +# tag: arbitrary string for grouping related events to rate-limit +# interval: time interval in seconds (OPTIONAL) +# +# Globals +# ZED_NOTIFY_INTERVAL_SECS +# ZED_RUNDIR +# +# Return +# 0 if the event should be processed +# 1 if the event should be dropped +# +# State File Format +# time;tag +# +zed_rate_limit() +{ + local tag="$1" + local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}" + local lockfile="zed.zedlet.state.lock" + local lockfile_fd=9 + local statefile="${ZED_RUNDIR}/zed.zedlet.state" + local time_now + local time_prev + local umask_bak + local rv=0 + + [ -n "${tag}" ] || return 0 + + zed_lock "${lockfile}" "${lockfile_fd}" + time_now="$(date +%s)" + time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + | tail -1 | cut -d\; -f1)" + + if [ -n "${time_prev}" ] \ + && [ "$((time_now - time_prev))" -lt "${interval}" ]; then + rv=1 + else + umask_bak="$(umask)" + umask 077 + grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + > "${statefile}.$$" + echo "${time_now};${tag}" >> "${statefile}.$$" + mv -f "${statefile}.$$" "${statefile}" + umask "${umask_bak}" + fi + + zed_unlock "${lockfile}" "${lockfile_fd}" + return "${rv}" +} + + +# zed_guid_to_pool (guid) 
+# +# Convert a pool GUID into its pool name (like "tank") +# Arguments +# guid: pool GUID (decimal or hex) +# +# Return +# Pool name +# +zed_guid_to_pool() +{ + if [ -z "$1" ] ; then + return + fi + + guid=$(printf "%llu" "$1") + if [ -n "$guid" ] ; then + $ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}' + fi +} + +# zed_exit_if_ignoring_this_event +# +# Exit the script if we should ignore this event, as determined by +# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc. +# This function assumes you've imported the normal zed variables. +zed_exit_if_ignoring_this_event() +{ + if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_INCLUDE});; + *) exit 0;; + esac" + elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;; + *);; + esac" + fi +} diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc new file mode 100644 index 000000000000..1b220d28db20 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc @@ -0,0 +1,122 @@ +## +# zed.rc +# +# This file should be owned by root and permissioned 0600. +## + +## +# Absolute path to the debug output file. +# +#ZED_DEBUG_LOG="/tmp/zed.debug.log" + +## +# Email address of the zpool administrator for receipt of notifications; +# multiple addresses can be specified if they are delimited by whitespace. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# Disabled by default; uncomment to enable. +# +#ZED_EMAIL_ADDR="root" + +## +# Name or path of executable responsible for sending notifications via email; +# the mail program must be capable of reading a message body from stdin. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_PROG="mail" + +## +# Command-line options for ZED_EMAIL_PROG. +# The string @ADDRESS@ will be replaced with the recipient email address(es). 
+# The string @SUBJECT@ will be replaced with the notification subject; +# this should be protected with quotes to prevent word-splitting. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@" + +## +# Default directory for zed lock files. +# +#ZED_LOCKDIR="/var/lock" + +## +# Minimum number of seconds between notifications for a similar event. +# +#ZED_NOTIFY_INTERVAL_SECS=3600 + +## +# Notification verbosity. +# If set to 0, suppress notification if the pool is healthy. +# If set to 1, send notification regardless of pool health. +# +#ZED_NOTIFY_VERBOSE=0 + +## +# Send notifications for 'ereport.fs.zfs.data' events. +# Disabled by default, any non-empty value will enable the feature. +# +#ZED_NOTIFY_DATA= + +## +# Pushbullet access token. +# This grants full access to your account -- protect it accordingly! +# <https://www.pushbullet.com/get-started> +# <https://www.pushbullet.com/account> +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_ACCESS_TOKEN="" + +## +# Pushbullet channel tag for push notification feeds that can be subscribed to. +# <https://www.pushbullet.com/my-channel> +# If not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_CHANNEL_TAG="" + +## +# Slack Webhook URL. +# This allows posting to the given channel and includes an access token. +# <https://api.slack.com/incoming-webhooks> +# Disabled by default; uncomment to enable. +# +#ZED_SLACK_WEBHOOK_URL="" + +## +# Default directory for zed state files. +# +#ZED_RUNDIR="/var/run" + +## +# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for +# device mapper and multipath devices as well. Your enclosure must be +# supported by the Linux SES driver for this to work. 
+# +ZED_USE_ENCLOSURE_LEDS=1 + +## +# Run a scrub after every resilver +# Disabled by default, 1 to enable and 0 to disable. +#ZED_SCRUB_AFTER_RESILVER=0 + +## +# The syslog priority (e.g., specified as a "facility.level" pair). +# +#ZED_SYSLOG_PRIORITY="daemon.notice" + +## +# The syslog tag for marking zed events. +# +#ZED_SYSLOG_TAG="zed" + +## +# Which set of event subclasses to log +# By default, events from all subclasses are logged. +# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses +# matching the pattern are logged. Use the pipe symbol (|) +# or shell wildcards (*, ?) to match multiple subclasses. +# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the +# matching subclasses are excluded from logging. +#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" +#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event" + diff --git a/sys/contrib/openzfs/cmd/zed/zed.h b/sys/contrib/openzfs/cmd/zed/zed.h new file mode 100644 index 000000000000..3ac0e63141e8 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.h @@ -0,0 +1,58 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_H +#define ZED_H + +/* + * Absolute path for the default zed configuration file. + */ +#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" + +/* + * Absolute path for the default zed pid file. 
+ */ +#define ZED_PID_FILE RUNSTATEDIR "/zed.pid" + +/* + * Absolute path for the default zed state file. + */ +#define ZED_STATE_FILE RUNSTATEDIR "/zed.state" + +/* + * Absolute path for the default zed zedlet directory. + */ +#define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d" + +/* + * Reserved for future use. + */ +#define ZED_MAX_EVENTS 0 + +/* + * Reserved for future use. + */ +#define ZED_MIN_EVENTS 0 + +/* + * String prefix for ZED variables passed via environment variables. + */ +#define ZED_VAR_PREFIX "ZED_" + +/* + * String prefix for ZFS event names passed via environment variables. + */ +#define ZEVENT_VAR_PREFIX "ZEVENT_" + +#endif /* !ZED_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.c b/sys/contrib/openzfs/cmd/zed/zed_conf.c new file mode 100644 index 000000000000..52370eb87b29 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_conf.c @@ -0,0 +1,735 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
+ */ + +#include <assert.h> +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <unistd.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +/* + * Return a new configuration with default values. + */ +struct zed_conf * +zed_conf_create(void) +{ + struct zed_conf *zcp; + + zcp = calloc(1, sizeof (*zcp)); + if (!zcp) + goto nomem; + + zcp->syslog_facility = LOG_DAEMON; + zcp->min_events = ZED_MIN_EVENTS; + zcp->max_events = ZED_MAX_EVENTS; + zcp->pid_fd = -1; + zcp->zedlets = NULL; /* created via zed_conf_scan_dir() */ + zcp->state_fd = -1; /* opened via zed_conf_open_state() */ + zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ + zcp->zevent_fd = -1; /* opened via zed_event_init() */ + + if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) + goto nomem; + + if (!(zcp->pid_file = strdup(ZED_PID_FILE))) + goto nomem; + + if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR))) + goto nomem; + + if (!(zcp->state_file = strdup(ZED_STATE_FILE))) + goto nomem; + + return (zcp); + +nomem: + zed_log_die("Failed to create conf: %s", strerror(errno)); + return (NULL); +} + +/* + * Destroy the configuration [zcp]. + * + * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini(). 
+ */ +void +zed_conf_destroy(struct zed_conf *zcp) +{ + if (!zcp) + return; + + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + zcp->state_fd = -1; + } + if (zcp->pid_file) { + if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT)) + zed_log_msg(LOG_WARNING, + "Failed to remove PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } + if (zcp->pid_fd >= 0) { + if (close(zcp->pid_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + zcp->pid_fd = -1; + } + if (zcp->conf_file) { + free(zcp->conf_file); + zcp->conf_file = NULL; + } + if (zcp->pid_file) { + free(zcp->pid_file); + zcp->pid_file = NULL; + } + if (zcp->zedlet_dir) { + free(zcp->zedlet_dir); + zcp->zedlet_dir = NULL; + } + if (zcp->state_file) { + free(zcp->state_file); + zcp->state_file = NULL; + } + if (zcp->zedlets) { + zed_strings_destroy(zcp->zedlets); + zcp->zedlets = NULL; + } + free(zcp); +} + +/* + * Display command-line help and exit. + * + * If [got_err] is 0, output to stdout and exit normally; + * otherwise, output to stderr and exit with a failure status. + */ +static void +_zed_conf_display_help(const char *prog, int got_err) +{ + FILE *fp = got_err ? stderr : stdout; + int w1 = 4; /* width of leading whitespace */ + int w2 = 8; /* width of L-justified option field */ + + fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? 
prog : "zed")); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", + "Display help."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", + "Display license information."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", + "Display version information."); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", + "Be verbose."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", + "Force daemon to run."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", + "Run daemon in the foreground."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I", + "Idle daemon until kernel module is (re)loaded."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", + "Lock all pages in memory."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P", + "$PATH for ZED to use (only used by ZTS)."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", + "Zero state file."); + fprintf(fp, "\n"); +#if 0 + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", + "Read configuration from FILE.", ZED_CONF_FILE); +#endif + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", + "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", + "Write daemon's PID to FILE.", ZED_PID_FILE); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", + "Write daemon's state to FILE.", ZED_STATE_FILE); + fprintf(fp, "\n"); + + exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS); +} + +/* + * Display license information to stdout and exit. 
+ */ +static void +_zed_conf_display_license(void) +{ + const char **pp; + const char *text[] = { + "The ZFS Event Daemon (ZED) is distributed under the terms of the", + " Common Development and Distribution License (CDDL-1.0)", + " <http://opensource.org/licenses/CDDL-1.0>.", + "", + "Developed at Lawrence Livermore National Laboratory" + " (LLNL-CODE-403049).", + "", + NULL + }; + + for (pp = text; *pp; pp++) + printf("%s\n", *pp); + + exit(EXIT_SUCCESS); +} + +/* + * Display version information to stdout and exit. + */ +static void +_zed_conf_display_version(void) +{ + printf("%s-%s-%s\n", + ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE); + + exit(EXIT_SUCCESS); +} + +/* + * Copy the [path] string to the [resultp] ptr. + * If [path] is not an absolute path, prefix it with the current working dir. + * If [resultp] is non-null, free its existing string before assignment. + */ +static void +_zed_conf_parse_path(char **resultp, const char *path) +{ + char buf[PATH_MAX]; + + assert(resultp != NULL); + assert(path != NULL); + + if (*resultp) + free(*resultp); + + if (path[0] == '/') { + *resultp = strdup(path); + } else if (!getcwd(buf, sizeof (buf))) { + zed_log_die("Failed to get current working dir: %s", + strerror(errno)); + } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else { + *resultp = strdup(buf); + } + if (!*resultp) + zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); +} + +/* + * Parse the command-line options into the configuration [zcp]. 
+ */ +void +zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) +{ + const char * const opts = ":hLVc:d:p:P:s:vfFMZI"; + int opt; + + if (!zcp || !argv || !argv[0]) + zed_log_die("Failed to parse options: Internal error"); + + opterr = 0; /* suppress default getopt err msgs */ + + while ((opt = getopt(argc, argv, opts)) != -1) { + switch (opt) { + case 'h': + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + break; + case 'L': + _zed_conf_display_license(); + break; + case 'V': + _zed_conf_display_version(); + break; + case 'c': + _zed_conf_parse_path(&zcp->conf_file, optarg); + break; + case 'd': + _zed_conf_parse_path(&zcp->zedlet_dir, optarg); + break; + case 'I': + zcp->do_idle = 1; + break; + case 'p': + _zed_conf_parse_path(&zcp->pid_file, optarg); + break; + case 'P': + _zed_conf_parse_path(&zcp->path, optarg); + break; + case 's': + _zed_conf_parse_path(&zcp->state_file, optarg); + break; + case 'v': + zcp->do_verbose = 1; + break; + case 'f': + zcp->do_force = 1; + break; + case 'F': + zcp->do_foreground = 1; + break; + case 'M': + zcp->do_memlock = 1; + break; + case 'Z': + zcp->do_zero = 1; + break; + case '?': + default: + if (optopt == '?') + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + + fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], + "Invalid option", optopt); + _zed_conf_display_help(argv[0], EXIT_FAILURE); + break; + } + } +} + +/* + * Parse the configuration file into the configuration [zcp]. + * + * FIXME: Not yet implemented. + */ +void +zed_conf_parse_file(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed to parse config: %s", strerror(EINVAL)); +} + +/* + * Scan the [zcp] zedlet_dir for files to exec based on the event class. + * Files must be executable by user, but not writable by group or other. + * Dotfiles are ignored. + * + * Return 0 on success with an updated set of zedlets, + * or -1 on error with errno set. + * + * FIXME: Check if zedlet_dir and all parent dirs are secure. 
+ */ +int +zed_conf_scan_dir(struct zed_conf *zcp) +{ + zed_strings_t *zedlets; + DIR *dirp; + struct dirent *direntp; + char pathname[PATH_MAX]; + struct stat st; + int n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s", + strerror(errno)); + return (-1); + } + zedlets = zed_strings_create(); + if (!zedlets) { + errno = ENOMEM; + zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + return (-1); + } + dirp = opendir(zcp->zedlet_dir); + if (!dirp) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + while ((direntp = readdir(dirp))) { + if (direntp->d_name[0] == '.') + continue; + + n = snprintf(pathname, sizeof (pathname), + "%s/%s", zcp->zedlet_dir, direntp->d_name); + if ((n < 0) || (n >= sizeof (pathname))) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + direntp->d_name, strerror(ENAMETOOLONG)); + continue; + } + if (stat(pathname, &st) < 0) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + pathname, strerror(errno)); + continue; + } + if (!S_ISREG(st.st_mode)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not a regular file", + direntp->d_name); + continue; + } + if ((st.st_uid != 0) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": not owned by root", + direntp->d_name); + continue; + } + if (!(st.st_mode & S_IXUSR)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not executable by user", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWGRP) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by group", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWOTH) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by other", + direntp->d_name); + continue; + } + if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) { + 
zed_log_msg(LOG_WARNING, + "Failed to register \"%s\": %s", + direntp->d_name, strerror(errno)); + continue; + } + if (zcp->do_verbose) + zed_log_msg(LOG_INFO, + "Registered zedlet \"%s\"", direntp->d_name); + } + if (closedir(dirp) < 0) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + if (zcp->zedlets) + zed_strings_destroy(zcp->zedlets); + + zcp->zedlets = zedlets; + return (0); +} + +/* + * Write the PID file specified in [zcp]. + * Return 0 on success, -1 on error. + * + * This must be called after fork()ing to become a daemon (so the correct PID + * is recorded), but before daemonization is complete and the parent process + * exits (for synchronization with systemd). + */ +int +zed_conf_write_pid(struct zed_conf *zcp) +{ + const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + char buf[PATH_MAX]; + int n; + char *p; + mode_t mask; + int rv; + + if (!zcp || !zcp->pid_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + return (-1); + } + assert(zcp->pid_fd == -1); + /* + * Create PID file directory if needed. + */ + n = strlcpy(buf, zcp->pid_file, sizeof (buf)); + if (n >= sizeof (buf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + goto err; + } + p = strrchr(buf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s", + buf, strerror(errno)); + goto err; + } + /* + * Obtain PID file lock. 
+ */ + mask = umask(0); + umask(mask | 022); + zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode); + umask(mask); + if (zcp->pid_fd < 0) { + zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } + rv = zed_file_lock(zcp->pid_fd); + if (rv < 0) { + zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } else if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->pid_fd); + if (pid < 0) { + zed_log_msg(LOG_ERR, + "Failed to test lock on PID file \"%s\"", + zcp->pid_file); + } else if (pid > 0) { + zed_log_msg(LOG_ERR, + "Found PID %d bound to PID file \"%s\"", + pid, zcp->pid_file); + } else { + zed_log_msg(LOG_ERR, + "Inconsistent lock state on PID file \"%s\"", + zcp->pid_file); + } + goto err; + } + /* + * Write PID file. + */ + n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid()); + if ((n < 0) || (n >= sizeof (buf))) { + errno = ERANGE; + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) { + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (fdatasync(zcp->pid_fd) < 0) { + zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else { + return (0); + } + +err: + if (zcp->pid_fd >= 0) { + (void) close(zcp->pid_fd); + zcp->pid_fd = -1; + } + return (-1); +} + +/* + * Open and lock the [zcp] state_file. + * Return 0 on success, -1 on error. + * + * FIXME: Move state information into kernel. 
+ */ +int +zed_conf_open_state(struct zed_conf *zcp) +{ + char dirbuf[PATH_MAX]; + mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + int n; + char *p; + int rv; + + if (!zcp || !zcp->state_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf)); + if (n >= sizeof (dirbuf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + p = strrchr(dirbuf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_WARNING, + "Failed to create directory \"%s\": %s", + dirbuf, strerror(errno)); + return (-1); + } + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + } + if (zcp->do_zero) + (void) unlink(zcp->state_file); + + zcp->state_fd = open(zcp->state_file, + (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); + if (zcp->state_fd < 0) { + zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + rv = zed_file_lock(zcp->state_fd); + if (rv < 0) { + zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->state_fd); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to test lock on state file \"%s\"", + zcp->state_file); + } else if (pid > 0) { + zed_log_msg(LOG_WARNING, + "Found PID %d bound to state file \"%s\"", + pid, zcp->state_file); + } else { + zed_log_msg(LOG_WARNING, + "Inconsistent lock state on state file \"%s\"", + zcp->state_file); + } + return (-1); + } + return (0); +} + +/* + * Read the opened [zcp] state_file to obtain the eid & etime of the last event + * processed. 
Write the state from the last event to the [eidp] & [etime] args + * passed by reference. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. + */ +int +zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp || !eidp || !etime) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to read state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = eidp; + len += iov[0].iov_len = sizeof (*eidp); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = readv(zcp->state_fd, iov, 3); + if (n == 0) { + *eidp = 0; + } else if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } else if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": Read %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + return (0); +} + +/* + * Write the [eid] & [etime] of the last processed event to the opened + * [zcp] state_file. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. 
+ */ +int +zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to write state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = &eid; + len += iov[0].iov_len = sizeof (eid); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = writev(zcp->state_fd, iov, 3); + if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": Wrote %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + if (fdatasync(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to sync state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.h b/sys/contrib/openzfs/cmd/zed/zed_conf.h new file mode 100644 index 000000000000..424cb2c01c8c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_conf.h @@ -0,0 +1,62 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). 
+ * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_CONF_H +#define ZED_CONF_H + +#include <libzfs.h> +#include <stdint.h> +#include "zed_strings.h" + +struct zed_conf { + unsigned do_force:1; /* true if force enabled */ + unsigned do_foreground:1; /* true if run in foreground */ + unsigned do_memlock:1; /* true if locking memory */ + unsigned do_verbose:1; /* true if verbosity enabled */ + unsigned do_zero:1; /* true if zeroing state */ + unsigned do_idle:1; /* true if idle enabled */ + int syslog_facility; /* syslog facility value */ + int min_events; /* RESERVED FOR FUTURE USE */ + int max_events; /* RESERVED FOR FUTURE USE */ + char *conf_file; /* abs path to config file */ + char *pid_file; /* abs path to pid file */ + int pid_fd; /* fd to pid file for lock */ + char *zedlet_dir; /* abs path to zedlet dir */ + zed_strings_t *zedlets; /* names of enabled zedlets */ + char *state_file; /* abs path to state file */ + int state_fd; /* fd to state file */ + libzfs_handle_t *zfs_hdl; /* handle to libzfs */ + int zevent_fd; /* fd for access to zevents */ + char *path; /* custom $PATH for zedlets to use */ +}; + +struct zed_conf *zed_conf_create(void); + +void zed_conf_destroy(struct zed_conf *zcp); + +void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); + +void zed_conf_parse_file(struct zed_conf *zcp); + +int zed_conf_scan_dir(struct zed_conf *zcp); + +int zed_conf_write_pid(struct zed_conf *zcp); + +int zed_conf_open_state(struct zed_conf *zcp); + +int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); + +int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); + +#endif /* !ZED_CONF_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.c b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c new file mode 100644 index 
000000000000..174d24523253 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017, Intel Corporation. + */ + +#ifdef HAVE_LIBUDEV + +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libudev.h> +#include <libzfs.h> +#include <libzutil.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> + +#include "zed_log.h" +#include "zed_disk_event.h" +#include "agents/zfs_agents.h" + +/* + * Portions of ZED need to see disk events for disks belonging to ZFS pools. + * A libudev monitor is established to monitor block device actions and pass + * them on to internal ZED logic modules. Initially, zfs_mod.c is the only + * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM + * module responsible for handling disk events for ZFS. 
+ */ + +pthread_t g_mon_tid; +struct udev *g_udev; +struct udev_monitor *g_mon; + + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* 64MB is minimum usable disk for ZFS */ +#define MINIMUM_SECTORS 131072 + + +/* + * Post disk event to SLM module + * + * occurs in the context of monitor thread + */ +static void +zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + char *strval; + uint64_t numval; + + zed_log_msg(LOG_INFO, "zed_disk_event:"); + zed_log_msg(LOG_INFO, "\tclass: %s", class); + zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); + if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); + if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); + if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); + + (void) zfs_agent_post_event(class, subclass, nvl); +} + +/* + * dev_event_nvlist: place event schema into an nv pair list + * + * NAME VALUE (example) + * -------------- -------------------------------------------------------- + * DEV_NAME /dev/sdl + * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... 
+ * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC + * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 + * DEV_IS_PART --- + * DEV_SIZE 500107862016 + * ZFS_EV_POOL_GUID 17523635698032189180 + * ZFS_EV_VDEV_GUID 14663607734290803088 + */ +static nvlist_t * +dev_event_nvlist(struct udev_device *dev) +{ + nvlist_t *nvl; + char strval[128]; + const char *value, *path; + uint64_t guid; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); + if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); + if ((path = udev_device_get_devnode(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_NAME, path); + if ((value = udev_device_get_devpath(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_PATH, value); + value = udev_device_get_devtype(dev); + if ((value != NULL && strcmp("partition", value) == 0) || + (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") + != NULL)) { + (void) nvlist_add_boolean(nvl, DEV_IS_PART); + } + if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + } + + /* + * Grab the pool and vdev guids from blkid cache + */ + value = udev_device_get_property_value(dev, "ID_FS_UUID"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); + + value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); + + /* + * Either a vdev guid or a devid must be present for matching + */ + if (!nvlist_exists(nvl, DEV_IDENTIFIER) && + !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { + nvlist_free(nvl); + return (NULL); + } + + 
return (nvl); +} + +/* + * Listen for block device uevents + */ +static void * +zed_udev_monitor(void *arg) +{ + struct udev_monitor *mon = arg; + char *tmp, *tmp2; + + zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); + + while (1) { + struct udev_device *dev; + const char *action, *type, *part, *sectors; + const char *bus, *uuid; + const char *class, *subclass; + nvlist_t *nvl; + boolean_t is_zfs = B_FALSE; + + /* allow a cancellation while blocked (recvmsg) */ + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + /* blocks at recvmsg until an event occurs */ + if ((dev = udev_monitor_receive_device(mon)) == NULL) { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " + "device error %d", errno); + continue; + } + + /* allow all steps to complete before a cancellation */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* + * Strongly typed device is the preferred filter + */ + type = udev_device_get_property_value(dev, "ID_FS_TYPE"); + if (type != NULL && type[0] != '\0') { + if (strcmp(type, "zfs_member") == 0) { + is_zfs = B_TRUE; + } else { + /* not ours, so skip */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " + "%s (in use by %s)", + udev_device_get_devnode(dev), type); + udev_device_unref(dev); + continue; + } + } + + /* + * if this is a disk and it is partitioned, then the + * zfs label will reside in a DEVTYPE=partition and + * we can skip passing this event + */ + type = udev_device_get_property_value(dev, "DEVTYPE"); + part = udev_device_get_property_value(dev, + "ID_PART_TABLE_TYPE"); + if (type != NULL && type[0] != '\0' && + strcmp(type, "disk") == 0 && + part != NULL && part[0] != '\0') { + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } + + /* + * ignore small partitions + */ + sectors = udev_device_get_property_value(dev, + "ID_PART_ENTRY_SIZE"); + if (sectors == NULL) + sectors = udev_device_get_sysattr_value(dev, "size"); + if (sectors != NULL && + strtoull(sectors, NULL, 10) 
< MINIMUM_SECTORS) { + udev_device_unref(dev); + continue; + } + + /* + * If the blkid probe didn't find ZFS, then a persistent + * device id string is required in the message schema + * for matching with vdevs. Preflight here for expected + * udev information. + */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (!is_zfs && (bus == NULL && uuid == NULL)) { + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " + "source", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + action = udev_device_get_action(dev); + if (strcmp(action, "add") == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else if (strcmp(action, "remove") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } else if (strcmp(action, "change") == 0) { + class = EC_DEV_STATUS; + subclass = ESC_DEV_DLE; + } else { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", + action); + udev_device_unref(dev); + continue; + } + + /* + * Special case an EC_DEV_ADD for multipath devices + * + * When a multipath device is created, udev reports the + * following: + * + * 1. "add" event of the dm device for the multipath device + * (like /dev/dm-3). + * 2. "change" event to create the actual multipath device + * symlink (like /dev/mapper/mpatha). The event also + * passes back the relevant DM vars we care about, like + * DM_UUID. + * 3. Another "change" event identical to #2 (that we ignore). + * + * To get the behavior we want, we treat the "change" event + * in #2 as a "add" event; as if "/dev/mapper/mpatha" was + * a new disk being added. 
+ */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "DM_UUID") && + udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { + tmp = (char *)udev_device_get_devnode(dev); + tmp2 = zfs_get_underlying_path(tmp); + if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { + /* + * We have a real underlying device, which + * means that this multipath "change" event is + * an "add" event. + * + * If the multipath device and the underlying + * dev are the same name (i.e. /dev/dm-5), then + * there is no real underlying disk for this + * multipath device, and so this "change" event + * really is a multipath removal. + */ + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else { + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } + } + free(tmp2); + } + + /* + * Special case an EC_DEV_ADD for scsi_debug devices + * + * These devices require a udevadm trigger command after + * creation in order to register the vdev_id scsidebug alias + * rule (adds a persistent path (phys_path) used for fault + * management automated tests in the ZFS test suite. + * + * After udevadm trigger command, event registers as a "change" + * event but needs to instead be handled as another "add" event + * to allow for disk labeling and partitioning to occur. 
+ */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "ID_VDEV") && + udev_device_get_property_value(dev, "ID_MODEL")) { + const char *id_model, *id_model_sd = "scsi_debug"; + + id_model = udev_device_get_property_value(dev, + "ID_MODEL"); + if (strcmp(id_model, id_model_sd) == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } + } + + if ((nvl = dev_event_nvlist(dev)) != NULL) { + zed_udev_event(class, subclass, nvl); + nvlist_free(nvl); + } + + udev_device_unref(dev); + } + + return (NULL); +} + +int +zed_disk_event_init() +{ + int fd, fflags; + + if ((g_udev = udev_new()) == NULL) { + zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); + return (-1); + } + + /* Set up a udev monitor for block devices */ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", + "partition"); + udev_monitor_enable_receiving(g_mon); + + /* Make sure monitoring socket is blocking */ + fd = udev_monitor_get_fd(g_mon); + if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) + (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); + + /* spawn a thread to monitor events */ + if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { + udev_monitor_unref(g_mon); + udev_unref(g_udev); + zed_log_msg(LOG_WARNING, "pthread_create failed"); + return (-1); + } + + zed_log_msg(LOG_INFO, "zed_disk_event_init"); + + return (0); +} + +void +zed_disk_event_fini() +{ + /* cancel monitor thread at recvmsg() */ + (void) pthread_cancel(g_mon_tid); + (void) pthread_join(g_mon_tid, NULL); + + /* cleanup udev resources */ + udev_monitor_unref(g_mon); + udev_unref(g_udev); + + zed_log_msg(LOG_INFO, "zed_disk_event_fini"); +} + +#else + +#include "zed_disk_event.h" + +int +zed_disk_event_init() +{ + return (0); +} + +void +zed_disk_event_fini() +{ +} + +#endif /* HAVE_LIBUDEV */ diff --git 
a/sys/contrib/openzfs/cmd/zed/zed_disk_event.h b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h new file mode 100644 index 000000000000..ea9813d0a595 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef ZED_DISK_EVENT_H +#define ZED_DISK_EVENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zed_disk_event_init(void); +extern void zed_disk_event_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZED_DISK_EVENT_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c new file mode 100644 index 000000000000..1c5d00e297ff --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_event.c @@ -0,0 +1,965 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <libzfs.h> /* FIXME: Replace with libzfs_core. 
*/ +#include <paths.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/zfs_ioctl.h> +#include <time.h> +#include <unistd.h> +#include <sys/fm/fs/zfs.h> +#include "zed.h" +#include "zed_conf.h" +#include "zed_disk_event.h" +#include "zed_event.h" +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#include "agents/zfs_agents.h" + +#define MAXBUF 4096 + +/* + * Open the libzfs interface. + */ +int +zed_event_init(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); + + zcp->zfs_hdl = libzfs_init(); + if (!zcp->zfs_hdl) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize libzfs"); + } + + zcp->zevent_fd = open(ZFS_DEV, O_RDWR); + if (zcp->zevent_fd < 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to open \"%s\": %s", + ZFS_DEV, strerror(errno)); + } + + zfs_agent_init(zcp->zfs_hdl); + + if (zed_disk_event_init() != 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize disk events"); + } + + return (0); +} + +/* + * Close the libzfs interface. + */ +void +zed_event_fini(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); + + zed_disk_event_fini(); + zfs_agent_fini(); + + if (zcp->zevent_fd >= 0) { + if (close(zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", + ZFS_DEV, strerror(errno)); + + zcp->zevent_fd = -1; + } + if (zcp->zfs_hdl) { + libzfs_fini(zcp->zfs_hdl); + zcp->zfs_hdl = NULL; + } +} + +/* + * Seek to the event specified by [saved_eid] and [saved_etime]. + * This protects against processing a given event more than once. + * Return 0 upon a successful seek to the specified event, or -1 otherwise. + * + * A zevent is considered to be uniquely specified by its (eid,time) tuple. + * The unsigned 64b eid is set to 1 when the kernel module is loaded, and + * incremented by 1 for each new event. 
Since the state file can persist + * across a kernel module reload, the time must be checked to ensure a match. + */ +int +zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) +{ + uint64_t eid; + int found; + nvlist_t *nvl; + int n_dropped; + int64_t *etime; + uint_t nelem; + int rv; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to seek zevent: %s", + strerror(errno)); + return (-1); + } + eid = 0; + found = 0; + while ((eid < saved_eid) && !found) { + rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, + ZEVENT_NONBLOCK, zcp->zevent_fd); + + if ((rv != 0) || !nvl) + break; + + if (n_dropped > 0) { + zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); + /* + * FIXME: Increase max size of event nvlist in + * /sys/module/zfs/parameters/zfs_zevent_len_max ? + */ + } + if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { + zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); + } else if (nvlist_lookup_int64_array(nvl, "time", + &etime, &nelem) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu)", eid); + } else if (nelem != 2) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu, nelem=%u)", + eid, nelem); + } else if ((eid != saved_eid) || + (etime[0] != saved_etime[0]) || + (etime[1] != saved_etime[1])) { + /* no-op */ + } else { + found = 1; + } + free(nvl); + } + if (!found && (saved_eid > 0)) { + if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START, + zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to seek to eid=0"); + else + eid = 0; + } + zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid); + return (found ? 0 : -1); +} + +/* + * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0. 
+ */ +static int +_zed_event_value_is_hex(const char *name) +{ + const char *hex_suffix[] = { + "_guid", + "_guids", + NULL + }; + const char **pp; + char *p; + + if (!name) + return (0); + + for (pp = hex_suffix; *pp; pp++) { + p = strstr(name, *pp); + if (p && strlen(p) == strlen(*pp)) + return (1); + } + return (0); +} + +/* + * Add an environment variable for [eid] to the container [zsp]. + * + * The variable name is the concatenation of [prefix] and [name] converted to + * uppercase with non-alphanumeric characters converted to underscores; + * [prefix] is optional, and [name] must begin with an alphabetic character. + * If the converted variable name already exists within the container [zsp], + * its existing value will be replaced with the new value. + * + * The variable value is specified by the format string [fmt]. + * + * Returns 0 on success, and -1 on error (with errno set). + * + * All environment variables in [zsp] should be added through this function. + */ +static int +_zed_event_add_var(uint64_t eid, zed_strings_t *zsp, + const char *prefix, const char *name, const char *fmt, ...) +{ + char keybuf[MAXBUF]; + char valbuf[MAXBUF]; + char *dstp; + const char *srcp; + const char *lastp; + int n; + int buflen; + va_list vargs; + + assert(zsp != NULL); + assert(fmt != NULL); + + if (!name) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name is empty", eid); + return (-1); + } else if (!isalpha(name[0])) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: " + "Name \"%s\" is invalid", eid, name); + return (-1); + } + /* + * Construct the string key by converting PREFIX (if present) and NAME. + */ + dstp = keybuf; + lastp = keybuf + sizeof (keybuf); + if (prefix) { + for (srcp = prefix; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_'; + } + for (srcp = name; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? 
toupper(*srcp) : '_'; + + if (dstp == lastp) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name too long", eid); + return (-1); + } + *dstp = '\0'; + /* + * Construct the string specified by "[PREFIX][NAME]=[FMT]". + */ + dstp = valbuf; + buflen = sizeof (valbuf); + n = strlcpy(dstp, keybuf, buflen); + if (n >= sizeof (valbuf)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + dstp += n; + buflen -= n; + + *dstp++ = '='; + buflen--; + + if (buflen <= 0) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + + va_start(vargs, fmt); + n = vsnprintf(dstp, buflen, fmt, vargs); + va_end(vargs); + + if ((n < 0) || (n >= buflen)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } else if (zed_strings_add(zsp, keybuf, valbuf) < 0) { + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, strerror(errno)); + return (-1); + } + return (0); +} + +static int +_zed_event_add_array_err(uint64_t eid, const char *name) +{ + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Exceeded buffer size", name, eid); + return (-1); +} + +static int +_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int8_t *i8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int8_array(nvp, &i8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i8p[i]); + if ((n < 0) || (n >= buflen)) + return 
(_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint8_t *u8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint8_array(nvp, &u8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u8p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int16_t *i16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int16_array(nvp, &i16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint16_t *u16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY)); + + name = nvpair_name(nvp); + (void) 
nvpair_value_uint16_array(nvp, &u16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int32_t *i32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int32_array(nvp, &i32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint32_t *u32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint32_array(nvp, &u32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + 
int64_t *i64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int64_array(nvp, &i64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%lld ", (u_longlong_t)i64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + const char *fmt; + uint64_t *u64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY)); + + name = nvpair_name(nvp); + fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu "; + (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + char **strp; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_string_array(nvp, &strp, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%s ", strp[i] ? 
strp[i] : "<NULL>"); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +/* + * Convert the nvpair [nvp] to a string which is added to the environment + * of the child process. + * Return 0 on success, -1 on error. + * + * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? + */ +static void +_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) +{ + const char *name; + data_type_t type; + const char *prefix = ZEVENT_VAR_PREFIX; + boolean_t b; + double d; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + + assert(zsp != NULL); + assert(nvp != NULL); + + name = nvpair_name(nvp); + type = nvpair_type(nvp); + + switch (type) { + case DATA_TYPE_BOOLEAN: + _zed_event_add_var(eid, zsp, prefix, name, "%s", "1"); + break; + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + _zed_event_add_var(eid, zsp, prefix, name, "%s", b ? 
"1" : "0"); + break; + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (int8_t *)&i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i8); + break; + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (int16_t *)&i16); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i16); + break; + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i16); + break; + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (int32_t *)&i32); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i32); + break; + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i32); + break; + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (int64_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%lld", (longlong_t)i64); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + _zed_event_add_var(eid, zsp, prefix, name, + (_zed_event_value_is_hex(name) ? 
"0x%.16llX" : "%llu"), + (u_longlong_t)i64); + /* + * shadow readable strings for vdev state pairs + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_state_to_name(i64, VDEV_AUX_NONE)); + } else + /* + * shadow readable strings for pool state + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_pool_state_to_name(i64)); + } + break; + case DATA_TYPE_DOUBLE: + (void) nvpair_value_double(nvp, &d); + _zed_event_add_var(eid, zsp, prefix, name, "%g", d); + break; + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%llu", (u_longlong_t)i64); + break; + case DATA_TYPE_NVLIST: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + _zed_event_add_var(eid, zsp, prefix, name, + "%s", (str ? 
str : "<NULL>")); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_BYTE_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_INT8_ARRAY: + _zed_event_add_int8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT8_ARRAY: + _zed_event_add_uint8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT16_ARRAY: + _zed_event_add_int16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT16_ARRAY: + _zed_event_add_uint16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT32_ARRAY: + _zed_event_add_int32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT32_ARRAY: + _zed_event_add_uint32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT64_ARRAY: + _zed_event_add_int64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT64_ARRAY: + _zed_event_add_uint64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_STRING_ARRAY: + _zed_event_add_string_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_NVLIST_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + default: + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Unrecognized type=%u", name, eid, (unsigned int) type); + break; + } +} + +/* + * Restrict various environment variables to safe and sane values + * when constructing the environment for the child process, unless + * we're running with a custom $PATH (like under the ZFS test suite). + * + * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. 
+ */ +static void +_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp, + const char *path) +{ + const char *env_restrict[][2] = { + { "IFS", " \t\n" }, + { "PATH", _PATH_STDPATH }, + { "ZDB", SBINDIR "/zdb" }, + { "ZED", SBINDIR "/zed" }, + { "ZFS", SBINDIR "/zfs" }, + { "ZINJECT", SBINDIR "/zinject" }, + { "ZPOOL", SBINDIR "/zpool" }, + { "ZFS_ALIAS", ZFS_META_ALIAS }, + { "ZFS_VERSION", ZFS_META_VERSION }, + { "ZFS_RELEASE", ZFS_META_RELEASE }, + { NULL, NULL } + }; + + /* + * If we have a custom $PATH, use the default ZFS binary locations + * instead of the hard-coded ones. + */ + const char *env_path[][2] = { + { "IFS", " \t\n" }, + { "PATH", NULL }, /* $PATH copied in later on */ + { "ZDB", "zdb" }, + { "ZED", "zed" }, + { "ZFS", "zfs" }, + { "ZINJECT", "zinject" }, + { "ZPOOL", "zpool" }, + { "ZFS_ALIAS", ZFS_META_ALIAS }, + { "ZFS_VERSION", ZFS_META_VERSION }, + { "ZFS_RELEASE", ZFS_META_RELEASE }, + { NULL, NULL } + }; + const char *(*pa)[2]; + + assert(zsp != NULL); + + pa = path != NULL ? env_path : env_restrict; + + for (; *(*pa); pa++) { + /* Use our custom $PATH if we have one */ + if (path != NULL && strcmp((*pa)[0], "PATH") == 0) + (*pa)[1] = path; + + _zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]); + } +} + +/* + * Preserve specified variables from the parent environment + * when constructing the environment for the child process. + * + * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. + */ +static void +_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp) +{ + const char *env_preserve[] = { + "TZ", + NULL + }; + const char **keyp; + const char *val; + + assert(zsp != NULL); + + for (keyp = env_preserve; *keyp; keyp++) { + if ((val = getenv(*keyp))) + _zed_event_add_var(eid, zsp, NULL, *keyp, "%s", val); + } +} + +/* + * Compute the "subclass" by removing the first 3 components of [class] + * (which will always be of the form "*.fs.zfs"). 
Return a pointer inside + * the string [class], or NULL if insufficient components exist. + */ +static const char * +_zed_event_get_subclass(const char *class) +{ + const char *p; + int i; + + if (!class) + return (NULL); + + p = class; + for (i = 0; i < 3; i++) { + p = strchr(p, '.'); + if (!p) + break; + p++; + } + return (p); +} + +/* + * Convert the zevent time from a 2-element array of 64b integers + * into a more convenient form: + * - TIME_SECS is the second component of the time. + * - TIME_NSECS is the nanosecond component of the time. + * - TIME_STRING is an almost-RFC3339-compliant string representation. + */ +static void +_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) +{ + struct tm *stp; + char buf[32]; + + assert(zsp != NULL); + assert(etime != NULL); + + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS", + "%lld", (long long int) etime[0]); + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS", + "%lld", (long long int) etime[1]); + + if (!(stp = localtime((const time_t *) &etime[0]))) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error"); + } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error"); + } else { + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING", + "%s", buf); + } +} + +/* + * Service the next zevent, blocking until one is available. 
+ */ +int +zed_event_service(struct zed_conf *zcp) +{ + nvlist_t *nvl; + nvpair_t *nvp; + int n_dropped; + zed_strings_t *zsp; + uint64_t eid; + int64_t *etime; + uint_t nelem; + char *class; + const char *subclass; + int rv; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to service zevent: %s", + strerror(errno)); + return (EINVAL); + } + rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE, + zcp->zevent_fd); + + if ((rv != 0) || !nvl) + return (errno); + + if (n_dropped > 0) { + zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); + /* + * FIXME: Increase max size of event nvlist in + * /sys/module/zfs/parameters/zfs_zevent_len_max ? + */ + } + if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { + zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); + } else if (nvlist_lookup_int64_array( + nvl, "time", &etime, &nelem) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu)", eid); + } else if (nelem != 2) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu, nelem=%u)", + eid, nelem); + } else if (nvlist_lookup_string(nvl, "class", &class) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent class (eid=%llu)", eid); + } else { + /* let internal modules see this event first */ + zfs_agent_post_event(class, NULL, nvl); + + zsp = zed_strings_create(); + + nvp = NULL; + while ((nvp = nvlist_next_nvpair(nvl, nvp))) + _zed_event_add_nvpair(eid, zsp, nvp); + + _zed_event_add_env_restrict(eid, zsp, zcp->path); + _zed_event_add_env_preserve(eid, zsp); + + _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID", + "%d", (int)getpid()); + _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR", + "%s", zcp->zedlet_dir); + subclass = _zed_event_get_subclass(class); + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS", + "%s", (subclass ? 
subclass : class)); + + _zed_event_add_time_strings(eid, zsp, etime); + + zed_exec_process(eid, class, subclass, + zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd); + + zed_conf_write_state(zcp, eid, etime); + + zed_strings_destroy(zsp); + } + nvlist_free(nvl); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.h b/sys/contrib/openzfs/cmd/zed/zed_event.h new file mode 100644 index 000000000000..c1455c3a0629 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_event.h @@ -0,0 +1,29 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_EVENT_H +#define ZED_EVENT_H + +#include <stdint.h> + +int zed_event_init(struct zed_conf *zcp); + +void zed_event_fini(struct zed_conf *zcp); + +int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, + int64_t saved_etime[]); + +int zed_event_service(struct zed_conf *zcp); + +#endif /* !ZED_EVENT_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.c b/sys/contrib/openzfs/cmd/zed/zed_exec.c new file mode 100644 index 000000000000..08b7b5568362 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_exec.c @@ -0,0 +1,232 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
+ * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <time.h> +#include <unistd.h> +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#define ZEVENT_FILENO 3 + +/* + * Create an environment string array for passing to execve() using the + * NAME=VALUE strings in container [zsp]. + * Return a newly-allocated environment, or NULL on error. + */ +static char ** +_zed_exec_create_env(zed_strings_t *zsp) +{ + int num_ptrs; + int buflen; + char *buf; + char **pp; + char *p; + const char *q; + int i; + int len; + + num_ptrs = zed_strings_count(zsp) + 1; + buflen = num_ptrs * sizeof (char *); + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) + buflen += strlen(q) + 1; + + buf = calloc(1, buflen); + if (!buf) + return (NULL); + + pp = (char **)buf; + p = buf + (num_ptrs * sizeof (char *)); + i = 0; + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { + pp[i] = p; + len = strlen(q) + 1; + memcpy(p, q, len); + p += len; + i++; + } + pp[i] = NULL; + assert(buf + buflen == p); + return ((char **)buf); +} + +/* + * Fork a child process to handle event [eid]. The program [prog] + * in directory [dir] is executed with the environment [env]. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. 
+ */ +static void +_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, + char *env[], int zfd) +{ + char path[PATH_MAX]; + int n; + pid_t pid; + int fd; + pid_t wpid; + int status; + + assert(dir != NULL); + assert(prog != NULL); + assert(env != NULL); + assert(zfd >= 0); + + n = snprintf(path, sizeof (path), "%s/%s", dir, prog); + if ((n < 0) || (n >= sizeof (path))) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(ENAMETOOLONG)); + return; + } + pid = fork(); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(errno)); + return; + } else if (pid == 0) { + (void) umask(022); + if ((fd = open("/dev/null", O_RDWR)) != -1) { + (void) dup2(fd, STDIN_FILENO); + (void) dup2(fd, STDOUT_FILENO); + (void) dup2(fd, STDERR_FILENO); + } + (void) dup2(zfd, ZEVENT_FILENO); + zed_file_close_from(ZEVENT_FILENO + 1); + execle(path, prog, NULL, env); + _exit(127); + } + + /* parent process */ + + zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", + prog, eid, pid); + + /* FIXME: Timeout rogue child processes with sigalarm? */ + + /* + * Wait for child process using WNOHANG to limit + * the time spent waiting to 10 seconds (10,000ms). 
+ */ + for (n = 0; n < 1000; n++) { + wpid = waitpid(pid, &status, WNOHANG); + if (wpid == (pid_t)-1) { + if (errno == EINTR) + continue; + zed_log_msg(LOG_WARNING, + "Failed to wait for \"%s\" eid=%llu pid=%d", + prog, eid, pid); + break; + } else if (wpid == 0) { + struct timespec t; + + /* child still running */ + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + (void) nanosleep(&t, NULL); + continue; + } + + if (WIFEXITED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d exit=%d", + prog, eid, pid, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", + prog, eid, pid, WTERMSIG(status), + strsignal(WTERMSIG(status))); + } else { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d status=0x%X", + prog, eid, (unsigned int) status); + } + break; + } + + /* + * kill child process after 10 seconds + */ + if (wpid == 0) { + zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", + prog, pid); + (void) kill(pid, SIGKILL); + } +} + +/* + * Process the event [eid] by synchronously invoking all zedlets with a + * matching class prefix. + * + * Each executable in [zedlets] from the directory [dir] is matched against + * the event's [class], [subclass], and the "all" class (which matches + * all events). Every zedlet with a matching class prefix is invoked. + * The NAME=VALUE strings in [envs] will be passed to the zedlet as + * environment variables. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. + * + * Return 0 on success, -1 on error. 
+ */ +int +zed_exec_process(uint64_t eid, const char *class, const char *subclass, + const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd) +{ + const char *class_strings[4]; + const char *allclass = "all"; + const char **csp; + const char *z; + char **e; + int n; + + if (!dir || !zedlets || !envs || zfd < 0) + return (-1); + + csp = class_strings; + + if (class) + *csp++ = class; + + if (subclass) + *csp++ = subclass; + + if (allclass) + *csp++ = allclass; + + *csp = NULL; + + e = _zed_exec_create_env(envs); + + for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) { + for (csp = class_strings; *csp; csp++) { + n = strlen(*csp); + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + _zed_exec_fork_child(eid, dir, z, e, zfd); + } + } + free(e); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.h b/sys/contrib/openzfs/cmd/zed/zed_exec.h new file mode 100644 index 000000000000..4153e5519a46 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_exec.h @@ -0,0 +1,25 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
+ */ + +#ifndef ZED_EXEC_H +#define ZED_EXEC_H + +#include <stdint.h> +#include "zed_strings.h" + +int zed_exec_process(uint64_t eid, const char *class, const char *subclass, + const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, + int zevent_fd); + +#endif /* !ZED_EXEC_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.c b/sys/contrib/openzfs/cmd/zed/zed_file.c new file mode 100644 index 000000000000..c3cf3d421c6f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_file.c @@ -0,0 +1,217 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <string.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include "zed_file.h" +#include "zed_log.h" + +/* + * Read up to [n] bytes from [fd] into [buf]. + * Return the number of bytes read, 0 on EOF, or -1 on error. + */ +ssize_t +zed_file_read_n(int fd, void *buf, size_t n) +{ + unsigned char *p; + size_t n_left; + ssize_t n_read; + + p = buf; + n_left = n; + while (n_left > 0) { + if ((n_read = read(fd, p, n_left)) < 0) { + if (errno == EINTR) + continue; + else + return (-1); + + } else if (n_read == 0) { + break; + } + n_left -= n_read; + p += n_read; + } + return (n - n_left); +} + +/* + * Write [n] bytes from [buf] out to [fd]. 
/*
 * Write [n] bytes from [buf] out to [fd], resuming after short writes and
 * retrying writes that fail with EINTR.
 * Return the number of bytes written (always [n]), or -1 on error.
 */
ssize_t
zed_file_write_n(int fd, void *buf, size_t n)
{
	const unsigned char *cursor = buf;
	size_t remaining = n;

	while (remaining > 0) {
		ssize_t rc = write(fd, cursor, remaining);

		if (rc >= 0) {
			/* Short write: step past the bytes already accepted. */
			cursor += rc;
			remaining -= rc;
			continue;
		}
		/* EINTR: issue the same write again; anything else is fatal. */
		if (errno != EINTR)
			return (-1);
	}
	return (n);
}

/*
 * Set an exclusive advisory lock on the open file descriptor [fd].
 * Return 0 on success, 1 if a conflicting lock is held by another process,
 * or -1 on error (with errno set).
 */
int
zed_file_lock(int fd)
{
	struct flock request;

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	request.l_type = F_WRLCK;
	request.l_whence = SEEK_SET;
	request.l_start = 0;
	request.l_len = 0;

	if (fcntl(fd, F_SETLK, &request) >= 0)
		return (0);

	/* EACCES and EAGAIN are reported as "held by someone else". */
	return (((errno == EACCES) || (errno == EAGAIN)) ? 1 : -1);
}

/*
 * Release an advisory lock held on the open file descriptor [fd].
 * Return 0 on success, or -1 on error (with errno set).
 */
int
zed_file_unlock(int fd)
{
	struct flock request;

	if (fd < 0) {
		errno = EBADF;
		return (-1);
	}
	request.l_type = F_UNLCK;
	request.l_whence = SEEK_SET;
	request.l_start = 0;
	request.l_len = 0;

	return ((fcntl(fd, F_SETLK, &request) < 0) ? -1 : 0);
}

/*
 * Test whether an exclusive advisory lock could be obtained for the open
 * file descriptor [fd].
 * Return 0 if the file is not locked, >0 for the PID of another process
 * holding a conflicting lock, or -1 on error (with errno set).
 */
+ */ +pid_t +zed_file_is_locked(int fd) +{ + struct flock lock; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + + if (fcntl(fd, F_GETLK, &lock) < 0) + return (-1); + + if (lock.l_type == F_UNLCK) + return (0); + + return (lock.l_pid); +} + +/* + * Close all open file descriptors greater than or equal to [lowfd]. + * Any errors encountered while closing file descriptors are ignored. + */ +void +zed_file_close_from(int lowfd) +{ + const int maxfd_def = 256; + int errno_bak; + struct rlimit rl; + int maxfd; + int fd; + + errno_bak = errno; + + if (getrlimit(RLIMIT_NOFILE, &rl) < 0) { + maxfd = maxfd_def; + } else if (rl.rlim_max == RLIM_INFINITY) { + maxfd = maxfd_def; + } else { + maxfd = rl.rlim_max; + } + for (fd = lowfd; fd < maxfd; fd++) + (void) close(fd); + + errno = errno_bak; +} + +/* + * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically + * closed upon successful execution of one of the exec functions. + * Return 0 on success, or -1 on error. + * + * FIXME: No longer needed? + */ +int +zed_file_close_on_exec(int fd) +{ + int flags; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + flags = fcntl(fd, F_GETFD); + if (flags == -1) + return (-1); + + flags |= FD_CLOEXEC; + + if (fcntl(fd, F_SETFD, flags) == -1) + return (-1); + + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.h b/sys/contrib/openzfs/cmd/zed/zed_file.h new file mode 100644 index 000000000000..05f360d20efd --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_file.h @@ -0,0 +1,35 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. 
+ * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_FILE_H +#define ZED_FILE_H + +#include <sys/types.h> +#include <unistd.h> + +ssize_t zed_file_read_n(int fd, void *buf, size_t n); + +ssize_t zed_file_write_n(int fd, void *buf, size_t n); + +int zed_file_lock(int fd); + +int zed_file_unlock(int fd); + +pid_t zed_file_is_locked(int fd); + +void zed_file_close_from(int fd); + +int zed_file_close_on_exec(int fd); + +#endif /* !ZED_FILE_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.c b/sys/contrib/openzfs/cmd/zed/zed_log.c new file mode 100644 index 000000000000..5a3f2dbdb832 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_log.c @@ -0,0 +1,256 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
+ */ + +#include <assert.h> +#include <errno.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <syslog.h> +#include <unistd.h> +#include "zed_log.h" + +#define ZED_LOG_MAX_LOG_LEN 1024 + +static struct { + unsigned do_stderr:1; + unsigned do_syslog:1; + const char *identity; + int priority; + int pipe_fd[2]; +} _ctx; + +/* + * Initialize the logging subsystem. + */ +void +zed_log_init(const char *identity) +{ + if (identity) { + const char *p = strrchr(identity, '/'); + _ctx.identity = (p != NULL) ? p + 1 : identity; + } else { + _ctx.identity = NULL; + } + _ctx.pipe_fd[0] = -1; + _ctx.pipe_fd[1] = -1; +} + +/* + * Shutdown the logging subsystem. + */ +void +zed_log_fini(void) +{ + zed_log_stderr_close(); + zed_log_syslog_close(); +} + +/* + * Create pipe for communicating daemonization status between the parent and + * child processes across the double-fork(). + */ +void +zed_log_pipe_open(void) +{ + if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1)) + zed_log_die("Invalid use of zed_log_pipe_open in PID %d", + (int)getpid()); + + if (pipe(_ctx.pipe_fd) < 0) + zed_log_die("Failed to create daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); +} + +/* + * Close the read-half of the daemonize pipe. + * + * This should be called by the child after fork()ing from the parent since + * the child will never read from this pipe. + */ +void +zed_log_pipe_close_reads(void) +{ + if (_ctx.pipe_fd[0] < 0) + zed_log_die( + "Invalid use of zed_log_pipe_close_reads in PID %d", + (int)getpid()); + + if (close(_ctx.pipe_fd[0]) < 0) + zed_log_die( + "Failed to close reads on daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + + _ctx.pipe_fd[0] = -1; +} + +/* + * Close the write-half of the daemonize pipe. + * + * This should be called by the parent after fork()ing its child since the + * parent will never write to this pipe. 
+ * + * This should also be called by the child once initialization is complete + * in order to signal the parent that it can safely exit. + */ +void +zed_log_pipe_close_writes(void) +{ + if (_ctx.pipe_fd[1] < 0) + zed_log_die( + "Invalid use of zed_log_pipe_close_writes in PID %d", + (int)getpid()); + + if (close(_ctx.pipe_fd[1]) < 0) + zed_log_die( + "Failed to close writes on daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + + _ctx.pipe_fd[1] = -1; +} + +/* + * Block on reading from the daemonize pipe until signaled by the child + * (via zed_log_pipe_close_writes()) that initialization is complete. + * + * This should only be called by the parent while waiting to exit after + * fork()ing the child. + */ +void +zed_log_pipe_wait(void) +{ + ssize_t n; + char c; + + if (_ctx.pipe_fd[0] < 0) + zed_log_die("Invalid use of zed_log_pipe_wait in PID %d", + (int)getpid()); + + for (;;) { + n = read(_ctx.pipe_fd[0], &c, sizeof (c)); + if (n < 0) { + if (errno == EINTR) + continue; + zed_log_die( + "Failed to read from daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + } + if (n == 0) { + break; + } + } +} + +/* + * Start logging messages at the syslog [priority] level or higher to stderr. + * Refer to syslog(3) for valid priority values. + */ +void +zed_log_stderr_open(int priority) +{ + _ctx.do_stderr = 1; + _ctx.priority = priority; +} + +/* + * Stop logging messages to stderr. + */ +void +zed_log_stderr_close(void) +{ + if (_ctx.do_stderr) + _ctx.do_stderr = 0; +} + +/* + * Start logging messages to syslog. + * Refer to syslog(3) for valid option/facility values. + */ +void +zed_log_syslog_open(int facility) +{ + _ctx.do_syslog = 1; + openlog(_ctx.identity, LOG_NDELAY | LOG_PID, facility); +} + +/* + * Stop logging messages to syslog. + */ +void +zed_log_syslog_close(void) +{ + if (_ctx.do_syslog) { + _ctx.do_syslog = 0; + closelog(); + } +} + +/* + * Auxiliary function to log a message to syslog and/or stderr. 
+ */ +static void +_zed_log_aux(int priority, const char *fmt, va_list vargs) +{ + char buf[ZED_LOG_MAX_LOG_LEN]; + int n; + + if (!fmt) + return; + + n = vsnprintf(buf, sizeof (buf), fmt, vargs); + if ((n < 0) || (n >= sizeof (buf))) { + buf[sizeof (buf) - 2] = '+'; + buf[sizeof (buf) - 1] = '\0'; + } + + if (_ctx.do_syslog) + syslog(priority, "%s", buf); + + if (_ctx.do_stderr && (priority <= _ctx.priority)) + fprintf(stderr, "%s\n", buf); +} + +/* + * Log a message at the given [priority] level specified by the printf-style + * format string [fmt]. + */ +void +zed_log_msg(int priority, const char *fmt, ...) +{ + va_list vargs; + + if (fmt) { + va_start(vargs, fmt); + _zed_log_aux(priority, fmt, vargs); + va_end(vargs); + } +} + +/* + * Log a fatal error message specified by the printf-style format string [fmt]. + */ +void +zed_log_die(const char *fmt, ...) +{ + va_list vargs; + + if (fmt) { + va_start(vargs, fmt); + _zed_log_aux(LOG_ERR, fmt, vargs); + va_end(vargs); + } + exit(EXIT_FAILURE); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.h b/sys/contrib/openzfs/cmd/zed/zed_log.h new file mode 100644 index 000000000000..a03a4f53967c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_log.h @@ -0,0 +1,44 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
+ */ + +#ifndef ZED_LOG_H +#define ZED_LOG_H + +#include <syslog.h> + +void zed_log_init(const char *identity); + +void zed_log_fini(void); + +void zed_log_pipe_open(void); + +void zed_log_pipe_close_reads(void); + +void zed_log_pipe_close_writes(void); + +void zed_log_pipe_wait(void); + +void zed_log_stderr_open(int priority); + +void zed_log_stderr_close(void); + +void zed_log_syslog_open(int facility); + +void zed_log_syslog_close(void); + +void zed_log_msg(int priority, const char *fmt, ...); + +void zed_log_die(const char *fmt, ...); + +#endif /* !ZED_LOG_H */ diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.c b/sys/contrib/openzfs/cmd/zed/zed_strings.c new file mode 100644 index 000000000000..6b1c669d71f4 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_strings.c @@ -0,0 +1,247 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + */ + +#include <assert.h> +#include <errno.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include "zed_strings.h" + +struct zed_strings { + avl_tree_t tree; + avl_node_t *iteratorp; +}; + +struct zed_strings_node { + avl_node_t node; + char *key; + char *val; +}; + +typedef struct zed_strings_node zed_strings_node_t; + +/* + * Compare zed_strings_node_t nodes [x1] and [x2]. 
+ * As required for the AVL tree, return -1 for <, 0 for ==, and +1 for >. + */ +static int +_zed_strings_node_compare(const void *x1, const void *x2) +{ + const char *s1; + const char *s2; + int rv; + + assert(x1 != NULL); + assert(x2 != NULL); + + s1 = ((const zed_strings_node_t *) x1)->key; + assert(s1 != NULL); + s2 = ((const zed_strings_node_t *) x2)->key; + assert(s2 != NULL); + rv = strcmp(s1, s2); + + if (rv < 0) + return (-1); + + if (rv > 0) + return (1); + + return (0); +} + +/* + * Return a new string container, or NULL on error. + */ +zed_strings_t * +zed_strings_create(void) +{ + zed_strings_t *zsp; + + zsp = calloc(1, sizeof (*zsp)); + if (!zsp) + return (NULL); + + avl_create(&zsp->tree, _zed_strings_node_compare, + sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node)); + + zsp->iteratorp = NULL; + return (zsp); +} + +/* + * Destroy the string node [np]. + */ +static void +_zed_strings_node_destroy(zed_strings_node_t *np) +{ + if (!np) + return; + + if (np->key) { + if (np->key != np->val) + free(np->key); + np->key = NULL; + } + if (np->val) { + free(np->val); + np->val = NULL; + } + free(np); +} + +/* + * Return a new string node for storing the string [val], or NULL on error. + * If [key] is specified, it will be used to index the node; otherwise, + * the string [val] will be used. + */ +static zed_strings_node_t * +_zed_strings_node_create(const char *key, const char *val) +{ + zed_strings_node_t *np; + + assert(val != NULL); + + np = calloc(1, sizeof (*np)); + if (!np) + return (NULL); + + np->val = strdup(val); + if (!np->val) + goto nomem; + + if (key) { + np->key = strdup(key); + if (!np->key) + goto nomem; + } else { + np->key = np->val; + } + return (np); + +nomem: + _zed_strings_node_destroy(np); + return (NULL); +} + +/* + * Destroy the string container [zsp] and all nodes within. 
+ */ +void +zed_strings_destroy(zed_strings_t *zsp) +{ + void *cookie; + zed_strings_node_t *np; + + if (!zsp) + return; + + cookie = NULL; + while ((np = avl_destroy_nodes(&zsp->tree, &cookie))) + _zed_strings_node_destroy(np); + + avl_destroy(&zsp->tree); + free(zsp); +} + +/* + * Add a copy of the string [s] indexed by [key] to the container [zsp]. + * If [key] already exists within the container [zsp], it will be replaced + * with the new string [s]. + * If [key] is NULL, the string [s] will be used as the key. + * Return 0 on success, or -1 on error. + */ +int +zed_strings_add(zed_strings_t *zsp, const char *key, const char *s) +{ + zed_strings_node_t *newp, *oldp; + + if (!zsp || !s) { + errno = EINVAL; + return (-1); + } + if (key == s) + key = NULL; + + newp = _zed_strings_node_create(key, s); + if (!newp) + return (-1); + + oldp = avl_find(&zsp->tree, newp, NULL); + if (oldp) { + avl_remove(&zsp->tree, oldp); + _zed_strings_node_destroy(oldp); + } + avl_add(&zsp->tree, newp); + return (0); +} + +/* + * Return the first string in container [zsp]. + * Return NULL if there are no strings, or on error. + * This can be called multiple times to re-traverse [zsp]. + * XXX: Not thread-safe. + */ +const char * +zed_strings_first(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + zsp->iteratorp = avl_first(&zsp->tree); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); + +} + +/* + * Return the next string in container [zsp]. + * Return NULL after the last string, or on error. + * This must be called after zed_strings_first(). + * XXX: Not thread-safe. 
+ */ +const char * +zed_strings_next(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + if (!zsp->iteratorp) + return (NULL); + + zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); +} + +/* + * Return the number of strings in container [zsp], or -1 on error. + */ +int +zed_strings_count(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (-1); + } + return (avl_numnodes(&zsp->tree)); +} diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.h b/sys/contrib/openzfs/cmd/zed/zed_strings.h new file mode 100644 index 000000000000..37a84cad7ffc --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed_strings.h @@ -0,0 +1,27 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>. + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. 
+ */ + +#ifndef ZED_STRINGS_H +#define ZED_STRINGS_H + +typedef struct zed_strings zed_strings_t; + +zed_strings_t *zed_strings_create(void); +void zed_strings_destroy(zed_strings_t *zsp); +int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s); +const char *zed_strings_first(zed_strings_t *zsp); +const char *zed_strings_next(zed_strings_t *zsp); +int zed_strings_count(zed_strings_t *zsp); + +#endif /* !ZED_STRINGS_H */ diff --git a/sys/contrib/openzfs/cmd/zfs/.gitignore b/sys/contrib/openzfs/cmd/zfs/.gitignore new file mode 100644 index 000000000000..0fd9cc63af2a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/.gitignore @@ -0,0 +1 @@ +/zfs diff --git a/sys/contrib/openzfs/cmd/zfs/Makefile.am b/sys/contrib/openzfs/cmd/zfs/Makefile.am new file mode 100644 index 000000000000..dec5920381d5 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/Makefile.am @@ -0,0 +1,23 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zfs + +zfs_SOURCES = \ + zfs_iter.c \ + zfs_iter.h \ + zfs_main.c \ + zfs_util.h \ + zfs_project.c \ + zfs_projectutil.h + +zfs_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zfs_LDADD += $(LTLIBINTL) + +if BUILD_FREEBSD +zfs_LDADD += -lgeom -ljail +endif diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.c b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c new file mode 100644 index 000000000000..f2359508c16d --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c @@ -0,0 +1,512 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include <libintl.h> +#include <libuutil.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> + +#include <libzfs.h> + +#include "zfs_util.h" +#include "zfs_iter.h" + +/* + * This is a private interface used to gather up all the datasets specified on + * the command line so that we can iterate over them in order. + * + * First, we iterate over all filesystems, gathering them together into an + * AVL tree. We report errors for any explicitly specified datasets + * that we couldn't open. + * + * When finished, we have an AVL tree of ZFS handles. We go through and execute + * the provided callback for each one, passing whatever data the user supplied. 
+ */ + +typedef struct zfs_node { + zfs_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; +} zfs_node_t; + +typedef struct callback_data { + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; +} callback_data_t; + +uu_avl_pool_t *avl_pool; + +/* + * Include snaps if they were requested or if this a zfs list where types + * were not specified and the "listsnapshots" property is set on this pool. + */ +static boolean_t +zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb) +{ + zpool_handle_t *zph; + + if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0) + return (cb->cb_types & ZFS_TYPE_SNAPSHOT); + + zph = zfs_get_pool_handle(zhp); + return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL)); +} + +/* + * Called for each dataset. If the object is of an appropriate type, + * add it to the avl tree and recurse over any children as necessary. 
+ */ +static int +zfs_callback(zfs_handle_t *zhp, void *data) +{ + callback_data_t *cb = data; + boolean_t should_close = B_TRUE; + boolean_t include_snaps = zfs_include_snapshots(zhp, cb); + boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK); + + if ((zfs_get_type(zhp) & cb->cb_types) || + ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) { + uu_avl_index_t idx; + zfs_node_t *node = safe_malloc(sizeof (zfs_node_t)); + + node->zn_handle = zhp; + uu_avl_node_init(node, &node->zn_avlnode, avl_pool); + if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, + &idx) == NULL) { + if (cb->cb_proplist) { + if ((*cb->cb_proplist) && + !(*cb->cb_proplist)->pl_all) + zfs_prune_proplist(zhp, + cb->cb_props_table); + + if (zfs_expand_proplist(zhp, cb->cb_proplist, + (cb->cb_flags & ZFS_ITER_RECVD_PROPS), + (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) + != 0) { + free(node); + return (-1); + } + } + uu_avl_insert(cb->cb_avl, node, idx); + should_close = B_FALSE; + } else { + free(node); + } + } + + /* + * Recurse if necessary. + */ + if (cb->cb_flags & ZFS_ITER_RECURSE && + ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + cb->cb_depth < cb->cb_depth_limit)) { + cb->cb_depth++; + + /* + * If we are not looking for filesystems, we don't need to + * recurse into filesystems when we are at our depth limit. 
+ */ + if ((cb->cb_depth < cb->cb_depth_limit || + (cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + (cb->cb_types & + (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) zfs_iter_filesystems(zhp, zfs_callback, data); + } + + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) { + (void) zfs_iter_snapshots(zhp, + (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, + zfs_callback, data, 0, 0); + } + + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) { + (void) zfs_iter_bookmarks(zhp, zfs_callback, data); + } + + cb->cb_depth--; + } + + if (should_close) + zfs_close(zhp); + + return (0); +} + +int +zfs_add_sort_column(zfs_sort_column_t **sc, const char *name, + boolean_t reverse) +{ + zfs_sort_column_t *col; + zfs_prop_t prop; + + if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL && + !zfs_prop_user(name)) + return (-1); + + col = safe_malloc(sizeof (zfs_sort_column_t)); + + col->sc_prop = prop; + col->sc_reverse = reverse; + if (prop == ZPROP_INVAL) { + col->sc_user_prop = safe_malloc(strlen(name) + 1); + (void) strcpy(col->sc_user_prop, name); + } + + if (*sc == NULL) { + col->sc_last = col; + *sc = col; + } else { + (*sc)->sc_last->sc_next = col; + (*sc)->sc_last = col; + } + + return (0); +} + +void +zfs_free_sort_columns(zfs_sort_column_t *sc) +{ + zfs_sort_column_t *col; + + while (sc != NULL) { + col = sc->sc_next; + free(sc->sc_user_prop); + free(sc); + sc = col; + } +} + +int +zfs_sort_only_by_name(const zfs_sort_column_t *sc) +{ + return (sc != NULL && sc->sc_next == NULL && + sc->sc_prop == ZFS_PROP_NAME); +} + +/* ARGSUSED */ +static int +zfs_compare(const void *larg, const void *rarg, void *unused) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + const char *lname = zfs_get_name(l); + const char *rname = zfs_get_name(r); + char *lat, *rat; + uint64_t lcreate, rcreate; + int 
ret; + + lat = (char *)strchr(lname, '@'); + rat = (char *)strchr(rname, '@'); + + if (lat != NULL) + *lat = '\0'; + if (rat != NULL) + *rat = '\0'; + + ret = strcmp(lname, rname); + if (ret == 0 && (lat != NULL || rat != NULL)) { + /* + * If we're comparing a dataset to one of its snapshots, we + * always make the full dataset first. + */ + if (lat == NULL) { + ret = -1; + } else if (rat == NULL) { + ret = 1; + } else { + /* + * If we have two snapshots from the same dataset, then + * we want to sort them according to creation time. We + * use the hidden CREATETXG property to get an absolute + * ordering of snapshots. + */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + /* + * Both lcreate and rcreate being 0 means we don't have + * properties and we should compare full name. + */ + if (lcreate == 0 && rcreate == 0) + ret = strcmp(lat + 1, rat + 1); + else if (lcreate < rcreate) + ret = -1; + else if (lcreate > rcreate) + ret = 1; + } + } + + if (lat != NULL) + *lat = '@'; + if (rat != NULL) + *rat = '@'; + + return (ret); +} + +/* + * Sort datasets by specified columns. + * + * o Numeric types sort in ascending order. + * o String types sort in alphabetical order. + * o Types inappropriate for a row sort that row to the literal + * bottom, regardless of the specified ordering. + * + * If no sort columns are specified, or two datasets compare equally + * across all specified columns, they are sorted alphabetically by name + * with snapshots grouped under their parents. 
+ */ +static int +zfs_sort(const void *larg, const void *rarg, void *data) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + zfs_sort_column_t *sc = (zfs_sort_column_t *)data; + zfs_sort_column_t *psc; + + for (psc = sc; psc != NULL; psc = psc->sc_next) { + char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN]; + char *lstr, *rstr; + uint64_t lnum, rnum; + boolean_t lvalid, rvalid; + int ret = 0; + + /* + * We group the checks below the generic code. If 'lstr' and + * 'rstr' are non-NULL, then we do a string based comparison. + * Otherwise, we compare 'lnum' and 'rnum'. + */ + lstr = rstr = NULL; + if (psc->sc_prop == ZPROP_INVAL) { + nvlist_t *luser, *ruser; + nvlist_t *lval, *rval; + + luser = zfs_get_user_props(l); + ruser = zfs_get_user_props(r); + + lvalid = (nvlist_lookup_nvlist(luser, + psc->sc_user_prop, &lval) == 0); + rvalid = (nvlist_lookup_nvlist(ruser, + psc->sc_user_prop, &rval) == 0); + + if (lvalid) + verify(nvlist_lookup_string(lval, + ZPROP_VALUE, &lstr) == 0); + if (rvalid) + verify(nvlist_lookup_string(rval, + ZPROP_VALUE, &rstr) == 0); + } else if (psc->sc_prop == ZFS_PROP_NAME) { + lvalid = rvalid = B_TRUE; + + (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); + (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); + + lstr = lbuf; + rstr = rbuf; + } else if (zfs_prop_is_string(psc->sc_prop)) { + lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf, + sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0); + rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf, + sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0); + + lstr = lbuf; + rstr = rbuf; + } else { + lvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(l), B_FALSE); + rvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(r), B_FALSE); + + if (lvalid) + (void) zfs_prop_get_numeric(l, psc->sc_prop, + &lnum, NULL, NULL, 0); + if (rvalid) + (void) zfs_prop_get_numeric(r, psc->sc_prop, + &rnum, NULL, NULL, 0); + } + + if (!lvalid && !rvalid) + 
continue; + else if (!lvalid) + return (1); + else if (!rvalid) + return (-1); + + if (lstr) + ret = strcmp(lstr, rstr); + else if (lnum < rnum) + ret = -1; + else if (lnum > rnum) + ret = 1; + + if (ret != 0) { + if (psc->sc_reverse == B_TRUE) + ret = (ret < 0) ? 1 : -1; + return (ret); + } + } + + return (zfs_compare(larg, rarg, NULL)); +} + +int +zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, + zfs_iter_f callback, void *data) +{ + callback_data_t cb = {0}; + int ret = 0; + zfs_node_t *node; + uu_avl_walk_t *walk; + + avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t), + offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT); + + if (avl_pool == NULL) + nomem(); + + cb.cb_sortcol = sortcol; + cb.cb_flags = flags; + cb.cb_proplist = proplist; + cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. + * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. 
+ */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + if (argc == 0) { + /* + * If given no arguments, iterate over all datasets. + */ + cb.cb_flags |= ZFS_ITER_RECURSE; + ret = zfs_iter_root(g_zfs, zfs_callback, &cb); + } else { + int i; + zfs_handle_t *zhp; + zfs_type_t argtype; + + /* + * If we're recursive, then we always allow filesystems as + * arguments. If we also are interested in snapshots or + * bookmarks, then we can take volumes as well. + */ + argtype = types; + if (flags & ZFS_ITER_RECURSE) { + argtype |= ZFS_TYPE_FILESYSTEM; + if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) + argtype |= ZFS_TYPE_VOLUME; + } + + for (i = 0; i < argc; i++) { + if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) { + zhp = zfs_path_to_zhandle(g_zfs, argv[i], + argtype); + } else { + zhp = zfs_open(g_zfs, argv[i], argtype); + } + if (zhp != NULL) + ret |= zfs_callback(zhp, &cb); + else + ret = 1; + } + } + + /* + * At this point we've got our AVL tree full of zfs handles, so iterate + * over each one and execute the real user callback. + */ + for (node = uu_avl_first(cb.cb_avl); node != NULL; + node = uu_avl_next(cb.cb_avl, node)) + ret |= callback(node->zn_handle, data); + + /* + * Finally, clean up the AVL tree. 
+ */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + zfs_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(cb.cb_avl); + uu_avl_pool_destroy(avl_pool); + + return (ret); +} diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.h b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h new file mode 100644 index 000000000000..2697fbdca1df --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#ifndef ZFS_ITER_H +#define ZFS_ITER_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfs_sort_column { + struct zfs_sort_column *sc_next; + struct zfs_sort_column *sc_last; + zfs_prop_t sc_prop; + char *sc_user_prop; + boolean_t sc_reverse; +} zfs_sort_column_t; + +#define ZFS_ITER_RECURSE (1 << 0) +#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) +#define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) +#define ZFS_ITER_RECVD_PROPS (1 << 4) +#define ZFS_ITER_LITERAL_PROPS (1 << 5) +#define ZFS_ITER_SIMPLE (1 << 6) + +int zfs_for_each(int, char **, int options, zfs_type_t, + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); +int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); +void zfs_free_sort_columns(zfs_sort_column_t *); +int zfs_sort_only_by_name(const zfs_sort_column_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* ZFS_ITER_H */ diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c new file mode 100644 index 000000000000..650b4fc9b74f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -0,0 +1,8620 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + */ + +#include <assert.h> +#include <ctype.h> +#include <sys/debug.h> +#include <errno.h> +#include <getopt.h> +#include <libgen.h> +#include <libintl.h> +#include <libuutil.h> +#include <libnvpair.h> +#include <locale.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> +#include <fcntl.h> +#include <zone.h> +#include <grp.h> +#include <pwd.h> +#include <signal.h> +#include <sys/debug.h> +#include <sys/list.h> +#include <sys/mkdev.h> +#include <sys/mntent.h> +#include <sys/mnttab.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/fs/zfs.h> +#include <sys/systeminfo.h> +#include <sys/types.h> +#include <time.h> +#include <sys/zfs_project.h> + +#include <libzfs.h> +#include <libzfs_core.h> +#include <zfs_prop.h> +#include <zfs_deleg.h> +#include <libzutil.h> +#include <libuutil.h> +#ifdef HAVE_IDMAP +#include <aclutils.h> +#include <directory.h> +#endif /* HAVE_IDMAP */ + +#include "zfs_iter.h" +#include "zfs_util.h" +#include "zfs_comutil.h" +#include "libzfs_impl.h" +#include "zfs_projectutil.h" + +libzfs_handle_t *g_zfs; + +static FILE *mnttab_file; 
+static char history_str[HIS_MAX_RECORD_LEN]; +static boolean_t log_history = B_TRUE; + +static int zfs_do_clone(int argc, char **argv); +static int zfs_do_create(int argc, char **argv); +static int zfs_do_destroy(int argc, char **argv); +static int zfs_do_get(int argc, char **argv); +static int zfs_do_inherit(int argc, char **argv); +static int zfs_do_list(int argc, char **argv); +static int zfs_do_mount(int argc, char **argv); +static int zfs_do_rename(int argc, char **argv); +static int zfs_do_rollback(int argc, char **argv); +static int zfs_do_set(int argc, char **argv); +static int zfs_do_upgrade(int argc, char **argv); +static int zfs_do_snapshot(int argc, char **argv); +static int zfs_do_unmount(int argc, char **argv); +static int zfs_do_share(int argc, char **argv); +static int zfs_do_unshare(int argc, char **argv); +static int zfs_do_send(int argc, char **argv); +static int zfs_do_receive(int argc, char **argv); +static int zfs_do_promote(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_allow(int argc, char **argv); +static int zfs_do_unallow(int argc, char **argv); +static int zfs_do_hold(int argc, char **argv); +static int zfs_do_holds(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); +static int zfs_do_diff(int argc, char **argv); +static int zfs_do_bookmark(int argc, char **argv); +static int zfs_do_channel_program(int argc, char **argv); +static int zfs_do_load_key(int argc, char **argv); +static int zfs_do_unload_key(int argc, char **argv); +static int zfs_do_change_key(int argc, char **argv); +static int zfs_do_project(int argc, char **argv); +static int zfs_do_version(int argc, char **argv); +static int zfs_do_redact(int argc, char **argv); +static int zfs_do_wait(int argc, char **argv); + +#ifdef __FreeBSD__ +static int zfs_do_jail(int argc, char **argv); +static int zfs_do_unjail(int argc, char **argv); +#endif + +/* + * Enable a reasonable set of defaults for libumem 
/*
 * DEBUG builds only: hand libumem its default debugging and logging
 * settings.
 */
#ifdef DEBUG
const char *
_umem_debug_init(void)
{
	/* $UMEM_DEBUG setting */
	return ("default,verbose");
}

const char *
_umem_logging_init(void)
{
	/* $UMEM_LOGGING setting */
	return ("fail,contents");
}
#endif	/* DEBUG */

/*
 * Identifies the usage text of one subcommand; each command table entry
 * carries one of these.
 */
typedef enum {
	HELP_CLONE = 0,
	HELP_CREATE,
	HELP_DESTROY,
	HELP_GET,
	HELP_INHERIT,
	HELP_UPGRADE,
	HELP_LIST,
	HELP_MOUNT,
	HELP_PROMOTE,
	HELP_RECEIVE,
	HELP_RENAME,
	HELP_ROLLBACK,
	HELP_SEND,
	HELP_SET,
	HELP_SHARE,
	HELP_SNAPSHOT,
	HELP_UNMOUNT,
	HELP_UNSHARE,
	HELP_ALLOW,
	HELP_UNALLOW,
	HELP_USERSPACE,
	HELP_GROUPSPACE,
	HELP_PROJECTSPACE,
	HELP_PROJECT,
	HELP_HOLD,
	HELP_HOLDS,
	HELP_RELEASE,
	HELP_DIFF,
	HELP_BOOKMARK,
	HELP_CHANNEL_PROGRAM,
	HELP_LOAD_KEY,
	HELP_UNLOAD_KEY,
	HELP_CHANGE_KEY,
	HELP_VERSION,
	HELP_REDACT,
	HELP_JAIL,
	HELP_UNJAIL,
	HELP_WAIT,
} zfs_help_t;

/*
 * One subcommand: the name typed by the user, the function implementing
 * it, and the identifier of its usage message.
 */
struct zfs_command {
	const char	*name;
	int		(*func)(int argc, char **argv);
	zfs_help_t	usage;
};
typedef struct zfs_command zfs_command_t;
+ */ +static zfs_command_t command_table[] = { + { "version", zfs_do_version, HELP_VERSION }, + { NULL }, + { "create", zfs_do_create, HELP_CREATE }, + { "destroy", zfs_do_destroy, HELP_DESTROY }, + { NULL }, + { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, + { "rollback", zfs_do_rollback, HELP_ROLLBACK }, + { "clone", zfs_do_clone, HELP_CLONE }, + { "promote", zfs_do_promote, HELP_PROMOTE }, + { "rename", zfs_do_rename, HELP_RENAME }, + { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, + { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, + { NULL }, + { "list", zfs_do_list, HELP_LIST }, + { NULL }, + { "set", zfs_do_set, HELP_SET }, + { "get", zfs_do_get, HELP_GET }, + { "inherit", zfs_do_inherit, HELP_INHERIT }, + { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { NULL }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, + { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, + { NULL }, + { "project", zfs_do_project, HELP_PROJECT }, + { NULL }, + { "mount", zfs_do_mount, HELP_MOUNT }, + { "unmount", zfs_do_unmount, HELP_UNMOUNT }, + { "share", zfs_do_share, HELP_SHARE }, + { "unshare", zfs_do_unshare, HELP_UNSHARE }, + { NULL }, + { "send", zfs_do_send, HELP_SEND }, + { "receive", zfs_do_receive, HELP_RECEIVE }, + { NULL }, + { "allow", zfs_do_allow, HELP_ALLOW }, + { NULL }, + { "unallow", zfs_do_unallow, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_holds, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, + { "diff", zfs_do_diff, HELP_DIFF }, + { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, + { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, + { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, + { "redact", zfs_do_redact, HELP_REDACT }, + { "wait", zfs_do_wait, HELP_WAIT }, + +#ifdef __FreeBSD__ + { "jail", zfs_do_jail, HELP_JAIL }, + { "unjail", zfs_do_unjail, HELP_UNJAIL }, +#endif +}; + +#define NCOMMAND (sizeof (command_table) / 
sizeof (command_table[0])) + +zfs_command_t *current_command; + +static const char * +get_usage(zfs_help_t idx) +{ + switch (idx) { + case HELP_CLONE: + return (gettext("\tclone [-p] [-o property=value] ... " + "<snapshot> <filesystem|volume>\n")); + case HELP_CREATE: + return (gettext("\tcreate [-Pnpv] [-o property=value] ... " + "<filesystem>\n" + "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " + "-V <size> <volume>\n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n" + "\tdestroy [-dnpRrv] " + "<filesystem|volume>@<snap>[%<snap>][,...]\n" + "\tdestroy <filesystem|volume>#<bookmark>\n")); + case HELP_GET: + return (gettext("\tget [-rHp] [-d max] " + "[-o \"all\" | field[,...]]\n" + "\t [-t type[,...]] [-s source[,...]]\n" + "\t <\"all\" | property[,...]> " + "[filesystem|volume|snapshot|bookmark] ...\n")); + case HELP_INHERIT: + return (gettext("\tinherit [-rS] <property> " + "<filesystem|volume|snapshot> ...\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade [-v]\n" + "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); + case HELP_LIST: + return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " + "[-s property]...\n\t [-S property]... [-t type[,...]] " + "[filesystem|volume|snapshot] ...\n")); + case HELP_MOUNT: + return (gettext("\tmount\n" + "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); + case HELP_PROMOTE: + return (gettext("\tpromote <clone-filesystem>\n")); + case HELP_RECEIVE: + return (gettext("\treceive [-vMnsFhu] " + "[-o <property>=<value>] ... [-x <property>] ...\n" + "\t <filesystem|volume|snapshot>\n" + "\treceive [-vMnsFhu] [-o <property>=<value>] ... " + "[-x <property>] ... 
\n" + "\t [-d | -e] <filesystem>\n" + "\treceive -A <filesystem|volume>\n")); + case HELP_RENAME: + return (gettext("\trename [-f] <filesystem|volume|snapshot> " + "<filesystem|volume|snapshot>\n" + "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n" + "\trename -r <snapshot> <snapshot>\n")); + case HELP_ROLLBACK: + return (gettext("\trollback [-rRf] <snapshot>\n")); + case HELP_SEND: + return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " + "<snapshot>\n" + "\tsend [-nvPLecw] [-i snapshot|bookmark] " + "<filesystem|volume|snapshot>\n" + "\tsend [-DnPpvLec] [-i bookmark|snapshot] " + "--redact <bookmark> <snapshot>\n" + "\tsend [-nvPe] -t <receive_resume_token>\n" + "\tsend [-Pnv] --saved filesystem\n")); + case HELP_SET: + return (gettext("\tset <property=value> ... " + "<filesystem|volume|snapshot> ...\n")); + case HELP_SHARE: + return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); + case HELP_SNAPSHOT: + return (gettext("\tsnapshot [-r] [-o property=value] ... " + "<filesystem|volume>@<snap> ...\n")); + case HELP_UNMOUNT: + return (gettext("\tunmount [-fu] " + "<-a | filesystem|mountpoint>\n")); + case HELP_UNSHARE: + return (gettext("\tunshare " + "<-a [nfs|smb] | filesystem|mountpoint>\n")); + case HELP_ALLOW: + return (gettext("\tallow <filesystem|volume>\n" + "\tallow [-ldug] " + "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n" + "\t <filesystem|volume>\n" + "\tallow [-ld] -e <perm|@setname>[,...] " + "<filesystem|volume>\n" + "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n" + "\tallow -s @setname <perm|@setname>[,...] 
" + "<filesystem|volume>\n")); + case HELP_UNALLOW: + return (gettext("\tunallow [-rldug] " + "<\"everyone\"|user|group>[,...]\n" + "\t [<perm|@setname>[,...]] <filesystem|volume>\n" + "\tunallow [-rld] -e [<perm|@setname>[,...]] " + "<filesystem|volume>\n" + "\tunallow [-r] -c [<perm|@setname>[,...]] " + "<filesystem|volume>\n" + "\tunallow [-r] -s @setname [<perm|@setname>[,...]] " + "<filesystem|volume>\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "<filesystem|snapshot>\n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "<filesystem|snapshot>\n")); + case HELP_PROJECTSPACE: + return (gettext("\tprojectspace [-Hp] [-o field[,...]] " + "[-s field] ... \n" + "\t [-S field] ... <filesystem|snapshot>\n")); + case HELP_PROJECT: + return (gettext("\tproject [-d|-r] <directory|file ...>\n" + "\tproject -c [-0] [-d|-r] [-p id] <directory|file ...>\n" + "\tproject -C [-k] [-r] <directory ...>\n" + "\tproject [-p id] [-r] [-s] <directory ...>\n")); + case HELP_HOLD: + return (gettext("\thold [-r] <tag> <snapshot> ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-rH] <snapshot> ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] <tag> <snapshot> ...\n")); + case HELP_DIFF: + return (gettext("\tdiff [-FHt] <snapshot> " + "[snapshot|filesystem]\n")); + case HELP_BOOKMARK: + return (gettext("\tbookmark <snapshot|bookmark> " + "<newbookmark>\n")); + case HELP_CHANNEL_PROGRAM: + return (gettext("\tprogram [-jn] [-t <instruction limit>] " + "[-m <memory limit (b)>]\n" + "\t <pool> <program file> [lua args...]\n")); + case HELP_LOAD_KEY: + return (gettext("\tload-key [-rn] [-L <keylocation>] " + "<-a | filesystem|volume>\n")); + case HELP_UNLOAD_KEY: + return (gettext("\tunload-key [-r] " + "<-a | filesystem|volume>\n")); + case HELP_CHANGE_KEY: + return 
(gettext("\tchange-key [-l] [-o keyformat=<value>]\n" + "\t [-o keylocation=<value>] [-o pbkfd2iters=<value>]\n" + "\t <filesystem|volume>\n" + "\tchange-key -i [-l] <filesystem|volume>\n")); + case HELP_VERSION: + return (gettext("\tversion\n")); + case HELP_REDACT: + return (gettext("\tredact <snapshot> <bookmark> " + "<redaction_snapshot> ...\n")); + case HELP_JAIL: + return (gettext("\tjail <jailid|jailname> <filesystem>\n")); + case HELP_UNJAIL: + return (gettext("\tunjail <jailid|jailname> <filesystem>\n")); + case HELP_WAIT: + return (gettext("\twait [-t <activity>] <filesystem>\n")); + } + + abort(); + /* NOTREACHED */ +} + +void +nomem(void) +{ + (void) fprintf(stderr, gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Utility function to guarantee malloc() success. + */ + +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + nomem(); + + return (data); +} + +static void * +safe_realloc(void *data, size_t size) +{ + void *newp; + if ((newp = realloc(data, size)) == NULL) { + free(data); + nomem(); + } + + return (newp); +} + +static char * +safe_strdup(char *str) +{ + char *dupstr = strdup(str); + + if (dupstr == NULL) + nomem(); + + return (dupstr); +} + +/* + * Callback routine that will print out information for each of + * the properties. + */ +static int +usage_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + + if (zfs_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, "YES "); + + if (zfs_prop_inheritable(prop)) + (void) fprintf(fp, " YES "); + else + (void) fprintf(fp, " NO "); + + if (zfs_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. 
Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +static void +usage(boolean_t requested) +{ + int i; + boolean_t show_properties = B_FALSE; + FILE *fp = requested ? stdout : stderr; + + if (current_command == NULL) { + + (void) fprintf(fp, gettext("usage: zfs command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + + (void) fprintf(fp, gettext("\nEach dataset is of the form: " + "pool/[dataset/]*dataset[@name]\n")); + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + (strcmp(current_command->name, "set") == 0 || + strcmp(current_command->name, "get") == 0 || + strcmp(current_command->name, "inherit") == 0 || + strcmp(current_command->name, "list") == 0)) + show_properties = B_TRUE; + + if (show_properties) { + (void) fprintf(fp, + gettext("\nThe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + "PROPERTY", "EDIT", "INHERIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_DATASET); + + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "projectused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "userobjused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", 
"userquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "projectquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); + (void) fprintf(fp, "YES NO <size> | none\n"); + (void) fprintf(fp, "\t%-15s ", "written@<snap>"); + (void) fprintf(fp, " NO NO <size>\n"); + (void) fprintf(fp, "\t%-15s ", "written#<bookmark>"); + (void) fprintf(fp, " NO NO <size>\n"); + + (void) fprintf(fp, gettext("\nSizes are specified in bytes " + "with standard units such as K, M, G, etc.\n")); + (void) fprintf(fp, gettext("\nUser-defined properties can " + "be specified by using a name containing a colon (:).\n")); + (void) fprintf(fp, gettext("\nThe {user|group|project}" + "[obj]{used|quota}@ properties must be appended with\n" + "a user|group|project specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); + } else { + (void) fprintf(fp, + gettext("\nFor the property list, run: %s\n"), + "zfs set|get"); + (void) fprintf(fp, + gettext("\nFor the delegated permission list, run: %s\n"), + "zfs allow|unallow"); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 0 : 2); +} + +/* + * Take a property=value argument string and add it to the given nvlist. + * Modifies the argument inplace. 
+ */ +static boolean_t +parseprop(nvlist_t *props, char *propname) +{ + char *propval; + + if ((propval = strchr(propname, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for property=value argument\n")); + return (B_FALSE); + } + *propval = '\0'; + propval++; + if (nvlist_exists(props, propname)) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (B_FALSE); + } + if (nvlist_add_string(props, propname, propval) != 0) + nomem(); + return (B_TRUE); +} + +/* + * Take a property name argument and add it to the given nvlist. + * Modifies the argument inplace. + */ +static boolean_t +parsepropname(nvlist_t *props, char *propname) +{ + if (strchr(propname, '=') != NULL) { + (void) fprintf(stderr, gettext("invalid character " + "'=' in property argument\n")); + return (B_FALSE); + } + if (nvlist_exists(props, propname)) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (B_FALSE); + } + if (nvlist_add_boolean(props, propname) != 0) + nomem(); + return (B_TRUE); +} + +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); +} + +#define PROGRESS_DELAY 2 /* seconds */ + +static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; +static time_t pt_begin; +static char *pt_header = NULL; +static boolean_t pt_shown; + +static void +start_progress_timer(void) +{ + pt_begin = time(NULL) + PROGRESS_DELAY; + pt_shown = B_FALSE; +} + +static void +set_progress_header(char *header) +{ + assert(pt_header == NULL); + pt_header = safe_strdup(header); + if (pt_shown) { + (void) printf("%s: ", header); 
+ (void) fflush(stdout); + } +} + +static void +update_progress(char *update) +{ + if (!pt_shown && time(NULL) > pt_begin) { + int len = strlen(update); + + (void) printf("%s: %s%*.*s", pt_header, update, len, len, + pt_reverse); + (void) fflush(stdout); + pt_shown = B_TRUE; + } else if (pt_shown) { + int len = strlen(update); + + (void) printf("%s%*.*s", update, len, len, pt_reverse); + (void) fflush(stdout); + } +} + +static void +finish_progress(char *done) +{ + if (pt_shown) { + (void) printf("%s\n", done); + (void) fflush(stdout); + } + free(pt_header); + pt_header = NULL; +} + +static int +zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) +{ + zfs_handle_t *zhp = NULL; + int ret = 0; + + zhp = zfs_open(hdl, dataset, type); + if (zhp == NULL) + return (1); + + /* + * Volumes may neither be mounted or shared. Potentially in the + * future filesystems detected on these volumes could be mounted. + */ + if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { + zfs_close(zhp); + return (0); + } + + /* + * Mount and/or share the new filesystem as appropriate. We provide a + * verbose error message to let the user know that their filesystem was + * in fact created, even if we failed to mount or share it. 
+ * + * If the user doesn't want the dataset automatically mounted, then + * skip the mount/share step + */ + if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && + zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { + if (zfs_mount_delegation_check()) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but it may only be " + "mounted by root\n")); + ret = 1; + } else if (zfs_mount(zhp, NULL, 0) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not mounted\n")); + ret = 1; + } else if (zfs_share(zhp) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not shared\n")); + ret = 1; + } + zfs_commit_all_shares(); + } + + zfs_close(zhp); + + return (ret); +} + +/* + * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol> + * + * Given an existing dataset, create a writable copy whose initial contents + * are the same as the source. The newly created dataset maintains a + * dependency on the original; the original cannot be destroyed so long as + * the clone exists. + * + * The '-p' flag creates all the non-existing ancestors of the target first. 
+ */ +static int +zfs_do_clone(int argc, char **argv) +{ + zfs_handle_t *zhp = NULL; + boolean_t parents = B_FALSE; + nvlist_t *props; + int ret = 0; + int c; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, "o:p")) != -1) { + switch (c) { + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(props); + return (1); + } + break; + case 'p': + parents = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source dataset " + "argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing target dataset " + "argument\n")); + goto usage; + } + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto usage; + } + + /* open the source dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { + nvlist_free(props); + return (1); + } + + if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) { + /* + * Now create the ancestors of the target dataset. If the + * target already exists and '-p' option was used we should not + * complain. 
+ */ + if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) { + zfs_close(zhp); + nvlist_free(props); + return (0); + } + if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { + zfs_close(zhp); + nvlist_free(props); + return (1); + } + } + + /* pass to libzfs */ + ret = zfs_clone(zhp, argv[1], props); + + /* create the mountpoint if necessary */ + if (ret == 0) { + if (log_history) { + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); + } + + zfs_close(zhp); + nvlist_free(props); + + return (!!ret); + +usage: + ASSERT3P(zhp, ==, NULL); + nvlist_free(props); + usage(B_FALSE); + return (-1); +} + +/* + * zfs create [-Pnpv] [-o prop=value] ... fs + * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size + * + * Create a new dataset. This command can be used to create filesystems + * and volumes. Snapshot creation is handled by 'zfs snapshot'. + * For volumes, the user must specify a size to be used. + * + * The '-s' flag applies only to volumes, and indicates that we should not try + * to set the reservation for this volume. By default we set a reservation + * equal to the size for any volume. For pools with SPA_VERSION >= + * SPA_VERSION_REFRESERVATION, we set a refreservation instead. + * + * The '-p' flag creates all the non-existing ancestors of the target first. + * + * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity + * check of arguments and properties, but does not check for permissions, + * available space, etc. + * + * The '-v' flag is for verbose output. + * + * The '-P' flag is used for parseable output. It implies '-v'. 
+ */ +static int +zfs_do_create(int argc, char **argv) +{ + zfs_type_t type = ZFS_TYPE_FILESYSTEM; + zpool_handle_t *zpool_handle = NULL; + nvlist_t *real_props = NULL; + uint64_t volsize = 0; + int c; + boolean_t noreserve = B_FALSE; + boolean_t bflag = B_FALSE; + boolean_t parents = B_FALSE; + boolean_t dryrun = B_FALSE; + boolean_t verbose = B_FALSE; + boolean_t parseable = B_FALSE; + int ret = 1; + nvlist_t *props; + uint64_t intval; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) { + switch (c) { + case 'V': + type = ZFS_TYPE_VOLUME; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) + nomem(); + volsize = intval; + break; + case 'P': + verbose = B_TRUE; + parseable = B_TRUE; + break; + case 'p': + parents = B_TRUE; + break; + case 'b': + bflag = B_TRUE; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "block size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + intval) != 0) + nomem(); + break; + case 'n': + dryrun = B_TRUE; + break; + case 'o': + if (!parseprop(props, optarg)) + goto error; + break; + case 's': + noreserve = B_TRUE; + break; + case 'v': + verbose = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing size " + "argument\n")); + goto badusage; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { + (void) fprintf(stderr, gettext("'-s' and '-b' can only be " + "used when creating a volume\n")); + goto badusage; + } + + argc -= optind; + argv += 
optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing %s argument\n"), + zfs_type_to_name(type)); + goto badusage; + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto badusage; + } + + if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { + char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; + char *p; + + if ((p = strchr(argv[0], '/')) != NULL) + *p = '\0'; + zpool_handle = zpool_open(g_zfs, argv[0]); + if (p != NULL) + *p = '/'; + if (zpool_handle == NULL) + goto error; + + (void) snprintf(msg, sizeof (msg), + dryrun ? gettext("cannot verify '%s'") : + gettext("cannot create '%s'"), argv[0]); + if (props && (real_props = zfs_valid_proplist(g_zfs, type, + props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { + zpool_close(zpool_handle); + goto error; + } + } + + /* + * if volsize is not a multiple of volblocksize, round it up to the + * nearest multiple of the volblocksize + */ + if (type == ZFS_TYPE_VOLUME) { + uint64_t volblocksize; + + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + + if (volsize % volblocksize) { + volsize = P2ROUNDUP_TYPED(volsize, volblocksize, + uint64_t); + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { + nvlist_free(props); + nomem(); + } + } + } + + + if (type == ZFS_TYPE_VOLUME && !noreserve) { + uint64_t spa_version; + zfs_prop_t resv_prop; + char *strval; + + spa_version = zpool_get_prop_int(zpool_handle, + ZPOOL_PROP_VERSION, NULL); + if (spa_version >= SPA_VERSION_REFRESERVATION) + resv_prop = ZFS_PROP_REFRESERVATION; + else + resv_prop = ZFS_PROP_RESERVATION; + + volsize = zvol_volsize_to_reservation(zpool_handle, volsize, + real_props); + + if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), + &strval) != 0) { + if (nvlist_add_uint64(props, + zfs_prop_to_name(resv_prop), volsize) != 0) { + nvlist_free(props); + 
nomem(); + } + } + } + if (zpool_handle != NULL) { + zpool_close(zpool_handle); + nvlist_free(real_props); + } + + if (parents && zfs_name_valid(argv[0], type)) { + /* + * Now create the ancestors of target dataset. If the target + * already exists and '-p' option was used we should not + * complain. + */ + if (zfs_dataset_exists(g_zfs, argv[0], type)) { + ret = 0; + goto error; + } + if (verbose) { + (void) printf(parseable ? "create_ancestors\t%s\n" : + dryrun ? "would create ancestors of %s\n" : + "create ancestors of %s\n", argv[0]); + } + if (!dryrun) { + if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { + goto error; + } + } + } + + if (verbose) { + nvpair_t *nvp = NULL; + (void) printf(parseable ? "create\t%s\n" : + dryrun ? "would create %s\n" : "create %s\n", argv[0]); + while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { + uint64_t uval; + char *sval; + + switch (nvpair_type(nvp)) { + case DATA_TYPE_UINT64: + VERIFY0(nvpair_value_uint64(nvp, &uval)); + (void) printf(parseable ? + "property\t%s\t%llu\n" : "\t%s=%llu\n", + nvpair_name(nvp), (u_longlong_t)uval); + break; + case DATA_TYPE_STRING: + VERIFY0(nvpair_value_string(nvp, &sval)); + (void) printf(parseable ? 
+ "property\t%s\t%s\n" : "\t%s=%s\n", + nvpair_name(nvp), sval); + break; + default: + (void) fprintf(stderr, "property '%s' " + "has illegal type %d\n", + nvpair_name(nvp), nvpair_type(nvp)); + abort(); + } + } + } + if (dryrun) { + ret = 0; + goto error; + } + + /* pass to libzfs */ + if (zfs_create(g_zfs, argv[0], type, props) != 0) + goto error; + + if (log_history) { + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); +error: + nvlist_free(props); + return (ret); +badusage: + nvlist_free(props); + usage(B_FALSE); + return (2); +} + +/* + * zfs destroy [-rRf] <fs, vol> + * zfs destroy [-rRd] <snap> + * + * -r Recursively destroy all children + * -R Recursively destroy all dependents, including clones + * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction + * + * Destroys the given dataset. By default, it will unmount any filesystems, + * and refuse to destroy a dataset that has any dependents. A dependent can + * either be a child, or a clone of a child. + */ +typedef struct destroy_cbdata { + boolean_t cb_first; + boolean_t cb_force; + boolean_t cb_recurse; + boolean_t cb_error; + boolean_t cb_doclones; + zfs_handle_t *cb_target; + boolean_t cb_defer_destroy; + boolean_t cb_verbose; + boolean_t cb_parsable; + boolean_t cb_dryrun; + nvlist_t *cb_nvl; + nvlist_t *cb_batchedsnaps; + + /* first snap in contiguous run */ + char *cb_firstsnap; + /* previous snap in contiguous run */ + char *cb_prevsnap; + int64_t cb_snapused; + char *cb_snapspec; + char *cb_bookmark; + uint64_t cb_snap_count; +} destroy_cbdata_t; + +/* + * Check for any dependents based on the '-r' or '-R' flags. 
+ */ +static int +destroy_check_dependent(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cbp = data; + const char *tname = zfs_get_name(cbp->cb_target); + const char *name = zfs_get_name(zhp); + + if (strncmp(tname, name, strlen(tname)) == 0 && + (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { + /* + * This is a direct descendant, not a clone somewhere else in + * the hierarchy. + */ + if (cbp->cb_recurse) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has children\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-r' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = B_TRUE; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } else { + /* + * This is a clone. We only want to report this if the '-r' + * wasn't specified, or the target is a snapshot. + */ + if (!cbp->cb_recurse && + zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has dependent clones\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-R' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = B_TRUE; + cbp->cb_dryrun = B_TRUE; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } + +out: + zfs_close(zhp); + return (0); +} + +static int +destroy_batched(destroy_cbdata_t *cb) +{ + int error = zfs_destroy_snaps_nvl(g_zfs, + cb->cb_batchedsnaps, B_FALSE); + fnvlist_free(cb->cb_batchedsnaps); + cb->cb_batchedsnaps = fnvlist_alloc(); + return (error); +} + +static int +destroy_callback(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cb = data; + const char *name = zfs_get_name(zhp); + int error; + + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", 
name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + + /* + * Ignore pools (which we've already flagged as an error before getting + * here). + */ + if (strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + zfs_close(zhp); + return (0); + } + if (cb->cb_dryrun) { + zfs_close(zhp); + return (0); + } + + /* + * We batch up all contiguous snapshots (even of different + * filesystems) and destroy them with one ioctl. We can't + * simply do all snap deletions and then all fs deletions, + * because we must delete a clone before its origin. + */ + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { + cb->cb_snap_count++; + fnvlist_add_boolean(cb->cb_batchedsnaps, name); + if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) + error = destroy_batched(cb); + } else { + error = destroy_batched(cb); + if (error != 0 || + zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || + zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { + zfs_close(zhp); + /* + * When performing a recursive destroy we ignore errors + * so that the recursive destroy could continue + * destroying past problem datasets + */ + if (cb->cb_recurse) { + cb->cb_error = B_TRUE; + return (0); + } + return (-1); + } + } + + zfs_close(zhp); + return (0); +} + +static int +destroy_print_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + const char *name = zfs_get_name(zhp); + int err = 0; + + if (nvlist_exists(cb->cb_nvl, name)) { + if (cb->cb_firstsnap == NULL) + cb->cb_firstsnap = strdup(name); + if (cb->cb_prevsnap != NULL) + free(cb->cb_prevsnap); + /* this snap continues the current range */ + cb->cb_prevsnap = strdup(name); + if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) + nomem(); + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + } else if (cb->cb_firstsnap != NULL) { + /* end of this range */ + uint64_t used = 0; + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + zfs_close(zhp); + return (err); +} + +static int +destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) +{ + int err; + assert(cb->cb_firstsnap == NULL); + assert(cb->cb_prevsnap == NULL); + err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); + if (cb->cb_firstsnap != NULL) { + uint64_t used = 0; + if (err == 0) { + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + } + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + return (err); +} + +static int 
+snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + /* Check for clones. */ + if (!cb->cb_doclones && !cb->cb_defer_destroy) { + cb->cb_target = zhp; + cb->cb_first = B_TRUE; + err = zfs_iter_dependents(zhp, B_TRUE, + destroy_check_dependent, cb); + } + + if (err == 0) { + if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) + nomem(); + } + zfs_close(zhp); + return (err); +} + +static int +gather_snapshots(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); + if (err == ENOENT) + err = 0; + if (err != 0) + goto out; + + if (cb->cb_verbose) { + err = destroy_print_snapshots(zhp, cb); + if (err != 0) + goto out; + } + + if (cb->cb_recurse) + err = zfs_iter_filesystems(zhp, gather_snapshots, cb); + +out: + zfs_close(zhp); + return (err); +} + +static int +destroy_clones(destroy_cbdata_t *cb) +{ + nvpair_t *pair; + for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); + pair != NULL; + pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { + zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), + ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + boolean_t defer = cb->cb_defer_destroy; + int err; + + /* + * We can't defer destroy non-snapshots, so set it to + * false while destroying the clones. 
+ */ + cb->cb_defer_destroy = B_FALSE; + err = zfs_iter_dependents(zhp, B_FALSE, + destroy_callback, cb); + cb->cb_defer_destroy = defer; + zfs_close(zhp); + if (err != 0) + return (err); + } + } + return (0); +} + +static int +zfs_do_destroy(int argc, char **argv) +{ + destroy_cbdata_t cb = { 0 }; + int rv = 0; + int err = 0; + int c; + zfs_handle_t *zhp = NULL; + char *at, *pound; + zfs_type_t type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, "vpndfrR")) != -1) { + switch (c) { + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'p': + cb.cb_verbose = B_TRUE; + cb.cb_parsable = B_TRUE; + break; + case 'n': + cb.cb_dryrun = B_TRUE; + break; + case 'd': + cb.cb_defer_destroy = B_TRUE; + type = ZFS_TYPE_SNAPSHOT; + break; + case 'f': + cb.cb_force = B_TRUE; + break; + case 'r': + cb.cb_recurse = B_TRUE; + break; + case 'R': + cb.cb_recurse = B_TRUE; + cb.cb_doclones = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing dataset argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + at = strchr(argv[0], '@'); + pound = strchr(argv[0], '#'); + if (at != NULL) { + + /* Build the list of snaps to destroy in cb_nvl. 
*/ + cb.cb_nvl = fnvlist_alloc(); + + *at = '\0'; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + nvlist_free(cb.cb_nvl); + return (1); + } + + cb.cb_snapspec = at + 1; + if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || + cb.cb_error) { + rv = 1; + goto out; + } + + if (nvlist_empty(cb.cb_nvl)) { + (void) fprintf(stderr, gettext("could not find any " + "snapshots to destroy; check snapshot names.\n")); + rv = 1; + goto out; + } + + if (cb.cb_verbose) { + char buf[16]; + zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); + if (cb.cb_parsable) { + (void) printf("reclaim\t%llu\n", + (u_longlong_t)cb.cb_snapused); + } else if (cb.cb_dryrun) { + (void) printf(gettext("would reclaim %s\n"), + buf); + } else { + (void) printf(gettext("will reclaim %s\n"), + buf); + } + } + + if (!cb.cb_dryrun) { + if (cb.cb_doclones) { + cb.cb_batchedsnaps = fnvlist_alloc(); + err = destroy_clones(&cb); + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, B_FALSE); + } + if (err != 0) { + rv = 1; + goto out; + } + } + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, + cb.cb_defer_destroy); + } + } + + if (err != 0) + rv = 1; + } else if (pound != NULL) { + int err; + nvlist_t *nvl; + + if (cb.cb_dryrun) { + (void) fprintf(stderr, + "dryrun is not supported with bookmark\n"); + return (-1); + } + + if (cb.cb_defer_destroy) { + (void) fprintf(stderr, + "defer destroy is not supported with bookmark\n"); + return (-1); + } + + if (cb.cb_recurse) { + (void) fprintf(stderr, + "recursive is not supported with bookmark\n"); + return (-1); + } + + /* + * Unfortunately, zfs_bookmark() doesn't honor the + * casesensitivity setting. However, we can't simply + * remove this check, because lzc_destroy_bookmarks() + * ignores non-existent bookmarks, so this is necessary + * to get a proper error message. 
+ */ + if (!zfs_bookmark_exists(argv[0])) { + (void) fprintf(stderr, gettext("bookmark '%s' " + "does not exist.\n"), argv[0]); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_boolean(nvl, argv[0]); + + err = lzc_destroy_bookmarks(nvl, NULL); + if (err != 0) { + (void) zfs_standard_error(g_zfs, err, + "cannot destroy bookmark"); + } + + nvlist_free(nvl); + + return (err); + } else { + /* Open the given dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) + return (1); + + cb.cb_target = zhp; + + /* + * Perform an explicit check for pools before going any further. + */ + if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "operation does not apply to pools\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zfs destroy -r " + "%s' to destroy all datasets in the pool\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zpool destroy %s' " + "to destroy the pool itself\n"), zfs_get_name(zhp)); + rv = 1; + goto out; + } + + /* + * Check for any dependents and/or clones. + */ + cb.cb_first = B_TRUE; + if (!cb.cb_doclones && + zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, + &cb) != 0) { + rv = 1; + goto out; + } + + if (cb.cb_error) { + rv = 1; + goto out; + } + cb.cb_batchedsnaps = fnvlist_alloc(); + if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, + &cb) != 0) { + rv = 1; + goto out; + } + + /* + * Do the real thing. The callback will close the + * handle regardless of whether it succeeds or not. 
+ */ + err = destroy_callback(zhp, &cb); + zhp = NULL; + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, cb.cb_defer_destroy); + } + if (err != 0 || cb.cb_error == B_TRUE) + rv = 1; + } + +out: + fnvlist_free(cb.cb_batchedsnaps); + fnvlist_free(cb.cb_nvl); + if (zhp != NULL) + zfs_close(zhp); + return (rv); +} + +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + +/* + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... + * + * -r recurse over any child datasets + * -H scripted mode. Headers are stripped, and fields are separated + * by tabs instead of spaces. + * -o Set of fields to display. One of "name,property,value, + * received,source". Default is "name,property,value,source". + * "all" is an alias for all five. + * -s Set of sources to allow. One of + * "local,default,inherited,received,temporary,none". Default is + * all six. + * -p Display values in parsable (literal) format. + * + * Prints properties for the given datasets. The user can control which + * columns to display as well as which property types to allow. + */ + +/* + * Invoked to display the properties for a single dataset. + */ +static int +get_callback(zfs_handle_t *zhp, void *data) +{ + char buf[ZFS_MAXPROPLEN]; + char rbuf[ZFS_MAXPROPLEN]; + zprop_source_t sourcetype; + char source[ZFS_MAX_DATASET_NAME_LEN]; + zprop_get_cbdata_t *cbp = data; + nvlist_t *user_props = zfs_get_user_props(zhp); + zprop_list_t *pl = cbp->cb_proplist; + nvlist_t *propval; + char *strval; + char *sourceval; + boolean_t received = is_recvd_column(cbp); + + for (; pl != NULL; pl = pl->pl_next) { + char *recvdval = NULL; + /* + * Skip the special fake placeholder. 
This will also skip over + * the name property when 'all' is specified. + */ + if (pl->pl_prop == ZFS_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (pl->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, pl->pl_prop, buf, + sizeof (buf), &sourcetype, source, + sizeof (source), + cbp->cb_literal) != 0) { + if (pl->pl_all) + continue; + if (!zfs_prop_valid_for_type(pl->pl_prop, + ZFS_TYPE_DATASET, B_FALSE)) { + (void) fprintf(stderr, + gettext("No such property '%s'\n"), + zfs_prop_to_name(pl->pl_prop)); + continue; + } + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + if (received && (zfs_prop_get_recvd(zhp, + zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + zfs_prop_to_name(pl->pl_prop), + buf, sourcetype, source, recvdval); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); + } else if (zfs_prop_written(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); + } else { + if (nvlist_lookup_nvlist(user_props, + pl->pl_user_prop, &propval) != 0) { + if (pl->pl_all) + continue; + sourcetype = ZPROP_SRC_NONE; + strval = "-"; + } else { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + verify(nvlist_lookup_string(propval, + ZPROP_SOURCE, &sourceval) == 0); + + if (strcmp(sourceval, + zfs_get_name(zhp)) == 0) { + sourcetype = 
ZPROP_SRC_LOCAL; + } else if (strcmp(sourceval, + ZPROP_SOURCE_VAL_RECVD) == 0) { + sourcetype = ZPROP_SRC_RECEIVED; + } else { + sourcetype = ZPROP_SRC_INHERITED; + (void) strlcpy(source, + sourceval, sizeof (source)); + } + } + + if (received && (zfs_prop_get_recvd(zhp, + pl->pl_user_prop, rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, strval, sourcetype, + source, recvdval); + } + } + + return (0); +} + +static int +zfs_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; + char *value, *fields; + int ret = 0; + int limit = 0; + zprop_list_t fake_name = { 0 }; + + /* + * Set up default columns and sources. + */ + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'd': + limit = parse_depth(optarg, &flags); + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'o': + /* + * Process the set of columns to display. We zero out + * the structure to give us a blank slate. 
+ */ + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "received", + "source", "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_RECVD; + flags |= ZFS_ITER_RECVD_PROPS; + break; + case 4: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 5: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_RECVD; + cb.cb_columns[4] = GET_COL_SOURCE; + flags |= ZFS_ITER_RECVD_PROPS; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case 's': + cb.cb_sources = 0; + while (*optarg != '\0') { + static char *source_subopts[] = { + "local", "default", "inherited", + "received", "temporary", "none", + NULL }; + + switch (getsubopt(&optarg, source_subopts, + &value)) { + case 0: + cb.cb_sources |= ZPROP_SRC_LOCAL; + break; + case 1: + cb.cb_sources |= ZPROP_SRC_DEFAULT; + break; + case 2: + cb.cb_sources |= ZPROP_SRC_INHERITED; + break; + case 3: + cb.cb_sources |= ZPROP_SRC_RECEIVED; + break; + case 4: + cb.cb_sources |= ZPROP_SRC_TEMPORARY; + break; + case 5: + cb.cb_sources |= ZPROP_SRC_NONE; + break; + default: + (void) fprintf(stderr, + gettext("invalid source " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case 't': + types = 0; + flags &= 
~ZFS_ITER_PROP_LISTSNAPS; + while (*optarg != '\0') { + static char *type_subopts[] = { "filesystem", + "volume", "snapshot", "snap", "bookmark", + "all", NULL }; + + switch (getsubopt(&optarg, type_subopts, + &value)) { + case 0: + types |= ZFS_TYPE_FILESYSTEM; + break; + case 1: + types |= ZFS_TYPE_VOLUME; + break; + case 2: + case 3: + types |= ZFS_TYPE_SNAPSHOT; + break; + case 4: + types |= ZFS_TYPE_BOOKMARK; + break; + case 5: + types = ZFS_TYPE_DATASET | + ZFS_TYPE_BOOKMARK; + break; + + default: + (void) fprintf(stderr, + gettext("invalid type '%s'\n"), + value); + usage(B_FALSE); + } + } + break; + + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + fields = argv[0]; + + /* + * Handle users who want to get all snapshots or bookmarks + * of a dataset (ex. 'zfs get -t snapshot refer <dataset>'). + */ + if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && + argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { + flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); + limit = 1; + } + + if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) + != 0) + usage(B_FALSE); + + argc--; + argv++; + + /* + * As part of zfs_expand_proplist(), we keep track of the maximum column + * width for each property. For the 'NAME' (and 'SOURCE') columns, we + * need to know the maximum name length. However, the user likely did + * not specify 'name' as one of the properties to fetch, so we need to + * make sure we always include at least this property for + * print_get_headers() to work properly. 
+ */ + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZFS_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + cb.cb_first = B_TRUE; + + /* run for each object */ + ret = zfs_for_each(argc, argv, flags, types, NULL, + &cb.cb_proplist, limit, get_callback, &cb); + + if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +/* + * inherit [-rS] <property> <fs|vol> ... + * + * -r Recurse over all children + * -S Revert to received value, if any + * + * For each dataset specified on the command line, inherit the given property + * from its parent. Inheriting a property at the pool level will cause it to + * use the default value. The '-r' flag will recurse over all children, and is + * useful for setting a property on a hierarchy-wide basis, regardless of any + * local modifications for each dataset. + */ + +typedef struct inherit_cbdata { + const char *cb_propname; + boolean_t cb_received; +} inherit_cbdata_t; + +static int +inherit_recurse_cb(zfs_handle_t *zhp, void *data) +{ + inherit_cbdata_t *cb = data; + zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); + + /* + * If we're doing it recursively, then ignore properties that + * are not valid for this type of dataset. 
+ */ + if (prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) + return (0); + + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); +} + +static int +inherit_cb(zfs_handle_t *zhp, void *data) +{ + inherit_cbdata_t *cb = data; + + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); +} + +static int +zfs_do_inherit(int argc, char **argv) +{ + int c; + zfs_prop_t prop; + inherit_cbdata_t cb = { 0 }; + char *propname; + int ret = 0; + int flags = 0; + boolean_t received = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "rS")) != -1) { + switch (c) { + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'S': + received = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing dataset argument\n")); + usage(B_FALSE); + } + + propname = argv[0]; + argc--; + argv++; + + if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + if (zfs_prop_readonly(prop)) { + (void) fprintf(stderr, gettext( + "%s property is read-only\n"), + propname); + return (1); + } + if (!zfs_prop_inheritable(prop) && !received) { + (void) fprintf(stderr, gettext("'%s' property cannot " + "be inherited\n"), propname); + if (prop == ZFS_PROP_QUOTA || + prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + (void) fprintf(stderr, gettext("use 'zfs set " + "%s=none' to clear\n"), propname); + (void) fprintf(stderr, gettext("use 'zfs " + "inherit -S %s' to revert to received " + "value\n"), propname); + } + return (1); + } + if (received && (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION)) { + (void) fprintf(stderr, gettext("'%s' property 
cannot " + "be reverted to a received value\n"), propname); + return (1); + } + } else if (!zfs_prop_user(propname)) { + (void) fprintf(stderr, gettext("invalid property '%s'\n"), + propname); + usage(B_FALSE); + } + + cb.cb_propname = propname; + cb.cb_received = received; + + if (flags & ZFS_ITER_RECURSE) { + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, + NULL, NULL, 0, inherit_recurse_cb, &cb); + } else { + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, + NULL, NULL, 0, inherit_cb, &cb); + } + + return (ret); +} + +typedef struct upgrade_cbdata { + uint64_t cb_numupgraded; + uint64_t cb_numsamegraded; + uint64_t cb_numfailed; + uint64_t cb_version; + boolean_t cb_newer; + boolean_t cb_foundone; + char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; +} upgrade_cbdata_t; + +static int +same_pool(zfs_handle_t *zhp, const char *name) +{ + int len1 = strcspn(name, "/@"); + const char *zhname = zfs_get_name(zhp); + int len2 = strcspn(zhname, "/@"); + + if (len1 != len2) + return (B_FALSE); + return (strncmp(name, zhname, len1) == 0); +} + +static int +upgrade_list_callback(zfs_handle_t *zhp, void *data) +{ + upgrade_cbdata_t *cb = data; + int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + + /* list if it's old/new */ + if ((!cb->cb_newer && version < ZPL_VERSION) || + (cb->cb_newer && version > ZPL_VERSION)) { + char *str; + if (cb->cb_newer) { + str = gettext("The following filesystems are " + "formatted using a newer software version and\n" + "cannot be accessed on the current system.\n\n"); + } else { + str = gettext("The following filesystems are " + "out of date, and can be upgraded. 
After being\n" + "upgraded, these filesystems (and any 'zfs send' " + "streams generated from\n" + "subsequent snapshots) will no longer be " + "accessible by older software versions.\n\n"); + } + + if (!cb->cb_foundone) { + (void) puts(str); + (void) printf(gettext("VER FILESYSTEM\n")); + (void) printf(gettext("--- ------------\n")); + cb->cb_foundone = B_TRUE; + } + + (void) printf("%2u %s\n", version, zfs_get_name(zhp)); + } + + return (0); +} + +static int +upgrade_set_callback(zfs_handle_t *zhp, void *data) +{ + upgrade_cbdata_t *cb = data; + int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + int needed_spa_version; + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + needed_spa_version = zfs_spa_version_map(cb->cb_version); + + if (needed_spa_version < 0) + return (-1); + + if (spa_version < needed_spa_version) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), needed_spa_version); + cb->cb_numfailed++; + return (0); + } + + /* upgrade */ + if (version < cb->cb_version) { + char verstr[16]; + (void) snprintf(verstr, sizeof (verstr), + "%llu", (u_longlong_t)cb->cb_version); + if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { + /* + * If they did "zfs upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + if (zfs_prop_set(zhp, "version", verstr) == 0) + cb->cb_numupgraded++; + else + cb->cb_numfailed++; + (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); + } else if (version > cb->cb_version) { + /* can't downgrade */ + (void) printf(gettext("%s: can not be downgraded; " + "it is already at version %u\n"), + zfs_get_name(zhp), version); + cb->cb_numfailed++; + } else { + cb->cb_numsamegraded++; + } + return (0); +} + +/* + * zfs upgrade + * zfs upgrade -v + * zfs upgrade [-r] [-V <version>] <-a | filesystem> + */ +static int +zfs_do_upgrade(int argc, char **argv) +{ + boolean_t all = B_FALSE; + boolean_t showversions = B_FALSE; + int ret = 0; + upgrade_cbdata_t cb = { 0 }; + int c; + int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + + /* check options */ + while ((c = getopt(argc, argv, "rvV:a")) != -1) { + switch (c) { + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'v': + showversions = B_TRUE; + break; + case 'V': + if (zfs_prop_string_to_index(ZFS_PROP_VERSION, + optarg, &cb.cb_version) != 0) { + (void) fprintf(stderr, + gettext("invalid version %s\n"), optarg); + usage(B_FALSE); + } + break; + case 'a': + all = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) + usage(B_FALSE); + if (showversions && (flags & ZFS_ITER_RECURSE || all || + cb.cb_version || argc)) + usage(B_FALSE); + if ((all || argc) && (showversions)) + usage(B_FALSE); + if (all && argc) + usage(B_FALSE); + + if (showversions) { + /* Show info on available versions. 
*/ + (void) printf(gettext("The following filesystem versions are " + "supported:\n\n")); + (void) printf(gettext("VER DESCRIPTION\n")); + (void) printf("--- -----------------------------------------" + "---------------\n"); + (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); + (void) printf(gettext(" 2 Enhanced directory entries\n")); + (void) printf(gettext(" 3 Case insensitive and filesystem " + "user identifier (FUID)\n")); + (void) printf(gettext(" 4 userquota, groupquota " + "properties\n")); + (void) printf(gettext(" 5 System attributes\n")); + (void) printf(gettext("\nFor more information on a particular " + "version, including supported releases,\n")); + (void) printf("see the ZFS Administration Guide.\n\n"); + ret = 0; + } else if (argc || all) { + /* Upgrade filesystems */ + if (cb.cb_version == 0) + cb.cb_version = ZPL_VERSION; + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_set_callback, &cb); + (void) printf(gettext("%llu filesystems upgraded\n"), + (u_longlong_t)cb.cb_numupgraded); + if (cb.cb_numsamegraded) { + (void) printf(gettext("%llu filesystems already at " + "this version\n"), + (u_longlong_t)cb.cb_numsamegraded); + } + if (cb.cb_numfailed != 0) + ret = 1; + } else { + /* List old-version filesystems */ + boolean_t found; + (void) printf(gettext("This system is currently running " + "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); + + flags |= ZFS_ITER_RECURSE; + ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_list_callback, &cb); + + found = cb.cb_foundone; + cb.cb_foundone = B_FALSE; + cb.cb_newer = B_TRUE; + + ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_list_callback, &cb); + + if (!cb.cb_foundone && !found) { + (void) printf(gettext("All filesystems are " + "formatted with the current version.\n")); + } + } + + return (ret); +} + +/* + * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] 
+ *               [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *                [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...]
+ *                  [-S field [-S field]...] filesystem | snapshot
+ *
+ *	-H      Scripted mode; elide headers and separate columns by tabs.
+ *	-i	Translate SID to POSIX ID.
+ *	-n	Print numeric ID instead of user/group name.
+ *	-o      Control which fields to display.
+ *	-p	Use exact (parsable) numeric output.
+ *	-s      Specify sort columns, ascending order.
+ *	-S      Specify sort columns, descending order.
+ *	-t      Control which object types to display.
+ *
+ *	Displays space consumed by, and quotas on, each user in the specified
+ *	filesystem or snapshot.
+ */
+
+/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
+enum us_field_types {
+	USFIELD_TYPE,
+	USFIELD_NAME,
+	USFIELD_USED,
+	USFIELD_QUOTA,
+	USFIELD_OBJUSED,
+	USFIELD_OBJQUOTA
+};
+static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA",
+				    "OBJUSED", "OBJQUOTA" };
+static char *us_field_names[] = { "type", "name", "used", "quota",
+				    "objused", "objquota" };
+#define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (char *))
+
+#define	USTYPE_PSX_GRP	(1 << 0)
+#define	USTYPE_PSX_USR	(1 << 1)
+#define	USTYPE_SMB_GRP	(1 << 2)
+#define	USTYPE_SMB_USR	(1 << 3)
+#define	USTYPE_PROJ	(1 << 4)
+#define	USTYPE_ALL	\
+	(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \
+	    USTYPE_PROJ)
+
+/* us_type_bits and us_type_names are indexed in parallel (-t parsing) */
+static int us_type_bits[] = {
+	USTYPE_PSX_GRP,
+	USTYPE_PSX_USR,
+	USTYPE_SMB_GRP,
+	USTYPE_SMB_USR,
+	USTYPE_ALL
+};
+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
+	"smbuser", "all" };
+
+/* One output row; linked into both the AVL tree and the temporary list */
+typedef struct us_node {
+	nvlist_t	*usn_nvl;
+	uu_avl_node_t	usn_avlnode;
+	uu_list_node_t	usn_listnode;
+} us_node_t;
+
+/* State handed to userspace_cb() through zfs_userspace() */
+typedef struct us_cbdata {
+	nvlist_t	**cb_nvlp;
+	uu_avl_pool_t	*cb_avl_pool;
+	uu_avl_t	*cb_avl;
+	boolean_t	cb_numname;
+	boolean_t	cb_nicenum;
+	boolean_t	cb_sid2posix;
+	zfs_userquota_prop_t cb_prop;
+	zfs_sort_column_t *cb_sortcol;
+	size_t		cb_width[USFIELD_LAST];
+} us_cbdata_t;
+
+/* Set once all rows are collected; enables used/quota sorting in us_compare */
+static boolean_t us_populated = B_FALSE;
+
+typedef struct {
+	zfs_sort_column_t *si_sortcol;
+	boolean_t	si_numname;
+} us_sort_info_t;
+
+/*
+ * Map a field name (as accepted by -o) to its us_field_types index.
+ * Returns -1 if the name is not one of us_field_names[].
+ */
+static int
+us_field_index(char *field)
+{
+	int i;
+
+	for (i = 0; i < USFIELD_LAST; i++) {
+		if (strcmp(field, us_field_names[i]) == 0)
+			return (i);
+	}
+
+	return (-1);
+}
+
+/*
+ * AVL comparator for us_node_t.  'unused' actually carries a
+ * us_sort_info_t holding the sort-column chain and the numeric-name flag.
+ * Columns are compared in chain order; the first difference decides, and a
+ * column's sc_reverse flag inverts its result.
+ */
+static int
+us_compare(const void *larg, const void *rarg, void *unused)
+{
+	const us_node_t *l = larg;
+	const us_node_t *r = rarg;
+	us_sort_info_t *si = (us_sort_info_t *)unused;
+	zfs_sort_column_t *sortcol = si->si_sortcol;
+	boolean_t numname = si->si_numname;
+	nvlist_t *lnvl = l->usn_nvl;
+	nvlist_t *rnvl = r->usn_nvl;
+	int rc = 0;
+	boolean_t lvb, rvb;
+
+	for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+		char *lvstr = "";
+		char *rvstr = "";
+		uint32_t lv32 = 0;
+		uint32_t rv32 = 0;
+		uint64_t lv64 = 0;
+		uint64_t rv64 = 0;
+		zfs_prop_t prop = sortcol->sc_prop;
+		const char *propname = NULL;
+		boolean_t reverse = sortcol->sc_reverse;
+
+		switch (prop) {
+		case ZFS_PROP_TYPE:
+			propname = "type";
+			(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+			(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+			if (rv32 != lv32)
+				rc = (rv32 < lv32) ? 1 : -1;
+			break;
+		case ZFS_PROP_NAME:
+			propname = "name";
+			if (numname) {
+compare_nums:
+				(void) nvlist_lookup_uint64(lnvl, propname,
+				    &lv64);
+				(void) nvlist_lookup_uint64(rnvl, propname,
+				    &rv64);
+				if (rv64 != lv64)
+					rc = (rv64 < lv64) ? 1 : -1;
+			} else {
+				/*
+				 * A name stored as a number (no string entry)
+				 * falls back to the numeric comparison.
+				 */
+				if ((nvlist_lookup_string(lnvl, propname,
+				    &lvstr) == ENOENT) ||
+				    (nvlist_lookup_string(rnvl, propname,
+				    &rvstr) == ENOENT)) {
+					goto compare_nums;
+				}
+				rc = strcmp(lvstr, rvstr);
+			}
+			break;
+		case ZFS_PROP_USED:
+		case ZFS_PROP_QUOTA:
+			/* Values are incomplete while rows are collected */
+			if (!us_populated)
+				break;
+			if (prop == ZFS_PROP_USED)
+				propname = "used";
+			else
+				propname = "quota";
+			(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+			(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+			if (rv64 != lv64)
+				rc = (rv64 < lv64) ? 1 : -1;
+			break;
+
+		default:
+			break;
+		}
+
+		if (rc != 0) {
+			if (rc < 0)
+				return (reverse ? 1 : -1);
+			else
+				return (reverse ? -1 : 1);
+		}
+	}
+
+	/*
+	 * If entries still seem to be the same, check if they are of the same
+	 * type (smbentity is added only if we are doing SID to POSIX ID
+	 * translation where we can have duplicate type/name combinations).
+	 */
+	if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
+	    nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
+	    lvb != rvb)
+		return (lvb < rvb ? -1 : 1);
+
+	return (0);
+}
+
+/* True if 'p' is one of the per-user space/object used/quota properties */
+static boolean_t
+zfs_prop_is_user(unsigned p)
+{
+	return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA ||
+	    p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA);
+}
+
+/* True if 'p' is one of the per-group space/object used/quota properties */
+static boolean_t
+zfs_prop_is_group(unsigned p)
+{
+	return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA ||
+	    p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA);
+}
+
+/* True if 'p' is one of the per-project space/object used/quota properties */
+static boolean_t
+zfs_prop_is_project(unsigned p)
+{
+	return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA ||
+	    p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA);
+}
+
+/* Return the (untranslated) display string for a single USTYPE_* bit */
+static inline const char *
+us_type2str(unsigned field_type)
+{
+	switch (field_type) {
+	case USTYPE_PSX_USR:
+		return ("POSIX User");
+	case USTYPE_PSX_GRP:
+		return ("POSIX Group");
+	case USTYPE_SMB_USR:
+		return ("SMB User");
+	case USTYPE_SMB_GRP:
+		return ("SMB Group");
+	case USTYPE_PROJ:
+		return ("Project");
+	default:
+		return ("Undefined");
+	}
+}
+
+/*
+ * zfs_userspace() callback: record one (domain, rid, space) tuple for the
+ * property in cb->cb_prop.  A row is keyed by type/name; the value is merged
+ * into an existing row when one is found in the AVL tree, otherwise a new row
+ * is inserted.  Column widths in cb->cb_width are grown as needed.
+ * Returns 0 on success and -1 if the entry cannot be handled.
+ */
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+	us_cbdata_t *cb = (us_cbdata_t *)arg;
+	zfs_userquota_prop_t prop = cb->cb_prop;
+	char *name = NULL;
+	char *propname;
+	char sizebuf[32];
+#ifdef HAVE_IDMAP
+	/*
+	 * Declared at function scope: 'name' may be pointed at this buffer in
+	 * the SMB branch and is still read after that branch has ended.
+	 */
+	char sid[MAXNAMELEN + 32];
+#endif
+	us_node_t *node;
+	uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
+	uu_avl_t *avl = cb->cb_avl;
+	uu_avl_index_t idx;
+	nvlist_t *props;
+	us_node_t *n;
+	zfs_sort_column_t *sortcol = cb->cb_sortcol;
+	unsigned type = 0;
+	const char *typestr;
+	size_t namelen;
+	size_t typelen;
+	size_t sizelen;
+	int typeidx, nameidx, sizeidx;
+	us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
+	boolean_t smbentity = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	node = safe_malloc(sizeof (us_node_t));
+	uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
+	node->usn_nvl = props;
+
+	if (domain != NULL && domain[0] != '\0') {
+#ifdef HAVE_IDMAP
+		/* SMB */
+		uid_t id;
+		uint64_t classes;
+		int err;
+		directory_error_t e;
+
+		smbentity = B_TRUE;
+
+		(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+
+		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+			type = USTYPE_SMB_GRP;
+			err = sid_to_id(sid, B_FALSE, &id);
+		} else {
+			type = USTYPE_SMB_USR;
+			err = sid_to_id(sid, B_TRUE, &id);
+		}
+
+		if (err == 0) {
+			rid = id;
+			if (!cb->cb_sid2posix) {
+				e = directory_name_from_sid(NULL, sid, &name,
+				    &classes);
+				if (e != NULL)
+					directory_error_free(e);
+				/* No directory name: show the raw SID */
+				if (name == NULL)
+					name = sid;
+			}
+		}
+#else
+		/* Without idmap support SMB entries cannot be resolved */
+		nvlist_free(props);
+		free(node);
+
+		return (-1);
+#endif /* HAVE_IDMAP */
+	}
+
+	if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
+		/* POSIX or -i */
+		if (zfs_prop_is_group(prop)) {
+			type = USTYPE_PSX_GRP;
+			if (!cb->cb_numname) {
+				struct group *g;
+
+				if ((g = getgrgid(rid)) != NULL)
+					name = g->gr_name;
+			}
+		} else if (zfs_prop_is_user(prop)) {
+			type = USTYPE_PSX_USR;
+			if (!cb->cb_numname) {
+				struct passwd *p;
+
+				if ((p = getpwuid(rid)) != NULL)
+					name = p->pw_name;
+			}
+		} else {
+			type = USTYPE_PROJ;
+		}
+	}
+
+	/*
+	 * Make sure that the type/name combination is unique when doing
+	 * SID to POSIX ID translation (hence changing the type from SMB to
+	 * POSIX).
+	 */
+	if (cb->cb_sid2posix &&
+	    nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
+		nomem();
+
+	/* Calculate/update width of TYPE field */
+	typestr = us_type2str(type);
+	typelen = strlen(gettext(typestr));
+	typeidx = us_field_index("type");
+	if (typeidx >= 0 && typelen > cb->cb_width[typeidx])
+		cb->cb_width[typeidx] = typelen;
+	if (nvlist_add_uint32(props, "type", type) != 0)
+		nomem();
+
+	/* Calculate/update width of NAME field */
+	if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
+		if (nvlist_add_uint64(props, "name", rid) != 0)
+			nomem();
+		namelen = snprintf(NULL, 0, "%u", rid);
+	} else {
+		if (nvlist_add_string(props, "name", name) != 0)
+			nomem();
+		namelen = strlen(name);
+	}
+	nameidx = us_field_index("name");
+	if (nameidx >= 0 && namelen > cb->cb_width[nameidx])
+		cb->cb_width[nameidx] = namelen;
+
+	/*
+	 * Check if this type/name combination is in the list and update it;
+	 * otherwise add new node to the list.
+	 */
+	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
+		uu_avl_insert(avl, node, idx);
+	} else {
+		nvlist_free(props);
+		free(node);
+		node = n;
+		props = node->usn_nvl;
+	}
+
+	/* Calculate/update width of USED/QUOTA fields */
+	if (cb->cb_nicenum) {
+		if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+		    prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+		    prop == ZFS_PROP_PROJECTUSED ||
+		    prop == ZFS_PROP_PROJECTQUOTA) {
+			zfs_nicebytes(space, sizebuf, sizeof (sizebuf));
+		} else {
+			zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+		}
+	} else {
+		(void) snprintf(sizebuf, sizeof (sizebuf), "%llu",
+		    (u_longlong_t)space);
+	}
+	sizelen = strlen(sizebuf);
+	/*
+	 * Pick the column this value belongs to and make sure its companion
+	 * column (used <-> quota) exists in the row, defaulting to 0.
+	 */
+	if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+	    prop == ZFS_PROP_PROJECTUSED) {
+		propname = "used";
+		if (!nvlist_exists(props, "quota"))
+			(void) nvlist_add_uint64(props, "quota", 0);
+	} else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+	    prop == ZFS_PROP_PROJECTQUOTA) {
+		propname = "quota";
+		if (!nvlist_exists(props, "used"))
+			(void) nvlist_add_uint64(props, "used", 0);
+	} else if (prop == ZFS_PROP_USEROBJUSED ||
+	    prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) {
+		propname = "objused";
+		if (!nvlist_exists(props, "objquota"))
+			(void) nvlist_add_uint64(props, "objquota", 0);
+	} else if (prop == ZFS_PROP_USEROBJQUOTA ||
+	    prop == ZFS_PROP_GROUPOBJQUOTA ||
+	    prop == ZFS_PROP_PROJECTOBJQUOTA) {
+		propname = "objquota";
+		if (!nvlist_exists(props, "objused"))
+			(void) nvlist_add_uint64(props, "objused", 0);
+	} else {
+		return (-1);
+	}
+	sizeidx = us_field_index(propname);
+	if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx])
+		cb->cb_width[sizeidx] = sizelen;
+
+	if (nvlist_add_uint64(props, propname, space) != 0)
+		nomem();
+
+	return (0);
+}
+
+/*
+ * Print one row.  'fields' is a USFIELD_LAST-terminated list of column
+ * indexes, 'types' a USTYPE_* mask selecting which rows are shown and 'width'
+ * the per-column widths used when not in scripted mode.
+ */
+static void
+print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, us_node_t *node)
+{
+	nvlist_t *nvl = node->usn_nvl;
+	char valstr[MAXNAMELEN];
+	boolean_t first = B_TRUE;
+	int cfield = 0;
+	int field;
+	/* Stays 0 (matches no type) if the row has no "type" entry */
+	uint32_t ustype = 0;
+
+	/* Check type */
+	(void) nvlist_lookup_uint32(nvl, "type", &ustype);
+	if (!(ustype & types))
+		return;
+
+	while ((field = fields[cfield]) != USFIELD_LAST) {
+		nvpair_t *nvp = NULL;
+		data_type_t type;
+		uint32_t val32 = 0;
+		uint64_t val64 = 0;
+		char *strval = "-";
+
+		/* Find the pair for this column; "-" is shown if absent */
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+			if (strcmp(nvpair_name(nvp),
+			    us_field_names[field]) == 0)
+				break;
+		}
+
+		type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp);
+		switch (type) {
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(nvp, &val32);
+			break;
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &val64);
+			break;
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &strval);
+			break;
+		case DATA_TYPE_UNKNOWN:
+			break;
+		default:
+			(void) fprintf(stderr, "invalid data type\n");
+		}
+
+		switch (field) {
+		case USFIELD_TYPE:
+			if (type == DATA_TYPE_UINT32)
+				strval = (char *)us_type2str(val32);
+			break;
+		case USFIELD_NAME:
+			if (type == DATA_TYPE_UINT64) {
+				(void) snprintf(valstr, sizeof (valstr),
+				    "%llu", (u_longlong_t)val64);
+				strval = valstr;
+			}
+			break;
+		case USFIELD_USED:
+		case USFIELD_QUOTA:
+			if (type == DATA_TYPE_UINT64) {
+				if (parsable) {
+					(void) snprintf(valstr,
+					    sizeof (valstr), "%llu",
+					    (u_longlong_t)val64);
+					strval = valstr;
+				} else if (field == USFIELD_QUOTA &&
+				    val64 == 0) {
+					strval = "none";
+				} else {
+					zfs_nicebytes(val64, valstr,
+					    sizeof (valstr));
+					strval = valstr;
+				}
+			}
+			break;
+		case USFIELD_OBJUSED:
+		case USFIELD_OBJQUOTA:
+			if (type == DATA_TYPE_UINT64) {
+				if (parsable) {
+					(void) snprintf(valstr,
+					    sizeof (valstr), "%llu",
+					    (u_longlong_t)val64);
+					strval = valstr;
+				} else if (field == USFIELD_OBJQUOTA &&
+				    val64 == 0) {
+					strval = "none";
+				} else {
+					zfs_nicenum(val64, valstr,
+					    sizeof (valstr));
+					strval = valstr;
+				}
+			}
+			break;
+		}
+
+		if (!first) {
+			if (scripted)
+				(void) printf("\t");
+			else
+				(void) printf(" ");
+		}
+		/* TYPE and NAME are left-justified, numeric columns right */
+		if (scripted)
+			(void) printf("%s", strval);
+		else if (field == USFIELD_TYPE || field == USFIELD_NAME)
+			(void) printf("%-*s", (int)width[field], strval);
+		else
+			(void) printf("%*s", (int)width[field], strval);
+
+		first = B_FALSE;
+		cfield++;
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Print the header (unless scripted) followed by every row of 'avl' in tree
+ * order.  If 'rmnode' is set each row's nvlist is freed after printing; the
+ * nodes themselves are left for the caller to remove and free.
+ */
+static void
+print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, boolean_t rmnode, uu_avl_t *avl)
+{
+	us_node_t *node;
+	const char *col;
+	int cfield = 0;
+	int field;
+
+	if (!scripted) {
+		boolean_t first = B_TRUE;
+
+		while ((field = fields[cfield]) != USFIELD_LAST) {
+			col = gettext(us_field_hdr[field]);
+			if (field == USFIELD_TYPE || field == USFIELD_NAME) {
+				(void) printf(first ? "%-*s" : " %-*s",
+				    (int)width[field], col);
+			} else {
+				(void) printf(first ? "%*s" : " %*s",
+				    (int)width[field], col);
+			}
+			first = B_FALSE;
+			cfield++;
+		}
+		(void) printf("\n");
+	}
+
+	for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
+		print_us_node(scripted, parsable, fields, types, width, node);
+		if (rmnode)
+			nvlist_free(node->usn_nvl);
+	}
+}
+
+/*
+ * Entry point for 'zfs userspace', 'zfs groupspace' and 'zfs projectspace';
+ * argv[0] selects which default set of types is reported.
+ */
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+	uu_avl_pool_t *avl_pool;
+	uu_avl_t *avl_tree;
+	uu_avl_walk_t *walk;
+	char *delim;
+	char deffields[] = "type,name,used,quota,objused,objquota";
+	char *ofield = NULL;
+	char *tfield = NULL;
+	int cfield = 0;
+	int fields[256];
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t prtnum = B_FALSE;
+	boolean_t parsable = B_FALSE;
+	boolean_t sid2posix = B_FALSE;
+	int ret = 0;
+	int c;
+	zfs_sort_column_t *sortcol = NULL;
+	int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+	us_cbdata_t cb;
+	us_node_t *node;
+	us_node_t *rmnode;
+	uu_list_pool_t *listpool;
+	uu_list_t *list;
+	uu_avl_index_t idx = 0;
+	uu_list_index_t idx2 = 0;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	if (strcmp(argv[0], "groupspace") == 0) {
+		/* Toggle default group types */
+		types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+	} else if (strcmp(argv[0], "projectspace") == 0) {
+		types = USTYPE_PROJ;
+		prtnum = B_TRUE;
+	}
+
+	while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+		switch (c) {
+		case 'n':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 'n'\n"));
+				usage(B_FALSE);
+			}
+			prtnum = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 'o':
+			ofield = optarg;
+			break;
+		case 's':
+		case 'S':
+			/* Third argument is "reverse": -S sorts descending */
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    c == 's' ? B_FALSE : B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid field '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 't'\n"));
+				usage(B_FALSE);
+			}
+			tfield = optarg;
+			break;
+		case 'i':
+			if (types == USTYPE_PROJ) {
+				(void) fprintf(stderr,
+				    gettext("invalid option 'i'\n"));
+				usage(B_FALSE);
+			}
+			sid2posix = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Use default output fields if not specified using -o */
+	if (ofield == NULL)
+		ofield = deffields;
+	do {
+		if ((delim = strchr(ofield, ',')) != NULL)
+			*delim = '\0';
+		/*
+		 * The -o list is user supplied and may repeat fields; refuse
+		 * to run past fields[], keeping one slot for the
+		 * USFIELD_LAST terminator stored below.
+		 */
+		if (cfield >= (int)(sizeof (fields) / sizeof (fields[0])) - 1) {
+			(void) fprintf(stderr, gettext("too many fields "
+			    "for -o option\n"));
+			return (-1);
+		}
+		if ((fields[cfield++] = us_field_index(ofield)) == -1) {
+			(void) fprintf(stderr, gettext("invalid type '%s' "
+			    "for -o option\n"), ofield);
+			return (-1);
+		}
+		if (delim != NULL)
+			ofield = delim + 1;
+	} while (delim != NULL);
+	fields[cfield] = USFIELD_LAST;
+
+	/* Override output types (-t option) */
+	if (tfield != NULL) {
+		types = 0;
+
+		do {
+			boolean_t found = B_FALSE;
+
+			if ((delim = strchr(tfield, ',')) != NULL)
+				*delim = '\0';
+			for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
+			    i++) {
+				if (strcmp(tfield, us_type_names[i]) == 0) {
+					found = B_TRUE;
+					types |= us_type_bits[i];
+					break;
+				}
+			}
+			if (!found) {
+				(void) fprintf(stderr, gettext("invalid type "
+				    "'%s' for -t option\n"), tfield);
+				return (-1);
+			}
+			if (delim != NULL)
+				tfield = delim + 1;
+		} while (delim != NULL);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+	if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) {
+		(void) fprintf(stderr, gettext("operation is only applicable "
+		    "to filesystems and their snapshots\n"));
+		zfs_close(zhp);
+		return (1);
+	}
+
+	if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
+
+	/* Always add default sorting columns */
+	(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
+	(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_numname = prtnum;
+	cb.cb_nicenum = !parsable;
+	cb.cb_avl_pool = avl_pool;
+	cb.cb_avl = avl_tree;
+	cb.cb_sid2posix = sid2posix;
+
+	/* Columns start out as wide as their headers */
+	for (i = 0; i < USFIELD_LAST; i++)
+		cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		/* Skip properties whose entity class was not requested */
+		if ((zfs_prop_is_user(p) &&
+		    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
+		    (zfs_prop_is_group(p) &&
+		    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) ||
+		    (zfs_prop_is_project(p) && types != USTYPE_PROJ))
+			continue;
+
+		cb.cb_prop = p;
+		if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) {
+			zfs_close(zhp);
+			return (ret);
+		}
+	}
+	zfs_close(zhp);
+
+	/* Sort the list */
+	if ((node = uu_avl_first(avl_tree)) == NULL)
+		return (0);
+
+	/*
+	 * used/quota are ignored by us_compare() while rows are still being
+	 * merged; now that the values are final, move every node to a
+	 * temporary list and re-insert it so those columns take effect.
+	 */
+	us_populated = B_TRUE;
+
+	listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
+	list = uu_list_create(listpool, NULL, UU_DEFAULT);
+	uu_list_node_init(node, &node->usn_listnode, listpool);
+
+	while (node != NULL) {
+		rmnode = node;
+		node = uu_avl_next(avl_tree, node);
+		uu_avl_remove(avl_tree, rmnode);
+		if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
+			uu_list_insert(list, rmnode, idx2);
+	}
+
+	for (node = uu_list_first(list); node != NULL;
+	    node = uu_list_next(list, node)) {
+		us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+
+		if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
+			uu_avl_insert(avl_tree, node, idx);
+	}
+
+	uu_list_destroy(list);
+	uu_list_pool_destroy(listpool);
+
+	/* Print and free node nvlist memory */
+	print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
+	    cb.cb_avl);
+
+	zfs_free_sort_columns(sortcol);
+
+	/* Clean up the AVL tree */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(avl_tree);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
+
+/*
+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property]
+ *      [-t type[,...]] [filesystem|volume|snapshot] ...
+ *
+ *	-H	Scripted mode; elide headers and separate columns by tabs
+ *	-p	Display values in parsable (literal) format.
+ *	-r	Recurse over all children
+ *	-d	Limit recursion by depth.
+ *	-o	Control which fields to display.
+ *	-s	Specify sort columns, ascending order.
+ *	-S	Specify sort columns, descending order.
+ *	-t	Control which object types to display.
+ *
+ * When given no arguments, list all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+	boolean_t	cb_first;
+	boolean_t	cb_literal;
+	boolean_t	cb_scripted;
+	zprop_list_t	*cb_proplist;
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ * Native properties use their predefined column name and alignment; any
+ * other column is headed by its upper-cased property name, left-justified.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	int i;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			(void) printf(" ");
+		} else {
+			first = B_FALSE;
+		}
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			/*
+			 * The name comes from the command line, so stop at
+			 * the end of headerbuf rather than trusting its
+			 * length.
+			 */
+			for (i = 0; pl->pl_user_prop[i] != '\0' &&
+			    i < (int)sizeof (headerbuf) - 1; i++) {
+				/* toupper() requires an unsigned char value */
+				headerbuf[i] = toupper(
+				    (unsigned char)pl->pl_user_prop[i]);
+			}
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		/* No padding after a left-justified last column */
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, header);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ * A property that cannot be retrieved is shown as "-".
+ */
+static void
+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	boolean_t first = B_TRUE;
+	char property[ZFS_MAXPROPLEN];
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	nvlist_t *propval;
+	char *propstr;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			if (cb->cb_scripted)
+				(void) printf("\t");
+			else
+				(void) printf(" ");
+		} else {
+			first = B_FALSE;
+		}
+
+		if (pl->pl_prop == ZFS_PROP_NAME) {
+			(void) strlcpy(property, zfs_get_name(zhp),
+			    sizeof (property));
+			propstr = property;
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (pl->pl_prop != ZPROP_INVAL) {
+			/* Native property */
+			if (zfs_prop_get(zhp, pl->pl_prop, property,
+			    sizeof (property), NULL, NULL, 0,
+			    cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
+			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
+		} else {
+			/* User property: value is stored in its own nvlist */
+			if (nvlist_lookup_nvlist(userprops,
+			    pl->pl_user_prop, &propval) != 0)
+				propstr = "-";
+			else
+				verify(nvlist_lookup_string(propval,
+				    ZPROP_VALUE, &propstr) == 0);
+			right_justify = B_FALSE;
+		}
+
+		/*
+		 * If this is being called in scripted mode, or if this is the
+		 * last column and it is left-justified, don't include a width
+		 * format specifier.
+		 */
+		if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
+			(void) printf("%s", propstr);
+		else if (right_justify)
+			(void) printf("%*s", (int)pl->pl_width, propstr);
+		else
+			(void) printf("%-*s", (int)pl->pl_width, propstr);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+	list_cbdata_t *cbp = data;
+
+	/* Emit the header once, before the first dataset */
+	if (cbp->cb_first) {
+		if (!cbp->cb_scripted)
+			print_header(cbp);
+		cbp->cb_first = B_FALSE;
+	}
+
+	print_dataset(zhp, cbp);
+
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs list': parse options, build the column list and
+ * iterate over the selected datasets with list_callback().
+ */
+static int
+zfs_do_list(int argc, char **argv)
+{
+	int c;
+	static char default_fields[] =
+	    "name,used,available,referenced,mountpoint";
+	int types = ZFS_TYPE_DATASET;
+	boolean_t types_specified = B_FALSE;
+	char *fields = NULL;
+	list_cbdata_t cb = { 0 };
+	char *value;
+	int limit = 0;
+	int ret = 0;
+	zfs_sort_column_t *sortcol = NULL;
+	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
+		switch (c) {
+		case 'o':
+			fields = optarg;
+			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			flags |= ZFS_ITER_LITERAL_PROPS;
+			break;
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			break;
+		case 'r':
+			flags |= ZFS_ITER_RECURSE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 's':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_FALSE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 'S':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid property '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			types = 0;
+			types_specified = B_TRUE;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				/* Indexes below match the case labels */
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "snap", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:
+				case 3:
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 4:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 5:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    value);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (fields == NULL)
+		fields = default_fields;
+
+	/*
+	 * If we are only going to list snapshot names and sort by name,
+	 * then we can use faster version.
+	 */
+	if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
+		flags |= ZFS_ITER_SIMPLE;
+
+	/*
+	 * If "-o space" and no types were specified, don't display snapshots.
+	 */
+	if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
+		types &= ~ZFS_TYPE_SNAPSHOT;
+
+	/*
+	 * Handle users who want to list all snapshots or bookmarks
+	 * of the current dataset (ex. 'zfs list -t snapshot <dataset>').
+	 */
+	if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) &&
+	    argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) {
+		flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE);
+		limit = 1;
+	}
+
+	/*
+	 * If the user specifies '-o all', the zprop_get_list() doesn't
+	 * normally include the name of the dataset.  For 'zfs list', we always
+	 * want this property to be first.
+	 */
+	if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+	    != 0)
+		usage(B_FALSE);
+
+	cb.cb_first = B_TRUE;
+
+	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
+	    limit, list_callback, &cb);
+
+	zprop_free_list(cb.cb_proplist);
+	zfs_free_sort_columns(sortcol);
+
+	/* cb_first still set means list_callback() never ran */
+	if (ret == 0 && cb.cb_first && !cb.cb_scripted)
+		(void) fprintf(stderr, gettext("no datasets available\n"));
+
+	return (ret);
+}
+
+/*
+ * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-f] -p <fs | vol> <fs | vol>
+ * zfs rename -r <snap> <snap>
+ *
+ * Renames the given dataset to another of the same type.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int c;
+	int ret = 0;
+	boolean_t recurse = B_FALSE;
+	boolean_t parents = B_FALSE;
+	boolean_t force_unmount = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "prf")) != -1) {
+		switch (c) {
+		case 'p':
+			parents = B_TRUE;
+			break;
+		case 'r':
+			recurse = B_TRUE;
+			break;
+		case 'f':
+			force_unmount = B_TRUE;
+			break;
+		case '?':
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing source dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing target dataset "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && parents) {
+		(void) fprintf(stderr, gettext("-p and -r options are mutually "
+		    "exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (recurse && strchr(argv[0], '@') == NULL) {
+		(void) fprintf(stderr, gettext("source dataset for recursive "
+		    "rename must be a snapshot\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	/* If we were asked and the name looks good, try to create ancestors. */
+	if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
+	    zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+		zfs_close(zhp);
+		return (1);
+	}
+
+	ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs promote <fs>
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing clone filesystem"
+		    " argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return (1);
+
+	ret = (zfs_promote(zhp) != 0);
+
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs redact <snapshot> <bookmark> <redaction_snapshot> ...
+ *
+ * Passes the snapshot, the bookmark name and the set of redaction snapshots
+ * to lzc_redact() and reports a failure on stderr.  Returns the value
+ * returned by lzc_redact().
+ */
+static int
+zfs_do_redact(int argc, char **argv)
+{
+	char *snap = NULL;
+	char *bookname = NULL;
+	char **rsnaps = NULL;
+	int numrsnaps = 0;
+	argv++;
+	argc--;
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("too few arguments\n"));
+		usage(B_FALSE);
+	}
+
+	snap = argv[0];
+	bookname = argv[1];
+	rsnaps = argv + 2;
+	numrsnaps = argc - 2;
+
+	nvlist_t *rsnapnv = fnvlist_alloc();
+
+	for (int i = 0; i < numrsnaps; i++) {
+		fnvlist_add_boolean(rsnapnv, rsnaps[i]);
+	}
+
+	int err = lzc_redact(snap, bookname, rsnapnv);
+	fnvlist_free(rsnapnv);
+
+	switch (err) {
+	case 0:
+		break;
+	case ENOENT:
+		(void) fprintf(stderr,
+		    gettext("provided snapshot %s does not exist\n"), snap);
+		break;
+	case EEXIST:
+		(void) fprintf(stderr, gettext("specified redaction bookmark "
+		    "(%s) provided already exists\n"), bookname);
+		break;
+	case ENAMETOOLONG:
+		(void) fprintf(stderr, gettext("provided bookmark name cannot "
+		    "be used, final name would be too long\n"));
+		break;
+	case E2BIG:
+		(void) fprintf(stderr, gettext("too many redaction snapshots "
+		    "specified\n"));
+		break;
+	case EINVAL:
+		if (strchr(bookname, '#') != NULL)
+			(void) fprintf(stderr, gettext(
+			    "redaction bookmark name must not contain '#'\n"));
+		else
+			(void) fprintf(stderr, gettext(
+			    "redaction snapshot must be descendent of "
+			    "snapshot being redacted\n"));
+		break;
+	case EALREADY:
+		(void) fprintf(stderr, gettext("attempted to redact redacted "
+		    "dataset or with respect to redacted dataset\n"));
+		break;
+	case ENOTSUP:
+		(void) fprintf(stderr, gettext("redaction bookmarks feature "
+		    "not enabled\n"));
+		break;
+	case EXDEV:
+		(void) fprintf(stderr, gettext("potentially invalid redaction "
+		    "snapshot; full dataset names required\n"));
+		break;
+	default:
+		/*
+		 * Describe the code being switched on; errno may have been
+		 * changed by the calls made since lzc_redact() returned.
+		 */
+		(void) fprintf(stderr, gettext("internal error: %s\n"),
+		    strerror(err));
+	}
+
+	return (err);
+}
+
+/*
+ * zfs rollback [-rRf] <snapshot>
+ *
+ *	-r	Delete any intervening snapshots before doing rollback
+ *	-R	Delete any snapshots and their clones
+ *	-f	ignored for backwards compatibility
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset.  If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+/* State shared between zfs_do_rollback() and its iteration callbacks. */
+typedef struct rollback_cbdata {
+	uint64_t	cb_create;	/* ZFS_PROP_CREATETXG of the target */
+	uint8_t		cb_younger_ds_printed;	/* younger names printed */
+	boolean_t	cb_first;	/* header message not yet printed */
+	int		cb_doclones;	/* set by '-R' */
+	char		*cb_target;	/* snapshot name from the command line */
+	int		cb_error;	/* set once a blocker was reported */
+	boolean_t	cb_recurse;	/* set by '-r' and '-R' */
+} rollback_cbdata_t;
+
+/*
+ * Callback passed to zfs_iter_dependents() by rollback_check(): prints the
+ * name of each dependent, preceded once (while cb_first and cb_recurse are
+ * set) by the "use '-R'" explanation, and records the error in cb_error.
+ * Closes the handle and always returns 0.
+ */
+static int
+rollback_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+
+	if (cbp->cb_first && cbp->cb_recurse) {
+		(void) fprintf(stderr, gettext("cannot rollback to "
+		    "'%s': clones of previous snapshots exist\n"),
+		    cbp->cb_target);
+		(void) fprintf(stderr, gettext("use '-R' to "
+		    "force deletion of the following clones and "
+		    "dependents:\n"));
+		cbp->cb_first = 0;
+		cbp->cb_error = 1;
+	}
+
+	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+
+	zfs_close(zhp);
+	return (0);
+}
+
+
+/*
+ * Iteration callback used by zfs_do_rollback() for the snapshots and
+ * bookmarks of the dataset being rolled back.  Nothing is checked when
+ * cb_doclones ('-R') is set.  Otherwise, for every entry whose creation txg
+ * is greater than cb_create:
+ *  - without '-r', the entry's name is reported (preceded once by the
+ *    "use '-r'" explanation) and cb_error is set;
+ *  - with '-r', the entry's dependents are walked with
+ *    rollback_check_dependent(), which does the reporting.
+ * Closes the handle.  Returns 0 to continue the iteration and -1 to stop it.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+	/*
+	 * Max number of younger snapshots and/or bookmarks to display before
+	 * we stop the iteration.
+	 */
+	const uint8_t max_younger = 32;
+
+	if (cbp->cb_doclones) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
+		if (cbp->cb_first && !cbp->cb_recurse) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "rollback to '%s': more recent snapshots "
+			    "or bookmarks exist\n"),
+			    cbp->cb_target);
+			(void) fprintf(stderr, gettext("use '-r' to "
+			    "force deletion of the following "
+			    "snapshots and bookmarks:\n"));
+			cbp->cb_first = 0;
+			cbp->cb_error = 1;
+		}
+
+		if (cbp->cb_recurse) {
+			if (zfs_iter_dependents(zhp, B_TRUE,
+			    rollback_check_dependent, cbp) != 0) {
+				zfs_close(zhp);
+				return (-1);
+			}
+		} else {
+			(void) fprintf(stderr, "%s\n",
+			    zfs_get_name(zhp));
+			cbp->cb_younger_ds_printed++;
+		}
+	}
+	zfs_close(zhp);
+
+	if (cbp->cb_younger_ds_printed == max_younger) {
+		/*
+		 * This non-recursive rollback is going to fail due to the
+		 * presence of snapshots and/or bookmarks that are younger than
+		 * the rollback target.
+		 * We printed some of the offending objects, now we stop
+		 * zfs_iter_snapshot/bookmark iteration so we can fail fast and
+		 * avoid iterating over the rest of the younger objects
+		 */
+		(void) fprintf(stderr, gettext("Output limited to %d "
+		    "snapshots/bookmarks\n"), max_younger);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs rollback'.  Opens the snapshot and its parent
+ * dataset, runs rollback_check() over the parent's snapshots and bookmarks,
+ * and calls zfs_rollback() only when no check failed.
+ * Returns 0 on success and 1 on any failure.
+ */
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+	int ret = 0;
+	int c;
+	boolean_t force = B_FALSE;
+	rollback_cbdata_t cb = { 0 };
+	zfs_handle_t *zhp, *snap;
+	char parentname[ZFS_MAX_DATASET_NAME_LEN];
+	char *delim;
+	uint64_t min_txg = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "rRf")) != -1) {
+		switch (c) {
+		case 'r':
+			cb.cb_recurse = 1;
+			break;
+		case 'R':
+			cb.cb_recurse = 1;
+			cb.cb_doclones = 1;
+			break;
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* open the snapshot */
+	if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
+		return (1);
+
+	/* open the parent dataset */
+	(void) strlcpy(parentname, argv[0], sizeof (parentname));
+	/* cut the copied name at its last '@' to get the parent's name */
+	verify((delim = strrchr(parentname, '@')) != NULL);
+	*delim = '\0';
+	if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
+		zfs_close(snap);
+		return (1);
+	}
+
+	/*
+	 * Check for more recent snapshots and/or clones based on the presence
+	 * of '-r' and '-R'.
+	 */
+	cb.cb_target = argv[0];
+	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+	cb.cb_first = B_TRUE;
+	cb.cb_error = 0;
+
+	/* the target's creation txg is passed to the snapshot iteration */
+	if (cb.cb_create > 0)
+		min_txg = cb.cb_create;
+
+	if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb,
+	    min_txg, 0)) != 0)
+		goto out;
+	if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
+		goto out;
+
+	if ((ret = cb.cb_error) != 0)
+		goto out;
+
+	/*
+	 * Rollback parent to the given snapshot.
+	 */
+	ret = zfs_rollback(zhp, snap, force);
+
+out:
+	zfs_close(snap);
+	zfs_close(zhp);
+
+	if (ret == 0)
+		return (0);
+	else
+		return (1);
+}
+
+/*
+ * zfs set property=value ... { fs | snap | vol } ...
+ *
+ * Sets the given properties for all datasets specified on the command line.
+ */
+
+/*
+ * zfs_for_each() callback for 'zfs set': applies the property nvlist passed
+ * in 'data' to the dataset.  Returns 0 on success and 1 on failure, adding a
+ * hint when the failure was a remount or reshare failure.
+ */
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+	nvlist_t *props = data;
+
+	if (zfs_prop_set_list(zhp, props) != 0) {
+		switch (libzfs_errno(g_zfs)) {
+		case EZFS_MOUNTFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to remount filesystem\n"));
+			break;
+		case EZFS_SHARENFSFAILED:
+			(void) fprintf(stderr, gettext("property may be set "
+			    "but unable to reshare filesystem\n"));
+			break;
+		}
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Entry point for 'zfs set'.  Arguments containing '=' are property
+ * settings and must all precede the dataset names.  Returns the result of
+ * zfs_for_each(), or -1 when a property argument fails to parse.
+ */
+static int
+zfs_do_set(int argc, char **argv)
+{
+	nvlist_t *props = NULL;
+	int ds_start = -1; /* argv idx of first dataset arg */
+	int ret = 0;
+	int i;
+
+	/* check for options */
+	if (argc > 1 && argv[1][0] == '-') {
+		(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+		    argv[1][1]);
+		usage(B_FALSE);
+	}
+
+	/* check number of arguments */
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing arguments\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 3) {
+		if (strchr(argv[1], '=') == NULL) {
+			(void) fprintf(stderr, gettext("missing property=value "
+			    "argument(s)\n"));
+		} else {
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "name(s)\n"));
+		}
+		usage(B_FALSE);
+	}
+
+	/* validate argument order:  prop=val args followed by dataset args */
+	for (i = 1; i < argc; i++) {
+		if (strchr(argv[i], '=') != NULL) {
+			if (ds_start > 0) {
+				/* out-of-order prop=val argument */
+				(void) fprintf(stderr, gettext("invalid "
+				    "argument order\n"));
+				usage(B_FALSE);
+			}
+		} else if (ds_start < 0) {
+			ds_start = i;
+		}
+	}
+	if (ds_start < 0) {
+		(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
+		usage(B_FALSE);
+	}
+
+	/* Populate a list of property settings */
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	for (i = 1; i < ds_start; i++) {
+		if (!parseprop(props, argv[i])) {
+			ret = -1;
+			goto error;
+		}
+	}
+
+	ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
+
+error:
+	nvlist_free(props);
+	return (ret);
+}
+
+/* State passed to zfs_snapshot_cb(). */
+typedef struct snap_cbdata {
+	nvlist_t *sd_nvl;		/* full "<dataset>@<snap>" names */
+	boolean_t sd_recursive;		/* set by '-r' */
+	const char *sd_snapname;	/* part of the argument after '@' */
+} snap_cbdata_t;
+
+/*
+ * Adds "<name of zhp>@<sd_snapname>" to sd_nvl and, when sd_recursive is
+ * set, recurses into the child filesystems.  In recursive mode a dataset
+ * whose ZFS_PROP_INCONSISTENT is non-zero is skipped together with its
+ * children.  Closes the handle; returns 0 or the result of the recursion.
+ */
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+	snap_cbdata_t *sd = arg;
+	char *name;
+	int rv = 0;
+	int error;
+
+	if (sd->sd_recursive &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
+	if (error == -1)
+		nomem();
+	fnvlist_add_boolean(sd->sd_nvl, name);
+	free(name);
+
+	if (sd->sd_recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
+	zfs_close(zhp);
+	return (rv);
+}
+
+/*
+ * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
+ *
+ * Creates a snapshot with the given name.  While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ *
+ * All names are collected first and then created with a single call to
+ * zfs_snapshot_nvl().
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+	int ret = 0;
+	int c;
+	nvlist_t *props;
+	snap_cbdata_t sd = { 0 };
+	boolean_t multiple_snaps = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, "ro:")) != -1) {
+		switch (c) {
+		case 'o':
+			if (!parseprop(props, optarg)) {
+				nvlist_free(sd.sd_nvl);
+				nvlist_free(props);
+				return (1);
+			}
+			break;
+		case 'r':
+			sd.sd_recursive = B_TRUE;
+			multiple_snaps = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		goto usage;
+	}
+
+	if (argc > 1)
+		multiple_snaps = B_TRUE;
+	for (; argc > 0; argc--, argv++) {
+		char *atp;
+		zfs_handle_t *zhp;
+
+		atp = strchr(argv[0], '@');
+		if (atp == NULL)
+			goto usage;
+		/* split the argument in place into dataset and snapshot name */
+		*atp = '\0';
+		sd.sd_snapname = atp + 1;
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL)
+			goto usage;
+		if (zfs_snapshot_cb(zhp, &sd) != 0)
+			goto usage;
+	}
+
+	ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	if (ret != 0 && multiple_snaps)
+		(void) fprintf(stderr, gettext("no snapshots were created\n"));
+	return (ret != 0);
+
+usage:
+	nvlist_free(sd.sd_nvl);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (-1);
+}
+
+
+/*
+ * Send a backup stream to stdout.
+ *
+ * Depending on the flags this dispatches to zfs_send_saved() ('-S'),
+ * zfs_send_resume() ('-t'), zfs_send_one() (neither '-R' nor '-I'), or
+ * zfs_send().
+ */
+static int
+zfs_do_send(int argc, char **argv)
+{
+	char *fromname = NULL;
+	char *toname = NULL;
+	char *resume_token = NULL;
+	char *cp;
+	zfs_handle_t *zhp;
+	sendflags_t flags = { 0 };
+	int c, err;
+	nvlist_t *dbgnv = NULL;
+	char *redactbook = NULL;
+
+	struct option long_options[] = {
+		{"replicate",	no_argument,		NULL, 'R'},
+		{"redact",	required_argument,	NULL, 'd'},
+		{"props",	no_argument,		NULL, 'p'},
+		{"parsable",	no_argument,		NULL, 'P'},
+		{"dedup",	no_argument,		NULL, 'D'},
+		{"verbose",	no_argument,		NULL, 'v'},
+		{"dryrun",	no_argument,		NULL, 'n'},
+		{"large-block",	no_argument,		NULL, 'L'},
+		{"embed",	no_argument,		NULL, 'e'},
+		{"resume",	required_argument,	NULL, 't'},
+		{"compressed",	no_argument,		NULL, 'c'},
+		{"raw",		no_argument,		NULL, 'w'},
+		{"backup",	no_argument,		NULL, 'b'},
+		{"holds",	no_argument,		NULL, 'h'},
+		{"saved",	no_argument,		NULL, 'S'},
+		{0, 0, 0, 0}
+	};
+
+	/* check options */
+	while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S",
+	    long_options, NULL)) != -1) {
+		switch (c) {
+		case 'i':
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			break;
+		case 'I':
+			if (fromname)
+				usage(B_FALSE);
+			fromname = optarg;
+			flags.doall = B_TRUE;
+			break;
+		case 'R':
+			flags.replicate = B_TRUE;
+			break;
+		case 'd':
+			redactbook = optarg;
+			break;
+		case 'p':
+			flags.props = B_TRUE;
+			break;
+		case 'b':
+			flags.backup = B_TRUE;
+			break;
+		case 'h':
+			flags.holds = B_TRUE;
+			break;
+		case 'P':
+			flags.parsable = B_TRUE;
+			break;
+		case 'v':
+			flags.verbosity++;
+			flags.progress = B_TRUE;
+			break;
+		case 'D':
+			(void) fprintf(stderr,
+			    gettext("WARNING: deduplicated send is no "
+			    "longer supported.  A regular,\n"
+			    "non-deduplicated stream will be generated.\n\n"));
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
+		case 'e':
+			flags.embed_data = B_TRUE;
+			break;
+		case 't':
+			resume_token = optarg;
+			break;
+		case 'c':
+			flags.compress = B_TRUE;
+			break;
+		case 'w':
+			/* a raw send also turns on the flags below */
+			flags.raw = B_TRUE;
+			flags.compress = B_TRUE;
+			flags.embed_data = B_TRUE;
+			flags.largeblock = B_TRUE;
+			break;
+		case 'S':
+			flags.saved = B_TRUE;
+			break;
+		case ':':
+			/*
+			 * If a parameter was not passed, optopt contains the
+			 * value that would normally lead us into the
+			 * appropriate case statement.  If it's greater than
+			 * UINT8_MAX, then this must be a longopt and we
+			 * should look at argv to get the string.  Otherwise
+			 * it's just the character, so we should use it
+			 * directly.
+			 */
+			if (optopt <= UINT8_MAX) {
+				(void) fprintf(stderr,
+				    gettext("missing argument for '%c' "
+				    "option\n"), optopt);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("missing argument for '%s' "
+				    "option\n"), argv[optind - 1]);
+			}
+			usage(B_FALSE);
+			break;
+		case '?':
+			/*FALLTHROUGH*/
+		default:
+			/*
+			 * If an invalid flag was passed, optopt contains the
+			 * character if it was a short flag, or 0 if it was a
+			 * longopt.
+			 */
+			if (optopt != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%c'\n"), optopt);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("invalid option '%s'\n"),
+				    argv[optind - 1]);
+
+			}
+			usage(B_FALSE);
+		}
+	}
+
+	/* '-P' without '-v' still needs a verbosity of at least 1 */
+	if (flags.parsable && flags.verbosity == 0)
+		flags.verbosity = 1;
+
+	argc -= optind;
+	argv += optind;
+
+	if (resume_token != NULL) {
+		if (fromname != NULL || flags.replicate || flags.props ||
+		    flags.backup || flags.holds ||
+		    flags.saved || redactbook != NULL) {
+			(void) fprintf(stderr,
+			    gettext("invalid flags combined with -t\n"));
+			usage(B_FALSE);
+		}
+		if (argc > 0) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	} else {
+		if (argc < 1) {
+			(void) fprintf(stderr,
+			    gettext("missing snapshot argument\n"));
+			usage(B_FALSE);
+		}
+		if (argc > 1) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	if (flags.saved) {
+		if (fromname != NULL || flags.replicate || flags.props ||
+		    flags.doall || flags.backup ||
+		    flags.holds || flags.largeblock || flags.embed_data ||
+		    flags.compress || flags.raw || redactbook != NULL) {
+			(void) fprintf(stderr, gettext("incompatible flags "
+			    "combined with saved send flag\n"));
+			usage(B_FALSE);
+		}
+		if (strchr(argv[0], '@') != NULL) {
+			(void) fprintf(stderr, gettext("saved send must "
+			    "specify the dataset with partially-received "
+			    "state\n"));
+			usage(B_FALSE);
+		}
+	}
+
+	if (flags.raw && redactbook != NULL) {
+		(void) fprintf(stderr,
+		    gettext("Error: raw sends may not be redacted.\n"));
+		return (1);
+	}
+
+	if (!flags.dryrun && isatty(STDOUT_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Stream can not be written to a terminal.\n"
+		    "You must redirect standard output.\n"));
+		return (1);
+	}
+
+	if (flags.saved) {
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+
+		err = zfs_send_saved(zhp, &flags, STDOUT_FILENO,
+		    resume_token);
+		zfs_close(zhp);
+		return (err != 0);
+	} else if (resume_token != NULL) {
+		return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+		    resume_token));
+	}
+
+	/*
+	 * For everything except -R and -I, use the new, cleaner code path.
+	 */
+	if (!(flags.replicate || flags.doall)) {
+		char frombuf[ZFS_MAX_DATASET_NAME_LEN];
+
+		if (fromname != NULL && (strchr(fromname, '#') == NULL &&
+		    strchr(fromname, '@') == NULL)) {
+			/*
+			 * Neither bookmark nor snapshot was specified.  Print a
+			 * warning, and assume snapshot.
+			 */
+			(void) fprintf(stderr, "Warning: incremental source "
+			    "didn't specify type, assuming snapshot. Use '@' "
+			    "or '#' prefix to avoid ambiguity.\n");
+			(void) snprintf(frombuf, sizeof (frombuf), "@%s",
+			    fromname);
+			fromname = frombuf;
+		}
+		if (fromname != NULL &&
+		    (fromname[0] == '#' || fromname[0] == '@')) {
+			/*
+			 * Incremental source name begins with # or @.
+			 * Default to same fs as target.
+			 */
+			char tmpbuf[ZFS_MAX_DATASET_NAME_LEN];
+			/* fromname may point at frombuf, so save it first */
+			(void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf));
+			(void) strlcpy(frombuf, argv[0], sizeof (frombuf));
+			cp = strchr(frombuf, '@');
+			if (cp != NULL)
+				*cp = '\0';
+			(void) strlcat(frombuf, tmpbuf, sizeof (frombuf));
+			fromname = frombuf;
+		}
+
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags,
+		    redactbook);
+		zfs_close(zhp);
+		return (err != 0);
+	}
+
+	if (fromname != NULL && strchr(fromname, '#')) {
+		(void) fprintf(stderr,
+		    gettext("Error: multiple snapshots cannot be "
+		    "sent from a bookmark.\n"));
+		return (1);
+	}
+
+	if (redactbook != NULL) {
+		(void) fprintf(stderr, gettext("Error: multiple snapshots "
+		    "cannot be sent redacted.\n"));
+		return (1);
+	}
+
+	if ((cp = strchr(argv[0], '@')) == NULL) {
+		(void) fprintf(stderr, gettext("Error: "
+		    "Unsupported flag with filesystem or bookmark.\n"));
+		return (1);
+	}
+	/* split the argument in place into dataset and snapshot name */
+	*cp = '\0';
+	toname = cp + 1;
+	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return (1);
+
+	/*
+	 * If they specified the full path to the snapshot, chop off
+	 * everything except the short name of the snapshot, but special
+	 * case if they specify the origin.
+	 */
+	if (fromname && (cp = strchr(fromname, '@')) != NULL) {
+		char origin[ZFS_MAX_DATASET_NAME_LEN];
+		zprop_source_t src;
+
+		(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+		    origin, sizeof (origin), &src, NULL, 0, B_FALSE);
+
+		if (strcmp(origin, fromname) == 0) {
+			fromname = NULL;
+			flags.fromorigin = B_TRUE;
+		} else {
+			*cp = '\0';
+			if (cp != fromname && strcmp(argv[0], fromname)) {
+				(void) fprintf(stderr,
+				    gettext("incremental source must be "
+				    "in same filesystem\n"));
+				usage(B_FALSE);
+			}
+			fromname = cp + 1;
+			if (strchr(fromname, '@') || strchr(fromname, '/')) {
+				(void) fprintf(stderr,
+				    gettext("invalid incremental source\n"));
+				usage(B_FALSE);
+			}
+		}
+	}
+
+	if (flags.replicate && fromname == NULL)
+		flags.doall = B_TRUE;
+
+	err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
+	    flags.verbosity >= 3 ? &dbgnv : NULL);
+
+	if (flags.verbosity >= 3 && dbgnv != NULL) {
+		/*
+		 * dump_nvlist prints to stdout, but that's been
+		 * redirected to a file.  Make it print to stderr
+		 * instead.
+		 */
+		(void) dup2(STDERR_FILENO, STDOUT_FILENO);
+		dump_nvlist(dbgnv, 0);
+		nvlist_free(dbgnv);
+	}
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+/*
+ * Restore a backup stream from stdin.
+ */
+static int
+zfs_do_receive(int argc, char **argv)
+{
+	int c, err = 0;
+	recvflags_t flags = { 0 };
+	boolean_t abort_resumable = B_FALSE;
+	nvlist_t *props;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) {
+		switch (c) {
+		case 'o':
+			if (!parseprop(props, optarg)) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+		case 'x':
+			if (!parsepropname(props, optarg)) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+		case 'd':
+			if (flags.istail) {
+				(void) fprintf(stderr, gettext("invalid option "
+				    "combination: -d and -e are mutually "
+				    "exclusive\n"));
+				usage(B_FALSE);
+			}
+			flags.isprefix = B_TRUE;
+			break;
+		case 'e':
+			if (flags.isprefix) {
+				(void) fprintf(stderr, gettext("invalid option "
+				    "combination: -d and -e are mutually "
+				    "exclusive\n"));
+				usage(B_FALSE);
+			}
+			flags.istail = B_TRUE;
+			break;
+		case 'h':
+			flags.skipholds = B_TRUE;
+			break;
+		case 'M':
+			flags.forceunmount = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'u':
+			flags.nomount = B_TRUE;
+			break;
+		case 'v':
+			flags.verbose = B_TRUE;
+			break;
+		case 's':
+			flags.resumable = B_TRUE;
+			break;
+		case 'F':
+			flags.force = B_TRUE;
+			break;
+		case 'A':
+			abort_resumable = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */
+	if (flags.istail)
+		flags.isprefix = B_TRUE;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/*
+	 * '-A': discard saved resumable receive state instead of receiving.
+	 * Destroys the "<dataset>/%recv" child when it exists; otherwise
+	 * destroys the named dataset itself, provided it is inconsistent and
+	 * has a receive resume token.
+	 */
+	if (abort_resumable) {
+		if (flags.isprefix || flags.istail || flags.dryrun ||
+		    flags.resumable || flags.nomount) {
+			(void) fprintf(stderr, gettext("invalid option\n"));
+			usage(B_FALSE);
+		}
+
+		char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+		(void) snprintf(namebuf, sizeof (namebuf),
+		    "%s/%%recv", argv[0]);
+
+		if (zfs_dataset_exists(g_zfs, namebuf,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
+			zfs_handle_t *zhp = zfs_open(g_zfs,
+			    namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+			if (zhp == NULL) {
+				nvlist_free(props);
+				return (1);
+			}
+			err = zfs_destroy(zhp, B_FALSE);
+			zfs_close(zhp);
+		} else {
+			zfs_handle_t *zhp = zfs_open(g_zfs,
+			    argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+			if (zhp == NULL)
+				usage(B_FALSE);
+			if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
+			    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+			    NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
+				(void) fprintf(stderr,
+				    gettext("'%s' does not have any "
+				    "resumable receive state to abort\n"),
+				    argv[0]);
+				nvlist_free(props);
+				zfs_close(zhp);
+				return (1);
+			}
+			err = zfs_destroy(zhp, B_FALSE);
+			zfs_close(zhp);
+		}
+		nvlist_free(props);
+		return (err != 0);
+	}
+
+	if (isatty(STDIN_FILENO)) {
+		(void) fprintf(stderr,
+		    gettext("Error: Backup stream can not be read "
+		    "from a terminal.\n"
+		    "You must redirect standard input.\n"));
+		nvlist_free(props);
+		return (1);
+	}
+	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
+	nvlist_free(props);
+
+	return (err != 0);
+}
+
+/*
+ * allow/unallow stuff
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define	ZFS_DELEG_PERM_CREATE		"create"
+#define	ZFS_DELEG_PERM_DESTROY		"destroy"
+#define	ZFS_DELEG_PERM_SNAPSHOT		"snapshot"
+#define	ZFS_DELEG_PERM_ROLLBACK		"rollback"
+#define	ZFS_DELEG_PERM_CLONE		"clone"
+#define	ZFS_DELEG_PERM_PROMOTE		"promote"
+#define	ZFS_DELEG_PERM_RENAME		"rename"
+#define	ZFS_DELEG_PERM_MOUNT		"mount"
+#define	ZFS_DELEG_PERM_SHARE		"share"
+#define	ZFS_DELEG_PERM_SEND		"send"
+#define	ZFS_DELEG_PERM_RECEIVE		"receive"
+#define	ZFS_DELEG_PERM_ALLOW		"allow"
+#define	ZFS_DELEG_PERM_USERPROP		"userprop"
+#define	ZFS_DELEG_PERM_VSCAN		"vscan" /* ??? */
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED		"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
+#define	ZFS_DELEG_PERM_USEROBJQUOTA	"userobjquota"
+#define	ZFS_DELEG_PERM_GROUPOBJQUOTA	"groupobjquota"
+#define	ZFS_DELEG_PERM_USEROBJUSED	"userobjused"
+#define	ZFS_DELEG_PERM_GROUPOBJUSED	"groupobjused"
+
+#define	ZFS_DELEG_PERM_HOLD		"hold"
+#define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
+#define	ZFS_DELEG_PERM_LOAD_KEY		"load-key"
+#define	ZFS_DELEG_PERM_CHANGE_KEY	"change-key"
+
+#define	ZFS_DELEG_PERM_PROJECTUSED	"projectused"
+#define	ZFS_DELEG_PERM_PROJECTQUOTA	"projectquota"
+#define	ZFS_DELEG_PERM_PROJECTOBJUSED	"projectobjused"
+#define	ZFS_DELEG_PERM_PROJECTOBJQUOTA	"projectobjquota"
+
+#define	ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+/* Maps each permission name to its note id; terminated by a NULL name. */
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+	{ ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY },
+	{ ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY },
+
+	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+	{ ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA },
+	{ ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED },
+	{ ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA },
+	{ ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED },
+	{ ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED },
+	{ ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA },
+	{ ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED },
+	{ ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA },
+	{ NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/* permission structure */
+typedef struct deleg_perm {
+	zfs_deleg_who_type_t	dp_who_type;
+	const char		*dp_name;
+	boolean_t		dp_local;	/* set for ZFS_DELEG_LOCAL */
+	boolean_t		dp_descend;	/* set for ZFS_DELEG_DESCENDENT */
+} deleg_perm_t;
+
+/* AVL node holding one deleg_perm_t */
+typedef struct deleg_perm_node {
+	deleg_perm_t		dpn_perm;
+
+	uu_avl_node_t		dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/* permissions set */
+typedef struct who_perm {
+	zfs_deleg_who_type_t	who_type;
+	const char		*who_name;		/* id */
+	char			who_ug_name[256];	/* user/group name */
+	fs_perm_t		*who_fsperm;		/* uplink */
+
+	uu_avl_t		*who_deleg_perm_avl;	/* permissions */
+} who_perm_t;
+
+/* AVL node holding one who_perm_t */
+typedef struct who_perm_node {
+	who_perm_t	who_perm;
+	uu_avl_node_t	who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/* fs permissions */
+struct fs_perm {
+	const char		*fsp_name;
+
+	uu_avl_t		*fsp_sc_avl;	/* sets,create */
+	uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */
+
+	fs_perm_set_t		*fsp_set;	/* uplink */
+};
+
+/* list node holding one fs_perm_t */
+typedef struct fs_perm_node {
+	fs_perm_t	fspn_fsperm;
+	uu_avl_t	*fspn_avl;
+
+	uu_list_node_t	fspn_list_node;
+} fs_perm_node_t;
+
+/* top level structure */
+struct fs_perm_set {
+	uu_list_pool_t	*fsps_list_pool;
+	uu_list_t	*fsps_list; /* list of fs_perms */
+
+	uu_avl_pool_t	*fsps_named_set_avl_pool;
+	uu_avl_pool_t	*fsps_who_perm_avl_pool;
+	uu_avl_pool_t	*fsps_deleg_perm_avl_pool;
+};
+
+/*
+ * Returns the translated label "other" for the quota/used/userprop notes
+ * listed below and "subcommand" for every other note.
+ */
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+	case ZFS_DELEG_NOTE_GROUPUSED:
+	case ZFS_DELEG_NOTE_USERPROP:
+	case ZFS_DELEG_NOTE_USERQUOTA:
+	case ZFS_DELEG_NOTE_USERUSED:
+	case ZFS_DELEG_NOTE_USEROBJQUOTA:
+	case ZFS_DELEG_NOTE_USEROBJUSED:
+	case ZFS_DELEG_NOTE_GROUPOBJQUOTA:
+	case ZFS_DELEG_NOTE_GROUPOBJUSED:
+	case ZFS_DELEG_NOTE_PROJECTUSED:
+	case ZFS_DELEG_NOTE_PROJECTQUOTA:
+	case ZFS_DELEG_NOTE_PROJECTOBJUSED:
+	case ZFS_DELEG_NOTE_PROJECTOBJQUOTA:
+		/* other */
+		return (gettext("other"));
+	default:
+		return (gettext("subcommand"));
+	}
+}
+
+/*
+ * Maps a who-type to its sort weight: named sets 0, create 1, user 2,
+ * group 3, everyone 4; -1 for any other value.
+ */
+static int
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+	int res;
+	switch (who_type) {
+		case ZFS_DELEG_NAMED_SET_SETS:
+		case ZFS_DELEG_NAMED_SET:
+			res = 0;
+			break;
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_CREATE:
+			res = 1;
+			break;
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_USER:
+			res = 2;
+			break;
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_GROUP:
+			res = 3;
+			break;
+		case ZFS_DELEG_EVERYONE_SETS:
+		case ZFS_DELEG_EVERYONE:
+			res = 4;
+			break;
+		default:
+			res = -1;
+	}
+
+	return (res);
+}
+
+/*
+ * AVL comparator for who_perm_node_t: orders by who_type2weight() first and
+ * by who_name second.  Returns -1, 0 or 1.
+ */
+/* ARGSUSED */
+static int
+who_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const who_perm_node_t *l = larg;
+	const who_perm_node_t *r = rarg;
+	zfs_deleg_who_type_t ltype = l->who_perm.who_type;
+	zfs_deleg_who_type_t rtype = r->who_perm.who_type;
+	int lweight = who_type2weight(ltype);
+	int rweight = who_type2weight(rtype);
+	int res = lweight - rweight;
+	if (res == 0)
+		res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
+		    ZFS_MAX_DELEG_NAME-1);
+
+	if (res == 0)
+		return (0);
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
+}
+
+/*
+ * AVL comparator for deleg_perm_node_t: orders by dp_name.
+ * Returns -1, 0 or 1.
+ */
+/* ARGSUSED */
+static int
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+	const deleg_perm_node_t *l = larg;
+	const deleg_perm_node_t *r = rarg;
+	int res =  strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
+	    ZFS_MAX_DELEG_NAME-1);
+
+	if (res == 0)
+		return (0);
+
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
+}
+
+/*
+ * Zeroes the set and creates its list pool, its list, and the three AVL
+ * pools; calls nomem() when any creation fails.
+ */
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
+{
+	bzero(fspset, sizeof (fs_perm_set_t));
+
+	if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+	    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+	    NULL, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+	    "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+	    "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+	    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
+	    deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
+	    == NULL)
+		nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+/*
+ * Finalizes, unlinks and frees every fs_perm node on the list, then destroys
+ * the three AVL pools.
+ */
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+	while (node != NULL) {
+		/* fetch the successor before the node is removed and freed */
+		fs_perm_node_t *next_node =
+		    uu_list_next(fspset->fsps_list, node);
+		fs_perm_t *fsperm = &node->fspn_fsperm;
+		fs_perm_fini(fsperm);
+		uu_list_remove(fspset->fsps_list, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+/* Sets the who-type and name; dp_local and dp_descend are left untouched. */
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+    const char *name)
+{
+	deleg_perm->dp_who_type = type;
+	deleg_perm->dp_name = name;
+}
+
+/*
+ * Zeroes who_perm, creates its permission AVL tree from the owning set's
+ * deleg_perm pool, and records type, name and the fs_perm uplink.
+ */
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+    zfs_deleg_who_type_t type, const char *name)
+{
+	uu_avl_pool_t	*pool;
+	pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+	bzero(who_perm, sizeof (who_perm_t));
+
+	if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	who_perm->who_type = type;
+	who_perm->who_name = name;
+	who_perm->who_fsperm = fsperm;
+}
+
+/* Removes and frees every permission node, then destroys the AVL tree. */
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+	deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
+
+	while (node != NULL) {
+		deleg_perm_node_t *next_node =
+		    uu_avl_next(who_perm->who_deleg_perm_avl, node);
+
+		uu_avl_remove(who_perm->who_deleg_perm_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(who_perm->who_deleg_perm_avl);
+}
+
+/*
+ * Zeroes fsperm, creates its two AVL trees from the set's named-set and
+ * who-perm pools, and records the set uplink and the filesystem name.
+ */
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+	uu_avl_pool_t	*nset_pool = fspset->fsps_named_set_avl_pool;
+	uu_avl_pool_t	*who_pool = fspset->fsps_who_perm_avl_pool;
+
+	bzero(fsperm, sizeof (fs_perm_t));
+
+	if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	fsperm->fsp_set = fspset;
+	fsperm->fsp_name = fsname;
+}
+
+/*
+ * Finalizes, removes and frees every node of both AVL trees, then destroys
+ * the trees.
+ */
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+	who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_sc_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	node = uu_avl_first(fsperm->fsp_uge_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_uge_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(fsperm->fsp_sc_avl);
+	uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+/*
+ * Initializes 'node' with (who_type, name) and inserts it into 'avl' unless
+ * an equal entry already exists, in which case the existing entry is used.
+ * The locality flag is then recorded on whichever entry is in the tree.
+ */
+static void
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+    zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+	uu_avl_index_t idx = 0;
+
+	deleg_perm_node_t *found_node = NULL;
+	deleg_perm_t	*deleg_perm = &node->dpn_perm;
+
+	deleg_perm_init(deleg_perm, who_type, name);
+
+	if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+	    == NULL)
+		uu_avl_insert(avl, node, idx);
+	else {
+		/* from here on update the entry that is already in the tree */
+		node = found_node;
+		deleg_perm = &node->dpn_perm;
+	}
+
+
+	switch (locality) {
+	case ZFS_DELEG_LOCAL:
+		deleg_perm->dp_local = B_TRUE;
+		break;
+	case ZFS_DELEG_DESCENDENT:
+		deleg_perm->dp_descend = B_TRUE;
+		break;
+	case ZFS_DELEG_NA:
+		break;
+	default:
+		assert(B_FALSE); /* invalid locality */
+	}
+}
+
+/*
+ * Records every pair of 'nvl' (each must be DATA_TYPE_BOOLEAN) as a
+ * permission of who_perm with the given locality.  Always returns 0.
+ */
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+	uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+	zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *name = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+		deleg_perm_node_t *node =
+		    safe_malloc(sizeof (deleg_perm_node_t));
+
+		VERIFY(type == DATA_TYPE_BOOLEAN);
+
+		/*
+		 * NOTE(review): when set_deleg_perm_node() finds an equal
+		 * entry already in the tree, this freshly allocated node is
+		 * neither inserted nor freed by the code visible here.
+		 */
+		uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+		set_deleg_perm_node(avl, node, who_type, name, locality);
+	}
+
+	return (0);
+}
+
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = fsperm->fsp_set;
+
+	while 
((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *name = nvpair_name(nvp); + uu_avl_t *avl = NULL; + uu_avl_pool_t *avl_pool = NULL; + zfs_deleg_who_type_t perm_type = name[0]; + char perm_locality = name[1]; + const char *perm_name = name + 3; + who_perm_t *who_perm = NULL; + + assert('$' == name[2]); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + switch (perm_type) { + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + avl_pool = fspset->fsps_named_set_avl_pool; + avl = fsperm->fsp_sc_avl; + break; + case ZFS_DELEG_USER: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + avl_pool = fspset->fsps_who_perm_avl_pool; + avl = fsperm->fsp_uge_avl; + break; + + default: + assert(!"unhandled zfs_deleg_who_type_t"); + } + + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof (who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; + + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; + + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; + + default: + break; + } + + if (nice_name != NULL) { + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } else { + /* User or group unknown */ + (void) snprintf( + node->who_perm.who_ug_name, + sizeof (node->who_perm.who_ug_name), + "(unknown: 
%d)", rid); + } + } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; + } + + assert(who_perm != NULL); + (void) parse_who_perm(who_perm, nvl2, perm_locality); + } + + return (0); +} + +static inline int +parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + uu_avl_index_t idx = 0; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *fsname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + fs_perm_t *fsperm = NULL; + fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); + if (node == NULL) + nomem(); + + fsperm = &node->fspn_fsperm; + + VERIFY(DATA_TYPE_NVLIST == type); + + uu_list_node_init(node, &node->fspn_list_node, + fspset->fsps_list_pool); + + idx = uu_list_numnodes(fspset->fsps_list); + fs_perm_init(fsperm, fspset, fsname); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + (void) parse_fs_perm(fsperm, nvl2); + + uu_list_insert(fspset->fsps_list, node, idx); + } + + return (0); +} + +static inline const char * +deleg_perm_comment(zfs_deleg_note_t note) +{ + const char *str = ""; + + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + case ZFS_DELEG_NOTE_ALLOW: + str = gettext("Must also have the permission that is being" + "\n\t\t\t\tallowed"); + break; + case ZFS_DELEG_NOTE_CLONE: + str = gettext("Must also have the 'create' ability and 'mount'" + "\n\t\t\t\tability in the origin file system"); + break; + case ZFS_DELEG_NOTE_CREATE: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DESTROY: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DIFF: + str = gettext("Allows lookup of paths within a dataset;" + "\n\t\t\t\tgiven an object number. 
Ordinary users need this" + "\n\t\t\t\tin order to use zfs diff"); + break; + case ZFS_DELEG_NOTE_HOLD: + str = gettext("Allows adding a user hold to a snapshot"); + break; + case ZFS_DELEG_NOTE_MOUNT: + str = gettext("Allows mount/umount of ZFS datasets"); + break; + case ZFS_DELEG_NOTE_PROMOTE: + str = gettext("Must also have the 'mount'\n\t\t\t\tand" + " 'promote' ability in the origin file system"); + break; + case ZFS_DELEG_NOTE_RECEIVE: + str = gettext("Must also have the 'mount' and 'create'" + " ability"); + break; + case ZFS_DELEG_NOTE_RELEASE: + str = gettext("Allows releasing a user hold which\n\t\t\t\t" + "might destroy the snapshot"); + break; + case ZFS_DELEG_NOTE_RENAME: + str = gettext("Must also have the 'mount' and 'create'" + "\n\t\t\t\tability in the new parent"); + break; + case ZFS_DELEG_NOTE_ROLLBACK: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SEND: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SHARE: + str = gettext("Allows sharing file systems over NFS or SMB" + "\n\t\t\t\tprotocols"); + break; + case ZFS_DELEG_NOTE_SNAPSHOT: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_LOAD_KEY: + str = gettext("Allows loading or unloading an encryption key"); + break; + case ZFS_DELEG_NOTE_CHANGE_KEY: + str = gettext("Allows changing or adding an encryption key"); + break; +/* + * case ZFS_DELEG_NOTE_VSCAN: + * str = gettext(""); + * break; + */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + str = gettext("Allows accessing any groupquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPUSED: + str = gettext("Allows reading any groupused@... property"); + break; + case ZFS_DELEG_NOTE_USERPROP: + str = gettext("Allows changing any user property"); + break; + case ZFS_DELEG_NOTE_USERQUOTA: + str = gettext("Allows accessing any userquota@... property"); + break; + case ZFS_DELEG_NOTE_USERUSED: + str = gettext("Allows reading any userused@... 
property"); + break; + case ZFS_DELEG_NOTE_USEROBJQUOTA: + str = gettext("Allows accessing any userobjquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPOBJQUOTA: + str = gettext("Allows accessing any \n\t\t\t\t" + "groupobjquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPOBJUSED: + str = gettext("Allows reading any groupobjused@... property"); + break; + case ZFS_DELEG_NOTE_USEROBJUSED: + str = gettext("Allows reading any userobjused@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTQUOTA: + str = gettext("Allows accessing any projectquota@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: + str = gettext("Allows accessing any \n\t\t\t\t" + "projectobjquota@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTUSED: + str = gettext("Allows reading any projectused@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTOBJUSED: + str = gettext("Allows accessing any \n\t\t\t\t" + "projectobjused@... property"); + break; + /* other */ + default: + str = ""; + } + + return (str); +} + +struct allow_opts { + boolean_t local; + boolean_t descend; + boolean_t user; + boolean_t group; + boolean_t everyone; + boolean_t create; + boolean_t set; + boolean_t recursive; /* unallow only */ + boolean_t prt_usage; + + boolean_t prt_perms; + char *who; + char *perms; + const char *dataset; +}; + +static inline int +prop_cmp(const void *a, const void *b) +{ + const char *str1 = *(const char **)a; + const char *str2 = *(const char **)b; + return (strcmp(str1, str2)); +} + +static void +allow_usage(boolean_t un, boolean_t requested, const char *msg) +{ + const char *opt_desc[] = { + "-h", gettext("show this help message and exit"), + "-l", gettext("set permission locally"), + "-d", gettext("set permission for descents"), + "-u", gettext("set permission for user"), + "-g", gettext("set permission for group"), + "-e", gettext("set permission for everyone"), + "-c", gettext("set create time permission"), + "-s", gettext("define permission set"), + /* 
unallow only */ + "-r", gettext("remove permissions recursively"), + }; + size_t unallow_size = sizeof (opt_desc) / sizeof (char *); + size_t allow_size = unallow_size - 2; + const char *props[ZFS_NUM_PROPS]; + int i; + size_t count = 0; + FILE *fp = requested ? stdout : stderr; + zprop_desc_t *pdtbl = zfs_prop_get_table(); + const char *fmt = gettext("%-16s %-14s\t%s\n"); + + (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : + HELP_ALLOW)); + (void) fprintf(fp, gettext("Options:\n")); + for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { + const char *opt = opt_desc[i]; + const char *optdsc = opt_desc[i + 1]; + (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); + } + + (void) fprintf(fp, gettext("\nThe following permissions are " + "supported:\n\n")); + (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), + gettext("NOTES")); + for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { + const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; + zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; + const char *perm_type = deleg_perm_type(perm_note); + const char *perm_comment = deleg_perm_comment(perm_note); + (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); + } + + for (i = 0; i < ZFS_NUM_PROPS; i++) { + zprop_desc_t *pd = &pdtbl[i]; + if (pd->pd_visible != B_TRUE) + continue; + + if (pd->pd_attr == PROP_READONLY) + continue; + + props[count++] = pd->pd_name; + } + props[count] = NULL; + + qsort(props, count, sizeof (char *), prop_cmp); + + for (i = 0; i < count; i++) + (void) fprintf(fp, fmt, props[i], gettext("property"), ""); + + if (msg != NULL) + (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); + + exit(requested ? 
0 : 2); +} + +static inline const char * +munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, + char **permsp) +{ + if (un && argc == expected_argc - 1) + *permsp = NULL; + else if (argc == expected_argc) + *permsp = argv[argc - 2]; + else + allow_usage(un, B_FALSE, + gettext("wrong number of parameters\n")); + + return (argv[argc - 1]); +} + +static void +parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) +{ + int uge_sum = opts->user + opts->group + opts->everyone; + int csuge_sum = opts->create + opts->set + uge_sum; + int ldcsuge_sum = csuge_sum + opts->local + opts->descend; + int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum; + + if (uge_sum > 1) + allow_usage(un, B_FALSE, + gettext("-u, -g, and -e are mutually exclusive\n")); + + if (opts->prt_usage) { + if (argc == 0 && all_sum == 0) + allow_usage(un, B_TRUE, NULL); + else + usage(B_FALSE); + } + + if (opts->set) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -s\n")); + + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + if (argv[0][0] != '@') + allow_usage(un, B_FALSE, + gettext("invalid set name: missing '@' prefix\n")); + opts->who = argv[0]; + } else if (opts->create) { + if (ldcsuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -c\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (opts->everyone) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -e\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") + == 0) { + opts->everyone = B_TRUE; + argc--; + argv++; + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (argc == 1 && !un) { + opts->prt_perms = B_TRUE; + opts->dataset = argv[argc-1]; + } else { + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + opts->who = argv[0]; 
+ } + + if (!opts->local && !opts->descend) { + opts->local = B_TRUE; + opts->descend = B_TRUE; + } +} + +static void +store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, + const char *who, char *perms, nvlist_t *top_nvl) +{ + int i; + char ld[2] = { '\0', '\0' }; + char who_buf[MAXNAMELEN + 32]; + char base_type = '\0'; + char set_type = '\0'; + nvlist_t *base_nvl = NULL; + nvlist_t *set_nvl = NULL; + nvlist_t *nvl; + + if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + switch (type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + set_type = ZFS_DELEG_NAMED_SET_SETS; + base_type = ZFS_DELEG_NAMED_SET; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + set_type = ZFS_DELEG_CREATE_SETS; + base_type = ZFS_DELEG_CREATE; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + set_type = ZFS_DELEG_USER_SETS; + base_type = ZFS_DELEG_USER; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + set_type = ZFS_DELEG_GROUP_SETS; + base_type = ZFS_DELEG_GROUP; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + set_type = ZFS_DELEG_EVERYONE_SETS; + base_type = ZFS_DELEG_EVERYONE; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + + default: + assert(set_type != '\0' && base_type != '\0'); + } + + if (perms != NULL) { + char *curr = perms; + char *end = curr + strlen(perms); + + while (curr < end) { + char *delim = strchr(curr, ','); + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + if (curr[0] == '@') + nvl = set_nvl; + else + nvl = base_nvl; + + (void) nvlist_add_boolean(nvl, curr); + if (delim != end) + *delim = ','; + curr 
= delim + 1; + } + + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (!nvlist_empty(base_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + base_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + base_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + base_nvl); + } + + + if (!nvlist_empty(set_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + set_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + set_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + set_nvl); + } + } + } else { + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", base_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", base_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", set_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", set_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + } + } +} + +static int +construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) +{ + if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + if (opts->set) { + store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, + opts->descend, opts->who, opts->perms, *nvlp); + } else if (opts->create) { + store_allow_perm(ZFS_DELEG_CREATE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else if (opts->everyone) { + store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else { + char *curr = opts->who; + char *end = curr + strlen(curr); + + while (curr < end) { + const char *who; + zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; + char *endch; + char 
*delim = strchr(curr, ','); + char errbuf[256]; + char id[64]; + struct passwd *p = NULL; + struct group *g = NULL; + + uid_t rid; + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + rid = (uid_t)strtol(curr, &endch, 0); + if (opts->user) { + who_type = ZFS_DELEG_USER; + if (*endch != '\0') + p = getpwnam(curr); + else + p = getpwuid(rid); + + if (p != NULL) + rid = p->pw_uid; + else if (*endch != '\0') { + (void) snprintf(errbuf, 256, gettext( + "invalid user %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else if (opts->group) { + who_type = ZFS_DELEG_GROUP; + if (*endch != '\0') + g = getgrnam(curr); + else + g = getgrgid(rid); + + if (g != NULL) + rid = g->gr_gid; + else if (*endch != '\0') { + (void) snprintf(errbuf, 256, gettext( + "invalid group %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else { + if (*endch != '\0') { + p = getpwnam(curr); + } else { + p = getpwuid(rid); + } + + if (p == NULL) { + if (*endch != '\0') { + g = getgrnam(curr); + } else { + g = getgrgid(rid); + } + } + + if (p != NULL) { + who_type = ZFS_DELEG_USER; + rid = p->pw_uid; + } else if (g != NULL) { + who_type = ZFS_DELEG_GROUP; + rid = g->gr_gid; + } else { + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } + + (void) sprintf(id, "%u", rid); + who = id; + + store_allow_perm(who_type, opts->local, + opts->descend, who, opts->perms, *nvlp); + curr = delim + 1; + } + } + + return (0); +} + +static void +print_set_creat_perms(uu_avl_t *who_avl) +{ + const char *sc_title[] = { + gettext("Permission sets:\n"), + gettext("Create time permissions:\n"), + NULL + }; + who_perm_node_t *who_node = NULL; + int prev_weight = -1; + + for (who_node = uu_avl_first(who_avl); who_node != NULL; + who_node = uu_avl_next(who_avl, who_node)) { + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + const char *who_name = 
who_node->who_perm.who_name; + int weight = who_type2weight(who_type); + boolean_t first = B_TRUE; + deleg_perm_node_t *deleg_node; + + if (prev_weight != weight) { + (void) printf("%s", sc_title[weight]); + prev_weight = weight; + } + + if (who_name == NULL || strnlen(who_name, 1) == 0) + (void) printf("\t"); + else + (void) printf("\t%s ", who_name); + + for (deleg_node = uu_avl_first(avl); deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (first) { + (void) printf("%s", + deleg_node->dpn_perm.dp_name); + first = B_FALSE; + } else + (void) printf(",%s", + deleg_node->dpn_perm.dp_name); + } + + (void) printf("\n"); + } +} + +static void +print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, + const char *title) +{ + who_perm_node_t *who_node = NULL; + boolean_t prt_title = B_TRUE; + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((who_node = uu_avl_walk_next(walk)) != NULL) { + const char *who_name = who_node->who_perm.who_name; + const char *nice_who_name = who_node->who_perm.who_ug_name; + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + char delim = ' '; + deleg_perm_node_t *deleg_node; + boolean_t prt_who = B_TRUE; + + for (deleg_node = uu_avl_first(avl); + deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (local != deleg_node->dpn_perm.dp_local || + descend != deleg_node->dpn_perm.dp_descend) + continue; + + if (prt_who) { + const char *who = NULL; + if (prt_title) { + prt_title = B_FALSE; + (void) printf("%s", title); + } + + switch (who_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + who = gettext("user"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + who = gettext("group"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_EVERYONE_SETS: + case 
ZFS_DELEG_EVERYONE: + who = gettext("everyone"); + who_name = NULL; + break; + + default: + assert(who != NULL); + } + + prt_who = B_FALSE; + if (who_name == NULL) + (void) printf("\t%s", who); + else + (void) printf("\t%s %s", who, who_name); + } + + (void) printf("%c%s", delim, + deleg_node->dpn_perm.dp_name); + delim = ','; + } + + if (!prt_who) + (void) printf("\n"); + } + + uu_avl_walk_end(walk); +} + +static void +print_fs_perms(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = NULL; + char buf[MAXNAMELEN + 32]; + const char *dsname = buf; + + for (node = uu_list_first(fspset->fsps_list); node != NULL; + node = uu_list_next(fspset->fsps_list, node)) { + uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; + uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; + int left = 0; + + (void) snprintf(buf, sizeof (buf), + gettext("---- Permissions on %s "), + node->fspn_fsperm.fsp_name); + (void) printf("%s", dsname); + left = 70 - strlen(buf); + while (left-- > 0) + (void) printf("-"); + (void) printf("\n"); + + print_set_creat_perms(sc_avl); + print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, + gettext("Local permissions:\n")); + print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, + gettext("Descendent permissions:\n")); + print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, + gettext("Local+Descendent permissions:\n")); + } +} + +static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; + +struct deleg_perms { + boolean_t un; + nvlist_t *nvl; +}; + +static int +set_deleg_perms(zfs_handle_t *zhp, void *data) +{ + struct deleg_perms *perms = (struct deleg_perms *)data; + zfs_type_t zfs_type = zfs_get_type(zhp); + + if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) + return (0); + + return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); +} + +static int +zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) +{ + zfs_handle_t *zhp; + nvlist_t *perm_nvl = NULL; + nvlist_t *update_perm_nvl = NULL; + int error = 1; + int c; + struct allow_opts opts = { 0 }; 
+ + const char *optstr = un ? "ldugecsrh" : "ldugecsh"; + + /* check opts */ + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'l': + opts.local = B_TRUE; + break; + case 'd': + opts.descend = B_TRUE; + break; + case 'u': + opts.user = B_TRUE; + break; + case 'g': + opts.group = B_TRUE; + break; + case 'e': + opts.everyone = B_TRUE; + break; + case 's': + opts.set = B_TRUE; + break; + case 'c': + opts.create = B_TRUE; + break; + case 'r': + opts.recursive = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'h': + opts.prt_usage = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + parse_allow_args(argc, argv, un, &opts); + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + (void) fprintf(stderr, "Failed to open dataset: %s\n", + opts.dataset); + return (-1); + } + + if (zfs_get_fsacl(zhp, &perm_nvl) != 0) + goto cleanup2; + + fs_perm_set_init(&fs_perm_set); + if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { + (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); + goto cleanup1; + } + + if (opts.prt_perms) + print_fs_perms(&fs_perm_set); + else { + (void) construct_fsacl_list(un, &opts, &update_perm_nvl); + if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) + goto cleanup0; + + if (un && opts.recursive) { + struct deleg_perms data = { un, update_perm_nvl }; + if (zfs_iter_filesystems(zhp, set_deleg_perms, + &data) != 0) + goto cleanup0; + } + } + + error = 0; + +cleanup0: + nvlist_free(perm_nvl); + nvlist_free(update_perm_nvl); +cleanup1: + fs_perm_set_fini(&fs_perm_set); +cleanup2: + zfs_close(zhp); + + return (error); +} + +static int +zfs_do_allow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, 
argv, B_FALSE)); +} + +static int +zfs_do_unallow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); +} + +static int +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) +{ + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + const char *opts = holding ? "rt" : "r"; + int c; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 2) + usage(B_FALSE); + + tag = argv[0]; + --argc; + ++argv; + + if (holding && tag[0] == '.') { + /* tags starting with '.' are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); + } + + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + const char *delim; + char *path = argv[i]; + + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } + zfs_close(zhp); + } + + return (errors != 0); +} + +/* + * zfs hold [-r] [-t] <tag> <snap> ... + * + * -r Recursively hold + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_hold(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); +} + +/* + * zfs release [-r] <tag> <snap> ... 
+ * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. + */ +static int +zfs_do_release(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); +} + +typedef struct holds_cbdata { + boolean_t cb_recursive; + const char *cb_snapname; + nvlist_t **cb_nvlp; + size_t cb_max_namelen; + size_t cb_max_taglen; +} holds_cbdata_t; + +#define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" +#define DATETIME_BUF_LEN (32) +/* + * + */ +static void +print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) +{ + int i; + nvpair_t *nvp = NULL; + char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; + const char *col; + + if (!scripted) { + for (i = 0; i < 3; i++) { + col = gettext(hdr_cols[i]); + if (i < 2) + (void) printf("%-*s ", i ? tagwidth : nwidth, + col); + else + (void) printf("%s\n", col); + } + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *zname = nvpair_name(nvp); + nvlist_t *nvl2; + nvpair_t *nvp2 = NULL; + (void) nvpair_value_nvlist(nvp, &nvl2); + while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { + char tsbuf[DATETIME_BUF_LEN]; + char *tagname = nvpair_name(nvp2); + uint64_t val = 0; + time_t time; + struct tm t; + + (void) nvpair_value_uint64(nvp2, &val); + time = (time_t)val; + (void) localtime_r(&time, &t); + (void) strftime(tsbuf, DATETIME_BUF_LEN, + gettext(STRFTIME_FMT_STR), &t); + + if (scripted) { + (void) printf("%s\t%s\t%s\n", zname, + tagname, tsbuf); + } else { + (void) printf("%-*s %-*s %s\n", nwidth, + zname, tagwidth, tagname, tsbuf); + } + } + } +} + +/* + * Generic callback function to list a dataset or snapshot. 
+ */ +static int +holds_callback(zfs_handle_t *zhp, void *data) +{ + holds_cbdata_t *cbp = data; + nvlist_t *top_nvl = *cbp->cb_nvlp; + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + const char *zname = zfs_get_name(zhp); + size_t znamelen = strlen(zname); + + if (cbp->cb_recursive) { + const char *snapname; + char *delim = strchr(zname, '@'); + if (delim == NULL) + return (0); + + snapname = delim + 1; + if (strcmp(cbp->cb_snapname, snapname)) + return (0); + } + + if (zfs_get_holds(zhp, &nvl) != 0) + return (-1); + + if (znamelen > cbp->cb_max_namelen) + cbp->cb_max_namelen = znamelen; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *tag = nvpair_name(nvp); + size_t taglen = strlen(tag); + if (taglen > cbp->cb_max_taglen) + cbp->cb_max_taglen = taglen; + } + + return (nvlist_add_nvlist(top_nvl, zname, nvl)); +} + +/* + * zfs holds [-rH] <snap> ... + * + * -r Lists holds that are set on the named snapshots recursively. + * -H Scripted mode; elide headers and separate columns by tabs. 
+ */ +static int +zfs_do_holds(int argc, char **argv) +{ + int errors = 0; + int c; + int i; + boolean_t scripted = B_FALSE; + boolean_t recursive = B_FALSE; + const char *opts = "rH"; + nvlist_t *nvl; + + int types = ZFS_TYPE_SNAPSHOT; + holds_cbdata_t cb = { 0 }; + + int limit = 0; + int ret = 0; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (recursive) { + types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + flags |= ZFS_ITER_RECURSE; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) + usage(B_FALSE); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + for (i = 0; i < argc; ++i) { + char *snapshot = argv[i]; + const char *delim; + const char *snapname; + + delim = strchr(snapshot, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), snapshot); + ++errors; + continue; + } + snapname = delim + 1; + if (recursive) + snapshot[delim - snapshot] = '\0'; + + cb.cb_recursive = recursive; + cb.cb_snapname = snapname; + cb.cb_nvlp = &nvl; + + /* + * 1. collect holds data, set format options + */ + ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, + holds_callback, &cb); + if (ret != 0) + ++errors; + } + + /* + * 2. 
print holds data + */ + print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); + + if (nvlist_empty(nvl)) + (void) fprintf(stderr, gettext("no datasets available\n")); + + nvlist_free(nvl); + + return (0 != errors); +} + +#define CHECK_SPINNER 30 +#define SPINNER_TIME 3 /* seconds */ +#define MOUNT_TIME 1 /* seconds */ + +typedef struct get_all_state { + boolean_t ga_verbose; + get_all_cb_t *ga_cbp; +} get_all_state_t; + +static int +get_one_dataset(zfs_handle_t *zhp, void *data) +{ + static char *spin[] = { "-", "\\", "|", "/" }; + static int spinval = 0; + static int spincheck = 0; + static time_t last_spin_time = (time_t)0; + get_all_state_t *state = data; + zfs_type_t type = zfs_get_type(zhp); + + if (state->ga_verbose) { + if (--spincheck < 0) { + time_t now = time(NULL); + if (last_spin_time + SPINNER_TIME < now) { + update_progress(spin[spinval++ % 4]); + last_spin_time = now; + } + spincheck = CHECK_SPINNER; + } + } + + /* + * Iterate over any nested datasets. + */ + if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { + zfs_close(zhp); + return (1); + } + + /* + * Skip any datasets whose type does not match. + */ + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { + zfs_close(zhp); + return (0); + } + libzfs_add_handle(state->ga_cbp, zhp); + assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); + + return (0); +} + +static void +get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) +{ + get_all_state_t state = { + .ga_verbose = verbose, + .ga_cbp = cbp + }; + + if (verbose) + set_progress_header(gettext("Reading ZFS config")); + (void) zfs_iter_root(g_zfs, get_one_dataset, &state); + + if (verbose) + finish_progress(gettext("done.")); +} + +/* + * Generic callback for sharing or mounting filesystems. Because the code is so + * similar, we have a common function with an extra parameter to determine which + * mode we are using. 
+ */ +typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; + +typedef struct share_mount_state { + share_mount_op_t sm_op; + boolean_t sm_verbose; + int sm_flags; + char *sm_options; + char *sm_proto; /* only valid for OP_SHARE */ + pthread_mutex_t sm_lock; /* protects the remaining fields */ + uint_t sm_total; /* number of filesystems to process */ + uint_t sm_done; /* number of filesystems processed */ + int sm_status; /* -1 if any of the share/mount operations failed */ +} share_mount_state_t; + +/* + * Share or mount a dataset. + */ +static int +share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, + boolean_t explicit, const char *options) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char smbshareopts[ZFS_MAXPROPLEN]; + const char *cmdname = op == OP_SHARE ? "share" : "mount"; + struct mnttab mnt; + uint64_t zoned, canmount; + boolean_t shared_nfs, shared_smb; + + assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); + + /* + * Check to make sure we can mount/share this dataset. If we + * are in the global zone and the filesystem is exported to a + * local zone, or if we are in a local zone and the + * filesystem is not exported, then it is an error. + */ + zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + + if (zoned && getzoneid() == GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "dataset is exported to a local zone\n"), cmdname, + zfs_get_name(zhp)); + return (1); + + } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "permission denied\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + + /* + * Ignore any filesystems which don't apply to us. This + * includes those with a legacy mountpoint, or those with + * legacy share options. 
+ */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, + sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, + sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); + + if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && + strcmp(smbshareopts, "off") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share '%s': " + "legacy share\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use share(1M) to " + "share this filesystem, or set " + "sharenfs property on\n")); + return (1); + } + + /* + * We cannot share or mount legacy filesystems. If the + * shareopts is non-legacy but the mountpoint is legacy, we + * treat it as a legacy share. + */ + if (strcmp(mountpoint, "legacy") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use %s(1M) to " + "%s this filesystem\n"), cmdname, cmdname); + return (1); + } + + if (strcmp(mountpoint, "none") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': no " + "mountpoint set\n"), cmdname, zfs_get_name(zhp)); + return (1); + } + + /* + * canmount explicit outcome + * on no pass through + * on yes pass through + * off no return 0 + * off yes display error, return 1 + * noauto no return 0 + * noauto yes pass through + */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + if (canmount == ZFS_CANMOUNT_OFF) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "'canmount' property is set to 'off'\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { + /* + * When performing a 'zfs mount -a', we skip any mounts for + * datasets that have 'noauto' set. 
Sharing a dataset with + * 'noauto' set is only allowed if it's mounted. + */ + if (op == OP_MOUNT) + return (0); + if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { + /* also purge it from existing exports */ + zfs_unshareall_bypath(zhp, mountpoint); + return (0); + } + } + + /* + * If this filesystem is encrypted and does not have + * a loaded key, we can not mount it. + */ + if ((flags & MS_CRYPT) == 0 && + zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && + zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_UNAVAILABLE) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); + return (1); + } + + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Contains partially-completed state from " + "\"zfs receive -s\", which can be resumed with " + "\"zfs send -t\"\n"), + cmdname, zfs_get_name(zhp)); + return (1); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Dataset is not complete, was created by receiving " + "a redacted zfs send stream.\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + + /* + * At this point, we have verified that the mountpoint and/or + * shareopts are appropriate for auto management. If the + * filesystem is already mounted or shared, return (failing + * for explicit requests); otherwise mount or share the + * filesystem. 
+ */ + switch (op) { + case OP_SHARE: + + shared_nfs = zfs_is_shared_nfs(zhp, NULL); + shared_smb = zfs_is_shared_smb(zhp, NULL); + + if ((shared_nfs && shared_smb) || + (shared_nfs && strcmp(shareopts, "on") == 0 && + strcmp(smbshareopts, "off") == 0) || + (shared_smb && strcmp(smbshareopts, "on") == 0 && + strcmp(shareopts, "off") == 0)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share " + "'%s': filesystem already shared\n"), + zfs_get_name(zhp)); + return (1); + } + + if (!zfs_is_mounted(zhp, NULL) && + zfs_mount(zhp, NULL, flags) != 0) + return (1); + + if (protocol == NULL) { + if (zfs_shareall(zhp) != 0) + return (1); + } else if (strcmp(protocol, "nfs") == 0) { + if (zfs_share_nfs(zhp)) + return (1); + } else if (strcmp(protocol, "smb") == 0) { + if (zfs_share_smb(zhp)) + return (1); + } else { + (void) fprintf(stderr, gettext("cannot share " + "'%s': invalid share type '%s' " + "specified\n"), + zfs_get_name(zhp), protocol); + return (1); + } + + break; + + case OP_MOUNT: + if (options == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = (char *)options; + + if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && + zfs_is_mounted(zhp, NULL)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot mount " + "'%s': filesystem already mounted\n"), + zfs_get_name(zhp)); + return (1); + } + + if (zfs_mount(zhp, options, flags) != 0) + return (1); + break; + } + + return (0); +} + +/* + * Reports progress in the form "(current/total)". Not thread-safe. 
+ */ +static void +report_mount_progress(int current, int total) +{ + static time_t last_progress_time = 0; + time_t now = time(NULL); + char info[32]; + + /* report 1..n instead of 0..n-1 */ + ++current; + + /* display header if we're here for the first time */ + if (current == 1) { + set_progress_header(gettext("Mounting ZFS filesystems")); + } else if (current != total && last_progress_time + MOUNT_TIME >= now) { + /* too soon to report again */ + return; + } + + last_progress_time = now; + + (void) sprintf(info, "(%d/%d)", current, total); + + if (current == total) + finish_progress(info); + else + update_progress(info); +} + +/* + * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and + * updates the progress meter. + */ +static int +share_mount_one_cb(zfs_handle_t *zhp, void *arg) +{ + share_mount_state_t *sms = arg; + int ret; + + ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, + B_FALSE, sms->sm_options); + + pthread_mutex_lock(&sms->sm_lock); + if (ret != 0) + sms->sm_status = ret; + sms->sm_done++; + if (sms->sm_verbose) + report_mount_progress(sms->sm_done, sms->sm_total); + pthread_mutex_unlock(&sms->sm_lock); + return (ret); +} + +static void +append_options(char *mntopts, char *newopts) +{ + int len = strlen(mntopts); + + /* original length plus new string to append plus 1 for the comma */ + if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { + (void) fprintf(stderr, gettext("the opts argument for " + "'%s' option is too long (more than %d chars)\n"), + "-o", MNT_LINE_MAX); + usage(B_FALSE); + } + + if (*mntopts) + mntopts[len++] = ','; + + (void) strcpy(&mntopts[len], newopts); +} + +static int +share_mount(int op, int argc, char **argv) +{ + int do_all = 0; + boolean_t verbose = B_FALSE; + int c, ret = 0; + char *options = NULL; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:Of" : "al")) + != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'v': + verbose = B_TRUE; + break; + case 'l': + flags |= MS_CRYPT; + break; + case 'o': + if (*optarg == '\0') { + (void) fprintf(stderr, gettext("empty mount " + "options (-o) specified\n")); + usage(B_FALSE); + } + + if (options == NULL) + options = safe_malloc(MNT_LINE_MAX + 1); + + /* option validation is done later */ + append_options(options, optarg); + break; + case 'O': + flags |= MS_OVERLAY; + break; + case 'f': + flags |= MS_FORCE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (do_all) { + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + start_progress_timer(); + get_all_cb_t cb = { 0 }; + get_all_datasets(&cb, verbose); + + if (cb.cb_used == 0) { + if (options != NULL) + free(options); + return (0); + } + + share_mount_state_t share_mount_state = { 0 }; + share_mount_state.sm_op = op; + share_mount_state.sm_verbose = verbose; + share_mount_state.sm_flags = flags; + share_mount_state.sm_options = options; + share_mount_state.sm_proto = protocol; + share_mount_state.sm_total = cb.cb_used; + pthread_mutex_init(&share_mount_state.sm_lock, NULL); + + /* + * libshare isn't mt-safe, so only do the operation in parallel + * if we're mounting. Additionally, the key-loading option must + * be serialized so that we can prompt the user for their keys + * in a consistent manner. 
+ */ + zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, + share_mount_one_cb, &share_mount_state, + op == OP_MOUNT && !(flags & MS_CRYPT)); + zfs_commit_all_shares(); + + ret = share_mount_state.sm_status; + + for (int i = 0; i < cb.cb_used; i++) + zfs_close(cb.cb_handles[i]); + free(cb.cb_handles); + } else if (argc == 0) { + struct mnttab entry; + + if ((op == OP_SHARE) || (options != NULL)) { + (void) fprintf(stderr, gettext("missing filesystem " + "argument (specify -a for all)\n")); + usage(B_FALSE); + } + + /* + * When mount is given no arguments, go through + * /proc/self/mounts and display any active ZFS mounts. + * We hide any snapshots, since they are controlled + * automatically. + */ + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) { + if (options != NULL) + free(options); + return (ENOENT); + } + + while (getmntent(mnttab_file, &entry) == 0) { + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || + strchr(entry.mnt_special, '@') != NULL) + continue; + + (void) printf("%-30s %s\n", entry.mnt_special, + entry.mnt_mountp); + } + + } else { + zfs_handle_t *zhp; + + if (argc > 1) { + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + } else { + ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, + options); + zfs_commit_all_shares(); + zfs_close(zhp); + } + } + + if (options != NULL) + free(options); + + return (ret); +} + +/* + * zfs mount -a [nfs] + * zfs mount filesystem + * + * Mount all filesystems, or mount the given filesystem. + */ +static int +zfs_do_mount(int argc, char **argv) +{ + return (share_mount(OP_MOUNT, argc, argv)); +} + +/* + * zfs share -a [nfs | smb] + * zfs share filesystem + * + * Share all filesystems, or share the given filesystem. 
+ */ +static int +zfs_do_share(int argc, char **argv) +{ + return (share_mount(OP_SHARE, argc, argv)); +} + +typedef struct unshare_unmount_node { + zfs_handle_t *un_zhp; + char *un_mountp; + uu_avl_node_t un_avlnode; +} unshare_unmount_node_t; + +/* ARGSUSED */ +static int +unshare_unmount_compare(const void *larg, const void *rarg, void *unused) +{ + const unshare_unmount_node_t *l = larg; + const unshare_unmount_node_t *r = rarg; + + return (strcmp(l->un_mountp, r->un_mountp)); +} + +/* + * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an + * absolute path, find the entry /proc/self/mounts, verify that it's a + * ZFS filesystem, and unmount it appropriately. + */ +static int +unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) +{ + zfs_handle_t *zhp; + int ret = 0; + struct stat64 statbuf; + struct extmnttab entry; + const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; + ino_t path_inode; + + /* + * Search for the given (major,minor) pair in the mount table. 
+ */ + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) + return (ENOENT); + + if (getextmntent(path, &entry, &statbuf) != 0) { + if (op == OP_SHARE) { + (void) fprintf(stderr, gettext("cannot %s '%s': not " + "currently mounted\n"), cmdname, path); + return (1); + } + (void) fprintf(stderr, gettext("warning: %s not in" + "/proc/self/mounts\n"), path); + if ((ret = umount2(path, flags)) != 0) + (void) fprintf(stderr, gettext("%s: %s\n"), path, + strerror(errno)); + return (ret != 0); + } + path_inode = statbuf.st_ino; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " + "filesystem\n"), cmdname, path); + return (1); + } + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + ret = 1; + if (stat64(entry.mnt_mountp, &statbuf) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), + cmdname, path, strerror(errno)); + goto out; + } else if (statbuf.st_ino != path_inode) { + (void) fprintf(stderr, gettext("cannot " + "%s '%s': not a mountpoint\n"), cmdname, path); + goto out; + } + + if (op == OP_SHARE) { + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char smbshare_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, + sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(smbshare_prop, "off") == 0) { + (void) fprintf(stderr, gettext("cannot unshare " + "'%s': legacy share\n"), path); + (void) fprintf(stderr, gettext("use exportfs(8) " + "or smbcontrol(1) to unshare this filesystem\n")); + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot unshare '%s': " + "not currently shared\n"), path); + } else { + ret = zfs_unshareall_bypath(zhp, path); + zfs_commit_all_shares(); + 
} + } else { + char mtpt_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, + sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (is_manual) { + ret = zfs_unmount(zhp, NULL, flags); + } else if (strcmp(mtpt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot unmount " + "'%s': legacy mountpoint\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use umount(8) " + "to unmount this filesystem\n")); + } else { + ret = zfs_unmountall(zhp, flags); + } + } + +out: + zfs_close(zhp); + + return (ret != 0); +} + +/* + * Generic callback for unsharing or unmounting a filesystem. + */ +static int +unshare_unmount(int op, int argc, char **argv) +{ + int do_all = 0; + int flags = 0; + int ret = 0; + int c; + zfs_handle_t *zhp; + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char sharesmb[ZFS_MAXPROPLEN]; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'f': + flags |= MS_FORCE; + break; + case 'u': + flags |= MS_CRYPT; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (do_all) { + /* + * We could make use of zfs_for_each() to walk all datasets in + * the system, but this would be very inefficient, especially + * since we would have to linearly search /proc/self/mounts for + * each one. Instead, do one pass through /proc/self/mounts + * looking for zfs entries and call zfs_unmount() for each one. + * + * Things get a little tricky if the administrator has created + * mountpoints beneath other ZFS filesystems. In this case, we + * have to unmount the deepest filesystems first. 
To accomplish + * this, we place all the mountpoints in an AVL tree sorted by + * the special type (dataset name), and walk the result in + * reverse to make sure to get any snapshots first. + */ + struct mnttab entry; + uu_avl_pool_t *pool; + uu_avl_t *tree = NULL; + unshare_unmount_node_t *node; + uu_avl_index_t idx; + uu_avl_walk_t *walk; + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (((pool = uu_avl_pool_create("unmount_pool", + sizeof (unshare_unmount_node_t), + offsetof(unshare_unmount_node_t, un_avlnode), + unshare_unmount_compare, UU_DEFAULT)) == NULL) || + ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) + nomem(); + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) + return (ENOENT); + + while (getmntent(mnttab_file, &entry) == 0) { + + /* ignore non-ZFS entries */ + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* ignore snapshots */ + if (strchr(entry.mnt_special, '@') != NULL) + continue; + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + continue; + } + + /* + * Ignore datasets that are excluded/restricted by + * parent pool name. 
+ */ + if (zpool_skip_pool(zfs_get_pool_name(zhp))) { + zfs_close(zhp); + continue; + } + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") != 0) + break; + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") == 0) + continue; + break; + case OP_MOUNT: + /* Ignore legacy mounts */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "legacy") == 0) + continue; + /* Ignore canmount=noauto mounts */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == + ZFS_CANMOUNT_NOAUTO) + continue; + default: + break; + } + + node = safe_malloc(sizeof (unshare_unmount_node_t)); + node->un_zhp = zhp; + node->un_mountp = safe_strdup(entry.mnt_mountp); + + uu_avl_node_init(node, &node->un_avlnode, pool); + + if (uu_avl_find(tree, node, NULL, &idx) == NULL) { + uu_avl_insert(tree, node, idx); + } else { + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + } + + /* + * Walk the AVL tree in reverse, unmounting each filesystem and + * removing it from the AVL tree in the process. 
+ */ + if ((walk = uu_avl_walk_start(tree, + UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + const char *mntarg = NULL; + + uu_avl_remove(tree, node); + switch (op) { + case OP_SHARE: + if (zfs_unshareall_bytype(node->un_zhp, + node->un_mountp, protocol) != 0) + ret = 1; + break; + + case OP_MOUNT: + if (zfs_unmount(node->un_zhp, + mntarg, flags) != 0) + ret = 1; + break; + } + + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + + if (op == OP_SHARE) + zfs_commit_shares(protocol); + + uu_avl_walk_end(walk); + uu_avl_destroy(tree); + uu_avl_pool_destroy(pool); + + } else { + if (argc != 1) { + if (argc == 0) + (void) fprintf(stderr, + gettext("missing filesystem argument\n")); + else + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * We have an argument, but it may be a full path or a ZFS + * filesystem. Pass full paths off to unmount_path() (shared by + * manual_unmount), otherwise open the filesystem and pass to + * zfs_unmount(). + */ + if (argv[0][0] == '/') + return (unshare_unmount_path(op, argv[0], + flags, B_FALSE)); + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + verify(zfs_prop_get(zhp, op == OP_SHARE ? 
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, + NULL, 0, B_FALSE) == 0); + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + sharesmb, sizeof (sharesmb), NULL, NULL, + 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(sharesmb, "off") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': legacy share\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "unshare(1M) to unshare this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': not currently " + "shared\n"), zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unshareall(zhp) != 0) { + ret = 1; + } + break; + + case OP_MOUNT: + if (strcmp(nfs_mnt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': legacy " + "mountpoint\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "umount(1M) to unmount this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_mounted(zhp, NULL)) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': not currently " + "mounted\n"), + zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unmountall(zhp, flags) != 0) { + ret = 1; + } + break; + } + + zfs_close(zhp); + } + + return (ret); +} + +/* + * zfs unmount [-fu] -a + * zfs unmount [-fu] filesystem + * + * Unmount all filesystems, or a specific ZFS filesystem. + */ +static int +zfs_do_unmount(int argc, char **argv) +{ + return (unshare_unmount(OP_MOUNT, argc, argv)); +} + +/* + * zfs unshare -a + * zfs unshare filesystem + * + * Unshare all filesystems, or a specific ZFS filesystem. 
+ */ +static int +zfs_do_unshare(int argc, char **argv) +{ + return (unshare_unmount(OP_SHARE, argc, argv)); +} + +static int +find_command_idx(char *command, int *idx) +{ + int i; + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + continue; + + if (strcmp(command, command_table[i].name) == 0) { + *idx = i; + return (0); + } + } + return (1); +} + +static int +zfs_do_diff(int argc, char **argv) +{ + zfs_handle_t *zhp; + int flags = 0; + char *tosnap = NULL; + char *fromsnap = NULL; + char *atp, *copy; + int err = 0; + int c; + struct sigaction sa; + + while ((c = getopt(argc, argv, "FHt")) != -1) { + switch (c) { + case 'F': + flags |= ZFS_DIFF_CLASSIFY; + break; + case 'H': + flags |= ZFS_DIFF_PARSEABLE; + break; + case 't': + flags |= ZFS_DIFF_TIMESTAMP; + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, + gettext("must provide at least one snapshot name\n")); + usage(B_FALSE); + } + + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + fromsnap = argv[0]; + tosnap = (argc == 2) ? 
argv[1] : NULL; + + copy = NULL; + if (*fromsnap != '@') + copy = strdup(fromsnap); + else if (tosnap) + copy = strdup(tosnap); + if (copy == NULL) + usage(B_FALSE); + + if ((atp = strchr(copy, '@')) != NULL) + *atp = '\0'; + + if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { + free(copy); + return (1); + } + free(copy); + + /* + * Ignore SIGPIPE so that the library can give us + * information on any failure + */ + if (sigemptyset(&sa.sa_mask) == -1) { + err = errno; + goto out; + } + sa.sa_flags = 0; + sa.sa_handler = SIG_IGN; + if (sigaction(SIGPIPE, &sa, NULL) == -1) { + err = errno; + goto out; + } + + err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); +out: + zfs_close(zhp); + + return (err != 0); +} + +/* + * zfs bookmark <fs@source>|<fs#source> <fs#bookmark> + * + * Creates a bookmark with the given name from the source snapshot + * or creates a copy of an existing source bookmark. + */ +static int +zfs_do_bookmark(int argc, char **argv) +{ + char *source, *bookname; + char expbuf[ZFS_MAX_DATASET_NAME_LEN]; + int source_type; + nvlist_t *nvl; + int ret = 0; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing bookmark argument\n")); + goto usage; + } + + source = argv[0]; + bookname = argv[1]; + + if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { + (void) fprintf(stderr, + gettext("invalid source name '%s': " + "must contain a '@' or '#'\n"), source); + goto usage; + } + if (strchr(bookname, '#') == NULL) { + (void) fprintf(stderr, + gettext("invalid bookmark name '%s': " + "must contain a '#'\n"), bookname); + goto usage; + } + + /* + * 
expand source or bookname to full path: + * one of them may be specified as short name + */ + { + char **expand; + char *source_short, *bookname_short; + source_short = strpbrk(source, "@#"); + bookname_short = strpbrk(bookname, "#"); + if (source_short == source && + bookname_short == bookname) { + (void) fprintf(stderr, gettext( + "either source or bookmark must be specified as " + "full dataset paths")); + goto usage; + } else if (source_short != source && + bookname_short != bookname) { + expand = NULL; + } else if (source_short != source) { + strlcpy(expbuf, source, sizeof (expbuf)); + expand = &bookname; + } else if (bookname_short != bookname) { + strlcpy(expbuf, bookname, sizeof (expbuf)); + expand = &source; + } else { + abort(); + } + if (expand != NULL) { + *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ + (void) strlcat(expbuf, *expand, sizeof (expbuf)); + *expand = expbuf; + } + } + + /* determine source type */ + switch (*strpbrk(source, "@#")) { + case '@': source_type = ZFS_TYPE_SNAPSHOT; break; + case '#': source_type = ZFS_TYPE_BOOKMARK; break; + default: abort(); + } + + /* test the source exists */ + zfs_handle_t *zhp; + zhp = zfs_open(g_zfs, source, source_type); + if (zhp == NULL) + goto usage; + zfs_close(zhp); + + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, bookname, source); + ret = lzc_bookmark(nvl, NULL); + fnvlist_free(nvl); + + if (ret != 0) { + const char *err_msg = NULL; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create bookmark '%s'"), bookname); + + switch (ret) { + case EXDEV: + err_msg = "bookmark is in a different pool"; + break; + case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: + err_msg = "source is not an ancestor of the " + "new bookmark's dataset"; + break; + case EEXIST: + err_msg = "bookmark exists"; + break; + case EINVAL: + err_msg = "invalid argument"; + break; + case ENOTSUP: + err_msg = "bookmark feature not enabled"; + break; + case ENOSPC: + err_msg = 
"out of space"; + break; + case ENOENT: + err_msg = "dataset does not exist"; + break; + default: + (void) zfs_standard_error(g_zfs, ret, errbuf); + break; + } + if (err_msg != NULL) { + (void) fprintf(stderr, "%s: %s\n", errbuf, + dgettext(TEXT_DOMAIN, err_msg)); + } + } + + return (ret != 0); + +usage: + usage(B_FALSE); + return (-1); +} + +static int +zfs_do_channel_program(int argc, char **argv) +{ + int ret, fd, c; + char *progbuf, *filename, *poolname; + size_t progsize, progread; + nvlist_t *outnvl = NULL; + uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; + uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; + boolean_t sync_flag = B_TRUE, json_output = B_FALSE; + zpool_handle_t *zhp; + + /* check options */ + while ((c = getopt(argc, argv, "nt:m:j")) != -1) { + switch (c) { + case 't': + case 'm': { + uint64_t arg; + char *endp; + + errno = 0; + arg = strtoull(optarg, &endp, 0); + if (errno != 0 || *endp != '\0') { + (void) fprintf(stderr, gettext( + "invalid argument " + "'%s': expected integer\n"), optarg); + goto usage; + } + + if (c == 't') { + instrlimit = arg; + } else { + ASSERT3U(c, ==, 'm'); + memlimit = arg; + } + break; + } + case 'n': { + sync_flag = B_FALSE; + break; + } + case 'j': { + json_output = B_TRUE; + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, + gettext("invalid number of arguments\n")); + goto usage; + } + + poolname = argv[0]; + filename = argv[1]; + if (strcmp(filename, "-") == 0) { + fd = 0; + filename = "standard input"; + } else if ((fd = open(filename, O_RDONLY)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), + filename, strerror(errno)); + return (1); + } + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), + poolname); + if (fd != 0) + (void) close(fd); + return (1); + } + zpool_close(zhp); + + /* + * 
Read in the channel program, expanding the program buffer as + * necessary. + */ + progread = 0; + progsize = 1024; + progbuf = safe_malloc(progsize); + do { + ret = read(fd, progbuf + progread, progsize - progread); + progread += ret; + if (progread == progsize && ret > 0) { + progsize *= 2; + progbuf = safe_realloc(progbuf, progsize); + } + } while (ret > 0); + + if (fd != 0) + (void) close(fd); + if (ret < 0) { + free(progbuf); + (void) fprintf(stderr, + gettext("cannot read '%s': %s\n"), + filename, strerror(errno)); + return (1); + } + progbuf[progread] = '\0'; + + /* + * Any remaining arguments are passed as arguments to the lua script as + * a string array: + * { + * "argv" -> [ "arg 1", ... "arg n" ], + * } + */ + nvlist_t *argnvl = fnvlist_alloc(); + fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); + + if (sync_flag) { + ret = lzc_channel_program(poolname, progbuf, + instrlimit, memlimit, argnvl, &outnvl); + } else { + ret = lzc_channel_program_nosync(poolname, progbuf, + instrlimit, memlimit, argnvl, &outnvl); + } + + if (ret != 0) { + /* + * On error, report the error message handed back by lua if one + * exists. Otherwise, generate an appropriate error message, + * falling back on strerror() for an unexpected return code. 
+ */ + char *errstring = NULL; + const char *msg = gettext("Channel program execution failed"); + uint64_t instructions = 0; + if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { + (void) nvlist_lookup_string(outnvl, + ZCP_RET_ERROR, &errstring); + if (errstring == NULL) + errstring = strerror(ret); + if (ret == ETIME) { + (void) nvlist_lookup_uint64(outnvl, + ZCP_ARG_INSTRLIMIT, &instructions); + } + } else { + switch (ret) { + case EINVAL: + errstring = + "Invalid instruction or memory limit."; + break; + case ENOMEM: + errstring = "Return value too large."; + break; + case ENOSPC: + errstring = "Memory limit exhausted."; + break; + case ETIME: + errstring = "Timed out."; + break; + case EPERM: + errstring = "Permission denied. Channel " + "programs must be run as root."; + break; + default: + (void) zfs_standard_error(g_zfs, ret, msg); + } + } + if (errstring != NULL) + (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); + + if (ret == ETIME && instructions != 0) + (void) fprintf(stderr, + gettext("%llu Lua instructions\n"), + (u_longlong_t)instructions); + } else { + if (json_output) { + (void) nvlist_print_json(stdout, outnvl); + } else if (nvlist_empty(outnvl)) { + (void) fprintf(stdout, gettext("Channel program fully " + "executed and did not produce output.\n")); + } else { + (void) fprintf(stdout, gettext("Channel program fully " + "executed and produced output:\n")); + dump_nvlist(outnvl, 4); + } + } + + free(progbuf); + fnvlist_free(outnvl); + fnvlist_free(argnvl); + return (ret != 0); + +usage: + usage(B_FALSE); + return (-1); +} + + +typedef struct loadkey_cbdata { + boolean_t cb_loadkey; + boolean_t cb_recursive; + boolean_t cb_noop; + char *cb_keylocation; + uint64_t cb_numfailed; + uint64_t cb_numattempted; +} loadkey_cbdata_t; + +static int +load_key_callback(zfs_handle_t *zhp, void *data) +{ + int ret; + boolean_t is_encroot; + loadkey_cbdata_t *cb = data; + uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + + /* + * If 
we are working recursively, we want to skip loading / unloading + * keys for non-encryption roots and datasets whose keys are already + * in the desired end-state. + */ + if (cb->cb_recursive) { + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + if (ret != 0) + return (ret); + if (!is_encroot) + return (0); + + if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || + (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) + return (0); + } + + cb->cb_numattempted++; + + if (cb->cb_loadkey) + ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); + else + ret = zfs_crypto_unload_key(zhp); + + if (ret != 0) { + cb->cb_numfailed++; + return (ret); + } + + return (0); +} + +static int +load_unload_keys(int argc, char **argv, boolean_t loadkey) +{ + int c, ret = 0, flags = 0; + boolean_t do_all = B_FALSE; + loadkey_cbdata_t cb = { 0 }; + + cb.cb_loadkey = loadkey; + + while ((c = getopt(argc, argv, "anrL:")) != -1) { + /* noop and alternate keylocations only apply to zfs load-key */ + if (loadkey) { + switch (c) { + case 'n': + cb.cb_noop = B_TRUE; + continue; + case 'L': + cb.cb_keylocation = optarg; + continue; + default: + break; + } + } + + switch (c) { + case 'a': + do_all = B_TRUE; + cb.cb_recursive = B_TRUE; + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + cb.cb_recursive = B_TRUE; + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (!do_all && argc == 0) { + (void) fprintf(stderr, + gettext("Missing dataset argument or -a option\n")); + usage(B_FALSE); + } + + if (do_all && argc != 0) { + (void) fprintf(stderr, + gettext("Cannot specify dataset with -a option\n")); + usage(B_FALSE); + } + + if (cb.cb_recursive && cb.cb_keylocation != NULL && + strcmp(cb.cb_keylocation, "prompt") != 0) { + (void) fprintf(stderr, gettext("alternate keylocation may only " + "be 'prompt' with -r or -a\n")); + usage(B_FALSE); + } + + 
ret = zfs_for_each(argc, argv, flags, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, + load_key_callback, &cb); + + if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { + (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), + (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), + (u_longlong_t)cb.cb_numattempted, + loadkey ? (cb.cb_noop ? "verified" : "loaded") : + "unloaded"); + } + + if (cb.cb_numfailed != 0) + ret = -1; + + return (ret); +} + +static int +zfs_do_load_key(int argc, char **argv) +{ + return (load_unload_keys(argc, argv, B_TRUE)); +} + + +static int +zfs_do_unload_key(int argc, char **argv) +{ + return (load_unload_keys(argc, argv, B_FALSE)); +} + +static int +zfs_do_change_key(int argc, char **argv) +{ + int c, ret; + uint64_t keystatus; + boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; + zfs_handle_t *zhp = NULL; + nvlist_t *props = fnvlist_alloc(); + + while ((c = getopt(argc, argv, "lio:")) != -1) { + switch (c) { + case 'l': + loadkey = B_TRUE; + break; + case 'i': + inheritkey = B_TRUE; + break; + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(props); + return (1); + } + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + if (inheritkey && !nvlist_empty(props)) { + (void) fprintf(stderr, + gettext("Properties not allowed for inheriting\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing dataset argument\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("Too many arguments\n")); + usage(B_FALSE); + } + + zhp = zfs_open(g_zfs, argv[argc - 1], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + + if (loadkey) { + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { + ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); + if (ret != 0) { + nvlist_free(props); 
+ zfs_close(zhp); + return (-1); + } + } + + /* refresh the properties so the new keystatus is visible */ + zfs_refresh_properties(zhp); + } + + ret = zfs_crypto_rewrap(zhp, props, inheritkey); + if (ret != 0) { + nvlist_free(props); + zfs_close(zhp); + return (-1); + } + + nvlist_free(props); + zfs_close(zhp); + return (0); +} + +/* + * 1) zfs project [-d|-r] <file|directory ...> + * List project ID and inherit flag of file(s) or directories. + * -d: List the directory itself, not its children. + * -r: List subdirectories recursively. + * + * 2) zfs project -C [-k] [-r] <file|directory ...> + * Clear project inherit flag and/or ID on the file(s) or directories. + * -k: Keep the project ID unchanged. If not specified, the project ID + * will be reset as zero. + * -r: Clear on subdirectories recursively. + * + * 3) zfs project -c [-0] [-d|-r] [-p id] <file|directory ...> + * Check project ID and inherit flag on the file(s) or directories, + * report the outliers. + * -0: Print file name followed by a NUL instead of newline. + * -d: Check the directory itself, not its children. + * -p: Specify the referenced ID for comparing with the target file(s) + * or directories' project IDs. If not specified, the target (top) + * directory's project ID will be used as the referenced one. + * -r: Check subdirectories recursively. + * + * 4) zfs project [-p id] [-r] [-s] <file|directory ...> + * Set project ID and/or inherit flag on the file(s) or directories. + * -p: Set the project ID as the given id. + * -r: Set on subdirectories recursively. If not specify "-p" option, + * it will use top-level directory's project ID as the given id, + * then set both project ID and inherit flag on all descendants + * of the top-level directory. + * -s: Set project inherit flag. 
+ */ +static int +zfs_do_project(int argc, char **argv) +{ + zfs_project_control_t zpc = { + .zpc_expected_projid = ZFS_INVALID_PROJID, + .zpc_op = ZFS_PROJECT_OP_DEFAULT, + .zpc_dironly = B_FALSE, + .zpc_keep_projid = B_FALSE, + .zpc_newline = B_TRUE, + .zpc_recursive = B_FALSE, + .zpc_set_flag = B_FALSE, + }; + int ret = 0, c; + + if (argc < 2) + usage(B_FALSE); + + while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { + switch (c) { + case '0': + zpc.zpc_newline = B_FALSE; + break; + case 'C': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; + break; + case 'c': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_op = ZFS_PROJECT_OP_CHECK; + break; + case 'd': + zpc.zpc_dironly = B_TRUE; + /* overwrite "-r" option */ + zpc.zpc_recursive = B_FALSE; + break; + case 'k': + zpc.zpc_keep_projid = B_TRUE; + break; + case 'p': { + char *endptr; + + errno = 0; + zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("project ID must be less than " + "%u\n"), UINT32_MAX); + usage(B_FALSE); + } + if (zpc.zpc_expected_projid >= UINT32_MAX) { + (void) fprintf(stderr, + gettext("invalid project ID\n")); + usage(B_FALSE); + } + break; + } + case 'r': + zpc.zpc_recursive = B_TRUE; + /* overwrite "-d" option */ + zpc.zpc_dironly = B_FALSE; + break; + case 's': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_set_flag = B_TRUE; + zpc.zpc_op = ZFS_PROJECT_OP_SET; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { + if 
(zpc.zpc_expected_projid != ZFS_INVALID_PROJID) + zpc.zpc_op = ZFS_PROJECT_OP_SET; + else + zpc.zpc_op = ZFS_PROJECT_OP_LIST; + } + + switch (zpc.zpc_op) { + case ZFS_PROJECT_OP_LIST: + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_CHECK: + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_CLEAR: + if (zpc.zpc_dironly) { + (void) fprintf(stderr, + gettext("'-d' is useless together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { + (void) fprintf(stderr, + gettext("'-p' is useless together with '-C'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_SET: + if (zpc.zpc_dironly) { + (void) fprintf(stderr, + gettext("'-d' is useless for set project ID and/or " + "inherit flag\n")); + usage(B_FALSE); + } + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + break; + default: + ASSERT(0); + break; + } + + argv += optind; + argc -= optind; + if (argc == 0) { + (void) fprintf(stderr, + gettext("missing file or directory target(s)\n")); + usage(B_FALSE); + } + + for (int i = 0; i < argc; i++) { + int err; + + err = zfs_project_handle(argv[i], &zpc); + if (err && !ret) + ret = err; + } + + return (ret); +} + +static int +zfs_do_wait(int argc, char **argv) +{ + boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; + int error, i; + char c; + + /* By default, 
wait for all types of activity. */ + for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) + enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "t:")) != -1) { + switch (c) { + case 't': + { + static char *col_subopts[] = { "deleteq", NULL }; + char *value; + + /* Reset activities array */ + bzero(&enabled, sizeof (enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argv += optind; + argc -= optind; + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'filesystem' " + "argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!enabled[i]) + continue; + + error = zfs_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zfs_close(zhp); + + return (error); +} + +/* + * Display version message + */ +static int +zfs_do_version(int argc, char **argv) +{ + if (zfs_version_print() == -1) + return (1); + + return (0); +} + +int +main(int argc, char **argv) +{ + int ret = 0; + int i = 0; + char *cmdname; + char **newargv; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + opterr = 0; + + /* + * Make sure the user has specified some command. 
+ */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing command\n")); + usage(B_FALSE); + } + + cmdname = argv[1]; + + /* + * The 'umount' command is an alias for 'unmount' + */ + if (strcmp(cmdname, "umount") == 0) + cmdname = "unmount"; + + /* + * The 'recv' command is an alias for 'receive' + */ + if (strcmp(cmdname, "recv") == 0) + cmdname = "receive"; + + /* + * The 'snap' command is an alias for 'snapshot' + */ + if (strcmp(cmdname, "snap") == 0) + cmdname = "snapshot"; + + /* + * Special case '-?' + */ + if ((strcmp(cmdname, "-?") == 0) || + (strcmp(cmdname, "--help") == 0)) + usage(B_TRUE); + + /* + * Special case '-V|--version' + */ + if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) + return (zfs_do_version(argc, argv)); + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + mnttab_file = g_zfs->libzfs_mnttab; + + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); + + libzfs_print_on_error(g_zfs, B_TRUE); + + /* + * Many commands modify input strings for string parsing reasons. + * We create a copy to protect the original argv. + */ + newargv = malloc((argc + 1) * sizeof (newargv[0])); + for (i = 0; i < argc; i++) + newargv[i] = strdup(argv[i]); + newargv[argc] = NULL; + + /* + * Run the appropriate command. 
+ */ + libzfs_mnttab_cache(g_zfs, B_TRUE); + if (find_command_idx(cmdname, &i) == 0) { + current_command = &command_table[i]; + ret = command_table[i].func(argc - 1, newargv + 1); + } else if (strchr(cmdname, '=') != NULL) { + verify(find_command_idx("set", &i) == 0); + current_command = &command_table[i]; + ret = command_table[i].func(argc, newargv); + } else { + (void) fprintf(stderr, gettext("unrecognized " + "command '%s'\n"), cmdname); + usage(B_FALSE); + ret = 1; + } + + for (i = 0; i < argc; i++) + free(newargv[i]); + free(newargv); + + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + + libzfs_fini(g_zfs); + + /* + * The 'ZFS_ABORT' environment variable causes us to dump core on exit + * for the purposes of running ::findleaks. + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + return (ret); +} + +#ifdef __FreeBSD__ +#include <sys/jail.h> +#include <jail.h> +/* + * Attach/detach the given dataset to/from the given jail + */ +/* ARGSUSED */ +static int +zfs_do_jail_impl(int argc, char **argv, boolean_t attach) +{ + zfs_handle_t *zhp; + int jailid, ret; + + /* check number of arguments */ + if (argc < 3) { + (void) fprintf(stderr, gettext("missing argument(s)\n")); + usage(B_FALSE); + } + if (argc > 3) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + jailid = jail_getid(argv[1]); + if (jailid < 0) { + (void) fprintf(stderr, gettext("invalid jail id or name\n")); + usage(B_FALSE); + } + + zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + ret = (zfs_jail(zhp, jailid, attach) != 0); + + zfs_close(zhp); + return (ret); +} + +/* + * zfs jail jailid filesystem + * + * Attach the given dataset to the given jail + */ +/* ARGSUSED */ +static int +zfs_do_jail(int argc, char **argv) +{ + return (zfs_do_jail_impl(argc, argv, B_TRUE)); +} + +/* + * zfs unjail jailid filesystem + * + * Detach the given dataset 
from the given jail + */ +/* ARGSUSED */ +static int +zfs_do_unjail(int argc, char **argv) +{ + return (zfs_do_jail_impl(argc, argv, B_FALSE)); +} +#endif diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_project.c b/sys/contrib/openzfs/cmd/zfs/zfs_project.c new file mode 100644 index 000000000000..341cc005de48 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_project.c @@ -0,0 +1,295 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. 
+ */ + +#include <errno.h> +#include <getopt.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> +#include <fcntl.h> +#include <dirent.h> +#include <stddef.h> +#include <libintl.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/list.h> +#include <sys/zfs_project.h> + +#include "zfs_util.h" +#include "zfs_projectutil.h" + +typedef struct zfs_project_item { + list_node_t zpi_list; + char zpi_name[0]; +} zfs_project_item_t; + +static void +zfs_project_item_alloc(list_t *head, const char *name) +{ + zfs_project_item_t *zpi; + + zpi = safe_malloc(sizeof (zfs_project_item_t) + strlen(name) + 1); + strcpy(zpi->zpi_name, name); + list_insert_tail(head, zpi); +} + +static int +zfs_project_sanity_check(const char *name, zfs_project_control_t *zpc, + struct stat *st) +{ + int ret; + + ret = stat(name, st); + if (ret) { + (void) fprintf(stderr, gettext("failed to stat %s: %s\n"), + name, strerror(errno)); + return (ret); + } + + if (!S_ISREG(st->st_mode) && !S_ISDIR(st->st_mode)) { + (void) fprintf(stderr, gettext("only support project quota on " + "regular file or directory\n")); + return (-1); + } + + if (!S_ISDIR(st->st_mode)) { + if (zpc->zpc_dironly) { + (void) fprintf(stderr, gettext( + "'-d' option on non-dir target %s\n"), name); + return (-1); + } + + if (zpc->zpc_recursive) { + (void) fprintf(stderr, gettext( + "'-r' option on non-dir target %s\n"), name); + return (-1); + } + } + + return (0); +} + +static int +zfs_project_load_projid(const char *name, zfs_project_control_t *zpc) +{ + zfsxattr_t fsx; + int ret, fd; + + fd = open(name, O_RDONLY | O_NOCTTY); + if (fd < 0) { + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + name, strerror(errno)); + return (fd); + } + + ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx); + if (ret) + (void) fprintf(stderr, + gettext("failed to get xattr for %s: %s\n"), + name, strerror(errno)); + else + zpc->zpc_expected_projid = fsx.fsx_projid; + + close(fd); + return (ret); +} + 
+static int +zfs_project_handle_one(const char *name, zfs_project_control_t *zpc) +{ + zfsxattr_t fsx; + int ret, fd; + + fd = open(name, O_RDONLY | O_NOCTTY); + if (fd < 0) { + if (errno == ENOENT && zpc->zpc_ignore_noent) + return (0); + + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + name, strerror(errno)); + return (fd); + } + + ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx); + if (ret) { + (void) fprintf(stderr, + gettext("failed to get xattr for %s: %s\n"), + name, strerror(errno)); + goto out; + } + + switch (zpc->zpc_op) { + case ZFS_PROJECT_OP_LIST: + (void) printf("%5u %c %s\n", fsx.fsx_projid, + (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) ? 'P' : '-', name); + goto out; + case ZFS_PROJECT_OP_CHECK: + if (fsx.fsx_projid == zpc->zpc_expected_projid && + fsx.fsx_xflags & ZFS_PROJINHERIT_FL) + goto out; + + if (!zpc->zpc_newline) { + char c = '\0'; + + (void) printf("%s%c", name, c); + goto out; + } + + if (fsx.fsx_projid != zpc->zpc_expected_projid) + (void) printf("%s - project ID is not set properly " + "(%u/%u)\n", name, fsx.fsx_projid, + (uint32_t)zpc->zpc_expected_projid); + + if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL)) + (void) printf("%s - project inherit flag is not set\n", + name); + + goto out; + case ZFS_PROJECT_OP_CLEAR: + if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL) && + (zpc->zpc_keep_projid || + fsx.fsx_projid == ZFS_DEFAULT_PROJID)) + goto out; + + fsx.fsx_xflags &= ~ZFS_PROJINHERIT_FL; + if (!zpc->zpc_keep_projid) + fsx.fsx_projid = ZFS_DEFAULT_PROJID; + break; + case ZFS_PROJECT_OP_SET: + if (fsx.fsx_projid == zpc->zpc_expected_projid && + (!zpc->zpc_set_flag || fsx.fsx_xflags & ZFS_PROJINHERIT_FL)) + goto out; + + fsx.fsx_projid = zpc->zpc_expected_projid; + if (zpc->zpc_set_flag) + fsx.fsx_xflags |= ZFS_PROJINHERIT_FL; + break; + default: + ASSERT(0); + break; + } + + ret = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx); + if (ret) + (void) fprintf(stderr, + gettext("failed to set xattr for %s: %s\n"), + name, strerror(errno)); + +out: + 
close(fd); + return (ret); +} + +static int +zfs_project_handle_dir(const char *name, zfs_project_control_t *zpc, + list_t *head) +{ + char fullname[PATH_MAX]; + struct dirent *ent; + DIR *dir; + int ret = 0; + + dir = opendir(name); + if (dir == NULL) { + if (errno == ENOENT && zpc->zpc_ignore_noent) + return (0); + + ret = -errno; + (void) fprintf(stderr, gettext("failed to opendir %s: %s\n"), + name, strerror(errno)); + return (ret); + } + + /* Non-top item, ignore the case of being removed or renamed by race. */ + zpc->zpc_ignore_noent = B_TRUE; + errno = 0; + while (!ret && (ent = readdir(dir)) != NULL) { + /* skip "." and ".." */ + if (strcmp(ent->d_name, ".") == 0 || + strcmp(ent->d_name, "..") == 0) + continue; + + if (strlen(ent->d_name) + strlen(name) >= + sizeof (fullname) + 1) { + errno = ENAMETOOLONG; + break; + } + + sprintf(fullname, "%s/%s", name, ent->d_name); + ret = zfs_project_handle_one(fullname, zpc); + if (!ret && zpc->zpc_recursive && ent->d_type == DT_DIR) + zfs_project_item_alloc(head, fullname); + } + + if (errno && !ret) { + ret = -errno; + (void) fprintf(stderr, gettext("failed to readdir %s: %s\n"), + name, strerror(errno)); + } + + closedir(dir); + return (ret); +} + +int +zfs_project_handle(const char *name, zfs_project_control_t *zpc) +{ + zfs_project_item_t *zpi; + struct stat st; + list_t head; + int ret; + + ret = zfs_project_sanity_check(name, zpc, &st); + if (ret) + return (ret); + + if ((zpc->zpc_op == ZFS_PROJECT_OP_SET || + zpc->zpc_op == ZFS_PROJECT_OP_CHECK) && + zpc->zpc_expected_projid == ZFS_INVALID_PROJID) { + ret = zfs_project_load_projid(name, zpc); + if (ret) + return (ret); + } + + zpc->zpc_ignore_noent = B_FALSE; + ret = zfs_project_handle_one(name, zpc); + if (ret || !S_ISDIR(st.st_mode) || zpc->zpc_dironly || + (!zpc->zpc_recursive && + zpc->zpc_op != ZFS_PROJECT_OP_LIST && + zpc->zpc_op != ZFS_PROJECT_OP_CHECK)) + return (ret); + + list_create(&head, sizeof (zfs_project_item_t), + offsetof(zfs_project_item_t, 
zpi_list)); + zfs_project_item_alloc(&head, name); + while ((zpi = list_remove_head(&head)) != NULL) { + if (!ret) + ret = zfs_project_handle_dir(zpi->zpi_name, zpc, &head); + free(zpi); + } + + return (ret); +} diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h new file mode 100644 index 000000000000..1792a3383a03 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. 
+ */ + +#ifndef _ZFS_PROJECTUTIL_H +#define _ZFS_PROJECTUTIL_H + +typedef enum { + ZFS_PROJECT_OP_DEFAULT = 0, + ZFS_PROJECT_OP_LIST = 1, + ZFS_PROJECT_OP_CHECK = 2, + ZFS_PROJECT_OP_CLEAR = 3, + ZFS_PROJECT_OP_SET = 4, +} zfs_project_ops_t; + +typedef struct zfs_project_control { + uint64_t zpc_expected_projid; + zfs_project_ops_t zpc_op; + boolean_t zpc_dironly; + boolean_t zpc_ignore_noent; + boolean_t zpc_keep_projid; + boolean_t zpc_newline; + boolean_t zpc_recursive; + boolean_t zpc_set_flag; +} zfs_project_control_t; + +int zfs_project_handle(const char *name, zfs_project_control_t *zpc); + +#endif /* _ZFS_PROJECTUTIL_H */ diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_util.h b/sys/contrib/openzfs/cmd/zfs/zfs_util.h new file mode 100644 index 000000000000..a56af59adb15 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs/zfs_util.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _ZFS_UTIL_H +#define _ZFS_UTIL_H + +#include <libzfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +void * safe_malloc(size_t size); +void nomem(void); +extern libzfs_handle_t *g_zfs; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_UTIL_H */ diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore new file mode 100644 index 000000000000..f95f853e48c2 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore @@ -0,0 +1 @@ +zfs_ids_to_path diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am new file mode 100644 index 000000000000..176eeb3c72c5 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am @@ -0,0 +1,9 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zfs_ids_to_path + +zfs_ids_to_path_SOURCES = \ + zfs_ids_to_path.c + +zfs_ids_to_path_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c new file mode 100644 index 000000000000..6cfaa6f41fa5 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ +#include <libintl.h> +#include <unistd.h> +#include <sys/types.h> +#include <stdint.h> +#include <libzfs.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> + +libzfs_handle_t *g_zfs; + +static void +usage(int err) +{ + fprintf(stderr, "Usage: [-v] zfs_ids_to_path <pool> <objset id> " + "<object id>\n"); + exit(err); +} + +int +main(int argc, char **argv) +{ + boolean_t verbose = B_FALSE; + char c; + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose = B_TRUE; + break; + } + } + argc -= optind; + argv += optind; + + if (argc != 3) { + (void) fprintf(stderr, "Incorrect number of arguments: %d\n", + argc); + usage(1); + } + + uint64_t objset, object; + if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) { + (void) fprintf(stderr, "Invalid objset id: %s\n", argv[2]); + usage(2); + } + if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) { + (void) fprintf(stderr, "Invalid object id: %s\n", argv[3]); + usage(3); + } + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (4); + } + zpool_handle_t *pool = zpool_open(g_zfs, argv[0]); + if (pool == NULL) { + fprintf(stderr, "Could not open pool %s\n", argv[1]); + libzfs_fini(g_zfs); + return (5); + } + + char pathname[PATH_MAX * 2]; + if (verbose) { + zpool_obj_to_path_ds(pool, objset, object, pathname, + sizeof (pathname)); + } else { + zpool_obj_to_path(pool, objset, object, pathname, + sizeof (pathname)); + } + printf("%s\n", pathname); + zpool_close(pool); + libzfs_fini(g_zfs); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am new 
file mode 100644
index 000000000000..69c99ca9d828
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zgenhostid
diff --git a/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
new file mode 100755
index 000000000000..8b468740c72b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Emulate genhostid(1) available on RHEL/CENTOS, for use on distros
+# which do not provide that utility.
+#
+# Usage:
+# zgenhostid
+# zgenhostid <value>
+#
+# If /etc/hostid already exists and is size > 0, the script exits immediately
+# and changes nothing. Unlike genhostid, this generates an error message.
+#
+# The first form generates a random hostid and stores it in /etc/hostid.
+# The second form checks that the provided value is between 0x1 and 0xFFFFFFFF
+# and if so, stores it in /etc/hostid. This form is not supported by
+# genhostid(1).
+
+hostid_file=/etc/hostid
+
+function usage {
+	echo "$0 [value]" >&2
+	echo "If $hostid_file is not present, store a hostid in it." >&2
+	echo "The optional value must be an 8-digit hex number between" >&2
+	echo "1 and 2^32-1. If no value is provided, a random one will" >&2
+	echo "be generated. The value must be unique among your systems." >&2
+}
+
+# hostid(1) ignores contents of /etc/hostid if size < 4 bytes. It would
+# be better if this checked size >= 4 bytes, but the method must be
+# widely portable.
+if [ -s "$hostid_file" ]; then
+	echo "$hostid_file already exists. No change made." >&2
+	exit 1
+fi
+
+if [ -n "$1" ]; then
+	host_id=$1
+else
+	# $RANDOM goes from 0..32k-1
+	number=$((((RANDOM % 4) * 32768 + RANDOM) * 32768 + RANDOM))
+	host_id=$(printf "%08x" "$number")
+fi
+
+if [[ $host_id =~ ^0{8}$ ]]; then
+	usage
+	exit 2
+fi
+
+if ! [[ $host_id =~ ^[a-fA-F0-9]{8}$ ]]; then
+	usage
+	exit 3
+fi
+
+a=${host_id:6:2}
+b=${host_id:4:2}
+c=${host_id:2:2}
+d=${host_id:0:2}
+
+printf "\\x$a\\x$b\\x$c\\x$d" > "$hostid_file"
+
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zhack/.gitignore b/sys/contrib/openzfs/cmd/zhack/.gitignore
new file mode 100644
index 000000000000..763a18898b88
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/.gitignore
@@ -0,0 +1 @@
+/zhack
diff --git a/sys/contrib/openzfs/cmd/zhack/Makefile.am b/sys/contrib/openzfs/cmd/zhack/Makefile.am
new file mode 100644
index 000000000000..5cddac32b5ac
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/Makefile.am
@@ -0,0 +1,14 @@
+include $(top_srcdir)/config/Rules.am
+
+# Unconditionally enable debugging for zhack
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = zhack
+
+zhack_SOURCES = \
+	zhack.c
+
+zhack_LDADD = \
+	$(abs_top_builddir)/lib/libzpool/libzpool.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zhack/zhack.c b/sys/contrib/openzfs/cmd/zhack/zhack.c
new file mode 100644
index 000000000000..4d958fe4365a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/zhack.c
@@ -0,0 +1,532 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+/*
+ * zhack is a debugging tool that can write changes to ZFS pool using libzpool
+ * for testing purposes. Altering pools with zhack is unsupported and may
+ * result in corrupted pools.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_synctask.h>
+#include <sys/vdev.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
+#include <libzutil.h>
+
+extern boolean_t zfeature_checks_disable;
+
+/* Program name used as the prefix of usage and error output. */
+const char cmdname[] = "zhack";
+/* Import options collected from the command line (-c / -d). */
+static importargs_t g_importargs;
+/* Name of the pool being operated on. */
+static char *g_pool;
+/* Whether the pool was imported read-only. */
+static boolean_t g_readonly;
+
+/*
+ * Write the full synopsis to stderr in one call, then exit with status 1.
+ * Only the first line carries a conversion (the program name).
+ */
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
+	    "where <subcommand> <args> is one of the following:\n"
+	    "\n"
+	    " feature stat <pool>\n"
+	    " print information about enabled features\n"
+	    " feature enable [-r] [-d desc] <pool> <feature>\n"
+	    " add a new enabled feature to the pool\n"
+	    " -d <desc> sets the feature's description\n"
+	    " -r set read-only compatible flag for feature\n"
+	    " feature ref [-md] <pool> <feature>\n"
+	    " change the refcount on the given feature\n"
+	    " -d decrease instead of increase the refcount\n"
+	    " -m add the feature to the label if increasing refcount\n"
+	    "\n"
+	    " <feature> : should be a feature guid\n",
+	    cmdname);
+	exit(1);
+}
+
+
+static void +fatal(spa_t *spa, void *tag, const char *fmt, ...) +{ + va_list ap; + + if (spa != NULL) { + spa_close(spa, tag); + (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE); + } + + va_start(ap, fmt); + (void) fprintf(stderr, "%s: ", cmdname); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); + (void) fprintf(stderr, "\n"); + + exit(1); +} + +/* ARGSUSED */ +static int +space_delta_cb(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) +{ + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (ENOENT); + (void) fprintf(stderr, "modifying object that needs user accounting"); + abort(); + /* NOTREACHED */ +} + +/* + * Target is the dataset whose pool we want to open. + */ +static void +zhack_import(char *target, boolean_t readonly) +{ + nvlist_t *config; + nvlist_t *props; + int error; + + kernel_init(readonly ? SPA_MODE_READ : + (SPA_MODE_READ | SPA_MODE_WRITE)); + + dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); + + g_readonly = readonly; + g_importargs.can_be_active = readonly; + g_pool = strdup(target); + + error = zpool_find_config(NULL, target, &config, &g_importargs, + &libzpool_config_ops); + if (error) + fatal(NULL, FTAG, "cannot import '%s'", target); + + props = NULL; + if (readonly) { + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); + } + + zfeature_checks_disable = B_TRUE; + error = spa_import(target, config, props, + (readonly ? 
ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); + zfeature_checks_disable = B_FALSE; + if (error == EEXIST) + error = 0; + + if (error) + fatal(NULL, FTAG, "can't import '%s': %s", target, + strerror(error)); +} + +static void +zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa) +{ + int err; + + zhack_import(target, readonly); + + zfeature_checks_disable = B_TRUE; + err = spa_open(target, spa, tag); + zfeature_checks_disable = B_FALSE; + + if (err != 0) + fatal(*spa, FTAG, "cannot open '%s': %s", target, + strerror(err)); + if (spa_version(*spa) < SPA_VERSION_FEATURES) { + fatal(*spa, FTAG, "'%s' has version %d, features not enabled", + target, (int)spa_version(*spa)); + } +} + +static void +dump_obj(objset_t *os, uint64_t obj, const char *name) +{ + zap_cursor_t zc; + zap_attribute_t za; + + (void) printf("%s_obj:\n", name); + + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (za.za_integer_length == 8) { + ASSERT(za.za_num_integers == 1); + (void) printf("\t%s = %llu\n", + za.za_name, (u_longlong_t)za.za_first_integer); + } else { + ASSERT(za.za_integer_length == 1); + char val[1024]; + VERIFY(zap_lookup(os, obj, za.za_name, + 1, sizeof (val), val) == 0); + (void) printf("\t%s = %s\n", za.za_name, val); + } + } + zap_cursor_fini(&zc); +} + +static void +dump_mos(spa_t *spa) +{ + nvlist_t *nv = spa->spa_label_features; + nvpair_t *pair; + + (void) printf("label config:\n"); + for (pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; + pair = nvlist_next_nvpair(nv, pair)) { + (void) printf("\t%s\n", nvpair_name(pair)); + } +} + +static void +zhack_do_feature_stat(int argc, char **argv) +{ + spa_t *spa; + objset_t *os; + char *target; + + argc--; + argv++; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_TRUE, FTAG, &spa); + os = spa->spa_meta_objset; + + dump_obj(os, spa->spa_feat_for_read_obj, 
"for_read"); + dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); + dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); + } + dump_mos(spa); + + spa_close(spa, FTAG); +} + +static void +zhack_feature_enable_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; + + feature_enable_sync(spa, feature, tx); + + spa_history_log_internal(spa, "zhack enable feature", tx, + "name=%s flags=%u", + feature->fi_guid, feature->fi_flags); +} + +static void +zhack_do_feature_enable(int argc, char **argv) +{ + int c; + char *desc, *target; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; + + /* + * Features are not added to the pool's label until their refcounts + * are incremented, so fi_mos can just be left as false for now. + */ + desc = NULL; + feature.fi_uname = "zhack"; + feature.fi_flags = 0; + feature.fi_depends = nodeps; + feature.fi_feature = SPA_FEATURE_NONE; + + optind = 1; + while ((c = getopt(argc, argv, "+rd:")) != -1) { + switch (c) { + case 'r': + feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; + break; + case 'd': + desc = strdup(optarg); + break; + default: + usage(); + break; + } + } + + if (desc == NULL) + desc = strdup("zhack injected"); + feature.fi_desc = desc; + + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (zfeature_is_supported(feature.fi_guid)) + fatal(spa, FTAG, "'%s' is a real feature, will not enable"); + if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) + 
fatal(spa, FTAG, "feature already enabled: %s", + feature.fi_guid); + + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); + + spa_close(spa, FTAG); + + free(desc); +} + +static void +feature_incr_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; + uint64_t refcount; + + VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); + feature_sync(spa, feature, refcount + 1, tx); + spa_history_log_internal(spa, "zhack feature incr", tx, + "name=%s", feature->fi_guid); +} + +static void +feature_decr_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; + uint64_t refcount; + + VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); + feature_sync(spa, feature, refcount - 1, tx); + spa_history_log_internal(spa, "zhack feature decr", tx, + "name=%s", feature->fi_guid); +} + +static void +zhack_do_feature_ref(int argc, char **argv) +{ + int c; + char *target; + boolean_t decr = B_FALSE; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; + + /* + * fi_desc does not matter here because it was written to disk + * when the feature was enabled, but we need to properly set the + * feature for read or write based on the information we read off + * disk later. 
+ */ + feature.fi_uname = "zhack"; + feature.fi_flags = 0; + feature.fi_desc = NULL; + feature.fi_depends = nodeps; + feature.fi_feature = SPA_FEATURE_NONE; + + optind = 1; + while ((c = getopt(argc, argv, "+md")) != -1) { + switch (c) { + case 'm': + feature.fi_flags |= ZFEATURE_FLAG_MOS; + break; + case 'd': + decr = B_TRUE; + break; + default: + usage(); + break; + } + } + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, "error: missing feature or pool name\n"); + usage(); + } + target = argv[0]; + feature.fi_guid = argv[1]; + + if (!zfeature_is_valid_guid(feature.fi_guid)) + fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); + + zhack_spa_open(target, B_FALSE, FTAG, &spa); + mos = spa->spa_meta_objset; + + if (zfeature_is_supported(feature.fi_guid)) { + fatal(spa, FTAG, + "'%s' is a real feature, will not change refcount"); + } + + if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, + feature.fi_guid)) { + feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT; + } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, + feature.fi_guid)) { + feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; + } else { + fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid); + } + + if (decr) { + uint64_t count; + if (feature_get_refcount_from_disk(spa, &feature, + &count) == 0 && count == 0) { + fatal(spa, FTAG, "feature refcount already 0: %s", + feature.fi_guid); + } + } + + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + decr ? 
feature_decr_sync : feature_incr_sync, &feature, + 5, ZFS_SPACE_CHECK_NORMAL)); + + spa_close(spa, FTAG); +} + +static int +zhack_do_feature(int argc, char **argv) +{ + char *subcommand; + + argc--; + argv++; + if (argc == 0) { + (void) fprintf(stderr, + "error: no feature operation specified\n"); + usage(); + } + + subcommand = argv[0]; + if (strcmp(subcommand, "stat") == 0) { + zhack_do_feature_stat(argc, argv); + } else if (strcmp(subcommand, "enable") == 0) { + zhack_do_feature_enable(argc, argv); + } else if (strcmp(subcommand, "ref") == 0) { + zhack_do_feature_ref(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + return (0); +} + +#define MAX_NUM_PATHS 1024 + +int +main(int argc, char **argv) +{ + extern void zfs_prop_init(void); + + char *path[MAX_NUM_PATHS]; + const char *subcommand; + int rv = 0; + int c; + + g_importargs.path = path; + + dprintf_setup(&argc, argv); + zfs_prop_init(); + + while ((c = getopt(argc, argv, "+c:d:")) != -1) { + switch (c) { + case 'c': + g_importargs.cachefile = optarg; + break; + case 'd': + assert(g_importargs.paths < MAX_NUM_PATHS); + g_importargs.path[g_importargs.paths++] = optarg; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + optind = 1; + + if (argc == 0) { + (void) fprintf(stderr, "error: no command specified\n"); + usage(); + } + + subcommand = argv[0]; + + if (strcmp(subcommand, "feature") == 0) { + rv = zhack_do_feature(argc, argv); + } else { + (void) fprintf(stderr, "error: unknown subcommand: %s\n", + subcommand); + usage(); + } + + if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) { + fatal(NULL, FTAG, "pool export failed; " + "changes may not be committed to disk\n"); + } + + kernel_fini(); + + return (rv); +} diff --git a/sys/contrib/openzfs/cmd/zinject/.gitignore b/sys/contrib/openzfs/cmd/zinject/.gitignore new file mode 100644 index 000000000000..bded8400996c --- /dev/null +++ 
b/sys/contrib/openzfs/cmd/zinject/.gitignore @@ -0,0 +1 @@ +/zinject diff --git a/sys/contrib/openzfs/cmd/zinject/Makefile.am b/sys/contrib/openzfs/cmd/zinject/Makefile.am new file mode 100644 index 000000000000..091d92cd6026 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zinject/Makefile.am @@ -0,0 +1,13 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zinject + +zinject_SOURCES = \ + translate.c \ + zinject.c \ + zinject.h + +zinject_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/sys/contrib/openzfs/cmd/zinject/translate.c b/sys/contrib/openzfs/cmd/zinject/translate.c new file mode 100644 index 000000000000..4939c0b85b5f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zinject/translate.c @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 
+ */ + +#include <libzfs.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdarg.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/file.h> +#include <sys/mntent.h> +#include <sys/mnttab.h> +#include <sys/param.h> +#include <sys/stat.h> + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dnode.h> +#include <sys/vdev_impl.h> + +#include <sys/mkdev.h> + +#include "zinject.h" + +static int debug; + +static void +ziprintf(const char *fmt, ...) +{ + va_list ap; + + if (!debug) + return; + + va_start(ap, fmt); + (void) vprintf(fmt, ap); + va_end(ap); +} + +static void +compress_slashes(const char *src, char *dest) +{ + while (*src != '\0') { + *dest = *src++; + while (*dest == '/' && *src == '/') + ++src; + ++dest; + } + *dest = '\0'; +} + +/* + * Given a full path to a file, translate into a dataset name and a relative + * path within the dataset. 'dataset' must be at least MAXNAMELEN characters, + * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64 + * buffer, which we need later to get the object ID. 
+ */ +static int +parse_pathname(const char *inpath, char *dataset, char *relpath, + struct stat64 *statbuf) +{ + struct extmnttab mp; + const char *rel; + char fullpath[MAXPATHLEN]; + + compress_slashes(inpath, fullpath); + + if (fullpath[0] != '/') { + (void) fprintf(stderr, "invalid object '%s': must be full " + "path\n", fullpath); + usage(); + return (-1); + } + + if (getextmntent(fullpath, &mp, statbuf) != 0) { + (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", + fullpath); + return (-1); + } + + if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, "invalid path '%s': not a ZFS " + "filesystem\n", fullpath); + return (-1); + } + + if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) { + (void) fprintf(stderr, "invalid path '%s': mountpoint " + "doesn't match path\n", fullpath); + return (-1); + } + + (void) strcpy(dataset, mp.mnt_special); + + rel = fullpath + strlen(mp.mnt_mountp); + if (rel[0] == '/') + rel++; + (void) strcpy(relpath, rel); + + return (0); +} + +/* + * Convert from a dataset to a objset id. Note that + * we grab the object number from the inode number. + */ +static int +object_from_path(const char *dataset, uint64_t object, zinject_record_t *record) +{ + zfs_handle_t *zhp; + + if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) + return (-1); + + record->zi_objset = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + record->zi_object = object; + + zfs_close(zhp); + + return (0); +} + +/* + * Initialize the range based on the type, level, and range given. + */ +static int +initialize_range(err_type_t type, int level, char *range, + zinject_record_t *record) +{ + /* + * Determine the numeric range from the string. + */ + if (range == NULL) { + /* + * If range is unspecified, set the range to [0,-1], which + * indicates that the whole object should be treated as an + * error. 
+ */ + record->zi_start = 0; + record->zi_end = -1ULL; + } else { + char *end; + + /* XXX add support for suffixes */ + record->zi_start = strtoull(range, &end, 10); + + + if (*end == '\0') + record->zi_end = record->zi_start + 1; + else if (*end == ',') + record->zi_end = strtoull(end + 1, &end, 10); + + if (*end != '\0') { + (void) fprintf(stderr, "invalid range '%s': must be " + "a numeric range of the form 'start[,end]'\n", + range); + return (-1); + } + } + + switch (type) { + default: + break; + case TYPE_DATA: + break; + + case TYPE_DNODE: + /* + * If this is a request to inject faults into the dnode, then we + * must translate the current (objset,object) pair into an + * offset within the metadnode for the objset. Specifying any + * kind of range with type 'dnode' is illegal. + */ + if (range != NULL) { + (void) fprintf(stderr, "range cannot be specified when " + "type is 'dnode'\n"); + return (-1); + } + + record->zi_start = record->zi_object * sizeof (dnode_phys_t); + record->zi_end = record->zi_start + sizeof (dnode_phys_t); + record->zi_object = 0; + break; + } + + record->zi_level = level; + + return (0); +} + +int +translate_record(err_type_t type, const char *object, const char *range, + int level, zinject_record_t *record, char *poolname, char *dataset) +{ + char path[MAXPATHLEN]; + char *slash; + struct stat64 statbuf; + int ret = -1; + + debug = (getenv("ZINJECT_DEBUG") != NULL); + + ziprintf("translating: %s\n", object); + + if (MOS_TYPE(type)) { + /* + * MOS objects are treated specially. 
+ */ + switch (type) { + default: + break; + case TYPE_MOS: + record->zi_type = 0; + break; + case TYPE_MOSDIR: + record->zi_type = DMU_OT_OBJECT_DIRECTORY; + break; + case TYPE_METASLAB: + record->zi_type = DMU_OT_OBJECT_ARRAY; + break; + case TYPE_CONFIG: + record->zi_type = DMU_OT_PACKED_NVLIST; + break; + case TYPE_BPOBJ: + record->zi_type = DMU_OT_BPOBJ; + break; + case TYPE_SPACEMAP: + record->zi_type = DMU_OT_SPACE_MAP; + break; + case TYPE_ERRLOG: + record->zi_type = DMU_OT_ERROR_LOG; + break; + } + + dataset[0] = '\0'; + (void) strcpy(poolname, object); + return (0); + } + + /* + * Convert a full path into a (dataset, file) pair. + */ + if (parse_pathname(object, dataset, path, &statbuf) != 0) + goto err; + + ziprintf(" dataset: %s\n", dataset); + ziprintf(" path: %s\n", path); + + /* + * Convert (dataset, file) into (objset, object) + */ + if (object_from_path(dataset, statbuf.st_ino, record) != 0) + goto err; + + ziprintf("raw objset: %llu\n", record->zi_objset); + ziprintf("raw object: %llu\n", record->zi_object); + + /* + * For the given object, initialize the range in bytes + */ + if (initialize_range(type, level, (char *)range, record) != 0) + goto err; + + ziprintf(" objset: %llu\n", record->zi_objset); + ziprintf(" object: %llu\n", record->zi_object); + if (record->zi_start == 0 && + record->zi_end == -1ULL) + ziprintf(" range: all\n"); + else + ziprintf(" range: [%llu, %llu]\n", record->zi_start, + record->zi_end); + + /* + * Copy the pool name + */ + (void) strcpy(poolname, dataset); + if ((slash = strchr(poolname, '/')) != NULL) + *slash = '\0'; + + ret = 0; + +err: + return (ret); +} + +int +translate_raw(const char *str, zinject_record_t *record) +{ + /* + * A raw bookmark of the form objset:object:level:blkid, where each + * number is a hexadecimal value. 
+ */ + if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset, + (u_longlong_t *)&record->zi_object, &record->zi_level, + (u_longlong_t *)&record->zi_start) != 4) { + (void) fprintf(stderr, "bad raw spec '%s': must be of the form " + "'objset:object:level:blkid'\n", str); + return (-1); + } + + record->zi_end = record->zi_start; + + return (0); +} + +int +translate_device(const char *pool, const char *device, err_type_t label_type, + zinject_record_t *record) +{ + char *end; + zpool_handle_t *zhp; + nvlist_t *tgt; + boolean_t isspare, iscache; + + /* + * Given a device name or GUID, create an appropriate injection record + * with zi_guid set. + */ + if ((zhp = zpool_open(g_zfs, pool)) == NULL) + return (-1); + + record->zi_guid = strtoull(device, &end, 0); + if (record->zi_guid == 0 || *end != '\0') { + tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL); + + if (tgt == NULL) { + (void) fprintf(stderr, "cannot find device '%s' in " + "pool '%s'\n", device, pool); + zpool_close(zhp); + return (-1); + } + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, + &record->zi_guid) == 0); + } + + /* + * Device faults can take on three different forms: + * 1). delayed or hanging I/O + * 2). zfs label faults + * 3). 
generic disk faults + */ + if (record->zi_timer != 0) { + record->zi_cmd = ZINJECT_DELAY_IO; + } else if (label_type != TYPE_INVAL) { + record->zi_cmd = ZINJECT_LABEL_FAULT; + } else { + record->zi_cmd = ZINJECT_DEVICE_FAULT; + } + + switch (label_type) { + default: + break; + case TYPE_LABEL_UBERBLOCK: + record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]); + record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1; + break; + case TYPE_LABEL_NVLIST: + record->zi_start = offsetof(vdev_label_t, vl_vdev_phys); + record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1; + break; + case TYPE_LABEL_PAD1: + record->zi_start = offsetof(vdev_label_t, vl_pad1); + record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; + break; + case TYPE_LABEL_PAD2: + record->zi_start = offsetof(vdev_label_t, vl_be); + record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; + break; + } + zpool_close(zhp); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c new file mode 100644 index 000000000000..bf97b0d68713 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zinject/zinject.c @@ -0,0 +1,1287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + */ + +/* + * ZFS Fault Injector + * + * This userland component takes a set of options and uses libzpool to translate + * from a user-visible object type and name to an internal representation. + * There are two basic types of faults: device faults and data faults. + * + * + * DEVICE FAULTS + * + * Errors can be injected into a particular vdev using the '-d' option. This + * option takes a path or vdev GUID to uniquely identify the device within a + * pool. There are four types of errors that can be injected: EIO, ENXIO, + * ECHILD, and EILSEQ. These can be controlled through the '-e' option and the + * default is ENXIO. For EIO failures, any attempt to read data from the device + * will return EIO, but a subsequent attempt to reopen the device will succeed. + * For ENXIO failures, any attempt to read from the device will return EIO, and + * any attempt to reopen the device will return ENXIO. The EILSEQ failures + * only apply to read operations (-T read) and will flip a bit after the device + * has read the original data. + * + * For label faults, the -L option must be specified. This allows faults + * to be injected into either the nvlist, uberblock, pad1, or pad2 region + * of all the labels for the specified device. + * + * This form of the command looks like: + * + * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool + * + * + * DATA FAULTS + * + * We begin with a tuple of the form: + * + * <type,level,range,object> + * + * type A string describing the type of data to target. 
Each type + * implicitly describes how to interpret 'object'. Currently, + * the following values are supported: + * + * data User data for a file + * dnode Dnode for a file or directory + * + * The following MOS objects are special. Instead of injecting + * errors on a particular object or blkid, we inject errors across + * all objects of the given type. + * + * mos Any data in the MOS + * mosdir object directory + * config pool configuration + * bpobj blkptr list + * spacemap spacemap + * metaslab metaslab + * errlog persistent error log + * + * level Object level. Defaults to '0', not applicable to all types. If + * a range is given, this corresponds to the indirect block + * corresponding to the specific range. + * + * range A numerical range [start,end) within the object. Defaults to + * the full size of the file. + * + * object A string describing the logical location of the object. For + * files and directories (currently the only supported types), + * this is the path of the object on disk. + * + * This is translated, via libzpool, into the following internal representation: + * + * <type,objset,object,level,range> + * + * These types should be self-explanatory. This tuple is then passed to the + * kernel via a special ioctl() to initiate fault injection for the given + * object. Note that 'type' is not strictly necessary for fault injection, but + * is used when translating existing faults into a human-readable string. + * + * + * The command itself takes one of the forms: + * + * zinject + * zinject <-a | -u pool> + * zinject -c <id|all> + * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] + * [-r range] <object> + * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:blkid pool + * + * With no arguments, the command prints all currently registered injection + * handlers, with their numeric identifiers. + * + * The '-c' option will clear the given handler, or all handlers if 'all' is + * specified. 
+ * + * The '-e' option takes a string describing the errno to simulate. This must + * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this + * will result in the same behavior, but RAID-Z will produce a different set of + * ereports for this situation. + * + * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is + * specified, then the ARC cache is flushed appropriately. If '-u' is + * specified, then the underlying SPA is unloaded. Either of these flags can be + * specified independently of any other handlers. The '-m' flag automatically + * does an unmount and remount of the underlying dataset to aid in flushing the + * cache. + * + * The '-f' flag controls the frequency of errors injected, expressed as a + * real number percentage between 0.0001 and 100. The default is 100. + * + * The fourth form is responsible for actually injecting the handler into the + * framework. It takes the arguments described above, translates them to the + * internal tuple using libzpool, and then issues an ioctl() to register the + * handler. + * + * The final form can target a specific bookmark, regardless of whether a + * human-readable interface has been designed. It allows developers to specify + * a particular block by number. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/fs/zfs.h> +#include <sys/mount.h> + +#include <libzfs.h> + +#undef verify /* both libzfs.h and zfs_context.h want to define this */ + +#include "zinject.h" + +libzfs_handle_t *g_zfs; +int zfs_fd; + +static const char *errtable[TYPE_INVAL] = { + "data", + "dnode", + "mos", + "mosdir", + "metaslab", + "config", + "bpobj", + "spacemap", + "errlog", + "uber", + "nvlist", + "pad1", + "pad2" +}; + +static err_type_t +name_to_type(const char *arg) +{ + int i; + for (i = 0; i < TYPE_INVAL; i++) + if (strcmp(errtable[i], arg) == 0) + return (i); + + return (TYPE_INVAL); +} + +static const char * +type_to_name(uint64_t type) +{ + switch (type) { + case DMU_OT_OBJECT_DIRECTORY: + return ("mosdir"); + case DMU_OT_OBJECT_ARRAY: + return ("metaslab"); + case DMU_OT_PACKED_NVLIST: + return ("config"); + case DMU_OT_BPOBJ: + return ("bpobj"); + case DMU_OT_SPACE_MAP: + return ("spacemap"); + case DMU_OT_ERROR_LOG: + return ("errlog"); + default: + return ("-"); + } +} + + +/* + * Print usage message. + */ +void +usage(void) +{ + (void) printf( + "usage:\n" + "\n" + "\tzinject\n" + "\n" + "\t\tList all active injection records.\n" + "\n" + "\tzinject -c <id|all>\n" + "\n" + "\t\tClear the particular record (if given a numeric ID), or\n" + "\t\tall records if 'all' is specified.\n" + "\n" + "\tzinject -p <function name> pool\n" + "\t\tInject a panic fault at the specified function. Only \n" + "\t\tfunctions which call spa_vdev_config_exit(), or \n" + "\t\tspa_vdev_exit() will trigger a panic.\n" + "\n" + "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n" + "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n" + "\t\tInject a fault into a particular device or the device's\n" + "\t\tlabel. 
Label injection can either be 'nvlist', 'uber',\n " + "\t\t'pad1', or 'pad2'.\n" + "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n" + "\t\t'corrupt' (bit flip).\n" + "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" + "\t\tdevice error injection to a percentage of the IOs.\n" + "\n" + "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n" + "\t\tPerform a specific action on a particular device.\n" + "\n" + "\tzinject -d device -D latency:lanes pool\n" + "\n" + "\t\tAdd an artificial delay to IO requests on a particular\n" + "\t\tdevice, such that the requests take a minimum of 'latency'\n" + "\t\tmilliseconds to complete. Each delay has an associated\n" + "\t\tnumber of 'lanes' which defines the number of concurrent\n" + "\t\tIO requests that can be processed.\n" + "\n" + "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" + "\t\tthe device will only be able to service a single IO request\n" + "\t\tat a time with each request taking 10 ms to complete. So,\n" + "\t\tif only a single request is submitted every 10 ms, the\n" + "\t\taverage latency will be 10 ms; but if more than one request\n" + "\t\tis submitted every 10 ms, the average latency will be more\n" + "\t\tthan 10 ms.\n" + "\n" + "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" + "\t\tlanes (-D 10:2), then the device will be able to service\n" + "\t\ttwo requests at a time, each with a minimum latency of\n" + "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" + "\t\tthe average latency will be 10 ms; but if more than two\n" + "\t\trequests are submitted every 10 ms, the average latency\n" + "\t\twill be more than 10 ms.\n" + "\n" + "\t\tAlso note, these delays are additive. So two invocations\n" + "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" + "\t\tof '-D 10:2'. This also means, one can specify multiple\n" + "\t\tlanes with differing target latencies. 
For example, an\n" + "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" + "\t\tcreate 3 lanes on the device; one lane with a latency\n" + "\t\tof 10 ms and two lanes with a 25 ms latency.\n" + "\n" + "\tzinject -I [-s <seconds> | -g <txgs>] pool\n" + "\t\tCause the pool to stop writing blocks yet not\n" + "\t\treport errors for a duration. Simulates buggy hardware\n" + "\t\tthat fails to honor cache flush requests.\n" + "\t\tDefault duration is 30 seconds. The machine is panicked\n" + "\t\tat the end of the duration.\n" + "\n" + "\tzinject -b objset:object:level:blkid pool\n" + "\n" + "\t\tInject an error into pool 'pool' with the numeric bookmark\n" + "\t\tspecified by the remaining tuple. Each number is in\n" + "\t\thexadecimal, and only one block can be specified.\n" + "\n" + "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" + "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n" + "\n" + "\t\tInject an error into the object specified by the '-t' option\n" + "\t\tand the object descriptor. The 'object' parameter is\n" + "\t\tinterpreted depending on the '-t' option.\n" + "\n" + "\t\t-q\tQuiet mode. Only print out the handler number added.\n" + "\t\t-e\tInject a specific error. Must be one of 'io',\n" + "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" + "\t\t-C\tInject the given error only into specific DVAs. The\n" + "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" + "\t\t\tseparated by commas (ex. '0,2').\n" + "\t\t-l\tInject error at a particular block level. Default is " + "0.\n" + "\t\t-m\tAutomatically remount underlying filesystem.\n" + "\t\t-r\tInject error over a particular logical range of an\n" + "\t\t\tobject. Will be translated to the appropriate blkid\n" + "\t\t\trange according to the object's properties.\n" + "\t\t-a\tFlush the ARC cache. Can be specified without any\n" + "\t\t\tassociated object.\n" + "\t\t-u\tUnload the associated pool. 
Can be specified with only\n" + "\t\t\ta pool object.\n" + "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" + "\t\t\ta percentage between 0.0001 and 100.\n" + "\n" + "\t-t data\t\tInject an error into the plain file contents of a\n" + "\t\t\tfile. The object must be specified as a complete path\n" + "\t\t\tto a file on a ZFS filesystem.\n" + "\n" + "\t-t dnode\tInject an error into the metadnode in the block\n" + "\t\t\tcorresponding to the dnode for a file or directory. The\n" + "\t\t\t'-r' option is incompatible with this mode. The object\n" + "\t\t\tis specified as a complete path to a file or directory\n" + "\t\t\ton a ZFS filesystem.\n" + "\n" + "\t-t <mos>\tInject errors into the MOS for objects of the given\n" + "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" + "\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n" + "\t\t\tthe poolname.\n"); +} + +static int +iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), + void *data) +{ + zfs_cmd_t zc = {"\0"}; + int ret; + + while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) + if ((ret = func((int)zc.zc_guid, zc.zc_name, + &zc.zc_inject_record, data)) != 0) + return (ret); + + if (errno != ENOENT) { + (void) fprintf(stderr, "Unable to list handlers: %s\n", + strerror(errno)); + return (-1); + } + + return (0); +} + +static int +print_data_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid != 0 || record->zi_func[0] != '\0') + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " + "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", + "LVL", "DVAs", "RANGE"); + (void) printf("--- --------------- ------ " + "------ -------- --- ---- ---------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", + id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, 
type_to_name(record->zi_type), + record->zi_level, record->zi_dvas); + + + if (record->zi_start == 0 && + record->zi_end == -1ULL) + (void) printf("all\n"); + else + (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, + (u_longlong_t)record->zi_end); + + return (0); +} + +static int +print_device_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd == ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); + (void) printf("--- --------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %llx\n", id, pool, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int +print_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd != ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-15s %-15s %s\n", + "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); + (void) printf("--- --------------- --------------- " + "--------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + (u_longlong_t)NSEC2MSEC(record->zi_timer), + (u_longlong_t)record->zi_nlanes, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int +print_panic_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_func[0] == '\0') + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); + (void) printf("--- --------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); + + return (0); +} + +/* + * Print all registered error handlers. 
Returns the number of handlers + * registered. + */ +static int +print_all_handlers(void) +{ + int count = 0, total = 0; + + (void) iter_handlers(print_device_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_data_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_panic_handler, &count); + + return (count + total); +} + +/* ARGSUSED */ +static int +cancel_one_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + zfs_cmd_t zc = {"\0"}; + + zc.zc_guid = (uint64_t)id; + + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to remove handler %d: %s\n", + id, strerror(errno)); + return (1); + } + + return (0); +} + +/* + * Remove all fault injection handlers. + */ +static int +cancel_all_handlers(void) +{ + int ret = iter_handlers(cancel_one_handler, NULL); + + if (ret == 0) + (void) printf("removed all registered handlers\n"); + + return (ret); +} + +/* + * Remove a specific fault injection handler. + */ +static int +cancel_handler(int id) +{ + zfs_cmd_t zc = {"\0"}; + + zc.zc_guid = (uint64_t)id; + + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to remove handler %d: %s\n", + id, strerror(errno)); + return (1); + } + + (void) printf("removed handler %d\n", id); + + return (0); +} + +/* + * Register a new fault injection handler. 
+ */ +static int +register_handler(const char *pool, int flags, zinject_record_t *record, + int quiet) +{ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_inject_record = *record; + zc.zc_guid = flags; + + if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to add handler: %s\n", + errno == EDOM ? "block level exceeds max level of object" : + strerror(errno)); + return (1); + } + + if (flags & ZINJECT_NULL) + return (0); + + if (quiet) { + (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); + } else { + (void) printf("Added handler %llu with the following " + "properties:\n", (u_longlong_t)zc.zc_guid); + (void) printf(" pool: %s\n", pool); + if (record->zi_guid) { + (void) printf(" vdev: %llx\n", + (u_longlong_t)record->zi_guid); + } else if (record->zi_func[0] != '\0') { + (void) printf(" panic function: %s\n", + record->zi_func); + } else if (record->zi_duration > 0) { + (void) printf(" time: %lld seconds\n", + (u_longlong_t)record->zi_duration); + } else if (record->zi_duration < 0) { + (void) printf(" txgs: %lld \n", + (u_longlong_t)-record->zi_duration); + } else { + (void) printf("objset: %llu\n", + (u_longlong_t)record->zi_objset); + (void) printf("object: %llu\n", + (u_longlong_t)record->zi_object); + (void) printf(" type: %llu\n", + (u_longlong_t)record->zi_type); + (void) printf(" level: %d\n", record->zi_level); + if (record->zi_start == 0 && + record->zi_end == -1ULL) + (void) printf(" range: all\n"); + else + (void) printf(" range: [%llu, %llu)\n", + (u_longlong_t)record->zi_start, + (u_longlong_t)record->zi_end); + (void) printf(" dvas: 0x%x\n", record->zi_dvas); + } + } + + return (0); +} + +static int +perform_action(const char *pool, zinject_record_t *record, int cmd) +{ + zfs_cmd_t zc = {"\0"}; + + ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_guid = record->zi_guid; + zc.zc_cookie = 
cmd; + + if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (1); +} + +static int +parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) +{ + unsigned long scan_delay; + unsigned long scan_nlanes; + + if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) + return (1); + + /* + * We explicitly disallow a delay of zero here, because we key + * off this value being non-zero in translate_device(), to + * determine if the fault is a ZINJECT_DELAY_IO fault or not. + */ + if (scan_delay == 0) + return (1); + + /* + * The units for the CLI delay parameter is milliseconds, but + * the data passed to the kernel is interpreted as nanoseconds. + * Thus we scale the milliseconds to nanoseconds here, and this + * nanosecond value is used to pass the delay to the kernel. + */ + *delay = MSEC2NSEC(scan_delay); + *nlanes = scan_nlanes; + + return (0); +} + +static int +parse_frequency(const char *str, uint32_t *percent) +{ + double val; + char *post; + + val = strtod(str, &post); + if (post == NULL || *post != '\0') + return (EINVAL); + + /* valid range is [0.0001, 100.0] */ + val /= 100.0f; + if (val < 0.000001f || val > 1.0f) + return (ERANGE); + + /* convert to an integer for use by kernel */ + *percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX)); + + return (0); +} + +/* + * This function converts a string specifier for DVAs into a bit mask. + * The dva's provided by the user should be 0 indexed and separated by + * a comma. 
For example: + * "1" -> 0b0010 (0x2) + * "0,1" -> 0b0011 (0x3) + * "0,1,2" -> 0b0111 (0x7) + */ +static int +parse_dvas(const char *str, uint32_t *dvas_out) +{ + const char *c = str; + uint32_t mask = 0; + boolean_t need_delim = B_FALSE; + + /* max string length is 5 ("0,1,2") */ + if (strlen(str) > 5 || strlen(str) == 0) + return (EINVAL); + + while (*c != '\0') { + switch (*c) { + case '0': + case '1': + case '2': + /* check for pipe between DVAs */ + if (need_delim) + return (EINVAL); + + /* check if this DVA has been set already */ + if (mask & (1 << ((*c) - '0'))) + return (EINVAL); + + mask |= (1 << ((*c) - '0')); + need_delim = B_TRUE; + break; + case ',': + need_delim = B_FALSE; + break; + default: + /* check for invalid character */ + return (EINVAL); + } + c++; + } + + /* check for dangling delimiter */ + if (!need_delim) + return (EINVAL); + + *dvas_out = mask; + return (0); +} + +int +main(int argc, char **argv) +{ + int c; + char *range = NULL; + char *cancel = NULL; + char *end; + char *raw = NULL; + char *device = NULL; + int level = 0; + int quiet = 0; + int error = 0; + int domount = 0; + int io_type = ZIO_TYPES; + int action = VDEV_STATE_UNKNOWN; + err_type_t type = TYPE_INVAL; + err_type_t label = TYPE_INVAL; + zinject_record_t record = { 0 }; + char pool[MAXNAMELEN] = ""; + char dataset[MAXNAMELEN] = ""; + zfs_handle_t *zhp = NULL; + int nowrites = 0; + int dur_txg = 0; + int dur_secs = 0; + int ret; + int flags = 0; + uint32_t dvas = 0; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + (void) fprintf(stderr, "failed to open ZFS device\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (argc == 1) { + /* + * No arguments. Print the available handlers. If there are no + * available handlers, direct the user to '-h' for help + * information. 
+ */ + if (print_all_handlers() == 0) { + (void) printf("No handlers registered.\n"); + (void) printf("Run 'zinject -h' for usage " + "information.\n"); + } + libzfs_fini(g_zfs); + return (0); + } + + while ((c = getopt(argc, argv, + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + switch (c) { + case 'a': + flags |= ZINJECT_FLUSH_ARC; + break; + case 'A': + if (strcasecmp(optarg, "degrade") == 0) { + action = VDEV_STATE_DEGRADED; + } else if (strcasecmp(optarg, "fault") == 0) { + action = VDEV_STATE_FAULTED; + } else { + (void) fprintf(stderr, "invalid action '%s': " + "must be 'degrade' or 'fault'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'b': + raw = optarg; + break; + case 'c': + cancel = optarg; + break; + case 'C': + ret = parse_dvas(optarg, &dvas); + if (ret != 0) { + (void) fprintf(stderr, "invalid DVA list '%s': " + "DVAs should be 0 indexed and separated by " + "commas.\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'd': + device = optarg; + break; + case 'D': + errno = 0; + ret = parse_delay(optarg, &record.zi_timer, + &record.zi_nlanes); + if (ret != 0) { + + (void) fprintf(stderr, "invalid i/o delay " + "value: '%s'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'e': + if (strcasecmp(optarg, "io") == 0) { + error = EIO; + } else if (strcasecmp(optarg, "checksum") == 0) { + error = ECKSUM; + } else if (strcasecmp(optarg, "decompress") == 0) { + error = EINVAL; + } else if (strcasecmp(optarg, "decrypt") == 0) { + error = EACCES; + } else if (strcasecmp(optarg, "nxio") == 0) { + error = ENXIO; + } else if (strcasecmp(optarg, "dtl") == 0) { + error = ECHILD; + } else if (strcasecmp(optarg, "corrupt") == 0) { + error = EILSEQ; + } else { + (void) fprintf(stderr, "invalid error type " + "'%s': must be 'io', 'checksum' or " + "'nxio'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'f': + ret = parse_frequency(optarg, 
&record.zi_freq); + if (ret != 0) { + (void) fprintf(stderr, "%sfrequency value must " + "be in the range [0.0001, 100.0]\n", + ret == EINVAL ? "invalid value: " : + ret == ERANGE ? "out of range: " : ""); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'F': + record.zi_failfast = B_TRUE; + break; + case 'g': + dur_txg = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + /* store duration of txgs as its negative */ + record.zi_duration *= -1; + break; + case 'h': + usage(); + libzfs_fini(g_zfs); + return (0); + case 'I': + /* default duration, if one hasn't yet been defined */ + nowrites = 1; + if (dur_secs == 0 && dur_txg == 0) + record.zi_duration = 30; + break; + case 'l': + level = (int)strtol(optarg, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid level '%s': " + "must be an integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'm': + domount = 1; + break; + case 'p': + (void) strlcpy(record.zi_func, optarg, + sizeof (record.zi_func)); + record.zi_cmd = ZINJECT_PANIC; + break; + case 'q': + quiet = 1; + break; + case 'r': + range = optarg; + flags |= ZINJECT_CALC_RANGE; + break; + case 's': + dur_secs = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'T': + if (strcasecmp(optarg, "read") == 0) { + io_type = ZIO_TYPE_READ; + } else if (strcasecmp(optarg, "write") == 0) { + io_type = ZIO_TYPE_WRITE; + } else if (strcasecmp(optarg, "free") == 0) { + io_type = ZIO_TYPE_FREE; + } else if (strcasecmp(optarg, "claim") == 0) { + io_type = ZIO_TYPE_CLAIM; + } else if (strcasecmp(optarg, "all") == 
0) { + io_type = ZIO_TYPES; + } else { + (void) fprintf(stderr, "invalid I/O type " + "'%s': must be 'read', 'write', 'free', " + "'claim' or 'all'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 't': + if ((type = name_to_type(optarg)) == TYPE_INVAL && + !MOS_TYPE(type)) { + (void) fprintf(stderr, "invalid type '%s'\n", + optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'u': + flags |= ZINJECT_UNLOAD_SPA; + break; + case 'L': + if ((label = name_to_type(optarg)) == TYPE_INVAL && + !LABEL_TYPE(type)) { + (void) fprintf(stderr, "invalid label type " + "'%s'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case ':': + (void) fprintf(stderr, "option -%c requires an " + "operand\n", optopt); + usage(); + libzfs_fini(g_zfs); + return (1); + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + usage(); + libzfs_fini(g_zfs); + return (2); + } + } + + argc -= optind; + argv += optind; + + if (record.zi_duration != 0) + record.zi_cmd = ZINJECT_IGNORED_WRITES; + + if (cancel != NULL) { + /* + * '-c' is invalid with any other options. 
+ */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "cancel (-c) incompatible with " + "any other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + if (argc != 0) { + (void) fprintf(stderr, "extraneous argument to '-c'\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (strcmp(cancel, "all") == 0) { + return (cancel_all_handlers()); + } else { + int id = (int)strtol(cancel, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid handle id '%s':" + " must be an integer or 'all'\n", cancel); + usage(); + libzfs_fini(g_zfs); + return (1); + } + return (cancel_handler(id)); + } + } + + if (device != NULL) { + /* + * Device (-d) injection uses a completely different mechanism + * for doing injection, so handle it separately here. + */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + dvas != 0) { + (void) fprintf(stderr, "device (-d) incompatible with " + "data error injection\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "device (-d) injection requires " + "a single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + + if (error == ECKSUM) { + (void) fprintf(stderr, "device error type must be " + "'io', 'nxio' or 'corrupt'\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (error == EILSEQ && + (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) { + (void) fprintf(stderr, "device corrupt errors require " + "io type read and a frequency value\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_iotype = io_type; + if (translate_device(pool, device, label, &record) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = ENXIO; + + if (action != VDEV_STATE_UNKNOWN) + 
return (perform_action(pool, &record, action)); + + } else if (raw != NULL) { + if (range != NULL || type != TYPE_INVAL || level != 0 || + record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "raw (-b) format with " + "any other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "raw (-b) format expects a " + "single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + + if (error == ENXIO) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_cmd = ZINJECT_DATA_FAULT; + if (translate_raw(raw, &record) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = EIO; + } else if (record.zi_cmd == ZINJECT_PANIC) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { + (void) fprintf(stderr, "panic (-p) incompatible with " + "other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc < 1 || argc > 2) { + (void) fprintf(stderr, "panic (-p) injection requires " + "a single pool name and an optional id\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + if (argv[1] != NULL) + record.zi_type = atoi(argv[1]); + dataset[0] = '\0'; + } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "hardware failure (-I) " + "incompatible with other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (nowrites == 0) { + (void) fprintf(stderr, "-s or -g meaningless " + "without -I (ignore writes)\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } else if (dur_secs && dur_txg) { + (void) 
fprintf(stderr, "choose a duration either " + "in seconds (-s) or a number of txgs (-g) " + "but not both\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } else if (argc != 1) { + (void) fprintf(stderr, "ignore writes (-I) " + "injection requires a single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + } else if (type == TYPE_INVAL) { + if (flags == 0) { + (void) fprintf(stderr, "at least one of '-b', '-d', " + "'-t', '-a', '-p', '-I' or '-u' " + "must be specified\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + } else if (argc != 0) { + (void) fprintf(stderr, "extraneous argument for " + "'-f'\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + flags |= ZINJECT_NULL; + } else { + if (argc != 1) { + (void) fprintf(stderr, "missing object\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (error == ENXIO || error == EILSEQ) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (dvas != 0) { + if (error == EACCES || error == EINVAL) { + (void) fprintf(stderr, "the '-C' option may " + "not be used with logical data errors " + "'decrypt' and 'decompress'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_dvas = dvas; + } + + if (error == EACCES) { + if (type != TYPE_DATA) { + (void) fprintf(stderr, "decryption errors " + "may only be injected for 'data' types\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_cmd = ZINJECT_DECRYPT_FAULT; + /* + * Internally, ZFS actually uses ECKSUM for decryption + * errors since EACCES is used to indicate the key was + * not found. 
+ */ + error = ECKSUM; + } else { + record.zi_cmd = ZINJECT_DATA_FAULT; + } + + if (translate_record(type, argv[0], range, level, &record, pool, + dataset) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = EIO; + } + + /* + * If this is pool-wide metadata, unmount everything. The ioctl() will + * unload the pool, so that we trigger spa-wide reopen of metadata next + * time we access the pool. + */ + if (dataset[0] != '\0' && domount) { + if ((zhp = zfs_open(g_zfs, dataset, + ZFS_TYPE_DATASET)) == NULL) { + libzfs_fini(g_zfs); + return (1); + } + if (zfs_unmount(zhp, NULL, 0) != 0) { + libzfs_fini(g_zfs); + return (1); + } + } + + record.zi_error = error; + + ret = register_handler(pool, flags, &record, quiet); + + if (dataset[0] != '\0' && domount) + ret = (zfs_mount(zhp, NULL, 0) != 0); + + libzfs_fini(g_zfs); + + return (ret); +} diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.h b/sys/contrib/openzfs/cmd/zinject/zinject.h new file mode 100644 index 000000000000..46fdcad8b31f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zinject/zinject.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ZINJECT_H +#define _ZINJECT_H + +#include <sys/zfs_ioctl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + TYPE_DATA, /* plain file contents */ + TYPE_DNODE, /* metadnode contents */ + TYPE_MOS, /* all MOS data */ + TYPE_MOSDIR, /* MOS object directory */ + TYPE_METASLAB, /* metaslab objects */ + TYPE_CONFIG, /* MOS config */ + TYPE_BPOBJ, /* block pointer list */ + TYPE_SPACEMAP, /* space map objects */ + TYPE_ERRLOG, /* persistent error log */ + TYPE_LABEL_UBERBLOCK, /* label specific uberblock */ + TYPE_LABEL_NVLIST, /* label specific nvlist */ + TYPE_LABEL_PAD1, /* label specific 8K pad1 area */ + TYPE_LABEL_PAD2, /* label specific 8K pad2 area */ + TYPE_INVAL +} err_type_t; + +#define MOS_TYPE(t) \ + ((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK) + +#define LABEL_TYPE(t) \ + ((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL) + +int translate_record(err_type_t type, const char *object, const char *range, + int level, zinject_record_t *record, char *poolname, char *dataset); +int translate_raw(const char *raw, zinject_record_t *record); +int translate_device(const char *pool, const char *device, + err_type_t label_type, zinject_record_t *record); +void usage(void); + +extern libzfs_handle_t *g_zfs; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZINJECT_H */ diff --git a/sys/contrib/openzfs/cmd/zpool/.gitignore b/sys/contrib/openzfs/cmd/zpool/.gitignore new file mode 100644 index 000000000000..8ea518af78e5 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/.gitignore @@ -0,0 +1 @@ +/zpool diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am new file mode 100644 index 
000000000000..c0378b136901 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -0,0 +1,136 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS) + +DEFAULT_INCLUDES += -I$(srcdir) + +sbin_PROGRAMS = zpool + +zpool_SOURCES = \ + zpool_iter.c \ + zpool_main.c \ + zpool_util.c \ + zpool_util.h \ + zpool_vdev.c + +if BUILD_FREEBSD +zpool_SOURCES += os/freebsd/zpool_vdev_os.c +endif + +if BUILD_LINUX +zpool_SOURCES += os/linux/zpool_vdev_os.c +endif + +zpool_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zpool_LDADD += $(LTLIBINTL) + +if BUILD_FREEBSD +zpool_LDADD += -lgeom +endif +zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS) + +zpoolconfdir = $(sysconfdir)/zfs/zpool.d +zpoolexecdir = $(zfsexecdir)/zpool.d + +EXTRA_DIST = zpool.d/README + +dist_zpoolexec_SCRIPTS = \ + zpool.d/dm-deps \ + zpool.d/enc \ + zpool.d/encdev \ + zpool.d/fault_led \ + zpool.d/iostat \ + zpool.d/iostat-1s \ + zpool.d/iostat-10s \ + zpool.d/label \ + zpool.d/locate_led \ + zpool.d/lsblk \ + zpool.d/media \ + zpool.d/model \ + zpool.d/serial \ + zpool.d/ses \ + zpool.d/size \ + zpool.d/slot \ + zpool.d/smart \ + zpool.d/smartx \ + zpool.d/temp \ + zpool.d/health \ + zpool.d/r_proc \ + zpool.d/w_proc \ + zpool.d/r_ucor \ + zpool.d/w_ucor \ + zpool.d/nonmed \ + zpool.d/defect \ + zpool.d/hours_on \ + zpool.d/realloc \ + zpool.d/rep_ucor \ + zpool.d/cmd_to \ + zpool.d/pend_sec \ + zpool.d/off_ucor \ + zpool.d/ata_err \ + zpool.d/nvme_err \ + zpool.d/pwr_cyc \ + zpool.d/upath \ + zpool.d/vendor \ + zpool.d/smart_test \ + zpool.d/test_type \ + zpool.d/test_status \ + zpool.d/test_progress \ + zpool.d/test_ended + +zpoolconfdefaults = \ + dm-deps \ + enc \ + encdev \ + fault_led \ + iostat \ + iostat-1s \ + iostat-10s \ + label \ + locate_led \ + lsblk \ + media \ + model \ + serial 
\ + ses \ + size \ + slot \ + smart \ + smartx \ + temp \ + health \ + r_proc \ + w_proc \ + r_ucor \ + w_ucor \ + nonmed \ + defect \ + hours_on \ + realloc \ + rep_ucor \ + cmd_to \ + pend_sec \ + off_ucor \ + ata_err \ + nvme_err \ + pwr_cyc \ + upath \ + vendor \ + smart_test \ + test_type \ + test_status \ + test_progress \ + test_ended + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" + for f in $(zpoolconfdefaults); do \ + test -f "$(DESTDIR)$(zpoolconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \ + ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \ + done diff --git a/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c new file mode 100644 index 000000000000..7d48f61a0ee7 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 
+ */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and are passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Uses libdiskmgt to make sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * Validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libzutil.h> +#include <limits.h> +#include <sys/spa.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <paths.h> +#include <sys/stat.h> +#include <sys/disk.h> +#include <sys/mntent.h> +#include <libgeom.h> + +#include "zpool_util.h" +#include <sys/zfs_context.h> + +int +check_device(const char *name, boolean_t force, boolean_t isspare, + boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c new file mode 100644 index 000000000000..d087c4c14dac --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c @@ -0,0 +1,410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and are passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Uses libblkid to make sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * Validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libzutil.h> +#include <limits.h> +#include <sys/spa.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "zpool_util.h" +#include <sys/zfs_context.h> + +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/efi_partition.h> +#include <sys/stat.h> +#include <sys/vtoc.h> +#include <sys/mntent.h> +#include <uuid/uuid.h> +#include <blkid/blkid.h> + +typedef struct vdev_disk_db_entry +{ + char id[24]; + int sector_size; +} vdev_disk_db_entry_t; + +/* + * Database of block devices that lie about physical sector sizes. The + * identification string must be precisely 24 characters to avoid false + * negatives + */ +static vdev_disk_db_entry_t vdev_disk_database[] = { + {"ATA ADATA SSD S396 3", 8192}, + {"ATA APPLE SSD SM128E", 8192}, + {"ATA APPLE SSD SM256E", 8192}, + {"ATA APPLE SSD SM512E", 8192}, + {"ATA APPLE SSD SM768E", 8192}, + {"ATA C400-MTFDDAC064M", 8192}, + {"ATA C400-MTFDDAC128M", 8192}, + {"ATA C400-MTFDDAC256M", 8192}, + {"ATA C400-MTFDDAC512M", 8192}, + {"ATA Corsair Force 3 ", 8192}, + {"ATA Corsair Force GS", 8192}, + {"ATA INTEL SSDSA2CT04", 8192}, + {"ATA INTEL SSDSA2BZ10", 8192}, + {"ATA INTEL SSDSA2BZ20", 8192}, + {"ATA INTEL SSDSA2BZ30", 8192}, + {"ATA INTEL SSDSA2CW04", 8192}, + {"ATA INTEL SSDSA2CW08", 8192}, + {"ATA INTEL SSDSA2CW12", 8192}, + {"ATA INTEL SSDSA2CW16", 8192}, + {"ATA INTEL SSDSA2CW30", 8192}, + {"ATA INTEL SSDSA2CW60", 8192}, + {"ATA INTEL SSDSC2CT06", 8192}, + {"ATA INTEL SSDSC2CT12", 8192}, + {"ATA INTEL SSDSC2CT18", 8192}, + {"ATA INTEL SSDSC2CT24", 8192}, + {"ATA INTEL SSDSC2CW06", 8192}, + {"ATA INTEL SSDSC2CW12", 8192}, + {"ATA INTEL SSDSC2CW18", 8192}, + {"ATA INTEL SSDSC2CW24", 8192}, + {"ATA INTEL SSDSC2CW48", 8192}, + {"ATA KINGSTON SH100S3", 8192}, + {"ATA KINGSTON SH103S3", 8192}, + {"ATA 
M4-CT064M4SSD2 ", 8192}, + {"ATA M4-CT128M4SSD2 ", 8192}, + {"ATA M4-CT256M4SSD2 ", 8192}, + {"ATA M4-CT512M4SSD2 ", 8192}, + {"ATA OCZ-AGILITY2 ", 8192}, + {"ATA OCZ-AGILITY3 ", 8192}, + {"ATA OCZ-VERTEX2 3.5 ", 8192}, + {"ATA OCZ-VERTEX3 ", 8192}, + {"ATA OCZ-VERTEX3 LT ", 8192}, + {"ATA OCZ-VERTEX3 MI ", 8192}, + {"ATA OCZ-VERTEX4 ", 8192}, + {"ATA SAMSUNG MZ7WD120", 8192}, + {"ATA SAMSUNG MZ7WD240", 8192}, + {"ATA SAMSUNG MZ7WD480", 8192}, + {"ATA SAMSUNG MZ7WD960", 8192}, + {"ATA SAMSUNG SSD 830 ", 8192}, + {"ATA Samsung SSD 840 ", 8192}, + {"ATA SanDisk SSD U100", 8192}, + {"ATA TOSHIBA THNSNH06", 8192}, + {"ATA TOSHIBA THNSNH12", 8192}, + {"ATA TOSHIBA THNSNH25", 8192}, + {"ATA TOSHIBA THNSNH51", 8192}, + {"ATA APPLE SSD TS064C", 4096}, + {"ATA APPLE SSD TS128C", 4096}, + {"ATA APPLE SSD TS256C", 4096}, + {"ATA APPLE SSD TS512C", 4096}, + {"ATA INTEL SSDSA2M040", 4096}, + {"ATA INTEL SSDSA2M080", 4096}, + {"ATA INTEL SSDSA2M160", 4096}, + {"ATA INTEL SSDSC2MH12", 4096}, + {"ATA INTEL SSDSC2MH25", 4096}, + {"ATA OCZ CORE_SSD ", 4096}, + {"ATA OCZ-VERTEX ", 4096}, + {"ATA SAMSUNG MCCOE32G", 4096}, + {"ATA SAMSUNG MCCOE64G", 4096}, + {"ATA SAMSUNG SSD PM80", 4096}, + /* Flash drives optimized for 4KB IOs on larger pages */ + {"ATA INTEL SSDSC2BA10", 4096}, + {"ATA INTEL SSDSC2BA20", 4096}, + {"ATA INTEL SSDSC2BA40", 4096}, + {"ATA INTEL SSDSC2BA80", 4096}, + {"ATA INTEL SSDSC2BB08", 4096}, + {"ATA INTEL SSDSC2BB12", 4096}, + {"ATA INTEL SSDSC2BB16", 4096}, + {"ATA INTEL SSDSC2BB24", 4096}, + {"ATA INTEL SSDSC2BB30", 4096}, + {"ATA INTEL SSDSC2BB40", 4096}, + {"ATA INTEL SSDSC2BB48", 4096}, + {"ATA INTEL SSDSC2BB60", 4096}, + {"ATA INTEL SSDSC2BB80", 4096}, + {"ATA INTEL SSDSC2BW24", 4096}, + {"ATA INTEL SSDSC2BW48", 4096}, + {"ATA INTEL SSDSC2BP24", 4096}, + {"ATA INTEL SSDSC2BP48", 4096}, + {"NA SmrtStorSDLKAE9W", 4096}, + {"NVMe Amazon EC2 NVMe ", 4096}, + /* Imported from Open Solaris */ + {"ATA MARVELL SD88SA02", 4096}, + /* Advanced format Hard drives */ + 
{"ATA Hitachi HDS5C303", 4096}, + {"ATA SAMSUNG HD204UI ", 4096}, + {"ATA ST2000DL004 HD20", 4096}, + {"ATA WDC WD10EARS-00M", 4096}, + {"ATA WDC WD10EARS-00S", 4096}, + {"ATA WDC WD10EARS-00Z", 4096}, + {"ATA WDC WD15EARS-00M", 4096}, + {"ATA WDC WD15EARS-00S", 4096}, + {"ATA WDC WD15EARS-00Z", 4096}, + {"ATA WDC WD20EARS-00M", 4096}, + {"ATA WDC WD20EARS-00S", 4096}, + {"ATA WDC WD20EARS-00Z", 4096}, + {"ATA WDC WD1600BEVT-0", 4096}, + {"ATA WDC WD2500BEVT-0", 4096}, + {"ATA WDC WD3200BEVT-0", 4096}, + {"ATA WDC WD5000BEVT-0", 4096}, +}; + + +#define INQ_REPLY_LEN 96 +#define INQ_CMD_LEN 6 + +static const int vdev_disk_database_size = + sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]); + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + unsigned char inq_buff[INQ_REPLY_LEN]; + unsigned char sense_buffer[32]; + unsigned char inq_cmd_blk[INQ_CMD_LEN] = + {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0}; + sg_io_hdr_t io_hdr; + int error; + int fd; + int i; + + /* Prepare INQUIRY command */ + memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = sizeof (inq_cmd_blk); + io_hdr.mx_sb_len = sizeof (sense_buffer); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.dxfer_len = INQ_REPLY_LEN; + io_hdr.dxferp = inq_buff; + io_hdr.cmdp = inq_cmd_blk; + io_hdr.sbp = sense_buffer; + io_hdr.timeout = 10; /* 10 milliseconds is ample time */ + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (B_FALSE); + + error = ioctl(fd, SG_IO, (unsigned long) &io_hdr); + + (void) close(fd); + + if (error < 0) + return (B_FALSE); + + if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) + return (B_FALSE); + + for (i = 0; i < vdev_disk_database_size; i++) { + if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24)) + continue; + + *sector_size = vdev_disk_database[i].sector_size; + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) +{ + 
int err; + char *value; + + /* No valid type detected device is safe to use */ + value = blkid_get_tag_value(cache, "TYPE", path); + if (value == NULL) + return (0); + + /* + * If libblkid detects a ZFS device, we check the device + * using check_file() to see if it's safe. The one safe + * case is a spare device shared between multiple pools. + */ + if (strcmp(value, "zfs_member") == 0) { + err = check_file(path, force, isspare); + } else { + if (force) { + err = 0; + } else { + err = -1; + vdev_error(gettext("%s contains a filesystem of " + "type '%s'\n"), path, value); + } + } + + free(value); + + return (err); +} + +/* + * Validate that a disk including all partitions are safe to use. + * + * For EFI labeled disks this can done relatively easily with the libefi + * library. The partition numbers are extracted from the label and used + * to generate the expected /dev/ paths. Each partition can then be + * checked for conflicts. + * + * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible + * but due to the lack of a readily available libraries this scanning is + * not implemented. Instead only the device path as given is checked. + */ +static int +check_disk(const char *path, blkid_cache cache, int force, + boolean_t isspare, boolean_t iswholedisk) +{ + struct dk_gpt *vtoc; + char slice_path[MAXPATHLEN]; + int err = 0; + int fd, i; + int flags = O_RDONLY|O_DIRECT; + + if (!iswholedisk) + return (check_slice(path, cache, force, isspare)); + + /* only spares can be shared, other devices require exclusive access */ + if (!isspare) + flags |= O_EXCL; + + if ((fd = open(path, flags)) < 0) { + char *value = blkid_get_tag_value(cache, "TYPE", path); + (void) fprintf(stderr, gettext("%s is in use and contains " + "a %s filesystem.\n"), path, value ? value : "unknown"); + free(value); + return (-1); + } + + /* + * Expected to fail for non-EFI labeled disks. Just check the device + * as given and do not attempt to detect and scan partitions. 
+ */ + err = efi_alloc_and_read(fd, &vtoc); + if (err) { + (void) close(fd); + return (check_slice(path, cache, force, isspare)); + } + + /* + * The primary efi partition label is damaged however the secondary + * label at the end of the device is intact. Rather than use this + * label we should play it safe and treat this as a non efi device. + */ + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + + if (force) { + /* Partitions will now be created using the backup */ + return (0); + } else { + vdev_error(gettext("%s contains a corrupt primary " + "EFI label.\n"), path); + return (-1); + } + } + + for (i = 0; i < vtoc->efi_nparts; i++) { + + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || + uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) + continue; + + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, "-part", i+1); + else + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, isdigit(path[strlen(path)-1]) ? + "p" : "", i+1); + + err = check_slice(slice_path, cache, force, isspare); + if (err) + break; + } + + efi_free(vtoc); + (void) close(fd); + + return (err); +} + +int +check_device(const char *path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + blkid_cache cache; + int error; + + error = blkid_get_cache(&cache, NULL); + if (error != 0) { + (void) fprintf(stderr, gettext("unable to access the blkid " + "cache.\n")); + return (-1); + } + + error = check_disk(path, cache, force, isspare, iswholedisk); + blkid_put_cache(cache); + + return (error); +} diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/README b/sys/contrib/openzfs/cmd/zpool/zpool.d/README new file mode 100644 index 000000000000..033b7c363f5a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/README @@ -0,0 +1,9 @@ +This directory contains scripts that can be run with the zpool status/iostat +-c option: + + zpool status -c script1,script2, ... 
+ + zpool iostat -vc script1,script2, ... + +Some scripts output different values depending on the symlink name that is +used to run them. See the zpool(8) man page for more details. diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/defect b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps new file mode 100755 index 000000000000..ee39514e4d92 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Show device mapper dependent / underlying devices. This is useful for +# looking up the /dev/sd* devices associated with a dm or multipath device. +# + +if [ "$1" = "-h" ] ; then + echo "Show device mapper dependent (underlying) devices." + exit +fi + +dev="$VDEV_PATH" + +# If the VDEV path is a symlink, resolve it to a real device +if [ -L "$dev" ] ; then + dev=$(readlink "$dev") +fi + +dev=$(basename "$dev") +val="" +if [ -d "/sys/class/block/$dev/slaves" ] ; then + # ls -C: output in columns, no newlines + val=$(ls -C "/sys/class/block/$dev/slaves") + + # ls -C will print two spaces between files; change to one space. + val=$(echo "$val" | sed -r 's/[[:blank:]]+/ /g') +fi + +echo "dm-deps=$val" diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/enc b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc @@ -0,0 +1 @@ +ses
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev @@ -0,0 +1 @@ +ses
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led @@ -0,0 +1 @@ +ses
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/health b/sys/contrib/openzfs/cmd/zpool/zpool.d/health new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/health @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat new file mode 100755 index 000000000000..41a3acfae7a4 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat @@ -0,0 +1,77 @@ +#!/bin/sh +# +# Display most relevant iostat bandwidth/latency numbers. The output is +# dependent on the name of the script/symlink used to call it. +# + +helpstr=" +iostat: Show iostat values since boot (summary page). +iostat-1s: Do a single 1-second iostat sample and show values. +iostat-10s: Do a single 10-second iostat sample and show values." + +script=$(basename "$0") +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "iostat-1s" ] ; then + # Do a single one-second sample + interval=1 + # Don't show summary stats + brief="yes" +elif [ "$script" = "iostat-10s" ] ; then + # Do a single ten-second sample + interval=10 + # Don't show summary stats + brief="yes" +fi + +if [ -f "$VDEV_UPATH" ] ; then + # We're a file-based vdev, iostat doesn't work on us. Do nothing. + exit +fi + +if [ "$(uname)" = "FreeBSD" ]; then + out=$(iostat -dKx \ + ${interval:+"-w $interval"} \ + ${interval:+"-c 1"} \ + "$VDEV_UPATH" | tail -n 2) +else + out=$(iostat -kx \ + ${brief:+"-y"} \ + ${interval:+"$interval"} \ + ${interval:+"1"} \ + "$VDEV_UPATH" | awk NF | tail -n 2) +fi + + +# Sample output (we want the last two lines): +# +# Linux 2.6.32-642.13.1.el6.x86_64 (centos68) 03/09/2017 _x86_64_ (6 CPU) +# +# avg-cpu: %user %nice %system %iowait %steal %idle +# 0.00 0.00 0.00 0.00 0.00 100.00 +# +# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util +# sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +# + +# Get the column names +cols=$(echo "$out" | head -n 1) + +# Get the values and tab separate them to make them cut-able. 
+vals=$(echo "$out" | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g') + +i=0 +for col in $cols ; do + i=$((i+1)) + # Skip the first column since it's just the device name + if [ $i -eq 1 ]; then + continue + fi + + # Get i'th value + val=$(echo "$vals" | cut -f "$i") + echo "$col=$val" +done diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s new file mode 120000 index 000000000000..084278d99f0f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s @@ -0,0 +1 @@ +iostat
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s new file mode 120000 index 000000000000..084278d99f0f --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s @@ -0,0 +1 @@ +iostat
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/label b/sys/contrib/openzfs/cmd/zpool/zpool.d/label new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/label @@ -0,0 +1 @@ +lsblk
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led @@ -0,0 +1 @@ +ses
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk new file mode 100755 index 000000000000..1cdef40494fe --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk @@ -0,0 +1,83 @@ +#!/bin/sh +# +# Print some common lsblk values +# +# Any (lowercased) name symlinked to the lsblk script will be passed to lsblk +# as one of its --output names. Here's a partial list of --output names +# from the lsblk binary: +# +# Available columns (for --output): +# NAME device name +# KNAME internal kernel device name +# MAJ:MIN major:minor device number +# FSTYPE filesystem type +# MOUNTPOINT where the device is mounted +# LABEL filesystem LABEL +# UUID filesystem UUID +# RA read-ahead of the device +# RO read-only device +# RM removable device +# MODEL device identifier +# SIZE size of the device +# STATE state of the device +# OWNER user name +# GROUP group name +# MODE device node permissions +# ALIGNMENT alignment offset +# MIN-IO minimum I/O size +# OPT-IO optimal I/O size +# PHY-SEC physical sector size +# LOG-SEC logical sector size +# ROTA rotational device +# SCHED I/O scheduler name +# RQ-SIZE request queue size +# TYPE device type +# DISC-ALN discard alignment offset +# DISC-GRAN discard granularity +# DISC-MAX discard max bytes +# DISC-ZERO discard zeroes data +# +# If the script is run as just 'lsblk' then print out disk size, vendor, +# and model number. + + +helpstr=" +label: Show filesystem label. +model: Show disk model number. +size: Show the disk capacity. +vendor: Show the disk vendor. +lsblk: Show the disk size, vendor, and model number." + +script=$(basename "$0") + +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "lsblk" ] ; then + list="size vendor model" +else + list=$(echo "$script" | tr '[:upper:]' '[:lower:]') +fi + +# Older versions of lsblk don't support all these values (like SERIAL). 
+for i in $list ; do + + # Special case: Looking up the size of a file-based vdev can't + # be done with lsblk. + if [ "$i" = "size" ] && [ -f "$VDEV_UPATH" ] ; then + size=$(du -h --apparent-size "$VDEV_UPATH" | cut -f 1) + echo "size=$size" + continue + fi + + + val="" + if val=$(eval "lsblk -dl -n -o $i $VDEV_UPATH 2>/dev/null") ; then + # Remove leading/trailing whitespace from value + val=$(echo "$val" | sed -e 's/^[[:space:]]*//' \ + -e 's/[[:space:]]*$//') + fi + echo "$i=$val" +done diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/media b/sys/contrib/openzfs/cmd/zpool/zpool.d/media new file mode 100755 index 000000000000..05bc15918bc9 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/media @@ -0,0 +1,27 @@ +#!/bin/sh +# +# Print out the type of device +# + +if [ "$1" = "-h" ] ; then + echo "Show whether a vdev is a file, hdd, or ssd." + exit +fi + +if [ -b "$VDEV_UPATH" ]; then + device=$(basename "$VDEV_UPATH") + val=$(cat "/sys/block/$device/queue/rotational" 2>/dev/null) + if [ "$val" = "0" ]; then + MEDIA="ssd" + fi + + if [ "$val" = "1" ]; then + MEDIA="hdd" + fi +else + if [ -f "$VDEV_UPATH" ]; then + MEDIA="file" + fi +fi + +echo "media=$MEDIA" diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/model b/sys/contrib/openzfs/cmd/zpool/zpool.d/model new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/model @@ -0,0 +1 @@ +lsblk
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/serial b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ses b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses new file mode 100755 index 000000000000..f6b7520dfb6c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses @@ -0,0 +1,52 @@ +#!/bin/sh +# +# Print SCSI Enclosure Services (SES) info. The output is dependent on the name +# of the script/symlink used to call it. +# +helpstr=" +enc: Show disk enclosure w:x:y:z value. +slot: Show disk slot number as reported by the enclosure. +encdev: Show /dev/sg* device associated with the enclosure disk slot. +fault_led: Show value of the disk enclosure slot fault LED. +locate_led: Show value of the disk enclosure slot locate LED. +ses: Show disk's enc, enc device, slot, and fault/locate LED values." + +script=$(basename "$0") +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "ses" ] ; then + scripts='enc encdev slot fault_led locate_led' +else + scripts="$script" +fi + +for i in $scripts ; do + if [ -z "$VDEV_ENC_SYSFS_PATH" ] ; then + echo "$i=" + continue + fi + + val="" + case $i in + enc) + val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + ;; + slot) + val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + ;; + encdev) + val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) + ;; + fault_led) + val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + ;; + locate_led) + val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null) + ;; + esac + echo "$i=$val" +done + diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/size b/sys/contrib/openzfs/cmd/zpool/zpool.d/size new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/size @@ -0,0 +1 @@ +lsblk
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/slot b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot @@ -0,0 +1 @@ +ses
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart new file mode 100755 index 000000000000..f8854b75227c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart @@ -0,0 +1,243 @@ +#!/bin/sh +# +# Show SMART stats +# + +helpstr=" +smart: Show SMART temperature and error stats (specific to drive type) +smartx: Show SMART extended drive stats (specific to drive type). +temp: Show SMART drive temperature in celsius (all drives). +health: Show reported SMART status (all drives). +r_proc: Show SMART read GBytes processed over drive lifetime (SAS). +w_proc: Show SMART write GBytes processed over drive lifetime (SAS). +r_ucor: Show SMART read uncorrectable errors (SAS). +w_ucor: Show SMART write uncorrectable errors (SAS). +nonmed: Show SMART non-medium errors (SAS). +defect: Show SMART grown defect list (SAS). +hours_on: Show number of hours drive powered on (all drives). +realloc: Show SMART reallocated sectors count (ATA). +rep_ucor: Show SMART reported uncorrectable count (ATA). +cmd_to: Show SMART command timeout count (ATA). +pend_sec: Show SMART current pending sector count (ATA). +off_ucor: Show SMART offline uncorrectable errors (ATA). +ata_err: Show SMART ATA errors (ATA). +pwr_cyc: Show SMART power cycle count (ATA). +serial: Show disk serial number. +nvme_err: Show SMART NVMe errors (NVMe). +smart_test: Show SMART self-test results summary. +test_type: Show SMART self-test type (short, long... ). +test_status: Show SMART self-test status. +test_progress: Show SMART self-test percentage done. +test_ended: Show when the last SMART self-test ended (if supported). +" + +# Hack for developer testing +# +# If you set $samples to a directory containing smartctl output text files, +# we will use them instead of running smartctl on the vdevs. This can be +# useful if you want to test a bunch of different smartctl outputs. 
# Also, if $samples is set, an additional 'file' column is added to the zpool
# output showing the filename.
samples=

# get_filename_from_dir DIR
#
# Look in directory DIR and print the name of one regular file from it.
# The file is chosen quasi-sequentially (based off our PID), so that each
# invocation of this script (one per vdev) tends to pick a different file
# without having to maintain any state.
#
# Only files directly inside DIR are considered, both when counting and when
# selecting, so the printed name always exists as "$DIR/<name>".
# Prints nothing and returns 1 if DIR contains no regular files.
get_filename_from_dir()
{
	dir=$1
	pid="$$"
	num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)

	# Nothing to pick from; this also avoids a modulo by zero below.
	if [ "$num_files" -eq 0 ] ; then
		return 1
	fi

	mod=$((pid % num_files))
	i=0
	# Same -maxdepth as the count above, so $mod always indexes a file
	# that was actually counted.
	find "$dir" -maxdepth 1 -type f -printf "%f\n" | while read -r file ; do
		if [ "$mod" = "$i" ] ; then
			echo "$file"
			break
		fi
		i=$((i+1))
	done
}

script=$(basename "$0")

if [ "$1" = "-h" ] ; then
	echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
	exit
fi

smartctl_path=$(command -v smartctl)

# shellcheck disable=SC2015
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
	if [ -n "$samples" ] ; then
		# cat a smartctl output text file instead of running smartctl
		# on a vdev (only used for developer testing).
		file=$(get_filename_from_dir "$samples")
		echo "file=$file"
		raw_out=$(cat "$samples/$file")
	else
		# Run smartctl directly with quoted arguments.  Going through
		# 'eval' would re-parse the device path as shell input, so a
		# path containing whitespace or metacharacters would be split
		# or executed.
		raw_out=$(sudo "$smartctl_path" -a "$VDEV_UPATH")
	fi

	# What kind of drive are we?  Look for the right line in smartctl:
	#
	# SAS:
	#	Transport protocol:   SAS
	#
	# SATA:
	#	ATA Version is:   8
	#
	# NVMe:
	#	SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
	#
	out=$(echo "$raw_out" | awk '
# SAS specific
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
/Non-medium error count/{print "nonmed="$4}
/Elements in grown defect list/{print "defect="$6}

# SAS common
/SAS/{type="sas"}
/Drive Temperature:/{print "temp="$4}
# Status can be a long string, substitute spaces for '_'
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
/Serial number:/{print "serial="$3}

# SATA specific
/Reallocated_Sector_Ct/{print "realloc="$10}
/Reported_Uncorrect/{print "rep_ucor="$10}
/Command_Timeout/{print "cmd_to="$10}
/Current_Pending_Sector/{print "pend_sec="$10}
/Offline_Uncorrectable/{print "off_ucor="$10}
/ATA Error Count:/{print "ata_err="$4}
/Power_Cycle_Count/{print "pwr_cyc="$10}

# SATA common
/SATA/{type="sata"}
/Temperature_Celsius/{print "temp="$10}
/Airflow_Temperature_Cel/{print "temp="$10}
/Current Temperature:/{print "temp="$3}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
/Serial Number:/{print "serial="$3}

# NVMe common
/NVMe/{type="nvme"}
/Temperature:/{print "temp="$2}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
/Serial Number:/{print "serial="$3}
/Power Cycles:/{print "pwr_cyc="$3}

# NVMe specific
/Media and Data Integrity Errors:/{print "nvme_err="$6}

# SMART self-test info
/Self-test execution status:/{progress=tolower($4)} # SAS
/SMART Self-test log/{test_seen=1} # SAS
/SMART Extended Self-test Log/{test_seen=1} # SATA
/# 1/{
	test_type=tolower($3"_"$4);
	# Status could be one word ("Completed") or multiple ("Completed: read
	# failure").  Look for the ":" to see if we need to grab more words.

	if ($5 ~ ":")
		status=tolower($5""$6"_"$7)
	else
		status=tolower($5)
	if (status=="self")
		status="running";

	if (type == "sas") {
		hours=int($(NF-4))
	} else {
		hours=int($(NF-1))
		# SATA reports percent remaining, rather than percent done
		# Convert it to percent done.
		progress=(100-int($(NF-2)))"%"
	}
	# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
	# 0.  In those cases, set it to hours_on, so they will cancel out in
	# the "hours_ago" calculation later on.
	if (hours == 0)
		hours=hours_on

	if (test_seen) {
		print "test="hours_on
		print "test_type="test_type
		print "test_status="status
		print "test_progress="progress
	}
	# Not all drives report hours_on
	if (hours_on && hours) {
		total_hours_ago=(hours_on-hours)
		days_ago=int(total_hours_ago/24)
		hours_ago=(total_hours_ago % 24)
		if (days_ago != 0)
			ago_str=days_ago"d"
		if (hours_ago !=0)
			ago_str=ago_str""hours_ago"h"
		print "test_ended="ago_str
	}
}

END {print "type="type; ORS="\n"; print ""}
');
fi
type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)

# If type is not set by now, either we don't have a block device
# or smartctl failed. Either way, default to ATA and set $out to
# nothing.
+if [ -z "$type" ]; then + type="sata" + out= +fi + +case $script in +smart) + # Print temperature plus common predictors of drive failure + if [ "$type" = "sas" ] ; then + scripts="temp|health|r_ucor|w_ucor" + elif [ "$type" = "sata" ] ; then + scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor" + elif [ "$type" = "nvme" ] ; then + scripts="temp|health|nvme_err" + fi + ;; +smartx) + # Print some other interesting stats + if [ "$type" = "sas" ] ; then + scripts="hours_on|defect|nonmed|r_proc|w_proc" + elif [ "$type" = "sata" ] ; then + scripts="hours_on|pwr_cyc" + elif [ "$type" = "nvme" ] ; then + scripts="hours_on|pwr_cyc" + fi + ;; +smart_test) + scripts="test_type|test_status|test_progress|test_ended" + ;; +*) + scripts="$script" +esac + +with_vals=$(echo "$out" | grep -E "$scripts") +if [ -n "$with_vals" ]; then + echo "$with_vals" + without_vals=$(echo "$scripts" | tr "|" "\n" | + grep -v -E "$(echo "$with_vals" | + awk -F "=" '{print $1}')" | awk '{print $0"="}') +else + without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}') +fi + +if [ -n "$without_vals" ]; then + echo "$without_vals" +fi diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/temp b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/upath b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath new file mode 100755 index 000000000000..16a4327d4850 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath @@ -0,0 +1,7 @@ +#!/bin/sh +if [ "$1" = "-h" ] ; then + echo "Show the underlying path for a device." + exit +fi + +echo upath="$VDEV_UPATH" diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor @@ -0,0 +1 @@ +lsblk
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor @@ -0,0 +1 @@ +smart
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c new file mode 100644 index 000000000000..5f3153bca2c2 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -0,0 +1,757 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + */ + +#include <libintl.h> +#include <libuutil.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <thread_pool.h> + +#include <libzfs.h> +#include <libzutil.h> +#include <sys/zfs_context.h> +#include <sys/wait.h> + +#include "zpool_util.h" + +/* + * Private interface for iterating over pools specified on the command line. + * Most consumers will call for_each_pool, but in order to support iostat, we + * allow fined grained control through the zpool_list_t interface. 
+ */ + +typedef struct zpool_node { + zpool_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; + int zn_mark; +} zpool_node_t; + +struct zpool_list { + boolean_t zl_findall; + uu_avl_t *zl_avl; + uu_avl_pool_t *zl_pool; + zprop_list_t **zl_proplist; +}; + +/* ARGSUSED */ +static int +zpool_compare(const void *larg, const void *rarg, void *unused) +{ + zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; + zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; + const char *lname = zpool_get_name(l); + const char *rname = zpool_get_name(r); + + return (strcmp(lname, rname)); +} + +/* + * Callback function for pool_list_get(). Adds the given pool to the AVL tree + * of known pools. + */ +static int +add_pool(zpool_handle_t *zhp, void *data) +{ + zpool_list_t *zlp = data; + zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); + uu_avl_index_t idx; + + node->zn_handle = zhp; + uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); + if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { + if (zlp->zl_proplist && + zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_close(zhp); + free(node); + return (-1); + } + uu_avl_insert(zlp->zl_avl, node, idx); + } else { + zpool_close(zhp); + free(node); + return (-1); + } + + return (0); +} + +/* + * Create a list of pools based on the given arguments. If we're given no + * arguments, then iterate over all pools in the system and add them to the AVL + * tree. Otherwise, add only those pool explicitly specified on the command + * line. 
+ */ +zpool_list_t * +pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +{ + zpool_list_t *zlp; + + zlp = safe_malloc(sizeof (zpool_list_t)); + + zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), + offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); + + if (zlp->zl_pool == NULL) + zpool_no_memory(); + + if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, + UU_DEFAULT)) == NULL) + zpool_no_memory(); + + zlp->zl_proplist = proplist; + + if (argc == 0) { + (void) zpool_iter(g_zfs, add_pool, zlp); + zlp->zl_findall = B_TRUE; + } else { + int i; + + for (i = 0; i < argc; i++) { + zpool_handle_t *zhp; + + if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != + NULL) { + if (add_pool(zhp, zlp) != 0) + *err = B_TRUE; + } else { + *err = B_TRUE; + } + } + } + + return (zlp); +} + +/* + * Search for any new pools, adding them to the list. We only add pools when no + * options were given on the command line. Otherwise, we keep the list fixed as + * those that were explicitly specified. + */ +void +pool_list_update(zpool_list_t *zlp) +{ + if (zlp->zl_findall) + (void) zpool_iter(g_zfs, add_pool, zlp); +} + +/* + * Iterate over all pools in the list, executing the callback for each + */ +int +pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, + void *data) +{ + zpool_node_t *node, *next_node; + int ret = 0; + + for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { + next_node = uu_avl_next(zlp->zl_avl, node); + if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || + unavail) + ret |= func(node->zn_handle, data); + } + + return (ret); +} + +/* + * Remove the given pool from the list. When running iostat, we want to remove + * those pools that no longer exist. 
+ */ +void +pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) +{ + zpool_node_t search, *node; + + search.zn_handle = zhp; + if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } +} + +/* + * Free all the handles associated with this list. + */ +void +pool_list_free(zpool_list_t *zlp) +{ + uu_avl_walk_t *walk; + zpool_node_t *node; + + if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { + (void) fprintf(stderr, + gettext("internal error: out of memory")); + exit(1); + } + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(zlp->zl_avl); + uu_avl_pool_destroy(zlp->zl_pool); + + free(zlp); +} + +/* + * Returns the number of elements in the pool list. + */ +int +pool_list_count(zpool_list_t *zlp) +{ + return (uu_avl_numnodes(zlp->zl_avl)); +} + +/* + * High level function which iterates over all pools given on the command line, + * using the pool_list_* interfaces. 
+ */ +int +for_each_pool(int argc, char **argv, boolean_t unavail, + zprop_list_t **proplist, zpool_iter_f func, void *data) +{ + zpool_list_t *list; + int ret = 0; + + if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + return (1); + + if (pool_list_iter(list, unavail, func, data) != 0) + ret = 1; + + pool_list_free(list); + + return (ret); +} + +static int +for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], func, + data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * This is the equivalent of for_each_pool() for vdevs. It iterates thorough + * all vdevs in the pool, ignoring root vdevs and holes, calling func() on + * each one. + * + * @zhp: Zpool handle + * @func: Function to call on each vdev + * @data: Custom data to pass to the function + */ +int +for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) +{ + nvlist_t *config, *nvroot = NULL; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + return (for_each_vdev_cb(zhp, nvroot, func, data)); +} + +/* + * Process the vcdl->vdev_cmd_data[] array to figure out all the unique column + * names and their widths. 
When this function is done, vcdl->uniq_cols, + * vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be filled in. + */ +static void +process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl) +{ + char **uniq_cols = NULL, **tmp = NULL; + int *uniq_cols_width; + vdev_cmd_data_t *data; + int cnt = 0; + int k; + + /* For each vdev */ + for (int i = 0; i < vcdl->count; i++) { + data = &vcdl->data[i]; + /* For each column the vdev reported */ + for (int j = 0; j < data->cols_cnt; j++) { + /* Is this column in our list of unique column names? */ + for (k = 0; k < cnt; k++) { + if (strcmp(data->cols[j], uniq_cols[k]) == 0) + break; /* yes it is */ + } + if (k == cnt) { + /* No entry for column, add to list */ + tmp = realloc(uniq_cols, sizeof (*uniq_cols) * + (cnt + 1)); + if (tmp == NULL) + break; /* Nothing we can do... */ + uniq_cols = tmp; + uniq_cols[cnt] = data->cols[j]; + cnt++; + } + } + } + + /* + * We now have a list of all the unique column names. Figure out the + * max width of each column by looking at the column name and all its + * values. + */ + uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt); + for (int i = 0; i < cnt; i++) { + /* Start off with the column title's width */ + uniq_cols_width[i] = strlen(uniq_cols[i]); + /* For each vdev */ + for (int j = 0; j < vcdl->count; j++) { + /* For each of the vdev's values in a column */ + data = &vcdl->data[j]; + for (k = 0; k < data->cols_cnt; k++) { + /* Does this vdev have a value for this col? */ + if (strcmp(data->cols[k], uniq_cols[i]) == 0) { + /* Is the value width larger? 
*/ + uniq_cols_width[i] = + MAX(uniq_cols_width[i], + strlen(data->lines[k])); + } + } + } + } + + vcdl->uniq_cols = uniq_cols; + vcdl->uniq_cols_cnt = cnt; + vcdl->uniq_cols_width = uniq_cols_width; +} + + +/* + * Process a line of command output + * + * When running 'zpool iostat|status -c' the lines of output can either be + * in the form of: + * + * column_name=value + * + * Or just: + * + * value + * + * Process the column_name (if any) and value. + * + * Returns 0 if line was processed, and there are more lines can still be + * processed. + * + * Returns 1 if this was the last line to process, or error. + */ +static int +vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) +{ + char *col = NULL; + char *val = line; + char *equals; + char **tmp; + + if (line == NULL) + return (1); + + equals = strchr(line, '='); + if (equals != NULL) { + /* + * We have a 'column=value' type line. Split it into the + * column and value strings by turning the '=' into a '\0'. + */ + *equals = '\0'; + col = line; + val = equals + 1; + } else { + val = line; + } + + /* Do we already have a column by this name? If so, skip it. */ + if (col != NULL) { + for (int i = 0; i < data->cols_cnt; i++) { + if (strcmp(col, data->cols[i]) == 0) + return (0); /* Duplicate, skip */ + } + } + + if (val != NULL) { + tmp = realloc(data->lines, + (data->lines_cnt + 1) * sizeof (*data->lines)); + if (tmp == NULL) + return (1); + + data->lines = tmp; + data->lines[data->lines_cnt] = strdup(val); + data->lines_cnt++; + } + + if (col != NULL) { + tmp = realloc(data->cols, + (data->cols_cnt + 1) * sizeof (*data->cols)); + if (tmp == NULL) + return (1); + + data->cols = tmp; + data->cols[data->cols_cnt] = strdup(col); + data->cols_cnt++; + } + + if (val != NULL && col == NULL) + return (1); + + return (0); +} + +/* + * Run the cmd and store results in *data. 
+ */ +static void +vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) +{ + int rc; + char *argv[2] = {cmd, 0}; + char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL, + NULL}; + char **lines = NULL; + int lines_cnt = 0; + int i; + + /* Setup our custom environment variables */ + rc = asprintf(&env[1], "VDEV_PATH=%s", + data->path ? data->path : ""); + if (rc == -1) + goto out; + + rc = asprintf(&env[2], "VDEV_UPATH=%s", + data->upath ? data->upath : ""); + if (rc == -1) + goto out; + + rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", + data->vdev_enc_sysfs_path ? + data->vdev_enc_sysfs_path : ""); + if (rc == -1) + goto out; + + /* Run the command */ + rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, + &lines_cnt); + if (rc != 0) + goto out; + + /* Process the output we got */ + for (i = 0; i < lines_cnt; i++) + if (vdev_process_cmd_output(data, lines[i]) != 0) + break; + +out: + if (lines != NULL) + libzfs_free_str_array(lines, lines_cnt); + + /* Start with i = 1 since env[0] was statically allocated */ + for (i = 1; i < ARRAY_SIZE(env); i++) + if (env[i] != NULL) + free(env[i]); +} + +/* + * Generate the search path for zpool iostat/status -c scripts. + * The string returned must be freed. 
+ */ +char * +zpool_get_cmd_search_path(void) +{ + const char *env; + char *sp = NULL; + + env = getenv("ZPOOL_SCRIPTS_PATH"); + if (env != NULL) + return (strdup(env)); + + env = getenv("HOME"); + if (env != NULL) { + if (asprintf(&sp, "%s/.zpool.d:%s", + env, ZPOOL_SCRIPTS_DIR) != -1) { + return (sp); + } + } + + if (asprintf(&sp, "%s", ZPOOL_SCRIPTS_DIR) != -1) + return (sp); + + return (NULL); +} + +/* Thread function run for each vdev */ +static void +vdev_run_cmd_thread(void *cb_cmd_data) +{ + vdev_cmd_data_t *data = cb_cmd_data; + char *cmd = NULL, *cmddup, *cmdrest; + + cmddup = strdup(data->cmd); + if (cmddup == NULL) + return; + + cmdrest = cmddup; + while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) { + char *dir = NULL, *sp, *sprest; + char fullpath[MAXPATHLEN]; + + if (strchr(cmd, '/') != NULL) + continue; + + sp = zpool_get_cmd_search_path(); + if (sp == NULL) + continue; + + sprest = sp; + while ((dir = strtok_r(sprest, ":", &sprest))) { + if (snprintf(fullpath, sizeof (fullpath), + "%s/%s", dir, cmd) == -1) + continue; + + if (access(fullpath, X_OK) == 0) { + vdev_run_cmd(data, fullpath); + break; + } + } + free(sp); + } + free(cmddup); +} + +/* For each vdev in the pool run a command */ +static int +for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) +{ + vdev_cmd_data_list_t *vcdl = cb_vcdl; + vdev_cmd_data_t *data; + char *path = NULL; + char *vname = NULL; + char *vdev_enc_sysfs_path = NULL; + int i, match = 0; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return (1); + + nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &vdev_enc_sysfs_path); + + /* Spares show more than once if they're in use, so skip if exists */ + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) == 0) && + (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) { + /* vdev already exists, skip it */ + return (0); + } + } + + /* Check for selected vdevs here, if any */ + for (i = 0; i < 
vcdl->vdev_names_count; i++) { + vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); + if (strcmp(vcdl->vdev_names[i], vname) == 0) { + free(vname); + match = 1; + break; /* match */ + } + free(vname); + } + + /* If we selected vdevs, and this isn't one of them, then bail out */ + if (!match && vcdl->vdev_names_count) + return (0); + + /* + * Resize our array and add in the new element. + */ + if (!(vcdl->data = realloc(vcdl->data, + sizeof (*vcdl->data) * (vcdl->count + 1)))) + return (ENOMEM); /* couldn't realloc */ + + data = &vcdl->data[vcdl->count]; + + data->pool = strdup(zpool_get_name(zhp)); + data->path = strdup(path); + data->upath = zfs_get_underlying_path(path); + data->cmd = vcdl->cmd; + data->lines = data->cols = NULL; + data->lines_cnt = data->cols_cnt = 0; + if (vdev_enc_sysfs_path) + data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path); + else + data->vdev_enc_sysfs_path = NULL; + + vcdl->count++; + + return (0); +} + +/* Get the names and count of the vdevs */ +static int +all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl) +{ + return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl)); +} + +/* + * Now that vcdl is populated with our complete list of vdevs, spawn + * off the commands. + */ +static void +all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl) +{ + tpool_t *t; + + t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); + if (t == NULL) + return; + + /* Spawn off the command for each vdev */ + for (int i = 0; i < vcdl->count; i++) { + (void) tpool_dispatch(t, vdev_run_cmd_thread, + (void *) &vcdl->data[i]); + } + + /* Wait for threads to finish */ + tpool_wait(t); + tpool_destroy(t); +} + +/* + * Run command 'cmd' on all vdevs in all pools in argv. Saves the first line of + * output from the command in vcdk->data[].line for all vdevs. If you want + * to run the command on only certain vdevs, fill in g_zfs, vdev_names, + * vdev_names_count, and cb_name_flags. Otherwise leave them as zero. 
+ * + * Returns a vdev_cmd_data_list_t that must be freed with + * free_vdev_cmd_data_list(); + */ +vdev_cmd_data_list_t * +all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, + libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, + int cb_name_flags) +{ + vdev_cmd_data_list_t *vcdl; + vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t)); + vcdl->cmd = cmd; + + vcdl->vdev_names = vdev_names; + vcdl->vdev_names_count = vdev_names_count; + vcdl->cb_name_flags = cb_name_flags; + vcdl->g_zfs = g_zfs; + + /* Gather our list of all vdevs in all pools */ + for_each_pool(argc, argv, B_TRUE, NULL, + all_pools_for_each_vdev_gather_cb, vcdl); + + /* Run command on all vdevs in all pools */ + all_pools_for_each_vdev_run_vcdl(vcdl); + + /* + * vcdl->data[] now contains all the column names and values for each + * vdev. We need to process that into a master list of unique column + * names, and figure out the width of each column. + */ + process_unique_cmd_columns(vcdl); + + return (vcdl); +} + +/* + * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run() + */ +void +free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl) +{ + free(vcdl->uniq_cols); + free(vcdl->uniq_cols_width); + + for (int i = 0; i < vcdl->count; i++) { + free(vcdl->data[i].path); + free(vcdl->data[i].pool); + free(vcdl->data[i].upath); + + for (int j = 0; j < vcdl->data[i].lines_cnt; j++) + free(vcdl->data[i].lines[j]); + + free(vcdl->data[i].lines); + + for (int j = 0; j < vcdl->data[i].cols_cnt; j++) + free(vcdl->data[i].cols[j]); + + free(vcdl->data[i].cols); + free(vcdl->data[i].vdev_enc_sysfs_path); + } + free(vcdl->data); + free(vcdl); +} diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c new file mode 100644 index 000000000000..f3756a5d9547 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -0,0 +1,10326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * 
Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 by Cyril Plisko. All rights reserved. + * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. 
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com> + */ + +#include <assert.h> +#include <ctype.h> +#include <dirent.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <libgen.h> +#include <libintl.h> +#include <libuutil.h> +#include <locale.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <time.h> +#include <unistd.h> +#include <pwd.h> +#include <zone.h> +#include <sys/wait.h> +#include <zfs_prop.h> +#include <sys/fs/zfs.h> +#include <sys/stat.h> +#include <sys/systeminfo.h> +#include <sys/fm/fs/zfs.h> +#include <sys/fm/util.h> +#include <sys/fm/protocol.h> +#include <sys/zfs_ioctl.h> +#include <sys/mount.h> +#include <sys/sysmacros.h> + +#include <math.h> + +#include <libzfs.h> +#include <libzutil.h> + +#include "zpool_util.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +#include "statcommon.h" + +libzfs_handle_t *g_zfs; + +static int zpool_do_create(int, char **); +static int zpool_do_destroy(int, char **); + +static int zpool_do_add(int, char **); +static int zpool_do_remove(int, char **); +static int zpool_do_labelclear(int, char **); + +static int zpool_do_checkpoint(int, char **); + +static int zpool_do_list(int, char **); +static int zpool_do_iostat(int, char **); +static int zpool_do_status(int, char **); + +static int zpool_do_online(int, char **); +static int zpool_do_offline(int, char **); +static int zpool_do_clear(int, char **); +static int zpool_do_reopen(int, char **); + +static int zpool_do_reguid(int, char **); + +static int zpool_do_attach(int, char **); +static int zpool_do_detach(int, char **); +static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); + +static int zpool_do_initialize(int, char **); +static int zpool_do_scrub(int, char **); +static int zpool_do_resilver(int, char **); +static int zpool_do_trim(int, char **); + +static int zpool_do_import(int, char **); +static int zpool_do_export(int, 
char **); + +static int zpool_do_upgrade(int, char **); + +static int zpool_do_history(int, char **); +static int zpool_do_events(int, char **); + +static int zpool_do_get(int, char **); +static int zpool_do_set(int, char **); + +static int zpool_do_sync(int, char **); + +static int zpool_do_version(int, char **); + +static int zpool_do_wait(int, char **); + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +typedef enum { + HELP_ADD, + HELP_ATTACH, + HELP_CLEAR, + HELP_CREATE, + HELP_CHECKPOINT, + HELP_DESTROY, + HELP_DETACH, + HELP_EXPORT, + HELP_HISTORY, + HELP_IMPORT, + HELP_IOSTAT, + HELP_LABELCLEAR, + HELP_LIST, + HELP_OFFLINE, + HELP_ONLINE, + HELP_REPLACE, + HELP_REMOVE, + HELP_INITIALIZE, + HELP_SCRUB, + HELP_RESILVER, + HELP_TRIM, + HELP_STATUS, + HELP_UPGRADE, + HELP_EVENTS, + HELP_GET, + HELP_SET, + HELP_SPLIT, + HELP_SYNC, + HELP_REGUID, + HELP_REOPEN, + HELP_VERSION, + HELP_WAIT +} zpool_help_t; + + +/* + * Flags for stats to display with "zpool iostats" + */ +enum iostat_type { + IOS_DEFAULT = 0, + IOS_LATENCY = 1, + IOS_QUEUES = 2, + IOS_L_HISTO = 3, + IOS_RQ_HISTO = 4, + IOS_COUNT, /* always last element */ +}; + +/* iostat_type entries as bitmasks */ +#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) +#define IOS_LATENCY_M (1ULL << IOS_LATENCY) +#define IOS_QUEUES_M (1ULL << IOS_QUEUES) +#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) +#define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) + +/* Mask of all the histo bits */ +#define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) + +/* + * Lookup table for iostat flags to nvlist names. Basically a list + * of all the nvlists a flag requires. Also specifies the order in + * which data gets printed in zpool iostat. 
+ */ +static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { + [IOS_L_HISTO] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_LATENCY] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_QUEUES] = { + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + NULL}, + [IOS_RQ_HISTO] = { + ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + NULL}, +}; + + +/* + * Given a cb->cb_flags with a histogram bit set, return the iostat_type. + * Right now, only one histo bit is ever set at one time, so we can + * just do a highbit64(a) + */ +#define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) + +typedef struct zpool_command { + const char *name; + int (*func)(int, char **); + zpool_help_t usage; +} zpool_command_t; + +/* + * Master command table. Each ZFS command has a name, associated function, and + * usage message. 
The usage messages need to be internationalized, so we have + * to have a function to return the usage message based on a command index. + * + * These commands are organized according to how they are displayed in the usage + * message. An empty command (one with a NULL name) indicates an empty line in + * the generic usage message. + */ +static zpool_command_t command_table[] = { + { "version", zpool_do_version, HELP_VERSION }, + { NULL }, + { "create", zpool_do_create, HELP_CREATE }, + { "destroy", zpool_do_destroy, HELP_DESTROY }, + { NULL }, + { "add", zpool_do_add, HELP_ADD }, + { "remove", zpool_do_remove, HELP_REMOVE }, + { NULL }, + { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, + { NULL }, + { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, + { NULL }, + { "list", zpool_do_list, HELP_LIST }, + { "iostat", zpool_do_iostat, HELP_IOSTAT }, + { "status", zpool_do_status, HELP_STATUS }, + { NULL }, + { "online", zpool_do_online, HELP_ONLINE }, + { "offline", zpool_do_offline, HELP_OFFLINE }, + { "clear", zpool_do_clear, HELP_CLEAR }, + { "reopen", zpool_do_reopen, HELP_REOPEN }, + { NULL }, + { "attach", zpool_do_attach, HELP_ATTACH }, + { "detach", zpool_do_detach, HELP_DETACH }, + { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, + { NULL }, + { "initialize", zpool_do_initialize, HELP_INITIALIZE }, + { "resilver", zpool_do_resilver, HELP_RESILVER }, + { "scrub", zpool_do_scrub, HELP_SCRUB }, + { "trim", zpool_do_trim, HELP_TRIM }, + { NULL }, + { "import", zpool_do_import, HELP_IMPORT }, + { "export", zpool_do_export, HELP_EXPORT }, + { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, + { "reguid", zpool_do_reguid, HELP_REGUID }, + { NULL }, + { "history", zpool_do_history, HELP_HISTORY }, + { "events", zpool_do_events, HELP_EVENTS }, + { NULL }, + { "get", zpool_do_get, HELP_GET }, + { "set", zpool_do_set, HELP_SET }, + { "sync", zpool_do_sync, HELP_SYNC }, + { NULL }, + { "wait", zpool_do_wait, HELP_WAIT }, 
+}; + +#define NCOMMAND (ARRAY_SIZE(command_table)) + +#define VDEV_ALLOC_CLASS_LOGS "logs" + +static zpool_command_t *current_command; +static char history_str[HIS_MAX_RECORD_LEN]; +static boolean_t log_history = B_TRUE; +static uint_t timestamp_fmt = NODATE; + +static const char * +get_usage(zpool_help_t idx) +{ + switch (idx) { + case HELP_ADD: + return (gettext("\tadd [-fgLnP] [-o property=value] " + "<pool> <vdev> ...\n")); + case HELP_ATTACH: + return (gettext("\tattach [-fsw] [-o property=value] " + "<pool> <device> <new-device>\n")); + case HELP_CLEAR: + return (gettext("\tclear [-nF] <pool> [device]\n")); + case HELP_CREATE: + return (gettext("\tcreate [-fnd] [-o property=value] ... \n" + "\t [-O file-system-property=value] ... \n" + "\t [-m mountpoint] [-R root] <pool> <vdev> ...\n")); + case HELP_CHECKPOINT: + return (gettext("\tcheckpoint [-d [-w]] <pool> ...\n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-f] <pool>\n")); + case HELP_DETACH: + return (gettext("\tdetach <pool> <device>\n")); + case HELP_EXPORT: + return (gettext("\texport [-af] <pool> ...\n")); + case HELP_HISTORY: + return (gettext("\thistory [-il] [<pool>] ...\n")); + case HELP_IMPORT: + return (gettext("\timport [-d dir] [-D]\n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " + "[-R root] [-F [-n]] -a\n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " + "[-R root] [-F [-n]]\n" + "\t [--rewind-to-checkpoint] <pool | id> [newpool]\n")); + case HELP_IOSTAT: + return (gettext("\tiostat [[[-c [script1,script2,...]" + "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" + "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" + " [[-n] interval [count]]\n")); + case HELP_LABELCLEAR: + return (gettext("\tlabelclear [-f] <vdev>\n")); + case HELP_LIST: + return (gettext("\tlist [-gHLpPv] [-o property[,...]] " + "[-T d|u] [pool] ... 
\n" + "\t [interval [count]]\n")); + case HELP_OFFLINE: + return (gettext("\toffline [-f] [-t] <pool> <device> ...\n")); + case HELP_ONLINE: + return (gettext("\tonline [-e] <pool> <device> ...\n")); + case HELP_REPLACE: + return (gettext("\treplace [-fsw] [-o property=value] " + "<pool> <device> [new-device]\n")); + case HELP_REMOVE: + return (gettext("\tremove [-npsw] <pool> <device> ...\n")); + case HELP_REOPEN: + return (gettext("\treopen [-n] <pool>\n")); + case HELP_INITIALIZE: + return (gettext("\tinitialize [-c | -s] [-w] <pool> " + "[<device> ...]\n")); + case HELP_SCRUB: + return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n")); + case HELP_RESILVER: + return (gettext("\tresilver <pool> ...\n")); + case HELP_TRIM: + return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> " + "[<device> ...]\n")); + case HELP_STATUS: + return (gettext("\tstatus [-c [script1,script2,...]] " + "[-igLpPstvxD] [-T d|u] [pool] ... \n" + "\t [interval [count]]\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade\n" + "\tupgrade -v\n" + "\tupgrade [-V version] <-a | pool ...>\n")); + case HELP_EVENTS: + return (gettext("\tevents [-vHf [pool] | -c]\n")); + case HELP_GET: + return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " + "<\"all\" | property[,...]> <pool> ...\n")); + case HELP_SET: + return (gettext("\tset <property=value> <pool> \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] <pool> <newpool> " + "[<device> ...]\n")); + case HELP_REGUID: + return (gettext("\treguid <pool>\n")); + case HELP_SYNC: + return (gettext("\tsync [pool] ...\n")); + case HELP_VERSION: + return (gettext("\tversion\n")); + case HELP_WAIT: + return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] " + "<pool> [interval]\n")); + } + + abort(); + /* NOTREACHED */ +} + +static void +zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) +{ + uint_t children = 0; + nvlist_t **child; + uint_t i; + + (void) 
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (children == 0) { + char *path = zpool_vdev_name(g_zfs, zhp, nvroot, + VDEV_NAME_PATH); + + if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && + strcmp(path, VDEV_TYPE_HOLE) != 0) + fnvlist_add_boolean(res, path); + + free(path); + return; + } + + for (i = 0; i < children; i++) { + zpool_collect_leaves(zhp, child[i], res); + } +} + +/* + * Callback routine that will print out a pool property value. + */ +static int +print_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); + + if (zpool_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, " YES "); + + if (zpool_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +static void +usage(boolean_t requested) +{ + FILE *fp = requested ? 
stdout : stderr; + + if (current_command == NULL) { + int i; + + (void) fprintf(fp, gettext("usage: zpool command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + ((strcmp(current_command->name, "set") == 0) || + (strcmp(current_command->name, "get") == 0) || + (strcmp(current_command->name, "list") == 0))) { + + (void) fprintf(fp, + gettext("\nthe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-19s %s %s\n\n", + "PROPERTY", "EDIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_POOL); + + (void) fprintf(fp, "\t%-19s ", "feature@..."); + (void) fprintf(fp, "YES disabled | enabled | active\n"); + + (void) fprintf(fp, gettext("\nThe feature@ properties must be " + "appended with a feature name.\nSee zpool-features(5).\n")); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 0 : 2); +} + +/* + * zpool initialize [-c | -s] [-w] <pool> [<vdev> ...] + * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool + * if none specified. + * + * -c Cancel. Ends active initializing. + * -s Suspend. Initializing can then be restarted with no flags. + * -w Wait. Blocks until initializing has completed. 
+ */ +int +zpool_do_initialize(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + nvlist_t *vdevs; + int err = 0; + boolean_t wait = B_FALSE; + + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; + while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_INITIALIZE_START && + cmd_type != POOL_INITIALIZE_CANCEL) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_CANCEL; + break; + case 's': + if (cmd_type != POOL_INITIALIZE_START && + cmd_type != POOL_INITIALIZE_SUSPEND) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_SUSPEND; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + if (wait && (cmd_type != POOL_INITIALIZE_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + } else { + for (int i = 1; i < argc; i++) { + 
fnvlist_add_boolean(vdevs, argv[i]); + } + } + + if (wait) + err = zpool_initialize_wait(zhp, cmd_type, vdevs); + else + err = zpool_initialize(zhp, cmd_type, vdevs); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (err); +} + +/* + * print a pool vdev config for dry runs + */ +static void +print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, + const char *match, int name_flags) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + boolean_t printed = B_FALSE; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + if (name != NULL) + (void) printf("\t%*s%s\n", indent, "", name); + return; + } + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *class = ""; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + class = VDEV_ALLOC_BIAS_LOG; + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &class); + if (strcmp(match, class) != 0) + continue; + + if (!printed && name != NULL) { + (void) printf("\t%*s%s\n", indent, "", name); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); + print_vdev_tree(zhp, vname, child[c], indent + 2, "", + name_flags); + free(vname); + } +} + +static boolean_t +prop_list_contains_feature(nvlist_t *proplist) +{ + nvpair_t *nvp; + for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; + nvp = nvlist_next_nvpair(proplist, nvp)) { + if (zpool_prop_feature(nvpair_name(nvp))) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Add a property pair (name, string-value) into a property nvlist. 
+ */ +static int +add_prop_list(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + zpool_prop_t prop = ZPOOL_PROP_INVAL; + nvlist_t *proplist; + const char *normnm; + char *strval; + + if (*props == NULL && + nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + return (1); + } + + proplist = *props; + + if (poolprop) { + const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); + + if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && + !zpool_prop_feature(propname)) { + (void) fprintf(stderr, gettext("property '%s' is " + "not a valid pool property\n"), propname); + return (2); + } + + /* + * feature@ properties and version should not be specified + * at the same time. + */ + if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && + nvlist_exists(proplist, vname)) || + (prop == ZPOOL_PROP_VERSION && + prop_list_contains_feature(proplist))) { + (void) fprintf(stderr, gettext("'feature@' and " + "'version' properties cannot be specified " + "together\n")); + return (2); + } + + + if (zpool_prop_feature(propname)) + normnm = propname; + else + normnm = zpool_prop_to_name(prop); + } else { + zfs_prop_t fsprop = zfs_name_to_prop(propname); + + if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, + B_FALSE)) { + normnm = zfs_prop_to_name(fsprop); + } else if (zfs_prop_user(propname) || + zfs_prop_userquota(propname)) { + normnm = propname; + } else { + (void) fprintf(stderr, gettext("property '%s' is " + "not a valid filesystem property\n"), propname); + return (2); + } + } + + if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && + prop != ZPOOL_PROP_CACHEFILE) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (2); + } + + if (nvlist_add_string(proplist, normnm, propval) != 0) { + (void) fprintf(stderr, gettext("internal " + "error: out of memory\n")); + return (1); + } + + return (0); 
+} + +/* + * Set a default property pair (name, string-value) in a property nvlist + */ +static int +add_prop_list_default(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + char *pval; + + if (nvlist_lookup_string(*props, propname, &pval) == 0) + return (0); + + return (add_prop_list(propname, propval, props, B_TRUE)); +} + +/* + * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ... + * + * -f Force addition of devices, even if they appear in use + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -n Do not add the devices, but display the resulting layout if + * they were to be added. + * -o Set property=value. + * -P Display full path for vdev name. + * + * Adds the given vdevs to 'pool'. As with create, the bulk of this work is + * handled by make_root_vdev(), which constructs the nvlist needed to pass to + * libzfs. + */ +int +zpool_do_add(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + int name_flags = 0; + int c; + nvlist_t *nvroot; + char *poolname; + int ret; + zpool_handle_t *zhp; + nvlist_t *config; + nvlist_t *props = NULL; + char *propval; + + /* check options */ + while ((c = getopt(argc, argv, "fgLno:P")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'g': + name_flags |= VDEV_NAME_GUID; + break; + case 'L': + name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; + case 'P': + name_flags |= VDEV_NAME_PATH; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + 
+ /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + argc--; + argv++; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), + poolname); + zpool_close(zhp); + return (1); + } + + /* unless manually specified use "ashift" pool property (if set) */ + if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { + int intval; + zprop_source_t src; + char strval[ZPOOL_MAXPROPLEN]; + + intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); + if (src != ZPROP_SRC_DEFAULT) { + (void) sprintf(strval, "%" PRId32, intval); + verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, + &props, B_TRUE) == 0); + } + } + + /* pass off to make_root_vdev for processing */ + nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, + argc, argv); + if (nvroot == NULL) { + zpool_close(zhp); + return (1); + } + + if (dryrun) { + nvlist_t *poolnvroot; + nvlist_t **l2child; + uint_t l2children, c; + char *vname; + boolean_t hadcache = B_FALSE; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &poolnvroot) == 0); + + (void) printf(gettext("would update '%s' to the following " + "configuration:\n"), zpool_get_name(zhp)); + + /* print original main pool and new tree */ + print_vdev_tree(zhp, poolname, poolnvroot, 0, "", + name_flags | VDEV_NAME_TYPE_ID); + print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); + + /* print other classes: 'dedup', 'special', and 'log' */ + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", poolnvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } else if 
(zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } + + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", poolnvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } + + if (num_logs(poolnvroot) > 0) { + print_vdev_tree(zhp, "logs", poolnvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } else if (num_logs(nvroot) > 0) { + print_vdev_tree(zhp, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } + + /* Do the same for the caches */ + if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, + &l2child, &l2children) == 0 && l2children) { + hadcache = B_TRUE; + (void) printf(gettext("\tcache\n")); + for (c = 0; c < l2children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + l2child[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2child, &l2children) == 0 && l2children) { + if (!hadcache) + (void) printf(gettext("\tcache\n")); + for (c = 0; c < l2children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + l2child[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + + ret = 0; + } else { + ret = (zpool_add(zhp, nvroot) != 0); + } + + nvlist_free(props); + nvlist_free(nvroot); + zpool_close(zhp); + + return (ret); +} + +/* + * zpool remove [-npsw] <pool> <vdev> ... + * + * Removes the given vdev from the pool. 
+ */ +int +zpool_do_remove(int argc, char **argv) +{ + char *poolname; + int i, ret = 0; + zpool_handle_t *zhp = NULL; + boolean_t stop = B_FALSE; + int c; + boolean_t noop = B_FALSE; + boolean_t parsable = B_FALSE; + boolean_t wait = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "npsw")) != -1) { + switch (c) { + case 'n': + noop = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 's': + stop = B_TRUE; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if (stop && noop) { + (void) fprintf(stderr, gettext("stop request ignored\n")); + return (0); + } + + if (stop) { + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + if (zpool_vdev_remove_cancel(zhp) != 0) + ret = 1; + if (wait) { + (void) fprintf(stderr, gettext("invalid option " + "combination: -w cannot be used with -s\n")); + usage(B_FALSE); + } + } else { + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device\n")); + usage(B_FALSE); + } + + for (i = 1; i < argc; i++) { + if (noop) { + uint64_t size; + + if (zpool_vdev_indirect_size(zhp, argv[i], + &size) != 0) { + ret = 1; + break; + } + if (parsable) { + (void) printf("%s %llu\n", + argv[i], (unsigned long long)size); + } else { + char valstr[32]; + zfs_nicenum(size, valstr, + sizeof (valstr)); + (void) printf("Memory that will be " + "used after removing %s: %s\n", + argv[i], valstr); + } + } else { + if (zpool_vdev_remove(zhp, argv[i]) != 0) + ret = 1; + } + } + + if (ret == 0 && wait) + ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); + } + zpool_close(zhp); + + return (ret); 
+} + +/* + * zpool labelclear [-f] <vdev> + * + * -f Force clearing the label for the vdevs which are members of + * the exported or foreign pools. + * + * Verifies that the vdev is not active and zeros out the label information + * on the device. + */ +int +zpool_do_labelclear(int argc, char **argv) +{ + char vdev[MAXPATHLEN]; + char *name = NULL; + struct stat st; + int c, fd = -1, ret = 0; + nvlist_t *config; + pool_state_t state; + boolean_t inuse = B_FALSE; + boolean_t force = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get vdev name */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing vdev name\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending expected disk paths and partition numbers. + */ + (void) strlcpy(vdev, argv[0], sizeof (vdev)); + if (vdev[0] != '/' && stat(vdev, &st) != 0) { + int error; + + error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); + if (error == 0 && zfs_dev_is_whole_disk(vdev)) { + if (zfs_append_partition(vdev, MAXPATHLEN) == -1) + error = ENOENT; + } + + if (error || (stat(vdev, &st) != 0)) { + (void) fprintf(stderr, gettext( + "failed to find device %s, try specifying absolute " + "path instead\n"), argv[0]); + return (1); + } + } + + if ((fd = open(vdev, O_RDWR)) < 0) { + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + vdev, strerror(errno)); + return (1); + } + + /* + * Flush all dirty pages for the block device. This should not be + * fatal when the device does not support BLKFLSBUF as would be the + * case for a file vdev. 
+ */ + if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) + (void) fprintf(stderr, gettext("failed to invalidate " + "cache for %s: %s\n"), vdev, strerror(errno)); + + if (zpool_read_label(fd, &config, NULL) != 0) { + (void) fprintf(stderr, + gettext("failed to read label from %s\n"), vdev); + ret = 1; + goto errout; + } + nvlist_free(config); + + ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to check state for %s\n"), vdev); + ret = 1; + goto errout; + } + + if (!inuse) + goto wipe_label; + + switch (state) { + default: + case POOL_STATE_ACTIVE: + case POOL_STATE_SPARE: + case POOL_STATE_L2CACHE: + (void) fprintf(stderr, gettext( + "%s is a member (%s) of pool \"%s\"\n"), + vdev, zpool_pool_state_to_name(state), name); + ret = 1; + goto errout; + + case POOL_STATE_EXPORTED: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of exported pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_POTENTIALLY_ACTIVE: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of potentially active pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_DESTROYED: + /* inuse should never be set for a destroyed pool */ + assert(0); + break; + } + +wipe_label: + ret = zpool_clear_label(fd); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to clear label for %s\n"), vdev); + } + +errout: + free(name); + (void) close(fd); + + return (ret); +} + +/* + * zpool create [-fnd] [-o property=value] ... + * [-O file-system-property=value] ... + * [-R root] [-m mountpoint] <pool> <dev> ... + * + * -f Force creation, even if devices appear in use + * -n Do not create the pool, but display the resulting layout if it + * were to be created. 
+ * -R Create a pool under an alternate root + * -m Set default mountpoint for the root dataset. By default it's + * '/<pool>' + * -o Set property=value. + * -o Set feature@feature=enabled|disabled. + * -d Don't automatically enable all supported pool features + * (individual features can be enabled with -o). + * -O Set fsproperty=value in the pool's root file system + * + * Creates the named pool according to the given vdev specification. The + * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. + * Once we get the nvlist back from make_root_vdev(), we either print out the + * contents (if '-n' was specified), or pass it to libzfs to do the creation. + */ +int +zpool_do_create(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + boolean_t enable_all_pool_feat = B_TRUE; + int c; + nvlist_t *nvroot = NULL; + char *poolname; + char *tname = NULL; + int ret = 1; + char *altroot = NULL; + char *mountpoint = NULL; + nvlist_t *fsprops = NULL; + nvlist_t *props = NULL; + char *propval; + + /* check options */ + while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'd': + enable_all_pool_feat = B_FALSE; + break; + case 'R': + altroot = optarg; + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) + goto errout; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + break; + case 'm': + /* Equivalent to -O mountpoint=optarg */ + mountpoint = optarg; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + if (add_prop_list(optarg, propval, &props, B_TRUE)) + goto errout; + + /* + * If the user is creating a pool that doesn't support + * feature flags, don't enable any features. 
+ */ + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { + char *end; + u_longlong_t ver; + + ver = strtoull(propval, &end, 10); + if (*end == '\0' && + ver < SPA_VERSION_FEATURES) { + enable_all_pool_feat = B_FALSE; + } + } + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) + altroot = propval; + break; + case 'O': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -O option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + /* + * Mountpoints are checked and then added later. + * Uniquely among properties, they can be specified + * more than once, to avoid conflict with -m. + */ + if (0 == strcmp(optarg, + zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { + mountpoint = propval; + } else if (add_prop_list(optarg, propval, &fsprops, + B_FALSE)) { + goto errout; + } + break; + case 't': + /* + * Sanity check temporary pool name. + */ + if (strchr(optarg, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create " + "'%s': invalid character '/' in temporary " + "name\n"), optarg); + (void) fprintf(stderr, gettext("use 'zfs " + "create' to create a dataset\n")); + goto errout; + } + + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) + goto errout; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + tname = optarg; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + goto badusage; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + goto badusage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + goto badusage; + } + + poolname = argv[0]; + + /* + * As a special case, check 
for use of '/' in the name, and direct the + * user to use 'zfs create' instead. + */ + if (strchr(poolname, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create '%s': invalid " + "character '/' in pool name\n"), poolname); + (void) fprintf(stderr, gettext("use 'zfs create' to " + "create a dataset\n")); + goto errout; + } + + /* pass off to make_root_vdev for bulk processing */ + nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, + argc - 1, argv + 1); + if (nvroot == NULL) + goto errout; + + /* make_root_vdev() allows 0 toplevel children if there are spares */ + if (!zfs_allocatable_devs(nvroot)) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: at least one toplevel vdev must be " + "specified\n")); + goto errout; + } + + if (altroot != NULL && altroot[0] != '/') { + (void) fprintf(stderr, gettext("invalid alternate root '%s': " + "must be an absolute path\n"), altroot); + goto errout; + } + + /* + * Check the validity of the mountpoint and direct the user to use the + * '-m' mountpoint option if it looks like its in use. 
+ */ + if (mountpoint == NULL || + (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && + strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { + char buf[MAXPATHLEN]; + DIR *dirp; + + if (mountpoint && mountpoint[0] != '/') { + (void) fprintf(stderr, gettext("invalid mountpoint " + "'%s': must be an absolute path, 'legacy', or " + "'none'\n"), mountpoint); + goto errout; + } + + if (mountpoint == NULL) { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s/%s", + altroot, poolname); + else + (void) snprintf(buf, sizeof (buf), "/%s", + poolname); + } else { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s%s", + altroot, mountpoint); + else + (void) snprintf(buf, sizeof (buf), "%s", + mountpoint); + } + + if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { + (void) fprintf(stderr, gettext("mountpoint '%s' : " + "%s\n"), buf, strerror(errno)); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a different default\n")); + goto errout; + } else if (dirp) { + int count = 0; + + while (count < 3 && readdir(dirp) != NULL) + count++; + (void) closedir(dirp); + + if (count > 2) { + (void) fprintf(stderr, gettext("mountpoint " + "'%s' exists and is not empty\n"), buf); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a " + "different default\n")); + goto errout; + } + } + } + + /* + * Now that the mountpoint's validity has been checked, ensure that + * the property is set appropriately prior to creating the pool. + */ + if (mountpoint != NULL) { + ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), + mountpoint, &fsprops, B_FALSE); + if (ret != 0) + goto errout; + } + + ret = 1; + if (dryrun) { + /* + * For a dry run invocation, print out a basic message and run + * through all the vdevs in the list and print out in an + * appropriate hierarchy. 
+ */ + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), poolname); + + print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); + print_vdev_tree(NULL, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); + print_vdev_tree(NULL, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, 0); + + ret = 0; + } else { + /* + * Hand off to libzfs. + */ + spa_feature_t i; + for (i = 0; i < SPA_FEATURES; i++) { + char propname[MAXPATHLEN]; + char *propval; + zfeature_info_t *feat = &spa_feature_table[i]; + + (void) snprintf(propname, sizeof (propname), + "feature@%s", feat->fi_uname); + + /* + * Only features contained in props will be enabled: + * remove from the nvlist every ZFS_FEATURE_DISABLED + * value and add every missing ZFS_FEATURE_ENABLED if + * enable_all_pool_feat is set. + */ + if (!nvlist_lookup_string(props, propname, &propval)) { + if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) + (void) nvlist_remove_all(props, + propname); + } else if (enable_all_pool_feat) { + ret = add_prop_list(propname, + ZFS_FEATURE_ENABLED, &props, B_TRUE); + if (ret != 0) + goto errout; + } + } + + ret = 1; + if (zpool_create(g_zfs, poolname, + nvroot, props, fsprops) == 0) { + zfs_handle_t *pool = zfs_open(g_zfs, + tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); + if (pool != NULL) { + if (zfs_mount(pool, NULL, 0) == 0) { + ret = zfs_shareall(pool); + zfs_commit_all_shares(); + } + zfs_close(pool); + } + } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { + (void) fprintf(stderr, gettext("pool name may have " + "been omitted\n")); + } + } + +errout: + nvlist_free(nvroot); + nvlist_free(fsprops); + nvlist_free(props); + return (ret); +badusage: + nvlist_free(fsprops); + nvlist_free(props); + usage(B_FALSE); + return (2); +} + +/* + * zpool destroy <pool> + * + * -f Forcefully unmount any datasets + * + * Destroy the given pool. Automatically unmounts any datasets in the pool. 
+ */ +int +zpool_do_destroy(int argc, char **argv) +{ + boolean_t force = B_FALSE; + int c; + char *pool; + zpool_handle_t *zhp; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + pool = argv[0]; + + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + /* + * As a special case, check for use of '/' in the name, and + * direct the user to use 'zfs destroy' instead. + */ + if (strchr(pool, '/') != NULL) + (void) fprintf(stderr, gettext("use 'zfs destroy' to " + "destroy a dataset\n")); + return (1); + } + + if (zpool_disable_datasets(zhp, force) != 0) { + (void) fprintf(stderr, gettext("could not destroy '%s': " + "could not unmount datasets\n"), zpool_get_name(zhp)); + zpool_close(zhp); + return (1); + } + + /* The history must be logged as part of the export */ + log_history = B_FALSE; + + ret = (zpool_destroy(zhp, history_str) != 0); + + zpool_close(zhp); + + return (ret); +} + +typedef struct export_cbdata { + boolean_t force; + boolean_t hardforce; +} export_cbdata_t; + +/* + * Export one pool + */ +static int +zpool_export_one(zpool_handle_t *zhp, void *data) +{ + export_cbdata_t *cb = data; + + if (zpool_disable_datasets(zhp, cb->force) != 0) + return (1); + + /* The history must be logged as part of the export */ + log_history = B_FALSE; + + if (cb->hardforce) { + if (zpool_export_force(zhp, history_str) != 0) + return (1); + } else if (zpool_export(zhp, cb->force, history_str) != 0) { + return (1); + } + + return (0); +} + +/* + * zpool export [-f] <pool> ... 
+ * + * -a Export all pools + * -f Forcefully unmount datasets + * + * Export the given pools. By default, the command will attempt to cleanly + * unmount any active datasets within the pool. If the '-f' flag is specified, + * then the datasets will be forcefully unmounted. + */ +int +zpool_do_export(int argc, char **argv) +{ + export_cbdata_t cb; + boolean_t do_all = B_FALSE; + boolean_t force = B_FALSE; + boolean_t hardforce = B_FALSE; + int c, ret; + + /* check options */ + while ((c = getopt(argc, argv, "afF")) != -1) { + switch (c) { + case 'a': + do_all = B_TRUE; + break; + case 'f': + force = B_TRUE; + break; + case 'F': + hardforce = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.force = force; + cb.hardforce = hardforce; + argc -= optind; + argv += optind; + + if (do_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, + zpool_export_one, &cb)); + } + + /* check arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + + ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); + + return (ret); +} + +/* + * Given a vdev configuration, determine the maximum width needed for the device + * name column. 
+ */ +static int +max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, + int name_flags) +{ + char *name; + nvlist_t **child; + uint_t c, children; + int ret; + + name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); + if (strlen(name) + depth > max) + max = strlen(name) + depth; + + free(name); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + return (max); +} + +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +typedef struct status_cbdata { + int cb_count; + int cb_name_flags; + int cb_namewidth; + boolean_t cb_allpools; + 
boolean_t cb_verbose; + boolean_t cb_literal; + boolean_t cb_explain; + boolean_t cb_first; + boolean_t cb_dedup_stats; + boolean_t cb_print_status; + boolean_t cb_print_slow_ios; + boolean_t cb_print_vdev_init; + boolean_t cb_print_vdev_trim; + vdev_cmd_data_list_t *vcdl; +} status_cbdata_t; + +/* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ +static int +is_blank_str(char *str) +{ + while (str != NULL && *str != '\0') { + if (!isblank(*str)) + return (0); + str++; + } + return (1); +} + +/* Print command output lines for specific vdev in a specific pool */ +static void +zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) +{ + vdev_cmd_data_t *data; + int i, j; + char *val; + + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) != 0) || + (strcmp(vcdl->data[i].pool, pool) != 0)) { + /* Not the vdev we're looking for */ + continue; + } + + data = &vcdl->data[i]; + /* Print out all the output values for this vdev */ + for (j = 0; j < vcdl->uniq_cols_cnt; j++) { + val = NULL; + /* Does this vdev have values for this column? */ + for (int k = 0; k < data->cols_cnt; k++) { + if (strcmp(data->cols[k], + vcdl->uniq_cols[j]) == 0) { + /* yes it does, record the value */ + val = data->lines[k]; + break; + } + } + /* + * Mark empty values with dashes to make output + * awk-able. + */ + if (is_blank_str(val)) + val = "-"; + + printf("%*s", vcdl->uniq_cols_width[j], val); + if (j < vcdl->uniq_cols_cnt - 1) + printf(" "); + } + + /* Print out any values that aren't in a column at the end */ + for (j = data->cols_cnt; j < data->lines_cnt; j++) { + /* Did we have any columns? If so print a spacer. */ + if (vcdl->uniq_cols_cnt > 0) + printf(" "); + + val = data->lines[j]; + printf("%s", val ? 
val : ""); + } + break; + } +} + +/* + * Print vdev initialization status for leaves + */ +static void +print_status_initialize(vdev_stat_t *vs, boolean_t verbose) +{ + if (verbose) { + if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || + vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_initialize_action_time; + int initialize_pct = 100; + if (vs->vs_initialize_state != + VDEV_INITIALIZE_COMPLETE) { + initialize_pct = (vs->vs_initialize_bytes_done * + 100 / (vs->vs_initialize_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch (vs->vs_initialize_state) { + case VDEV_INITIALIZE_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("suspended, started at"), tbuf); + break; + case VDEV_INITIALIZE_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("started at"), tbuf); + break; + case VDEV_INITIALIZE_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("completed at"), tbuf); + break; + } + + (void) printf(gettext(" (%d%% initialized%s)"), + initialize_pct, zbuf); + } else { + (void) printf(gettext(" (uninitialized)")); + } + } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) printf(gettext(" (initializing)")); + } +} + +/* + * Print vdev TRIM status for leaves + */ +static void +print_status_trim(vdev_stat_t *vs, boolean_t verbose) +{ + if (verbose) { + if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || + vs->vs_trim_state == VDEV_TRIM_SUSPENDED || + vs->vs_trim_state == VDEV_TRIM_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_trim_action_time; + int trim_pct = 100; + if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { + trim_pct = (vs->vs_trim_bytes_done * + 100 / 
(vs->vs_trim_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch (vs->vs_trim_state) { + case VDEV_TRIM_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("suspended, started at"), tbuf); + break; + case VDEV_TRIM_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("started at"), tbuf); + break; + case VDEV_TRIM_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("completed at"), tbuf); + break; + } + + (void) printf(gettext(" (%d%% trimmed%s)"), + trim_pct, zbuf); + } else if (vs->vs_trim_notsup) { + (void) printf(gettext(" (trim unsupported)")); + } else { + (void) printf(gettext(" (untrimmed)")); + } + } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { + (void) printf(gettext(" (trimming)")); + } +} + +/* + * Return the color associated with a health string. This includes returning + * NULL for no color change. + */ +static char * +health_str_to_color(const char *health) +{ + if (strcmp(health, gettext("FAULTED")) == 0 || + strcmp(health, gettext("SUSPENDED")) == 0 || + strcmp(health, gettext("UNAVAIL")) == 0) { + return (ANSI_RED); + } + + if (strcmp(health, gettext("OFFLINE")) == 0 || + strcmp(health, gettext("DEGRADED")) == 0 || + strcmp(health, gettext("REMOVED")) == 0) { + return (ANSI_YELLOW); + } + + return (NULL); +} + +/* + * Print out configuration state as requested by status_callback. 
+ */ +static void +print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, + nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) +{ + nvlist_t **child, *root; + uint_t c, i, vsc, children; + pool_scan_stat_t *ps = NULL; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6]; + char *vname; + uint64_t notpresent; + spare_cbdata_t spare_cb; + const char *state; + char *type; + char *path = NULL; + char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) + return; + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. + */ + if (vs->vs_aux == VDEV_AUX_SPARED) + state = gettext("INUSE"); + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = gettext("AVAIL"); + } + + printf_color(health_str_to_color(state), + "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, + name, state); + + if (!isspare) { + if (vs->vs_read_errors) + rcolor = ANSI_RED; + + if (vs->vs_write_errors) + wcolor = ANSI_RED; + + if (vs->vs_checksum_errors) + ccolor = ANSI_RED; + + if (cb->cb_literal) { + printf(" "); + printf_color(rcolor, "%5llu", + (u_longlong_t)vs->vs_read_errors); + printf(" "); + printf_color(wcolor, "%5llu", + (u_longlong_t)vs->vs_write_errors); + printf(" "); + printf_color(ccolor, "%5llu", + (u_longlong_t)vs->vs_checksum_errors); + } else { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, + sizeof (cbuf)); + printf(" "); + printf_color(rcolor, "%5s", rbuf); + printf(" "); + printf_color(wcolor, "%5s", wbuf); 
+ printf(" "); + printf_color(ccolor, "%5s", cbuf); + } + if (cb->cb_print_slow_ios) { + if (children == 0) { + /* Only leafs vdevs have slow IOs */ + zfs_nicenum(vs->vs_slow_ios, rbuf, + sizeof (rbuf)); + } else { + snprintf(rbuf, sizeof (rbuf), "-"); + } + + if (cb->cb_literal) + printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + else + printf(" %5s", rbuf); + } + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + ¬present) == 0) { + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + (void) printf(" %s %s", gettext("was"), path); + } else if (vs->vs_aux != 0) { + (void) printf(" "); + color_start(ANSI_RED); + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + + case VDEV_AUX_ASHIFT_TOO_BIG: + (void) printf(gettext("unsupported minimum blocksize")); + break; + + case VDEV_AUX_SPARED: + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &spare_cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { + if (strcmp(zpool_get_name(spare_cb.cb_zhp), + zpool_get_name(zhp)) == 0) + (void) printf(gettext("currently in " + "use")); + else + (void) printf(gettext("in use by " + "pool '%s'"), + zpool_get_name(spare_cb.cb_zhp)); + zpool_close(spare_cb.cb_zhp); + } else { + (void) printf(gettext("currently in use")); + } + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_IO_FAILURE: + (void) printf(gettext("experienced I/O failures")); + break; + + case VDEV_AUX_BAD_LOG: + (void) printf(gettext("bad intent log")); + break; + + case VDEV_AUX_EXTERNAL: + (void) 
printf(gettext("external device fault")); + break; + + case VDEV_AUX_SPLIT_POOL: + (void) printf(gettext("split into new pool")); + break; + + case VDEV_AUX_ACTIVE: + (void) printf(gettext("currently in use")); + break; + + case VDEV_AUX_CHILDREN_OFFLINE: + (void) printf(gettext("all children offline")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + color_end(); + } + + /* The root vdev has the scrub/resilver stats */ + root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE); + (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { + if (vs->vs_scan_processed != 0) { + (void) printf(gettext(" (%s)"), + (ps->pss_func == POOL_SCAN_RESILVER) ? + "resilvering" : "repairing"); + } else if (vs->vs_resilver_deferred) { + (void) printf(gettext(" (awaiting resilver)")); + } + } + + /* The top-level vdevs have the rebuild stats */ + if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && + children == 0) { + if (vs->vs_rebuild_processed != 0) { + (void) printf(gettext(" (resilvering)")); + } + } + + if (cb->vcdl != NULL) { + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + } + } + + /* Display vdev initialization and trim status for leaves */ + if (children == 0) { + print_status_initialize(vs, cb->cb_print_vdev_init); + print_status_trim(vs, cb->cb_print_vdev_trim); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE, ishole = B_FALSE; + + /* Don't print logs or holes here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + if (islog || ishole) + continue; + /* Only print normal classes here */ + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + /* Provide 
vdev_rebuild_stats to children if available */ + if (vrs == NULL) { + (void) nvlist_lookup_uint64_array(nv, + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i); + } + + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_status_config(zhp, cb, vname, child[c], depth + 2, + isspare, vrs); + free(vname); + } +} + +/* + * Print the configuration of an exported pool. Iterate over all vdevs in the + * pool, printing out the name and status for each one. + */ +static void +print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, + int depth) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char *type, *vname; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_MISSING) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0) + return; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); + (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); + + if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_ACTIVE: + (void) printf(gettext("currently in use")); + break; + + case VDEV_AUX_CHILDREN_OFFLINE: + (void) printf(gettext("all children offline")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } + (void) printf("\n"); + + if 
(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + continue; + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_import_config(cb, vname, child[c], depth + 2); + free(vname); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + (void) printf(gettext("\tcache\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + (void) printf(gettext("\tspares\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } +} + +/* + * Print specialized class vdevs. + * + * These are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1 or an "alloc_bias" string. We use either + * print_status_config() or print_import_config() to print the top level + * class vdevs then any of their children (eg mirrored slogs) are printed + * recursively - which works because only the top level vdev is marked. 
+ */ +static void +print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, + const char *class) +{ + uint_t c, children; + nvlist_t **child; + boolean_t printed = B_FALSE; + + assert(zhp != NULL || !cb->cb_verbose); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *bias = NULL; + char *type = NULL; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (is_log) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + + if (bias == NULL || strcmp(bias, class) != 0) + continue; + if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + (void) printf("\t%s\t\n", gettext(class)); + printed = B_TRUE; + } + + char *name = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + if (cb->cb_print_status) + print_status_config(zhp, cb, name, child[c], 2, + B_FALSE, NULL); + else + print_import_config(cb, name, child[c], 2); + free(name); + } +} + +/* + * Display the status for the given pool. 
+ */ +static void +show_import(nvlist_t *config) +{ + uint64_t pool_state; + vdev_stat_t *vs; + char *name; + uint64_t guid; + uint64_t hostid = 0; + char *msgid; + char *hostname = "unknown"; + nvlist_t *nvroot, *nvinfo; + zpool_status_t reason; + zpool_errata_t errata; + const char *health; + uint_t vsc; + char *comment; + status_cbdata_t cb = { 0 }; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + health = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + reason = zpool_import_status(config, &msgid, &errata); + + (void) printf(gettext(" pool: %s\n"), name); + (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); + (void) printf(gettext(" state: %s"), health); + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext(" (DESTROYED)")); + (void) printf("\n"); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "missing from the system.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices contains" + " corrupted data.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_DATA: + (void) printf( + gettext(" status: The pool data is corrupted.\n")); + break; + + case ZPOOL_STATUS_OFFLINE_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices " + "are offlined.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_POOL: + 
printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool metadata is " + "corrupted.\n")); + break; + + case ZPOOL_STATUS_VERSION_OLDER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "a legacy on-disk version.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "an incompatible version.\n")); + break; + + case ZPOOL_STATUS_FEAT_DISABLED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported features are " + "not enabled on the pool.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool uses the following " + "feature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); + zpool_print_unsup_feat(config); + color_end(); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. It\n\tcannot be" + " accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); + zpool_print_unsup_feat(config); + color_end(); + break; + + case ZPOOL_STATUS_HOSTID_ACTIVE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is currently " + "imported by another system.\n")); + break; + + case ZPOOL_STATUS_HOSTID_REQUIRED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has the " + "multihost property on. 
It cannot\n\tbe safely imported " + "when the system hostid is not set.\n")); + break; + + case ZPOOL_STATUS_HOSTID_MISMATCH: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool was last accessed " + "by another system.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + case ZPOOL_STATUS_FAULTED_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record cannot " + "be read.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices were " + "being resilvered.\n")); + break; + + case ZPOOL_STATUS_ERRATA: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), + errata); + break; + + default: + /* + * No other status can be seen when importing pools. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + /* + * Print out an action according to the overall state of the pool. 
+ */ + if (vs->vs_state == VDEV_STATE_HEALTHY) { + if (reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED) { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric identifier, " + "though\n\tsome features will not be available " + "without an explicit 'zpool upgrade'.\n")); + } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric " + "identifier and\n\tthe '-f' flag.\n")); + } else if (reason == ZPOOL_STATUS_ERRATA) { + switch (errata) { + case ZPOOL_ERRATA_NONE: + break; + + case ZPOOL_ERRATA_ZOL_2094_SCRUB: + (void) printf(gettext(" action: The pool can " + "be imported using its name or numeric " + "identifier,\n\thowever there is a compat" + "ibility issue which should be corrected" + "\n\tby running 'zpool scrub'\n")); + break; + + case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: + (void) printf(gettext(" action: The pool can" + "not be imported with this version of ZFS " + "due to\n\tan active asynchronous destroy. " + "Revert to an earlier version\n\tand " + "allow the destroy to complete before " + "updating.\n")); + break; + + case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: + (void) printf(gettext(" action: Existing " + "encrypted datasets contain an on-disk " + "incompatibility, which\n\tneeds to be " + "corrected. Backup these datasets to new " + "encrypted datasets\n\tand destroy the " + "old ones.\n")); + break; + + case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: + (void) printf(gettext(" action: Existing " + "encrypted snapshots and bookmarks contain " + "an on-disk\n\tincompatibility. This may " + "cause on-disk corruption if they are used" + "\n\twith 'zfs recv'. To correct the " + "issue, enable the bookmark_v2 feature.\n\t" + "No additional action is needed if there " + "are no encrypted snapshots or\n\t" + "bookmarks. 
If preserving the encrypted " + "snapshots and bookmarks is\n\trequired, " + "use a non-raw send to backup and restore " + "them. Alternately,\n\tthey may be removed" + " to resolve the incompatibility.\n")); + break; + default: + /* + * All errata must contain an action message. + */ + assert(0); + } + } else { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric " + "identifier.\n")); + } + } else if (vs->vs_state == VDEV_STATE_DEGRADED) { + (void) printf(gettext(" action: The pool can be imported " + "despite missing or damaged devices. The\n\tfault " + "tolerance of the pool may be compromised if imported.\n")); + } else { + switch (reason) { + case ZPOOL_STATUS_VERSION_NEWER: + (void) printf(gettext(" action: The pool cannot be " + "imported. Access the pool on a system running " + "newer\n\tsoftware, or recreate the pool from " + "backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " + "imported. Access the pool on a system that " + "supports\n\tthe required feature(s), or recreate " + "the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " + "imported in read-write mode. Import the pool " + "with\n" + "\t\"-o readonly=on\", access the pool on a system " + "that supports the\n\trequired feature(s), or " + "recreate the pool from backup.\n")); + break; + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + (void) printf(gettext(" action: The pool cannot be " + "imported. 
Attach the missing\n\tdevices and try " + "again.\n")); + break; + case ZPOOL_STATUS_HOSTID_ACTIVE: + VERIFY0(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_MMP_HOSTNAME); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_HOSTID); + + (void) printf(gettext(" action: The pool must be " + "exported from %s (hostid=%lx)\n\tbefore it " + "can be safely imported.\n"), hostname, + (unsigned long) hostid); + break; + case ZPOOL_STATUS_HOSTID_REQUIRED: + (void) printf(gettext(" action: Set a unique system " + "hostid with the zgenhostid(8) command.\n")); + break; + default: + (void) printf(gettext(" action: The pool cannot be " + "imported due to damaged devices or data.\n")); + } + } + + /* Print the comment attached to the pool. */ + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + (void) printf(gettext("comment: %s\n"), comment); + + /* + * If the state is "closed" or "can't open", and the aux state + * is "corrupt data": + */ + if (((vs->vs_state == VDEV_STATE_CLOSED) || + (vs->vs_state == VDEV_STATE_CANT_OPEN)) && + (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext("\tThe pool was destroyed, " + "but can be imported using the '-Df' flags.\n")); + else if (pool_state != POOL_STATE_EXPORTED) + (void) printf(gettext("\tThe pool may be active on " + "another system, but can be imported using\n\t" + "the '-f' flag.\n")); + } + + if (msgid != NULL) { + (void) printf(gettext( + " see: https://zfsonlinux.org/msg/%s\n"), msgid); + } + + (void) printf(gettext(" config:\n\n")); + + cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), + VDEV_NAME_TYPE_ID); + if (cb.cb_namewidth < 10) + cb.cb_namewidth = 10; + + print_import_config(&cb, name, nvroot, 0); + + print_class_vdevs(NULL, &cb, nvroot, 
VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); + + if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { + (void) printf(gettext("\n\tAdditional devices are known to " + "be part of this pool, though their\n\texact " + "configuration cannot be determined.\n")); + } +} + +static boolean_t +zfs_force_import_required(nvlist_t *config) +{ + uint64_t state; + uint64_t hostid = 0; + nvlist_t *nvinfo; + + state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + + if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) + return (B_TRUE); + + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { + mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_STATE); + + if (mmp_state != MMP_STATE_INACTIVE) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Perform the import for the given configuration. This passes the heavy + * lifting off to zpool_import_props(), and then mounts the datasets contained + * within the pool. 
+ */ +static int +do_import(nvlist_t *config, const char *newname, const char *mntopts, + nvlist_t *props, int flags) +{ + int ret = 0; + zpool_handle_t *zhp; + char *name; + uint64_t version; + + name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); + version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); + + if (!SPA_VERSION_IS_SUPPORTED(version)) { + (void) fprintf(stderr, gettext("cannot import '%s': pool " + "is formatted using an unsupported ZFS version\n"), name); + return (1); + } else if (zfs_force_import_required(config) && + !(flags & ZFS_IMPORT_ANY_HOST)) { + mmp_state_t mmp_state = MMP_STATE_INACTIVE; + nvlist_t *nvinfo; + + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) + mmp_state = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_STATE); + + if (mmp_state == MMP_STATE_ACTIVE) { + char *hostname = "<unknown>"; + uint64_t hostid = 0; + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_MMP_HOSTNAME); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_HOSTID); + + (void) fprintf(stderr, gettext("cannot import '%s': " + "pool is imported on %s (hostid: " + "0x%lx)\nExport the pool on the other system, " + "then run 'zpool import'.\n"), + name, hostname, (unsigned long) hostid); + } else if (mmp_state == MMP_STATE_NO_HOSTID) { + (void) fprintf(stderr, gettext("Cannot import '%s': " + "pool has the multihost property on and the\n" + "system's hostid is not set. 
Set a unique hostid " + "with the zgenhostid(8) command.\n"), name); + } else { + char *hostname = "<unknown>"; + uint64_t timestamp = 0; + uint64_t hostid = 0; + + if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) + hostname = fnvlist_lookup_string(config, + ZPOOL_CONFIG_HOSTNAME); + + if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) + timestamp = fnvlist_lookup_uint64(config, + ZPOOL_CONFIG_TIMESTAMP); + + if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(config, + ZPOOL_CONFIG_HOSTID); + + (void) fprintf(stderr, gettext("cannot import '%s': " + "pool was previously in use from another system.\n" + "Last accessed by %s (hostid=%lx) at %s" + "The pool can be imported, use 'zpool import -f' " + "to import the pool.\n"), name, hostname, + (unsigned long)hostid, ctime((time_t *)×tamp)); + } + + return (1); + } + + if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) + return (1); + + if (newname != NULL) + name = (char *)newname; + + if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) + return (1); + + /* + * Loading keys is best effort. We don't want to return immediately + * if it fails but we do want to give the error to the caller. 
+ */ + if (flags & ZFS_IMPORT_LOAD_KEYS) { + ret = zfs_crypto_attempt_load_keys(g_zfs, name); + if (ret != 0) + ret = 1; + } + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + !(flags & ZFS_IMPORT_ONLY) && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + zpool_close(zhp); + return (1); + } + + zpool_close(zhp); + return (ret); +} + +typedef struct target_exists_args { + const char *poolname; + uint64_t poolguid; +} target_exists_args_t; + +static int +name_or_guid_exists(zpool_handle_t *zhp, void *data) +{ + target_exists_args_t *args = data; + nvlist_t *config = zpool_get_config(zhp, NULL); + int found = 0; + + if (config == NULL) + return (0); + + if (args->poolname != NULL) { + char *pool_name; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pool_name) == 0); + if (strcmp(pool_name, args->poolname) == 0) + found = 1; + } else { + uint64_t pool_guid; + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0); + if (pool_guid == args->poolguid) + found = 1; + } + zpool_close(zhp); + + return (found); +} +/* + * zpool checkpoint <pool> + * checkpoint --discard <pool> + * + * -d Discard the checkpoint from a checkpointed + * --discard pool. + * + * -w Wait for discarding a checkpoint to complete. + * --wait + * + * Checkpoints the specified pool, by taking a "snapshot" of its + * current state. A pool can only have one checkpoint at a time. 
+ */ +int +zpool_do_checkpoint(int argc, char **argv) +{ + boolean_t discard, wait; + char *pool; + zpool_handle_t *zhp; + int c, err; + + struct option long_options[] = { + {"discard", no_argument, NULL, 'd'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + discard = B_FALSE; + wait = B_FALSE; + while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { + switch (c) { + case 'd': + discard = B_TRUE; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (wait && !discard) { + (void) fprintf(stderr, gettext("--wait only valid when " + "--discard also specified\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + pool = argv[0]; + + if ((zhp = zpool_open(g_zfs, pool)) == NULL) { + /* As a special case, check for use of '/' in the name */ + if (strchr(pool, '/') != NULL) + (void) fprintf(stderr, gettext("'zpool checkpoint' " + "doesn't work on datasets. To save the state " + "of a dataset from a specific point in time " + "please use 'zfs snapshot'\n")); + return (1); + } + + if (discard) { + err = (zpool_discard_checkpoint(zhp) != 0); + if (err == 0 && wait) + err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); + } else { + err = (zpool_checkpoint(zhp) != 0); + } + + zpool_close(zhp); + + return (err); +} + +#define CHECKPOINT_OPT 1024 + +/* + * zpool import [-d dir] [-D] + * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] + * [-d dir | -c cachefile] [-f] -a + * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] + * [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool] + * + * -c Read pool information from a cachefile instead of searching + * devices. 
+ * + * -d Scan in a specific directory, other than /dev/. More than + * one directory can be specified using multiple '-d' options. + * + * -D Scan for previously destroyed pools or import all or only + * specified destroyed pools. + * + * -R Temporarily import the pool, with all mountpoints relative to + * the given root. The pool will remain exported when the machine + * is rebooted. + * + * -V Import even in the presence of faulted vdevs. This is an + * intentionally undocumented option for testing purposes, and + * treats the pool configuration as complete, leaving any bad + * vdevs in the FAULTED state. In other words, it does a verbatim + * import. + * + * -f Force import, even if it appears that the pool is active. + * + * -F Attempt rewind if necessary. + * + * -n See if rewind would work, but don't actually rewind. + * + * -N Import the pool but don't mount datasets. + * + * -T Specify a starting txg to use for import. This is an + * intentionally undocumented option for testing purposes. + * + * -a Import all pools found. + * + * -l Load encryption keys while importing. + * + * -o Set property=value and/or temporary mount options (without '='). + * + * -s Scan using the default search path; the libblkid cache will + * not be consulted. + * + * --rewind-to-checkpoint + * Import the pool and revert back to the checkpoint. + * + * The import command scans for pools to import, and imports pools based on pool + * name and GUID. The pool can also be renamed as part of the import process. 
+ */ +int +zpool_do_import(int argc, char **argv) +{ + char **searchdirs = NULL; + char *env, *envdup = NULL; + int nsearch = 0; + int c; + int err = 0; + nvlist_t *pools = NULL; + boolean_t do_all = B_FALSE; + boolean_t do_destroyed = B_FALSE; + char *mntopts = NULL; + nvpair_t *elem; + nvlist_t *config; + uint64_t searchguid = 0; + char *searchname = NULL; + char *propval; + nvlist_t *found_config; + nvlist_t *policy = NULL; + nvlist_t *props = NULL; + boolean_t first; + int flags = ZFS_IMPORT_NORMAL; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + boolean_t do_scan = B_FALSE; + boolean_t pool_exists = B_FALSE; + uint64_t pool_state, txg = -1ULL; + char *cachefile = NULL; + importargs_t idata = { 0 }; + char *endptr; + + struct option long_options[] = { + {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, + {0, 0, 0, 0} + }; + + /* check options */ + while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", + long_options, NULL)) != -1) { + switch (c) { + case 'a': + do_all = B_TRUE; + break; + case 'c': + cachefile = optarg; + break; + case 'd': + if (searchdirs == NULL) { + searchdirs = safe_malloc(sizeof (char *)); + } else { + char **tmp = safe_malloc((nsearch + 1) * + sizeof (char *)); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + free(searchdirs); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; + break; + case 'D': + do_destroyed = B_TRUE; + break; + case 'f': + flags |= ZFS_IMPORT_ANY_HOST; + break; + case 'F': + do_rewind = B_TRUE; + break; + case 'l': + flags |= ZFS_IMPORT_LOAD_KEYS; + break; + case 'm': + flags |= ZFS_IMPORT_MISSING_LOG; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'N': + flags |= ZFS_IMPORT_ONLY; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE)) + goto error; + } else { + mntopts = 
optarg; + } + break; + case 'R': + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) + goto error; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; + case 's': + do_scan = B_TRUE; + break; + case 't': + flags |= ZFS_IMPORT_TEMP_NAME; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; + + case 'T': + errno = 0; + txg = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid txg value\n")); + usage(B_FALSE); + } + rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; + break; + case 'V': + flags |= ZFS_IMPORT_VERBATIM; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case CHECKPOINT_OPT: + flags |= ZFS_IMPORT_CHECKPOINT; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (cachefile && nsearch != 0) { + (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); + usage(B_FALSE); + } + + if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { + (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); + usage(B_FALSE); + } + + if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { + (void) fprintf(stderr, gettext("-l is only meaningful during " + "an import\n")); + usage(B_FALSE); + } + + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In the future, we can capture further policy and include it here 
*/ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, + rewind_policy) != 0) + goto error; + + /* check argument count */ + if (do_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } else { + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } + + /* + * Check for the effective uid. We do this explicitly here because + * otherwise any attempt to discover pools will silently fail. + */ + if (argc == 0 && geteuid() != 0) { + (void) fprintf(stderr, gettext("cannot " + "discover pools: permission denied\n")); + if (searchdirs != NULL) + free(searchdirs); + + nvlist_free(props); + nvlist_free(policy); + return (1); + } + + /* + * Depending on the arguments given, we do one of the following: + * + * <none> Iterate through all pools and display information about + * each one. + * + * -a Iterate through all pools and try to import each one. + * + * <id> Find the pool that corresponds to the given GUID/pool + * name and import that one. + * + * -D Above options applies only to destroyed pools. + */ + if (argc != 0) { + char *endptr; + + errno = 0; + searchguid = strtoull(argv[0], &endptr, 10); + if (errno != 0 || *endptr != '\0') { + searchname = argv[0]; + searchguid = 0; + } + found_config = NULL; + + /* + * User specified a name or guid. Ensure it's unique. + */ + target_exists_args_t search = {searchname, searchguid}; + pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); + } + + /* + * Check the environment for the preferred search path. 
+ */ + if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { + char *dir; + + envdup = strdup(env); + + dir = strtok(envdup, ":"); + while (dir != NULL) { + if (searchdirs == NULL) { + searchdirs = safe_malloc(sizeof (char *)); + } else { + char **tmp = safe_malloc((nsearch + 1) * + sizeof (char *)); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + free(searchdirs); + searchdirs = tmp; + } + searchdirs[nsearch++] = dir; + dir = strtok(NULL, ":"); + } + } + + idata.path = searchdirs; + idata.paths = nsearch; + idata.poolname = searchname; + idata.guid = searchguid; + idata.cachefile = cachefile; + idata.scan = do_scan; + idata.policy = policy; + + pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops); + + if (pools != NULL && pool_exists && + (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name already exists\n"), + argv[0]); + (void) fprintf(stderr, gettext("use the form '%s " + "<pool | id> <newpool>' to give it a new name\n"), + "zpool import"); + err = 1; + } else if (pools == NULL && pool_exists) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name is already created/imported,\n"), + argv[0]); + (void) fprintf(stderr, gettext("and no additional pools " + "with that name were found\n")); + err = 1; + } else if (pools == NULL) { + if (argc != 0) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), argv[0]); + } + err = 1; + } + + if (err == 1) { + if (searchdirs != NULL) + free(searchdirs); + if (envdup != NULL) + free(envdup); + nvlist_free(policy); + nvlist_free(pools); + nvlist_free(props); + return (1); + } + + /* + * At this point we have a list of import candidate configs. Even if + * we were searching by pool name or guid, we still need to + * post-process the list to deal with pool state and possible + * duplicate names. 
+ */ + err = 0; + elem = NULL; + first = B_TRUE; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + + verify(nvpair_value_nvlist(elem, &config) == 0); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) + continue; + if (do_destroyed && pool_state != POOL_STATE_DESTROYED) + continue; + + verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, + policy) == 0); + + if (argc == 0) { + if (first) + first = B_FALSE; + else if (!do_all) + (void) printf("\n"); + + if (do_all) { + err |= do_import(config, NULL, mntopts, + props, flags); + } else { + show_import(config); + } + } else if (searchname != NULL) { + char *name; + + /* + * We are searching for a pool based on name. + */ + verify(nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &name) == 0); + + if (strcmp(name, searchname) == 0) { + if (found_config != NULL) { + (void) fprintf(stderr, gettext( + "cannot import '%s': more than " + "one matching pool\n"), searchname); + (void) fprintf(stderr, gettext( + "import by numeric ID instead\n")); + err = B_TRUE; + } + found_config = config; + } + } else { + uint64_t guid; + + /* + * Search for a pool by guid. + */ + verify(nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + + if (guid == searchguid) + found_config = config; + } + } + + /* + * If we were searching for a specific pool, verify that we found a + * pool, and then do the import. + */ + if (argc != 0 && err == 0) { + if (found_config == NULL) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), argv[0]); + err = B_TRUE; + } else { + err |= do_import(found_config, argc == 1 ? NULL : + argv[1], mntopts, props, flags); + } + } + + /* + * If we were just looking for pools, report an error if none were + * found. 
+ */ + if (argc == 0 && first) + (void) fprintf(stderr, + gettext("no pools available to import\n")); + +error: + nvlist_free(props); + nvlist_free(pools); + nvlist_free(policy); + if (searchdirs != NULL) + free(searchdirs); + if (envdup != NULL) + free(envdup); + + return (err ? 1 : 0); +} + +/* + * zpool sync [-f] [pool] ... + * + * -f (undocumented) force uberblock (and config including zpool cache file) + * update. + * + * Sync the specified pool(s). + * Without arguments "zpool sync" will sync all pools. + * This command initiates TXG sync(s) and will return after the TXG(s) commit. + * + */ +static int +zpool_do_sync(int argc, char **argv) +{ + int ret; + boolean_t force = B_FALSE; + + /* check options */ + while ((ret = getopt(argc, argv, "f")) != -1) { + switch (ret) { + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* if argc == 0 we will execute zpool_sync_one on all pools */ + ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); + + return (ret); +} + +typedef struct iostat_cbdata { + uint64_t cb_flags; + int cb_name_flags; + int cb_namewidth; + int cb_iteration; + char **cb_vdev_names; /* Only show these vdevs */ + unsigned int cb_vdev_names_count; + boolean_t cb_verbose; + boolean_t cb_literal; + boolean_t cb_scripted; + zpool_list_t *cb_list; + vdev_cmd_data_list_t *vcdl; +} iostat_cbdata_t; + +/* iostat labels */ +typedef struct name_and_columns { + const char *name; /* Column name */ + unsigned int columns; /* Center name to this number of columns */ +} name_and_columns_t; + +#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ + +static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, + {NULL}}, + [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + 
/*
 * Count the entries of a NULL-terminated array of strings; the terminating
 * NULL is not counted.  For example, given
 *
 *	const char *foo[] = {"bar", "baz", NULL};
 *
 * str_array_len(foo) returns 2.
 */
static uint64_t
str_array_len(const char *array[])
{
	const char **cursor;

	/* Walk forward until the NULL terminator is reached. */
	for (cursor = array; *cursor != NULL; cursor++)
		;

	return ((uint64_t)(cursor - array));
}
This does + * not include histograms, which have their columns autosized. + */ +static unsigned int +default_column_width(iostat_cbdata_t *cb, enum iostat_type type) +{ + unsigned long column_width = 5; /* Normal niceprint */ + static unsigned long widths[] = { + /* + * Choose some sane default column sizes for printing the + * raw numbers. + */ + [IOS_DEFAULT] = 15, /* 1PB capacity */ + [IOS_LATENCY] = 10, /* 1B ns = 10sec */ + [IOS_QUEUES] = 6, /* 1M queue entries */ + [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ + [IOS_RQ_HISTO] = 6, /* 1M queue entries */ + }; + + if (cb->cb_literal) + column_width = widths[type]; + + return (column_width); +} + +/* + * Print the column labels, i.e: + * + * capacity operations bandwidth + * alloc free read write read write ... + * + * If force_column_width is set, use it for the column width. If not set, use + * the default column width. + */ +static void +print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, + const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) +{ + int i, idx, s; + int text_start, rw_column_width, spaces_to_end; + uint64_t flags = cb->cb_flags; + uint64_t f; + unsigned int column_width = force_column_width; + + /* For each bit set in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + if (!force_column_width) + column_width = default_column_width(cb, idx); + /* Print our top labels centered over "read write" label. */ + for (i = 0; i < label_array_len(labels[idx]); i++) { + const char *name = labels[idx][i].name; + /* + * We treat labels[][].columns == 0 as shorthand + * for one column. It makes writing out the label + * tables more concise. 
+ */ + unsigned int columns = MAX(1, labels[idx][i].columns); + unsigned int slen = strlen(name); + + rw_column_width = (column_width * columns) + + (2 * (columns - 1)); + + text_start = (int)((rw_column_width) / columns - + slen / columns); + if (text_start < 0) + text_start = 0; + + printf(" "); /* Two spaces between columns */ + + /* Space from beginning of column to label */ + for (s = 0; s < text_start; s++) + printf(" "); + + printf("%s", name); + + /* Print space after label to end of column */ + spaces_to_end = rw_column_width - text_start - slen; + if (spaces_to_end < 0) + spaces_to_end = 0; + + for (s = 0; s < spaces_to_end; s++) + printf(" "); + } + } +} + + +/* + * print_cmd_columns - Print custom column titles from -c + * + * If the user specified the "zpool status|iostat -c" then print their custom + * column titles in the header. For example, print_cmd_columns() would print + * the " col1 col2" part of this: + * + * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' + * ... + * capacity operations bandwidth + * pool alloc free read write read write col1 col2 + * ---------- ----- ----- ----- ----- ----- ----- ---- ---- + * mypool 269K 1008M 0 0 107 946 + * mirror 269K 1008M 0 0 107 946 + * sdb - - 0 0 102 473 val1 val2 + * sdc - - 0 0 5 473 val1 val2 + * ---------- ----- ----- ----- ----- ----- ----- ---- ---- + */ +static void +print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) +{ + int i, j; + vdev_cmd_data_t *data = &vcdl->data[0]; + + if (vcdl->count == 0 || data == NULL) + return; + + /* + * Each vdev cmd should have the same column names unless the user did + * something weird with their cmd. Just take the column names from the + * first vdev and assume it works for all of them. 
+ */ + for (i = 0; i < vcdl->uniq_cols_cnt; i++) { + printf(" "); + if (use_dashes) { + for (j = 0; j < vcdl->uniq_cols_width[i]; j++) + printf("-"); + } else { + printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], + vcdl->uniq_cols[i]); + } + } +} + + +/* + * Utility function to print out a line of dashes like: + * + * -------------------------------- ----- ----- ----- ----- ----- + * + * ...or a dashed named-row line like: + * + * logs - - - - - + * + * @cb: iostat data + * + * @force_column_width If non-zero, use the value as the column width. + * Otherwise use the default column widths. + * + * @name: Print a dashed named-row line starting + * with @name. Otherwise, print a regular + * dashed line. + */ +static void +print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *name) +{ + int i; + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + uint64_t f; + int idx; + const name_and_columns_t *labels; + const char *title; + + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + name ? 
strlen(name) : 0); + + + if (name) { + printf("%-*s", namewidth, name); + } else { + for (i = 0; i < namewidth; i++) + (void) printf("-"); + } + + /* For each bit in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + unsigned int column_width; + idx = lowbit64(f) - 1; + if (force_column_width) + column_width = force_column_width; + else + column_width = default_column_width(cb, idx); + + labels = iostat_bottom_labels[idx]; + for (i = 0; i < label_array_len(labels); i++) { + if (name) + printf(" %*s-", column_width - 1, " "); + else + printf(" %.*s", column_width, + "--------------------"); + } + } +} + + +static void +print_iostat_separator_impl(iostat_cbdata_t *cb, + unsigned int force_column_width) +{ + print_iostat_dashes(cb, force_column_width, NULL); +} + +static void +print_iostat_separator(iostat_cbdata_t *cb) +{ + print_iostat_separator_impl(cb, 0); +} + +static void +print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *histo_vdev_name) +{ + unsigned int namewidth; + const char *title; + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + histo_vdev_name ? 
strlen(histo_vdev_name) : 0); + + if (histo_vdev_name) + printf("%-*s", namewidth, histo_vdev_name); + else + printf("%*s", namewidth, ""); + + + print_iostat_labels(cb, force_column_width, iostat_top_labels); + printf("\n"); + + printf("%-*s", namewidth, title); + + print_iostat_labels(cb, force_column_width, iostat_bottom_labels); + if (cb->vcdl != NULL) + print_cmd_columns(cb->vcdl, 0); + + printf("\n"); + + print_iostat_separator_impl(cb, force_column_width); + + if (cb->vcdl != NULL) + print_cmd_columns(cb->vcdl, 1); + + printf("\n"); +} + +static void +print_iostat_header(iostat_cbdata_t *cb) +{ + print_iostat_header_impl(cb, 0, NULL); +} + + +/* + * Display a single statistic. + */ +static void +print_one_stat(uint64_t value, enum zfs_nicenum_format format, + unsigned int column_size, boolean_t scripted) +{ + char buf[64]; + + zfs_nicenum_format(value, buf, sizeof (buf), format); + + if (scripted) + printf("\t%s", buf); + else + printf(" %*s", column_size, buf); +} + +/* + * Calculate the default vdev stats + * + * Subtract oldvs from newvs, apply a scaling factor, and save the resulting + * stats into calcvs. + */ +static void +calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, + vdev_stat_t *calcvs) +{ + int i; + + memcpy(calcvs, newvs, sizeof (*calcvs)); + for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) + calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); + + for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) + calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); +} + +/* + * Internal representation of the extended iostats data. + * + * The extended iostat stats are exported in nvlists as either uint64_t arrays + * or single uint64_t's. We make both look like arrays to make them easier + * to process. In order to make single uint64_t's look like arrays, we set + * __data to the stat data, and then set *data = &__data with count = 1. Then, + * we can just use *data and count. 
+ */ +struct stat_array { + uint64_t *data; + uint_t count; /* Number of entries in data[] */ + uint64_t __data; /* Only used when data is a single uint64_t */ +}; + +static uint64_t +stat_histo_max(struct stat_array *nva, unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array64_max(nva[i].data, nva[i].count)); + + return (max); +} + +/* + * Helper function to lookup a uint64_t array or uint64_t value and store its + * data as a stat_array. If the nvpair is a single uint64_t value, then we make + * it look like a one element array to make it easier to process. + */ +static int +nvpair64_to_stat_array(nvlist_t *nvl, const char *name, + struct stat_array *nva) +{ + nvpair_t *tmp; + int ret; + + verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); + switch (nvpair_type(tmp)) { + case DATA_TYPE_UINT64_ARRAY: + ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); + break; + case DATA_TYPE_UINT64: + ret = nvpair_value_uint64(tmp, &nva->__data); + nva->data = &nva->__data; + nva->count = 1; + break; + default: + /* Not a uint64_t */ + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * Given a list of nvlist names, look up the extended stats in newnv and oldnv, + * subtract them, and return the results in a newly allocated stat_array. + * You must free the returned array after you are done with it with + * free_calc_stats(). + * + * Additionally, you can set "oldnv" to NULL if you simply want the newnv + * values. 
+ */ +static struct stat_array * +calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, + nvlist_t *newnv) +{ + nvlist_t *oldnvx = NULL, *newnvx; + struct stat_array *oldnva, *newnva, *calcnva; + int i, j; + unsigned int alloc_size = (sizeof (struct stat_array)) * len; + + /* Extract our extended stats nvlist from the main list */ + verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &newnvx) == 0); + if (oldnv) { + verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &oldnvx) == 0); + } + + newnva = safe_malloc(alloc_size); + oldnva = safe_malloc(alloc_size); + calcnva = safe_malloc(alloc_size); + + for (j = 0; j < len; j++) { + verify(nvpair64_to_stat_array(newnvx, names[j], + &newnva[j]) == 0); + calcnva[j].count = newnva[j].count; + alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); + calcnva[j].data = safe_malloc(alloc_size); + memcpy(calcnva[j].data, newnva[j].data, alloc_size); + + if (oldnvx) { + verify(nvpair64_to_stat_array(oldnvx, names[j], + &oldnva[j]) == 0); + for (i = 0; i < oldnva[j].count; i++) + calcnva[j].data[i] -= oldnva[j].data[i]; + } + } + free(newnva); + free(oldnva); + return (calcnva); +} + +static void +free_calc_stats(struct stat_array *nva, unsigned int len) +{ + int i; + for (i = 0; i < len; i++) + free(nva[i].data); + + free(nva); +} + +static void +print_iostat_histo(struct stat_array *nva, unsigned int len, + iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, + double scale) +{ + int i, j; + char buf[6]; + uint64_t val; + enum zfs_nicenum_format format; + unsigned int buckets; + unsigned int start_bucket; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + /* All these histos are the same size, so just use nva[0].count */ + buckets = nva[0].count; + + if (cb->cb_flags & IOS_RQ_HISTO_M) { + /* Start at 512 - req size should never be lower than this */ + start_bucket = 9; + } else { + start_bucket = 0; + } + + for 
(j = start_bucket; j < buckets; j++) { + /* Print histogram bucket label */ + if (cb->cb_flags & IOS_L_HISTO_M) { + /* Ending range of this bucket */ + val = (1UL << (j + 1)) - 1; + zfs_nicetime(val, buf, sizeof (buf)); + } else { + /* Request size (starting range of bucket) */ + val = (1UL << j); + zfs_nicenum(val, buf, sizeof (buf)); + } + + if (cb->cb_scripted) + printf("%llu", (u_longlong_t)val); + else + printf("%-*s", namewidth, buf); + + /* Print the values on the line */ + for (i = 0; i < len; i++) { + print_one_stat(nva[i].data[j] * scale, format, + column_width, cb->cb_scripted); + } + printf("\n"); + } +} + +static void +print_solid_separator(unsigned int length) +{ + while (length--) + printf("-"); + printf("\n"); +} + +static void +print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale, const char *name) +{ + unsigned int column_width; + unsigned int namewidth; + unsigned int entire_width; + enum iostat_type type; + struct stat_array *nva; + const char **names; + unsigned int names_len; + + /* What type of histo are we? */ + type = IOS_HISTO_IDX(cb->cb_flags); + + /* Get NULL-terminated array of nvlist names for our histo */ + names = vsx_type_to_nvlist[type]; + names_len = str_array_len(names); /* num of names */ + + nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); + + if (cb->cb_literal) { + column_width = MAX(5, + (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); + } else { + column_width = 5; + } + + namewidth = MAX(cb->cb_namewidth, + strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); + + /* + * Calculate the entire line width of what we're printing. 
The + * +2 is for the two spaces between columns: + */ + /* read write */ + /* ----- ----- */ + /* |___| <---------- column_width */ + /* */ + /* |__________| <--- entire_width */ + /* */ + entire_width = namewidth + (column_width + 2) * + label_array_len(iostat_bottom_labels[type]); + + if (cb->cb_scripted) + printf("%s\n", name); + else + print_iostat_header_impl(cb, column_width, name); + + print_iostat_histo(nva, names_len, cb, column_width, + namewidth, scale); + + free_calc_stats(nva, names_len); + if (!cb->cb_scripted) + print_solid_separator(entire_width); +} + +/* + * Calculate the average latency of a power-of-two latency histogram + */ +static uint64_t +single_histo_average(uint64_t *histo, unsigned int buckets) +{ + int i; + uint64_t count = 0, total = 0; + + for (i = 0; i < buckets; i++) { + /* + * Our buckets are power-of-two latency ranges. Use the + * midpoint latency of each bucket to calculate the average. + * For example: + * + * Bucket Midpoint + * 8ns-15ns: 12ns + * 16ns-31ns: 24ns + * ... + */ + if (histo[i] != 0) { + total += histo[i] * (((1UL << i) + ((1UL << i)/2))); + count += histo[i]; + } + } + + /* Prevent divide by zero */ + return (count == 0 ? 
0 : total / count); +} + +static void +print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + }; + + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_QUEUES); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = nva[i].data[0]; + print_one_stat(val, format, column_width, cb->cb_scripted); + } + + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +static void +print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + }; + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_LATENCY); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAWTIME; + else + format = ZFS_NICENUM_TIME; + + /* Print our avg latencies on the 
line */ + for (i = 0; i < ARRAY_SIZE(names); i++) { + /* Compute average latency for a latency histo */ + val = single_histo_average(nva[i].data, nva[i].count); + print_one_stat(val, format, column_width, cb->cb_scripted); + } + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +/* + * Print default statistics (capacity/operations/bandwidth) + */ +static void +print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) +{ + unsigned int column_width = default_column_width(cb, IOS_DEFAULT); + enum zfs_nicenum_format format; + char na; /* char to print for "not applicable" values */ + + if (cb->cb_literal) { + format = ZFS_NICENUM_RAW; + na = '0'; + } else { + format = ZFS_NICENUM_1024; + na = '-'; + } + + /* only toplevel vdevs have capacity stats */ + if (vs->vs_space == 0) { + if (cb->cb_scripted) + printf("\t%c\t%c", na, na); + else + printf(" %*c %*c", column_width, na, column_width, + na); + } else { + print_one_stat(vs->vs_alloc, format, column_width, + cb->cb_scripted); + print_one_stat(vs->vs_space - vs->vs_alloc, format, + column_width, cb->cb_scripted); + } + + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); +} + +static const char *class_name[] = { + VDEV_ALLOC_BIAS_DEDUP, + VDEV_ALLOC_BIAS_SPECIAL, + VDEV_ALLOC_CLASS_LOGS +}; + +/* + * Print out all the statistics for the given vdev. This can either be the + * toplevel configuration, or called recursively. If 'name' is NULL, then this + * is a verbose output, and we don't want to display the toplevel pool stats. + * + * Returns the number of stat lines printed. 
+ */ +static unsigned int +print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, + nvlist_t *newnv, iostat_cbdata_t *cb, int depth) +{ + nvlist_t **oldchild, **newchild; + uint_t c, children, oldchildren; + vdev_stat_t *oldvs, *newvs, *calcvs; + vdev_stat_t zerovs = { 0 }; + char *vname; + int i; + int ret = 0; + uint64_t tdelta; + double scale; + + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return (ret); + + calcvs = safe_malloc(sizeof (*calcvs)); + + if (oldnv != NULL) { + verify(nvlist_lookup_uint64_array(oldnv, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); + } else { + oldvs = &zerovs; + } + + /* Do we only want to see a specific vdev? */ + for (i = 0; i < cb->cb_vdev_names_count; i++) { + /* Yes we do. Is this the vdev? */ + if (strcmp(name, cb->cb_vdev_names[i]) == 0) { + /* + * This is our vdev. Since it is the only vdev we + * will be displaying, make depth = 0 so that it + * doesn't get indented. + */ + depth = 0; + break; + } + } + + if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { + /* Couldn't match the name */ + goto children; + } + + + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&newvs, &c) == 0); + + /* + * Print the vdev name unless it's is a histogram. Histograms + * display the vdev name in the header itself. + */ + if (!(cb->cb_flags & IOS_ANYHISTO_M)) { + if (cb->cb_scripted) { + printf("%s", name); + } else { + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - + depth), ""); + } + } + + /* Calculate our scaling factor */ + tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; + if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { + /* + * If we specify printing histograms with no time interval, then + * print the histogram numbers over the entire lifetime of the + * vdev. 
+ */ + scale = 1; + } else { + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; + } + + if (cb->cb_flags & IOS_DEFAULT_M) { + calc_default_iostats(oldvs, newvs, calcvs); + print_iostat_default(calcvs, cb, scale); + } + if (cb->cb_flags & IOS_LATENCY_M) + print_iostat_latency(cb, oldnv, newnv); + if (cb->cb_flags & IOS_QUEUES_M) + print_iostat_queues(cb, oldnv, newnv); + if (cb->cb_flags & IOS_ANYHISTO_M) { + printf("\n"); + print_iostat_histos(cb, oldnv, newnv, scale, name); + } + + if (cb->vcdl != NULL) { + char *path; + if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, + &path) == 0) { + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + } + } + + if (!(cb->cb_flags & IOS_ANYHISTO_M)) + printf("\n"); + + ret++; + +children: + + free(calcvs); + + if (!cb->cb_verbose) + return (ret); + + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, + &newchild, &children) != 0) + return (ret); + + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } + + /* + * print normal top-level devices + */ + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE, islog = B_FALSE; + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, + &islog); + + if (ishole || islog) + continue; + + if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] : NULL, + newchild[c], cb, depth + 2); + free(vname); + } + + /* + * print all other top-level devices + */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE; + char *bias = NULL; + char *type = NULL; + + (void) nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_LOG, &islog); + if (islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && + !cb->cb_scripted && !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, + class_name[n]); + } + printf("\n"); + printed = B_TRUE; + } + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? + oldchild[c] : NULL, newchild[c], cb, depth + 2); + free(vname); + } + } + + /* + * Include level 2 ARC devices in iostat output + */ + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, + &newchild, &children) != 0) + return (ret); + + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } + + if (children > 0) { + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "cache"); + } + printf("\n"); + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? 
oldchild[c] + : NULL, newchild[c], cb, depth + 2); + free(vname); + } + } + + return (ret); +} + +static int +refresh_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + boolean_t missing; + + /* + * If the pool has disappeared, remove it from the list and continue. + */ + if (zpool_refresh_stats(zhp, &missing) != 0) + return (-1); + + if (missing) + pool_list_remove(cb->cb_list, zhp); + + return (0); +} + +/* + * Callback to print out the iostats for the given pool. + */ +static int +print_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + nvlist_t *oldconfig, *newconfig; + nvlist_t *oldnvroot, *newnvroot; + int ret; + + newconfig = zpool_get_config(zhp, &oldconfig); + + if (cb->cb_iteration == 1) + oldconfig = NULL; + + verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, + &newnvroot) == 0); + + if (oldconfig == NULL) + oldnvroot = NULL; + else + verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, + &oldnvroot) == 0); + + ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, + cb, 0); + if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && + !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { + print_iostat_separator(cb); + if (cb->vcdl != NULL) { + print_cmd_columns(cb->vcdl, 1); + } + printf("\n"); + } + + return (ret); +} + +static int +get_columns(void) +{ + struct winsize ws; + int columns = 80; + int error; + + if (isatty(STDOUT_FILENO)) { + error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); + if (error == 0) + columns = ws.ws_col; + } else { + columns = 999; + } + + return (columns); +} + +/* + * Return the required length of the pool/vdev name column. The minimum + * allowed width and output formatting flags must be provided. 
+ */ +static int +get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) +{ + nvlist_t *config, *nvroot; + int width = min_width; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + unsigned int poolname_len = strlen(zpool_get_name(zhp)); + if (verbose == B_FALSE) { + width = MAX(poolname_len, min_width); + } else { + width = MAX(poolname_len, + max_width(zhp, nvroot, 0, min_width, flags)); + } + } + + return (width); +} + +/* + * Parse the input string, get the 'interval' and 'count' value if there is one. + */ +static void +get_interval_count(int *argcp, char **argv, float *iv, + unsigned long *cnt) +{ + float interval = 0; + unsigned long count = 0; + int argc = *argcp; + + /* + * Determine if the last argument is an integer or a pool name + */ + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + char *end; + + errno = 0; + interval = strtof(argv[argc - 1], &end); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + /* + * Ignore the last parameter + */ + argc--; + } else { + /* + * If this is not a valid number, just plow on. The + * user will get a more informative error message later + * on. + */ + interval = 0; + } + } + + /* + * If the last argument is also an integer, then we have both a count + * and an interval. 
+ */ + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + char *end; + + errno = 0; + count = interval; + interval = strtof(argv[argc - 1], &end); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + + /* + * Ignore the last parameter + */ + argc--; + } else { + interval = 0; + } + } + + *iv = interval; + *cnt = count; + *argcp = argc; +} + +static void +get_timestamp_arg(char c) +{ + if (c == 'u') + timestamp_fmt = UDATE; + else if (c == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); +} + +/* + * Return stat flags that are supported by all pools by both the module and + * zpool iostat. "*data" should be initialized to all 0xFFs before running. + * It will get ANDed down until only the flags that are supported on all pools + * remain. + */ +static int +get_stat_flags_cb(zpool_handle_t *zhp, void *data) +{ + uint64_t *mask = data; + nvlist_t *config, *nvroot, *nvx; + uint64_t flags = 0; + int i, j; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + /* Default stats are always supported, but for completeness.. */ + if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) + flags |= IOS_DEFAULT_M; + + /* Get our extended stats nvlist from the main list */ + if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, + &nvx) != 0) { + /* + * No extended stats; they're probably running an older + * module. No big deal, we support that too. 
+ */ + goto end; + } + + /* For each extended stat, make sure all its nvpairs are supported */ + for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { + if (!vsx_type_to_nvlist[j][0]) + continue; + + /* Start off by assuming the flag is supported, then check */ + flags |= (1ULL << j); + for (i = 0; vsx_type_to_nvlist[j][i]; i++) { + if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { + /* flag isn't supported */ + flags = flags & ~(1ULL << j); + break; + } + } + } +end: + *mask = *mask & flags; + return (0); +} + +/* + * Return a bitmask of stats that are supported on all pools by both the module + * and zpool iostat. + */ +static uint64_t +get_stat_flags(zpool_list_t *list) +{ + uint64_t mask = -1; + + /* + * get_stat_flags_cb() will lop off bits from "mask" until only the + * flags that are supported on all pools remain. + */ + pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); + return (mask); +} + +/* + * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. + */ +static int +is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +{ + iostat_cbdata_t *cb = cb_data; + char *name = NULL; + int ret = 0; + + name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); + + if (strcmp(name, cb->cb_vdev_names[0]) == 0) + ret = 1; /* match */ + free(name); + + return (ret); +} + +/* + * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. + */ +static int +is_vdev(zpool_handle_t *zhp, void *cb_data) +{ + return (for_each_vdev(zhp, is_vdev_cb, cb_data)); +} + +/* + * Check if vdevs are in a pool + * + * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise + * return 0. If pool_name is NULL, then search all pools. 
+ */ +static int +are_vdevs_in_pool(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + char **tmp_name; + int ret = 0; + int i; + int pool_count = 0; + + if ((argc == 0) || !*argv) + return (0); + + if (pool_name) + pool_count = 1; + + /* Temporarily hijack cb_vdev_names for a second... */ + tmp_name = cb->cb_vdev_names; + + /* Go though our list of prospective vdev names */ + for (i = 0; i < argc; i++) { + cb->cb_vdev_names = argv + i; + + /* Is this name a vdev in our pools? */ + ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, + is_vdev, cb); + if (!ret) { + /* No match */ + break; + } + } + + cb->cb_vdev_names = tmp_name; + + return (ret); +} + +static int +is_pool_cb(zpool_handle_t *zhp, void *data) +{ + char *name = data; + if (strcmp(name, zpool_get_name(zhp)) == 0) + return (1); + + return (0); +} + +/* + * Do we have a pool named *name? If so, return 1, otherwise 0. + */ +static int +is_pool(char *name) +{ + return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); +} + +/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ +static int +are_all_pools(int argc, char **argv) +{ + if ((argc == 0) || !*argv) + return (0); + + while (--argc >= 0) + if (!is_pool(argv[argc])) + return (0); + + return (1); +} + +/* + * Helper function to print out vdev/pool names we can't resolve. Used for an + * error message. 
+ */ +static void +error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + int i; + char *name; + char *str; + for (i = 0; i < argc; i++) { + name = argv[i]; + + if (is_pool(name)) + str = gettext("pool"); + else if (are_vdevs_in_pool(1, &name, pool_name, cb)) + str = gettext("vdev in this pool"); + else if (are_vdevs_in_pool(1, &name, NULL, cb)) + str = gettext("vdev in another pool"); + else + str = gettext("unknown"); + + fprintf(stderr, "\t%s (%s)\n", name, str); + } +} + +/* + * Same as get_interval_count(), but with additional checks to not misinterpret + * guids as interval/count values. Assumes VDEV_NAME_GUID is set in + * cb.cb_name_flags. + */ +static void +get_interval_count_filter_guids(int *argc, char **argv, float *interval, + unsigned long *count, iostat_cbdata_t *cb) +{ + char **tmpargv = argv; + int argc_for_interval = 0; + + /* Is the last arg an interval value? Or a guid? */ + if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { + /* + * The last arg is not a guid, so it's probably an + * interval value. + */ + argc_for_interval++; + + if (*argc >= 2 && + !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { + /* + * The 2nd to last arg is not a guid, so it's probably + * an interval value. + */ + argc_for_interval++; + } + } + + /* Point to our list of possible intervals */ + tmpargv = &argv[*argc - argc_for_interval]; + + *argc = *argc - argc_for_interval; + get_interval_count(&argc_for_interval, tmpargv, + interval, count); +} + +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. + */ +static void +fsleep(float sec) +{ + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + nanosleep(&req, NULL); +} + +/* + * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or + * if we were unable to determine its size. 
+ */ +static int +terminal_height(void) +{ + struct winsize win; + + if (isatty(STDOUT_FILENO) == 0) + return (-1); + + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) + return (win.ws_row); + + return (-1); +} + +/* + * Run one of the zpool status/iostat -c scripts with the help (-h) option and + * print the result. + * + * name: Short name of the script ('iostat'). + * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); + */ +static void +print_zpool_script_help(char *name, char *path) +{ + char *argv[] = {path, "-h", NULL}; + char **lines = NULL; + int lines_cnt = 0; + int rc; + + rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, + &lines_cnt); + if (rc != 0 || lines == NULL || lines_cnt <= 0) { + if (lines != NULL) + libzfs_free_str_array(lines, lines_cnt); + return; + } + + for (int i = 0; i < lines_cnt; i++) + if (!is_blank_str(lines[i])) + printf(" %-14s %s\n", name, lines[i]); + + libzfs_free_str_array(lines, lines_cnt); +} + +/* + * Go though the zpool status/iostat -c scripts in the user's path, run their + * help option (-h), and print out the results. + */ +static void +print_zpool_dir_scripts(char *dirpath) +{ + DIR *dir; + struct dirent *ent; + char fullpath[MAXPATHLEN]; + struct stat dir_stat; + + if ((dir = opendir(dirpath)) != NULL) { + /* print all the files and directories within directory */ + while ((ent = readdir(dir)) != NULL) { + sprintf(fullpath, "%s/%s", dirpath, ent->d_name); + + /* Print the scripts */ + if (stat(fullpath, &dir_stat) == 0) + if (dir_stat.st_mode & S_IXUSR && + S_ISREG(dir_stat.st_mode)) + print_zpool_script_help(ent->d_name, + fullpath); + } + closedir(dir); + } +} + +/* + * Print out help text for all zpool status/iostat -c scripts. 
+ */ +static void +print_zpool_script_list(char *subcommand) +{ + char *dir, *sp; + + printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); + + sp = zpool_get_cmd_search_path(); + if (sp == NULL) + return; + + dir = strtok(sp, ":"); + while (dir != NULL) { + print_zpool_dir_scripts(dir); + dir = strtok(NULL, ":"); + } + + free(sp); +} + +/* + * Set the minimum pool/vdev name column width. The width must be at least 10, + * but may be as large as the column width - 42 so it still fits on one line. + * NOTE: 42 is the width of the default capacity/operations/bandwidth output + */ +static int +get_namewidth_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + int width, available_width; + + /* + * get_namewidth() returns the maximum width of any name in that column + * for any pool/vdev/device line that will be output. + */ + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + + /* + * The width we are calculating is the width of the header and also the + * padding width for names that are less than maximum width. The stats + * take up 42 characters, so the width available for names is: + */ + available_width = get_columns() - 42; + + /* + * If the maximum width fits on a screen, then great! Make everything + * line up by justifying all lines to the same width. If that max + * width is larger than what's available, the name plus stats won't fit + * on one line, and justifying to that width would cause every line to + * wrap on the screen. We only want lines with long names to wrap. + * Limit the padding to what won't wrap. + */ + if (width > available_width) + width = available_width; + + /* + * And regardless of whatever the screen width is (get_columns can + * return 0 if the width is not known or less than 42 for a narrow + * terminal) have the width be a minimum of 10. 
+ */ + if (width < 10) + width = 10; + + /* Save the calculated width */ + cb->cb_namewidth = width; + + return (0); +} + +/* + * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] + * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] + * [interval [count]] + * + * -c CMD For each vdev, run command CMD + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -P Display full path for vdev name. + * -v Display statistics for individual vdevs + * -h Display help + * -p Display values in parsable (exact) format. + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -l Display average latency + * -q Display queue depths + * -w Display latency histograms + * -r Display request size histogram + * -T Display a timestamp in date(1) or Unix format + * -n Only print headers once + * + * This command can be tricky because we want to be able to deal with pool + * creation/destruction as well as vdev configuration changes. The bulk of this + * processing is handled by the pool_list_* routines in zpool_iter.c. We rely + * on pool_list_update() to detect the addition of new pools. Configuration + * changes are all handled within libzfs. 
+ */ +int +zpool_do_iostat(int argc, char **argv) +{ + int c; + int ret; + int npools; + float interval = 0; + unsigned long count = 0; + int winheight = 24; + zpool_list_t *list; + boolean_t verbose = B_FALSE; + boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; + boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; + boolean_t omit_since_boot = B_FALSE; + boolean_t guid = B_FALSE; + boolean_t follow_links = B_FALSE; + boolean_t full_name = B_FALSE; + boolean_t headers_once = B_FALSE; + iostat_cbdata_t cb = { 0 }; + char *cmd = NULL; + + /* Used for printing error message */ + const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', + [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; + + uint64_t unsupported_flags; + + /* check options */ + while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { + switch (c) { + case 'c': + if (cmd != NULL) { + fprintf(stderr, + gettext("Can't set -c flag twice\n")); + exit(1); + } + + if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { + fprintf(stderr, gettext( + "Can't run -c, disabled by " + "ZPOOL_SCRIPTS_ENABLED.\n")); + exit(1); + } + + if ((getuid() <= 0 || geteuid() <= 0) && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { + fprintf(stderr, gettext( + "Can't run -c with root privileges " + "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); + exit(1); + } + cmd = optarg; + verbose = B_TRUE; + break; + case 'g': + guid = B_TRUE; + break; + case 'L': + follow_links = B_TRUE; + break; + case 'P': + full_name = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + verbose = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 'l': + latency = B_TRUE; + break; + case 'q': + queues = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'w': + l_histo = B_TRUE; + break; + case 'r': + rq_histo = B_TRUE; + break; + case 'y': + omit_since_boot = B_TRUE; + break; + case 'n': + headers_once = 
B_TRUE; + break; + case 'h': + usage(B_FALSE); + break; + case '?': + if (optopt == 'c') { + print_zpool_script_list("iostat"); + exit(0); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + cb.cb_literal = parsable; + cb.cb_scripted = scripted; + + if (guid) + cb.cb_name_flags |= VDEV_NAME_GUID; + if (follow_links) + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + if (full_name) + cb.cb_name_flags |= VDEV_NAME_PATH; + cb.cb_iteration = 0; + cb.cb_namewidth = 0; + cb.cb_verbose = verbose; + + /* Get our interval and count values (if any) */ + if (guid) { + get_interval_count_filter_guids(&argc, argv, &interval, + &count, &cb); + } else { + get_interval_count(&argc, argv, &interval, &count); + } + + if (argc == 0) { + /* No args, so just print the defaults. */ + } else if (are_all_pools(argc, argv)) { + /* All the args are pool names */ + } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { + /* All the args are vdevs */ + cb.cb_vdev_names = argv; + cb.cb_vdev_names_count = argc; + argc = 0; /* No pools to process */ + } else if (are_all_pools(1, argv)) { + /* The first arg is a pool name */ + if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { + /* ...and the rest are vdev names */ + cb.cb_vdev_names = argv + 1; + cb.cb_vdev_names_count = argc - 1; + argc = 1; /* One pool to process */ + } else { + fprintf(stderr, gettext("Expected either a list of ")); + fprintf(stderr, gettext("pools, or list of vdevs in")); + fprintf(stderr, " \"%s\", ", argv[0]); + fprintf(stderr, gettext("but got:\n")); + error_list_unresolved_vdevs(argc - 1, argv + 1, + argv[0], &cb); + fprintf(stderr, "\n"); + usage(B_FALSE); + return (1); + } + } else { + /* + * The args don't make sense. The first arg isn't a pool name, + * nor are all the args vdevs. 
+ */ + fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); + fprintf(stderr, "\n"); + return (1); + } + + if (cb.cb_vdev_names_count != 0) { + /* + * If user specified vdevs, it implies verbose. + */ + cb.cb_verbose = B_TRUE; + } + + /* + * Construct the list of all interesting pools. + */ + ret = 0; + if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0 && argc != 0) { + pool_list_free(list); + return (1); + } + + if (pool_list_count(list) == 0 && interval == 0) { + pool_list_free(list); + (void) fprintf(stderr, gettext("no pools available\n")); + return (1); + } + + if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); + usage(B_FALSE); + return (1); + } + + if (l_histo && rq_histo) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("Only one of [-r|-w] can be passed at a time\n")); + usage(B_FALSE); + return (1); + } + + /* + * Enter the main iostat loop. + */ + cb.cb_list = list; + + if (l_histo) { + /* + * Histograms tables look out of place when you try to display + * them with the other stats, so make a rule that you can only + * print histograms by themselves. + */ + cb.cb_flags = IOS_L_HISTO_M; + } else if (rq_histo) { + cb.cb_flags = IOS_RQ_HISTO_M; + } else { + cb.cb_flags = IOS_DEFAULT_M; + if (latency) + cb.cb_flags |= IOS_LATENCY_M; + if (queues) + cb.cb_flags |= IOS_QUEUES_M; + } + + /* + * See if the module supports all the stats we want to display. + */ + unsupported_flags = cb.cb_flags & ~get_stat_flags(list); + if (unsupported_flags) { + uint64_t f; + int idx; + fprintf(stderr, + gettext("The loaded zfs module doesn't support:")); + + /* for each bit set in unsupported_flags */ + for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + fprintf(stderr, " -%c", flag_to_arg[idx]); + } + + fprintf(stderr, ". 
Try running a newer module.\n"); + pool_list_free(list); + + return (1); + } + + for (;;) { + if ((npools = pool_list_count(list)) == 0) + (void) fprintf(stderr, gettext("no pools available\n")); + else { + /* + * If this is the first iteration and -y was supplied + * we skip any printing. + */ + boolean_t skip = (omit_since_boot && + cb.cb_iteration == 0); + + /* + * Refresh all statistics. This is done as an + * explicit step before calculating the maximum name + * width, so that any * configuration changes are + * properly accounted for. + */ + (void) pool_list_iter(list, B_FALSE, refresh_iostat, + &cb); + + /* + * Iterate over all pools to determine the maximum width + * for the pool / device name column across all pools. + */ + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, + get_namewidth_iostat, &cb); + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (cmd != NULL && cb.cb_verbose && + !(cb.cb_flags & IOS_ANYHISTO_M)) { + cb.vcdl = all_pools_for_each_vdev_run(argc, + argv, cmd, g_zfs, cb.cb_vdev_names, + cb.cb_vdev_names_count, cb.cb_name_flags); + } else { + cb.vcdl = NULL; + } + + + /* + * Check terminal size so we can print headers + * even when terminal window has its height + * changed. + */ + winheight = terminal_height(); + /* + * Are we connected to TTY? If not, headers_once + * should be true, to avoid breaking scripts. + */ + if (winheight < 0) + headers_once = B_TRUE; + + /* + * If it's the first time and we're not skipping it, + * or either skip or verbose mode, print the header. + * + * The histogram code explicitly prints its header on + * every vdev, so skip this for histograms. 
+ */ + if (((++cb.cb_iteration == 1 && !skip) || + (skip != verbose) || + (!headers_once && + (cb.cb_iteration % winheight) == 0)) && + (!(cb.cb_flags & IOS_ANYHISTO_M)) && + !cb.cb_scripted) + print_iostat_header(&cb); + + if (skip) { + (void) fsleep(interval); + continue; + } + + pool_list_iter(list, B_FALSE, print_iostat, &cb); + + /* + * If there's more than one pool, and we're not in + * verbose mode (which prints a separator for us), + * then print a separator. + * + * In addition, if we're printing specific vdevs then + * we also want an ending separator. + */ + if (((npools > 1 && !verbose && + !(cb.cb_flags & IOS_ANYHISTO_M)) || + (!(cb.cb_flags & IOS_ANYHISTO_M) && + cb.cb_vdev_names_count)) && + !cb.cb_scripted) { + print_iostat_separator(&cb); + if (cb.vcdl != NULL) + print_cmd_columns(cb.vcdl, 1); + printf("\n"); + } + + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + + } + + /* + * Flush the output so that redirection to a file isn't buffered + * indefinitely. + */ + (void) fflush(stdout); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) fsleep(interval); + } + + pool_list_free(list); + + return (ret); +} + +typedef struct list_cbdata { + boolean_t cb_verbose; + int cb_name_flags; + int cb_namewidth; + boolean_t cb_scripted; + zprop_list_t *cb_proplist; + boolean_t cb_literal; +} list_cbdata_t; + + +/* + * Given a list of columns to display, output appropriate headers for each one. + */ +static void +print_header(list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZPOOL_MAXPROPLEN]; + const char *header; + boolean_t first = B_TRUE; + boolean_t right_justify; + size_t width = 0; + + for (; pl != NULL; pl = pl->pl_next) { + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. 
+ */ + width = cb->cb_namewidth; + } + + if (!first) + (void) printf(" "); + else + first = B_FALSE; + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + int i; + + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } + + if (pl->pl_next == NULL && !right_justify) + (void) printf("%s", header); + else if (right_justify) + (void) printf("%*s", (int)width, header); + else + (void) printf("%-*s", (int)width, header); + } + + (void) printf("\n"); +} + +/* + * Given a pool and a list of properties, print out all the properties according + * to the described layout. Used by zpool_do_list(). + */ +static void +print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + boolean_t first = B_TRUE; + char property[ZPOOL_MAXPROPLEN]; + char *propstr; + boolean_t right_justify; + size_t width; + + for (; pl != NULL; pl = pl->pl_next) { + + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. 
+ */ + width = cb->cb_namewidth; + } + + if (!first) { + if (cb->cb_scripted) + (void) printf("\t"); + else + (void) printf(" "); + } else { + first = B_FALSE; + } + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + if (zpool_get_prop(zhp, pl->pl_prop, property, + sizeof (property), NULL, cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + + right_justify = zpool_prop_align_right(pl->pl_prop); + } else if ((zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop)) && + zpool_prop_get_feature(zhp, pl->pl_user_prop, property, + sizeof (property)) == 0) { + propstr = property; + } else { + propstr = "-"; + } + + + /* + * If this is being called in scripted mode, or if this is the + * last column and it is left-justified, don't include a width + * format specifier. + */ + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) + (void) printf("%s", propstr); + else if (right_justify) + (void) printf("%*s", (int)width, propstr); + else + (void) printf("%-*s", (int)width, propstr); + } + + (void) printf("\n"); +} + +static void +print_one_column(zpool_prop_t prop, uint64_t value, const char *str, + boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) +{ + char propval[64]; + boolean_t fixed; + size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); + + switch (prop) { + case ZPOOL_PROP_EXPANDSZ: + case ZPOOL_PROP_CHECKPOINT: + case ZPOOL_PROP_DEDUPRATIO: + if (value == 0) + (void) strlcpy(propval, "-", sizeof (propval)); + else + zfs_nicenum_format(value, propval, sizeof (propval), + format); + break; + case ZPOOL_PROP_FRAGMENTATION: + if (value == ZFS_FRAG_INVALID) { + (void) strlcpy(propval, "-", sizeof (propval)); + } else if (format == ZFS_NICENUM_RAW) { + (void) snprintf(propval, sizeof (propval), "%llu", + (unsigned long long)value); + } else { + (void) snprintf(propval, sizeof (propval), "%llu%%", + (unsigned long long)value); + } + break; + case ZPOOL_PROP_CAPACITY: + /* capacity 
value is in parts-per-10,000 (aka permyriad) */ + if (format == ZFS_NICENUM_RAW) + (void) snprintf(propval, sizeof (propval), "%llu", + (unsigned long long)value / 100); + else + (void) snprintf(propval, sizeof (propval), + value < 1000 ? "%1.2f%%" : value < 10000 ? + "%2.1f%%" : "%3.0f%%", value / 100.0); + break; + case ZPOOL_PROP_HEALTH: + width = 8; + snprintf(propval, sizeof (propval), "%-*s", (int)width, str); + break; + default: + zfs_nicenum_format(value, propval, sizeof (propval), format); + } + + if (!valid) + (void) strlcpy(propval, "-", sizeof (propval)); + + if (scripted) + (void) printf("\t%s", propval); + else + (void) printf(" %*s", (int)width, propval); +} + +/* + * print static default line per vdev + * not compatible with '-o' <proplist> option + */ +static void +print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + list_cbdata_t *cb, int depth, boolean_t isspare) +{ + nvlist_t **child; + vdev_stat_t *vs; + uint_t c, children; + char *vname; + boolean_t scripted = cb->cb_scripted; + uint64_t islog = B_FALSE; + char *dashes = "%-*s - - - - " + "- - - - -\n"; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (name != NULL) { + boolean_t toplevel = (vs->vs_space != 0); + uint64_t cap; + enum zfs_nicenum_format format; + const char *state; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return; + + if (scripted) + (void) printf("\t%s", name); + else if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - depth), ""); + + /* + * Print the properties for the individual vdevs. Some + * properties are only applicable to toplevel vdevs. The + * 'toplevel' boolean value is passed to the print_one_column() + * to indicate that the value is valid. 
+ */ + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, + toplevel, format); + print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, + scripted, toplevel, format); + print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, + NULL, scripted, toplevel, format); + print_one_column(ZPOOL_PROP_CHECKPOINT, + vs->vs_checkpoint_space, NULL, scripted, toplevel, format); + print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, + scripted, B_TRUE, format); + print_one_column(ZPOOL_PROP_FRAGMENTATION, + vs->vs_fragmentation, NULL, scripted, + (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), + format); + cap = (vs->vs_space == 0) ? 0 : + (vs->vs_alloc * 10000 / vs->vs_space); + print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, + scripted, toplevel, format); + print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, + scripted, toplevel, format); + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, + B_TRUE, format); + (void) printf("\n"); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + /* list the normal vdevs first */ + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) + continue; + + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); + free(vname); + } + + /* list the classes: 'logs', 'dedup', and 'special' */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + + for (c = 0; c < children; c++) { + char *bias = NULL; + char 
*type = NULL; + + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog) == 0 && islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, + class_name[n]); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_FALSE); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_FALSE); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, + &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "spare"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_TRUE); + free(vname); + } + } +} + +/* + * Generic callback function to list a pool. 
+ */ +static int +list_callback(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cbp = data; + + print_pool(zhp, cbp); + + if (cbp->cb_verbose) { + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); + } + + return (0); +} + +/* + * Set the minimum pool/vdev name column width. The width must be at least 9, + * but may be as large as needed. + */ +static int +get_namewidth_list(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cb = data; + int width; + + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + + if (width < 9) + width = 9; + + cb->cb_namewidth = width; + + return (0); +} + +/* + * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] + * + * -g Display guid for individual vdev name. + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -L Follow links when resolving vdev path name. + * -o List of properties to display. Defaults to + * "name,size,allocated,free,expandsize,fragmentation,capacity," + * "dedupratio,health,altroot" + * -p Display values in parsable (exact) format. + * -P Display full path for vdev name. + * -T Display a timestamp in date(1) or Unix format + * + * List all pools in the system, whether or not they're healthy. Output space + * statistics for each one, as well as health status summary. 
+ */ +int +zpool_do_list(int argc, char **argv) +{ + int c; + int ret = 0; + list_cbdata_t cb = { 0 }; + static char default_props[] = + "name,size,allocated,free,checkpoint,expandsize,fragmentation," + "capacity,dedupratio,health,altroot"; + char *props = default_props; + float interval = 0; + unsigned long count = 0; + zpool_list_t *list; + boolean_t first = B_TRUE; + + /* check options */ + while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { + switch (c) { + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'o': + props = optarg; + break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + cb.cb_verbose = B_TRUE; + cb.cb_namewidth = 8; /* 8 until precalc is avail */ + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + for (;;) { + if ((list = pool_list_get(argc, argv, &cb.cb_proplist, + &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0) + break; + + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (!cb.cb_scripted && (first || cb.cb_verbose)) { + print_header(&cb); + first = B_FALSE; + } + ret = pool_list_iter(list, B_TRUE, list_callback, &cb); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + pool_list_free(list); + (void) fsleep(interval); + } + + if (argc == 0 && !cb.cb_scripted && 
pool_list_count(list) == 0) { + (void) printf(gettext("no pools available\n")); + ret = 0; + } + + pool_list_free(list); + zprop_free_list(cb.cb_proplist); + return (ret); +} + +static int +zpool_do_attach_or_replace(int argc, char **argv, int replacing) +{ + boolean_t force = B_FALSE; + boolean_t rebuild = B_FALSE; + boolean_t wait = B_FALSE; + int c; + nvlist_t *nvroot; + char *poolname, *old_disk, *new_disk; + zpool_handle_t *zhp; + nvlist_t *props = NULL; + char *propval; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "fo:sw")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; + case 's': + rebuild = B_TRUE; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if (argc < 2) { + (void) fprintf(stderr, + gettext("missing <device> specification\n")); + usage(B_FALSE); + } + + old_disk = argv[1]; + + if (argc < 3) { + if (!replacing) { + (void) fprintf(stderr, + gettext("missing <new_device> specification\n")); + usage(B_FALSE); + } + new_disk = old_disk; + argc -= 1; + argv += 1; + } else { + new_disk = argv[2]; + argc -= 2; + argv += 2; + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + nvlist_free(props); + return (1); + } + + if (zpool_get_config(zhp, NULL) == NULL) { + (void) 
fprintf(stderr, gettext("pool '%s' is unavailable\n"), + poolname); + zpool_close(zhp); + nvlist_free(props); + return (1); + } + + /* unless manually specified use "ashift" pool property (if set) */ + if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { + int intval; + zprop_source_t src; + char strval[ZPOOL_MAXPROPLEN]; + + intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); + if (src != ZPROP_SRC_DEFAULT) { + (void) sprintf(strval, "%" PRId32, intval); + verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, + &props, B_TRUE) == 0); + } + } + + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, + argc, argv); + if (nvroot == NULL) { + zpool_close(zhp); + nvlist_free(props); + return (1); + } + + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, + rebuild); + + if (ret == 0 && wait) + ret = zpool_wait(zhp, + replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER); + + nvlist_free(props); + nvlist_free(nvroot); + zpool_close(zhp); + + return (ret); +} + +/* + * zpool replace [-fsw] [-o property=value] <pool> <device> <new_device> + * + * -f Force attach, even if <new_device> appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. + * -o Set property=value. + * -w Wait for replacing to complete before returning + * + * Replace <device> with <new_device>. + */ +/* ARGSUSED */ +int +zpool_do_replace(int argc, char **argv) +{ + return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); +} + +/* + * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device> + * + * -f Force attach, even if <new_device> appears to be in use. + * -s Use sequential instead of healing reconstruction for resilver. + * -o Set property=value. + * -w Wait for resilvering to complete before returning + * + * Attach <new_device> to the mirror containing <device>. If <device> is not + * part of a mirror, then <device> will be transformed into a mirror of + * <device> and <new_device>. 
In either case, <new_device> will begin life + * with a DTL of [0, now], and will immediately begin to resilver itself. + */ +int +zpool_do_attach(int argc, char **argv) +{ + return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); +} + +/* + * zpool detach [-f] <pool> <device> + * + * -f Force detach of <device>, even if DTLs argue against it + * (not supported yet) + * + * Detach a device from a mirror. The operation will be refused if <device> + * is the last device in the mirror, or if the DTLs indicate that this device + * has the only valid copy of some data. + */ +/* ARGSUSED */ +int +zpool_do_detach(int argc, char **argv) +{ + int c; + char *poolname, *path; + zpool_handle_t *zhp; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + if (argc < 2) { + (void) fprintf(stderr, + gettext("missing <device> specification\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + path = argv[1]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + ret = zpool_vdev_detach(zhp, path); + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool split [-gLnP] [-o prop=val] ... + * [-o mntopt] ... + * [-R altroot] <pool> <newpool> [<device> ...] + * + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -n Do not split the pool, but display the resulting layout if + * it were to be split. + * -o Set property=value, or set mount options. + * -P Display full path for vdev name. + * -R Mount the split-off pool under an alternate root. + * -l Load encryption keys while importing. + * + * Splits the named pool and gives it the new pool name. 
Devices to be split + * off may be listed, provided that no more than one device is specified + * per top-level vdev mirror. The newly split pool is left in an exported + * state unless -R is specified. + * + * Restrictions: the top-level of the pool pool must only be made up of + * mirrors; all devices in the pool must be healthy; no device may be + * undergoing a resilvering operation. + */ +int +zpool_do_split(int argc, char **argv) +{ + char *srcpool, *newpool, *propval; + char *mntopts = NULL; + splitflags_t flags; + int c, ret = 0; + boolean_t loadkeys = B_FALSE; + zpool_handle_t *zhp; + nvlist_t *config, *props = NULL; + + flags.dryrun = B_FALSE; + flags.import = B_FALSE; + flags.name_flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { + switch (c) { + case 'g': + flags.name_flags |= VDEV_NAME_GUID; + break; + case 'L': + flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'R': + flags.import = B_TRUE; + if (add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, + &props, B_TRUE) != 0) { + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'l': + loadkeys = B_TRUE; + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE) != 0) { + nvlist_free(props); + usage(B_FALSE); + } + } else { + mntopts = optarg; + } + break; + case 'P': + flags.name_flags |= VDEV_NAME_PATH; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + break; + } + } + + if (!flags.import && mntopts != NULL) { + (void) fprintf(stderr, gettext("setting mntopts is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + if (!flags.import && loadkeys) { + (void) fprintf(stderr, gettext("loading 
keys is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("Missing new pool name\n")); + usage(B_FALSE); + } + + srcpool = argv[0]; + newpool = argv[1]; + + argc -= 2; + argv += 2; + + if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { + nvlist_free(props); + return (1); + } + + config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); + if (config == NULL) { + ret = 1; + } else { + if (flags.dryrun) { + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), newpool); + print_vdev_tree(NULL, newpool, config, 0, "", + flags.name_flags); + } + } + + zpool_close(zhp); + + if (ret != 0 || flags.dryrun || !flags.import) { + nvlist_free(config); + nvlist_free(props); + return (ret); + } + + /* + * The split was successful. Now we need to open the new + * pool and import it. + */ + if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { + nvlist_free(config); + nvlist_free(props); + return (1); + } + + if (loadkeys) { + ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); + if (ret != 0) + ret = 1; + } + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + ret = 1; + (void) fprintf(stderr, gettext("Split was successful, but " + "the datasets could not all be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } + zpool_close(zhp); + nvlist_free(config); + nvlist_free(props); + + return (ret); +} + + + +/* + * zpool online <pool> <device> ... 
+ */ +int +zpool_do_online(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + vdev_state_t newstate; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, "e")) != -1) { + switch (c) { + case 'e': + flags |= ZFS_ONLINE_EXPAND; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device name\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + for (i = 1; i < argc; i++) { + if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { + if (newstate != VDEV_STATE_HEALTHY) { + (void) printf(gettext("warning: device '%s' " + "onlined, but remains in faulted state\n"), + argv[i]); + if (newstate == VDEV_STATE_FAULTED) + (void) printf(gettext("use 'zpool " + "clear' to restore a faulted " + "device\n")); + else + (void) printf(gettext("use 'zpool " + "replace' to replace devices " + "that are no longer present\n")); + } + } else { + ret = 1; + } + } + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool offline [-ft] <pool> <device> ... + * + * -f Force the device into a faulted state. + * + * -t Only take the device off-line temporarily. The offline/faulted + * state will not be persistent across reboots. 
+ */ +/* ARGSUSED */ +int +zpool_do_offline(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + boolean_t istmp = B_FALSE; + boolean_t fault = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "ft")) != -1) { + switch (c) { + case 'f': + fault = B_TRUE; + break; + case 't': + istmp = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device name\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + for (i = 1; i < argc; i++) { + if (fault) { + uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + vdev_aux_t aux; + if (istmp == B_FALSE) { + /* Force the fault to persist across imports */ + aux = VDEV_AUX_EXTERNAL_PERSIST; + } else { + aux = VDEV_AUX_EXTERNAL; + } + + if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) + ret = 1; + } else { + if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) + ret = 1; + } + } + + zpool_close(zhp); + + return (ret); +} + +/* + * zpool clear <pool> [device] + * + * Clear all errors associated with a pool or a particular device. 
+ */ +int +zpool_do_clear(int argc, char **argv) +{ + int c; + int ret = 0; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + nvlist_t *policy = NULL; + zpool_handle_t *zhp; + char *pool, *device; + + /* check options */ + while ((c = getopt(argc, argv, "FnX")) != -1) { + switch (c) { + case 'F': + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In future, further rewind policy choices can be passed along here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, + rewind_policy) != 0) { + return (1); + } + + pool = argv[0]; + device = argc == 2 ? 
argv[1] : NULL; + + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + nvlist_free(policy); + return (1); + } + + if (zpool_clear(zhp, device, policy) != 0) + ret = 1; + + zpool_close(zhp); + + nvlist_free(policy); + + return (ret); +} + +/* + * zpool reguid <pool> + */ +int +zpool_do_reguid(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + ret = zpool_reguid(zhp); + + zpool_close(zhp); + return (ret); +} + + +/* + * zpool reopen <pool> + * + * Reopen the pool so that the kernel can update the sizes of all vdevs. 
+ */ +int +zpool_do_reopen(int argc, char **argv) +{ + int c; + int ret = 0; + boolean_t scrub_restart = B_TRUE; + + /* check options */ + while ((c = getopt(argc, argv, "n")) != -1) { + switch (c) { + case 'n': + scrub_restart = B_FALSE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* if argc == 0 we will execute zpool_reopen_one on all pools */ + ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, + &scrub_restart); + + return (ret); +} + +typedef struct scrub_cbdata { + int cb_type; + pool_scrub_cmd_t cb_scrub_cmd; +} scrub_cbdata_t; + +static boolean_t +zpool_has_checkpoint(zpool_handle_t *zhp) +{ + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + + if (config != NULL) { + pool_checkpoint_stat_t *pcs = NULL; + uint_t c; + + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + + if (pcs == NULL || pcs->pcs_state == CS_NONE) + return (B_FALSE); + + assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || + pcs->pcs_state == CS_CHECKPOINT_DISCARDING); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +scrub_callback(zpool_handle_t *zhp, void *data) +{ + scrub_cbdata_t *cb = data; + int err; + + /* + * Ignore faulted pools. 
+ */ + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot scan '%s': pool is " + "currently unavailable\n"), zpool_get_name(zhp)); + return (1); + } + + err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); + + if (err == 0 && zpool_has_checkpoint(zhp) && + cb->cb_type == POOL_SCAN_SCRUB) { + (void) printf(gettext("warning: will not scrub state that " + "belongs to the checkpoint of pool '%s'\n"), + zpool_get_name(zhp)); + } + + return (err != 0); +} + +static int +wait_callback(zpool_handle_t *zhp, void *data) +{ + zpool_wait_activity_t *act = data; + return (zpool_wait(zhp, *act)); +} + +/* + * zpool scrub [-s | -p] [-w] <pool> ... + * + * -s Stop. Stops any in-progress scrub. + * -p Pause. Pause in-progress scrub. + * -w Wait. Blocks until scrub has completed. + */ +int +zpool_do_scrub(int argc, char **argv) +{ + int c; + scrub_cbdata_t cb; + boolean_t wait = B_FALSE; + int error; + + cb.cb_type = POOL_SCAN_SCRUB; + cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + + /* check options */ + while ((c = getopt(argc, argv, "spw")) != -1) { + switch (c) { + case 's': + cb.cb_type = POOL_SCAN_NONE; + break; + case 'p': + cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (cb.cb_type == POOL_SCAN_NONE && + cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { + (void) fprintf(stderr, gettext("invalid option combination: " + "-s and -p are mutually exclusive\n")); + usage(B_FALSE); + } + + if (wait && (cb.cb_type == POOL_SCAN_NONE || + cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { + (void) fprintf(stderr, gettext("invalid option combination: " + "-w cannot be used with -p or -s\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, 
&cb); + + if (wait && !error) { + zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; + error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback, + &act); + } + + return (error); +} + +/* + * zpool resilver <pool> ... + * + * Restarts any in-progress resilver + */ +int +zpool_do_resilver(int argc, char **argv) +{ + int c; + scrub_cbdata_t cb; + + cb.cb_type = POOL_SCAN_RESILVER; + cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); +} + +/* + * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...] + * + * -c Cancel. Ends any in-progress trim. + * -d Secure trim. Requires kernel and device support. + * -r <rate> Sets the TRIM rate in bytes (per second). Supports + * adding a multiplier suffix such as 'k' or 'm'. + * -s Suspend. TRIM can then be restarted with no flags. + * -w Wait. Blocks until trimming has completed. 
+ */ +int +zpool_do_trim(int argc, char **argv) +{ + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"secure", no_argument, NULL, 'd'}, + {"rate", required_argument, NULL, 'r'}, + {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + pool_trim_func_t cmd_type = POOL_TRIM_START; + uint64_t rate = 0; + boolean_t secure = B_FALSE; + boolean_t wait = B_FALSE; + + int c; + while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) + != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_TRIM_START && + cmd_type != POOL_TRIM_CANCEL) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_TRIM_CANCEL; + break; + case 'd': + if (cmd_type != POOL_TRIM_START) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with the -c or -s options\n")); + usage(B_FALSE); + } + secure = B_TRUE; + break; + case 'r': + if (cmd_type != POOL_TRIM_START) { + (void) fprintf(stderr, gettext("-r cannot be " + "combined with the -c or -s options\n")); + usage(B_FALSE); + } + if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { + (void) fprintf(stderr, + gettext("invalid value for rate\n")); + usage(B_FALSE); + } + break; + case 's': + if (cmd_type != POOL_TRIM_START && + cmd_type != POOL_TRIM_SUSPEND) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_TRIM_SUSPEND; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + if (wait && (cmd_type != POOL_TRIM_START)) { + (void) 
fprintf(stderr, gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + + char *poolname = argv[0]; + zpool_handle_t *zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + trimflags_t trim_flags = { + .secure = secure, + .rate = rate, + .wait = wait, + }; + + nvlist_t *vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + trim_flags.fullpool = B_TRUE; + } else { + trim_flags.fullpool = B_FALSE; + for (int i = 1; i < argc; i++) { + fnvlist_add_boolean(vdevs, argv[i]); + } + } + + int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (error); +} + +/* + * Converts a total number of seconds to a human readable string broken + * down in to days/hours/minutes/seconds. + */ +static void +secs_to_dhms(uint64_t total, char *buf) +{ + uint64_t days = total / 60 / 60 / 24; + uint64_t hours = (total / 60 / 60) % 24; + uint64_t mins = (total / 60) % 60; + uint64_t secs = (total % 60); + + if (days > 0) { + (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", + (u_longlong_t)days, (u_longlong_t)hours, + (u_longlong_t)mins, (u_longlong_t)secs); + } else { + (void) sprintf(buf, "%02llu:%02llu:%02llu", + (u_longlong_t)hours, (u_longlong_t)mins, + (u_longlong_t)secs); + } +} + +/* + * Print out detailed scrub status. 
+ */ +static void +print_scan_scrub_resilver_status(pool_scan_stat_t *ps) +{ + time_t start, end, pause; + uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint64_t elapsed, scan_rate, issue_rate; + double fraction_done; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; + char srate_buf[7], irate_buf[7], time_buf[32]; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + /* If there's never been a scan, there's not much to say. */ + if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || + ps->pss_func >= POOL_SCAN_FUNCS) { + (void) printf(gettext("none requested\n")); + return; + } + + start = ps->pss_start_time; + end = ps->pss_end_time; + pause = ps->pss_pass_scrub_pause; + + zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); + + assert(ps->pss_func == POOL_SCAN_SCRUB || + ps->pss_func == POOL_SCAN_RESILVER); + + /* Scan is finished or canceled. */ + if (ps->pss_state == DSS_FINISHED) { + secs_to_dhms(end - start, time_buf); + + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub repaired %s " + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilvered %s " + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); + } + return; + } else if (ps->pss_state == DSS_CANCELED) { + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub canceled on %s"), + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver canceled on %s"), + ctime(&end)); + } + return; + } + + assert(ps->pss_state == DSS_SCANNING); + + /* Scan is in progress. Resilvers can't be paused. 
*/ + if (ps->pss_func == POOL_SCAN_SCRUB) { + if (pause == 0) { + (void) printf(gettext("scrub in progress since %s"), + ctime(&start)); + } else { + (void) printf(gettext("scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\tscrub started on %s"), + ctime(&start)); + } + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver in progress since %s"), + ctime(&start)); + } + + scanned = ps->pss_examined; + pass_scanned = ps->pss_pass_exam; + issued = ps->pss_issued; + pass_issued = ps->pss_pass_issued; + total = ps->pss_to_examine; + + /* we are only done with a block once we have issued the IO for it */ + fraction_done = (double)issued / total; + + /* elapsed time for this pass, rounding up to 1 if it's 0 */ + elapsed = time(NULL) - ps->pss_pass_start; + elapsed -= ps->pss_pass_scrub_spent_paused; + elapsed = (elapsed != 0) ? elapsed : 1; + + scan_rate = pass_scanned / elapsed; + issue_rate = pass_issued / elapsed; + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
+ ((total - issued) / issue_rate) : UINT64_MAX; + secs_to_dhms(total_secs_left, time_buf); + + /* format all of the numbers we will be reporting */ + zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); + zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); + zfs_nicebytes(total, total_buf, sizeof (total_buf)); + zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); + zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + + /* do not print estimated time if we have a paused scrub */ + if (pause == 0) { + (void) printf(gettext("\t%s scanned at %s/s, " + "%s issued at %s/s, %s total\n"), + scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); + } else { + (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), + scanned_buf, issued_buf, total_buf); + } + + if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("\t%s repaired, %.2f%% done"), + processed_buf, 100 * fraction_done); + } + + if (pause == 0) { + if (total_secs_left != UINT64_MAX && + issue_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +static void +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +{ + if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) + return; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + uint64_t bytes_scanned = vrs->vrs_bytes_scanned; + uint64_t bytes_issued = vrs->vrs_bytes_issued; + uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; + uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / + (vrs->vrs_pass_time_ms + 1)) * 1000; + uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / + (vrs->vrs_pass_time_ms + 1)) * 1000; + double scan_pct = 
MIN((double)bytes_scanned * 100 / + (bytes_est + 1), 100); + + /* Format all of the numbers we will be reporting */ + char bytes_scanned_buf[7], bytes_issued_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; + zfs_nicebytes(bytes_scanned, bytes_scanned_buf, + sizeof (bytes_scanned_buf)); + zfs_nicebytes(bytes_issued, bytes_issued_buf, + sizeof (bytes_issued_buf)); + zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, + sizeof (bytes_rebuilt_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + + time_t start = vrs->vrs_start_time; + time_t end = vrs->vrs_end_time; + + /* Rebuild is finished or canceled. */ + if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { + secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); + (void) printf(gettext("resilvered (%s) %s in %s " + "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, + time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { + (void) printf(gettext("resilver (%s) canceled on %s"), + vdev_name, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + (void) printf(gettext("resilver (%s) in progress since %s"), + vdev_name, ctime(&start)); + } + + assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); + + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + bytes_rebuilt_buf, scan_pct); + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + if (scan_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) 
printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +/* + * Print rebuild status for top-level vdevs. + */ +static void +print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + char *name = zpool_vdev_name(g_zfs, zhp, + child[c], VDEV_NAME_TYPE_ID); + print_rebuild_status_impl(vrs, name); + free(name); + } + } +} + +/* + * As we don't scrub checkpointed blocks, we want to warn the user that we + * skipped scanning some blocks if a checkpoint exists or existed at any + * time during the scan. If a sequential instead of healing reconstruction + * was performed then the blocks were reconstructed. However, their checksums + * have not been verified so we still print the warning. + */ +static void +print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) +{ + if (ps == NULL || pcs == NULL) + return; + + if (pcs->pcs_state == CS_NONE || + pcs->pcs_state == CS_CHECKPOINT_DISCARDING) + return; + + assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); + + if (ps->pss_state == DSS_NONE) + return; + + if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && + ps->pss_end_time < pcs->pcs_start_time) + return; + + if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { + (void) printf(gettext(" scan warning: skipped blocks " + "that are only referenced by the checkpoint.\n")); + } else { + assert(ps->pss_state == DSS_SCANNING); + (void) printf(gettext(" scan warning: skipping blocks " + "that are only referenced by the checkpoint.\n")); + } +} + +/* + * Returns B_TRUE if there is an active rebuild in progress. 
Otherwise, + * B_FALSE is returned and 'rebuild_end_time' is set to the end time for + * the last completed (or cancelled) rebuild. + */ +static boolean_t +check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) +{ + nvlist_t **child; + uint_t children; + boolean_t rebuilding = B_FALSE; + uint64_t end_time = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + + if (vrs->vrs_end_time > end_time) + end_time = vrs->vrs_end_time; + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + rebuilding = B_TRUE; + end_time = 0; + break; + } + } + } + + if (rebuild_end_time != NULL) + *rebuild_end_time = end_time; + + return (rebuilding); +} + +/* + * Print the scan status. + */ +static void +print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + uint64_t rebuild_end_time = 0, resilver_end_time = 0; + boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t active_resilver = B_FALSE; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c) == 0) { + if (ps->pss_func == POOL_SCAN_RESILVER) { + resilver_end_time = ps->pss_end_time; + active_resilver = (ps->pss_state == DSS_SCANNING); + } + + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); + have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + } + + boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); + boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); + + /* Always print the scrub status when available. */ + if (have_scrub) + print_scan_scrub_resilver_status(ps); + + /* + * When there is an active resilver or rebuild print its status. + * Otherwise print the status of the last resilver or rebuild. 
+ */ + if (active_resilver || (!active_rebuild && have_resilver && + resilver_end_time && resilver_end_time > rebuild_end_time)) { + print_scan_scrub_resilver_status(ps); + } else if (active_rebuild || (!active_resilver && have_rebuild && + rebuild_end_time && rebuild_end_time > resilver_end_time)) { + print_rebuild_status(zhp, nvroot); + } + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_scan_warning(ps, pcs); +} + +/* + * Print out detailed removal status. + */ +static void +print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) +{ + char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + time_t start, end; + nvlist_t *config, *nvroot; + nvlist_t **child; + uint_t children; + char *vdev_name; + + if (prs == NULL || prs->prs_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(prs->prs_removing_vdev < children); + vdev_name = zpool_vdev_name(g_zfs, zhp, + child[prs->prs_removing_vdev], B_TRUE); + + (void) printf(gettext("remove: ")); + + start = prs->prs_start_time; + end = prs->prs_end_time; + zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); + + /* + * Removal is finished or canceled. 
+ */ + if (prs->prs_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + + (void) printf(gettext("Removal of vdev %llu copied %s " + "in %lluh%um, completed on %s"), + (longlong_t)prs->prs_removing_vdev, + copied_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + ctime((time_t *)&end)); + } else if (prs->prs_state == DSS_CANCELED) { + (void) printf(gettext("Removal of %s canceled on %s"), + vdev_name, ctime(&end)); + } else { + uint64_t copied, total, elapsed, mins_left, hours_left; + double fraction_done; + uint_t rate; + + assert(prs->prs_state == DSS_SCANNING); + + /* + * Removal is in progress. + */ + (void) printf(gettext( + "Evacuation of %s in progress since %s"), + vdev_name, ctime(&start)); + + copied = prs->prs_copied > 0 ? prs->prs_copied : 1; + total = prs->prs_to_copy; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - prs->prs_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? 
rate : 1; + mins_left = ((total - copied) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext(" %s copied out of %s at %s/s, " + "%.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vdev_name); + + if (prs->prs_mapping_memory > 0) { + char mem_buf[7]; + zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); + (void) printf(gettext(" %s memory used for " + "removed device mappings\n"), + mem_buf); + } +} + +static void +print_checkpoint_status(pool_checkpoint_stat_t *pcs) +{ + time_t start; + char space_buf[7]; + + if (pcs == NULL || pcs->pcs_state == CS_NONE) + return; + + (void) printf(gettext("checkpoint: ")); + + start = pcs->pcs_start_time; + zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); + + if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { + char *date = ctime(&start); + + /* + * ctime() adds a newline at the end of the generated + * string, thus the weird format specifier and the + * strlen() call used to chop it off from the output. 
+ */ + (void) printf(gettext("created %.*s, consumes %s\n"), + (int)(strlen(date) - 1), date, space_buf); + return; + } + + assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); + + (void) printf(gettext("discarding, %s remaining.\n"), + space_buf); +} + +static void +print_error_log(zpool_handle_t *zhp) +{ + nvlist_t *nverrlist = NULL; + nvpair_t *elem; + char *pathname; + size_t len = MAXPATHLEN * 2; + + if (zpool_get_errlog(zhp, &nverrlist) != 0) + return; + + (void) printf("errors: Permanent errors have been " + "detected in the following files:\n\n"); + + pathname = safe_malloc(len); + elem = NULL; + while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { + nvlist_t *nv; + uint64_t dsobj, obj; + + verify(nvpair_value_nvlist(elem, &nv) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, + &dsobj) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, + &obj) == 0); + zpool_obj_to_path(zhp, dsobj, obj, pathname, len); + (void) printf("%7s %s\n", "", pathname); + } + free(pathname); + nvlist_free(nverrlist); +} + +static void +print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, + uint_t nspares) +{ + uint_t i; + char *name; + + if (nspares == 0) + return; + + (void) printf(gettext("\tspares\n")); + + for (i = 0; i < nspares; i++) { + name = zpool_vdev_name(g_zfs, zhp, spares[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); + free(name); + } +} + +static void +print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, + uint_t nl2cache) +{ + uint_t i; + char *name; + + if (nl2cache == 0) + return; + + (void) printf(gettext("\tcache\n")); + + for (i = 0; i < nl2cache; i++) { + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, l2cache[i], 2, + B_FALSE, NULL); + free(name); + } +} + +static void +print_dedup_stats(nvlist_t *config) +{ + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + uint_t c; + 
char dspace[6], mspace[6]; + + /* + * If the pool was faulted then we may not have been able to + * obtain the config. Otherwise, if we have anything in the dedup + * table continue processing the stats. + */ + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t **)&ddo, &c) != 0) + return; + + (void) printf("\n"); + (void) printf(gettext(" dedup: ")); + if (ddo->ddo_count == 0) { + (void) printf(gettext("no DDT entries\n")); + return; + } + + zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); + zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); + (void) printf("DDT entries %llu, size %s on disk, %s in core\n", + (u_longlong_t)ddo->ddo_count, + dspace, + mspace); + + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, + (uint64_t **)&dds, &c) == 0); + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t **)&ddh, &c) == 0); + zpool_dump_ddt(dds, ddh); +} + +/* + * Display a summary of pool status. Displays a summary such as: + * + * pool: tank + * status: DEGRADED + * reason: One or more devices ... + * see: https://zfsonlinux.org/msg/ZFS-xxxx-01 + * config: + * mirror DEGRADED + * c1t0d0 OK + * c2t0d0 UNAVAIL + * + * When given the '-v' option, we print out the complete config. If the '-e' + * option is specified, then we print out error rate information as well. + */ +static int +status_callback(zpool_handle_t *zhp, void *data) +{ + status_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + char *msgid; + zpool_status_t reason; + zpool_errata_t errata; + const char *health; + uint_t c; + vdev_stat_t *vs; + + config = zpool_get_config(zhp, NULL); + reason = zpool_get_status(zhp, &msgid, &errata); + + cbp->cb_count++; + + /* + * If we were given 'zpool status -x', only report those pools with + * problems. 
+ */ + if (cbp->cb_explain && + (reason == ZPOOL_STATUS_OK || + reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED)) { + if (!cbp->cb_allpools) { + (void) printf(gettext("pool '%s' is healthy\n"), + zpool_get_name(zhp)); + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + } + return (0); + } + + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + else + (void) printf("\n"); + + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + health = zpool_get_state_str(zhp); + + printf(" "); + printf_color(ANSI_BOLD, gettext("pool:")); + printf(" %s\n", zpool_get_name(zhp)); + printf(" "); + printf_color(ANSI_BOLD, gettext("state: ")); + + printf_color(health_str_to_color(health), "%s", health); + + printf("\n"); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. Sufficient replicas exist for\n\tthe pool " + "to continue functioning in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_MISSING_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. There are insufficient\n\treplicas for the" + " pool to continue functioning.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing or\n\tinvalid. 
" + "Sufficient replicas exist for the pool to continue\n\t" + "functioning in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the device using " + "'zpool replace'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing \n\tor invalid. " + "There are insufficient replicas for the pool to " + "continue\n\tfunctioning.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); + break; + + case ZPOOL_STATUS_FAILING_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " + "experienced an unrecoverable error. An\n\tattempt was " + "made to correct the error. Applications are " + "unaffected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Determine if the " + "device needs to be replaced, and clear the errors\n\tusing" + " 'zpool clear' or replace the device with 'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_OFFLINE_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " + "been taken offline by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Online the device " + "using 'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_REMOVED_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " + "been removed by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: 
")); + printf_color(ANSI_YELLOW, gettext("Online the device " + "using zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices is " + "currently being resilvered. The pool will\n\tcontinue " + "to function, possibly in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " + "complete.\n")); + break; + + case ZPOOL_STATUS_REBUILD_SCRUB: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices have " + "been sequentially resilvered, scrubbing\n\tthe pool " + "is recommended.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " + "verify all data checksums.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_DATA: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " + "experienced an error resulting in data\n\tcorruption. " + "Applications may be affected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Restore the file in question" + " if possible. Otherwise restore the\n\tentire pool from " + "backup.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_POOL: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool metadata is " + "corrupted and the pool cannot be opened.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); + break; + + case ZPOOL_STATUS_VERSION_OLDER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "a legacy on-disk format. 
The pool can\n\tstill be used, " + "but some features are unavailable.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " + "'zpool upgrade'. Once this is done, the\n\tpool will no " + "longer be accessible on software that does not support\n\t" + "feature flags.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " + "to a newer, incompatible on-disk version.\n\tThe pool " + "cannot be accessed on this system.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system running more recent software, or\n\trestore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_FEAT_DISABLED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported features are " + "not enabled on the pool. The pool can\n\tstill be used, " + "but some features are unavailable.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Enable all features using " + "'zpool upgrade'. Once this is done,\n\tthe pool may no " + "longer be accessible by software that does not support\n\t" + "the features. 
See zpool-features(5) for details.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "on this system because it uses the\n\tfollowing feature(s)" + " not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system that supports the required feature(s),\n\tor " + "restore the pool from backup.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. It\n\tcannot be" + " accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "in read-write mode. Import the pool with\n" + "\t\"-o readonly=on\", access the pool from a system that " + "supports the\n\trequired feature(s), or restore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to persistent errors.\n\tSufficient " + "replicas exist for the pool to continue functioning " + "in a\n\tdegraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " + "or use 'zpool clear' to mark the device\n\trepaired.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to persistent errors. 
There are " + "insufficient replicas for the pool to\n\tcontinue " + "functioning.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " + "pool from a backup source. Manually marking the device\n" + "\trepaired using 'zpool clear' may allow some data " + "to be recovered.\n")); + break; + + case ZPOOL_STATUS_IO_FAILURE_MMP: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is suspended " + "because multihost writes failed or were delayed;\n\t" + "another system could import the pool undetected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" + " are connected, then reboot your system and\n\timport the " + "pool.\n")); + break; + + case ZPOOL_STATUS_IO_FAILURE_WAIT: + case ZPOOL_STATUS_IO_FAILURE_CONTINUE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to IO failures.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the affected " + "devices are connected, then run 'zpool clear'.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record " + "could not be read.\n" + "\tWaiting for administrator intervention to fix the " + "faulted pool.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Either restore the affected " + "device(s) and run 'zpool online',\n" + "\tor ignore the intent log records by running " + "'zpool clear'.\n")); + break; + + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support 
the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + + case ZPOOL_STATUS_HOSTID_MISMATCH: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" + " and system hostid on imported pool.\n\tThis pool was " + "previously imported into a system with a different " + "hostid,\n\tand then was verbatim imported into this " + "system.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Export this pool on all " + "systems on which it is imported.\n" + "\tThen import it to correct the mismatch.\n")); + break; + + case ZPOOL_STATUS_ERRATA: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), + errata); + + switch (errata) { + case ZPOOL_ERRATA_NONE: + break; + + case ZPOOL_ERRATA_ZOL_2094_SCRUB: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " run 'zpool scrub'.\n")); + break; + + case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: + (void) printf(gettext("\tExisting encrypted datasets " + "contain an on-disk incompatibility\n\twhich " + "needs to be corrected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " backup existing encrypted datasets to new\n\t" + "encrypted datasets and destroy the old ones. " + "'zfs mount -o ro' can\n\tbe used to temporarily " + "mount existing encrypted datasets readonly.\n")); + break; + + case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: + (void) printf(gettext("\tExisting encrypted snapshots " + "and bookmarks contain an on-disk\n\tincompat" + "ibility. This may cause on-disk corruption if " + "they are used\n\twith 'zfs recv'.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the" + "issue, enable the bookmark_v2 feature. 
No " + "additional\n\taction is needed if there are no " + "encrypted snapshots or bookmarks.\n\tIf preserving" + "the encrypted snapshots and bookmarks is required," + " use\n\ta non-raw send to backup and restore them." + " Alternately, they may be\n\tremoved to resolve " + "the incompatibility.\n")); + break; + + default: + /* + * All errata which allow the pool to be imported + * must contain an action message. + */ + assert(0); + } + break; + + default: + /* + * The remaining errors can't actually be generated, yet. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + if (msgid != NULL) { + printf(" "); + printf_color(ANSI_BOLD, gettext("see:")); + printf(gettext(" https://zfsonlinux.org/msg/%s\n"), msgid); + } + + if (config != NULL) { + uint64_t nerr; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + pool_checkpoint_stat_t *pcs = NULL; + pool_removal_stat_t *prs = NULL; + + print_scan_status(zhp, nvroot); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + print_removal_status(zhp, prs); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_status(pcs); + + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, + cbp->cb_name_flags | VDEV_NAME_TYPE_ID); + if (cbp->cb_namewidth < 10) + cbp->cb_namewidth = 10; + + color_start(ANSI_BOLD); + (void) printf(gettext("config:\n\n")); + (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), + cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", + "CKSUM"); + color_end(); + + if (cbp->cb_print_slow_ios) { + printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); + } + + if (cbp->vcdl != NULL) + print_cmd_columns(cbp->vcdl, 0); + + printf("\n"); + + print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, + B_FALSE, NULL); + + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(zhp, cbp, nvroot, 
VDEV_ALLOC_CLASS_LOGS); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) + print_l2cache(zhp, cbp, l2cache, nl2cache); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) + print_spares(zhp, cbp, spares, nspares); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, + &nerr) == 0) { + nvlist_t *nverrlist = NULL; + + /* + * If the approximate error count is small, get a + * precise count by fetching the entire log and + * uniquifying the results. + */ + if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && + zpool_get_errlog(zhp, &nverrlist) == 0) { + nvpair_t *elem; + + elem = NULL; + nerr = 0; + while ((elem = nvlist_next_nvpair(nverrlist, + elem)) != NULL) { + nerr++; + } + } + nvlist_free(nverrlist); + + (void) printf("\n"); + + if (nerr == 0) + (void) printf(gettext("errors: No known data " + "errors\n")); + else if (!cbp->cb_verbose) + (void) printf(gettext("errors: %llu data " + "errors, use '-v' for a list\n"), + (u_longlong_t)nerr); + else + print_error_log(zhp); + } + + if (cbp->cb_dedup_stats) + print_dedup_stats(config); + } else { + (void) printf(gettext("config: The configuration cannot be " + "determined.\n")); + } + + return (0); +} + +/* + * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... + * [interval [count]] + * + * -c CMD For each vdev, run command CMD + * -i Display vdev initialization status. + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -p Display values in parsable (exact) format. + * -P Display full path for vdev name. + * -s Display slow IOs column. + * -v Display complete error logs + * -x Display only pools with potential problems + * -D Display dedup status (undocumented) + * -t Display vdev TRIM status. + * -T Display a timestamp in date(1) or Unix format + * + * Describes the health status of all pools or some subset. 
+ */ +int +zpool_do_status(int argc, char **argv) +{ + int c; + int ret; + float interval = 0; + unsigned long count = 0; + status_cbdata_t cb = { 0 }; + char *cmd = NULL; + + /* check options */ + while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { + switch (c) { + case 'c': + if (cmd != NULL) { + fprintf(stderr, + gettext("Can't set -c flag twice\n")); + exit(1); + } + + if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { + fprintf(stderr, gettext( + "Can't run -c, disabled by " + "ZPOOL_SCRIPTS_ENABLED.\n")); + exit(1); + } + + if ((getuid() <= 0 || geteuid() <= 0) && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { + fprintf(stderr, gettext( + "Can't run -c with root privileges " + "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); + exit(1); + } + cmd = optarg; + break; + case 'i': + cb.cb_print_vdev_init = B_TRUE; + break; + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; + case 's': + cb.cb_print_slow_ios = B_TRUE; + break; + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; + case 't': + cb.cb_print_vdev_trim = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case '?': + if (optopt == 'c') { + print_zpool_script_list("status"); + exit(0); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (argc == 0) + cb.cb_allpools = B_TRUE; + + cb.cb_first = B_TRUE; + cb.cb_print_status = B_TRUE; + + for (;;) { + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (cmd != NULL) + cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, + NULL, NULL, 0, 0); + + ret 
= for_each_pool(argc, argv, B_TRUE, NULL, + status_callback, &cb); + + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + + if (argc == 0 && cb.cb_count == 0) + (void) fprintf(stderr, gettext("no pools available\n")); + else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) + (void) printf(gettext("all pools are healthy\n")); + + if (ret != 0) + return (ret); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) fsleep(interval); + } + + return (0); +} + +typedef struct upgrade_cbdata { + int cb_first; + int cb_argc; + uint64_t cb_version; + char **cb_argv; +} upgrade_cbdata_t; + +static int +check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) +{ + int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + int *count = (int *)unsupp_fs; + + if (zfs_version > ZPL_VERSION) { + (void) printf(gettext("%s (v%d) is not supported by this " + "implementation of ZFS.\n"), + zfs_get_name(zhp), zfs_version); + (*count)++; + } + + zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); + + zfs_close(zhp); + + return (0); +} + +static int +upgrade_version(zpool_handle_t *zhp, uint64_t version) +{ + int ret; + nvlist_t *config; + uint64_t oldversion; + int unsupp_fs = 0; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &oldversion) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(oldversion)); + assert(oldversion < version); + + ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); + if (ret != 0) + return (ret); + + if (unsupp_fs) { + (void) fprintf(stderr, gettext("Upgrade not performed due " + "to %d unsupported filesystems (max v%d).\n"), + unsupp_fs, (int)ZPL_VERSION); + return (1); + } + + ret = zpool_upgrade(zhp, version); + if (ret != 0) + return (ret); + + if (version >= SPA_VERSION_FEATURES) { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to feature flags.\n"), + zpool_get_name(zhp), (u_longlong_t)oldversion); + } else { + 
(void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to version %llu.\n"), + zpool_get_name(zhp), (u_longlong_t)oldversion, + (u_longlong_t)version); + } + + return (0); +} + +static int +upgrade_enable_all(zpool_handle_t *zhp, int *countp) +{ + int i, ret, count; + boolean_t firstff = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + count = 0; + for (i = 0; i < SPA_FEATURES; i++) { + const char *fname = spa_feature_table[i].fi_uname; + const char *fguid = spa_feature_table[i].fi_guid; + if (!nvlist_exists(enabled, fguid)) { + char *propname; + verify(-1 != asprintf(&propname, "feature@%s", fname)); + ret = zpool_set_prop(zhp, propname, + ZFS_FEATURE_ENABLED); + if (ret != 0) { + free(propname); + return (ret); + } + count++; + + if (firstff) { + (void) printf(gettext("Enabled the " + "following features on '%s':\n"), + zpool_get_name(zhp)); + firstff = B_FALSE; + } + (void) printf(gettext(" %s\n"), fname); + free(propname); + } + } + + if (countp != NULL) + *countp = count; + return (0); +} + +static int +upgrade_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + boolean_t printnl = B_FALSE; + int ret; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < cbp->cb_version) { + cbp->cb_first = B_FALSE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + printnl = B_TRUE; + + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count > 0) { + cbp->cb_first = B_FALSE; + printnl = B_TRUE; + } + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} + +static int +upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < SPA_VERSION_FEATURES) { + if (cbp->cb_first) { + (void) printf(gettext("The following pools are " + "formatted with legacy version numbers and can\n" + "be upgraded to use feature flags. After " + "being upgraded, these pools\nwill no " + "longer be accessible by software that does not " + "support feature\nflags.\n\n")); + (void) printf(gettext("VER POOL\n")); + (void) printf(gettext("--- ------------\n")); + cbp->cb_first = B_FALSE; + } + + (void) printf("%2llu %s\n", (u_longlong_t)version, + zpool_get_name(zhp)); + } + + return (0); +} + +static int +upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if (version >= SPA_VERSION_FEATURES) { + int i; + boolean_t poolfirst = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + for (i = 0; i < SPA_FEATURES; i++) { + const char *fguid = spa_feature_table[i].fi_guid; + const char *fname = spa_feature_table[i].fi_uname; + if (!nvlist_exists(enabled, fguid)) { + if (cbp->cb_first) { + (void) printf(gettext("\nSome " + "supported features are not " + "enabled on the following pools. 
" + "Once a\nfeature is enabled the " + "pool may become incompatible with " + "software\nthat does not support " + "the feature. See " + "zpool-features(5) for " + "details.\n\n")); + (void) printf(gettext("POOL " + "FEATURE\n")); + (void) printf(gettext("------" + "---------\n")); + cbp->cb_first = B_FALSE; + } + + if (poolfirst) { + (void) printf(gettext("%s\n"), + zpool_get_name(zhp)); + poolfirst = B_FALSE; + } + + (void) printf(gettext(" %s\n"), fname); + } + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). + */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + } + + return (0); +} + +/* ARGSUSED */ +static int +upgrade_one(zpool_handle_t *zhp, void *data) +{ + boolean_t printnl = B_FALSE; + upgrade_cbdata_t *cbp = data; + uint64_t cur_version; + int ret; + + if (strcmp("log", zpool_get_name(zhp)) == 0) { + (void) fprintf(stderr, gettext("'log' is now a reserved word\n" + "Pool 'log' must be renamed using export and import" + " to upgrade.\n")); + return (1); + } + + cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (cur_version > cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using more current version '%llu'.\n\n"), + zpool_get_name(zhp), (u_longlong_t)cur_version); + return (0); + } + + if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using version %llu.\n\n"), zpool_get_name(zhp), + (u_longlong_t)cbp->cb_version); + return (0); + } + + if (cur_version != cbp->cb_version) { + printnl = B_TRUE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count = 0; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count != 0) { 
+ printnl = B_TRUE; + } else if (cur_version == SPA_VERSION) { + (void) printf(gettext("Pool '%s' already has all " + "supported features enabled.\n"), + zpool_get_name(zhp)); + } + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} + +/* + * zpool upgrade + * zpool upgrade -v + * zpool upgrade [-V version] <-a | pool ...> + * + * With no arguments, display downrev'd ZFS pool available for upgrade. + * Individual pools can be upgraded by specifying the pool, and '-a' will + * upgrade all pools. + */ +int +zpool_do_upgrade(int argc, char **argv) +{ + int c; + upgrade_cbdata_t cb = { 0 }; + int ret = 0; + boolean_t showversions = B_FALSE; + boolean_t upgradeall = B_FALSE; + char *end; + + + /* check options */ + while ((c = getopt(argc, argv, ":avV:")) != -1) { + switch (c) { + case 'a': + upgradeall = B_TRUE; + break; + case 'v': + showversions = B_TRUE; + break; + case 'V': + cb.cb_version = strtoll(optarg, &end, 10); + if (*end != '\0' || + !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { + (void) fprintf(stderr, + gettext("invalid version '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.cb_argc = argc; + cb.cb_argv = argv; + argc -= optind; + argv += optind; + + if (cb.cb_version == 0) { + cb.cb_version = SPA_VERSION; + } else if (!upgradeall && argc == 0) { + (void) fprintf(stderr, gettext("-V option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + + if (showversions) { + if (upgradeall || argc != 0) { + (void) fprintf(stderr, gettext("-v option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + } else if (upgradeall) { + if (argc != 0) { + (void) fprintf(stderr, gettext("-a option should not " + "be used along with a pool name\n")); + usage(B_FALSE); + } + } + + 
(void) printf(gettext("This system supports ZFS pool feature " + "flags.\n\n")); + if (showversions) { + int i; + + (void) printf(gettext("The following features are " + "supported:\n\n")); + (void) printf(gettext("FEAT DESCRIPTION\n")); + (void) printf("----------------------------------------------" + "---------------\n"); + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + const char *ro = + (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + " (read-only compatible)" : ""; + + (void) printf("%-37s%s\n", fi->fi_uname, ro); + (void) printf(" %s\n", fi->fi_desc); + } + (void) printf("\n"); + + (void) printf(gettext("The following legacy versions are also " + "supported:\n\n")); + (void) printf(gettext("VER DESCRIPTION\n")); + (void) printf("--- -----------------------------------------" + "---------------\n"); + (void) printf(gettext(" 1 Initial ZFS version\n")); + (void) printf(gettext(" 2 Ditto blocks " + "(replicated metadata)\n")); + (void) printf(gettext(" 3 Hot spares and double parity " + "RAID-Z\n")); + (void) printf(gettext(" 4 zpool history\n")); + (void) printf(gettext(" 5 Compression using the gzip " + "algorithm\n")); + (void) printf(gettext(" 6 bootfs pool property\n")); + (void) printf(gettext(" 7 Separate intent log devices\n")); + (void) printf(gettext(" 8 Delegated administration\n")); + (void) printf(gettext(" 9 refquota and refreservation " + "properties\n")); + (void) printf(gettext(" 10 Cache devices\n")); + (void) printf(gettext(" 11 Improved scrub performance\n")); + (void) printf(gettext(" 12 Snapshot properties\n")); + (void) printf(gettext(" 13 snapused property\n")); + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 Snapshot user holds\n")); + (void) printf(gettext(" 19 Log device 
removal\n")); + (void) printf(gettext(" 20 Compression using zle " + "(zero-length encoding)\n")); + (void) printf(gettext(" 21 Deduplication\n")); + (void) printf(gettext(" 22 Received properties\n")); + (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext(" 24 System attributes\n")); + (void) printf(gettext(" 25 Improved scrub stats\n")); + (void) printf(gettext(" 26 Improved snapshot deletion " + "performance\n")); + (void) printf(gettext(" 27 Improved snapshot creation " + "performance\n")); + (void) printf(gettext(" 28 Multiple vdev replacements\n")); + (void) printf(gettext("\nFor more information on a particular " + "version, including supported releases,\n")); + (void) printf(gettext("see the ZFS Administration Guide.\n\n")); + } else if (argc == 0 && upgradeall) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_cb, &cb); + if (ret == 0 && cb.cb_first) { + if (cb.cb_version == SPA_VERSION) { + (void) printf(gettext("All pools are already " + "formatted using feature flags.\n\n")); + (void) printf(gettext("Every feature flags " + "pool already has all supported features " + "enabled.\n")); + } else { + (void) printf(gettext("All pools are already " + "formatted with version %llu or higher.\n"), + (u_longlong_t)cb.cb_version); + } + } + } else if (argc == 0) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("All pools are formatted " + "using feature flags.\n\n")); + } else { + (void) printf(gettext("\nUse 'zpool upgrade -v' " + "for a list of available legacy versions.\n")); + } + + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("Every feature flags pool has " + "all supported features enabled.\n")); + } else { + (void) printf(gettext("\n")); + } + } else { + ret = for_each_pool(argc, argv, B_FALSE, NULL, + upgrade_one, &cb); + } + + return 
(ret); +} + +typedef struct hist_cbdata { + boolean_t first; + boolean_t longfmt; + boolean_t internal; +} hist_cbdata_t; + +static void +print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) +{ + nvlist_t **records; + uint_t numrecords; + int i; + + verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, + &records, &numrecords) == 0); + for (i = 0; i < numrecords; i++) { + nvlist_t *rec = records[i]; + char tbuf[30] = ""; + + if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { + time_t tsec; + struct tm t; + + tsec = fnvlist_lookup_uint64(records[i], + ZPOOL_HIST_TIME); + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + } + + if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { + (void) printf("%s %s", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { + int ievent = + fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); + if (!cb->internal) + continue; + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { + (void) printf("%s unrecognized record:\n", + tbuf); + dump_nvlist(rec, 4); + continue; + } + (void) printf("%s [internal %s txg:%lld] %s", tbuf, + zfs_history_event_names[ievent], + (longlong_t)fnvlist_lookup_uint64( + rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { + if (!cb->internal) + continue; + (void) printf("%s [txg:%lld] %s", tbuf, + (longlong_t)fnvlist_lookup_uint64( + rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); + if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { + (void) printf(" %s (%llu)", + fnvlist_lookup_string(rec, + ZPOOL_HIST_DSNAME), + (u_longlong_t)fnvlist_lookup_uint64(rec, + ZPOOL_HIST_DSID)); + } + (void) printf(" %s", fnvlist_lookup_string(rec, + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { + if (!cb->internal) + continue; + (void) printf("%s ioctl %s\n", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); + if 
(nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { + (void) printf(" input:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_INPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { + (void) printf(" output:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_OUTPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { + (void) printf(" errno: %lld\n", + (longlong_t)fnvlist_lookup_int64(rec, + ZPOOL_HIST_ERRNO)); + } + } else { + if (!cb->internal) + continue; + (void) printf("%s unrecognized record:\n", tbuf); + dump_nvlist(rec, 4); + } + + if (!cb->longfmt) { + (void) printf("\n"); + continue; + } + (void) printf(" ["); + if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { + uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); + struct passwd *pwd = getpwuid(who); + (void) printf("user %d ", (int)who); + if (pwd != NULL) + (void) printf("(%s) ", pwd->pw_name); + } + if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { + (void) printf("on %s", + fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); + } + if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { + (void) printf(":%s", + fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); + } + + (void) printf("]"); + (void) printf("\n"); + } +} + +/* + * Print out the command history for a specific pool. + */ +static int +get_history_one(zpool_handle_t *zhp, void *data) +{ + nvlist_t *nvhis; + int ret; + hist_cbdata_t *cb = (hist_cbdata_t *)data; + uint64_t off = 0; + boolean_t eof = B_FALSE; + + cb->first = B_FALSE; + + (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); + + while (!eof) { + if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) + return (ret); + + print_history_records(nvhis, cb); + nvlist_free(nvhis); + } + (void) printf("\n"); + + return (ret); +} + +/* + * zpool history <pool> + * + * Displays the history of commands that modified pools. 
+ */ +int +zpool_do_history(int argc, char **argv) +{ + hist_cbdata_t cbdata = { 0 }; + int ret; + int c; + + cbdata.first = B_TRUE; + /* check options */ + while ((c = getopt(argc, argv, "li")) != -1) { + switch (c) { + case 'l': + cbdata.longfmt = B_TRUE; + break; + case 'i': + cbdata.internal = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + &cbdata); + + if (argc == 0 && cbdata.first == B_TRUE) { + (void) fprintf(stderr, gettext("no pools available\n")); + return (0); + } + + return (ret); +} + +typedef struct ev_opts { + int verbose; + int scripted; + int follow; + int clear; + char poolname[ZFS_MAX_DATASET_NAME_LEN]; +} ev_opts_t; + +static void +zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) +{ + char ctime_str[26], str[32], *ptr; + int64_t *tv; + uint_t n; + + verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); + memset(str, ' ', 32); + (void) ctime_r((const time_t *)&tv[0], ctime_str); + (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ + (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ + (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ + (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ + if (opts->scripted) + (void) printf(gettext("%s\t"), str); + else + (void) printf(gettext("%s "), str); + + verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); + (void) printf(gettext("%s\n"), ptr); +} + +static void +zpool_do_events_nvprint(nvlist_t *nvl, int depth) +{ + nvpair_t *nvp; + + for (nvp = nvlist_next_nvpair(nvl, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { + + data_type_t type = nvpair_type(nvp); + const char *name = nvpair_name(nvp); + + boolean_t b; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + nvlist_t *cnv; + + printf(gettext("%*s%s = "), depth, "", name); + + 
switch (type) { + case DATA_TYPE_BOOLEAN: + printf(gettext("%s"), "1"); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + printf(gettext("%s"), b ? "1" : "0"); + break; + + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (void *)&i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (void *)&i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + /* + * translate vdev state values to readable + * strings to aide zpool events consumers + */ + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + printf(gettext("\"%s\" (0x%llx)"), + zpool_state_to_name(i64, VDEV_AUX_NONE), + (u_longlong_t)i64); + } else { + printf(gettext("0x%llx"), (u_longlong_t)i64); + } + break; + + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + printf(gettext("\"%s\""), str ? 
str : "<NULL>"); + break; + + case DATA_TYPE_NVLIST: + printf(gettext("(embedded nvlist)\n")); + (void) nvpair_value_nvlist(nvp, &cnv); + zpool_do_events_nvprint(cnv, depth + 8); + printf(gettext("%*s(end %s)"), depth, "", name); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t i, nelem; + + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + printf(gettext("(%d embedded nvlists)\n"), nelem); + for (i = 0; i < nelem; i++) { + printf(gettext("%*s%s[%d] = %s\n"), + depth, "", name, i, "(embedded nvlist)"); + zpool_do_events_nvprint(val[i], depth + 8); + printf(gettext("%*s(end %s[%i])\n"), + depth, "", name, i); + } + printf(gettext("%*s(end %s)\n"), depth, "", name); + } + break; + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t i, nelem; + + (void) nvpair_value_int8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t i, nelem; + + (void) nvpair_value_int16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t i, nelem; + + (void) nvpair_value_int32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t 
*val; + uint_t i, nelem; + + (void) nvpair_value_int64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), + (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), + (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_STRING_ARRAY: { + char **str; + uint_t i, nelem; + + (void) nvpair_value_string_array(nvp, &str, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("\"%s\" "), + str[i] ? str[i] : "<NULL>"); + + break; + } + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_DOUBLE: + case DATA_TYPE_DONTCARE: + case DATA_TYPE_UNKNOWN: + printf(gettext("<unknown>")); + break; + } + + printf(gettext("\n")); + } +} + +/* + * Read events with zpool_events_next() and print each one. When + * opts->follow is set ZEVENT_NONE is passed instead of ZEVENT_NONBLOCK. + * Events whose FM_FMRI_ZFS_POOL string differs from a non-empty + * opts->poolname are skipped. Returns the last zpool_events_next() result. + */ +static int +zpool_do_events_next(ev_opts_t *opts) +{ + nvlist_t *nvl; + int zevent_fd, ret, dropped; + char *pool; + + zevent_fd = open(ZFS_DEV, O_RDWR); + VERIFY(zevent_fd >= 0); + + if (!opts->scripted) + (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); + + while (1) { + ret = zpool_events_next(g_zfs, &nvl, &dropped, + (opts->follow ? 
ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); + if (ret || nvl == NULL) + break; + + if (dropped > 0) + (void) printf(gettext("dropped %d events\n"), dropped); + + if (strlen(opts->poolname) > 0 && + nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && + strcmp(opts->poolname, pool) != 0) { + /* Filtered out: the event nvlist must still be released. */ + nvlist_free(nvl); + continue; + } + + zpool_do_events_short(nvl, opts); + + if (opts->verbose) { + zpool_do_events_nvprint(nvl, 8); + printf(gettext("\n")); + } + (void) fflush(stdout); + + nvlist_free(nvl); + } + + VERIFY(0 == close(zevent_fd)); + + return (ret); +} + +static int +zpool_do_events_clear(ev_opts_t *opts) +{ + int count, ret; + + ret = zpool_events_clear(g_zfs, &count); + if (!ret) + (void) printf(gettext("cleared %d events\n"), count); + + return (ret); +} + +/* + * zpool events [-vHf [pool] | -c] + * + * Displays events logs by ZFS. + */ +int +zpool_do_events(int argc, char **argv) +{ + ev_opts_t opts = { 0 }; + int ret; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "vHfc")) != -1) { + switch (c) { + case 'v': + opts.verbose = 1; + break; + case 'H': + opts.scripted = 1; + break; + case 'f': + opts.follow = 1; + break; + case 'c': + opts.clear = 1; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } else if (argc == 1) { + (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); + if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { + (void) fprintf(stderr, + gettext("invalid pool name '%s'\n"), opts.poolname); + usage(B_FALSE); + } + } + + if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && + opts.clear) { + (void) fprintf(stderr, + gettext("invalid options combined with -c\n")); + usage(B_FALSE); + } + + if (opts.clear) + ret = zpool_do_events_clear(&opts); + else + ret = zpool_do_events_next(&opts); + + return (ret); +} + +static int 
+get_callback(zpool_handle_t *zhp, void *data) +{ + zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; + char value[MAXNAMELEN]; + zprop_source_t srctype; + zprop_list_t *pl; + + for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { + + /* + * Skip the special fake placeholder. This will also skip + * over the name property when 'all' is specified. + */ + if (pl->pl_prop == ZPOOL_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (pl->pl_prop == ZPROP_INVAL && + (zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop))) { + srctype = ZPROP_SRC_LOCAL; + + if (zpool_prop_get_feature(zhp, pl->pl_user_prop, + value, sizeof (value)) == 0) { + zprop_print_one_property(zpool_get_name(zhp), + cbp, pl->pl_user_prop, value, srctype, + NULL, NULL); + } + } else { + if (zpool_get_prop(zhp, pl->pl_prop, value, + sizeof (value), &srctype, cbp->cb_literal) != 0) + continue; + + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, + NULL, NULL); + } + } + return (0); +} + +/* + * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ... + * + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -o List of columns to display. Defaults to + * "name,property,value,source". + * -p Display values in parsable (exact) format. + * + * Get properties of pools in the system. Output space statistics + * for each one as well as other attributes. + */ +int +zpool_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + zprop_list_t fake_name = { 0 }; + int ret; + int c, i; + char *value; + + cb.cb_first = B_TRUE; + + /* + * Set up default columns and sources. 
+ */ + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_POOL; + + /* check options */ + while ((c = getopt(argc, argv, ":Hpo:")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'o': + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "source", + "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 4: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, + ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + argc--; + argv++; + + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZPOOL_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = 
cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + get_callback, &cb); + + if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +typedef struct set_cbdata { + char *cb_propname; + char *cb_value; + boolean_t cb_any_successful; +} set_cbdata_t; + +static int +set_callback(zpool_handle_t *zhp, void *data) +{ + int error; + set_cbdata_t *cb = (set_cbdata_t *)data; + + error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); + + if (!error) + cb->cb_any_successful = B_TRUE; + + return (error); +} + +int +zpool_do_set(int argc, char **argv) +{ + set_cbdata_t cb = { 0 }; + int error; + + if (argc > 1 && argv[1][0] == '-') { + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + argv[1][1]); + usage(B_FALSE); + } + + if (argc < 2) { + (void) fprintf(stderr, gettext("missing property=value " + "argument\n")); + usage(B_FALSE); + } + + if (argc < 3) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 3) { + (void) fprintf(stderr, gettext("too many pool names\n")); + usage(B_FALSE); + } + + cb.cb_propname = argv[1]; + cb.cb_value = strchr(cb.cb_propname, '='); + if (cb.cb_value == NULL) { + (void) fprintf(stderr, gettext("missing value in " + "property=value argument\n")); + usage(B_FALSE); + } + + *(cb.cb_value) = '\0'; + cb.cb_value++; + + error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + set_callback, &cb); + + return (error); +} + +/* Add up the total number of bytes left to initialize/trim across all vdevs */ +static uint64_t +vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) +{ + uint64_t bytes_remaining; + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + + assert(activity == ZPOOL_WAIT_INITIALIZE || + activity == ZPOOL_WAIT_TRIM); + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + 
if (activity == ZPOOL_WAIT_INITIALIZE && + vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) + bytes_remaining = vs->vs_initialize_bytes_est - + vs->vs_initialize_bytes_done; + else if (activity == ZPOOL_WAIT_TRIM && + vs->vs_trim_state == VDEV_TRIM_ACTIVE) + bytes_remaining = vs->vs_trim_bytes_est - + vs->vs_trim_bytes_done; + else + bytes_remaining = 0; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) + bytes_remaining += vdev_activity_remaining(child[c], activity); + + return (bytes_remaining); +} + +/* Add up the total number of bytes left to rebuild across top-level vdevs */ +static uint64_t +vdev_activity_top_remaining(nvlist_t *nv) +{ + uint64_t bytes_remaining = 0; + nvlist_t **child; + uint_t children; + int error; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + error = nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); + if (error == 0) { + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + bytes_remaining += (vrs->vrs_bytes_est - + vrs->vrs_bytes_rebuilt); + } + } + } + + return (bytes_remaining); +} + +/* Whether any vdevs are 'spare' or 'replacing' vdevs */ +static boolean_t +vdev_any_spare_replacing(nvlist_t *nv) +{ + nvlist_t **child; + uint_t c, children; + char *vdev_type; + + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); + + if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + return (B_TRUE); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) { + if (vdev_any_spare_replacing(child[c])) + return (B_TRUE); + } + + return (B_FALSE); +} + +typedef struct wait_data { + char *wd_poolname; + boolean_t wd_scripted; + boolean_t 
wd_exact; + boolean_t wd_headers_once; + boolean_t wd_should_exit; + /* Which activities to wait for */ + boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; + float wd_interval; + pthread_cond_t wd_cv; + pthread_mutex_t wd_mutex; +} wait_data_t; + +/* + * Print to stdout a single line, containing one column for each activity that + * we are waiting for specifying how many bytes of work are left for that + * activity. + */ +static void +print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) +{ + nvlist_t *config, *nvroot; + uint_t c; + int i; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *pss = NULL; + pool_removal_stat_t *prs = NULL; + char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", + "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; + + /* Calculate the width of each column */ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + /* + * Make sure we have enough space in the col for pretty-printed + * numbers and for the column header, and then leave a couple + * spaces between cols for readability. 
+ */ + col_widths[i] = MAX(strlen(headers[i]), 6) + 2; + } + + /* Print header if appropriate */ + int term_height = terminal_height(); + boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && + row % (term_height-1) == 0); + if (!wd->wd_scripted && (row == 0 || reprint_header)) { + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + if (wd->wd_enabled[i]) + (void) printf("%*s", col_widths[i], headers[i]); + } + (void) printf("\n"); + } + + /* Bytes of work remaining in each activity */ + int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; + + bytes_rem[ZPOOL_WAIT_FREE] = + zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); + + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) + bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + if (prs != NULL && prs->prs_state == DSS_SCANNING) + bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - + prs->prs_copied; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); + if (pss != NULL && pss->pss_state == DSS_SCANNING && + pss->pss_pass_scrub_pause == 0) { + int64_t rem = pss->pss_to_examine - pss->pss_issued; + if (pss->pss_func == POOL_SCAN_SCRUB) + bytes_rem[ZPOOL_WAIT_SCRUB] = rem; + else + bytes_rem[ZPOOL_WAIT_RESILVER] = rem; + } else if (check_rebuilding(nvroot, NULL)) { + bytes_rem[ZPOOL_WAIT_RESILVER] = + vdev_activity_top_remaining(nvroot); + } + + bytes_rem[ZPOOL_WAIT_INITIALIZE] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); + bytes_rem[ZPOOL_WAIT_TRIM] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); + + /* + * A replace finishes after resilvering finishes, so the amount of work + * left for a replace is the same as for 
resilvering. + * + * It isn't quite correct to say that if we have any 'spare' or + * 'replacing' vdevs and a resilver is happening, then a replace is in + * progress, like we do here. When a hot spare is used, the faulted vdev + * is not removed after the hot spare is resilvered, so parent 'spare' + * vdev is not removed either. So we could have a 'spare' vdev, but be + * resilvering for a different reason. However, we use it as a heuristic + * because we don't have access to the DTLs, which could tell us whether + * or not we have really finished resilvering a hot spare. + */ + if (vdev_any_spare_replacing(nvroot)) + bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + char buf[64]; + if (!wd->wd_enabled[i]) + continue; + + if (wd->wd_exact) + (void) snprintf(buf, sizeof (buf), "%" PRIi64, + bytes_rem[i]); + else + zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + + if (wd->wd_scripted) + (void) printf(i == 0 ? "%s" : "\t%s", buf); + else + (void) printf(" %*s", col_widths[i] - 1, buf); + } + (void) printf("\n"); + (void) fflush(stdout); +} + +static void * +wait_status_thread(void *arg) +{ + wait_data_t *wd = (wait_data_t *)arg; + zpool_handle_t *zhp; + + if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) + return (void *)(1); + + for (int row = 0; ; row++) { + boolean_t missing; + struct timespec timeout; + int ret = 0; + (void) clock_gettime(CLOCK_REALTIME, &timeout); + + if (zpool_refresh_stats(zhp, &missing) != 0 || missing || + zpool_props_refresh(zhp) != 0) { + zpool_close(zhp); + return (void *)(uintptr_t)(missing ? 
0 : 1); + } + + print_wait_status_row(wd, zhp, row); + + timeout.tv_sec += floor(wd->wd_interval); + long nanos = timeout.tv_nsec + + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; + if (nanos >= NANOSEC) { + timeout.tv_sec++; + timeout.tv_nsec = nanos - NANOSEC; + } else { + timeout.tv_nsec = nanos; + } + pthread_mutex_lock(&wd->wd_mutex); + if (!wd->wd_should_exit) + ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, + &timeout); + pthread_mutex_unlock(&wd->wd_mutex); + if (ret == 0) { + break; /* signaled by main thread */ + } else if (ret != ETIMEDOUT) { + (void) fprintf(stderr, gettext("pthread_cond_timedwait " + "failed: %s\n"), strerror(ret)); + zpool_close(zhp); + return (void *)(uintptr_t)(1); + } + } + + zpool_close(zhp); + return (void *)(0); +} + +int +zpool_do_wait(int argc, char **argv) +{ + boolean_t verbose = B_FALSE; + char c; + char *value; + int i; + unsigned long count; + pthread_t status_thr; + int error = 0; + zpool_handle_t *zhp; + + wait_data_t wd; + wd.wd_scripted = B_FALSE; + wd.wd_exact = B_FALSE; + wd.wd_headers_once = B_FALSE; + wd.wd_should_exit = B_FALSE; + + pthread_mutex_init(&wd.wd_mutex, NULL); + pthread_cond_init(&wd.wd_cv, NULL); + + /* By default, wait for all types of activity. 
*/ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) + wd.wd_enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "HpT:t:")) != -1) { + switch (c) { + case 'H': + wd.wd_scripted = B_TRUE; + break; + case 'n': + wd.wd_headers_once = B_TRUE; + break; + case 'p': + wd.wd_exact = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 't': + { + static char *col_subopts[] = { "discard", "free", + "initialize", "replace", "remove", "resilver", + "scrub", "trim", NULL }; + + /* Reset activities array */ + bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + wd.wd_enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &wd.wd_interval, &count); + if (count != 0) { + /* This subcmd only accepts an interval, not a count */ + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (wd.wd_interval != 0) + verbose = B_TRUE; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'pool' argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + wd.wd_poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) + return (1); + + if (verbose) { + /* + * We use a separate thread for printing status updates because + * the main thread will call lzc_wait(), which blocks as long + * as an activity is in progress, which can be a long time. 
+ */ + if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) + != 0) { + (void) fprintf(stderr, gettext("failed to create status" + "thread: %s\n"), strerror(errno)); + zpool_close(zhp); + return (1); + } + } + + /* + * Loop over all activities that we are supposed to wait for until none + * of them are in progress. Note that this means we can end up waiting + * for more activities to complete than just those that were in progress + * when we began waiting; if an activity we are interested in begins + * while we are waiting for another activity, we will wait for both to + * complete before exiting. + */ + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!wd.wd_enabled[i]) + continue; + + error = zpool_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zpool_close(zhp); + + if (verbose) { + uintptr_t status; + pthread_mutex_lock(&wd.wd_mutex); + wd.wd_should_exit = B_TRUE; + pthread_cond_signal(&wd.wd_cv); + pthread_mutex_unlock(&wd.wd_mutex); + (void) pthread_join(status_thr, (void *)&status); + if (status != 0) + error = status; + } + + pthread_mutex_destroy(&wd.wd_mutex); + pthread_cond_destroy(&wd.wd_cv); + return (error); +} + +static int +find_command_idx(char *command, int *idx) +{ + int i; + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + continue; + + if (strcmp(command, command_table[i].name) == 0) { + *idx = i; + return (0); + } + } + return (1); +} + +/* + * Display version message + */ +static int +zpool_do_version(int argc, char **argv) +{ + if (zfs_version_print() == -1) + return (1); + + return (0); +} + +int +main(int argc, char **argv) +{ + int ret = 0; + int i = 0; + char *cmdname; + char **newargv; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + 
srand(time(NULL)); + + opterr = 0; + + /* + * Make sure the user has specified some command. + */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing command\n")); + usage(B_FALSE); + } + + cmdname = argv[1]; + + /* + * Special case '-?' + */ + if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) + usage(B_TRUE); + + /* + * Special case '-V|--version' + */ + if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) + return (zpool_do_version(argc, argv)); + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); + + /* + * Many commands modify input strings for string parsing reasons. + * We create a copy to protect the original argv. + */ + newargv = malloc((argc + 1) * sizeof (newargv[0])); + for (i = 0; i < argc; i++) + newargv[i] = strdup(argv[i]); + newargv[argc] = NULL; + + /* + * Run the appropriate command. + */ + if (find_command_idx(cmdname, &i) == 0) { + current_command = &command_table[i]; + ret = command_table[i].func(argc - 1, newargv + 1); + } else if (strchr(cmdname, '=')) { + verify(find_command_idx("set", &i) == 0); + current_command = &command_table[i]; + ret = command_table[i].func(argc, newargv); + } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { + /* + * 'freeze' is a vile debugging abomination, so we treat + * it as such. 
+ */ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); + ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to freeze pool: %d\n"), errno); + ret = 1; + } + + log_history = 0; + } else { + (void) fprintf(stderr, gettext("unrecognized " + "command '%s'\n"), cmdname); + usage(B_FALSE); + ret = 1; + } + + for (i = 0; i < argc; i++) + free(newargv[i]); + free(newargv); + + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + + libzfs_fini(g_zfs); + + /* + * The 'ZFS_ABORT' environment variable causes us to dump core on exit + * for the purposes of running ::findleaks. + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + return (ret); +} diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.c b/sys/contrib/openzfs/cmd/zpool/zpool_util.c new file mode 100644 index 000000000000..1c1eb024f365 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.c @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <errno.h> +#include <libgen.h> +#include <libintl.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <ctype.h> + +#include "zpool_util.h" + +/* + * Utility function to guarantee malloc() success. + */ +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) { + (void) fprintf(stderr, "internal error: out of memory\n"); + exit(1); + } + + return (data); +} + +/* + * Display an out of memory error message and abort the current program. + */ +void +zpool_no_memory(void) +{ + assert(errno == ENOMEM); + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Return the number of logs in supplied nvlist + */ +uint_t +num_logs(nvlist_t *nv) +{ + uint_t nlogs = 0; + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (0); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + nlogs++; + } + return (nlogs); +} + +/* Find the max element in an array of uint64_t values */ +uint64_t +array64_max(uint64_t array[], unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array[i]); + + return (max); +} + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. 
+ */ +int +lowbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (__builtin_ffsll(i)); +} diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h new file mode 100644 index 000000000000..265aa58953a0 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef ZPOOL_UTIL_H +#define ZPOOL_UTIL_H + +#include <libnvpair.h> +#include <libzfs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Path to scripts you can run with "zpool status/iostat -c" */ +#define ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d" + +/* + * Basic utility functions + */ +void *safe_malloc(size_t); +void zpool_no_memory(void); +uint_t num_logs(nvlist_t *nv); +uint64_t array64_max(uint64_t array[], unsigned int len); +int highbit64(uint64_t i); +int lowbit64(uint64_t i); + +/* + * Misc utility functions + */ +char *zpool_get_cmd_search_path(void); + +/* + * Virtual device functions + */ + +nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, + int check_rep, boolean_t replacing, boolean_t dryrun, int argc, + char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); + +/* + * Pool list functions + */ +int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, + zpool_iter_f, void *); + +/* Vdev list functions */ +typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); +int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); + +typedef struct zpool_list zpool_list_t; + +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +void pool_list_update(zpool_list_t *); +int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); +void pool_list_free(zpool_list_t *); +int pool_list_count(zpool_list_t *); +void pool_list_remove(zpool_list_t *, zpool_handle_t *); + +extern libzfs_handle_t *g_zfs; + + +typedef struct vdev_cmd_data +{ + char **lines; /* Array of lines of output, minus the column name */ + int lines_cnt; /* Number of lines in the array */ + + char **cols; /* Array of column names */ + int cols_cnt; /* Number of column names */ + + + char *path; /* vdev path */ + char *upath; /* vdev underlying path */ + char *pool; /* Pool name */ + char *cmd; /* backpointer to cmd */ 
+ char *vdev_enc_sysfs_path; /* enclosure sysfs path (if any) */ +} vdev_cmd_data_t; + +typedef struct vdev_cmd_data_list +{ + char *cmd; /* Command to run */ + unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ + + /* fields used to select only certain vdevs, if requested */ + libzfs_handle_t *g_zfs; + char **vdev_names; + int vdev_names_count; + int cb_name_flags; + + vdev_cmd_data_t *data; /* Array of vdevs */ + + /* List of unique column names and widths */ + char **uniq_cols; + int uniq_cols_cnt; + int *uniq_cols_width; + +} vdev_cmd_data_list_t; + +vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv, + char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, + int cb_name_flags); + +void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl); + +int check_device(const char *path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk); +boolean_t check_sector_size_database(char *path, int *sector_size); +void vdev_error(const char *fmt, ...); +int check_file(const char *file, boolean_t force, boolean_t isspare); + +#ifdef __cplusplus +} +#endif + +#endif /* ZPOOL_UTIL_H */ diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c new file mode 100644 index 000000000000..9aa09b18c4ae --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -0,0 +1,1581 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libblkid to make sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <libintl.h> +#include <libnvpair.h> +#include <libzutil.h> +#include <limits.h> +#include <sys/spa.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "zpool_util.h" +#include <sys/zfs_context.h> +#include <sys/stat.h> + +/* + * For any given vdev specification, we can have multiple errors. The + * vdev_error() function keeps track of whether we have seen an error yet, and + * prints out a header if it's the first error we've seen. + */ +boolean_t error_seen; +boolean_t is_force; + + + + +/*PRINTFLIKE1*/ +void +vdev_error(const char *fmt, ...) +{ + va_list ap; + + if (!error_seen) { + (void) fprintf(stderr, gettext("invalid vdev specification\n")); + if (!is_force) + (void) fprintf(stderr, gettext("use '-f' to override " + "the following errors:\n")); + else + (void) fprintf(stderr, gettext("the following errors " + "must be manually repaired:\n")); + error_seen = B_TRUE; + } + + va_start(ap, fmt); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); +} + +/* + * Check that a file is valid. All we can do in this case is check that it's + * not in use by another pool, and not in use by swap. + */ +int +check_file(const char *file, boolean_t force, boolean_t isspare) +{ + char *name; + int fd; + int ret = 0; + pool_state_t state; + boolean_t inuse; + + if ((fd = open(file, O_RDONLY)) < 0) + return (0); + + if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { + const char *desc; + + switch (state) { + case POOL_STATE_ACTIVE: + desc = gettext("active"); + break; + + case POOL_STATE_EXPORTED: + desc = gettext("exported"); + break; + + case POOL_STATE_POTENTIALLY_ACTIVE: + desc = gettext("potentially active"); + break; + + default: + desc = gettext("unknown"); + break; + } + + /* + * Allow hot spares to be shared between pools. 
+ */ + if (state == POOL_STATE_SPARE && isspare) { + free(name); + (void) close(fd); + return (0); + } + + if (state == POOL_STATE_ACTIVE || + state == POOL_STATE_SPARE || !force) { + switch (state) { + case POOL_STATE_SPARE: + vdev_error(gettext("%s is reserved as a hot " + "spare for pool %s\n"), file, name); + break; + default: + vdev_error(gettext("%s is part of %s pool " + "'%s'\n"), file, desc, name); + break; + } + ret = -1; + } + + free(name); + } + + (void) close(fd); + return (ret); +} + +/* + * This may be a shorthand device path or it could be total gibberish. + * Check to see if it is a known device available in zfs_vdev_paths. + * As part of this check, see if we've been given an entire disk + * (minus the slice number). + */ +static int +is_shorthand_path(const char *arg, char *path, size_t path_size, + struct stat64 *statbuf, boolean_t *wholedisk) +{ + int error; + + error = zfs_resolve_shortname(arg, path, path_size); + if (error == 0) { + *wholedisk = zfs_dev_is_whole_disk(path); + if (*wholedisk || (stat64(path, statbuf) == 0)) + return (0); + } + + strlcpy(path, arg, path_size); + memset(statbuf, 0, sizeof (*statbuf)); + *wholedisk = B_FALSE; + + return (error); +} + +/* + * Determine if the given path is a hot spare within the given configuration. + * If no configuration is given we rely solely on the label. 
+ */ +static boolean_t +is_spare(nvlist_t *config, const char *path) +{ + int fd; + pool_state_t state; + char *name = NULL; + nvlist_t *label; + uint64_t guid, spareguid; + nvlist_t *nvroot; + nvlist_t **spares; + uint_t i, nspares; + boolean_t inuse; + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (B_FALSE); + + if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || + !inuse || + state != POOL_STATE_SPARE || + zpool_read_label(fd, &label, NULL) != 0) { + free(name); + (void) close(fd); + return (B_FALSE); + } + free(name); + (void) close(fd); + + if (config == NULL) { + nvlist_free(label); + return (B_TRUE); + } + + verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); + nvlist_free(label); + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + verify(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &spareguid) == 0); + if (spareguid == guid) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Create a leaf vdev. Determine if this is a file or a device. If it's a + * device, fill in the device id to make a complete nvlist. Valid forms for a + * leaf vdev are: + * + * /dev/xxx Complete disk path + * /xxx Full path to file + * xxx Shorthand for <zfs_vdev_paths>/xxx + */ +static nvlist_t * +make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +{ + char path[MAXPATHLEN]; + struct stat64 statbuf; + nvlist_t *vdev = NULL; + char *type = NULL; + boolean_t wholedisk = B_FALSE; + uint64_t ashift = 0; + int err; + + /* + * Determine what type of vdev this is, and put the full path into + * 'path'. We detect whether this is a device or file afterwards by + * checking the st_mode of the file. + */ + if (arg[0] == '/') { + /* + * Complete device or file path. Exact type is determined by + * examining the file descriptor afterwards. 
Symbolic links + * are resolved to their real paths to determine whole disk + * and S_ISBLK/S_ISREG type checks. However, we are careful + * to store the given path as ZPOOL_CONFIG_PATH to ensure we + * can leverage udev's persistent device labels. + */ + if (realpath(arg, path) == NULL) { + (void) fprintf(stderr, + gettext("cannot resolve path '%s'\n"), arg); + return (NULL); + } + + wholedisk = zfs_dev_is_whole_disk(path); + if (!wholedisk && (stat64(path, &statbuf) != 0)) { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (NULL); + } + + /* After whole disk check restore original passed path */ + strlcpy(path, arg, sizeof (path)); + } else { + err = is_shorthand_path(arg, path, sizeof (path), + &statbuf, &wholedisk); + if (err != 0) { + /* + * If we got ENOENT, then the user gave us + * gibberish, so try to direct them with a + * reasonable error message. Otherwise, + * regurgitate strerror() since it's the best we + * can do. + */ + if (err == ENOENT) { + (void) fprintf(stderr, + gettext("cannot open '%s': no such " + "device in %s\n"), arg, DISK_ROOT); + (void) fprintf(stderr, + gettext("must be a full path or " + "shorthand device name\n")); + return (NULL); + } else { + (void) fprintf(stderr, + gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (NULL); + } + } + } + + /* + * Determine whether this is a device or a file. + */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + (void) fprintf(stderr, gettext("cannot use '%s': must be a " + "block device or regular file\n"), path); + return (NULL); + } + + /* + * Finally, we have the complete device or file, and we know that it is + * acceptable to use. Construct the nvlist to describe this vdev. All + * vdevs have a 'path' element, and devices also have a 'devid' element. 
+ */ + verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) + verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, + (uint64_t)wholedisk) == 0); + + /* + * Override defaults if custom properties are provided. + */ + if (props != NULL) { + char *value = NULL; + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { + if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { + (void) fprintf(stderr, + gettext("ashift must be a number.\n")); + return (NULL); + } + if (ashift != 0 && + (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { + (void) fprintf(stderr, + gettext("invalid 'ashift=%" PRIu64 "' " + "property: only values between %" PRId32 " " + "and %" PRId32 " are allowed.\n"), + ashift, ASHIFT_MIN, ASHIFT_MAX); + return (NULL); + } + } + } + + /* + * If the device is known to incorrectly report its physical sector + * size explicitly provide the known correct value. + */ + if (ashift == 0) { + int sector_size; + + if (check_sector_size_database(path, &sector_size) == B_TRUE) + ashift = highbit64(sector_size) - 1; + } + + if (ashift > 0) + (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); + + return (vdev); +} + +/* + * Go through and verify the replication level of the pool is consistent. + * Performs the following checks: + * + * For the new spec, verifies that devices in mirrors and raidz are the + * same size. + * + * If the current configuration already has inconsistent replication + * levels, ignore any other potential problems in the new spec. + * + * Otherwise, make sure that the current spec (if there is one) and the new + * spec have consistent replication levels. 
+ * + * If there is no current spec (create), make sure new spec has at least + * one general purpose vdev. + */ +typedef struct replication_level { + char *zprl_type; + uint64_t zprl_children; + uint64_t zprl_parity; +} replication_level_t; + +#define ZPOOL_FUZZ (16 * 1024 * 1024) + +static boolean_t +is_raidz_mirror(replication_level_t *a, replication_level_t *b, + replication_level_t **raidz, replication_level_t **mirror) +{ + if (strcmp(a->zprl_type, "raidz") == 0 && + strcmp(b->zprl_type, "mirror") == 0) { + *raidz = a; + *mirror = b; + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Given a list of toplevel vdevs, return the current replication level. If + * the config is inconsistent, then NULL is returned. If 'fatal' is set, then + * an error message will be displayed for each self-inconsistent vdev. + */ +static replication_level_t * +get_replication(nvlist_t *nvroot, boolean_t fatal) +{ + nvlist_t **top; + uint_t t, toplevels; + nvlist_t **child; + uint_t c, children; + nvlist_t *nv; + char *type; + replication_level_t lastrep = {0}; + replication_level_t rep; + replication_level_t *ret; + replication_level_t *raidz, *mirror; + boolean_t dontreport; + + ret = safe_malloc(sizeof (replication_level_t)); + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &top, &toplevels) == 0); + + for (t = 0; t < toplevels; t++) { + uint64_t is_log = B_FALSE; + + nv = top[t]; + + /* + * For separate logs we ignore the top level vdev replication + * constraints. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); + if (is_log) + continue; + + /* Ignore holes introduced by removing aux devices */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_HOLE) == 0) + continue; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + /* + * This is a 'file' or 'disk' vdev. 
+ */ + rep.zprl_type = type; + rep.zprl_children = 1; + rep.zprl_parity = 0; + } else { + int64_t vdev_size; + + /* + * This is a mirror or RAID-Z vdev. Go through and make + * sure the contents are all the same (files vs. disks), + * keeping track of the number of elements in the + * process. + * + * We also check that the size of each vdev (if it can + * be determined) is the same. + */ + rep.zprl_type = type; + rep.zprl_children = 0; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, + &rep.zprl_parity) == 0); + assert(rep.zprl_parity != 0); + } else { + rep.zprl_parity = 0; + } + + /* + * The 'dontreport' variable indicates that we've + * already reported an error for this spec, so don't + * bother doing it again. + */ + type = NULL; + dontreport = 0; + vdev_size = -1LL; + for (c = 0; c < children; c++) { + nvlist_t *cnv = child[c]; + char *path; + struct stat64 statbuf; + int64_t size = -1LL; + char *childtype; + int fd, err; + + rep.zprl_children++; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, &childtype) == 0); + + /* + * If this is a replacing or spare vdev, then + * get the real first child of the vdev: do this + * in a loop because replacing and spare vdevs + * can be nested. + */ + while (strcmp(childtype, + VDEV_TYPE_REPLACING) == 0 || + strcmp(childtype, VDEV_TYPE_SPARE) == 0) { + nvlist_t **rchild; + uint_t rchildren; + + verify(nvlist_lookup_nvlist_array(cnv, + ZPOOL_CONFIG_CHILDREN, &rchild, + &rchildren) == 0); + assert(rchildren == 2); + cnv = rchild[0]; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, + &childtype) == 0); + } + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_PATH, &path) == 0); + + /* + * If we have a raidz/mirror that combines disks + * with files, report it as an error. 
+ */ + if (!dontreport && type != NULL && + strcmp(type, childtype) != 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + /* + * According to stat(2), the value of 'st_size' + * is undefined for block devices and character + * devices. But there is no effective way to + * determine the real size in userland. + * + * Instead, we'll take advantage of an + * implementation detail of spec_size(). If the + * device is currently open, then we (should) + * return a valid size. + * + * If we still don't get a valid size (indicated + * by a size of 0 or MAXOFFSET_T), then ignore + * this device altogether. + */ + if ((fd = open(path, O_RDONLY)) >= 0) { + err = fstat64_blk(fd, &statbuf); + (void) close(fd); + } else { + err = stat64(path, &statbuf); + } + + if (err != 0 || + statbuf.st_size == 0 || + statbuf.st_size == MAXOFFSET_T) + continue; + + size = statbuf.st_size; + + /* + * Also make sure that devices and + * slices have a consistent size. If + * they differ by a significant amount + * (~16MB) then report an error. + */ + if (!dontreport && + (vdev_size != -1LL && + (llabs(size - vdev_size) > + ZPOOL_FUZZ))) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "%s contains devices of " + "different sizes\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + type = childtype; + vdev_size = size; + } + } + + /* + * At this point, we have the replication of the last toplevel + * vdev in 'rep'. Compare it to 'lastrep' to see if it is + * different. + */ + if (lastrep.zprl_type != NULL) { + if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || + is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { + /* + * Accepted raidz and mirror when they can + * handle the same number of disk failures. 
+ */ + if (raidz->zprl_parity != + mirror->zprl_children - 1) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: " + "%s and %s vdevs with " + "different redundancy, " + "%llu vs. %llu (%llu-way) " + "are present\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + else + return (NULL); + } + } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != + 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %s and %s vdevs are " + "present\n"), + lastrep.zprl_type, rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu and %llu device parity " + "%s vdevs are present\n"), + lastrep.zprl_parity, + rep.zprl_parity, + rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_children != rep.zprl_children) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu-way and %llu-way %s " + "vdevs are present\n"), + lastrep.zprl_children, + rep.zprl_children, + rep.zprl_type); + else + return (NULL); + } + } + lastrep = rep; + } + + if (ret != NULL) + *ret = rep; + + return (ret); +} + +/* + * Check the replication level of the vdev spec against the current pool. Calls + * get_replication() to make sure the new spec is self-consistent. If the pool + * has a consistent replication level, then we ignore any errors. Otherwise, + * report any difference between the two. 
+ */ +static int +check_replication(nvlist_t *config, nvlist_t *newroot) +{ + nvlist_t **child; + uint_t children; + replication_level_t *current = NULL, *new; + replication_level_t *raidz, *mirror; + int ret; + + /* + * If we have a current pool configuration, check to see if it's + * self-consistent. If not, simply return success. + */ + if (config != NULL) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if ((current = get_replication(nvroot, B_FALSE)) == NULL) + return (0); + } + /* + * for spares there may be no children, and therefore no + * replication level to check + */ + if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) || (children == 0)) { + free(current); + return (0); + } + + /* + * If all we have is logs then there's no replication level to check. + */ + if (num_logs(newroot) == children) { + free(current); + return (0); + } + + /* + * Get the replication level of the new vdev spec, reporting any + * inconsistencies found. + */ + if ((new = get_replication(newroot, B_TRUE)) == NULL) { + free(current); + return (-1); + } + + /* + * Check to see if the new vdev spec matches the replication level of + * the current pool. + */ + ret = 0; + if (current != NULL) { + if (is_raidz_mirror(current, new, &raidz, &mirror) || + is_raidz_mirror(new, current, &raidz, &mirror)) { + if (raidz->zprl_parity != mirror->zprl_children - 1) { + vdev_error(gettext( + "mismatched replication level: pool and " + "new vdev with different redundancy, %s " + "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + ret = -1; + } + } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { + vdev_error(gettext( + "mismatched replication level: pool uses %s " + "and new vdev is %s\n"), + current->zprl_type, new->zprl_type); + ret = -1; + } else if (current->zprl_parity != new->zprl_parity) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu " + "device parity and new vdev uses %llu\n"), + current->zprl_parity, new->zprl_parity); + ret = -1; + } else if (current->zprl_children != new->zprl_children) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu-way " + "%s and new vdev uses %llu-way %s\n"), + current->zprl_children, current->zprl_type, + new->zprl_children, new->zprl_type); + ret = -1; + } + } + + free(new); + if (current != NULL) + free(current); + + return (ret); +} + +static int +zero_label(char *path) +{ + const int size = 4096; + char buf[size]; + int err, fd; + + if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (-1); + } + + memset(buf, 0, size); + err = write(fd, buf, size); + (void) fdatasync(fd); + (void) close(fd); + + if (err == -1) { + (void) fprintf(stderr, gettext("cannot zero first %d bytes " + "of '%s': %s\n"), size, path, strerror(errno)); + return (-1); + } + + if (err != size) { + (void) fprintf(stderr, gettext("could only zero %d/%d bytes " + "of '%s'\n"), err, size, path); + return (-1); + } + + return (0); +} + +/* + * Go through and find any whole disks in the vdev specification, labelling them + * as appropriate. When constructing the vdev spec, we were unable to open this + * device in order to provide a devid. Now that we have labelled the disk and + * know that slice 0 is valid, we can construct the devid now. 
+ *
+ * If the disk was already labeled with an EFI label, we will have gotten the
+ * devid already (because we were able to open the whole disk). Otherwise, we
+ * need to get the devid after we label the disk.
+ *
+ * The function recurses through the ZPOOL_CONFIG_CHILDREN, ZPOOL_CONFIG_SPARES
+ * and ZPOOL_CONFIG_L2CACHE arrays of 'nv' and stops at the first failure.
+ *
+ * Returns 0 on success (including for leaves that are not VDEV_TYPE_DISK).
+ * On failure it returns the errno left by realpath(), -1 when
+ * zpool_label_disk() fails, or the non-zero result of zpool_label_disk_wait()
+ * or zero_label() for a whole disk.
+ */
+static int
+make_disks(zpool_handle_t *zhp, nvlist_t *nv)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	char devpath[MAXPATHLEN];
+	char udevpath[MAXPATHLEN];
+	uint64_t wholedisk;
+	struct stat64 statbuf;
+	int is_exclusive = 0;
+	int fd;
+	int ret;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+
+		if (strcmp(type, VDEV_TYPE_DISK) != 0)
+			return (0);
+
+		/*
+		 * We have a disk device. If this is a whole disk write
+		 * out the efi partition table, otherwise write zero's to
+		 * the first 4k of the partition. This is to ensure that
+		 * libblkid will not misidentify the partition due to a
+		 * magic value left by the previous filesystem.
+		 *
+		 * For a partition the zeroing is best effort: the result of
+		 * zero_label() is ignored, and paths that is_spare() reports
+		 * as spares are not zeroed at all.
+		 */
+		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    &wholedisk));
+
+		if (!wholedisk) {
+			/*
+			 * Update device id string for mpath nodes (Linux only)
+			 */
+			if (is_mpath_whole_disk(path))
+				update_vdev_config_dev_strs(nv);
+
+			if (!is_spare(NULL, path))
+				(void) zero_label(path);
+			return (0);
+		}
+
+		if (realpath(path, devpath) == NULL) {
+			ret = errno;
+			(void) fprintf(stderr,
+			    gettext("cannot resolve path '%s'\n"), path);
+			return (ret);
+		}
+
+		/*
+		 * Remove any previously existing symlink from a udev path to
+		 * the device before labeling the disk. This ensures that
+		 * only newly created links are used. Otherwise there is a
+		 * window between when udev deletes and recreates the link
+		 * during which access attempts will fail with ENOENT.
+		 *
+		 * udevpath is the user-supplied path with the partition
+		 * suffix appended.  The exclusive open of devpath below is
+		 * only a probe: EBUSY (or EPERM on FreeBSD) marks the device
+		 * as exclusively held, and a successful descriptor is closed
+		 * again immediately.
+		 */
+		strlcpy(udevpath, path, MAXPATHLEN);
+		(void) zfs_append_partition(udevpath, MAXPATHLEN);
+
+		fd = open(devpath, O_RDWR|O_EXCL);
+		if (fd == -1) {
+			if (errno == EBUSY)
+				is_exclusive = 1;
+#ifdef __FreeBSD__
+			if (errno == EPERM)
+				is_exclusive = 1;
+#endif
+		} else {
+			(void) close(fd);
+		}
+
+		/*
+		 * If the partition exists, contains a valid spare label,
+		 * and is opened exclusively there is no need to partition
+		 * it. Hot spares have already been partitioned and are
+		 * held open exclusively by the kernel as a safety measure.
+		 *
+		 * If the provided path is for a /dev/disk/ device its
+		 * symbolic link will be removed, partition table created,
+		 * and then block until udev creates the new link.
+		 *
+		 * devnode is the last path component of the resolved
+		 * devpath; 'ret' is reused as scratch for the strncmp() and
+		 * lstat64() results before it carries an error code.
+		 */
+		if (!is_exclusive && !is_spare(NULL, udevpath)) {
+			char *devnode = strrchr(devpath, '/') + 1;
+
+			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
+			if (ret == 0) {
+				ret = lstat64(udevpath, &statbuf);
+				if (ret == 0 && S_ISLNK(statbuf.st_mode))
+					(void) unlink(udevpath);
+			}
+
+			/*
+			 * When labeling a pool the raw device node name
+			 * is provided as it appears under /dev/.
+			 */
+			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
+				return (-1);
+
+			/*
+			 * Wait for udev to signal the device is available
+			 * by the provided path.
+			 */
+			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
+			if (ret) {
+				(void) fprintf(stderr,
+				    gettext("missing link: %s was "
+				    "partitioned but %s is missing\n"),
+				    devnode, udevpath);
+				return (ret);
+			}
+
+			ret = zero_label(udevpath);
+			if (ret)
+				return (ret);
+		}
+
+		/*
+		 * Update the path to refer to the partition. The presence of
+		 * the 'whole_disk' field indicates to the CLI that we should
+		 * chop off the partition number when displaying the device in
+		 * future output.
+		 */
+		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
+
+		/*
+		 * Update device id strings for whole disks (Linux only)
+		 */
+		update_vdev_config_dev_strs(nv);
+
+		return (0);
+	}
+
+	for (c = 0; c < children; c++)
+		if ((ret = make_disks(zhp, child[c])) != 0)
+			return (ret);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if ((ret = make_disks(zhp, child[c])) != 0)
+				return (ret);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if ((ret = make_disks(zhp, child[c])) != 0)
+				return (ret);
+
+	return (0);
+}
+
+/*
+ * Go through and find any devices that are in use. We rely on libdiskmgt for
+ * the majority of this task.
+ *
+ * NOTE(review): the code in this function delegates to check_device() and
+ * check_file(); the libdiskmgt reference looks historical -- confirm.
+ *
+ * Leaves are checked directly; interior vdevs recurse through their
+ * children, spares and l2cache arrays.  'isspare' is forwarded to the
+ * check helpers and is B_TRUE only for members of ZPOOL_CONFIG_SPARES.
+ * The result is true when any leaf check reports a problem.
+ */
+static boolean_t
+is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+    boolean_t replacing, boolean_t isspare)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *type, *path;
+	int ret = 0;
+	char buf[MAXPATHLEN];
+	uint64_t wholedisk = B_FALSE;
+	boolean_t anyinuse = B_FALSE;
+
+	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0) {
+
+		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			verify(!nvlist_lookup_uint64(nv,
+			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
+
+		/*
+		 * As a generic check, we look to see if this is a replace of a
+		 * hot spare within the same pool. If so, we allow it
+		 * regardless of what libblkid or zpool_in_use() says.
+		 */
+		if (replacing) {
+			(void) strlcpy(buf, path, sizeof (buf));
+			if (wholedisk) {
+				ret = zfs_append_partition(buf, sizeof (buf));
+				/*
+				 * This function returns boolean_t, so report
+				 * the failure as "in use" with B_TRUE rather
+				 * than the out-of-range value -1.
+				 */
+				if (ret == -1)
+					return (B_TRUE);
+			}
+
+			if (is_spare(config, buf))
+				return (B_FALSE);
+		}
+
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			ret = check_device(path, force, isspare, wholedisk);
+
+		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
+			ret = check_file(path, force, isspare);
+
+		return (ret != 0);
+	}
+
+	/*
+	 * Visit every child even after one is found in use, so each
+	 * offending device gets checked rather than only the first.
+	 */
+	for (c = 0; c < children; c++)
+		if (is_device_in_use(config, child[c], force, replacing,
+		    B_FALSE))
+			anyinuse = B_TRUE;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_TRUE))
+				anyinuse = B_TRUE;
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0)
+		for (c = 0; c < children; c++)
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_FALSE))
+				anyinuse = B_TRUE;
+
+	return (anyinuse);
+}
+
+/*
+ * Decide whether a command-line word is a grouping keyword rather than a
+ * device name.
+ *
+ * Recognized words are "raidz" (optionally followed by a decimal parity
+ * from 1 to 254 without a leading zero), "mirror", "spare", "log",
+ * VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_BIAS_DEDUP and "cache".  On a match
+ * the corresponding vdev type string is returned (for the two allocation
+ * classes, 'type' itself) and, when the pointers are non-NULL, *mindev and
+ * *maxdev receive the allowed number of member devices.  Returns NULL when
+ * 'type' is not a grouping keyword; *maxdev may still have been set to
+ * INT_MAX in that case.
+ */
+static const char *
+is_grouping(const char *type, int *mindev, int *maxdev)
+{
+	if (strncmp(type, "raidz", 5) == 0) {
+		const char *p = type + 5;
+		char *end;
+		long nparity;
+
+		if (*p == '\0') {
+			nparity = 1;
+		} else if (*p == '0') {
+			return (NULL); /* no zero prefixes allowed */
+		} else {
+			errno = 0;
+			nparity = strtol(p, &end, 10);
+			if (errno != 0 || nparity < 1 || nparity >= 255 ||
+			    *end != '\0')
+				return (NULL);
+		}
+
+		/* A raidz group needs at least parity + 1 devices. */
+		if (mindev != NULL)
+			*mindev = nparity + 1;
+		if (maxdev != NULL)
+			*maxdev = 255;
+		return (VDEV_TYPE_RAIDZ);
+	}
+
+	if (maxdev != NULL)
+		*maxdev = INT_MAX;
+
+	if (strcmp(type, "mirror") == 0) {
+		if (mindev != NULL)
+			*mindev = 2;
+		return (VDEV_TYPE_MIRROR);
+	}
+
+	if (strcmp(type, "spare") == 0) {
+		if (mindev != NULL)
+			*mindev = 1;
+		return (VDEV_TYPE_SPARE);
+	}
+
+	if (strcmp(type, "log") == 0) {
+		if (mindev != NULL)
+			*mindev = 1;
+		return (VDEV_TYPE_LOG);
+	}
+
+	if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+		if (mindev != NULL)
+			*mindev = 1;
+		return (type);
+	}
+
+	if (strcmp(type, "cache") == 0) {
+		if (mindev != NULL)
+			*mindev = 1;
+		return (VDEV_TYPE_L2CACHE);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Construct a syntactically valid vdev specification,
+ * and ensure that all devices and files exist and can be opened.
+ * Note: we don't bother freeing anything in the error paths
+ * because the program is just going to exit anyway.
+ *
+ * 'argv' holds 'argc' words: grouping keywords (see is_grouping()) and
+ * device names.  Returns the root nvlist, or NULL after printing a message
+ * when the specification is invalid.
+ */
+static nvlist_t *
+construct_spec(nvlist_t *props, int argc, char **argv)
+{
+	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
+	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
+	const char *type;
+	uint64_t is_log, is_special, is_dedup;
+	boolean_t seen_logs;
+
+	top = NULL;
+	toplevels = 0;
+	spares = NULL;
+	l2cache = NULL;
+	nspares = 0;
+	nlogs = 0;
+	nl2cache = 0;
+	is_log = is_special = is_dedup = B_FALSE;
+	seen_logs = B_FALSE;
+	nvroot = NULL;
+
+	while (argc > 0) {
+		nv = NULL;
+
+		/*
+		 * If it's a mirror or raidz, the subsequent arguments are
+		 * its leaves -- until we encounter the next mirror or raidz.
+		 */
+		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
+			nvlist_t **child = NULL;
+			int c, children = 0;
+
+			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+				if (spares != NULL) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'spare' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				is_log = is_special = is_dedup = B_FALSE;
+			}
+
+			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
+				if (seen_logs) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'log' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				seen_logs = B_TRUE;
+				is_log = B_TRUE;
+				is_special = B_FALSE;
+				is_dedup = B_FALSE;
+				argc--;
+				argv++;
+				/*
+				 * A log is not a real grouping device.
+				 * We just set is_log and continue.
+				 */
+				continue;
+			}
+
+			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
+				is_special = B_TRUE;
+				is_log = B_FALSE;
+				is_dedup = B_FALSE;
+				argc--;
+				argv++;
+				continue;
+			}
+
+			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+				is_dedup = B_TRUE;
+				is_log = B_FALSE;
+				is_special = B_FALSE;
+				argc--;
+				argv++;
+				continue;
+			}
+
+			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+				if (l2cache != NULL) {
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: 'cache' can be "
+					    "specified only once\n"));
+					goto spec_out;
+				}
+				is_log = is_special = is_dedup = B_FALSE;
+			}
+
+			if (is_log || is_special || is_dedup) {
+				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+					/*
+					 * Name the class that is actually
+					 * active; a dedup class used to be
+					 * reported as 'special'.
+					 */
+					(void) fprintf(stderr,
+					    gettext("invalid vdev "
+					    "specification: unsupported '%s' "
+					    "device: %s\n"), is_log ? "log" :
+					    is_special ? "special" :
+					    VDEV_ALLOC_BIAS_DEDUP, type);
+					goto spec_out;
+				}
+				/*
+				 * nlogs only feeds the "log requires at least
+				 * 1 device" check below, so special and dedup
+				 * mirrors must not be counted as logs.
+				 */
+				if (is_log)
+					nlogs++;
+			}
+
+			/* Collect leaves up to the next grouping keyword. */
+			for (c = 1; c < argc; c++) {
+				if (is_grouping(argv[c], NULL, NULL) != NULL)
+					break;
+				children++;
+				child = realloc(child,
+				    children * sizeof (nvlist_t *));
+				if (child == NULL)
+					zpool_no_memory();
+				if ((nv = make_leaf_vdev(props, argv[c],
+				    B_FALSE)) == NULL) {
+					/*
+					 * The last slot was never filled, so
+					 * only free the first children - 1.
+					 */
+					for (c = 0; c < children - 1; c++)
+						nvlist_free(child[c]);
+					free(child);
+					goto spec_out;
+				}
+
+				child[children - 1] = nv;
+			}
+
+			if (children < mindev) {
+				(void) fprintf(stderr, gettext("invalid vdev "
+				    "specification: %s requires at least %d "
+				    "devices\n"), argv[0], mindev);
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+				goto spec_out;
+			}
+
+			if (children > maxdev) {
+				(void) fprintf(stderr, gettext("invalid vdev "
+				    "specification: %s supports no more than "
+				    "%d devices\n"), argv[0], maxdev);
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+				goto spec_out;
+			}
+
+			/* c is 1 (the keyword) plus the leaves consumed. */
+			argc -= c;
+			argv += c;
+
+			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+				spares = child;
+				nspares = children;
+				continue;
+			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+				l2cache = child;
+				nl2cache = children;
+				continue;
+			} else {
+				/* create a top-level vdev with children */
+				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
+				    0) == 0);
+				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+				    type) == 0);
+				verify(nvlist_add_uint64(nv,
+				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+				if (is_log)
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_LOG) == 0);
+				if (is_special) {
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
+				}
+				if (is_dedup) {
+					verify(nvlist_add_string(nv,
+					    ZPOOL_CONFIG_ALLOCATION_BIAS,
+					    VDEV_ALLOC_BIAS_DEDUP) == 0);
+				}
+				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+					/* mindev is parity + 1 for raidz. */
+					verify(nvlist_add_uint64(nv,
+					    ZPOOL_CONFIG_NPARITY,
+					    mindev - 1) == 0);
+				}
+				verify(nvlist_add_nvlist_array(nv,
+				    ZPOOL_CONFIG_CHILDREN, child,
+				    children) == 0);
+
+				for (c = 0; c < children; c++)
+					nvlist_free(child[c]);
+				free(child);
+			}
+		} else {
+			/*
+			 * We have a device. Pass off to make_leaf_vdev() to
+			 * construct the appropriate nvlist describing the vdev.
+			 */
+			if ((nv = make_leaf_vdev(props, argv[0],
+			    is_log)) == NULL)
+				goto spec_out;
+
+			if (is_log)
+				nlogs++;
+			if (is_special) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_ALLOCATION_BIAS,
+				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
+			}
+			if (is_dedup) {
+				verify(nvlist_add_string(nv,
+				    ZPOOL_CONFIG_ALLOCATION_BIAS,
+				    VDEV_ALLOC_BIAS_DEDUP) == 0);
+			}
+			argc--;
+			argv++;
+		}
+
+		toplevels++;
+		top = realloc(top, toplevels * sizeof (nvlist_t *));
+		if (top == NULL)
+			zpool_no_memory();
+		top[toplevels - 1] = nv;
+	}
+
+	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev "
+		    "specification: at least one toplevel vdev must be "
+		    "specified\n"));
+		goto spec_out;
+	}
+
+	if (seen_logs && nlogs == 0) {
+		(void) fprintf(stderr, gettext("invalid vdev specification: "
+		    "log requires at least 1 device\n"));
+		goto spec_out;
+	}
+
+	/*
+	 * Finally, create nvroot and add all top-level vdevs to it.
+	 */
+	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    top, toplevels) == 0);
+	if (nspares != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    spares, nspares) == 0);
+	if (nl2cache != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    l2cache, nl2cache) == 0);
+
+spec_out:
+	for (t = 0; t < toplevels; t++)
+		nvlist_free(top[t]);
+	for (t = 0; t < nspares; t++)
+		nvlist_free(spares[t]);
+	for (t = 0; t < nl2cache; t++)
+		nvlist_free(l2cache[t]);
+
+	free(spares);
+	free(l2cache);
+	free(top);
+
+	return (nvroot);
+}
+
+/*
+ * Build the optional device list for a pool split and hand it to
+ * zpool_vdev_split().
+ *
+ * When 'argc' is non-zero the words in 'argv' are turned into a vdev tree,
+ * its whole disks are labelled (unless flags.dryrun is set), and every
+ * top-level entry is rejected if its path is itself a grouping keyword.
+ * Returns the new root nvlist, or NULL on any failure.
+ */
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+    splitflags_t flags, int argc, char **argv)
+{
+	nvlist_t *newroot = NULL;
+
+	if (argc > 0) {
+		nvlist_t **kids;
+		uint_t i, nkids;
+
+		newroot = construct_spec(props, argc, argv);
+		if (newroot == NULL) {
+			(void) fprintf(stderr, gettext("Unable to build a "
+			    "pool from the specified devices\n"));
+			return (NULL);
+		}
+
+		if (!flags.dryrun && make_disks(zhp, newroot) != 0)
+			goto fail;
+
+		/* avoid any tricks in the spec */
+		verify(nvlist_lookup_nvlist_array(newroot,
+		    ZPOOL_CONFIG_CHILDREN, &kids, &nkids) == 0);
+		for (i = 0; i < nkids; i++) {
+			const char *grouping;
+			char *path;
+
+			verify(nvlist_lookup_string(kids[i],
+			    ZPOOL_CONFIG_PATH, &path) == 0);
+			/* The device limits are not needed here. */
+			grouping = is_grouping(path, NULL, NULL);
+			if (grouping != NULL) {
+				(void) fprintf(stderr, gettext("Cannot use "
+				    "'%s' as a device for splitting\n"),
+				    grouping);
+				goto fail;
+			}
+		}
+	}
+
+	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) == 0)
+		return (newroot);
+
+fail:
+	nvlist_free(newroot);
+	return (NULL);
+}
+
+/*
+ * Count the top-level vdevs of 'nvroot' that are neither flagged with
+ * ZPOOL_CONFIG_IS_LOG nor carry a ZPOOL_CONFIG_ALLOCATION_BIAS entry.
+ */
+static int
+num_normal_vdevs(nvlist_t *nvroot)
+{
+	nvlist_t **top;
+	uint_t t, toplevels, normal = 0;
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+
+	for (t = 0; t < toplevels; t++) {
+		uint64_t islog = B_FALSE;
+
+		/* A missing IS_LOG entry leaves islog at B_FALSE. */
+		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG,
+		    &islog);
+		if (!islog &&
+		    !nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			normal++;
+	}
+
+	return (normal);
+}
+
+/*
+ * Get and validate the contents of the given vdev specification. This ensures
+ * that the nvlist returned is well-formed, that all the devices exist, and that
+ * they are not currently in use by any other known consumer. The pool
+ * configuration is looked up from 'zhp' when adding devices to an existing
+ * pool, and is used to perform additional checks, such as changing the
+ * replication level of the pool. 'zhp' can be NULL to indicate that this is a
+ * new pool. The 'force' flag controls whether devices should be forcefully
+ * added, even if they appear in use.
+ */
+nvlist_t *
+make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
+{
+	nvlist_t *poolconfig = NULL;
+	nvlist_t *newroot;
+
+	is_force = force;
+
+	/*
+	 * Construct the vdev specification. If this is successful, we know
+	 * that we have a valid specification, and that all devices can be
+	 * opened.
+	 */
+	newroot = construct_spec(props, argc, argv);
+	if (newroot == NULL)
+		return (NULL);
+
+	if (zhp) {
+		poolconfig = zpool_get_config(zhp, NULL);
+		if (poolconfig == NULL)
+			goto fail;
+	}
+
+	/*
+	 * Validate each device to make sure that it's not shared with another
+	 * subsystem. We do this even if 'force' is set, because there are some
+	 * uses (such as a dedicated dump device) that even '-f' cannot
+	 * override.
+	 */
+	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE))
+		goto fail;
+
+	/*
+	 * Check the replication level of the given vdevs and report any errors
+	 * found. We include the existing pool spec, if any, as we need to
+	 * catch changes against the existing replication level.
+	 */
+	if (check_rep && check_replication(poolconfig, newroot) != 0)
+		goto fail;
+
+	/*
+	 * On pool create the new vdev spec must have one normal vdev.
+	 */
+	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
+		vdev_error(gettext("at least one general top-level vdev must "
+		    "be specified\n"));
+		goto fail;
+	}
+
+	/*
+	 * Run through the vdev specification and label any whole disks found.
+	 */
+	if (!dryrun && make_disks(zhp, newroot) != 0)
+		goto fail;
+
+	return (newroot);
+
+fail:
+	/* Every failure after construction releases the new tree. */
+	nvlist_free(newroot);
+	return (NULL);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/.gitignore b/sys/contrib/openzfs/cmd/zstream/.gitignore
new file mode 100644
index 000000000000..fd1240d55c4b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/.gitignore
@@ -0,0 +1 @@
+zstream
diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am
new file mode 100644
index 000000000000..5e2ac5d69f1a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am
@@ -0,0 +1,15 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zstream
+
+zstream_SOURCES = \
+	zstream.c \
+	zstream.h \
+	zstream_dump.c \
+	zstream_redup.c \
+	zstream_token.c
+
+zstream_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.c b/sys/contrib/openzfs/cmd/zstream/zstream.c
new file mode 100644
index 000000000000..cbcb560a8638
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.c
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2020 by Datto Inc. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <libintl.h>
+#include <stddef.h>
+#include <libzfs.h>
+#include "zstream.h"
+
+/*
+ * Print the command summary to stderr and terminate the process with
+ * exit status 1.  This function does not return.
+ */
+void
+zstream_usage(void)
+{
+	(void) fprintf(stderr,
+	    "usage: zstream command args ...\n"
+	    "Available commands are:\n"
+	    "\n"
+	    "\tzstream dump [-vCd] FILE\n"
+	    "\t... | zstream dump [-vCd]\n"
+	    "\n"
+	    "\tzstream token resume_token\n"
+	    "\n"
+	    "\tzstream redup [-v] FILE | ...\n");
+	exit(1);
+}
+
+/*
+ * Subcommand names and their handlers, searched in order by main().
+ */
+static const struct {
+	const char *zc_name;
+	int (*zc_func)(int, char *[]);
+} zstream_commands[] = {
+	{ "dump",	zstream_do_dump },
+	{ "token",	zstream_do_token },
+	{ "redup",	zstream_do_redup },
+};
+
+/*
+ * Dispatch argv[1] to the matching subcommand, which receives the argument
+ * vector shifted so that its own name is argv[0].  A missing or unknown
+ * subcommand ends in zstream_usage().
+ */
+int
+main(int argc, char *argv[])
+{
+	size_t i;
+
+	if (argc < 2)
+		zstream_usage();
+
+	for (i = 0; i < sizeof (zstream_commands) /
+	    sizeof (zstream_commands[0]); i++) {
+		if (strcmp(argv[1], zstream_commands[i].zc_name) == 0) {
+			return (zstream_commands[i].zc_func(argc - 1,
+			    argv + 1));
+		}
+	}
+
+	zstream_usage();
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.h b/sys/contrib/openzfs/cmd/zstream/zstream.h
new file mode 100644
index 000000000000..319fecb2876b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#ifndef _ZSTREAM_H +#define _ZSTREAM_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zstream_do_redup(int, char *[]); +extern int zstream_do_dump(int, char *[]); +extern int zstream_do_token(int, char *[]); +extern void zstream_usage(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTREAM_H */ diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_dump.c b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c new file mode 100644 index 000000000000..45cf7b97a147 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c @@ -0,0 +1,799 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2012 Martin Matuska <martin@matuska.org> + */ + +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
+ */
+
+#include <ctype.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
+#include <zfs_fletcher.h>
+#include "zstream.h"
+
+/*
+ * If dump mode is enabled, the number of bytes to print per line
+ */
+#define	BYTES_PER_LINE	16
+/*
+ * If dump mode is enabled, the number of bytes to group together, separated
+ * by newlines or spaces
+ */
+#define	DUMP_GROUPING	4
+
+uint64_t total_stream_len = 0;
+FILE *send_stream = 0;
+boolean_t do_byteswap = B_FALSE;
+boolean_t do_cksum = B_TRUE;
+
+/*
+ * malloc() wrapper that never returns NULL: on allocation failure it
+ * prints a message to stderr and calls abort().
+ */
+static void *
+safe_malloc(size_t size)
+{
+	void *rv = malloc(size);
+	if (rv == NULL) {
+		(void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+		    size);
+		abort();
+	}
+	return (rv);
+}
+
+/*
+ * ssread - send stream read.
+ *
+ * Read while computing incremental checksum
+ *
+ * Reads one item of 'len' bytes from send_stream into 'buf'.  Returns 0
+ * when the full item could not be read (nothing is checksummed or counted
+ * in that case) and the fread() item count, 1, otherwise.
+ */
+static size_t
+ssread(void *buf, size_t len, zio_cksum_t *cksum)
+{
+	size_t outlen;
+
+	if ((outlen = fread(buf, len, 1, send_stream)) == 0)
+		return (0);
+
+	if (do_cksum) {
+		if (do_byteswap)
+			fletcher_4_incremental_byteswap(buf, len, cksum);
+		else
+			fletcher_4_incremental_native(buf, len, cksum);
+	}
+	total_stream_len += len;
+	return (outlen);
+}
+
+/*
+ * Read one replay record header into 'drr', folding it into 'cksum'.
+ *
+ * The record is read in two steps so that the running checksum can be
+ * captured just before the trailing per-record checksum field and compared
+ * with it.  Returns sizeof (*drr) on success and 0 on a short read or a
+ * checksum mismatch (a stored checksum of zero is not verified).
+ */
+static size_t
+read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
+{
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum);
+	if (r == 0)
+		return (0);
+	zio_cksum_t saved_cksum = *cksum;
+	r = ssread(&drr->drr_u.drr_checksum.drr_checksum,
+	    sizeof (zio_cksum_t), cksum);
+	if (r == 0)
+		return (0);
+	if (do_cksum &&
+	    !ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) &&
+	    !ZIO_CHECKSUM_EQUAL(saved_cksum,
+	    drr->drr_u.drr_checksum.drr_checksum)) {
+		fprintf(stderr, "invalid checksum\n");
+		(void) printf("Incorrect checksum in record header.\n");
+		(void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
+		    (longlong_t)saved_cksum.zc_word[0],
+		    (longlong_t)saved_cksum.zc_word[1],
+		    (longlong_t)saved_cksum.zc_word[2],
+		    (longlong_t)saved_cksum.zc_word[3]);
+		return (0);
+	}
+	return (sizeof (*drr));
+}
+
+/*
+ * Print part of a block in ASCII characters
+ *
+ * Non-printable bytes are shown as '.', and a space is emitted between
+ * every DUMP_GROUPING bytes.  The line is terminated with a newline.
+ */
+static void
+print_ascii_block(char *subbuf, int length)
+{
+	int i;
+
+	for (i = 0; i < length; i++) {
+		/*
+		 * isprint() is only defined for EOF and values representable
+		 * as unsigned char; plain char may be negative for bytes
+		 * >= 0x80, so convert before the call.
+		 */
+		char char_print = isprint((unsigned char)subbuf[i]) ?
+		    subbuf[i] : '.';
+		if (i != 0 && i % DUMP_GROUPING == 0) {
+			(void) printf(" ");
+		}
+		(void) printf("%c", char_print);
+	}
+	(void) printf("\n");
+}
+
+/*
+ * print_block - Dump the contents of a modified block to STDOUT
+ *
+ * Assume that buf has capacity evenly divisible by BYTES_PER_LINE
+ */
+static void
+print_block(char *buf, int length)
+{
+	int i;
+	/*
+	 * Start printing ASCII characters at a constant offset, after
+	 * the hex prints. Leave 3 characters per byte on a line (2 digit
+	 * hex number plus 1 space) plus spaces between characters and
+	 * groupings.
+	 */
+	int ascii_start = BYTES_PER_LINE * 3 +
+	    BYTES_PER_LINE / DUMP_GROUPING + 2;
+
+	for (i = 0; i < length; i += BYTES_PER_LINE) {
+		int j;
+		int this_line_length = MIN(BYTES_PER_LINE, length - i);
+		int print_offset = 0;
+
+		for (j = 0; j < this_line_length; j++) {
+			int buf_offset = i + j;
+
+			/*
+			 * Separate every DUMP_GROUPING bytes by a space.
+			 */
+			if (buf_offset % DUMP_GROUPING == 0) {
+				print_offset += printf(" ");
+			}
+
+			/*
+			 * Print the two-digit hex value for this byte.
+			 */
+			unsigned char hex_print = buf[buf_offset];
+			print_offset += printf("%02x ", hex_print);
+		}
+
+		/* Pad so the ASCII column lines up on short lines too. */
+		(void) printf("%*s", ascii_start - print_offset, " ");
+
+		print_ascii_block(buf + i, this_line_length);
+	}
+}
+
+/*
+ * Format an array of bytes into str as hexadecimal characters. str must
+ * have buf_len * 2 + 1 bytes of space.
+ */
+static void
+sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len)
+{
+	/* Index with the same unsigned type as buf_len (no sign mixing). */
+	uint_t i;
+
+	for (i = 0; i < buf_len; i++) {
+		/* A uint8_t always formats as exactly two hex digits. */
+		(void) sprintf(str, "%02x", buf[i]);
+		str += 2;
+	}
+
+	/* Terminate explicitly so an empty input yields an empty string. */
+	str[0] = '\0';
+}
+
+int
+zstream_do_dump(int argc, char *argv[])
+{
+	char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
+	uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+	uint64_t total_payload_size = 0;
+	uint64_t total_overhead_size = 0;
+	uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 };
+	char salt[ZIO_DATA_SALT_LEN * 2 + 1];
+	char iv[ZIO_DATA_IV_LEN * 2 + 1];
+	char mac[ZIO_DATA_MAC_LEN * 2 + 1];
+	uint64_t total_records = 0;
+	uint64_t payload_size;
+	dmu_replay_record_t thedrr;
+	dmu_replay_record_t *drr = &thedrr;
+	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+	struct drr_end *drre = &thedrr.drr_u.drr_end;
+	struct drr_object *drro = &thedrr.drr_u.drr_object;
+	struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
+	struct drr_write *drrw = &thedrr.drr_u.drr_write;
+	struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
+	struct drr_free *drrf = &thedrr.drr_u.drr_free;
+	struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+	struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
+	struct drr_object_range *drror = &thedrr.drr_u.drr_object_range;
+	struct drr_redact *drrr = &thedrr.drr_u.drr_redact;
+	struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
+	int c;
+	boolean_t verbose = B_FALSE;
+	boolean_t very_verbose = B_FALSE;
+	boolean_t first = B_TRUE;
+	/*
+	 * dump flag controls whether the contents of any modified data blocks
+	 * are printed to the console during processing of the stream. Warning:
+	 * for large streams, this can obviously lead to massive prints.
+ */ + boolean_t dump = B_FALSE; + int err; + zio_cksum_t zc = { { 0 } }; + zio_cksum_t pcksum = { { 0 } }; + + while ((c = getopt(argc, argv, ":vCd")) != -1) { + switch (c) { + case 'C': + do_cksum = B_FALSE; + break; + case 'v': + if (verbose) + very_verbose = B_TRUE; + verbose = B_TRUE; + break; + case 'd': + dump = B_TRUE; + verbose = B_TRUE; + very_verbose = B_TRUE; + break; + case ':': + (void) fprintf(stderr, + "missing argument for '%c' option\n", optopt); + zstream_usage(); + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + zstream_usage(); + break; + } + } + + if (argc > optind) { + const char *filename = argv[optind]; + send_stream = fopen(filename, "r"); + if (send_stream == NULL) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + } else { + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + "Error: The send stream is a binary format " + "and can not be read from a\n" + "terminal. Standard input must be redirected, " + "or a file must be\n" + "specified as a command-line argument.\n"); + exit(1); + } + send_stream = stdin; + } + + fletcher_4_init(); + while (read_hdr(drr, &zc)) { + + /* + * If this is the first DMU record being processed, check for + * the magic bytes and figure out the endian-ness based on them. + */ + if (first) { + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + do_byteswap = B_TRUE; + if (do_cksum) { + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + /* + * recalculate header checksum now + * that we know it needs to be + * byteswapped. 
+ */ + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &zc); + } + } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) { + (void) fprintf(stderr, "Invalid stream " + "(bad magic number)\n"); + exit(1); + } + first = B_FALSE; + } + if (do_byteswap) { + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = + BSWAP_32(drr->drr_payloadlen); + } + + /* + * At this point, the leading fields of the replay record + * (drr_type and drr_payloadlen) have been byte-swapped if + * necessary, but the rest of the data structure (the + * union of type-specific structures) is still in its + * original state. + */ + if (drr->drr_type >= DRR_NUMTYPES) { + (void) printf("INVALID record found: type 0x%x\n", + drr->drr_type); + (void) printf("Aborting.\n"); + exit(1); + } + + drr_record_count[drr->drr_type]++; + total_overhead_size += sizeof (*drr); + total_records++; + payload_size = 0; + + switch (drr->drr_type) { + case DRR_BEGIN: + if (do_byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = + BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = + BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_flags = BSWAP_32(drrb->drr_flags); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = + BSWAP_64(drrb->drr_fromguid); + } + + (void) printf("BEGIN record\n"); + (void) printf("\thdrtype = %lld\n", + DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo)); + (void) printf("\tfeatures = %llx\n", + DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo)); + (void) printf("\tmagic = %llx\n", + (u_longlong_t)drrb->drr_magic); + (void) printf("\tcreation_time = %llx\n", + (u_longlong_t)drrb->drr_creation_time); + (void) printf("\ttype = %u\n", drrb->drr_type); + (void) printf("\tflags = 0x%x\n", drrb->drr_flags); + (void) printf("\ttoguid = %llx\n", + (u_longlong_t)drrb->drr_toguid); + (void) printf("\tfromguid = %llx\n", + (u_longlong_t)drrb->drr_fromguid); + (void) printf("\ttoname = 
%s\n", drrb->drr_toname); + (void) printf("\tpayloadlen = %u\n", + drr->drr_payloadlen); + if (verbose) + (void) printf("\n"); + + if (drr->drr_payloadlen != 0) { + nvlist_t *nv; + int sz = drr->drr_payloadlen; + + if (sz > SPA_MAXBLOCKSIZE) { + free(buf); + buf = safe_malloc(sz); + } + (void) ssread(buf, sz, &zc); + if (ferror(send_stream)) + perror("fread"); + err = nvlist_unpack(buf, sz, &nv, 0); + if (err) { + perror(strerror(err)); + } else { + nvlist_print(stdout, nv); + nvlist_free(nv); + } + payload_size = sz; + } + break; + + case DRR_END: + if (do_byteswap) { + drre->drr_checksum.zc_word[0] = + BSWAP_64(drre->drr_checksum.zc_word[0]); + drre->drr_checksum.zc_word[1] = + BSWAP_64(drre->drr_checksum.zc_word[1]); + drre->drr_checksum.zc_word[2] = + BSWAP_64(drre->drr_checksum.zc_word[2]); + drre->drr_checksum.zc_word[3] = + BSWAP_64(drre->drr_checksum.zc_word[3]); + } + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. 
+ */ + if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum, + pcksum)) { + (void) printf("Expected checksum differs from " + "checksum in stream.\n"); + (void) printf("Expected checksum = " + "%llx/%llx/%llx/%llx\n", + (long long unsigned int)pcksum.zc_word[0], + (long long unsigned int)pcksum.zc_word[1], + (long long unsigned int)pcksum.zc_word[2], + (long long unsigned int)pcksum.zc_word[3]); + } + (void) printf("END checksum = %llx/%llx/%llx/%llx\n", + (long long unsigned int) + drre->drr_checksum.zc_word[0], + (long long unsigned int) + drre->drr_checksum.zc_word[1], + (long long unsigned int) + drre->drr_checksum.zc_word[2], + (long long unsigned int) + drre->drr_checksum.zc_word[3]); + + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + break; + + case DRR_OBJECT: + if (do_byteswap) { + drro->drr_object = BSWAP_64(drro->drr_object); + drro->drr_type = BSWAP_32(drro->drr_type); + drro->drr_bonustype = + BSWAP_32(drro->drr_bonustype); + drro->drr_blksz = BSWAP_32(drro->drr_blksz); + drro->drr_bonuslen = + BSWAP_32(drro->drr_bonuslen); + drro->drr_raw_bonuslen = + BSWAP_32(drro->drr_raw_bonuslen); + drro->drr_toguid = BSWAP_64(drro->drr_toguid); + drro->drr_maxblkid = + BSWAP_64(drro->drr_maxblkid); + } + + payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); + + if (verbose) { + (void) printf("OBJECT object = %llu type = %u " + "bonustype = %u blksz = %u bonuslen = %u " + "dn_slots = %u raw_bonuslen = %u " + "flags = %u maxblkid = %llu " + "indblkshift = %u nlevels = %u " + "nblkptr = %u\n", + (u_longlong_t)drro->drr_object, + drro->drr_type, + drro->drr_bonustype, + drro->drr_blksz, + drro->drr_bonuslen, + drro->drr_dn_slots, + drro->drr_raw_bonuslen, + drro->drr_flags, + (u_longlong_t)drro->drr_maxblkid, + drro->drr_indblkshift, + drro->drr_nlevels, + drro->drr_nblkptr); + } + if (drro->drr_bonuslen > 0) { + (void) ssread(buf, payload_size, &zc); + if (dump) + print_block(buf, payload_size); + } + break; + + case DRR_FREEOBJECTS: + if (do_byteswap) { + drrfo->drr_firstobj = + 
BSWAP_64(drrfo->drr_firstobj); + drrfo->drr_numobjs = + BSWAP_64(drrfo->drr_numobjs); + drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid); + } + if (verbose) { + (void) printf("FREEOBJECTS firstobj = %llu " + "numobjs = %llu\n", + (u_longlong_t)drrfo->drr_firstobj, + (u_longlong_t)drrfo->drr_numobjs); + } + break; + + case DRR_WRITE: + if (do_byteswap) { + drrw->drr_object = BSWAP_64(drrw->drr_object); + drrw->drr_type = BSWAP_32(drrw->drr_type); + drrw->drr_offset = BSWAP_64(drrw->drr_offset); + drrw->drr_logical_size = + BSWAP_64(drrw->drr_logical_size); + drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); + drrw->drr_key.ddk_prop = + BSWAP_64(drrw->drr_key.ddk_prop); + drrw->drr_compressed_size = + BSWAP_64(drrw->drr_compressed_size); + } + + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + + /* + * If this is verbose and/or dump output, + * print info on the modified block + */ + if (verbose) { + sprintf_bytes(salt, drrw->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drrw->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drrw->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("WRITE object = %llu type = %u " + "checksum type = %u compression type = %u " + "flags = %u offset = %llu " + "logical_size = %llu " + "compressed_size = %llu " + "payload_size = %llu props = %llx " + "salt = %s iv = %s mac = %s\n", + (u_longlong_t)drrw->drr_object, + drrw->drr_type, + drrw->drr_checksumtype, + drrw->drr_compressiontype, + drrw->drr_flags, + (u_longlong_t)drrw->drr_offset, + (u_longlong_t)drrw->drr_logical_size, + (u_longlong_t)drrw->drr_compressed_size, + (u_longlong_t)payload_size, + (u_longlong_t)drrw->drr_key.ddk_prop, + salt, + iv, + mac); + } + + /* + * Read the contents of the block in from STDIN to buf + */ + (void) ssread(buf, payload_size, &zc); + /* + * If in dump mode + */ + if (dump) { + print_block(buf, payload_size); + } + break; + + case DRR_WRITE_BYREF: + if (do_byteswap) { + drrwbr->drr_object = + BSWAP_64(drrwbr->drr_object); + drrwbr->drr_offset = + 
BSWAP_64(drrwbr->drr_offset); + drrwbr->drr_length = + BSWAP_64(drrwbr->drr_length); + drrwbr->drr_toguid = + BSWAP_64(drrwbr->drr_toguid); + drrwbr->drr_refguid = + BSWAP_64(drrwbr->drr_refguid); + drrwbr->drr_refobject = + BSWAP_64(drrwbr->drr_refobject); + drrwbr->drr_refoffset = + BSWAP_64(drrwbr->drr_refoffset); + drrwbr->drr_key.ddk_prop = + BSWAP_64(drrwbr->drr_key.ddk_prop); + } + if (verbose) { + (void) printf("WRITE_BYREF object = %llu " + "checksum type = %u props = %llx " + "offset = %llu length = %llu " + "toguid = %llx refguid = %llx " + "refobject = %llu refoffset = %llu\n", + (u_longlong_t)drrwbr->drr_object, + drrwbr->drr_checksumtype, + (u_longlong_t)drrwbr->drr_key.ddk_prop, + (u_longlong_t)drrwbr->drr_offset, + (u_longlong_t)drrwbr->drr_length, + (u_longlong_t)drrwbr->drr_toguid, + (u_longlong_t)drrwbr->drr_refguid, + (u_longlong_t)drrwbr->drr_refobject, + (u_longlong_t)drrwbr->drr_refoffset); + } + break; + + case DRR_FREE: + if (do_byteswap) { + drrf->drr_object = BSWAP_64(drrf->drr_object); + drrf->drr_offset = BSWAP_64(drrf->drr_offset); + drrf->drr_length = BSWAP_64(drrf->drr_length); + } + if (verbose) { + (void) printf("FREE object = %llu " + "offset = %llu length = %lld\n", + (u_longlong_t)drrf->drr_object, + (u_longlong_t)drrf->drr_offset, + (longlong_t)drrf->drr_length); + } + break; + case DRR_SPILL: + if (do_byteswap) { + drrs->drr_object = BSWAP_64(drrs->drr_object); + drrs->drr_length = BSWAP_64(drrs->drr_length); + drrs->drr_compressed_size = + BSWAP_64(drrs->drr_compressed_size); + drrs->drr_type = BSWAP_32(drrs->drr_type); + } + + payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); + + if (verbose) { + sprintf_bytes(salt, drrs->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drrs->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drrs->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("SPILL block for object = %llu " + "length = %llu flags = %u " + "compression type = %u " + "compressed_size = %llu " + "payload_size = %llu " + "salt = 
%s iv = %s mac = %s\n", + (u_longlong_t)drrs->drr_object, + (u_longlong_t)drrs->drr_length, + drrs->drr_flags, + drrs->drr_compressiontype, + (u_longlong_t)drrs->drr_compressed_size, + (u_longlong_t)payload_size, + salt, + iv, + mac); + } + (void) ssread(buf, payload_size, &zc); + if (dump) { + print_block(buf, payload_size); + } + break; + case DRR_WRITE_EMBEDDED: + if (do_byteswap) { + drrwe->drr_object = + BSWAP_64(drrwe->drr_object); + drrwe->drr_offset = + BSWAP_64(drrwe->drr_offset); + drrwe->drr_length = + BSWAP_64(drrwe->drr_length); + drrwe->drr_toguid = + BSWAP_64(drrwe->drr_toguid); + drrwe->drr_lsize = + BSWAP_32(drrwe->drr_lsize); + drrwe->drr_psize = + BSWAP_32(drrwe->drr_psize); + } + if (verbose) { + (void) printf("WRITE_EMBEDDED object = %llu " + "offset = %llu length = %llu " + "toguid = %llx comp = %u etype = %u " + "lsize = %u psize = %u\n", + (u_longlong_t)drrwe->drr_object, + (u_longlong_t)drrwe->drr_offset, + (u_longlong_t)drrwe->drr_length, + (u_longlong_t)drrwe->drr_toguid, + drrwe->drr_compression, + drrwe->drr_etype, + drrwe->drr_lsize, + drrwe->drr_psize); + } + (void) ssread(buf, + P2ROUNDUP(drrwe->drr_psize, 8), &zc); + if (dump) { + print_block(buf, + P2ROUNDUP(drrwe->drr_psize, 8)); + } + payload_size = P2ROUNDUP(drrwe->drr_psize, 8); + break; + case DRR_OBJECT_RANGE: + if (do_byteswap) { + drror->drr_firstobj = + BSWAP_64(drror->drr_firstobj); + drror->drr_numslots = + BSWAP_64(drror->drr_numslots); + drror->drr_toguid = BSWAP_64(drror->drr_toguid); + } + if (verbose) { + sprintf_bytes(salt, drror->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drror->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drror->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("OBJECT_RANGE firstobj = %llu " + "numslots = %llu flags = %u " + "salt = %s iv = %s mac = %s\n", + (u_longlong_t)drror->drr_firstobj, + (u_longlong_t)drror->drr_numslots, + drror->drr_flags, + salt, + iv, + mac); + } + break; + case DRR_REDACT: + if (do_byteswap) { + 
drrr->drr_object = BSWAP_64(drrr->drr_object); + drrr->drr_offset = BSWAP_64(drrr->drr_offset); + drrr->drr_length = BSWAP_64(drrr->drr_length); + drrr->drr_toguid = BSWAP_64(drrr->drr_toguid); + } + if (verbose) { + (void) printf("REDACT object = %llu offset = " + "%llu length = %llu\n", + (u_longlong_t)drrr->drr_object, + (u_longlong_t)drrr->drr_offset, + (u_longlong_t)drrr->drr_length); + } + break; + case DRR_NUMTYPES: + /* should never be reached */ + exit(1); + } + if (drr->drr_type != DRR_BEGIN && very_verbose) { + (void) printf(" checksum = %llx/%llx/%llx/%llx\n", + (longlong_t)drrc->drr_checksum.zc_word[0], + (longlong_t)drrc->drr_checksum.zc_word[1], + (longlong_t)drrc->drr_checksum.zc_word[2], + (longlong_t)drrc->drr_checksum.zc_word[3]); + } + pcksum = zc; + drr_byte_count[drr->drr_type] += payload_size; + total_payload_size += payload_size; + } + free(buf); + fletcher_4_fini(); + + /* Print final summary */ + + (void) printf("SUMMARY:\n"); + (void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_BEGIN], + (u_longlong_t)drr_byte_count[DRR_BEGIN]); + (void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_END], + (u_longlong_t)drr_byte_count[DRR_END]); + (void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_OBJECT], + (u_longlong_t)drr_byte_count[DRR_OBJECT]); + (void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREEOBJECTS], + (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]); + (void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE], + (u_longlong_t)drr_byte_count[DRR_WRITE]); + (void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE_BYREF], + (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]); + (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld 
(%llu " + "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED], + (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]); + (void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREE], + (u_longlong_t)drr_byte_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_SPILL], + (u_longlong_t)drr_byte_count[DRR_SPILL]); + (void) printf("\tTotal records = %lld\n", + (u_longlong_t)total_records); + (void) printf("\tTotal payload size = %lld (0x%llx)\n", + (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size); + (void) printf("\tTotal header overhead = %lld (0x%llx)\n", + (u_longlong_t)total_overhead_size, + (u_longlong_t)total_overhead_size); + (void) printf("\tTotal stream length = %lld (0x%llx)\n", + (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c new file mode 100644 index 000000000000..379025ce59e5 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c @@ -0,0 +1,469 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. 
+ */ + +#include <assert.h> +#include <cityhash.h> +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <libzfs_impl.h> +#include <libzfs.h> +#include <libzutil.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <umem.h> +#include <unistd.h> +#include <sys/debug.h> +#include <sys/stat.h> +#include <sys/zfs_ioctl.h> +#include <sys/zio_checksum.h> +#include "zfs_fletcher.h" +#include "zstream.h" + + +#define MAX_RDT_PHYSMEM_PERCENT 20 +#define SMALLEST_POSSIBLE_MAX_RDT_MB 128 + +typedef struct redup_entry { + struct redup_entry *rde_next; + uint64_t rde_guid; + uint64_t rde_object; + uint64_t rde_offset; + uint64_t rde_stream_offset; +} redup_entry_t; + +typedef struct redup_table { + redup_entry_t **redup_hash_array; + umem_cache_t *ddecache; + uint64_t ddt_count; + int numhashbits; +} redup_table_t; + +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +static void * +safe_calloc(size_t n) +{ + void *rv = calloc(1, n); + if (rv == NULL) { + fprintf(stderr, + "Error: could not allocate %u bytes of memory\n", + (int)n); + exit(1); + } + return (rv); +} + +/* + * Safe version of fread(), exits on error. + */ +static int +sfread(void *buf, size_t size, FILE *fp) +{ + int rv = fread(buf, size, 1, fp); + if (rv == 0 && ferror(fp)) { + (void) fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + return (rv); +} + +/* + * Safe version of pread(), exits on error. 
+ */ +static void +spread(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t err = pread(fd, buf, count, offset); + if (err == -1) { + (void) fprintf(stderr, + "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } else if (err != count) { + (void) fprintf(stderr, + "Error while reading file: short read\n"); + exit(1); + } +} + +static int +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) +{ + assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) + == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. + drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); +} + +static void +rdt_insert(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); + redup_entry_t **rdepp; + + rdepp = &(rdt->redup_hash_array[hashcode]); + redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); + rde->rde_next = *rdepp; + rde->rde_guid = guid; + rde->rde_object = object; + rde->rde_offset = offset; + rde->rde_stream_offset = stream_offset; + *rdepp = rde; + rdt->ddt_count++; +} + +static void +rdt_lookup(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, + uint64_t *stream_offsetp) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, 
rdt->numhashbits); + + for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; + rde != NULL; rde = rde->rde_next) { + if (rde->rde_guid == guid && + rde->rde_object == object && + rde->rde_offset == offset) { + *stream_offsetp = rde->rde_stream_offset; + return; + } + } + assert(!"could not find expected redup table entry"); +} + +/* + * Convert a dedup stream (generated by "zfs send -D") to a + * non-deduplicated stream. The entire infd will be converted, including + * any substreams in a stream package (generated by "zfs send -RD"). The + * infd must be seekable. + */ +static void +zfs_redup_stream(int infd, int outfd, boolean_t verbose) +{ + int bufsz = SPA_MAXBLOCKSIZE; + dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t *drr = &thedrr; + redup_table_t rdt; + zio_cksum_t stream_cksum; + uint64_t numbuckets; + uint64_t num_records = 0; + uint64_t num_write_byref_records = 0; + +#ifdef _ILP32 + uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; +#else + uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); + uint64_t max_rde_size = + MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, + SMALLEST_POSSIBLE_MAX_RDT_MB << 20); +#endif + + numbuckets = max_rde_size / (sizeof (redup_entry_t)); + + /* + * numbuckets must be a power of 2. Increase number to + * a power of 2 if necessary. + */ + if (!ISP2(numbuckets)) + numbuckets = 1ULL << highbit64(numbuckets); + + rdt.redup_hash_array = + safe_calloc(numbuckets * sizeof (redup_entry_t *)); + rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + rdt.numhashbits = highbit64(numbuckets) - 1; + rdt.ddt_count = 0; + + char *buf = safe_calloc(bufsz); + FILE *ofp = fdopen(infd, "r"); + long offset = ftell(ofp); + while (sfread(drr, sizeof (*drr), ofp) != 0) { + num_records++; + + /* + * We need to regenerate the checksum. 
+ */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + + uint64_t payload_size = 0; + switch (drr->drr_type) { + case DRR_BEGIN: + { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int fflags; + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + + assert(drrb->drr_magic == DMU_BACKUP_MAGIC); + + /* clear the DEDUP feature flag for this stream */ + fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); + + int sz = drr->drr_payloadlen; + if (sz != 0) { + if (sz > bufsz) { + free(buf); + buf = safe_calloc(sz); + bufsz = sz; + } + (void) sfread(buf, sz, ofp); + } + payload_size = sz; + break; + } + + case DRR_END: + { + struct drr_end *drre = &drr->drr_u.drr_end; + /* + * Use the recalculated checksum, unless this is + * the END record of a stream package, which has + * no checksum. + */ + if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) + drre->drr_checksum = stream_cksum; + break; + } + + case DRR_OBJECT: + { + struct drr_object *drro = &drr->drr_u.drr_object; + + if (drro->drr_bonuslen > 0) { + payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); + (void) sfread(buf, payload_size, ofp); + } + break; + } + + case DRR_SPILL: + { + struct drr_spill *drrs = &drr->drr_u.drr_spill; + payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwb = + drr->drr_u.drr_write_byref; + + num_write_byref_records++; + + /* + * Look up in hash table by drrwb->drr_refguid, + * drr_refobject, drr_refoffset. Replace this + * record with the found WRITE record, but with + * drr_object,drr_offset,drr_toguid replaced with ours. 
+ */ + uint64_t stream_offset = 0; + rdt_lookup(&rdt, drrwb.drr_refguid, + drrwb.drr_refobject, drrwb.drr_refoffset, + &stream_offset); + + spread(infd, drr, sizeof (*drr), stream_offset); + + assert(drr->drr_type == DRR_WRITE); + struct drr_write *drrw = &drr->drr_u.drr_write; + assert(drrw->drr_toguid == drrwb.drr_refguid); + assert(drrw->drr_object == drrwb.drr_refobject); + assert(drrw->drr_offset == drrwb.drr_refoffset); + + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + spread(infd, buf, payload_size, + stream_offset + sizeof (*drr)); + + drrw->drr_toguid = drrwb.drr_toguid; + drrw->drr_object = drrwb.drr_object; + drrw->drr_offset = drrwb.drr_offset; + break; + } + + case DRR_WRITE: + { + struct drr_write *drrw = &drr->drr_u.drr_write; + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + (void) sfread(buf, payload_size, ofp); + + rdt_insert(&rdt, drrw->drr_toguid, + drrw->drr_object, drrw->drr_offset, offset); + break; + } + + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &drr->drr_u.drr_write_embedded; + payload_size = + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_FREEOBJECTS: + case DRR_FREE: + case DRR_OBJECT_RANGE: + break; + + default: + (void) fprintf(stderr, "INVALID record type 0x%x\n", + drr->drr_type); + /* should never happen, so assert */ + assert(B_FALSE); + } + + if (feof(ofp)) { + fprintf(stderr, "Error: unexpected end-of-file\n"); + exit(1); + } + if (ferror(ofp)) { + fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + + /* + * We need to recalculate the checksum, and it needs to be + * initially zero to do that. BEGIN records don't have + * a checksum. 
+ */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + if (dump_record(drr, buf, payload_size, + &stream_cksum, outfd) != 0) + break; + if (drr->drr_type == DRR_END) { + /* + * Typically the END record is either the last + * thing in the stream, or it is followed + * by a BEGIN record (which also zeros the checksum). + * However, a stream package ends with two END + * records. The last END record's checksum starts + * from zero. + */ + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + } + offset = ftell(ofp); + } + + if (verbose) { + char mem_str[16]; + zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), + mem_str, sizeof (mem_str)); + fprintf(stderr, "converted stream with %llu total records, " + "including %llu dedup records, using %sB memory.\n", + (long long)num_records, + (long long)num_write_byref_records, + mem_str); + } + + umem_cache_destroy(rdt.ddecache); + free(rdt.redup_hash_array); + free(buf); + (void) fclose(ofp); +} + +int +zstream_do_redup(int argc, char *argv[]) +{ + boolean_t verbose = B_FALSE; + char c; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + zstream_usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + zstream_usage(); + + const char *filename = argv[0]; + + if (isatty(STDOUT_FILENO)) { + (void) fprintf(stderr, + "Error: Stream can not be written to a terminal.\n" + "You must redirect standard output.\n"); + return (1); + } + + int fd = open(filename, O_RDONLY); + if (fd == -1) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + + fletcher_4_init(); + zfs_redup_stream(fd, STDOUT_FILENO, verbose); + fletcher_4_fini(); + + close(fd); + + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_token.c 
b/sys/contrib/openzfs/cmd/zstream/zstream_token.c new file mode 100644 index 000000000000..36a76a4bb851 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zstream/zstream_token.c @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2012 Martin Matuska <martin@matuska.org> + */ + +/* + * Copyright (c) 2020 by Datto Inc. All rights reserved. 
+ */ + +#include <ctype.h> +#include <libnvpair.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> +#include <stddef.h> + +#include <libzfs.h> +#include <libzfs_core.h> + +#include <sys/dmu.h> +#include <sys/zfs_ioctl.h> +#include "zstream.h" + +int +zstream_do_token(int argc, char *argv[]) +{ + char *resume_token = NULL; + + if (argc < 2) { + (void) fprintf(stderr, "Need to pass the resume token\n"); + zstream_usage(); + } + + resume_token = argv[1]; + + libzfs_handle_t *hdl = libzfs_init(); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + + if (resume_nvl == NULL) { + (void) fprintf(stderr, + "Unable to parse resume token: %s\n", + libzfs_error_description(hdl)); + libzfs_fini(hdl); + return (1); + } + + dump_nvlist(resume_nvl, 5); + nvlist_free(resume_nvl); + + libzfs_fini(hdl); + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am new file mode 100644 index 000000000000..2c04d8513150 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am @@ -0,0 +1 @@ +dist_sbin_SCRIPTS = zstreamdump diff --git a/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump new file mode 100755 index 000000000000..fbf02ee687f6 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump @@ -0,0 +1,3 @@ +#!/bin/sh + +zstream dump "$@" diff --git a/sys/contrib/openzfs/cmd/ztest/.gitignore b/sys/contrib/openzfs/cmd/ztest/.gitignore new file mode 100644 index 000000000000..d3d498dae693 --- /dev/null +++ b/sys/contrib/openzfs/cmd/ztest/.gitignore @@ -0,0 +1 @@ +/ztest diff --git a/sys/contrib/openzfs/cmd/ztest/Makefile.am b/sys/contrib/openzfs/cmd/ztest/Makefile.am new file mode 100644 index 000000000000..6042b44d1dde --- /dev/null +++ b/sys/contrib/openzfs/cmd/ztest/Makefile.am @@ -0,0 +1,23 @@ +include $(top_srcdir)/config/Rules.am + +# Get rid of compiler warning for unchecked 
truncating snprintfs on gcc 7.1.1 +AM_CFLAGS += $(NO_FORMAT_TRUNCATION) + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = ztest + +ztest_SOURCES = \ + ztest.c + +ztest_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +ztest_LDADD += -lm +ztest_LDFLAGS = -pthread diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c new file mode 100644 index 000000000000..31205a5bf8cf --- /dev/null +++ b/sys/contrib/openzfs/cmd/ztest/ztest.c @@ -0,0 +1,7818 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +/* + * The objective of this program is to provide a DMU/ZAP/SPA stress test + * that runs entirely in userland, is easy to use, and easy to extend. + * + * The overall design of the ztest program is as follows: + * + * (1) For each major functional area (e.g. adding vdevs to a pool, + * creating and destroying datasets, reading and writing objects, etc) + * we have a simple routine to test that functionality. These + * individual routines do not have to do anything "stressful". + * + * (2) We turn these simple functionality tests into a stress test by + * running them all in parallel, with as many threads as desired, + * and spread across as many datasets, objects, and vdevs as desired. + * + * (3) While all this is happening, we inject faults into the pool to + * verify that self-healing data really works. + * + * (4) Every time we open a dataset, we change its checksum and compression + * functions. Thus even individual objects vary from block to block + * in which checksum they use and whether they're compressed. + * + * (5) To verify that we never lose on-disk consistency after a crash, + * we run the entire test in a child of the main process. + * At random times, the child self-immolates with a SIGKILL. + * This is the software equivalent of pulling the power cord. + * The parent then runs the test again, using the existing + * storage pool, as many times as desired. If backwards compatibility + * testing is enabled ztest will sometimes run the "older" version + * of ztest after a SIGKILL. + * + * (6) To verify that we don't have future leaks or temporal incursions, + * many of the functional tests record the transaction group number + * as part of their data. When reading old data, they verify that + * the transaction group number is less than the current, open txg. + * If you add a new test, please do this if applicable. + * + * (7) Threads are created with a reduced stack size, for sanity checking. 
+ * Therefore, it's important not to allocate huge buffers on the stack. + * + * When run with no arguments, ztest runs for about five minutes and + * produces no output if successful. To get a little bit of information, + * specify -V. To get more information, specify -VV, and so on. + * + * To turn this into an overnight stress test, use -T to specify run time. + * + * You can ask more vdevs [-v], datasets [-d], or threads [-t] + * to increase the pool capacity, fanout, and overall stress level. + * + * Use the -k option to set the desired frequency of kills. + * + * When ztest invokes itself it passes all relevant information through a + * temporary file which is mmap-ed in the child process. This allows shared + * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always + * stored at offset 0 of this file and contains information on the size and + * number of shared structures in the file. The information stored in this file + * must remain backwards compatible with older versions of ztest so that + * ztest can invoke them during backwards compatibility testing (-B). 
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_trim.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <zfs_fletcher.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <sys/crypto/icp.h>
#ifdef __GLIBC__
#include <execinfo.h> /* for backtrace() */
#endif

/*
 * File descriptors opened elsewhere in this file.  ztest_fd_rand is the
 * source that ztest_random() reads its random 64-bit values from.
 * NOTE(review): ztest_fd_data is presumably the shared temporary file
 * described in the header comment -- confirm against its opener.
 */
static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

/*
 * Header of the shared file: records the size (and, for arrays, the
 * element count) of each shared structure stored in the file.
 */
typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

/* Values accepted by the -C option ("special=on|off|random"). */
enum ztest_class_state {
	ZTEST_VDEV_CLASS_OFF,
	ZTEST_VDEV_CLASS_ON,
	ZTEST_VDEV_CLASS_RND
};

/*
 * Run-time options.  Filled in from ztest_opts_defaults and then from
 * the command line by process_options(); the option letter that sets
 * each field is noted where one exists.
 */
typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];	/* -p pool name */
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];	/* -f vdev file directory */
	char zo_alt_ztest[MAXNAMELEN];		/* derived from -B */
	char zo_alt_libpath[MAXNAMELEN];	/* derived from -B */
	uint64_t zo_vdevs;			/* -v */
	uint64_t zo_vdevtime;		/* zo_time * NANOSEC / zo_vdevs */
	size_t zo_vdev_size;			/* -s */
	int zo_ashift;				/* -a, 0 means random */
	int zo_mirrors;				/* -m */
	int zo_raidz;				/* -r */
	int zo_raidz_parity;			/* -R, clamped to 1..3 */
	int zo_datasets;			/* -d */
	int zo_threads;				/* -t */
	uint64_t zo_passtime;			/* -P, seconds */
	uint64_t zo_killrate;			/* -k, percent */
	int zo_verbose;				/* -V, incremented per use */
	int zo_init;				/* -i, or 0 with -E */
	uint64_t zo_time;			/* -T, seconds */
	uint64_t zo_maxloops;			/* -F */
	uint64_t zo_metaslab_force_ganging;	/* -g */
	int zo_mmp_test;			/* -M */
	int zo_special_vdevs;		/* -C, enum ztest_class_state */
	int zo_dump_dbgmsg;			/* -G */
} ztest_shared_opts_t;

/* Defaults copied into ztest_opts before the command line is parsed. */
static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = "ztest",
	.zo_dir = "/tmp",
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = 5,
	.zo_ashift = SPA_MINBLOCKSHIFT,
	.zo_mirrors = 2,
	.zo_raidz = 4,
	.zo_raidz_parity = 1,
	.zo_vdev_size = SPA_MINDEVSIZE * 4,	/* 256m default size */
	.zo_datasets = 7,
	.zo_threads = 23,
	.zo_passtime = 60,		/* 60 seconds */
	.zo_killrate = 70,		/* 70% kill rate */
	.zo_verbose = 0,
	.zo_mmp_test = 0,
	.zo_init = 1,
	.zo_time = 300,			/* 5 minutes */
	.zo_maxloops = 50,		/* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = 64 << 10,
	.zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
};

/* Tunables defined outside this file. */
extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern unsigned long zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern int zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;


static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
/* Wrapping key material handed to the keystore by ztest_dmu_objset_own(). */
static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";

/* Per-dataset state kept in the shared file. */
typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

/* Magic value stamped into every block tag; checked by ztest_bt_verify(). */
#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS(zs) \
	(MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

/*
 * Tag identifying the objset, object, offset and txgs a block belongs
 * to; written by ztest_bt_generate() and checked by ztest_bt_verify().
 */
typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object. Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

/*
 * Reader/writer lock built from a mutex and a condition variable;
 * see ztest_rll_lock() and ztest_rll_unlock().
 */
typedef struct rll {
	void		*rll_writer;	/* owning writer thread, or NULL */
	int		rll_readers;	/* number of active readers */
	kmutex_t	rll_lock;
	kcondvar_t	rll_cv;
} rll_t;

/* Handle returned by ztest_range_lock() and freed by ztest_range_unlock(). */
typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

/* Both counts are used as (N - 1) bit masks, so they must be powers of two. */
#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor. Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	pthread_rwlock_t zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
/* Signature shared by every test function listed in ztest_info[]. */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	const char	*zi_funcname;	/* name of test function */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

/* Prototypes of the test functions; all share the ztest_func_t signature. */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_mmp_enable_disable;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_class_add;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;

/* Scheduling intervals, in nanoseconds, referenced by ztest_info[]. */
uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

/*
 * Build one ztest_info_t entry; "# func" stringizes the function name
 * into zi_funcname.
 */
#define	ZTI_INIT(func, iters, interval) \
	{   .zi_func = (func), \
	    .zi_iters = (iters), \
	    .zi_interval = (interval), \
	    .zi_funcname = # func }

/*
 * Table of test functions with their iteration counts and intervals.
 * The vdev add/remove entries point at ztest_opts.zo_vdevtime, which
 * process_options() derives from the run time and the vdev count.
 */
ztest_info_t ztest_info[] = {
	ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
	ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
	ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
	ZTI_INIT(ztest_zap, 30, &zopt_always),
	ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
	ZTI_INIT(ztest_split_pool, 1, &zopt_always),
	ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
	ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
	ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
	ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
	ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
#if 0
	ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
#endif
	ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
	ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
	ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
	ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
	ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
	ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
	ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
	ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
	ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
	ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
	ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
	ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
	ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
	ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
};

/* Number of entries in ztest_info[]. */
#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count; /* bumped by ztest_record_enospc() */
	uint64_t	zs_vdev_next_leaf; /* next leaf number for file names */
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;	/* recorded by ztest_kill() */
	uint64_t	zs_space;	/* recorded by ztest_kill() */
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

/* printf-style templates used by make_vdev_file() to name vdev files. */
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static boolean_t ztest_pool_scrubbed = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer.
Grabbing the lock as reader will ensure that the + * namespace does not change while the lock is held. + */ +static pthread_rwlock_t ztest_name_lock; + +static boolean_t ztest_dump_core = B_TRUE; +static boolean_t ztest_exiting; + +/* Global commit callback list */ +static ztest_cb_list_t zcl; +/* Commit cb delay */ +static uint64_t zc_min_txg_delay = UINT64_MAX; +static int zc_cb_counter = 0; + +/* + * Minimum number of commit callbacks that need to be registered for us to check + * whether the minimum txg delay is acceptable. + */ +#define ZTEST_COMMIT_CB_MIN_REG 100 + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) + +enum ztest_object { + ZTEST_META_DNODE = 0, + ZTEST_DIROBJ, + ZTEST_OBJECTS +}; + +static void usage(boolean_t) __NORETURN; +static int ztest_scrub_impl(spa_t *spa); + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +static void +dump_debug_buffer(void) +{ + ssize_t ret __attribute__((unused)); + + if (!ztest_opts.zo_dump_dbgmsg) + return; + + /* + * We use write() instead of printf() so that this function + * is safe to call from a signal handler. 
+ */ + ret = write(STDOUT_FILENO, "\n", 1); + zfs_dbgmsg_print("ztest"); +} + +#define BACKTRACE_SZ 100 + +static void sig_handler(int signo) +{ + struct sigaction action; +#ifdef __GLIBC__ /* backtrace() is a GNU extension */ + int nptrs; + void *buffer[BACKTRACE_SZ]; + + nptrs = backtrace(buffer, BACKTRACE_SZ); + backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); +#endif + dump_debug_buffer(); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); +} + +#define FATAL_MSG_SZ 1024 + +char *fatal_msg; + +static void +fatal(int do_perror, char *message, ...) +{ + va_list args; + int save_errno = errno; + char *buf; + + (void) fflush(stdout); + buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); + + va_start(args, message); + (void) sprintf(buf, "ztest: "); + /* LINTED */ + (void) vsprintf(buf + strlen(buf), message, args); + va_end(args); + if (do_perror) { + (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), + ": %s", strerror(save_errno)); + } + (void) fprintf(stderr, "%s\n", buf); + fatal_msg = buf; /* to ease debugging */ + + if (ztest_dump_core) + abort(); + else + dump_debug_buffer(); + + exit(3); +} + +static int +str2shift(const char *buf) +{ + const char *ends = "BKMGTPEZ"; + int i; + + if (buf[0] == '\0') + return (0); + for (i = 0; i < strlen(ends); i++) { + if (toupper(buf[0]) == ends[i]) + break; + } + if (i == strlen(ends)) { + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", + buf); + usage(B_FALSE); + } + if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { + return (10*i); + } + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); + usage(B_FALSE); + /* NOTREACHED */ +} + +static uint64_t +nicenumtoull(const char *buf) +{ + char *end; + uint64_t val; + + val = strtoull(buf, &end, 0); + if (end == buf) { + 
(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); + usage(B_FALSE); + } else if (end[0] == '.') { + double fval = strtod(buf, &end); + fval *= pow(2, str2shift(end)); + /* + * UINT64_MAX is not exactly representable as a double. + * The closest representation is UINT64_MAX + 1, so we + * use a >= comparison instead of > for the bounds check. + */ + if (fval >= (double)UINT64_MAX) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val = (uint64_t)fval; + } else { + int shift = str2shift(end); + if (shift >= 64 || (val << shift) >> shift != val) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val <<= shift; + } + return (val); +} + +static void +usage(boolean_t requested) +{ + const ztest_shared_opts_t *zo = &ztest_opts_defaults; + + char nice_vdev_size[NN_NUMBUF_SZ]; + char nice_force_ganging[NN_NUMBUF_SZ]; + FILE *fp = requested ? stdout : stderr; + + nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); + nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, + sizeof (nice_force_ganging)); + + (void) fprintf(fp, "Usage: %s\n" + "\t[-v vdevs (default: %llu)]\n" + "\t[-s size_of_each_vdev (default: %s)]\n" + "\t[-a alignment_shift (default: %d)] use 0 for random\n" + "\t[-m mirror_copies (default: %d)]\n" + "\t[-r raidz_disks (default: %d)]\n" + "\t[-R raidz_parity (default: %d)]\n" + "\t[-d datasets (default: %d)]\n" + "\t[-t threads (default: %d)]\n" + "\t[-g gang_block_threshold (default: %s)]\n" + "\t[-i init_count (default: %d)] initialize pool i times\n" + "\t[-k kill_percentage (default: %llu%%)]\n" + "\t[-p pool_name (default: %s)]\n" + "\t[-f dir (default: %s)] file directory for vdev files\n" + "\t[-M] Multi-host simulate pool imported on remote host\n" + "\t[-V] verbose (use multiple times for ever more blather)\n" + "\t[-E] use existing pool instead of creating new one\n" + "\t[-T time (default: %llu sec)] total run time\n" + "\t[-F 
freezeloops (default: %llu)] max loops in spa_freeze()\n" + "\t[-P passtime (default: %llu sec)] time per pass\n" + "\t[-B alt_ztest (default: <none>)] alternate ztest path\n" + "\t[-C vdev class state (default: random)] special=on|off|random\n" + "\t[-o variable=value] ... set global variable to an unsigned\n" + "\t 32-bit integer value\n" + "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" + "\t[-h] (print help)\n" + "", + zo->zo_pool, + (u_longlong_t)zo->zo_vdevs, /* -v */ + nice_vdev_size, /* -s */ + zo->zo_ashift, /* -a */ + zo->zo_mirrors, /* -m */ + zo->zo_raidz, /* -r */ + zo->zo_raidz_parity, /* -R */ + zo->zo_datasets, /* -d */ + zo->zo_threads, /* -t */ + nice_force_ganging, /* -g */ + zo->zo_init, /* -i */ + (u_longlong_t)zo->zo_killrate, /* -k */ + zo->zo_pool, /* -p */ + zo->zo_dir, /* -f */ + (u_longlong_t)zo->zo_time, /* -T */ + (u_longlong_t)zo->zo_maxloops, /* -F */ + (u_longlong_t)zo->zo_passtime); + exit(requested ? 0 : 1); +} + + +static void +ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) +{ + char name[32]; + char *value; + int state = ZTEST_VDEV_CLASS_RND; + + (void) strlcpy(name, input, sizeof (name)); + + value = strchr(name, '='); + if (value == NULL) { + (void) fprintf(stderr, "missing value in property=value " + "'-C' argument (%s)\n", input); + usage(B_FALSE); + } + *(value) = '\0'; + value++; + + if (strcmp(value, "on") == 0) { + state = ZTEST_VDEV_CLASS_ON; + } else if (strcmp(value, "off") == 0) { + state = ZTEST_VDEV_CLASS_OFF; + } else if (strcmp(value, "random") == 0) { + state = ZTEST_VDEV_CLASS_RND; + } else { + (void) fprintf(stderr, "invalid property value '%s'\n", value); + usage(B_FALSE); + } + + if (strcmp(name, "special") == 0) { + zo->zo_special_vdevs = state; + } else { + (void) fprintf(stderr, "invalid property name '%s'\n", name); + usage(B_FALSE); + } + if (zo->zo_verbose >= 3) + (void) printf("%s vdev state is '%s'\n", name, value); +} + +static void +process_options(int argc, char 
**argv) +{ + char *path; + ztest_shared_opts_t *zo = &ztest_opts; + + int opt; + uint64_t value; + char altdir[MAXNAMELEN] = { 0 }; + + bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); + + while ((opt = getopt(argc, argv, + "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + value = 0; + switch (opt) { + case 'v': + case 's': + case 'a': + case 'm': + case 'r': + case 'R': + case 'd': + case 't': + case 'g': + case 'i': + case 'k': + case 'T': + case 'P': + case 'F': + value = nicenumtoull(optarg); + } + switch (opt) { + case 'v': + zo->zo_vdevs = value; + break; + case 's': + zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); + break; + case 'a': + zo->zo_ashift = value; + break; + case 'm': + zo->zo_mirrors = value; + break; + case 'r': + zo->zo_raidz = MAX(1, value); + break; + case 'R': + zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + break; + case 'd': + zo->zo_datasets = MAX(1, value); + break; + case 't': + zo->zo_threads = MAX(1, value); + break; + case 'g': + zo->zo_metaslab_force_ganging = + MAX(SPA_MINBLOCKSIZE << 1, value); + break; + case 'i': + zo->zo_init = value; + break; + case 'k': + zo->zo_killrate = value; + break; + case 'p': + (void) strlcpy(zo->zo_pool, optarg, + sizeof (zo->zo_pool)); + break; + case 'f': + path = realpath(optarg, NULL); + if (path == NULL) { + (void) fprintf(stderr, "error: %s: %s\n", + optarg, strerror(errno)); + usage(B_FALSE); + } else { + (void) strlcpy(zo->zo_dir, path, + sizeof (zo->zo_dir)); + free(path); + } + break; + case 'M': + zo->zo_mmp_test = 1; + break; + case 'V': + zo->zo_verbose++; + break; + case 'E': + zo->zo_init = 0; + break; + case 'T': + zo->zo_time = value; + break; + case 'P': + zo->zo_passtime = MAX(1, value); + break; + case 'F': + zo->zo_maxloops = MAX(1, value); + break; + case 'B': + (void) strlcpy(altdir, optarg, sizeof (altdir)); + break; + case 'C': + ztest_parse_name_value(optarg, zo); + break; + case 'o': + if (set_global_var(optarg) != 0) + usage(B_FALSE); + break; + case 'G': + 
zo->zo_dump_dbgmsg = 1; + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } + + zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + + zo->zo_vdevtime = + (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : + UINT64_MAX >> 2); + + if (strlen(altdir) > 0) { + char *cmd; + char *realaltdir; + char *bin; + char *ztest; + char *isa; + int isalen; + + cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + VERIFY(NULL != realpath(getexecname(), cmd)); + if (0 != access(altdir, F_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest path: %s", + altdir); + } + VERIFY(NULL != realpath(altdir, realaltdir)); + + /* + * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". + * We want to extract <isa> to determine if we should use + * 32 or 64 bit binaries. + */ + bin = strstr(cmd, "/usr/bin/"); + ztest = strstr(bin, "/ztest"); + isa = bin + 9; + isalen = ztest - isa; + (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), + "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); + (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), + "%s/usr/lib/%.*s", realaltdir, isalen, isa); + + if (0 != access(zo->zo_alt_ztest, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest: %s", + zo->zo_alt_ztest); + } else if (0 != access(zo->zo_alt_libpath, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate lib directory %s", + zo->zo_alt_libpath); + } + + umem_free(cmd, MAXPATHLEN); + umem_free(realaltdir, MAXPATHLEN); + } +} + +static void +ztest_kill(ztest_shared_t *zs) +{ + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); + + /* + * Before we kill off ztest, make sure that the config is updated. + * See comment above spa_write_cachefile(). 
+ */ + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + + (void) kill(getpid(), SIGKILL); +} + +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} + +/* ARGSUSED */ +static void +ztest_record_enospc(const char *s) +{ + ztest_shared->zs_enospc_count++; +} + +static uint64_t +ztest_get_ashift(void) +{ + if (ztest_opts.zo_ashift == 0) + return (SPA_MINBLOCKSHIFT + ztest_random(5)); + return (ztest_opts.zo_ashift); +} + +static nvlist_t * +make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) +{ + char *pathbuf; + uint64_t vdev; + nvlist_t *file; + + pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + if (ashift == 0) + ashift = ztest_get_ashift(); + + if (path == NULL) { + path = pathbuf; + + if (aux != NULL) { + vdev = ztest_shared->zs_vdev_aux; + (void) snprintf(path, MAXPATHLEN, + ztest_aux_template, ztest_opts.zo_dir, + pool == NULL ? ztest_opts.zo_pool : pool, + aux, vdev); + } else { + vdev = ztest_shared->zs_vdev_next_leaf++; + (void) snprintf(path, MAXPATHLEN, + ztest_dev_template, ztest_opts.zo_dir, + pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); + } + } + + if (size != 0) { + int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) + fatal(1, "can't open %s", path); + if (ftruncate(fd, size) != 0) + fatal(1, "can't ftruncate %s", path); + (void) close(fd); + } + + VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + umem_free(pathbuf, MAXPATHLEN); + + return (file); +} + +static nvlist_t * +make_vdev_raidz(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r) +{ + nvlist_t *raidz, **child; + int c; + + if (r < 2) + return (make_vdev_file(path, aux, pool, size, ashift)); + child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < r; c++) + child[c] = make_vdev_file(path, aux, pool, size, ashift); + + VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_RAIDZ) == 0); + VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raidz_parity) == 0); + VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, + child, r) == 0); + + for (c = 0; c < r; c++) + nvlist_free(child[c]); + + umem_free(child, r * sizeof (nvlist_t *)); + + return (raidz); +} + +static nvlist_t * +make_vdev_mirror(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r, int m) +{ + nvlist_t *mirror, **child; + int c; + + if (m < 1) + return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + + child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < m; c++) + child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + + VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MIRROR) == 0); + VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, + 
child, m) == 0); + + for (c = 0; c < m; c++) + nvlist_free(child[c]); + + umem_free(child, m * sizeof (nvlist_t *)); + + return (mirror); +} + +static nvlist_t * +make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, + const char *class, int r, int m, int t) +{ + nvlist_t *root, **child; + int c; + boolean_t log; + + ASSERT(t > 0); + + log = (class != NULL && strcmp(class, "log") == 0); + + child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < t; c++) { + child[c] = make_vdev_mirror(path, aux, pool, size, ashift, + r, m); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + log) == 0); + + if (class != NULL && class[0] != '\0') { + ASSERT(m > 1 || log); /* expecting a mirror */ + VERIFY(nvlist_add_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); + } + } + + VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, + child, t) == 0); + + for (c = 0; c < t; c++) + nvlist_free(child[c]); + + umem_free(child, t * sizeof (nvlist_t *)); + + return (root); +} + +/* + * Find a random spa version. Returns back a random spa version in the + * range [initial_version, SPA_VERSION_FEATURES]. + */ +static uint64_t +ztest_random_spa_version(uint64_t initial_version) +{ + uint64_t version = initial_version; + + if (version <= SPA_VERSION_BEFORE_FEATURES) { + version = version + + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); + } + + if (version > SPA_VERSION_BEFORE_FEATURES) + version = SPA_VERSION_FEATURES; + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + return (version); +} + +static int +ztest_random_blocksize(void) +{ + ASSERT(ztest_spa->spa_max_ashift != 0); + + /* + * Choose a block size >= the ashift. + * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
+ */ + int maxbs = SPA_OLD_MAXBLOCKSHIFT; + if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) + maxbs = 20; + uint64_t block_shift = + ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); +} + +static int +ztest_random_dnodesize(void) +{ + int slots; + int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; + + if (max_slots == DNODE_MIN_SLOTS) + return (DNODE_MIN_SIZE); + + /* + * Weight the random distribution more heavily toward smaller + * dnode sizes since that is more likely to reflect real-world + * usage. + */ + ASSERT3U(max_slots, >, 4); + switch (ztest_random(10)) { + case 0: + slots = 5 + ztest_random(max_slots - 4); + break; + case 1 ... 4: + slots = 2 + ztest_random(3); + break; + default: + slots = 1; + break; + } + + return (slots << DNODE_SHIFT); +} + +static int +ztest_random_ibshift(void) +{ + return (DN_MIN_INDBLKSHIFT + + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); +} + +static uint64_t +ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) +{ + uint64_t top; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + do { + top = ztest_random(rvd->vdev_children); + tvd = rvd->vdev_child[top]; + } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || + tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); + + return (top); +} + +static uint64_t +ztest_random_dsl_prop(zfs_prop_t prop) +{ + uint64_t value; + + do { + value = zfs_prop_random_value(prop, ztest_random(-1ULL)); + } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); + + return (value); +} + +static int +ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, + boolean_t inherit) +{ + const char *propname = zfs_prop_to_name(prop); + const char *valname; + char *setpoint; + uint64_t curval; + int error; + + error = dsl_prop_set_int(osname, propname, + (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT0(error); + + setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); + + if (ztest_opts.zo_verbose >= 6) { + int err; + + err = zfs_prop_index_to_string(prop, curval, &valname); + if (err) + (void) printf("%s %s = %llu at '%s'\n", osname, + propname, (unsigned long long)curval, setpoint); + else + (void) printf("%s %s = %s at '%s'\n", + osname, propname, valname, setpoint); + } + umem_free(setpoint, MAXPATHLEN); + + return (error); +} + +static int +ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) +{ + spa_t *spa = ztest_spa; + nvlist_t *props = NULL; + int error; + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); + + error = spa_prop_set(spa, props); + + nvlist_free(props); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT0(error); + + return (error); +} + +static int +ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) +{ + int err; + char *cp = NULL; + char ddname[ZFS_MAX_DATASET_NAME_LEN]; + + strcpy(ddname, name); + cp = strchr(ddname, '@'); + if (cp != NULL) + *cp = '\0'; + + err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); + while (decrypt && err == EACCES) { + dsl_crypto_params_t *dcp; + nvlist_t *crypto_args = fnvlist_alloc(); + + fnvlist_add_uint8_array(crypto_args, "wkeydata", + (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); + VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, + crypto_args, &dcp)); + err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); + dsl_crypto_params_free(dcp, B_FALSE); + fnvlist_free(crypto_args); + + if (err == EINVAL) { + /* + * We couldn't load a key for this dataset so try + * the parent. 
This loop will eventually hit the + * encryption root since ztest only makes clones + * as children of their origin datasets. + */ + cp = strrchr(ddname, '/'); + if (cp == NULL) + return (err); + + *cp = '\0'; + err = EACCES; + continue; + } else if (err != 0) { + break; + } + + err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); + break; + } + + return (err); +} + +static void +ztest_rll_init(rll_t *rll) +{ + rll->rll_writer = NULL; + rll->rll_readers = 0; + mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); +} + +static void +ztest_rll_destroy(rll_t *rll) +{ + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + mutex_destroy(&rll->rll_lock); + cv_destroy(&rll->rll_cv); +} + +static void +ztest_rll_lock(rll_t *rll, rl_type_t type) +{ + mutex_enter(&rll->rll_lock); + + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cv_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cv_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } + + mutex_exit(&rll->rll_lock); +} + +static void +ztest_rll_unlock(rll_t *rll) +{ + mutex_enter(&rll->rll_lock); + + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; + } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } + + if (rll->rll_writer == NULL && rll->rll_readers == 0) + cv_broadcast(&rll->rll_cv); + + mutex_exit(&rll->rll_lock); +} + +static void +ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_lock(rll, type); +} + +static void +ztest_object_unlock(ztest_ds_t *zd, uint64_t object) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_unlock(rll); +} + +static rl_t * +ztest_range_lock(ztest_ds_t *zd, uint64_t object, 
uint64_t offset, + uint64_t size, rl_type_t type) +{ + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; + + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; + + ztest_rll_lock(rll, type); + + return (rl); +} + +static void +ztest_range_unlock(rl_t *rl) +{ + rll_t *rll = rl->rl_lock; + + ztest_rll_unlock(rll); + + umem_free(rl, sizeof (*rl)); +} + +static void +ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) +{ + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_shared = szd; + dmu_objset_name(os, zd->zd_name); + int l; + + if (zd->zd_shared != NULL) + zd->zd_shared->zd_seq = 0; + + VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); + mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); + + for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + + for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); +} + +static void +ztest_zd_fini(ztest_ds_t *zd) +{ + int l; + + mutex_destroy(&zd->zd_dirobj_lock); + (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); + + for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); + + for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); +} + +#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) + +static uint64_t +ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) +{ + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. 
+ */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); +} + +static void +ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) +{ + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_dnodesize = dnodesize; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; +} + +static void +ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) +{ + ASSERT3U(bt->bt_magic, ==, BT_MAGIC); + ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); + ASSERT3U(bt->bt_object, ==, object); + ASSERT3U(bt->bt_dnodesize, ==, dnodesize); + ASSERT3U(bt->bt_offset, ==, offset); + ASSERT3U(bt->bt_gen, <=, gen); + ASSERT3U(bt->bt_txg, <=, txg); + ASSERT3U(bt->bt_crtxg, ==, crtxg); +} + +static ztest_block_tag_t * +ztest_bt_bonus(dmu_buf_t *db) +{ + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); +} + +/* + * Generate a token to fill up unused bonus buffer space. Try to make + * it unique to the object, generation, and offset to verify that data + * is not getting overwritten by data from other dnodes. + */ +#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ + (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) + +/* + * Fill up the unused bonus buffer region before the block tag with a + * verifiable pattern. 
Filling the whole bonus area with non-zero data + * helps ensure that all dnode traversal code properly skips the + * interior regions of large dnodes. + */ +static void +ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + *bonusp = token; + } +} + +/* + * Verify that the unused area of a bonus buffer is filled with the + * expected tokens. + */ +static void +ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + VERIFY3U(*bonusp, ==, token); + } +} + +/* + * ZIL logging ops + */ + +#define lrz_type lr_mode +#define lrz_blocksize lr_uid +#define lrz_ibshift lr_gid +#define lrz_bonustype lr_rdev +#define lrz_dnodesize lr_crtime[1] + +static void +ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + 
sizeof (*lr) + namesize - sizeof (lr_t)); + + itx->itx_oid = object; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) +{ + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +/* + * ZIL replay ops + */ +static int +ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_create_t *lr = arg2; + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t 
*db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + int bonuslen; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create_dnsize(os, + lr->lrz_type, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } else { + error = zap_create_claim_dnsize(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = dmu_object_alloc_dnsize(os, + lr->lrz_type, 0, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } else { + error = dmu_object_claim_dnsize(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, + lr->lr_gen, txg, txg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + 
dmu_tx_commit(tx); + + return (0); +} + +static int +ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_remove_t *lr = arg2; + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) ztest_log_remove(zd, tx, lr, object); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, object); + + return (0); +} + +static int +ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_write_t *lr = arg2; + objset_t *os = zd->zd_os; + void *data = lr + 1; /* data follows lr */ + uint64_t offset, length; + ztest_block_tag_t *bt = data; + ztest_block_tag_t *bbt; + uint64_t gen, txg, lrtxg, crtxg; + dmu_object_info_t doi; + dmu_tx_t *tx; + dmu_buf_t *db; + arc_buf_t *abuf = NULL; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + 
offset -= offset % blocksize; + length = blocksize; + } + } + + if (bt->bt_magic == BSWAP_64(BT_MAGIC)) + byteswap_uint64_array(bt, sizeof (*bt)); + + if (bt->bt_magic != BT_MAGIC) + bt = NULL; + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + dmu_object_info_from_db(db, &doi); + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + gen = bbt->bt_gen; + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, lr->lr_foid, offset, length); + + if (ztest_random(8) == 0 && length == doi.doi_data_block_size && + P2PHASE(offset, length) == 0) + abuf = dmu_request_arcbuf(db, length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + dmu_buf_rele(db, FTAG); + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + if (bt != NULL) { + /* + * Usually, verify the old data before writing new data -- + * but not always, because we also want to verify correct + * behavior when the data was not recently read into cache. + */ + ASSERT(offset % doi.doi_data_block_size == 0); + if (ztest_random(4) != 0) { + int prefetch = ztest_random(2) ? + DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + ztest_block_tag_t rbt; + + VERIFY(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, prefetch) == 0); + if (rbt.bt_magic == BT_MAGIC) { + ztest_bt_verify(&rbt, os, lr->lr_foid, 0, + offset, gen, txg, crtxg); + } + } + + /* + * Writes can appear to be newer than the bonus buffer because + * the ztest_get_data() callback does a dmu_read() of the + * open-context data, which may be different than the data + * as it was when the write was generated. 
+ */ + if (zd->zd_zilog->zl_replay) { + ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, + MAX(gen, bt->bt_gen), MAX(txg, lrtxg), + bt->bt_crtxg); + } + + /* + * Set the bt's gen/txg to the bonus buffer's gen/txg + * so that all of the usual ASSERTs will work. + */ + ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, + crtxg); + } + + if (abuf == NULL) { + dmu_write(os, lr->lr_foid, offset, length, data, tx); + } else { + bcopy(data, abuf->b_data, length); + dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); + } + + (void) ztest_log_write(zd, tx, lr); + + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_truncate_t *lr = arg2; + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, + lr->lr_length, tx) == 0); + + (void) ztest_log_truncate(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_setattr_t *lr = arg2; + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + dmu_buf_t *db; + ztest_block_tag_t *bbt; + uint64_t txg, lrtxg, crtxg, dnodesize; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + + VERIFY3U(0, ==, 
dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, lr->lr_foid); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + dnodesize = bbt->bt_dnodesize; + + if (zd->zd_zilog->zl_replay) { + ASSERT(lr->lr_size != 0); + ASSERT(lr->lr_mode != 0); + ASSERT(lrtxg != 0); + } else { + /* + * Randomly change the size and increment the generation. + */ + lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * + sizeof (*bbt); + lr->lr_mode = bbt->bt_gen + 1; + ASSERT(lrtxg == 0); + } + + /* + * Verify that the current bonus buffer is not newer than our txg. + */ + ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, + MAX(txg, lrtxg), crtxg); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); + ASSERT3U(lr->lr_size, <=, db->db_size); + VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); + bbt = ztest_bt_bonus(db); + + ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, + txg, crtxg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); + dmu_buf_rele(db, FTAG); + + (void) ztest_log_setattr(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { + NULL, /* 0 no such transaction type */ + ztest_replay_create, /* TX_CREATE */ + NULL, /* TX_MKDIR */ + NULL, /* TX_MKXATTR */ + NULL, /* TX_SYMLINK */ + ztest_replay_remove, /* TX_REMOVE */ + NULL, /* TX_RMDIR */ + NULL, /* TX_LINK */ + NULL, /* TX_RENAME */ + ztest_replay_write, /* TX_WRITE */ + ztest_replay_truncate, /* TX_TRUNCATE */ + ztest_replay_setattr, /* TX_SETATTR */ + NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + 
NULL, /* TX_MKDIR_ACL */ + NULL, /* TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ +}; + +/* + * ZIL get_data callbacks + */ + +/* ARGSUSED */ +static void +ztest_get_done(zgd_t *zgd, int error) +{ + ztest_ds_t *zd = zgd->zgd_private; + uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + ztest_range_unlock((rl_t *)zgd->zgd_lr); + ztest_object_unlock(zd, object); + + umem_free(zgd, sizeof (*zgd)); +} + +static int +ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, + zio_t *zio) +{ + ztest_ds_t *zd = arg; + objset_t *os = zd->zd_os; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + uint64_t txg = lr->lr_common.lrc_txg; + uint64_t crtxg; + dmu_object_info_t doi; + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + ztest_object_lock(zd, object, RL_READER); + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) { + ztest_object_unlock(zd, object); + return (error); + } + + crtxg = ztest_bt_bonus(db)->bt_crtxg; + + if (crtxg == 0 || crtxg > txg) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, object); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + dmu_buf_rele(db, FTAG); + db = NULL; + + zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zd; + + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); + + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + ASSERT(error == 0); + } else { + size = doi.doi_data_block_size; + if (ISP2(size)) { + offset = P2ALIGN(offset, size); + } else { + ASSERT(offset < size); + offset = 0; + } + + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); + + error = dmu_buf_hold(os, object, 
offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + ztest_get_done, zgd); + + if (error == 0) + return (0); + } + } + + ztest_get_done(zgd, error); + + return (error); +} + +static void * +ztest_lr_alloc(size_t lrsize, char *name) +{ + char *lr; + size_t namesize = name ? strlen(name) + 1 : 0; + + lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); + + if (name) + bcopy(name, lr + lrsize, namesize); + + return (lr); +} + +static void +ztest_lr_free(void *lr, size_t lrsize, char *name) +{ + size_t namesize = name ? strlen(name) + 1 : 0; + + umem_free(lr, lrsize + namesize); +} + +/* + * Lookup a bunch of objects. Returns the number of objects not found. + */ +static int +ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + int i; + + ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + for (i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); +} + +static int +ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int i; + + 
ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + for (i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_dnodesize = od->od_crdnodesize; + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + int i; + + ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + od += count - 1; + + for (i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + /* + * No object was found. 
+ */ + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) +{ + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); +} + +static int +ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static int +ztest_setattr(ztest_ds_t *zd, uint64_t object) +{ + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static void +ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + 
dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); +} + +static void +ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) +{ + int err; + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, + offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. 
+ */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + default: + break; + + case ZTEST_IO_REWRITE: + (void) pthread_rwlock_rdlock(&ztest_name_lock); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_COMPRESSION, + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, + DMU_READ_NO_PREFETCH)); + + (void) ztest_write(zd, object, offset, blocksize, data); + break; + } + + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + + umem_free(data, blocksize); +} + +/* + * Initialize an object description template. + */ +static void +ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, + uint64_t gen) +{ + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (longlong_t)id, (u_longlong_t)index); +} + +/* + * Lookup or create the objects for a test using the od template. 
+ * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ +static int +ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) +{ + int count = size / sizeof (*od); + int rv = 0; + + mutex_enter(&zd->zd_dirobj_lock); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + mutex_exit(&zd->zd_dirobj_lock); + + return (rv); +} + +/* ARGSUSED */ +void +ztest_zil_commit(ztest_ds_t *zd, uint64_t id) +{ + zilog_t *zilog = zd->zd_zilog; + + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_shared != NULL); + ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); + zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); + + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); +} + +/* + * This function is designed to simulate the operations that occur during a + * mount/unmount operation. We hold the dataset across these operations in an + * attempt to expose any implicit assumptions about ZIL management. + */ +/* ARGSUSED */ +void +ztest_zil_remount(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + + /* + * We hold the ztest_vdev_lock so we don't cause problems with + * other threads that wish to remove a log device, such as + * ztest_device_removal(). + */ + mutex_enter(&ztest_vdev_lock); + + /* + * We grab the zd_dirobj_lock to ensure that no other thread is + * updating the zil (i.e. adding in-memory log records) and the + * zd_zilog_lock to block any I/O. 
+ */ + mutex_enter(&zd->zd_dirobj_lock); + (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); + + /* zfsvfs_teardown() */ + zil_close(zd->zd_zilog); + + /* zfsvfs_setup() */ + VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); + zil_replay(os, zd, ztest_replay_vector); + + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + mutex_exit(&zd->zd_dirobj_lock); + mutex_exit(&ztest_vdev_lock); +} + +/* + * Verify that we can't destroy an active pool, create an existing pool, + * or create a pool with a bad vdev spec. + */ +/* ARGSUSED */ +void +ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_opts_t *zo = &ztest_opts; + spa_t *spa; + nvlist_t *nvroot; + + if (zo->zo_mmp_test) + return; + + /* + * Attempt to create using a bad file. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create using a bad mirror. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. + */ + (void) pthread_rwlock_rdlock(&ztest_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); + VERIFY3U(EEXIST, ==, + spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * We open a reference to the spa and then we try to export it + * expecting one of the following errors: + * + * EBUSY + * Because of the reference we just opened. + * + * ZFS_ERR_EXPORT_IN_PROGRESS + * For the case that there is another ztest thread doing + * an export concurrently. 
+ */ + VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); + int error = spa_destroy(zo->zo_pool); + if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { + fatal(0, "spa_destroy(%s) returned unexpected value %d", + spa->spa_name, error); + } + spa_close(spa, FTAG); + + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* + * Start and then stop the MMP threads to ensure the startup and shutdown code + * works properly. Actual protection and property-related code tested via ZTS. + */ +/* ARGSUSED */ +void +ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_opts_t *zo = &ztest_opts; + spa_t *spa = ztest_spa; + + if (zo->zo_mmp_test) + return; + + /* + * Since enabling MMP involves setting a property, it could not be done + * while the pool is suspended. + */ + if (spa_suspended(spa)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&spa->spa_props_lock); + + zfs_multihost_fail_intervals = 0; + + if (!spa_multihost(spa)) { + spa->spa_multihost = B_TRUE; + mmp_thread_start(spa); + } + + mutex_exit(&spa->spa_props_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); + mmp_signal_all_threads(); + txg_wait_synced(spa_get_dsl(spa), 0); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&spa->spa_props_lock); + + if (spa_multihost(spa)) { + mmp_thread_stop(spa); + spa->spa_multihost = B_FALSE; + } + + mutex_exit(&spa->spa_props_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +void +ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa; + uint64_t initial_version = SPA_VERSION_INITIAL; + uint64_t version, newversion; + nvlist_t *nvroot, *props; + char *name; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); + + /* + * Clean up from previous runs. 
+ */ + (void) spa_destroy(name); + + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, + NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + + /* + * If we're configuring a RAIDZ device then make sure that the + * initial version is capable of supporting that feature. + */ + switch (ztest_opts.zo_raidz_parity) { + case 0: + case 1: + initial_version = SPA_VERSION_INITIAL; + break; + case 2: + initial_version = SPA_VERSION_RAIDZ2; + break; + case 3: + initial_version = SPA_VERSION_RAIDZ3; + break; + } + + /* + * Create a pool with a spa version that can be upgraded. Pick + * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. + */ + do { + version = ztest_random_spa_version(initial_version); + } while (version > SPA_VERSION_BEFORE_FEATURES); + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), version); + VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); + fnvlist_free(nvroot); + fnvlist_free(props); + + VERIFY3S(spa_open(name, &spa, FTAG), ==, 0); + VERIFY3U(spa_version(spa), ==, version); + newversion = ztest_random_spa_version(version + 1); + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("upgrading spa version from %llu to %llu\n", + (u_longlong_t)version, (u_longlong_t)newversion); + } + + spa_upgrade(spa, newversion); + VERIFY3U(spa_version(spa), >, version); + VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, + zpool_prop_to_name(ZPOOL_PROP_VERSION))); + spa_close(spa, FTAG); + + kmem_strfree(name); + mutex_exit(&ztest_vdev_lock); +} + +static void +ztest_spa_checkpoint(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); + + int error = spa_checkpoint(spa->spa_name); + + switch (error) { + case 0: + case ZFS_ERR_DEVRM_IN_PROGRESS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + case ZFS_ERR_CHECKPOINT_EXISTS: + break; + case ENOSPC: + ztest_record_enospc(FTAG); + break; + default: + fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); 
+ } +} + +static void +ztest_spa_discard_checkpoint(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); + + int error = spa_checkpoint_discard(spa->spa_name); + + switch (error) { + case 0: + case ZFS_ERR_DISCARDING_CHECKPOINT: + case ZFS_ERR_NO_CHECKPOINT: + break; + default: + fatal(0, "spa_discard_checkpoint(%s) = %d", + spa->spa_name, error); + } + +} + +/* ARGSUSED */ +void +ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + + mutex_enter(&ztest_checkpoint_lock); + if (ztest_random(2) == 0) { + ztest_spa_checkpoint(spa); + } else { + ztest_spa_discard_checkpoint(spa); + } + mutex_exit(&ztest_checkpoint_lock); +} + + +static vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + vdev_t *mvd; + int c; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +static int +spa_num_top_vdevs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); + return (rvd->vdev_children); +} + +/* + * Verify that vdev_add() works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. 
+ */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + metaslab_group_t *mg; + + /* + * find the first real slog in log allocation class + */ + mg = spa_log_class(spa)->mc_rotor; + while (!mg->mg_vd->vdev_islog) + mg = mg->mg_next; + + guid = mg->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dsl_destroy_head() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_vdev_remove(spa, guid, B_FALSE); + pthread_rwlock_unlock(&ztest_name_lock); + + switch (error) { + case 0: + case EEXIST: /* Generic zil_reset() error */ + case EBUSY: /* Replay required */ + case EACCES: /* Crypto key not loaded */ + case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + break; + default: + fatal(0, "spa_vdev_remove() = %d", error); + } + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices + */ + nvroot = make_vdev_root(NULL, NULL, NULL, + ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? + "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + switch (error) { + case 0: + break; + case ENOSPC: + ztest_record_enospc("spa_vdev_add"); + break; + default: + fatal(0, "spa_vdev_add() = %d", error); + } + } + + mutex_exit(&ztest_vdev_lock); +} + +/* ARGSUSED */ +void +ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + nvlist_t *nvroot; + const char *class = (ztest_random(2) == 0) ? 
+ VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; + int error; + + /* + * By default add a special vdev 50% of the time + */ + if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || + (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && + ztest_random(2) == 0)) { + return; + } + + mutex_enter(&ztest_vdev_lock); + + /* Only test with mirrors */ + if (zs->zs_mirrors < 2) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* requires feature@allocation_classes */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { + mutex_exit(&ztest_vdev_lock); + return; + } + + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + spa_config_exit(spa, SCL_VDEV, FTAG); + + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + + /* + * 50% of the time allow small blocks in the special class + */ + if (error == 0 && + spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { + if (ztest_opts.zo_verbose >= 3) + (void) printf("Enabling special VDEV small blocks\n"); + (void) ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); + } + + mutex_exit(&ztest_vdev_lock); + + if (ztest_opts.zo_verbose >= 3) { + metaslab_class_t *mc; + + if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) + mc = spa_special_class(spa); + else + mc = spa_dedup_class(spa); + (void) printf("Added a %s mirrored vdev (of %d)\n", + class, (int)mc->mc_groups); + } +} + +/* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 
+ */ +/* ARGSUSED */ +void +ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + char *path; + uint64_t guid = 0; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (sav->sav_count != 0 && ztest_random(4) == 0) { + /* + * Pick a random device to remove. + */ + guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + } else { + /* + * Find an unused device we can add. + */ + zs->zs_vdev_aux = 0; + for (;;) { + int c; + (void) snprintf(path, MAXPATHLEN, ztest_aux_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, aux, + zs->zs_vdev_aux); + for (c = 0; c < sav->sav_count; c++) + if (strcmp(sav->sav_vdevs[c]->vdev_path, + path) == 0) + break; + if (c == sav->sav_count && + vdev_lookup_by_path(rvd, path) == NULL) + break; + zs->zs_vdev_aux++; + } + } + + spa_config_exit(spa, SCL_VDEV, FTAG); + + if (guid == 0) { + /* + * Add a new device. + */ + nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, + (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); + error = spa_vdev_add(spa, nvroot); + + switch (error) { + case 0: + break; + default: + fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); + } + nvlist_free(nvroot); + } else { + /* + * Remove an existing device. Sometimes, dirty its + * vdev state first to make sure we handle removal + * of devices that have pending state changes. 
+ */ + if (ztest_random(2) == 0) + (void) vdev_online(spa, guid, 0, NULL); + + error = spa_vdev_remove(spa, guid, B_FALSE); + + switch (error) { + case 0: + case EBUSY: + case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + break; + default: + fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + } + } + + mutex_exit(&ztest_vdev_lock); + + umem_free(path, MAXPATHLEN); +} + +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + + /* ensure we have a usable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + mutex_exit(&spa->spa_props_lock); + + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 
0; + VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + mutex_exit(&ztest_vdev_lock); +} + +/* + * Verify that we can attach and detach devices. 
+ */ +/* ARGSUSED */ +void +ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *pvd; + nvlist_t *root; + uint64_t leaves; + uint64_t leaf, top; + uint64_t ashift = ztest_get_ashift(); + uint64_t oldguid, pguid; + uint64_t oldsize, newsize; + char *oldpath, *newpath; + int replacing; + int oldvd_has_siblings = B_FALSE; + int newvd_is_spare = B_FALSE; + int oldvd_is_log; + int error, expected_error; + + if (ztest_opts.zo_mmp_test) + return; + + oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + mutex_enter(&ztest_vdev_lock); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * If a vdev is in the process of being removed, its removal may + * finish while we are in progress, leading to an unexpected error + * value. Don't bother trying to attach while we are in the middle + * of removal. + */ + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * Decide whether to do an attach or a replace. + */ + replacing = ztest_random(2); + + /* + * Pick a random top-level vdev. + */ + top = ztest_random_vdev_top(spa, B_TRUE); + + /* + * Pick a random leaf within it. + */ + leaf = ztest_random(leaves); + + /* + * Locate this vdev. 
+ */ + oldvd = rvd->vdev_child[top]; + + /* pick a child from the mirror */ + if (zs->zs_mirrors >= 1) { + ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); + ASSERT(oldvd->vdev_children >= zs->zs_mirrors); + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + } + + /* pick a child out of the raidz group */ + if (ztest_opts.zo_raidz > 1) { + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + } + + /* + * If we're already doing an attach or replace, oldvd may be a + * mirror vdev -- in which case, pick a random child. + */ + while (oldvd->vdev_children != 0) { + oldvd_has_siblings = B_TRUE; + ASSERT(oldvd->vdev_children >= 2); + oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; + } + + oldguid = oldvd->vdev_guid; + oldsize = vdev_get_min_asize(oldvd); + oldvd_is_log = oldvd->vdev_top->vdev_islog; + (void) strcpy(oldpath, oldvd->vdev_path); + pvd = oldvd->vdev_parent; + pguid = pvd->vdev_guid; + + /* + * If oldvd has siblings, then half of the time, detach it. Prior + * to the detach the pool is scrubbed in order to prevent creating + * unrepairable blocks as a result of the data corruption injection. + */ + if (oldvd_has_siblings && ztest_random(2) == 0) { + spa_config_exit(spa, SCL_ALL, FTAG); + + error = ztest_scrub_impl(spa); + if (error) + goto out; + + error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); + if (error != 0 && error != ENODEV && error != EBUSY && + error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && + error != ZFS_ERR_DISCARDING_CHECKPOINT) + fatal(0, "detach (%s) returned %d", oldpath, error); + goto out; + } + + /* + * For the new vdev, choose with equal probability between the two + * standard paths (ending in either 'a' or 'b') or a random hot spare. 
+ */ + if (sav->sav_count != 0 && ztest_random(3) == 0) { + newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + newvd_is_spare = B_TRUE; + (void) strcpy(newpath, newvd->vdev_path); + } else { + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); + if (ztest_random(2) == 0) + newpath[strlen(newpath) - 1] = 'b'; + newvd = vdev_lookup_by_path(rvd, newpath); + } + + if (newvd) { + /* + * Reopen to ensure the vdev's asize field isn't stale. + */ + vdev_reopen(newvd); + newsize = vdev_get_min_asize(newvd); + } else { + /* + * Make newsize a little bigger or smaller than oldsize. + * If it's smaller, the attach should fail. + * If it's larger, and we're doing a replace, + * we should get dynamic LUN growth when we're done. + */ + newsize = 10 * oldsize / (9 + ztest_random(3)); + } + + /* + * If pvd is not a mirror or root, the attach should fail with ENOTSUP, + * unless it's a replace; in that case any non-replacing parent is OK. + * + * If newvd is already part of the pool, it should fail with EBUSY. + * + * If newvd is too small, it should fail with EOVERFLOW. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops && (!replacing || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_spare_ops)) + expected_error = ENOTSUP; + else if (newvd_is_spare && (!replacing || oldvd_is_log)) + expected_error = ENOTSUP; + else if (newvd == oldvd) + expected_error = replacing ? 0 : EBUSY; + else if (vdev_lookup_by_path(rvd, newpath) != NULL) + expected_error = EBUSY; + else if (newsize < oldsize) + expected_error = EOVERFLOW; + else if (ashift > oldvd->vdev_top->vdev_ashift) + expected_error = EDOM; + else + expected_error = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? 
newsize : 0, + ashift, NULL, 0, 0, 1); + + /* + * When supported select either a healing or sequential resilver. + */ + boolean_t rebuilding = B_FALSE; + if (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops) { + rebuilding = !!ztest_random(2); + } + + error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); + + nvlist_free(root); + + /* + * If our parent was the replacing vdev, but the replace completed, + * then instead of failing with ENOTSUP we may either succeed, + * fail with ENODEV, or fail with EOVERFLOW. + */ + if (expected_error == ENOTSUP && + (error == 0 || error == ENODEV || error == EOVERFLOW)) + expected_error = error; + + /* + * If someone grew the LUN, the replacement may be too small. + */ + if (error == EOVERFLOW || error == EBUSY) + expected_error = error; + + if (error == ZFS_ERR_CHECKPOINT_EXISTS || + error == ZFS_ERR_DISCARDING_CHECKPOINT || + error == ZFS_ERR_RESILVER_IN_PROGRESS || + error == ZFS_ERR_REBUILD_IN_PROGRESS) + expected_error = error; + + if (error != expected_error && expected_error != EBUSY) { + fatal(0, "attach (%s %llu, %s %llu, %d) " + "returned %d, expected %d", + oldpath, oldsize, newpath, + newsize, replacing, error, expected_error); + } +out: + mutex_exit(&ztest_vdev_lock); + + umem_free(oldpath, MAXPATHLEN); + umem_free(newpath, MAXPATHLEN); +} + +/* ARGSUSED */ +void +ztest_device_removal(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + vdev_t *vd; + uint64_t guid; + int error; + + mutex_enter(&ztest_vdev_lock); + + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * Remove a random top-level vdev and wait for removal to finish. 
+ */ + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); + guid = vd->vdev_guid; + spa_config_exit(spa, SCL_VDEV, FTAG); + + error = spa_vdev_remove(spa, guid, B_FALSE); + if (error == 0) { + ztest_device_removal_active = B_TRUE; + mutex_exit(&ztest_vdev_lock); + + /* + * spa->spa_vdev_removal is created in a sync task that + * is initiated via dsl_sync_task_nowait(). Since the + * task may not run before spa_vdev_remove() returns, we + * must wait at least 1 txg to ensure that the removal + * struct has been created. + */ + txg_wait_synced(spa_get_dsl(spa), 0); + + while (spa->spa_removing_phys.sr_state == DSS_SCANNING) + txg_wait_synced(spa_get_dsl(spa), 0); + } else { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The pool needs to be scrubbed after completing device removal. + * Failure to do so may result in checksum errors due to the + * strategy employed by ztest_fault_inject() when selecting which + * offset are redundant and can be damaged. + */ + error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error == 0) { + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + } + + mutex_enter(&ztest_vdev_lock); + ztest_device_removal_active = B_FALSE; + mutex_exit(&ztest_vdev_lock); +} + +/* + * Callback function which expands the physical size of the vdev. 
+ */ +static vdev_t * +grow_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa __maybe_unused = vd->vdev_spa; + size_t *newsize = arg; + size_t fsize; + int fd; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if ((fd = open(vd->vdev_path, O_RDWR)) == -1) + return (vd); + + fsize = lseek(fd, 0, SEEK_END); + VERIFY(ftruncate(fd, *newsize) == 0); + + if (ztest_opts.zo_verbose >= 6) { + (void) printf("%s grew from %lu to %lu bytes\n", + vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); + } + (void) close(fd); + return (NULL); +} + +/* + * Callback function which expands a given vdev by calling vdev_online(). + */ +/* ARGSUSED */ +static vdev_t * +online_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint64_t guid = vd->vdev_guid; + uint64_t generation = spa->spa_config_generation + 1; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + int error; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* Calling vdev_online will initialize the new metaslabs */ + spa_config_exit(spa, SCL_STATE, spa); + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If vdev_online returned an error or the underlying vdev_open + * failed then we abort the expand. The only way to know that + * vdev_open fails is by checking the returned newstate. + */ + if (error || newstate != VDEV_STATE_HEALTHY) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Unable to expand vdev, state %llu, " + "error %d\n", (u_longlong_t)newstate, error); + } + return (vd); + } + ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); + + /* + * Since we dropped the lock we need to ensure that we're + * still talking to the original vdev. It's possible this + * vdev may have been detached/replaced while we were + * trying to online it. 
+ */ + if (generation != spa->spa_config_generation) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("vdev configuration has changed, " + "guid %llu, state %llu, expected gen %llu, " + "got gen %llu\n", + (u_longlong_t)guid, + (u_longlong_t)tvd->vdev_state, + (u_longlong_t)generation, + (u_longlong_t)spa->spa_config_generation); + } + return (vd); + } + return (NULL); +} + +/* + * Traverse the vdev tree calling the supplied function. + * We continue to walk the tree until we either have walked all + * children or we receive a non-NULL return from the callback. + * If a NULL callback is passed, then we just return back the first + * leaf vdev we encounter. + */ +static vdev_t * +vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) +{ + uint_t c; + + if (vd->vdev_ops->vdev_op_leaf) { + if (func == NULL) + return (vd); + else + return (func(vd, arg)); + } + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) + return (cvd); + } + return (NULL); +} + +/* + * Verify that dynamic LUN growth works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + vdev_t *vd, *tvd; + metaslab_class_t *mc; + metaslab_group_t *mg; + size_t psize, newsize; + uint64_t top; + uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; + + mutex_enter(&ztest_checkpoint_lock); + mutex_enter(&ztest_vdev_lock); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If there is a vdev removal in progress, it could complete while + * we are running, in which case we would not be able to verify + * that the metaslab_class space increased (because it decreases + * when the device removal completes). 
+ */ + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + top = ztest_random_vdev_top(spa, B_TRUE); + + tvd = spa->spa_root_vdev->vdev_child[top]; + mg = tvd->vdev_mg; + mc = mg->mg_class; + old_ms_count = tvd->vdev_ms_count; + old_class_space = metaslab_class_get_space(mc); + + /* + * Determine the size of the first leaf vdev associated with + * our top-level device. + */ + vd = vdev_walk_tree(tvd, NULL, NULL); + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + psize = vd->vdev_psize; + + /* + * We only try to expand the vdev if it's healthy, less than 4x its + * original size, and it has a valid psize. + */ + if (tvd->vdev_state != VDEV_STATE_HEALTHY || + psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + ASSERT(psize > 0); + newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); + ASSERT3U(newsize, >, psize); + + if (ztest_opts.zo_verbose >= 6) { + (void) printf("Expanding LUN %s from %lu to %lu\n", + vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); + } + + /* + * Growing the vdev is a two step process: + * 1). expand the physical size (i.e. relabel) + * 2). online the vdev to create the new metaslabs + */ + if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || + vdev_walk_tree(tvd, online_vdev, NULL) != NULL || + tvd->vdev_state != VDEV_STATE_HEALTHY) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Could not expand LUN because " + "the vdev configuration changed.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + spa_config_exit(spa, SCL_STATE, spa); + + /* + * Expanding the LUN will update the config asynchronously, + * thus we must wait for the async thread to complete any + * pending tasks before proceeding. 
+ */ + for (;;) { + boolean_t done; + mutex_enter(&spa->spa_async_lock); + done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); + mutex_exit(&spa->spa_async_lock); + if (done) + break; + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); + } + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + tvd = spa->spa_root_vdev->vdev_child[top]; + new_ms_count = tvd->vdev_ms_count; + new_class_space = metaslab_class_get_space(mc); + + if (tvd->vdev_mg != mg || mg->mg_class != mc) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Could not verify LUN expansion due to " + "intervening vdev offline or remove.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + /* + * Make sure we were able to grow the vdev. + */ + if (new_ms_count <= old_ms_count) { + fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", + old_ms_count, new_ms_count); + } + + /* + * Make sure we were able to grow the pool. + */ + if (new_class_space <= old_class_space) { + fatal(0, "LUN expansion failed: class_space %llu < %llu\n", + old_class_space, new_class_space); + } + + if (ztest_opts.zo_verbose >= 5) { + char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; + + nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); + nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); + (void) printf("%s grew from %s to %s\n", + spa->spa_name, oldnumbuf, newnumbuf); + } + + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); +} + +/* + * Verify that dmu_objset_{create,destroy,open,close} work as expected. + */ +/* ARGSUSED */ +static void +ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + /* + * Create the objects common to all ztest datasets. 
+ */ + VERIFY(zap_create_claim(os, ZTEST_DIROBJ, + DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); +} + +static int +ztest_dataset_create(char *dsname) +{ + int err; + uint64_t rand; + dsl_crypto_params_t *dcp = NULL; + + /* + * 50% of the time, we create encrypted datasets + * using a random cipher suite and a hard-coded + * wrapping key. + */ + rand = ztest_random(2); + if (rand != 0) { + nvlist_t *crypto_args = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + + /* slight bias towards the default cipher suite */ + rand = ztest_random(ZIO_CRYPT_FUNCTIONS); + if (rand < ZIO_CRYPT_AES_128_CCM) + rand = ZIO_CRYPT_ON; + + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); + fnvlist_add_uint8_array(crypto_args, "wkeydata", + (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); + + /* + * These parameters aren't really used by the kernel. They + * are simply stored so that userspace knows how to load + * the wrapping key. + */ + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); + fnvlist_add_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); + + VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, + crypto_args, &dcp)); + + /* + * Cycle through all available encryption implementations + * to verify interoperability. 
+ */ + VERIFY0(gcm_impl_set("cycle")); + VERIFY0(aes_impl_set("cycle")); + + fnvlist_free(crypto_args); + fnvlist_free(props); + } + + err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, + ztest_objset_create_cb, NULL); + dsl_crypto_params_free(dcp, !!err); + + rand = ztest_random(100); + if (err || rand < 80) + return (err); + + if (ztest_opts.zo_verbose >= 5) + (void) printf("Setting dataset %s to sync always\n", dsname); + return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, + ZFS_SYNC_ALWAYS, B_FALSE)); +} + +/* ARGSUSED */ +static int +ztest_objset_destroy_cb(const char *name, void *arg) +{ + objset_t *os; + dmu_object_info_t doi; + int error; + + /* + * Verify that the dataset contains a directory object. + */ + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, + B_TRUE, FTAG, &os)); + error = dmu_object_info(os, ZTEST_DIROBJ, &doi); + if (error != ENOENT) { + /* We could have crashed in the middle of destroying it */ + ASSERT0(error); + ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); + ASSERT3S(doi.doi_physical_blocks_512, >=, 0); + } + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Destroy the dataset. 
+ */ + if (strchr(name, '@') != NULL) { + VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); + } else { + error = dsl_destroy_head(name); + if (error == ENOSPC) { + /* There could be checkpoint or insufficient slop */ + ztest_record_enospc(FTAG); + } else if (error != EBUSY) { + /* There could be a hold on this dataset */ + ASSERT0(error); + } + } + return (0); +} + +static boolean_t +ztest_snapshot_create(char *osname, uint64_t id) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); + + error = dmu_objset_snapshot_one(osname, snapname); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (B_FALSE); + } + if (error != 0 && error != EEXIST) { + fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, + snapname, error); + } + return (B_TRUE); +} + +static boolean_t +ztest_snapshot_destroy(char *osname, uint64_t id) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, + (u_longlong_t)id); + + error = dsl_destroy_snapshot(snapname, B_FALSE); + if (error != 0 && error != ENOENT) + fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + return (B_TRUE); +} + +/* ARGSUSED */ +void +ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_ds_t *zdtmp; + int iters; + int error; + objset_t *os, *os2; + char name[ZFS_MAX_DATASET_NAME_LEN]; + zilog_t *zilog; + int i; + + zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + (void) snprintf(name, sizeof (name), "%s/temp_%llu", + ztest_opts.zo_pool, (u_longlong_t)id); + + /* + * If this dataset exists from a previous run, process its replay log + * half of the time. If we don't replay it, then dsl_destroy_head() + * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
+ */ + if (ztest_random(2) == 0 && + ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, + B_TRUE, FTAG, &os) == 0) { + ztest_zd_init(zdtmp, NULL, os); + zil_replay(os, zdtmp, ztest_replay_vector); + ztest_zd_fini(zdtmp); + dmu_objset_disown(os, B_TRUE, FTAG); + } + + /* + * There may be an old instance of the dataset we're about to + * create lying around from a previous run. If so, destroy it + * and all of its snapshots. + */ + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + /* + * Verify that the destroyed dataset is no longer in the namespace. + */ + VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, + B_TRUE, FTAG, &os)); + + /* + * Verify that we can create a new dataset. + */ + error = ztest_dataset_create(name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", name, error); + } + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, + FTAG, &os)); + + ztest_zd_init(zdtmp, NULL, os); + + /* + * Open the intent log for it. + */ + zilog = zil_open(os, ztest_get_data); + + /* + * Put some objects in there, do a little I/O to them, + * and randomly take a couple of snapshots along the way. + */ + iters = ztest_random(5); + for (i = 0; i < iters; i++) { + ztest_dmu_object_alloc_free(zdtmp, id); + if (ztest_random(iters) == 0) + (void) ztest_snapshot_create(name, i); + } + + /* + * Verify that we cannot create an existing dataset. + */ + VERIFY3U(EEXIST, ==, + dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); + + /* + * Verify that we can hold an objset that is also owned. + */ + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); + dmu_objset_rele(os2, FTAG); + + /* + * Verify that we cannot own an objset that is already owned. 
+ */ + VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, + B_FALSE, B_TRUE, FTAG, &os2)); + + zil_close(zilog); + dmu_objset_disown(os, B_TRUE, FTAG); + ztest_zd_fini(zdtmp); +out: + (void) pthread_rwlock_unlock(&ztest_name_lock); + + umem_free(zdtmp, sizeof (ztest_ds_t)); +} + +/* + * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. + */ +void +ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + (void) pthread_rwlock_rdlock(&ztest_name_lock); + (void) ztest_snapshot_destroy(zd->zd_name, id); + (void) ztest_snapshot_create(zd->zd_name, id); + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* + * Cleanup non-standard snapshots and clones. + */ +static void +ztest_dsl_dataset_cleanup(char *osname, uint64_t id) +{ + char *snap1name; + char *clone1name; + char *snap2name; + char *clone2name; + char *snap3name; + int error; + + snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s1_%llu", osname, (u_longlong_t)id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c1_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s2_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c2_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s3_%llu", clone1name, (u_longlong_t)id); + + error = dsl_destroy_head(clone2name); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); + error = dsl_destroy_snapshot(snap3name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); 
+ error = dsl_destroy_snapshot(snap2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); + error = dsl_destroy_head(clone1name); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); + error = dsl_destroy_snapshot(snap1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); + + umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); +} + +/* + * Verify dsl_dataset_promote handles EBUSY + */ +void +ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os; + char *snap1name; + char *clone1name; + char *snap2name; + char *clone2name; + char *snap3name; + char *osname = zd->zd_name; + int error; + + snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + ztest_dsl_dataset_cleanup(osname, id); + + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s1_%llu", osname, (u_longlong_t)id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c1_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s2_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c2_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s3_%llu", clone1name, (u_longlong_t)id); + + error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); + if (error && error != EEXIST) { + 
if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); + } + + error = dmu_objset_clone(clone1name, snap1name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); + } + + error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); + } + + error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); + } + + error = dmu_objset_clone(clone2name, snap3name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); + } + + error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, + FTAG, &os); + if (error) + fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); + error = dsl_dataset_promote(clone2name, NULL); + if (error == ENOSPC) { + dmu_objset_disown(os, B_TRUE, FTAG); + ztest_record_enospc(FTAG); + goto out; + } + if (error != EBUSY) + fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, + error); + dmu_objset_disown(os, B_TRUE, FTAG); + +out: + ztest_dsl_dataset_cleanup(osname, id); + + (void) pthread_rwlock_unlock(&ztest_name_lock); + + umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 4 + +/* + * Verify that dmu_object_{alloc,free} work as expected. 
+ */ +void +ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + int batchsize; + int size; + int b; + + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + batchsize = OD_ARRAY_SIZE; + + for (b = 0; b < batchsize; b++) + ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, + 0, 0, 0); + + /* + * Destroy the previous batch of objects, create a new batch, + * and do some I/O on the new objects. + */ + if (ztest_object_init(zd, od, size, B_TRUE) != 0) + return; + + while (ztest_random(4 * batchsize) != 0) + ztest_io(zd, od[ztest_random(batchsize)].od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + + umem_free(od, size); +} + +/* + * Rewind the global allocator to verify object allocation backfilling. + */ +void +ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint64_t object; + + /* + * Rewind the global allocator randomly back to a lower object number + * to force backfilling and reclamation of recently freed dnodes. + */ + mutex_enter(&os->os_obj_lock); + object = ztest_random(os->os_obj_next_chunk); + os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + mutex_exit(&os->os_obj_lock); +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 2 + +/* + * Verify that dmu_{read,write} work as expected. 
+ */ +void +ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) +{ + int size; + ztest_od_t *od; + + objset_t *os = zd->zd_os; + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + dmu_tx_t *tx; + int i, freeit, error; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 40; + int free_percent = 5; + + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is arbitrary. It doesn't have to be a power of two, + * and it doesn't have any relation to the object blocksize. + * The only requirement is that it can hold at least two bufwads. + * + * Normally, we write the bufwad to each of these locations. + * However, free_percent of the time we instead write zeroes to + * packobj and perform a dmu_free_range() on bigobj. By comparing + * bigobj to packobj, we can verify that the DMU is correctly + * tracking which parts of an object are allocated and free, + * and that the contents of the allocated blocks are correct. + */ + + /* + * Read the directory info. If it's the first time, set things up. 
+ */ + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); + ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + + if (ztest_object_init(zd, od, size, B_FALSE) != 0) { + umem_free(od, size); + return; + } + + bigobj = od[0].od_object; + packobj = od[1].od_object; + chunksize = od[0].od_gen; + ASSERT(chunksize == od[1].od_gen); + + /* + * Prefetch a random chunk of the big object. + * Our aim here is to get some async reads in flight + * for blocks that we may free below; the DMU should + * handle this race correctly. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(2 * width - 1); + dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, + ZIO_PRIORITY_SYNC_READ); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); + + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); + + bigoff = n * chunksize; + bigsize = s * chunksize; + + packbuf = umem_alloc(packsize, UMEM_NOFAIL); + bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); + + /* + * free_percent of the time, free a range of bigobj rather than + * overwriting it. + */ + freeit = (ztest_random(100) < free_percent); + + /* + * Read the current contents of our objects. + */ + error = dmu_read(os, packobj, packoff, packsize, packbuf, + DMU_READ_PREFETCH); + ASSERT0(error); + error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, + DMU_READ_PREFETCH); + ASSERT0(error); + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + + if (freeit) + dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); + else + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + /* This accounts for setting the checksum/compression. 
*/ + dmu_tx_hold_bonus(tx, bigobj); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(od, size); + return; + } + + enum zio_checksum cksum; + do { + cksum = (enum zio_checksum) + ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); + } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); + dmu_object_set_checksum(os, bigobj, cksum, tx); + + enum zio_compress comp; + do { + comp = (enum zio_compress) + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); + } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); + dmu_object_set_compress(os, bigobj, comp, tx); + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. + */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + if (freeit) { + bzero(pack, sizeof (bufwad_t)); + } else { + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + } + *bigH = *pack; + *bigT = *pack; + } + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. 
+ */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + + if (freeit) { + if (ztest_opts.zo_verbose >= 7) { + (void) printf("freeing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); + } else { + if (ztest_opts.zo_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); + } + + dmu_tx_commit(tx); + + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); + + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(od, size); +} + +static void +compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, + uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) +{ + uint64_t i; + bufwad_t *pack; + bufwad_t *bigH; + bufwad_t *bigT; + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. 
+ */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + + *bigH = *pack; + *bigT = *pack; + } +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 2 + +void +ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + dmu_tx_t *tx; + uint64_t i; + int error; + int size; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t blocksize = ztest_random_blocksize(); + uint64_t chunksize = blocksize; + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 9; + dmu_buf_t *bonus_db; + arc_buf_t **bigbuf_arcbufs; + dmu_object_info_t doi; + + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. 
Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is set equal to bigobj block size so that + * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. + */ + + /* + * Read the directory info. If it's the first time, set things up. + */ + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + + + if (ztest_object_init(zd, od, size, B_FALSE) != 0) { + umem_free(od, size); + return; + } + + bigobj = od[0].od_object; + packobj = od[1].od_object; + blocksize = od[0].od_blocksize; + chunksize = blocksize; + ASSERT(chunksize == od[1].od_gen); + + VERIFY(dmu_object_info(os, bigobj, &doi) == 0); + VERIFY(ISP2(doi.doi_data_block_size)); + VERIFY(chunksize == doi.doi_data_block_size); + VERIFY(chunksize >= 2 * sizeof (bufwad_t)); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); + + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); + + bigoff = n * chunksize; + bigsize = s * chunksize; + + packbuf = umem_zalloc(packsize, UMEM_NOFAIL); + bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); + + VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); + + bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); + + /* + * Iteration 0 test zcopy for DB_UNCACHED dbufs. + * Iteration 1 test zcopy to already referenced dbufs. + * Iteration 2 test zcopy to dirty dbuf in the same txg. + * Iteration 3 test zcopy to dbuf dirty in previous txg. + * Iteration 4 test zcopy when dbuf is no longer dirty. + * Iteration 5 test zcopy when it can't be done. + * Iteration 6 one more zcopy write. 
+ */ + for (i = 0; i < 7; i++) { + uint64_t j; + uint64_t off; + + /* + * In iteration 5 (i == 5) use arcbufs + * that don't match bigobj blksz to test + * dmu_assign_arcbuf_by_dbuf() when it can't directly + * assign an arcbuf to a dbuf. + */ + for (j = 0; j < s; j++) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + bigbuf_arcbufs[j] = + dmu_request_arcbuf(bonus_db, chunksize); + } else { + bigbuf_arcbufs[2 * j] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + bigbuf_arcbufs[2 * j + 1] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + } + } + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + for (j = 0; j < s; j++) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { + dmu_return_arcbuf(bigbuf_arcbufs[j]); + } else { + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j]); + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j + 1]); + } + } + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + umem_free(od, size); + dmu_buf_rele(bonus_db, FTAG); + return; + } + + /* + * 50% of the time don't read objects in the 1st iteration to + * test dmu_assign_arcbuf_by_dbuf() for the case when there are + * no existing dbufs for the specified offsets. + */ + if (i != 0 || ztest_random(2) != 0) { + error = dmu_read(os, packobj, packoff, + packsize, packbuf, DMU_READ_PREFETCH); + ASSERT0(error); + error = dmu_read(os, bigobj, bigoff, bigsize, + bigbuf, DMU_READ_PREFETCH); + ASSERT0(error); + } + compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, + n, chunksize, txg); + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. 
+ */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + if (ztest_opts.zo_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + for (off = bigoff, j = 0; j < s; j++, off += chunksize) { + dmu_buf_t *dbt; + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[j]->b_data, chunksize); + } else { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[2 * j]->b_data, + chunksize / 2); + bcopy((caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, + bigbuf_arcbufs[2 * j + 1]->b_data, + chunksize / 2); + } + + if (i == 1) { + VERIFY(dmu_buf_hold(os, bigobj, off, + FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); + } + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off, bigbuf_arcbufs[j], tx)); + } else { + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off, bigbuf_arcbufs[2 * j], tx)); + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off + chunksize / 2, + bigbuf_arcbufs[2 * j + 1], tx)); + } + if (i == 1) { + dmu_buf_rele(dbt, FTAG); + } + } + dmu_tx_commit(tx); + + /* + * Sanity check the stuff we just wrote. 
+ */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); + + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + if (i == 2) { + txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); + } else if (i == 3) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + } + + dmu_buf_rele(bonus_db, FTAG); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + umem_free(od, size); +} + +/* ARGSUSED */ +void +ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + uint64_t offset = (1ULL << (ztest_random(20) + 43)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + + /* + * Have multiple threads write to large offsets in an object + * to verify that parallel writes to an object -- even to the + * same blocks within the object -- doesn't cause any trouble. 
+ */ + ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) + return; + + while (ztest_random(10) != 0) + ztest_io(zd, od->od_object, offset); + + umem_free(od, sizeof (ztest_od_t)); +} + +void +ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + uint64_t count = ztest_random(20) + 1; + uint64_t blocksize = ztest_random_blocksize(); + void *data; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + ztest_prealloc(zd, od->od_object, offset, count * blocksize); + + data = umem_zalloc(blocksize, UMEM_NOFAIL); + + while (ztest_random(count) != 0) { + uint64_t randoff = offset + (ztest_random(count) * blocksize); + if (ztest_write(zd, od->od_object, randoff, blocksize, + data) != 0) + break; + while (ztest_random(4) != 0) + ztest_io(zd, od->od_object, randoff); + } + + umem_free(data, blocksize); + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Verify that zap_{create,destroy,add,remove,update} work as expected. 
+ */ +#define ZTEST_ZAP_MIN_INTS 1 +#define ZTEST_ZAP_MAX_INTS 4 +#define ZTEST_ZAP_MAX_PROPS 1000 + +void +ztest_zap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t object; + uint64_t txg, last_txg; + uint64_t value[ZTEST_ZAP_MAX_INTS]; + uint64_t zl_ints, zl_intsize, prop; + int i, ints; + dmu_tx_t *tx; + char propname[100], txgname[100]; + int error; + char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) + goto out; + + object = od->od_object; + + /* + * Generate a known hash collision, and verify that + * we can lookup and remove both entries. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + for (i = 0; i < 2; i++) { + value[i] = i; + VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + 1, &value[i], tx)); + } + for (i = 0; i < 2; i++) { + VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], + sizeof (uint64_t), 1, &value[i], tx)); + VERIFY3U(0, ==, + zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + } + for (i = 0; i < 2; i++) { + VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); + } + dmu_tx_commit(tx); + + /* + * Generate a bunch of random entries. + */ + ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); + + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + bzero(value, sizeof (value)); + last_txg = 0; + + /* + * If these zap entries already exist, validate their contents. 
+ */ + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + if (error == 0) { + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + + VERIFY(zap_lookup(os, object, txgname, zl_intsize, + zl_ints, &last_txg) == 0); + + VERIFY(zap_length(os, object, propname, &zl_intsize, + &zl_ints) == 0); + + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, ints); + + VERIFY(zap_lookup(os, object, propname, zl_intsize, + zl_ints, value) == 0); + + for (i = 0; i < ints; i++) { + ASSERT3U(value[i], ==, last_txg + object + i); + } + } else { + ASSERT3U(error, ==, ENOENT); + } + + /* + * Atomically update two entries in our zap object. + * The first is named txg_%llu, and contains the txg + * in which the property was last updated. The second + * is named prop_%llu, and the nth element of its value + * should be txg + object + n. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + + if (last_txg > txg) + fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); + + for (i = 0; i < ints; i++) + value[i] = txg + object + i; + + VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + 1, &txg, tx)); + VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx)); + + dmu_tx_commit(tx); + + /* + * Remove a random pair of entries. 
+ */ + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + + if (error == ENOENT) + goto out; + + ASSERT0(error); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); +out: + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Test case to test the upgrading of a microzap to fatzap. + */ +void +ztest_fzap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t object, txg; + int i; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) + goto out; + object = od->od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries we should see ptrtbl growth and leaf-block split. 
+ */ + for (i = 0; i < 2050; i++) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + (u_longlong_t)id, (u_longlong_t)value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); + } +out: + umem_free(od, sizeof (ztest_od_t)); +} + +/* ARGSUSED */ +void +ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + object = od->od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. + * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY0(zap_count(os, object, &count)); + ASSERT(count != -1ULL); + + /* + * Select an operation: length, lookup, add, update, remove. 
+ */ + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); + + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Commit callback data. 
+ */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +/* This is the actual commit callback function */ +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT0(data->zcd_txg); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + ASSERT(data->zcd_added); + ASSERT3U(data->zcd_txg, !=, 0); + + (void) mutex_enter(&zcl.zcl_callbacks_lock); + + /* See if this cb was called more quickly */ + if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) + zc_min_txg_delay = synced_txg - data->zcd_txg; + + /* Remove our callback from the list */ + list_remove(&zcl.zcl_callbacks, data); + + (void) mutex_exit(&zcl.zcl_callbacks_lock); + + umem_free(data, sizeof (ztest_cb_data_t)); +} + +/* Allocate and initialize callback data structure */ +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + list_link_init(&cb_data->zcd_node); + + return (cb_data); +} + +/* + * Commit callback test. 
/*
 * Commit callback test.
 *
 * Registers three commit callbacks on one transaction (before the hold,
 * after the assign attempt, and after the abort check) and verifies that
 * they fire with the expected error.  About one call in a hundred the
 * transaction is aborted on purpose; a failed dmu_tx_assign() takes the
 * same abort path.
 */
void
ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t *od;
	dmu_tx_t *tx;
	ztest_cb_data_t *cb_data[3], *tmp_cb;
	uint64_t old_txg, txg;
	int i, error = 0;

	od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);

	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
		umem_free(od, sizeof (ztest_od_t));
		return;
	}

	tx = dmu_tx_create(os);

	/* First callback: registered before the tx has a txg (txg 0 for now) */
	cb_data[0] = ztest_create_cb_data(os, 0);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));

	/* Every once in a while, abort the transaction on purpose */
	if (ztest_random(100) == 0)
		error = -1;

	if (!error)
		error = dmu_tx_assign(tx, TXG_NOWAIT);

	/* A txg of 0 marks "never assigned" for the callback checks */
	txg = error ? 0 : dmu_tx_get_txg(tx);

	cb_data[0]->zcd_txg = txg;
	cb_data[1] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

	if (error) {
		/*
		 * It's not a strict requirement to call the registered
		 * callbacks from inside dmu_tx_abort(), but that's what
		 * is supposed to happen in the current implementation,
		 * so we will check for that.
		 */
		for (i = 0; i < 2; i++) {
			cb_data[i]->zcd_expected_err = ECANCELED;
			VERIFY(!cb_data[i]->zcd_called);
		}

		dmu_tx_abort(tx);

		/* The callback leaves cancelled records for us to free */
		for (i = 0; i < 2; i++) {
			VERIFY(cb_data[i]->zcd_called);
			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
		}

		umem_free(od, sizeof (ztest_od_t));
		return;
	}

	cb_data[2] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

	/*
	 * Read existing data to make sure there isn't a future leak.
	 */
	VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
	    &old_txg, DMU_READ_PREFETCH));

	if (old_txg > txg)
		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
		    old_txg, txg);

	/* Record the current txg in the object for the next pass to check */
	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);

	(void) mutex_enter(&zcl.zcl_callbacks_lock);

	/*
	 * Since commit callbacks don't have any ordering requirement and since
	 * it is theoretically possible for a commit callback to be called
	 * after an arbitrary amount of time has elapsed since its txg has been
	 * synced, it is difficult to reliably determine whether a commit
	 * callback hasn't been called due to high load or due to a flawed
	 * implementation.
	 *
	 * In practice, we will assume that if after a certain number of txgs a
	 * commit callback hasn't been called, then most likely there's an
	 * implementation bug.
	 */
	tmp_cb = list_head(&zcl.zcl_callbacks);
	if (tmp_cb != NULL &&
	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
	}

	/*
	 * Let's find the place to insert our callbacks.
	 *
	 * Even though the list is ordered by txg, it is possible for the
	 * insertion point to not be the end because our txg may already be
	 * quiescing at this point and other callbacks in the open txg
	 * (from other objsets) may have sneaked in.
	 */
	tmp_cb = list_tail(&zcl.zcl_callbacks);
	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);

	/* Add the 3 callbacks to the list */
	for (i = 0; i < 3; i++) {
		if (tmp_cb == NULL)
			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
		else
			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
			    cb_data[i]);

		cb_data[i]->zcd_added = B_TRUE;
		VERIFY(!cb_data[i]->zcd_called);

		/* Chain the next record directly after this one */
		tmp_cb = cb_data[i];
	}

	zc_cb_counter += 3;

	(void) mutex_exit(&zcl.zcl_callbacks_lock);

	dmu_tx_commit(tx);

	umem_free(od, sizeof (ztest_od_t));
}

/*
 * Visit each object in the dataset.  Verify that its properties
 * are consistent with what was stored in the block tag when it was created,
 * and that its unused bonus buffer space has not been overwritten.
 *
 * Objects whose bonus buffer cannot be held, is too small for a block tag,
 * or does not carry BT_MAGIC are skipped.
 */
/* ARGSUSED */
void
ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	uint64_t obj;
	int err = 0;

	/* Iteration stops at the first non-zero dmu_object_next() result */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		ztest_block_tag_t *bt = NULL;
		dmu_object_info_t doi;
		dmu_buf_t *db;

		ztest_object_lock(zd, obj, RL_READER);
		if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) {
			ztest_object_unlock(zd, obj);
			continue;
		}

		dmu_object_info_from_db(db, &doi);
		if (doi.doi_bonus_size >= sizeof (*bt))
			bt = ztest_bt_bonus(db);

		if (bt && bt->bt_magic == BT_MAGIC) {
			ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
			    bt->bt_offset, bt->bt_gen, bt->bt_txg,
			    bt->bt_crtxg);
			ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen);
		}

		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, obj);
	}
}
proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); + + VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, + ztest_random_blocksize(), (int)ztest_random(2))); + + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* ARGSUSED */ +void +ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + nvlist_t *props = NULL; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); + + VERIFY0(spa_prop_get(ztest_spa, &props)); + + if (ztest_opts.zo_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +static int +user_release_one(const char *snapname, const char *holdname) +{ + nvlist_t *snaps, *holds; + int error; + + snaps = fnvlist_alloc(); + holds = fnvlist_alloc(); + fnvlist_add_boolean(holds, holdname); + fnvlist_add_nvlist(snaps, snapname, holds); + fnvlist_free(holds); + error = dsl_dataset_user_release(snaps, NULL); + fnvlist_free(snaps); + return (error); +} + +/* + * Test snapshot hold/release and deferred destroy. + */ +void +ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) +{ + int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *holds; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + dmu_objset_name(os, osname); + + (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", + (u_longlong_t)id); + (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); + (void) snprintf(clonename, sizeof (clonename), + "%s/ch1_%llu", osname, (u_longlong_t)id); + (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); + + /* + * Clean up from any previous run. 
+ */ + error = dsl_destroy_head(clonename); + if (error != ENOENT) + ASSERT0(error); + error = user_release_one(fullname, tag); + if (error != ESRCH && error != ENOENT) + ASSERT0(error); + error = dsl_destroy_snapshot(fullname, B_FALSE); + if (error != ENOENT) + ASSERT0(error); + + /* + * Create snapshot, clone it, mark snap for deferred destroy, + * destroy clone, verify snap was also destroyed. + */ + error = dmu_objset_snapshot_one(osname, snapname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dmu_objset_clone(clonename, fullname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_clone"); + goto out; + } + fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + } + + error = dsl_destroy_snapshot(fullname, B_TRUE); + if (error) { + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fullname, error); + } + + error = dsl_destroy_head(clonename); + if (error) + fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); + + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error != ENOENT) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + + /* + * Create snapshot, add temporary hold, verify that we can't + * destroy a held snapshot, mark for deferred destroy, + * release hold, verify snapshot was destroyed. 
+ */ + error = dmu_objset_snapshot_one(osname, snapname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + holds = fnvlist_alloc(); + fnvlist_add_string(holds, fullname, tag); + error = dsl_dataset_user_hold(holds, 0, NULL); + fnvlist_free(holds); + + if (error == ENOSPC) { + ztest_record_enospc("dsl_dataset_user_hold"); + goto out; + } else if (error) { + fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", + fullname, tag, error); + } + + error = dsl_destroy_snapshot(fullname, B_FALSE); + if (error != EBUSY) { + fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", + fullname, error); + } + + error = dsl_destroy_snapshot(fullname, B_TRUE); + if (error) { + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fullname, error); + } + + error = user_release_one(fullname, tag); + if (error) + fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); + + VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); + +out: + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* + * Inject random faults into the on-disk data. + */ +/* ARGSUSED */ +void +ztest_fault_inject(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + int fd; + uint64_t offset; + uint64_t leaves; + uint64_t bad = 0x1990c0ffeedecadeull; + uint64_t top, leaf; + char *path0; + char *pathrand; + size_t fsize; + int bshift = SPA_MAXBLOCKSHIFT + 2; + int iters = 1000; + int maxfaults; + int mirror_save; + vdev_t *vd0 = NULL; + uint64_t guid0 = 0; + boolean_t islog = B_FALSE; + + path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + mutex_enter(&ztest_vdev_lock); + + /* + * Device removal is in progress, fault injection must be disabled + * until it completes and the pool is scrubbed. 
The fault injection + * strategy for damaging blocks does not take in to account evacuated + * blocks which may have already been damaged. + */ + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + goto out; + } + + maxfaults = MAXFAULTS(zs); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + mirror_save = zs->zs_mirrors; + mutex_exit(&ztest_vdev_lock); + + ASSERT(leaves >= 1); + + /* + * While ztest is running the number of leaves will not change. This + * is critical for the fault injection logic as it determines where + * errors can be safely injected such that they are always repairable. + * + * When restarting ztest a different number of leaves may be requested + * which will shift the regions to be damaged. This is fine as long + * as the pool has been scrubbed prior to using the new mapping. + * Failure to do can result in non-repairable damage being injected. + */ + if (ztest_pool_scrubbed == B_FALSE) + goto out; + + /* + * Grab the name lock as reader. There are some operations + * which don't like to have their vdevs changed while + * they are in progress (i.e. spa_change_guid). Those + * operations will have grabbed the name lock as writer. + */ + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + /* + * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (ztest_random(2) == 0) { + /* + * Inject errors on a normal data device or slog device. + */ + top = ztest_random_vdev_top(spa, B_TRUE); + leaf = ztest_random(leaves) + zs->zs_splits; + + /* + * Generate paths to the first leaf in this top-level vdev, + * and to the random leaf we selected. We'll induce transient + * write failures and random online/offline activity on leaf 0, + * and we'll write random garbage to the randomly chosen leaf. 
+ */ + (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + zs->zs_splits); + (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); + + vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); + if (vd0 != NULL && vd0->vdev_top->vdev_islog) + islog = B_TRUE; + + /* + * If the top-level vdev needs to be resilvered + * then we only allow faults on the device that is + * resilvering. + */ + if (vd0 != NULL && maxfaults != 1 && + (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || + vd0->vdev_resilver_txg != 0)) { + /* + * Make vd0 explicitly claim to be unreadable, + * or unwriteable, or reach behind its back + * and close the underlying fd. We can do this if + * maxfaults == 0 because we'll fail and reexecute, + * and we can do it if maxfaults >= 2 because we'll + * have enough redundancy. If maxfaults == 1, the + * combination of this with injection of random data + * corruption below exceeds the pool's fault tolerance. + */ + vdev_file_t *vf = vd0->vdev_tsd; + + zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", + (long long)vd0->vdev_id, (int)maxfaults); + + if (vf != NULL && ztest_random(3) == 0) { + (void) close(vf->vf_file->f_fd); + vf->vf_file->f_fd = -1; + } else if (ztest_random(2) == 0) { + vd0->vdev_cant_read = B_TRUE; + } else { + vd0->vdev_cant_write = B_TRUE; + } + guid0 = vd0->vdev_guid; + } + } else { + /* + * Inject errors on an l2cache device. 
+ */ + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + if (sav->sav_count == 0) { + spa_config_exit(spa, SCL_STATE, FTAG); + (void) pthread_rwlock_unlock(&ztest_name_lock); + goto out; + } + vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; + guid0 = vd0->vdev_guid; + (void) strcpy(path0, vd0->vdev_path); + (void) strcpy(pathrand, vd0->vdev_path); + + leaf = 0; + leaves = 1; + maxfaults = INT_MAX; /* no limit on cache devices */ + } + + spa_config_exit(spa, SCL_STATE, FTAG); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + /* + * If we can tolerate two or more faults, or we're dealing + * with a slog, randomly online/offline vd0. + */ + if ((maxfaults >= 2 || islog) && guid0 != 0) { + if (ztest_random(10) < 6) { + int flags = (ztest_random(2) == 0 ? + ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dsl_destroy_head() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + if (islog) + (void) pthread_rwlock_wrlock(&ztest_name_lock); + + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) pthread_rwlock_unlock(&ztest_name_lock); + } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + mutex_enter(&ztest_vdev_lock); + (void) vdev_online(spa, guid0, 0, NULL); + mutex_exit(&ztest_vdev_lock); + } + } + + if (maxfaults == 0) + goto out; + + /* + * We have at least single-fault tolerance, so inject data corruption. 
+ */ + fd = open(pathrand, O_RDWR); + + if (fd == -1) /* we hit a gap in the device namespace */ + goto out; + + fsize = lseek(fd, 0, SEEK_END); + + while (--iters != 0) { + /* + * The offset must be chosen carefully to ensure that + * we do not inject a given logical block with errors + * on two different leaf devices, because ZFS can not + * tolerate that (if maxfaults==1). + * + * To achieve this we divide each leaf device into + * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). + * Each chunk is further divided into error-injection + * ranges (can accept errors) and clear ranges (we do + * not inject errors in those). Each error-injection + * range can accept errors only for a single leaf vdev. + * Error-injection ranges are separated by clear ranges. + * + * For example, with 3 leaves, each chunk looks like: + * 0 to 32M: injection range for leaf 0 + * 32M to 64M: clear range - no injection allowed + * 64M to 96M: injection range for leaf 1 + * 96M to 128M: clear range - no injection allowed + * 128M to 160M: injection range for leaf 2 + * 160M to 192M: clear range - no injection allowed + * + * Each clear range must be large enough such that a + * single block cannot straddle it. This way a block + * can't be a target in two different injection ranges + * (on different leaf vdevs). + */ + offset = ztest_random(fsize / (leaves << bshift)) * + (leaves << bshift) + (leaf << bshift) + + (ztest_random(1ULL << (bshift - 1)) & -8ULL); + + /* + * Only allow damage to the labels at one end of the vdev. + * + * If all labels are damaged, the device will be totally + * inaccessible, which will result in loss of data, + * because we also damage (parts of) the other side of + * the mirror/raidz. + * + * Additionally, we will always have both an even and an + * odd label, so that we can handle crashes in the + * middle of vdev_config_sync(). 
+ */ + if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) + continue; + + /* + * The two end labels are stored at the "end" of the disk, but + * the end of the disk (vdev_psize) is aligned to + * sizeof (vdev_label_t). + */ + uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + if ((leaf & 1) == 1 && + offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) + continue; + + mutex_enter(&ztest_vdev_lock); + if (mirror_save != zs->zs_mirrors) { + mutex_exit(&ztest_vdev_lock); + (void) close(fd); + goto out; + } + + if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) + fatal(1, "can't inject bad word at 0x%llx in %s", + offset, pathrand); + + mutex_exit(&ztest_vdev_lock); + + if (ztest_opts.zo_verbose >= 7) + (void) printf("injected bad word into %s," + " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + } + + (void) close(fd); +out: + umem_free(path0, MAXPATHLEN); + umem_free(pathrand, MAXPATHLEN); +} + +/* + * By design ztest will never inject uncorrectable damage in to the pool. + * Issue a scrub, wait for it to complete, and verify there is never any + * any persistent damage. + * + * Only after a full scrub has been completed is it safe to start injecting + * data corruption. See the comment in zfs_fault_inject(). + */ +static int +ztest_scrub_impl(spa_t *spa) +{ + int error = spa_scan(spa, POOL_SCAN_SCRUB); + if (error) + return (error); + + while (dsl_scan_scrubbing(spa_get_dsl(spa))) + txg_wait_synced(spa_get_dsl(spa), 0); + + if (spa_get_errlog_size(spa) > 0) + return (ECKSUM); + + ztest_pool_scrubbed = B_TRUE; + + return (0); +} + +/* + * Scrub the pool. + */ +/* ARGSUSED */ +void +ztest_scrub(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error; + + /* + * Scrub in progress by device removal. + */ + if (ztest_device_removal_active) + return; + + /* + * Start a scrub, wait a moment, then force a restart. 
+ */ + (void) spa_scan(spa, POOL_SCAN_SCRUB); + (void) poll(NULL, 0, 100); + + error = ztest_scrub_impl(spa); + if (error == EBUSY) + error = 0; + ASSERT0(error); +} + +/* + * Change the guid for the pool. + */ +/* ARGSUSED */ +void +ztest_reguid(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + uint64_t orig, load; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + orig = spa_guid(spa); + load = spa_load_guid(spa); + + (void) pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_change_guid(spa); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + if (error != 0) + return; + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Changed guid old %llu -> %llu\n", + (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); + } + + VERIFY3U(orig, !=, spa_guid(spa)); + VERIFY3U(load, ==, spa_load_guid(spa)); +} + +void +ztest_fletcher(ztest_ds_t *zd, uint64_t id) +{ + hrtime_t end = gethrtime() + NANOSEC; + + while (gethrtime() <= end) { + int run_count = 100; + void *buf; + struct abd *abd_data, *abd_meta; + uint32_t size; + int *ptr; + int i; + zio_cksum_t zc_ref; + zio_cksum_t zc_ref_byteswap; + + size = ztest_random_blocksize(); + + buf = umem_alloc(size, UMEM_NOFAIL); + abd_data = abd_alloc(size, B_FALSE); + abd_meta = abd_alloc(size, B_TRUE); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + + abd_copy_from_buf_off(abd_data, buf, 0, size); + abd_copy_from_buf_off(abd_meta, buf, 0, size); + + VERIFY0(fletcher_4_impl_set("scalar")); + fletcher_4_native(buf, size, NULL, &zc_ref); + fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); + + VERIFY0(fletcher_4_impl_set("cycle")); + while (run_count-- > 0) { + zio_cksum_t zc; + zio_cksum_t zc_byteswap; + + fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); + fletcher_4_native(buf, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + /* Test ABD - data */ + 
abd_fletcher_4_byteswap(abd_data, size, NULL, + &zc_byteswap); + abd_fletcher_4_native(abd_data, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + /* Test ABD - metadata */ + abd_fletcher_4_byteswap(abd_meta, size, NULL, + &zc_byteswap); + abd_fletcher_4_native(abd_meta, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + } + + umem_free(buf, size); + abd_free(abd_data); + abd_free(abd_meta); + } +} + +void +ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) +{ + void *buf; + size_t size; + int *ptr; + int i; + zio_cksum_t zc_ref; + zio_cksum_t zc_ref_bswap; + + hrtime_t end = gethrtime() + NANOSEC; + + while (gethrtime() <= end) { + int run_count = 100; + + size = ztest_random_blocksize(); + buf = umem_alloc(size, UMEM_NOFAIL); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + + VERIFY0(fletcher_4_impl_set("scalar")); + fletcher_4_native(buf, size, NULL, &zc_ref); + fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); + + VERIFY0(fletcher_4_impl_set("cycle")); + + while (run_count-- > 0) { + zio_cksum_t zc; + zio_cksum_t zc_bswap; + size_t pos = 0; + + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); + + while (pos < size) { + size_t inc = 64 * ztest_random(size / 67); + /* sometimes add few bytes to test non-simd */ + if (ztest_random(100) < 10) + inc += P2ALIGN(ztest_random(64), + sizeof (uint32_t)); + + if (inc > (size - pos)) + inc = size - pos; + + fletcher_4_incremental_native(buf + pos, inc, + &zc); + fletcher_4_incremental_byteswap(buf + pos, inc, + &zc_bswap); + + pos += inc; + } + + VERIFY3U(pos, ==, size); + + VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); + VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); + + /* + * verify if incremental on the whole buffer is + * equivalent to non-incremental version + 
*/ + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); + + fletcher_4_incremental_native(buf, size, &zc); + fletcher_4_incremental_byteswap(buf, size, &zc_bswap); + + VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); + VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); + } + + umem_free(buf, size); + } +} + +static int +ztest_check_path(char *path) +{ + struct stat s; + /* return true on success */ + return (!stat(path, &s)); +} + +static void +ztest_get_zdb_bin(char *bin, int len) +{ + char *zdb_path; + /* + * Try to use ZDB_PATH and in-tree zdb path. If not successful, just + * let popen to search through PATH. + */ + if ((zdb_path = getenv("ZDB_PATH"))) { + strlcpy(bin, zdb_path, len); /* In env */ + if (!ztest_check_path(bin)) { + ztest_dump_core = 0; + fatal(1, "invalid ZDB_PATH '%s'", bin); + } + return; + } + + VERIFY(realpath(getexecname(), bin) != NULL); + if (strstr(bin, "/ztest/")) { + strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ + strcat(bin, "/zdb/zdb"); + if (ztest_check_path(bin)) + return; + } + strcpy(bin, "zdb"); +} + +static vdev_t * +ztest_random_concrete_vdev_leaf(vdev_t *vd) +{ + if (vd == NULL) + return (NULL); + + if (vd->vdev_children == 0) + return (vd); + + vdev_t *eligible[vd->vdev_children]; + int eligible_idx = 0, i; + for (i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + if (cvd->vdev_top->vdev_removing) + continue; + if (cvd->vdev_children > 0 || + (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { + eligible[eligible_idx++] = cvd; + } + } + VERIFY(eligible_idx > 0); + + uint64_t child_no = ztest_random(eligible_idx); + return (ztest_random_concrete_vdev_leaf(eligible[child_no])); +} + +/* ARGSUSED */ +void +ztest_initialize(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* Random leaf vdev */ + vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); + if 
(rand_vd == NULL) { + spa_config_exit(spa, SCL_VDEV, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The random vdev we've selected may change as soon as we + * drop the spa_config_lock. We create local copies of things + * we're interested in. + */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_initialize_thread != NULL; + + zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *vdev_errlist = fnvlist_alloc(); + fnvlist_add_uint64(vdev_guids, path, guid); + error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); + fnvlist_free(vdev_guids); + fnvlist_free(vdev_errlist); + + switch (cmd) { + case POOL_INITIALIZE_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_START: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start initialize %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + +/* ARGSUSED */ +void +ztest_trim(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* Random leaf vdev */ + vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); + if (rand_vd == NULL) { + spa_config_exit(spa, SCL_VDEV, FTAG); + 
mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The random vdev we've selected may change as soon as we + * drop the spa_config_lock. We create local copies of things + * we're interested in. + */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_trim_thread != NULL; + + zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); + uint64_t rate = 1 << ztest_random(30); + boolean_t partial = (ztest_random(5) > 0); + boolean_t secure = (ztest_random(5) > 0); + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *vdev_errlist = fnvlist_alloc(); + fnvlist_add_uint64(vdev_guids, path, guid); + error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, + secure, vdev_errlist); + fnvlist_free(vdev_guids); + fnvlist_free(vdev_errlist); + + switch (cmd) { + case POOL_TRIM_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel TRIM %s", path); + if (!active) + (void) printf(" failed (no TRIM active)"); + (void) printf("\n"); + } + break; + case POOL_TRIM_START: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start TRIM %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_TRIM_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend TRIM %s", path); + if (!active) + (void) printf(" failed (no TRIM active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + +/* + * Verify pool integrity by running zdb. 
+ */ +static void +ztest_run_zdb(char *pool) +{ + int status; + char *bin; + char *zdb; + char *zbuf; + const int len = MAXPATHLEN + MAXNAMELEN + 20; + FILE *fp; + + bin = umem_alloc(len, UMEM_NOFAIL); + zdb = umem_alloc(len, UMEM_NOFAIL); + zbuf = umem_alloc(1024, UMEM_NOFAIL); + + ztest_get_zdb_bin(bin, len); + + (void) sprintf(zdb, + "%s -bcc%s%s -G -d -Y -e -y -p %s %s", + bin, + ztest_opts.zo_verbose >= 3 ? "s" : "", + ztest_opts.zo_verbose >= 4 ? "v" : "", + ztest_opts.zo_dir, + pool); + + if (ztest_opts.zo_verbose >= 5) + (void) printf("Executing %s\n", strstr(zdb, "zdb ")); + + fp = popen(zdb, "r"); + + while (fgets(zbuf, 1024, fp) != NULL) + if (ztest_opts.zo_verbose >= 3) + (void) printf("%s", zbuf); + + status = pclose(fp); + + if (status == 0) + goto out; + + ztest_dump_core = 0; + if (WIFEXITED(status)) + fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); + else + fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); +out: + umem_free(bin, len); + umem_free(zdb, len); + umem_free(zbuf, 1024); +} + +static void +ztest_walk_pool_directory(char *header) +{ + spa_t *spa = NULL; + + if (ztest_opts.zo_verbose >= 6) + (void) printf("%s\n", header); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + if (ztest_opts.zo_verbose >= 6) + (void) printf("\t%s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); +} + +static void +ztest_spa_import_export(char *oldname, char *newname) +{ + nvlist_t *config, *newconfig; + uint64_t pool_guid; + spa_t *spa; + int error; + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("import/export: old = %s, new = %s\n", + oldname, newname); + } + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(newname); + + /* + * Get the pool's configuration and guid. + */ + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Kick off a scrub to tickle scrub/export races. 
+ */ + if (ztest_random(2) == 0) + (void) spa_scan(spa, POOL_SCAN_SCRUB); + + pool_guid = spa_guid(spa); + spa_close(spa, FTAG); + + ztest_walk_pool_directory("pools before export"); + + /* + * Export it. + */ + VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); + + ztest_walk_pool_directory("pools after export"); + + /* + * Try to import it. + */ + newconfig = spa_tryimport(config); + ASSERT(newconfig != NULL); + nvlist_free(newconfig); + + /* + * Import it under the new name. + */ + error = spa_import(newname, config, NULL, 0); + if (error != 0) { + dump_nvlist(config, 0); + fatal(B_FALSE, "couldn't import pool %s as %s: error %u", + oldname, newname, error); + } + + ztest_walk_pool_directory("pools after import"); + + /* + * Try to import it again -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); + + /* + * Try to import it under a different name -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); + + /* + * Verify that the pool is no longer visible under the old name. + */ + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Verify that we can open and close the pool using the new name. + */ + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); + ASSERT(pool_guid == spa_guid(spa)); + spa_close(spa, FTAG); + + nvlist_free(config); +} + +static void +ztest_resume(spa_t *spa) +{ + if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) + (void) printf("resuming from suspended state\n"); + spa_vdev_state_enter(spa, SCL_NONE); + vdev_clear(spa, NULL); + (void) spa_vdev_state_exit(spa, NULL, 0); + (void) zio_resume(spa); +} + +static void +ztest_resume_thread(void *arg) +{ + spa_t *spa = arg; + + while (!ztest_exiting) { + if (spa_suspended(spa)) + ztest_resume(spa); + (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. 
+ */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); + } + + thread_exit(); +} + +static void +ztest_deadman_thread(void *arg) +{ + ztest_shared_t *zs = arg; + spa_t *spa = ztest_spa; + hrtime_t delay, overdue, last_run = gethrtime(); + + delay = (zs->zs_thread_stop - zs->zs_thread_start) + + MSEC2NSEC(zfs_deadman_synctime_ms); + + while (!ztest_exiting) { + /* + * Wait for the delay timer while checking occasionally + * if we should stop. + */ + if (gethrtime() < last_run + delay) { + (void) poll(NULL, 0, 1000); + continue; + } + + /* + * If the pool is suspended then fail immediately. Otherwise, + * check to see if the pool is making any progress. If + * vdev_deadman() discovers that there hasn't been any recent + * I/Os then it will end up aborting the tests. + */ + if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { + fatal(0, "aborting test after %llu seconds because " + "pool has transitioned to a suspended state.", + zfs_deadman_synctime_ms / 1000); + } + vdev_deadman(spa->spa_root_vdev, FTAG); + + /* + * If the process doesn't complete within a grace period of + * zfs_deadman_synctime_ms over the expected finish time, + * then it may be hung and is terminated. 
+ */ + overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); + if (gethrtime() > overdue) { + fatal(0, "aborting test after %llu seconds because " + "the process is overdue for termination.", + (gethrtime() - zs->zs_proc_start) / NANOSEC); + } + + (void) printf("ztest has been running for %lld seconds\n", + (gethrtime() - zs->zs_proc_start) / NANOSEC); + + last_run = gethrtime(); + delay = MSEC2NSEC(zfs_deadman_checktime_ms); + } + + thread_exit(); +} + +static void +ztest_execute(int test, ztest_info_t *zi, uint64_t id) +{ + ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; + ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); + hrtime_t functime = gethrtime(); + int i; + + for (i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zc->zc_count, 1); + atomic_add_64(&zc->zc_time, functime); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, zi->zi_funcname); +} + +static void +ztest_thread(void *arg) +{ + int rand; + uint64_t id = (uintptr_t)arg; + ztest_shared_t *zs = ztest_shared; + uint64_t call_next; + hrtime_t now; + ztest_info_t *zi; + ztest_shared_callstate_t *zc; + + while ((now = gethrtime()) < zs->zs_thread_stop) { + /* + * See if it's time to force a crash. + */ + if (now > zs->zs_thread_kill) + ztest_kill(zs); + + /* + * If we're getting ENOSPC with some regularity, stop. + */ + if (zs->zs_enospc_count > 10) + break; + + /* + * Pick a random function to execute. 
+ */ + rand = ztest_random(ZTEST_FUNCS); + zi = &ztest_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; + + if (now >= call_next && + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } + } + + thread_exit(); +} + +static void +ztest_dataset_name(char *dsname, char *pool, int d) +{ + (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); +} + +static void +ztest_dataset_destroy(int d) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + int t; + + ztest_dataset_name(name, ztest_opts.zo_pool, d); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); + + /* + * Cleanup any non-standard clones and snapshots. In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ + for (t = d; t < ztest_opts.zo_threads; + t += ztest_opts.zo_datasets) + ztest_dsl_dataset_cleanup(name, t); + + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +} + +static void +ztest_dataset_dirobj_verify(ztest_ds_t *zd) +{ + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); +} + +static int +ztest_dataset_open(int d) +{ + ztest_ds_t *zd = &ztest_ds[d]; + uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + ztest_dataset_name(name, ztest_opts.zo_pool, d); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + error = ztest_dataset_create(name); + if (error == ENOSPC) { + (void) pthread_rwlock_unlock(&ztest_name_lock); + ztest_record_enospc(FTAG); + return (error); + } + ASSERT(error == 0 || error == EEXIST); + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, + B_TRUE, zd, &os)); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (ztest_opts.zo_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); +} + +static void +ztest_dataset_close(int d) +{ + ztest_ds_t *zd = &ztest_ds[d]; + + zil_close(zd->zd_zilog); + dmu_objset_disown(zd->zd_os, B_TRUE, zd); + + ztest_zd_fini(zd); +} + +/* ARGSUSED */ +static int 
+ztest_replay_zil_cb(const char *name, void *arg) +{ + objset_t *os; + ztest_ds_t *zdtmp; + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, + B_TRUE, FTAG, &os)); + + zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); + + ztest_zd_init(zdtmp, NULL, os); + zil_replay(os, zdtmp, ztest_replay_vector); + ztest_zd_fini(zdtmp); + + if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && + ztest_opts.zo_verbose >= 6) { + zilog_t *zilog = dmu_objset_zil(os); + + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + } + + umem_free(zdtmp, sizeof (ztest_ds_t)); + + dmu_objset_disown(os, B_TRUE, FTAG); + return (0); +} + +static void +ztest_freeze(void) +{ + ztest_ds_t *zd = &ztest_ds[0]; + spa_t *spa; + int numloops = 0; + + if (ztest_opts.zo_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_spa = spa; + + /* + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. + */ + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, 0); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Freeze the pool. This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); + + /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. 
+ */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. + */ + while (ztest_random(10) != 0 && + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + txg_wait_synced(spa_get_dsl(spa), 0); + } + + /* + * Commit all of the changes we just generated. + */ + zil_commit(zd->zd_zilog, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(0); + spa_close(spa, FTAG); + kernel_fini(); + + /* + * Open and close the pool and dataset to induce log replay. 
+ */ + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT(spa_freeze_txg(spa) == UINT64_MAX); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_spa = spa; + txg_wait_synced(spa_get_dsl(spa), 0); + ztest_dataset_close(0); + ztest_reguid(NULL, 0); + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_import_impl(ztest_shared_t *zs) +{ + importargs_t args = { 0 }; + nvlist_t *cfg = NULL; + int nsearch = 1; + char *searchdirs[nsearch]; + int flags = ZFS_IMPORT_MISSING_LOG; + + searchdirs[0] = ztest_opts.zo_dir; + args.paths = nsearch; + args.path = searchdirs; + args.can_be_active = B_FALSE; + + VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, + &libzpool_config_ops)); + VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); +} + +/* + * Import a storage pool with the given name. + */ +static void +ztest_import(ztest_shared_t *zs) +{ + spa_t *spa; + + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + + ztest_import_impl(zs); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + if (!ztest_opts.zo_mmp_test) { + ztest_run_zdb(ztest_opts.zo_pool); + ztest_freeze(); + ztest_run_zdb(ztest_opts.zo_pool); + } + + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +/* + * Kick off threads to run tests on all datasets in parallel. 
+ */ +static void +ztest_run(ztest_shared_t *zs) +{ + spa_t *spa; + objset_t *os; + kthread_t *resume_thread, *deadman_thread; + kthread_t **run_threads; + uint64_t object; + int error; + int t, d; + + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. + */ + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } + + mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. + */ + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(zs); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + VERIFY0(vdev_raidz_impl_set("cycle")); + + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Create a thread to periodically resume suspended I/O. 
+ */ + resume_thread = thread_create(NULL, 0, ztest_resume_thread, + spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + /* + * Create a deadman thread and set to panic if we hang. + */ + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Verify that we can safely inquire about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. + */ + for (t = 0; t < 64; t++) { + for (d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * If we got any ENOSPC errors on the previous run, destroy something. + */ + if (zs->zs_enospc_count != 0) { + int d = ztest_random(ztest_opts.zo_datasets); + ztest_dataset_destroy(d); + } + zs->zs_enospc_count = 0; + + /* + * If we were in the middle of ztest_device_removal() and were killed + * we need to ensure the removal and scrub complete before running + * any tests that check ztest_device_removal_active. The removal will + * be restarted automatically when the spa is opened, but we need to + * initiate the scrub manually if it is not already in progress. Note + * that we always run the scrub whenever an indirect vdev exists + * because we have no way of knowing for sure if ztest_device_removal() + * fully completed its scrub before the pool was reimported. 
+ */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + while (spa->spa_removing_phys.sr_state == DSS_SCANNING) + txg_wait_synced(spa_get_dsl(spa), 0); + + error = ztest_scrub_impl(spa); + if (error == EBUSY) + error = 0; + ASSERT0(error); + } + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Replay all logs of all datasets in the pool. This is primarily for + * temporary datasets which wouldn't otherwise get replayed, which + * can trigger failures when attempting to offline a SLOG in + * ztest_fault_inject(). + */ + (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, + NULL, DS_FIND_CHILDREN); + + /* + * Kick off all the tests that run in parallel. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the tests to complete. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. 
+ */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + + /* Kill the resume and deadman threads */ + ztest_exiting = B_TRUE; + VERIFY0(thread_join(resume_thread)); + VERIFY0(thread_join(deadman_thread)); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } + + /* Verify that at least one commit cb was called in a timely fashion */ + if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) + VERIFY0(zc_min_txg_delay); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. + */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (ztest_opts.zo_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. 
+ */ + if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(name, sizeof (name), "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); + } + + kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + mutex_destroy(&zcl.zcl_callbacks_lock); + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +static void +print_time(hrtime_t t, char *timebuf) +{ + hrtime_t s = t / NANOSEC; + hrtime_t m = s / 60; + hrtime_t h = m / 60; + hrtime_t d = h / 24; + + s -= m * 60; + m -= h * 60; + h -= d * 24; + + timebuf[0] = '\0'; + + if (d) + (void) sprintf(timebuf, + "%llud%02lluh%02llum%02llus", d, h, m, s); + else if (h) + (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); + else if (m) + (void) sprintf(timebuf, "%llum%02llus", m, s); + else + (void) sprintf(timebuf, "%llus", s); +} + +static nvlist_t * +make_random_props(void) +{ + nvlist_t *props; + + VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + + if (ztest_random(2) == 0) + return (props); + + VERIFY0(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1)); + + return (props); +} + +/* + * Create a storage pool with the given name and initial vdev size. + * Then test spa_freeze() functionality. + */ +static void +ztest_init(ztest_shared_t *zs) +{ + spa_t *spa; + nvlist_t *nvroot, *props; + int i; + + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + + /* + * Create the storage pool. 
+ */ + (void) spa_destroy(ztest_opts.zo_pool); + ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = ztest_opts.zo_mirrors; + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + + /* + * We don't expect the pool to suspend unless maxfaults == 0, + * in which case ztest_fault_inject() temporarily takes away + * the only valid replica. + */ + VERIFY0(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT)); + + for (i = 0; i < SPA_FEATURES; i++) { + char *buf; + + /* + * 75% chance of using the log space map feature. We want ztest + * to exercise both the code paths that use the log space map + * feature and the ones that don't. + */ + if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) + continue; + + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", + spa_feature_table[i].fi_uname)); + VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + free(buf); + } + + VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); + nvlist_free(nvroot); + nvlist_free(props); + + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + if (!ztest_opts.zo_mmp_test) { + ztest_run_zdb(ztest_opts.zo_pool); + ztest_freeze(); + ztest_run_zdb(ztest_opts.zo_pool); + } + + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +static void +setup_data_fd(void) +{ + static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; + + ztest_fd_data = mkstemp(ztest_name_data); + ASSERT3S(ztest_fd_data, >=, 0); + (void) unlink(ztest_name_data); +} + +static int +shared_data_size(ztest_shared_hdr_t *hdr) +{ + int size; + + size = hdr->zh_hdr_size; + size += hdr->zh_opts_size; + size += 
hdr->zh_size; + size += hdr->zh_stats_size * hdr->zh_stats_count; + size += hdr->zh_ds_size * hdr->zh_ds_count; + + return (size); +} + +static void +setup_hdr(void) +{ + int size; + ztest_shared_hdr_t *hdr; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); + + hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); + hdr->zh_opts_size = sizeof (ztest_shared_opts_t); + hdr->zh_size = sizeof (ztest_shared_t); + hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); + hdr->zh_stats_count = ZTEST_FUNCS; + hdr->zh_ds_size = sizeof (ztest_shared_ds_t); + hdr->zh_ds_count = ztest_opts.zo_datasets; + + size = shared_data_size(hdr); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); +} + +static void +setup_data(void) +{ + int size, offset; + ztest_shared_hdr_t *hdr; + uint8_t *buf; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + size = shared_data_size(hdr); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); + hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + buf = (uint8_t *)hdr; + + offset = hdr->zh_hdr_size; + ztest_shared_opts = (void *)&buf[offset]; + offset += hdr->zh_opts_size; + ztest_shared = (void *)&buf[offset]; + offset += hdr->zh_size; + ztest_shared_callstate = (void *)&buf[offset]; + offset += hdr->zh_stats_size * hdr->zh_stats_count; + ztest_shared_ds = (void *)&buf[offset]; +} + +static boolean_t +exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) +{ + pid_t pid; + int status; + char *cmdbuf = NULL; + + pid = fork(); + + if (cmd == NULL) { + cmdbuf = 
umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); + cmd = cmdbuf; + } + + if (pid == -1) + fatal(1, "fork failed"); + + if (pid == 0) { /* child */ + char *emptyargv[2] = { cmd, NULL }; + char fd_data_str[12]; + + struct rlimit rl = { 1024, 1024 }; + (void) setrlimit(RLIMIT_NOFILE, &rl); + + (void) close(ztest_fd_rand); + VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data)); + VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1)); + + (void) enable_extended_FILE_stdio(-1, -1); + if (libpath != NULL) + VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); + (void) execv(cmd, emptyargv); + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "exec failed: %s", cmd); + } + + if (cmdbuf != NULL) { + umem_free(cmdbuf, MAXPATHLEN); + cmd = NULL; + } + + while (waitpid(pid, &status, 0) != pid) + continue; + if (statusp != NULL) + *statusp = status; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + (void) fprintf(stderr, "child exited with code %d\n", + WEXITSTATUS(status)); + exit(2); + } + return (B_FALSE); + } else if (WIFSIGNALED(status)) { + if (!ignorekill || WTERMSIG(status) != SIGKILL) { + (void) fprintf(stderr, "child died with signal %d\n", + WTERMSIG(status)); + exit(3); + } + return (B_TRUE); + } else { + (void) fprintf(stderr, "something strange happened to child\n"); + exit(4); + /* NOTREACHED */ + } +} + +static void +ztest_run_init(void) +{ + int i; + + ztest_shared_t *zs = ztest_shared; + + /* + * Blow away any existing copy of zpool.cache + */ + (void) remove(spa_config_path); + + if (ztest_opts.zo_init == 0) { + if (ztest_opts.zo_verbose >= 1) + (void) printf("Importing pool %s\n", + ztest_opts.zo_pool); + ztest_import(zs); + return; + } + + /* + * Create and initialize our storage pool. 
+ */ + for (i = 1; i <= ztest_opts.zo_init; i++) { + bzero(zs, sizeof (ztest_shared_t)); + if (ztest_opts.zo_verbose >= 3 && + ztest_opts.zo_init != 1) { + (void) printf("ztest_init(), pass %d\n", i); + } + ztest_init(zs); + } +} + +int +main(int argc, char **argv) +{ + int kills = 0; + int iters = 0; + int older = 0; + int newer = 0; + ztest_shared_t *zs; + ztest_info_t *zi; + ztest_shared_callstate_t *zc; + char timebuf[100]; + char numbuf[NN_NUMBUF_SZ]; + char *cmd; + boolean_t hasalt; + int f; + char *fd_data_str = getenv("ZTEST_FD_DATA"); + struct sigaction action; + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + dprintf_setup(&argc, argv); + zfs_deadman_synctime_ms = 300000; + zfs_deadman_checktime_ms = 30000; + /* + * As two-word space map entries may not come up often (especially + * if pool and vdev sizes are small) we want to force at least some + * of them so the feature get tested. + */ + zfs_force_some_double_word_sm_entries = B_TRUE; + + /* + * Verify that even extensively damaged split blocks with many + * segments can be reconstructed in a reasonable amount of time + * when reconstruction is known to be possible. + * + * Note: the lower this value is, the more damage we inflict, and + * the more time ztest spends in recovering that damage. We chose + * to induce damage 1/100th of the time so recovery is tested but + * not so frequently that ztest doesn't get to test other code paths. 
+ */ + zfs_reconstruct_indirect_damage_fraction = 100; + + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + + if (sigaction(SIGSEGV, &action, NULL) < 0) { + (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + if (sigaction(SIGABRT, &action, NULL) < 0) { + (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* + * Force random_get_bytes() to use /dev/urandom in order to prevent + * ztest from needlessly depleting the system entropy pool. + */ + random_path = "/dev/urandom"; + ztest_fd_rand = open(random_path, O_RDONLY); + ASSERT3S(ztest_fd_rand, >=, 0); + + if (!fd_data_str) { + process_options(argc, argv); + + setup_data_fd(); + setup_hdr(); + setup_data(); + bcopy(&ztest_opts, ztest_shared_opts, + sizeof (*ztest_shared_opts)); + } else { + ztest_fd_data = atoi(fd_data_str); + setup_data(); + bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); + } + ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); + + /* Override location of zpool.cache */ + VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir) != -1); + + ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), + UMEM_NOFAIL); + zs = ztest_shared; + + if (fd_data_str) { + metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; + metaslab_df_alloc_threshold = + zs->zs_metaslab_df_alloc_threshold; + + if (zs->zs_do_init) + ztest_run_init(); + else + ztest_run(zs); + exit(0); + } + + hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("%llu vdevs, %d datasets, %d threads," + " %llu seconds...\n", + (u_longlong_t)ztest_opts.zo_vdevs, + ztest_opts.zo_datasets, + ztest_opts.zo_threads, + (u_longlong_t)ztest_opts.zo_time); + } + + cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); + (void) strlcpy(cmd, getexecname(), MAXNAMELEN); + + zs->zs_do_init = 
B_TRUE; + if (strlen(ztest_opts.zo_alt_ztest) != 0) { + if (ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest for " + "initialization: %s\n", ztest_opts.zo_alt_ztest); + } + VERIFY(!exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_FALSE, NULL)); + } else { + VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); + } + zs->zs_do_init = B_FALSE; + + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; + + for (f = 0; f < ZTEST_FUNCS; f++) { + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zc->zc_next = UINT64_MAX; + else + zc->zc_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); + } + + /* + * Run the tests in a loop. These tests include fault injection + * to verify that self-healing data works, and forced crashes + * to verify that we never lose on-disk consistency. + */ + while (gethrtime() < zs->zs_proc_stop) { + int status; + boolean_t killed; + + /* + * Initialize the workload counters for each function. 
+ */ + for (f = 0; f < ZTEST_FUNCS; f++) { + zc = ZTEST_GET_SHARED_CALLSTATE(f); + zc->zc_count = 0; + zc->zc_time = 0; + } + + /* Set the allocation switch size */ + zs->zs_metaslab_df_alloc_threshold = + ztest_random(zs->zs_metaslab_sz / 4) + 1; + + if (!hasalt || ztest_random(2) == 0) { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing newer ztest: %s\n", + cmd); + } + newer++; + killed = exec_child(cmd, NULL, B_TRUE, &status); + } else { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest: %s\n", + ztest_opts.zo_alt_ztest); + } + older++; + killed = exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_TRUE, &status); + } + + if (killed) + kills++; + iters++; + + if (ztest_opts.zo_verbose >= 1) { + hrtime_t now = gethrtime(); + + now = MIN(now, zs->zs_proc_stop); + print_time(zs->zs_proc_stop - now, timebuf); + nicenum(zs->zs_space, numbuf, sizeof (numbuf)); + + (void) printf("Pass %3d, %8s, %3llu ENOSPC, " + "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", + iters, + WIFEXITED(status) ? 
"Complete" : "SIGKILL", + (u_longlong_t)zs->zs_enospc_count, + 100.0 * zs->zs_alloc / zs->zs_space, + numbuf, + 100.0 * (now - zs->zs_proc_start) / + (ztest_opts.zo_time * NANOSEC), timebuf); + } + + if (ztest_opts.zo_verbose >= 2) { + (void) printf("\nWorkload summary:\n\n"); + (void) printf("%7s %9s %s\n", + "Calls", "Time", "Function"); + (void) printf("%7s %9s %s\n", + "-----", "----", "--------"); + for (f = 0; f < ZTEST_FUNCS; f++) { + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + print_time(zc->zc_time, timebuf); + (void) printf("%7llu %9s %s\n", + (u_longlong_t)zc->zc_count, timebuf, + zi->zi_funcname); + } + (void) printf("\n"); + } + + if (!ztest_opts.zo_mmp_test) + ztest_run_zdb(ztest_opts.zo_pool); + } + + if (ztest_opts.zo_verbose >= 1) { + if (hasalt) { + (void) printf("%d runs of older ztest: %s\n", older, + ztest_opts.zo_alt_ztest); + (void) printf("%d runs of newer ztest: %s\n", newer, + cmd); + } + (void) printf("%d killed, %d completed, %.0f%% kill rate\n", + kills, iters - kills, (100.0 * kills) / MAX(1, iters)); + } + + umem_free(cmd, MAXNAMELEN); + + return (0); +} diff --git a/sys/contrib/openzfs/cmd/zvol_id/.gitignore b/sys/contrib/openzfs/cmd/zvol_id/.gitignore new file mode 100644 index 000000000000..8b757a2d6781 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zvol_id/.gitignore @@ -0,0 +1 @@ +zvol_id diff --git a/sys/contrib/openzfs/cmd/zvol_id/Makefile.am b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am new file mode 100644 index 000000000000..a584875081eb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +# Disable GCC stack protection for zvol_id. This is a kludge and should be +# removed once https://github.com/zfsonlinux/zfs/issues/569 is resolved. 
+AM_CFLAGS += -fno-stack-protector + +udev_PROGRAMS = zvol_id + +zvol_id_SOURCES = \ + zvol_id_main.c diff --git a/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c new file mode 100644 index 000000000000..4a2d74cc203c --- /dev/null +++ b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Fajar A. Nugraha. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <ctype.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <fcntl.h> +#include <unistd.h> +#include <linux/ioctl.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/zfs_znode.h> +#include <sys/fs/zfs.h> + +static int +ioctl_get_msg(char *var, int fd) +{ + int error = 0; + char msg[ZFS_MAX_DATASET_NAME_LEN]; + + error = ioctl(fd, BLKZNAME, msg); + if (error < 0) { + return (error); + } + + snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg); + return (error); +} + +int +main(int argc, char **argv) +{ + int fd, error = 0; + char zvol_name[ZFS_MAX_DATASET_NAME_LEN]; + char *zvol_name_part = NULL; + char *dev_name; + struct stat64 statbuf; + int dev_minor, dev_part; + int i; + int rc; + + if (argc < 2) { + printf("Usage: %s /dev/zvol_device_node\n", argv[0]); + return (EINVAL); + } + + dev_name = argv[1]; + error = stat64(dev_name, &statbuf); + if (error != 0) { + printf("Unable to access device file: %s\n", dev_name); + return (errno); + } + + dev_minor = minor(statbuf.st_rdev); + dev_part = dev_minor % ZVOL_MINORS; + + fd = open(dev_name, O_RDONLY); + if (fd < 0) { + printf("Unable to open device file: %s\n", dev_name); + return (errno); + } + + error = ioctl_get_msg(zvol_name, fd); + if (error < 0) { + printf("ioctl_get_msg failed:%s\n", strerror(errno)); + return (errno); + } + if (dev_part > 0) + rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name, + dev_part); + else + rc = asprintf(&zvol_name_part, "%s", zvol_name); + + if (rc == -1 || zvol_name_part == NULL) + goto error; + + for (i = 0; i < strlen(zvol_name_part); i++) { + if (isblank(zvol_name_part[i])) + zvol_name_part[i] = '+'; + } + + printf("%s\n", zvol_name_part); + free(zvol_name_part); +error: + close(fd); + return (error); +} diff --git a/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am new file mode 100644 index 000000000000..564031c9799d --- /dev/null +++ b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am 
#!/bin/sh
#
# Wait until a /dev/zvol/<name> symlink exists for every zvol that is
# expected to have one.  Exits 0 once all links are present (or there is
# nothing to wait for) and 1 after 20 rounds of 30 one-second polls.
#
# The list of zvols still being waited on is kept in the global $zvols,
# one dataset name per line.
#

# Print the number of entries left in $zvols (0 when it is empty).
count_zvols() {
	if [ -z "$zvols" ]; then
		echo 0
	else
		echo "$zvols" | wc -l
	fi
}

# Read zvol names on stdin and print those whose /dev/zvol link is
# still missing.
filter_out_zvols_with_links() {
	while IFS= read -r zvol; do
		#
		# zvol_id replaces blanks in the name with '+' before
		# printing the link name, so look for the link under that
		# spelling.  Only names that contain a space pay for the
		# extra tr(1).
		#
		case "$zvol" in
		*' '*)	link=$(printf '%s' "$zvol" | tr ' ' '+') ;;
		*)	link=$zvol ;;
		esac
		if [ ! -L "/dev/zvol/$link" ]; then
			echo "$zvol"
		fi
	done
}

# Read zvol names on stdin and print those that "zfs list" still knows.
filter_out_deleted_zvols() {
	while IFS= read -r zvol; do
		if zfs list "$zvol" >/dev/null 2>&1; then
			echo "$zvol"
		fi
	done
}

# Print the name of every zvol that should get a /dev/zvol link.
list_zvols() {
	#
	# "zfs list -H" separates fields with a single tab.  Split on tabs
	# only so that dataset names containing spaces stay in one piece.
	#
	tab=$(printf '\t')
	zfs list -t volume -H -o \
	    name,volmode,receive_resume_token,redact_snaps |
	    while IFS="$tab" read -r name volmode token redacted; do
		#
		# /dev links are not created for zvols with volmode = "none"
		# or for redacted zvols.
		#
		[ "$volmode" = "none" ] && continue
		[ "$redacted" = "-" ] || continue
		#
		# We also ignore partially received zvols if it is
		# not an incremental receive, as those won't even have a block
		# device minor node created yet.
		#
		if [ "$token" != "-" ]; then
			#
			# Incremental receives create an invisible clone that
			# is not automatically displayed by zfs list.
			#
			if ! zfs list "$name/%recv" >/dev/null 2>&1; then
				continue
			fi
		fi
		echo "$name"
	done
}

zvols=$(list_zvols)
zvols_count=$(count_zvols)
if [ "$zvols_count" -eq 0 ]; then
	echo "No zvols found, nothing to do."
	exit 0
fi

echo "Testing $zvols_count zvol links"

outer_loop=0
while [ "$outer_loop" -lt 20 ]; do
	outer_loop=$((outer_loop + 1))

	old_zvols_count=$(count_zvols)

	# Poll once per second, dropping zvols whose link has appeared.
	inner_loop=0
	while [ "$inner_loop" -lt 30 ]; do
		inner_loop=$((inner_loop + 1))

		zvols="$(echo "$zvols" | filter_out_zvols_with_links)"

		zvols_count=$(count_zvols)
		if [ "$zvols_count" -eq 0 ]; then
			echo "All zvol links are now present."
			exit 0
		fi
		sleep 1
	done

	echo "Still waiting on $zvols_count zvol links ..."
	#
	# Although zvols should normally not be deleted at boot time,
	# if that is the case then their links will be missing and
	# we would stall.
	#
	if [ "$old_zvols_count" -eq "$zvols_count" ]; then
		echo "No progress since last loop."
		echo "Checking if any zvols were deleted."

		zvols=$(echo "$zvols" | filter_out_deleted_zvols)
		zvols_count=$(count_zvols)

		if [ "$old_zvols_count" -ne "$zvols_count" ]; then
			echo "$((old_zvols_count - zvols_count)) zvol(s) deleted."
		fi

		if [ "$zvols_count" -ne 0 ]; then
			echo "Remaining zvols:"
			echo "$zvols"
		else
			echo "All zvol links are now present."
			exit 0
		fi
	fi
done

echo "Timed out waiting on zvol links"
exit 1